Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch multi-process Excluding Merge-Ins
This is equivalent to a diff from 3ffef65b7c to 8d149a52d3
2012-09-05
| ||
11:23 | Merge in multi-process branch. check-in: ecae27d73a user: dan tags: trunk | |
10:32 | Fix a bug in intra-process connection locking. Turn on multi-process mode by default. Leaf check-in: 8d149a52d3 user: dan tags: multi-process | |
2012-09-04
| ||
20:17 | Defer closing file descriptors until all fcntl() locks have been dropped. check-in: 3d0cf4bb36 user: dan tags: multi-process | |
2012-07-16
| ||
00:03 | Fix errors in the examples of numeric encoding on the key-encoding wiki page. check-in: 10befd97f8 user: drh tags: trunk | |
2012-07-07
| ||
19:52 | Merge trunk changes. check-in: d8523ddd93 user: dan tags: multi-process | |
12:21 | minor doc update. check-in: 3ffef65b7c user: stephan tags: trunk | |
11:44 | merged in lsm_env-xsize branch. check-in: 3dd0037efb user: stephan tags: trunk | |
Changes to lsm-test/lsmtest1.c.
︙ | ︙ | |||
56 57 58 59 60 61 62 | return zRet; } static int testControlDb(TestDb **ppDb){ #ifdef HAVE_KYOTOCABINET return tdb_open("kyotocabinet", "tmp.db", 1, ppDb); #else | | > > > > > > > > > > > > > | 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | return zRet; } static int testControlDb(TestDb **ppDb){ #ifdef HAVE_KYOTOCABINET return tdb_open("kyotocabinet", "tmp.db", 1, ppDb); #else return tdb_open("sqlite3", ":memory:", 1, ppDb); #endif } void testDatasourceFetch( TestDb *pDb, /* Database handle */ Datasource *pData, int iKey, int *pRc /* IN/OUT: Error code */ ){ void *pKey; int nKey; /* Database key to query for */ void *pVal; int nVal; /* Expected result of query */ testDatasourceEntry(pData, iKey, &pKey, &nKey, &pVal, &nVal); testFetch(pDb, pKey, nKey, pVal, nVal, pRc); } /* ** This function is called to test that the contents of database pDb ** are as expected. In this case, expected is defined as containing ** key-value pairs iFirst through iLast, inclusive, from data source ** pData. In other words, a loop like the following could be used to ** construct a database with identical contents from scratch. |
︙ | ︙ |
Changes to lsm-test/lsmtest5.c.
︙ | ︙ | |||
522 523 524 525 526 527 528 | } /* Open a new database connection. Initialize the pseudo-random number ** argument based on the thread number. */ iPrng = testPrngValue(iThread); pDb = testOpen(p->zSystem, 0, &rc); | > | > | 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 | } /* Open a new database connection. Initialize the pseudo-random number ** argument based on the thread number. */ iPrng = testPrngValue(iThread); pDb = testOpen(p->zSystem, 0, &rc); if( rc==0 ){ tdb_lsm_config_work_hook(pDb, xMt1Work, 0); } /* Loop until either an error occurs or some other thread sets the ** halt flag. */ while( rc==0 && testThreadGetHalt(pThreadSet)==0 ){ int iKey; /* Perform a read operation on an arbitrarily selected key. */ |
︙ | ︙ |
Changes to lsm-test/lsmtest_main.c.
︙ | ︙ | |||
169 170 171 172 173 174 175 | res = memcmp(pKey1, pKey2, MIN(nKey1, nKey2)); if( res==0 ){ res = nKey1 - nKey2; } return res; } | | > > | > | 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | res = memcmp(pKey1, pKey2, MIN(nKey1, nKey2)); if( res==0 ){ res = nKey1 - nKey2; } return res; } int test_scan_debug = 0; static void scanCompareCb( void *pCtx, void *pKey, int nKey, void *pVal, int nVal ){ ScanResult *p = (ScanResult *)pCtx; u8 *aKey = (u8 *)pKey; u8 *aVal = (u8 *)pVal; int i; if( test_scan_debug ) printf("%.*s\n", nKey, (char *)pKey); #if 0 if( test_scan_debug ) printf("%.20s\n", (char *)pVal); #endif #if 0 /* Check tdb_fetch() matches */ int rc = 0; testFetch(p->pDb, pKey, nKey, pVal, nVal, &rc); assert( rc==0 ); #endif |
︙ | ︙ | |||
456 457 458 459 460 461 462 | return (nFail!=0); } static lsm_db *configure_lsm_db(TestDb *pDb){ lsm_db *pLsm; pLsm = tdb_lsm(pDb); if( pLsm ){ | | | 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 | return (nFail!=0); } static lsm_db *configure_lsm_db(TestDb *pDb){ lsm_db *pLsm; pLsm = tdb_lsm(pDb); if( pLsm ){ tdb_lsm_config_str(pDb, "mmap=1 autowork=1 nmerge=4 worker_nmerge=4"); } return pLsm; } static void do_speed_write_hook2( void *pCtx, |
︙ | ︙ |
Changes to lsm-test/lsmtest_tdb3.c.
︙ | ︙ | |||
308 309 310 311 312 313 314 315 316 317 318 319 320 321 | } static int testEnvUnlink(lsm_env *pEnv, const char *zFile){ lsm_env *pRealEnv = tdb_lsm_env(); unused_parameter(pEnv); return pRealEnv->xUnlink(pRealEnv, zFile); } static void doSystemCrash(LsmDb *pDb){ lsm_env *pEnv = tdb_lsm_env(); int iFile; int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector; char *zFile = pDb->zName; | > > > > > > > > > > > > > > > > > > > > > | 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 | } static int testEnvUnlink(lsm_env *pEnv, const char *zFile){ lsm_env *pRealEnv = tdb_lsm_env(); unused_parameter(pEnv); return pRealEnv->xUnlink(pRealEnv, zFile); } static int testEnvLock(lsm_file *pFile, int iLock, int eType){ LsmFile *p = (LsmFile *)pFile; lsm_env *pRealEnv = tdb_lsm_env(); return pRealEnv->xLock(p->pReal, iLock, eType); } static int testEnvShmMap(lsm_file *pFile, int iRegion, int sz, void **pp){ LsmFile *p = (LsmFile *)pFile; lsm_env *pRealEnv = tdb_lsm_env(); return pRealEnv->xShmMap(p->pReal, iRegion, sz, pp); } static void testEnvShmBarrier(void){ } static int testEnvShmUnmap(lsm_file *pFile, int bDel){ LsmFile *p = (LsmFile *)pFile; lsm_env *pRealEnv = tdb_lsm_env(); return pRealEnv->xShmUnmap(p->pReal, bDel); } static void doSystemCrash(LsmDb *pDb){ lsm_env *pEnv = tdb_lsm_env(); int iFile; int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector; char *zFile = pDb->zName; |
︙ | ︙ | |||
572 573 574 575 576 577 578 579 580 581 582 583 584 585 | { "block_size", 0, LSM_CONFIG_BLOCK_SIZE }, { "safety", 0, LSM_CONFIG_SAFETY }, { "autowork", 0, LSM_CONFIG_AUTOWORK }, { "log_size", 0, LSM_CONFIG_LOG_SIZE }, { "mmap", 0, LSM_CONFIG_MMAP }, { "use_log", 0, LSM_CONFIG_USE_LOG }, { "nmerge", 0, LSM_CONFIG_NMERGE }, { "worker_nmerge", 1, LSM_CONFIG_NMERGE }, { 0, 0 } }; const char *z = zStr; while( z[0] && pDb ){ const char *zStart; | > > | 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 | { "block_size", 0, LSM_CONFIG_BLOCK_SIZE }, { "safety", 0, LSM_CONFIG_SAFETY }, { "autowork", 0, LSM_CONFIG_AUTOWORK }, { "log_size", 0, LSM_CONFIG_LOG_SIZE }, { "mmap", 0, LSM_CONFIG_MMAP }, { "use_log", 0, LSM_CONFIG_USE_LOG }, { "nmerge", 0, LSM_CONFIG_NMERGE }, { "max_freelist", 0, LSM_CONFIG_MAX_FREELIST }, { "multi_proc", 0, LSM_CONFIG_MULTIPLE_PROCESSES }, { "worker_nmerge", 1, LSM_CONFIG_NMERGE }, { 0, 0 } }; const char *z = zStr; while( z[0] && pDb ){ const char *zStart; |
︙ | ︙ | |||
691 692 693 694 695 696 697 698 699 700 701 702 703 704 | pDb->env.xTruncate = testEnvTruncate; pDb->env.xSync = testEnvSync; pDb->env.xSectorSize = testEnvSectorSize; pDb->env.xRemap = testEnvRemap; pDb->env.xFileid = testEnvFileid; pDb->env.xClose = testEnvClose; pDb->env.xUnlink = testEnvUnlink; rc = lsm_new(&pDb->env, &pDb->db); if( rc==LSM_OK ){ lsm_config_log(pDb->db, xLog, 0); lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb); tdb_lsm_config_str((TestDb *)pDb, zCfg); rc = lsm_open(pDb->db, zFilename); | > > > > | 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 | pDb->env.xTruncate = testEnvTruncate; pDb->env.xSync = testEnvSync; pDb->env.xSectorSize = testEnvSectorSize; pDb->env.xRemap = testEnvRemap; pDb->env.xFileid = testEnvFileid; pDb->env.xClose = testEnvClose; pDb->env.xUnlink = testEnvUnlink; pDb->env.xLock = testEnvLock; pDb->env.xShmBarrier = testEnvShmBarrier; pDb->env.xShmMap = testEnvShmMap; pDb->env.xShmUnmap = testEnvShmUnmap; rc = lsm_new(&pDb->env, &pDb->db); if( rc==LSM_OK ){ lsm_config_log(pDb->db, xLog, 0); lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb); tdb_lsm_config_str((TestDb *)pDb, zCfg); rc = lsm_open(pDb->db, zFilename); |
︙ | ︙ | |||
726 727 728 729 730 731 732 | } int test_lsm_lomem_open( const char *zFilename, int bClear, TestDb **ppDb ){ | > | | 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 | } int test_lsm_lomem_open( const char *zFilename, int bClear, TestDb **ppDb ){ const char *zCfg = "page_size=256 block_size=65536 write_buffer=16384 max_freelist=4"; return testLsmOpen(zCfg, zFilename, bClear, ppDb); } lsm_db *tdb_lsm(TestDb *pDb){ if( pDb->pMethods->xClose==test_lsm_close ){ return ((LsmDb *)pDb)->db; } |
︙ | ︙ |
Changes to src/build.c.
︙ | ︙ | |||
1394 1395 1396 1397 1398 1399 1400 | zExtra = (char *)(&pIndex->zName[nName+1]); memcpy(pIndex->zName, zName, nName+1); pIndex->pTable = pTab; pIndex->nColumn = nCol; pIndex->onError = (u8)onError; pIndex->pSchema = pTab->pSchema; | < < < < < < < < < < < < > > > > > > > > > > > > > > > > > > | < | > | 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 | zExtra = (char *)(&pIndex->zName[nName+1]); memcpy(pIndex->zName, zName, nName+1); pIndex->pTable = pTab; pIndex->nColumn = nCol; pIndex->onError = (u8)onError; pIndex->pSchema = pTab->pSchema; } *pzExtra = zExtra; return pIndex; } static int addIndexToHash(sqlite4 *db, Index *pIdx){ if( db->init.busy ){ Hash *pIdxHash = &pIdx->pSchema->idxHash; int nName = sqlite4Strlen30(pIdx->zName); Index *p; p = sqlite4HashInsert(pIdxHash, pIdx->zName, nName, pIdx); if( p ){ assert( p==pIdx ); db->mallocFailed = 1; return SQLITE4_NOMEM; } } return SQLITE4_OK; } /* ** Allocate and populate an Index structure representing an implicit ** primary key. An implicit primary key behaves similarly to the built-in ** INTEGER PRIMARY KEY columns in SQLite 3. 
*/ static void addImplicitPrimaryKey( Parse *pParse, /* Parse context */ Table *pTab, /* Table to add implicit PRIMARY KEY to */ int iDb ){ sqlite4 *db = pParse->db; Index *pIndex; /* New index */ char *zExtra; assert( !pTab->pIndex || pTab->pIndex->eIndexType!=SQLITE4_INDEX_PRIMARYKEY ); assert( sqlite4Strlen30("binary")==6 ); pIndex = newIndex(pParse, pTab, pTab->zName, 1, OE_Abort, 1+6, &zExtra); if( addIndexToHash(db, pIndex) ){ sqlite4DbFree(db, pIndex); pIndex = 0; } if( pIndex ){ pIndex->aiColumn[0] = -1; pIndex->azColl[0] = zExtra; memcpy(zExtra, "binary", 7); pIndex->eIndexType = SQLITE4_INDEX_PRIMARYKEY; pIndex->pNext = pTab->pIndex; pTab->pIndex = pIndex; sqlite4DefaultRowEst(pIndex); |
︙ | ︙ | |||
2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 | ** in-memory database structures. */ if( db->init.busy ){ db->flags |= SQLITE4_InternChanges; if( pTblName!=0 || bPrimaryKey ){ pIndex->tnum = db->init.newTnum; } } /* If the db->init.busy is 0 then create the index on disk. This ** involves writing the index into the master table and filling in the ** index with the current table contents. ** ** The db->init.busy is 0 when the user first enters a CREATE INDEX | > | 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 | ** in-memory database structures. */ if( db->init.busy ){ db->flags |= SQLITE4_InternChanges; if( pTblName!=0 || bPrimaryKey ){ pIndex->tnum = db->init.newTnum; } if( addIndexToHash(db, pIndex) ) goto exit_create_index; } /* If the db->init.busy is 0 then create the index on disk. This ** involves writing the index into the master table and filling in the ** index with the current table contents. ** ** The db->init.busy is 0 when the user first enters a CREATE INDEX |
︙ | ︙ |
Changes to src/kvlsm.c.
︙ | ︙ | |||
438 439 440 441 442 443 444 445 446 447 | KVLsm *pNew; int rc = SQLITE4_OK; pNew = (KVLsm *)sqlite4_malloc(pEnv, sizeof(KVLsm)); if( pNew==0 ){ rc = SQLITE4_NOMEM; }else{ memset(pNew, 0, sizeof(KVLsm)); pNew->base.pStoreVfunc = &kvlsmMethods; pNew->base.pEnv = pEnv; | > > > > > > > < > > > > > > > > > | 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 | KVLsm *pNew; int rc = SQLITE4_OK; pNew = (KVLsm *)sqlite4_malloc(pEnv, sizeof(KVLsm)); if( pNew==0 ){ rc = SQLITE4_NOMEM; }else{ struct Config { const char *zParam; int eParam; } aConfig[] = { { "lsm_block_size", LSM_CONFIG_BLOCK_SIZE } }; memset(pNew, 0, sizeof(KVLsm)); pNew->base.pStoreVfunc = &kvlsmMethods; pNew->base.pEnv = pEnv; rc = lsm_new(0, &pNew->pDb); if( rc==SQLITE4_OK ){ int i; for(i=0; i<ArraySize(aConfig); i++){ const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam); if( zVal ){ int nVal = sqlite4Atoi(zVal); lsm_config(pNew->pDb, aConfig[i].eParam, &nVal); } } rc = lsm_open(pNew->pDb, zName); } if( rc!=SQLITE4_OK ){ lsm_close(pNew->pDb); sqlite4_free(pEnv, pNew); pNew = 0; |
︙ | ︙ |
Changes to src/lsm.h.
︙ | ︙ | |||
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | /* 64-bit integer type used for file offsets. */ typedef long long int lsm_i64; /* 64-bit signed integer type */ /* Forward reference */ typedef struct lsm_env lsm_env; /* Runtime environment */ /* ** Run-time environment used by LSM */ struct lsm_env { int nByte; /* Size of this structure in bytes */ int iVersion; /* Version number of this structure */ /****** file i/o ***********************************************/ void *pVfsCtx; int (*xFullpath)(lsm_env*, const char *, char *, int *); int (*xOpen)(lsm_env*, const char *, lsm_file **); int (*xRead)(lsm_file *, lsm_i64, void *, int); int (*xWrite)(lsm_file *, lsm_i64, void *, int); int (*xTruncate)(lsm_file *, lsm_i64); int (*xSync)(lsm_file *); int (*xSectorSize)(lsm_file *); int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*); int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf); int (*xClose)(lsm_file *); int (*xUnlink)(lsm_env*, const char *); /****** memory allocation ****************************************/ void *pMemCtx; void *(*xMalloc)(lsm_env*, int); /* malloc(3) function */ void *(*xRealloc)(lsm_env*, void *, int); /* realloc(3) function */ void (*xFree)(lsm_env*, void *); /* free(3) function */ | > > > > > > > > > < < | 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | /* 64-bit integer type used for file offsets. 
*/ typedef long long int lsm_i64; /* 64-bit signed integer type */ /* Forward reference */ typedef struct lsm_env lsm_env; /* Runtime environment */ /* Candidate values for the 3rd argument to lsm_env.xLock() */ #define LSM_LOCK_UNLOCK 0 #define LSM_LOCK_SHARED 1 #define LSM_LOCK_EXCL 2 /* ** Run-time environment used by LSM */ struct lsm_env { int nByte; /* Size of this structure in bytes */ int iVersion; /* Version number of this structure */ /****** file i/o ***********************************************/ void *pVfsCtx; int (*xFullpath)(lsm_env*, const char *, char *, int *); int (*xOpen)(lsm_env*, const char *, lsm_file **); int (*xRead)(lsm_file *, lsm_i64, void *, int); int (*xWrite)(lsm_file *, lsm_i64, void *, int); int (*xTruncate)(lsm_file *, lsm_i64); int (*xSync)(lsm_file *); int (*xSectorSize)(lsm_file *); int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*); int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf); int (*xClose)(lsm_file *); int (*xUnlink)(lsm_env*, const char *); int (*xLock)(lsm_file*, int, int); int (*xShmMap)(lsm_file*, int, int, void **); void (*xShmBarrier)(void); int (*xShmUnmap)(lsm_file*, int); /****** memory allocation ****************************************/ void *pMemCtx; void *(*xMalloc)(lsm_env*, int); /* malloc(3) function */ void *(*xRealloc)(lsm_env*, void *, int); /* realloc(3) function */ void (*xFree)(lsm_env*, void *); /* free(3) function */ sqlite4_size_t (*xSize)(lsm_env*, void *); /* xSize function */ /****** mutexes ****************************************************/ void *pMutexCtx; int (*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex */ int (*xMutexNew)(lsm_env*, lsm_mutex**); /* Get a new dynamic mutex */ void (*xMutexDel)(lsm_mutex *); /* Delete an allocated mutex */ void (*xMutexEnter)(lsm_mutex *); /* Grab a mutex */ int (*xMutexTry)(lsm_mutex *); /* Attempt to obtain a mutex */ |
︙ | ︙ | |||
163 164 165 166 167 168 169 170 | ** LSM_CONFIG_USE_LOG ** A read/write boolean parameter. True (the default) to use the log ** file normally. False otherwise. ** ** LSM_CONFIG_NMERGE ** A read/write integer parameter. The minimum number of segments to ** merge together at a time. Default value 4. */ | > > > > > > > > > > > | | | | | | | | | > > | 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | ** LSM_CONFIG_USE_LOG ** A read/write boolean parameter. True (the default) to use the log ** file normally. False otherwise. ** ** LSM_CONFIG_NMERGE ** A read/write integer parameter. The minimum number of segments to ** merge together at a time. Default value 4. ** ** LSM_CONFIG_MAX_FREELIST ** A read/write integer parameter. The maximum number of free-list ** entries that are stored in a database checkpoint (the others are ** stored elsewhere in the database). ** ** There is no reason for an application to configure or query this ** parameter. It is only present because configuring a small value ** makes certain parts of the lsm code easier to test. ** ** LSM_CONFIG_MULTIPLE_PROCESSES */ #define LSM_CONFIG_WRITE_BUFFER 1 #define LSM_CONFIG_PAGE_SIZE 2 #define LSM_CONFIG_SAFETY 3 #define LSM_CONFIG_BLOCK_SIZE 4 #define LSM_CONFIG_AUTOWORK 5 #define LSM_CONFIG_LOG_SIZE 6 #define LSM_CONFIG_MMAP 7 #define LSM_CONFIG_USE_LOG 8 #define LSM_CONFIG_NMERGE 9 #define LSM_CONFIG_MAX_FREELIST 10 #define LSM_CONFIG_MULTIPLE_PROCESSES 11 #define LSM_SAFETY_OFF 0 #define LSM_SAFETY_NORMAL 1 #define LSM_SAFETY_FULL 2 /* |
︙ | ︙ |
Changes to src/lsmInt.h.
︙ | ︙ | |||
41 42 43 44 45 46 47 | /* ** Default values for various data structure parameters. These may be ** overridden by calls to lsm_config(). */ #define LSM_PAGE_SIZE 4096 #define LSM_BLOCK_SIZE (2 * 1024 * 1024) #define LSM_TREE_BYTES (2 * 1024 * 1024) | < > > > > > > > > > > | 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | /* ** Default values for various data structure parameters. These may be ** overridden by calls to lsm_config(). */ #define LSM_PAGE_SIZE 4096 #define LSM_BLOCK_SIZE (2 * 1024 * 1024) #define LSM_TREE_BYTES (2 * 1024 * 1024) #define LSM_DEFAULT_LOG_SIZE (128*1024) #define LSM_DEFAULT_NMERGE 4 /* Places where a NULL needs to be changed to a real lsm_env pointer ** are marked with NEED_ENV */ #define NEED_ENV ((lsm_env*)0) /* Initial values for log file checksums. These are only used if the ** database file does not contain a valid checkpoint. */ #define LSM_CKSUM0_INIT 42 #define LSM_CKSUM1_INIT 42 #define LSM_META_PAGE_SIZE 4096 /* "mmap" mode is currently only used in environments with 64-bit address ** spaces. The following macro is used to test for this. */ #define LSM_IS_64_BIT (sizeof(void*)==8) #define LSM_AUTOWORK_QUANT 32 /* Minimum number of free-list entries to store in the checkpoint, assuming ** the free-list contains this many entries. i.e. if overflow is required, ** the first LSM_CKPT_MIN_FREELIST entries are stored in the checkpoint and ** the remainder in an LSM system entry. */ #define LSM_CKPT_MIN_FREELIST 6 #define LSM_CKPT_MAX_REFREE 2 #define LSM_CKPT_MIN_NONLSM (LSM_CKPT_MIN_FREELIST - LSM_CKPT_MAX_REFREE) typedef struct Database Database; typedef struct DbLog DbLog; typedef struct FileSystem FileSystem; typedef struct Level Level; typedef struct LogMark LogMark; typedef struct LogRegion LogRegion; typedef struct LogWriter LogWriter; |
︙ | ︙ | |||
84 85 86 87 88 89 90 91 92 93 94 95 96 97 | typedef struct Tree Tree; typedef struct TreeMark TreeMark; typedef struct TreeVersion TreeVersion; typedef struct TreeCursor TreeCursor; typedef struct Merge Merge; typedef struct MergeInput MergeInput; typedef unsigned char u8; typedef unsigned short int u16; typedef unsigned int u32; typedef lsm_i64 i64; typedef unsigned long long int u64; /* A page number is an integer. */ | > > > > > | 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | typedef struct Tree Tree; typedef struct TreeMark TreeMark; typedef struct TreeVersion TreeVersion; typedef struct TreeCursor TreeCursor; typedef struct Merge Merge; typedef struct MergeInput MergeInput; typedef struct TreeHeader TreeHeader; typedef struct ShmHeader ShmHeader; typedef struct ShmChunk ShmChunk; typedef struct ShmReader ShmReader; typedef unsigned char u8; typedef unsigned short int u16; typedef unsigned int u32; typedef lsm_i64 i64; typedef unsigned long long int u64; /* A page number is an integer. */ |
︙ | ︙ | |||
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | #define LSM_NOMEM_BKPT lsmErrorBkpt(LSM_NOMEM) #define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT) #define LSM_MISUSE_BKPT lsmErrorBkpt(LSM_MISUSE) #define unused_parameter(x) (void)(x) #define array_size(x) (sizeof(x)/sizeof(x[0])) /* ** A string that can grow by appending. */ struct LsmString { lsm_env *pEnv; /* Run-time environment */ int n; /* Size of string. -1 indicates error */ int nAlloc; /* Space allocated for z[] */ char *z; /* The string content */ }; /* ** An instance of this structure represents a point in the history of the | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | < | > | | | | | 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | #define LSM_NOMEM_BKPT lsmErrorBkpt(LSM_NOMEM) #define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT) #define LSM_MISUSE_BKPT lsmErrorBkpt(LSM_MISUSE) #define unused_parameter(x) (void)(x) #define array_size(x) (sizeof(x)/sizeof(x[0])) /* The size of each shared-memory chunk */ #define LSM_SHM_CHUNK_SIZE (32*1024) /* The number of bytes reserved at the start of each shm chunk for MM. */ #define LSM_SHM_CHUNK_HDR (3 * 4) /* The number of available read locks. */ #define LSM_LOCK_NREADER 6 /* Lock definitions */ #define LSM_LOCK_DMS1 1 #define LSM_LOCK_DMS2 2 #define LSM_LOCK_WRITER 3 #define LSM_LOCK_WORKER 4 #define LSM_LOCK_CHECKPOINTER 5 #define LSM_LOCK_READER(i) ((i) + LSM_LOCK_CHECKPOINTER + 1) /* ** Hard limit on the number of free-list entries that may be stored in ** a checkpoint (the remainder are stored as a system record in the LSM). ** See also LSM_CONFIG_MAX_FREELIST. 
*/ #define LSM_MAX_FREELIST_ENTRIES 100 /* ** A string that can grow by appending. */ struct LsmString { lsm_env *pEnv; /* Run-time environment */ int n; /* Size of string. -1 indicates error */ int nAlloc; /* Space allocated for z[] */ char *z; /* The string content */ }; typedef struct LsmFile LsmFile; struct LsmFile { lsm_file *pFile; LsmFile *pNext; }; /* ** An instance of the following type is used to store an ordered list of ** u32 values. ** ** Note: This is a place-holder implementation. It should be replaced by ** a version that avoids making a single large allocation when the array ** contains a large number of values. For this reason, the internals of ** this object should only be manipulated by the intArrayXXX() functions in ** lsm_tree.c. */ typedef struct IntArray IntArray; struct IntArray { int nAlloc; int nArray; u32 *aArray; }; /* ** An instance of this structure represents a point in the history of the ** tree structure to roll back to. Refer to comments in lsm_tree.c for ** details. */ struct TreeMark { u32 iRoot; /* Offset of root node in shm file */ u32 nHeight; /* Current height of tree structure */ u32 iWrite; /* Write offset in shm file */ u32 nChunk; /* Number of chunks in shared-memory file */ u32 iFirst; /* First chunk in linked list */ int iRollback; /* Index in lsm->rollback to revert to */ }; /* ** An instance of this structure represents a point in the database log. */ struct LogMark { i64 iOff; /* Offset into log (see lsm_log.c) */ |
︙ | ︙ | |||
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | struct DbLog { u32 cksum0; /* Checksum 0 at offset iOff */ u32 cksum1; /* Checksum 1 at offset iOff */ LogRegion aRegion[3]; /* Log file regions (see docs in lsm_log.c) */ }; /* ** Database handle structure. */ struct lsm_db { /* Database handle configuration */ lsm_env *pEnv; /* runtime environment */ int (*xCmp)(void *, int, void *, int); /* Compare function */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | > > > < > | > > > > > > > > > > | | 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 | struct DbLog { u32 cksum0; /* Checksum 0 at offset iOff */ u32 cksum1; /* Checksum 1 at offset iOff */ LogRegion aRegion[3]; /* Log file regions (see docs in lsm_log.c) */ }; /* ** Tree header structure. */ struct TreeHeader { u32 iTreeId; /* Current tree id */ u32 iTransId; /* Current transaction id */ u32 iRoot; /* Offset of root node in shm file */ u32 nHeight; /* Current height of tree structure */ u32 iWrite; /* Write offset in shm file */ u32 nChunk; /* Number of chunks in shared-memory file */ u32 iFirst; /* First chunk in linked list */ u32 nByte; /* Size of current tree structure in bytes */ DbLog log; /* Current layout of log file */ i64 iCkpt; /* Id of ckpt log space is reclaimed for */ u32 aCksum[2]; /* Checksums 1 and 2. */ }; /* ** Database handle structure. ** ** mLock: ** A bitmask representing the locks currently held by the connection. ** An LSM database supports N distinct locks, where N is some number less ** than or equal to 16. 
Locks are numbered starting from 1 (see the ** definitions for LSM_LOCK_WRITER and co.). ** ** The least significant 16-bits in mLock represent EXCLUSIVE locks. The ** most significant are SHARED locks. So, if a connection holds a SHARED ** lock on lock region iLock, then the following is true: ** ** (mLock & ((iLock+16-1) << 1)) ** ** Or for an EXCLUSIVE lock: ** ** (mLock & ((iLock-1) << 1)) */ struct lsm_db { /* Database handle configuration */ lsm_env *pEnv; /* runtime environment */ int (*xCmp)(void *, int, void *, int); /* Compare function */ /* Values configured by calls to lsm_config */ int eSafety; /* LSM_SAFETY_OFF, NORMAL or FULL */ int bAutowork; /* Configured by LSM_CONFIG_AUTOWORK */ int nTreeLimit; /* Configured by LSM_CONFIG_WRITE_BUFFER */ int nMerge; /* Configured by LSM_CONFIG_NMERGE */ int nLogSz; /* Configured by LSM_CONFIG_LOG_SIZE */ int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */ int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */ int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */ int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */ int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */ /* Sub-system handles */ FileSystem *pFS; /* On-disk portion of database */ Database *pDatabase; /* Database shared data */ /* Client transaction context */ Snapshot *pClient; /* Client snapshot (non-NULL in read trans) */ int iReader; /* Read lock held (-1 == unlocked) */ MultiCursor *pCsr; /* List of all open cursors */ LogWriter *pLogWriter; /* Context for writing to the log file */ int nTransOpen; /* Number of opened write transactions */ int nTransAlloc; /* Allocated size of aTrans[] array */ TransMark *aTrans; /* Array of marks for transaction rollback */ IntArray rollback; /* List of tree-nodes to roll back */ /* Worker context */ Snapshot *pWorker; /* Worker snapshot (or NULL) */ /* Debugging message callback */ void (*xLog)(void *, int, const char *); void *pLogCtx; /* Work done notification callback */ void 
(*xWork)(lsm_db *, void *); void *pWorkCtx; u32 mLock; /* Mask of current locks. See lsmShmLock(). */ lsm_db *pNext; /* Next connection to same database */ int nShm; /* Size of apShm[] array */ void **apShm; /* Shared memory chunks */ ShmHeader *pShmhdr; /* Live shared-memory header */ TreeHeader treehdr; /* Local copy of tree-header */ u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)]; }; struct Segment { int iFirst; /* First page of this run */ int iLast; /* Last page of this run */ Pgno iRoot; /* Root page number (if any) */ int nSize; /* Size of this run in pages */ }; /* ** iSplitTopic/pSplitKey/nSplitKey: ** If nRight>0, this buffer contains a copy of the largest key that has ** already been written to the left-hand-side of the level. */ struct Level { Segment lhs; /* Left-hand (main) segment */ int iAge; /* Number of times data has been written */ int nRight; /* Size of apRight[] array */ Segment *aRhs; /* Old segments being merged into this */ int iSplitTopic; /* Split key topic (if nRight>0) */ void *pSplitKey; /* Pointer to split-key (if nRight>0) */ int nSplitKey; /* Number of bytes in split-key */ Merge *pMerge; /* Merge operation currently underway */ Level *pNext; /* Next level in tree */ }; /* |
︙ | ︙ | |||
266 267 268 269 270 271 272 | ** The first argument to this macro is a pointer to a Segment structure. ** Returns true if the structure instance indicates that the separators ** array is valid. */ #define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0) /* | > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > < < < < > | > > > > > > > > > > > > > > > > > > > > > > > | > > | | | < < < < < < < < | 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 | ** The first argument to this macro is a pointer to a Segment structure. ** Returns true if the structure instance indicates that the separators ** array is valid. */ #define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0) /* ** The values that accompany the lock held by a database reader. */ struct ShmReader { i64 iTreeId; i64 iLsmId; }; /* ** An instance of this structure is stored in the first shared-memory ** page. The shared-memory header. ** ** bWriter: ** Immediately after opening a write transaction taking the WRITER lock, ** each writer client sets this flag. It is cleared right before the ** WRITER lock is relinquished. If a subsequent writer finds that this ** flag is already set when a write transaction is opened, this indicates ** that a previous writer failed mid-transaction. 
** ** iMetaPage: ** If the database file does not contain a valid, synced, checkpoint, this ** value is set to 0. Otherwise, it is set to the meta-page number that ** contains the most recently written checkpoint (either 1 or 2). ** ** hdr1, hdr2: ** The two copies of the in-memory tree header. Two copies are required ** in case a writer fails while updating one of them. */ struct ShmHeader { u32 aClient[LSM_META_PAGE_SIZE / 4]; u32 aWorker[LSM_META_PAGE_SIZE / 4]; u32 bWriter; u32 iMetaPage; TreeHeader hdr1; TreeHeader hdr2; ShmReader aReader[LSM_LOCK_NREADER]; }; /* ** An instance of this structure is stored at the start of each shared-memory ** chunk except the first (which is the header chunk - see above). */ struct ShmChunk { u32 iFirstTree; u32 iLastTree; u32 iNext; }; #define LSM_APPLIST_SZ 4 typedef struct Freelist Freelist; typedef struct FreelistEntry FreelistEntry; /* ** An instance of the following structure stores the current database free ** block list. The free list is a list of blocks that are not currently ** used by the worker snapshot. Associated with each block in the list is the ** snapshot id of the most recent snapshot that did actually use the block. */ struct Freelist { FreelistEntry *aEntry; /* Free list entries */ int nEntry; /* Number of valid slots in aEntry[] */ int nAlloc; /* Allocated size of aEntry[] */ }; struct FreelistEntry { u32 iBlk; /* Block number */ i64 iId; /* Largest snapshot id to use this block */ }; /* ** A snapshot of a database. A snapshot contains all the information required ** to read or write a database file on disk. See the description of struct ** Database below for further details. 
*/ struct Snapshot { Database *pDatabase; /* Database this snapshot belongs to */ Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */ i64 iId; /* Snapshot id */ /* Used by worker snapshots only */ int nBlock; /* Number of blocks in database file */ u32 aiAppend[LSM_APPLIST_SZ]; /* Append point list */ Freelist freelist; /* Free block list */ int nFreelistOvfl; /* Number of extra free-list entries in LSM */ }; #define LSM_INITIAL_SNAPSHOT_ID 11 /* ** Functions from file "lsm_ckpt.c". */ int lsmCheckpointWrite(lsm_db *); int lsmCheckpointLevels(lsm_db *, int, void **, int *); int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal); int lsmCheckpointOverflow(lsm_db *pDb, void **, int *, int *); int lsmCheckpointOverflowRequired(lsm_db *pDb); int lsmCheckpointOverflowLoad(lsm_db *pDb, Freelist *); int lsmCheckpointRecover(lsm_db *); int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **); int lsmCheckpointLoad(lsm_db *pDb); int lsmCheckpointLoadWorker(lsm_db *pDb); int lsmCheckpointStore(lsm_db *pDb, int); i64 lsmCheckpointId(u32 *, int); i64 lsmCheckpointLogOffset(u32 *); int lsmCheckpointPgsz(u32 *); int lsmCheckpointBlksz(u32 *); void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog); void lsmCheckpointZeroLogoffset(lsm_db *); int lsmCheckpointSaveWorker(lsm_db *pDb, int, int); int lsmDatabaseFull(lsm_db *pDb); int lsmCheckpointSynced(lsm_db *pDb, i64 *piId); /* ** Functions from file "lsm_tree.c". 
*/ int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree); void lsmTreeRelease(lsm_env *, Tree *); void lsmTreeClear(lsm_db *); void lsmTreeInit(lsm_db *); int lsmTreeSize(lsm_db *); int lsmTreeEndTransaction(lsm_db *pDb, int bCommit); int lsmTreeBeginTransaction(lsm_db *pDb); int lsmTreeLoadHeader(lsm_db *pDb); int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal); void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark); void lsmTreeMark(lsm_db *pDb, TreeMark *pMark); int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **); void lsmTreeCursorDestroy(TreeCursor *); int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes); int lsmTreeCursorNext(TreeCursor *pCsr); int lsmTreeCursorPrev(TreeCursor *pCsr); int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast); void lsmTreeCursorReset(TreeCursor *pCsr); int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey); int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal); int lsmTreeCursorValid(TreeCursor *pCsr); int lsmTreeCursorSave(TreeCursor *pCsr); /* ** Functions from file "mem.c". */ int lsmPoolNew(lsm_env *pEnv, Mempool **ppPool); void lsmPoolDestroy(lsm_env *pEnv, Mempool *pPool); void *lsmPoolMalloc(lsm_env *pEnv, Mempool *pPool, int nByte); |
︙ | ︙ | |||
384 385 386 387 388 389 390 | lsm_env *lsmFsEnv(FileSystem *); lsm_env *lsmPageEnv(Page *); FileSystem *lsmPageFS(Page *); int lsmFsSectorSize(FileSystem *); void lsmSortedSplitkey(lsm_db *, Level *, int *); | < | < < > > > > > > > > | < | | 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 | lsm_env *lsmFsEnv(FileSystem *); lsm_env *lsmPageEnv(Page *); FileSystem *lsmPageFS(Page *); int lsmFsSectorSize(FileSystem *); void lsmSortedSplitkey(lsm_db *, Level *, int *); /* Reading sorted run content. */ int lsmFsDbPageGet(FileSystem *, Pgno, Page **); int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **); int lsmFsPageWrite(Page *); u8 *lsmFsPageData(Page *, int *); int lsmFsPageRelease(Page *); int lsmFsPagePersist(Page *); void lsmFsPageRef(Page *); Pgno lsmFsPageNumber(Page *); int lsmFsNRead(FileSystem *); int lsmFsNWrite(FileSystem *); int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **); int lsmFsMetaPageRelease(MetaPage *); u8 *lsmFsMetaPageData(MetaPage *, int *); #ifdef LSM_DEBUG int lsmFsIntegrityCheck(lsm_db *); #endif int lsmFsPageWritable(Page *); /* Functions to read, write and sync the log file. 
*/ int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr); int lsmFsSyncLog(FileSystem *pFS); int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr); int lsmFsTruncateLog(FileSystem *pFS, i64 nByte); int lsmFsCloseAndDeleteLog(FileSystem *pFS); /* And to sync the db file */ int lsmFsSyncDb(FileSystem *); /* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */ int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut); int lsmConfigMmap(lsm_db *pDb, int *piParam); int lsmEnvOpen(lsm_env *, const char *, lsm_file **); int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile); int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock); int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **); void lsmEnvShmBarrier(lsm_env *); void lsmEnvShmUnmap(lsm_env *, lsm_file *, int); /* ** End of functions from "lsm_file.c". **************************************************************************/ /* ** Functions from file "lsm_sorted.c". */ int lsmInfoPageDump(lsm_db *, Pgno, int, char **); int lsmSortedFlushTree(lsm_db *, int *); void lsmSortedCleanup(lsm_db *); int lsmSortedAutoWork(lsm_db *, int nUnit); void lsmSortedRemap(lsm_db *pDb); void lsmSortedFreeLevel(lsm_env *pEnv, Level *); int lsmSortedFlushDb(lsm_db *); int lsmSortedAdvanceAll(lsm_db *pDb); int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *); int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *); void *lsmSortedSplitKey(Level *pLevel, int *pnByte); void lsmSortedSaveTreeCursors(lsm_db *); int lsmMCursorNew(lsm_db *, MultiCursor **); void lsmMCursorClose(MultiCursor *); |
︙ | ︙ | |||
496 497 498 499 500 501 502 | */ void lsmLogMessage(lsm_db *, int, const char *, ...); int lsmFlushToDisk(lsm_db *); /* ** Functions from file "lsm_log.c". */ | | | | > | < > > > < < < < < < < < < | < < < < > > | > > | > | > > > > | > > > | > > > > > > > > > > > > | 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 | */ void lsmLogMessage(lsm_db *, int, const char *, ...); int lsmFlushToDisk(lsm_db *); /* ** Functions from file "lsm_log.c". */ int lsmLogBegin(lsm_db *pDb); int lsmLogWrite(lsm_db *, void *, int, void *, int); int lsmLogCommit(lsm_db *); void lsmLogEnd(lsm_db *pDb, int bCommit); void lsmLogTell(lsm_db *, LogMark *); void lsmLogSeek(lsm_db *, LogMark *); int lsmLogRecover(lsm_db *); void lsmLogCheckpoint(lsm_db *, lsm_i64); int lsmLogStructure(lsm_db *pDb, char **pzVal); /************************************************************************** ** Functions from file "lsm_shared.c". 
*/ int lsmDbDatabaseConnect(lsm_db*, const char *); void lsmDbDatabaseRelease(lsm_db *); int lsmBeginReadTrans(lsm_db *); int lsmBeginWriteTrans(lsm_db *); int lsmBeginFlush(lsm_db *); int lsmBeginWork(lsm_db *); void lsmFinishWork(lsm_db *, int, int, int *); int lsmFinishRecovery(lsm_db *); void lsmFinishReadTrans(lsm_db *); int lsmFinishWriteTrans(lsm_db *, int); int lsmFinishFlush(lsm_db *, int); int lsmSnapshotSetFreelist(lsm_db *, int *, int); Snapshot *lsmDbSnapshotClient(lsm_db *); Snapshot *lsmDbSnapshotWorker(lsm_db *); void lsmSnapshotSetCkptid(Snapshot *, i64); Level *lsmDbSnapshotLevel(Snapshot *); void lsmDbSnapshotSetLevel(Snapshot *, Level *); void lsmDbRecoveryComplete(lsm_db *, int); int lsmBlockAllocate(lsm_db *, int *); int lsmBlockFree(lsm_db *, int); int lsmBlockRefree(lsm_db *, int); void lsmFreelistDeltaBegin(lsm_db *); void lsmFreelistDeltaEnd(lsm_db *); int lsmFreelistDelta(lsm_db *pDb); DbLog *lsmDatabaseLog(lsm_db *pDb); #ifdef LSM_DEBUG int lsmHoldingClientMutex(lsm_db *pDb); int lsmShmAssertLock(lsm_db *db, int iLock, int eOp); int lsmShmAssertWorker(lsm_db *db); #endif void lsmFreeSnapshot(lsm_env *, Snapshot *); /* Candidate values for the 3rd argument to lsmShmLock() */ #define LSM_LOCK_UNLOCK 0 #define LSM_LOCK_SHARED 1 #define LSM_LOCK_EXCL 2 int lsmShmChunk(lsm_db *db, int iChunk, void **ppData); int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock); void lsmShmBarrier(lsm_db *db); #ifdef LSM_DEBUG void lsmShmHasLock(lsm_db *db, int iLock, int eOp); #else # define lsmShmHasLock(x,y,z) #endif int lsmReadlock(lsm_db *, i64 iLsm, i64 iTree); int lsmReleaseReadlock(lsm_db *); int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse); int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse); int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId); int lsmDbMultiProc(lsm_db *); void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *); /************************************************************************** ** functions in 
lsm_str.c */ void lsmStringInit(LsmString*, lsm_env *pEnv); int lsmStringExtend(LsmString*, int); int lsmStringAppend(LsmString*, const char *, int); |
︙ | ︙ |
Changes to src/lsm_ckpt.c.
︙ | ︙ | |||
32 33 34 35 36 37 38 | ** 2. The checkpoint id LSW. ** 3. The number of integer values in the entire checkpoint, including ** the two checksum values. ** 4. The total number of blocks in the database. ** 5. The block size. ** 6. The number of levels. ** 7. The nominal database page size. | | < > > > > > | | < < | | | > > | | | < < | < < < < | > > | > > > > | > > > > > > > > | > > | > > > | > > | > > | < | | > | > | | > | > > > > | | | | | > < < < < < < < < | < < | < < < < | < < | < < < < < | | < < < < < < < | < | < < < < < < < < | < < < | < > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | > > > > > | < < | 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | ** 2. The checkpoint id LSW. ** 3. The number of integer values in the entire checkpoint, including ** the two checksum values. ** 4. The total number of blocks in the database. ** 5. The block size. ** 6. The number of levels. ** 7. The nominal database page size. ** 8. Flag indicating if there exists a FREELIST record in the database. ** ** Log pointer: ** ** 4 integers (2 for a 64-bit offset and 2 for a 64-bit checksum). See ** ckptExportLog() and ckptImportLog(). ** ** Append points: ** ** 4 integers. 
See ckptExportAppendlist(). ** ** For each level in the database, a level record. Formatted as follows: ** ** 0. Age of the level. ** 1. The number of right-hand segments (nRight, possibly 0), ** 2. Segment record for left-hand segment (4 integers defined below), ** 3. Segment record for each right-hand segment (4 integers defined below), ** 4. If nRight>0, The number of segments involved in the merge ** 5. if nRight>0, Current nSkip value (see Merge structure defn.), ** 6. For each segment in the merge: ** 6a. Page number of next cell to read during merge ** 6b. Cell number of next cell to read during merge ** 7. Page containing current split-key. ** 8. Cell within page containing current split-key. ** ** The freelist. ** ** 1. Number of free-list entries stored in checkpoint header. ** 2. For each entry: ** 2a. Block number of free block. ** 2b. MSW of associated checkpoint id. ** 2c. LSW of associated checkpoint id. ** ** If the overflow flag is set, then extra free-list entries may be stored ** in the FREELIST record. The FREELIST record contains 3 32-bit integers ** per entry, in the same format as above (without the "number of entries" ** field). ** ** The checksum: ** ** 1. Checksum value 1. ** 2. Checksum value 2. ** ** In the above, a segment record is: ** ** 1. First page of array, ** 2. Last page of array, ** 3. Root page of array (or 0), ** 4. Size of array in pages, */ /* ** LARGE NUMBERS OF LEVEL RECORDS: ** ** A limit on the number of rhs segments that may be present in the database ** file. Defining this limit ensures that all level records fit within ** the 4096 byte limit for checkpoint blobs. ** ** The number of right-hand-side segments in a database is counted as ** follows: ** ** * For each level in the database not undergoing a merge, add 1. ** ** * For each level in the database that is undergoing a merge, add ** the number of segments on the rhs of the level. ** ** A level record not undergoing a merge is 6 integers. 
A level record ** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the ** separators from the next level) is (6*nRhs+12) integers. The maximum ** per right-hand-side level is therefore 12 integers. So the maximum ** size of all level records in a checkpoint is 12*40=480 integers. */ #define LSM_MAX_RHS_SEGMENTS 40 /* ** LARGE NUMBERS OF FREELIST ENTRIES: ** ** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h) ** on the number of free-list entries stored in a checkpoint. Since each ** free-list entry consists of 3 integers, the maximum free-list size is ** 3*100=300 integers. Combined with the limit on rhs segments defined ** above, this ensures that a checkpoint always fits within a 4096 byte ** meta page. ** ** If the database contains more than 100 free blocks, the "overflow" flag ** in the checkpoint header is set and the remainder are stored in the ** system FREELIST entry in the LSM (along with user data). The value ** accompanying the FREELIST key in the LSM is, like a checkpoint, an array ** of 32-bit big-endian integers. As follows: ** ** For each entry: ** a. Block number of free block. ** b. MSW of associated checkpoint id. ** c. LSW of associated checkpoint id. ** ** The number of entries is not required - it is implied by the size of the ** value blob containing the integer array. ** ** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit. ** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST. */ /* ** The argument to this macro must be of type u32. On a little-endian ** architecture, it returns the u32 value that results from interpreting ** the 4 bytes as a big-endian value. On a big-endian architecture, it ** returns the value that would be produced by interpreting the 4 bytes ** of the input value as a little-endian integer. 
*/ #define BYTESWAP32(x) ( \ (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \ + (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \ ) static const int one = 1; #define LSM_LITTLE_ENDIAN (*(u8 *)(&one)) /* Sizes, in integers, of various parts of the checkpoint. */ #define CKPT_HDR_SIZE 8 #define CKPT_LOGPTR_SIZE 4 #define CKPT_SEGMENT_SIZE 4 #define CKPT_CKSUM_SIZE 2 #define CKPT_APPENDLIST_SIZE LSM_APPLIST_SZ /* A #define to describe each integer in the checkpoint header. */ #define CKPT_HDR_ID_MSW 0 #define CKPT_HDR_ID_LSW 1 #define CKPT_HDR_NCKPT 2 #define CKPT_HDR_NBLOCK 3 #define CKPT_HDR_BLKSZ 4 #define CKPT_HDR_NLEVEL 5 #define CKPT_HDR_PGSZ 6 #define CKPT_HDR_OVFL 7 #define CKPT_HDR_LO_MSW 8 #define CKPT_HDR_LO_LSW 9 #define CKPT_HDR_LO_CKSUM1 10 #define CKPT_HDR_LO_CKSUM2 11 typedef struct CkptBuffer CkptBuffer; /* ** Dynamic buffer used to accumulate data for a checkpoint. */ struct CkptBuffer { lsm_env *pEnv; int nAlloc; u32 *aCkpt; }; /* ** Calculate the checksum of the checkpoint specified by arguments aCkpt and ** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning. ** ** The value of the nCkpt parameter includes the two checksum values at ** the end of the checkpoint. They are not used as inputs to the checksum ** calculation. The checksum is based on the array of (nCkpt-2) integers ** at aCkpt[]. */ static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){ int i; u32 cksum1 = 1; u32 cksum2 = 2; if( nCkpt % 2 ){ cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF; cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000; } for(i=0; (i+3)<nCkpt; i+=2){ cksum1 += cksum2 + aCkpt[i]; cksum2 += cksum1 + aCkpt[i+1]; } *piCksum1 = cksum1; *piCksum2 = cksum2; } /* ** Set integer iIdx of the checkpoint accumulating in buffer *p to iVal. 
*/ static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){ if( *pRc ) return; if( iIdx>=p->nAlloc ){ int nNew = LSM_MAX(8, iIdx*2); p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32)); if( !p->aCkpt ){ *pRc = LSM_NOMEM_BKPT; return; } p->nAlloc = nNew; } p->aCkpt[iIdx] = iVal; } /* ** Argument aInt points to an array nInt elements in size. Switch the ** endian-ness of each element of the array. */ static void ckptChangeEndianness(u32 *aInt, int nInt){ if( LSM_LITTLE_ENDIAN ){ int i; for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]); } } /* ** Object *p contains a checkpoint in native byte-order. The checkpoint is ** nCkpt integers in size, not including any checksum. This function sets ** the two checksum elements of the checkpoint accordingly. */ static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){ if( *pRc==LSM_OK ){ u32 aCksum[2] = {0, 0}; ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]); ckptSetValue(p, nCkpt, aCksum[0], pRc); ckptSetValue(p, nCkpt+1, aCksum[1], pRc); } } /* ** Append a 6-value segment record corresponding to pSeg to the checkpoint |
︙ | ︙ | |||
248 249 250 251 252 253 254 | ckptSetValue(p, iOut++, pSeg->iRoot, pRc); ckptSetValue(p, iOut++, pSeg->nSize, pRc); *piOut = iOut; } static void ckptExportLevel( | | | | | | 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 | ckptSetValue(p, iOut++, pSeg->iRoot, pRc); ckptSetValue(p, iOut++, pSeg->nSize, pRc); *piOut = iOut; } static void ckptExportLevel( Level *pLevel, /* Level object to serialize */ CkptBuffer *p, /* Append new level record to this ckpt */ int *piOut, /* IN/OUT: Size of checkpoint so far */ int *pRc /* IN/OUT: Error code */ ){ int iOut = *piOut; Merge *pMerge; pMerge = pLevel->pMerge; ckptSetValue(p, iOut++, pLevel->iAge, pRc); ckptSetValue(p, iOut++, pLevel->nRight, pRc); |
︙ | ︙ | |||
284 285 286 287 288 289 290 | ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc); } *piOut = iOut; } /* | | | > > > > > > | > > > > | | | | > > > | > > > > > > > > > > > > > > > > > > | < < < < < | < < < < < < < < < < < < < < < < < < < | | < | | < > | > | > > > | > > > > | | | < < | | | < < | | > | < < < < < | < | < < | | | > > > > > > | 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 | ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc); } *piOut = iOut; } /* ** Populate the log offset fields of the checkpoint buffer. 4 values. 
*/ static void ckptExportLog( lsm_db *pDb, int bFlush, CkptBuffer *p, int *piOut, int *pRc ){ int iOut = *piOut; assert( iOut==CKPT_HDR_LO_MSW ); if( bFlush ){ DbLog *pLog = &pDb->treehdr.log; i64 iOff = pLog->aRegion[2].iEnd; ckptSetValue(p, iOut++, (iOff >> 32) & 0xFFFFFFFF, pRc); ckptSetValue(p, iOut++, (iOff & 0xFFFFFFFF), pRc); ckptSetValue(p, iOut++, pLog->cksum0, pRc); ckptSetValue(p, iOut++, pLog->cksum1, pRc); }else{ for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){ ckptSetValue(p, iOut, pDb->pShmhdr->aWorker[iOut], pRc); } } *piOut = iOut; } static void ckptExportAppendlist( lsm_db *db, /* Database connection */ CkptBuffer *p, /* Checkpoint buffer to write to */ int *piOut, /* IN/OUT: Offset within checkpoint buffer */ int *pRc /* IN/OUT: Error code */ ){ int i; int iOut = *piOut; u32 *aiAppend = db->pWorker->aiAppend; for(i=0; i<CKPT_APPENDLIST_SIZE; i++){ ckptSetValue(p, iOut++, aiAppend[i], pRc); } *piOut = iOut; }; static int ckptExportSnapshot( lsm_db *pDb, /* Connection handle */ int nOvfl, /* Number of free-list entries in LSM */ int bLog, /* True to update log-offset fields */ i64 iId, /* Checkpoint id */ int bCksum, /* If true, include checksums */ void **ppCkpt, /* OUT: Buffer containing checkpoint */ int *pnCkpt /* OUT: Size of checkpoint in bytes */ ){ int rc = LSM_OK; /* Return Code */ FileSystem *pFS = pDb->pFS; /* File system object */ Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */ int nLevel = 0; /* Number of levels in checkpoint */ int iLevel; /* Used to count out nLevel levels */ int iOut = 0; /* Current offset in aCkpt[] */ Level *pLevel; /* Level iterator */ int i; /* Iterator used while serializing freelist */ CkptBuffer ckpt; int nFree; nFree = pSnap->freelist.nEntry; if( nOvfl>=0 ){ nFree -= nOvfl; }else{ nOvfl = pDb->pShmhdr->aWorker[CKPT_HDR_OVFL]; } /* Initialize the output buffer */ memset(&ckpt, 0, sizeof(CkptBuffer)); ckpt.pEnv = pDb->pEnv; iOut = CKPT_HDR_SIZE; /* Write the log offset into the checkpoint. 
*/ ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc); /* Write the append-point list */ ckptExportAppendlist(pDb, &ckpt, &iOut, &rc); /* Figure out how many levels will be written to the checkpoint. */ for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++; /* Serialize nLevel levels. */ iLevel = 0; for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){ ckptExportLevel(pLevel, &ckpt, &iOut, &rc); iLevel++; } /* Write the freelist */ if( rc==LSM_OK ){ ckptSetValue(&ckpt, iOut++, nFree, &rc); for(i=0; i<nFree; i++){ FreelistEntry *p = &pSnap->freelist.aEntry[i]; ckptSetValue(&ckpt, iOut++, p->iBlk, &rc); ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc); ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc); } } /* Write the checkpoint header */ assert( iId>=0 ); ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc); ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc); ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc); ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc); ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc); ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc); ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc); ckptSetValue(&ckpt, CKPT_HDR_OVFL, nOvfl, &rc); if( bCksum ){ ckptAddChecksum(&ckpt, iOut, &rc); }else{ ckptSetValue(&ckpt, iOut, 0, &rc); ckptSetValue(&ckpt, iOut+1, 0, &rc); } iOut += 2; assert( iOut<=1024 ); #if 0 lsmLogMessage(pDb, rc, "ckptExportSnapshot(): id=%d freelist: %d/%d", (int)iId, nFree, nOvfl ); #endif *ppCkpt = (void *)ckpt.aCkpt; if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut; return rc; } /* |
︙ | ︙ | |||
523 524 525 526 527 528 529 | } *ppLevel = pRet; *piIn = iIn; return rc; } | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 557 558 559 560 561 562 563 564 565 566 567 568 569 570 | } *ppLevel = pRet; *piIn = iIn; return rc; } int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){ int rc = LSM_OK; if( nVal>0 ){ u32 *aIn; aIn = lsmMallocRc(pDb->pEnv, nVal, &rc); |
︙ | ︙ | |||
625 626 627 628 629 630 631 | } } } return rc; } | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 591 592 593 594 595 596 597 598 599 600 601 602 603 604 | } } } return rc; } /* ** Return the data for the LEVELS record. ** ** The size of the checkpoint that can be stored in the database header ** must not exceed 1024 32-bit integers. Normally, it does not. However, ** if it does, part of the checkpoint must be stored in the LSM. This ** routine returns that part. |
︙ | ︙ | |||
769 770 771 772 773 774 775 | *paVal = 0; } return rc; } /* | | < < < < > | < < < > > | > > > | > > > | > > > > > > > > > > > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > | > > > > | > > > > | < > > > | > > > > > > > > > > | > | > | > > > | > > > | > > | > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | < < | > > > > > > > > > > > | > > > > > | > > > | > > > > > > > > > > > | > > > > > | > > > > | > > > > | > > > > > > > > > > > | > > | > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > | > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > | > > > > > > > > > > > > > > > > | > | > > | > > > | > > > > > > > > > > > > > > > > > > > > > > > | > > > > > | > > | > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > | | | < < < < < < | < < | < | < < < | | | < < | < < < < < < < < > > > > | > > > | > | > | > > | > > > > > > | | > > > > > > > | > > > > > | > > | 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 
837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 | *paVal = 0; } return rc; } /* ** The worker lock must be held to call this function. ** ** The function serializes and returns the data that should be stored as ** the FREELIST system record. 
*/
/*
** Outputs of lsmCheckpointOverflow():
**
**   *ppVal   Buffer holding the serialized entries (3 big-endian 32-bit
**            integers per entry: block number, id MSW, id LSW), or NULL if
**            there is nothing to store.
**   *pnVal   Size of *ppVal in bytes.
**   *pnOvfl  Number of free-list entries serialized into *ppVal.
**
** All three outputs are zeroed before anything else happens, so they hold
** defined values on every return path. If an error occurs (including an
** allocation failure while building the blob) the error code is returned
** and the outputs are left at NULL/0/0.
**
** Only the entries beyond the first pDb->nMaxFreelist are serialized; if
** the free-list has no more than pDb->nMaxFreelist entries the function
** succeeds with NULL/0/0 outputs.
*/
int lsmCheckpointOverflow(
  lsm_db *pDb,                    /* Database handle (must hold worker lock) */
  void **ppVal,                   /* OUT: lsmMalloc'd buffer */
  int *pnVal,                     /* OUT: Size of *ppVal in bytes */
  int *pnOvfl                     /* OUT: Number of freelist entries in buf */
){
  int rc = LSM_OK;
  int nRet = 0;
  Snapshot *p = pDb->pWorker;

  assert( lsmShmAssertWorker(pDb) );
  assert( pnOvfl && ppVal && pnVal );
  assert( pDb->nMaxFreelist>=2 && pDb->nMaxFreelist<=LSM_MAX_FREELIST_ENTRIES );

  /* Give the outputs defined values for the early error return below. */
  *ppVal = 0;
  *pnVal = 0;
  *pnOvfl = 0;

  /* If some free-list entries currently live only in the FREELIST record,
  ** merge them into the in-memory list before deciding what overflows. */
  if( p->nFreelistOvfl ){
    rc = lsmCheckpointOverflowLoad(pDb, &p->freelist);
    if( rc!=LSM_OK ) return rc;
    p->nFreelistOvfl = 0;
  }

  if( p->freelist.nEntry>pDb->nMaxFreelist ){
    int i;                        /* Iterator variable */
    int iOut = 0;                 /* Current size of blob in ckpt */
    CkptBuffer ckpt;              /* Used to build FREELIST blob */

    nRet = (p->freelist.nEntry - pDb->nMaxFreelist);

    memset(&ckpt, 0, sizeof(CkptBuffer));
    ckpt.pEnv = pDb->pEnv;
    for(i=p->freelist.nEntry-nRet; rc==LSM_OK && i<p->freelist.nEntry; i++){
      FreelistEntry *pEntry = &p->freelist.aEntry[i];
      ckptSetValue(&ckpt, iOut++, pEntry->iBlk, &rc);
      ckptSetValue(&ckpt, iOut++, (pEntry->iId >> 32) & 0xFFFFFFFF, &rc);
      ckptSetValue(&ckpt, iOut++, pEntry->iId & 0xFFFFFFFF, &rc);
    }

    if( rc==LSM_OK ){
      ckptChangeEndianness(ckpt.aCkpt, iOut);
      *ppVal = ckpt.aCkpt;
      *pnVal = (int)(iOut*sizeof(u32));
    }else{
      /* ckptSetValue() failed. iOut kept advancing after the failure, so
      ** it no longer describes the buffer: do not byte-swap or publish it.
      ** The buffer pointer is expected to be NULL here; releasing it is a
      ** harmless safeguard. */
      lsmFree(pDb->pEnv, ckpt.aCkpt);
      nRet = 0;
    }
  }

  *pnOvfl = nRet;
  return rc;
}

/*
** The connection must be the worker in order to call this function.
**
** True is returned if there are currently too many free-list entries
** in-memory to store in a checkpoint. Before calling lsmCheckpointSaveWorker()
** to save the current worker snapshot, a new top-level LSM segment must
** be created so that some of them can be written to the LSM.
*/
int lsmCheckpointOverflowRequired(lsm_db *pDb){
  assert( lsmShmAssertWorker(pDb) );
  return (pDb->pWorker->freelist.nEntry > pDb->nMaxFreelist);
}

/*
** Connection pDb must be the worker to call this function.
**
** Load the FREELIST record from the database. Decode it and append the
** results to list pFreelist.
*/ int lsmCheckpointOverflowLoad( lsm_db *pDb, Freelist *pFreelist ){ int rc; int nVal = 0; void *pVal = 0; assert( lsmShmAssertWorker(pDb) ); /* Load the blob of data from the LSM. If that is successful (and the ** blob is greater than zero bytes in size), decode the contents and ** merge them into the current contents of *pFreelist. */ rc = lsmSortedLoadFreelist(pDb, &pVal, &nVal); if( pVal ){ u32 *aFree = (u32 *)pVal; int nFree = nVal / sizeof(int); ckptChangeEndianness(aFree, nFree); if( (nFree % 3) ){ rc = LSM_CORRUPT_BKPT; }else{ int iNew = 0; /* Offset of next element in aFree[] */ int iOld = 0; /* Next element in freelist fl */ Freelist fl = *pFreelist; /* Original contents of *pFreelist */ memset(pFreelist, 0, sizeof(Freelist)); while( rc==LSM_OK && (iNew<nFree || iOld<fl.nEntry) ){ int iBlk; i64 iId; if( iOld>=fl.nEntry ){ iBlk = aFree[iNew]; iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2]; iNew += 3; }else if( iNew>=nFree ){ iBlk = fl.aEntry[iOld].iBlk; iId = fl.aEntry[iOld].iId; iOld += 1; }else{ iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2]; if( iId<fl.aEntry[iOld].iId ){ iBlk = aFree[iNew]; iNew += 3; }else{ iBlk = fl.aEntry[iOld].iBlk; iId = fl.aEntry[iOld].iId; iOld += 1; } } rc = lsmFreelistAppend(pDb->pEnv, pFreelist, iBlk, iId); } lsmFree(pDb->pEnv, fl.aEntry); #ifdef LSM_DEBUG if( rc==LSM_OK ){ int i; for(i=1; rc==LSM_OK && i<pFreelist->nEntry; i++){ assert( pFreelist->aEntry[i].iId >= pFreelist->aEntry[i-1].iId ); } assert( pFreelist->nEntry==(fl.nEntry + nFree/3) ); } #endif } lsmFree(pDb->pEnv, pVal); } return rc; } /* ** Read the checkpoint id from meta-page pPg. */ static i64 ckptLoadId(MetaPage *pPg){ i64 ret = 0; if( pPg ){ int nData; u8 *aData = lsmFsMetaPageData(pPg, &nData); ret = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32) + ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4])); } return ret; } /* ** Return true if the buffer passed as an argument contains a valid ** checkpoint. 
*/ static int ckptChecksumOk(u32 *aCkpt){ u32 nCkpt = aCkpt[CKPT_HDR_NCKPT]; u32 cksum1; u32 cksum2; if( nCkpt<CKPT_HDR_NCKPT || nCkpt>(LSM_META_PAGE_SIZE)/sizeof(u32) ) return 0; ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2); return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]); } /* ** Attempt to load a checkpoint from meta page iMeta. ** ** This function is a no-op if *pRc is set to any value other than LSM_OK ** when it is called. If an error occurs, *pRc is set to an LSM error code ** before returning. ** ** If no error occurs and the checkpoint is successfully loaded, copy it to ** ShmHeader.aClient[] and ShmHeader.aWorker[], and set ShmHeader.iMetaPage ** to indicate its origin. In this case return 1. Or, if the checkpoint ** cannot be loaded (because the checksum does not compute), return 0. */ static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){ int bLoaded = 0; /* Return value */ if( *pRc==LSM_OK ){ int rc = LSM_OK; /* Error code */ u32 *aCkpt = 0; /* Pointer to buffer containing checkpoint */ u32 nCkpt; /* Number of elements in aCkpt[] */ int nData; /* Bytes of data in aData[] */ u8 *aData; /* Meta page data */ aData = lsmFsMetaPageData(pPg, &nData); nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]); if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){ aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc); } if( aCkpt ){ memcpy(aCkpt, aData, nCkpt*sizeof(u32)); ckptChangeEndianness(aCkpt, nCkpt); if( ckptChecksumOk(aCkpt) ){ ShmHeader *pShm = pDb->pShmhdr; memcpy(pShm->aClient, aCkpt, nCkpt*sizeof(u32)); memcpy(pShm->aWorker, aCkpt, nCkpt*sizeof(u32)); memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32)); pShm->iMetaPage = iMeta; bLoaded = 1; } } lsmFree(pDb->pEnv, aCkpt); *pRc = rc; } return bLoaded; } /* ** Initialize the shared-memory header with an empty snapshot. This function ** is called when no valid snapshot can be found in the database header. 
*/
static void ckptLoadEmpty(lsm_db *pDb){
  u32 aCkpt[] = {
    0,                  /* CKPT_HDR_ID_MSW */
    10,                 /* CKPT_HDR_ID_LSW */
    0,                  /* CKPT_HDR_NCKPT */
    0,                  /* CKPT_HDR_NBLOCK */
    0,                  /* CKPT_HDR_BLKSZ */
    0,                  /* CKPT_HDR_NLEVEL */
    0,                  /* CKPT_HDR_PGSZ */
    0,                  /* CKPT_HDR_OVFL */
    0, 0, 1234, 5678,   /* The log pointer and initial checksum */
    0, 0, 0, 0,         /* The append list */
    0,                  /* The free block list */
    0, 0                /* Space for checksum values */
  };
  const u32 nWord = array_size(aCkpt);   /* Size of aCkpt[] in 32-bit words */
  ShmHeader *pHdr = pDb->pShmhdr;

  /* Fill in the fields that depend on the connection configuration */
  aCkpt[CKPT_HDR_NCKPT] = nWord;
  aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz;
  aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz;

  /* The checksum is stored in the last two words of the array */
  ckptChecksum(aCkpt, nWord, &aCkpt[nWord-2], &aCkpt[nWord-1]);

  /* Publish the empty checkpoint to shared memory and to this connection */
  memcpy(pHdr->aClient, aCkpt, nWord*sizeof(u32));
  memcpy(pHdr->aWorker, aCkpt, nWord*sizeof(u32));
  memcpy(pDb->aSnapshot, aCkpt, nWord*sizeof(u32));
}

/*
** This function is called as part of database recovery to initialize the
** ShmHeader.aClient[] and ShmHeader.aWorker[] snapshots.
**
** The meta-page with the larger checkpoint id is tried first, then the
** other one. If neither yields a loadable checkpoint, an empty checkpoint
** is installed instead.
*/
int lsmCheckpointRecover(lsm_db *pDb){
  MetaPage *apPg[2] = {0, 0};     /* Meta-pages 1 and 2 */
  i64 iId1;                       /* Id of checkpoint on meta-page 1 */
  i64 iId2;                       /* Id of checkpoint on meta-page 2 */
  int iFirst;                     /* Index in apPg[] of page to try first */
  int iSecond;                    /* Index in apPg[] of page to try second */
  int bLoaded;                    /* True once checkpoint has been loaded */
  int rc;                         /* Return Code */

  rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]);
  if( rc==LSM_OK ){
    rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]);
  }

  /* ckptLoadId() returns 0 for a NULL page */
  iId1 = ckptLoadId(apPg[0]);
  iId2 = ckptLoadId(apPg[1]);
  iFirst = (iId2 > iId1) ? 1 : 0;
  iSecond = 1 - iFirst;

  /* Meta-page numbers are one greater than the index into apPg[] */
  bLoaded = ckptTryLoad(pDb, apPg[iFirst], iFirst+1, &rc);
  if( !bLoaded ){
    bLoaded = ckptTryLoad(pDb, apPg[iSecond], iSecond+1, &rc);
  }

  /* The database does not contain a valid checkpoint. Initialize the shared
  ** memory header with an empty checkpoint.  */
  if( !bLoaded ){
    ckptLoadEmpty(pDb);
  }

  lsmFsMetaPageRelease(apPg[0]);
  lsmFsMetaPageRelease(apPg[1]);
  return rc;
}

/*
** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta.
*/ int lsmCheckpointStore(lsm_db *pDb, int iMeta){ MetaPage *pPg = 0; int rc; assert( iMeta==1 || iMeta==2 ); rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg); if( rc==LSM_OK ){ u8 *aData; int nData; int nCkpt; nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT]; aData = lsmFsMetaPageData(pPg, &nData); memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32)); ckptChangeEndianness((u32 *)aData, nCkpt); rc = lsmFsMetaPageRelease(pPg); } return rc; } /* ** Copy the current client snapshot from shared-memory to pDb->aSnapshot[]. */ int lsmCheckpointLoad(lsm_db *pDb){ while( 1 ){ int rc; int nInt; ShmHeader *pShm = pDb->pShmhdr; nInt = pShm->aClient[CKPT_HDR_NCKPT]; memcpy(pDb->aSnapshot, pShm->aClient, nInt*sizeof(u32)); if( ckptChecksumOk(pDb->aSnapshot) ) return LSM_OK; rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0); if( rc==LSM_BUSY ){ usleep(50); }else{ if( rc==LSM_OK ){ if( ckptChecksumOk(pShm->aClient)==0 ){ nInt = pShm->aWorker[CKPT_HDR_NCKPT]; memcpy(pShm->aClient, pShm->aWorker, nInt*sizeof(u32)); } nInt = pShm->aClient[CKPT_HDR_NCKPT]; memcpy(pDb->aSnapshot, &pShm->aClient, nInt*sizeof(u32)); lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0); if( ckptChecksumOk(pDb->aSnapshot)==0 ){ rc = LSM_CORRUPT_BKPT; } } return rc; } } } int lsmCheckpointLoadWorker(lsm_db *pDb){ int rc; ShmHeader *pShm = pDb->pShmhdr; /* Must be holding the WORKER lock to do this */ assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) ); if( ckptChecksumOk(pShm->aWorker)==0 ){ int nInt = (int)pShm->aClient[CKPT_HDR_NCKPT]; memcpy(pShm->aWorker, pShm->aClient, nInt*sizeof(u32)); if( ckptChecksumOk(pShm->aWorker)==0 ) return LSM_CORRUPT_BKPT; } rc = lsmCheckpointDeserialize(pDb, 1, pShm->aWorker, &pDb->pWorker); assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) ); return rc; } int lsmCheckpointDeserialize( lsm_db *pDb, int bInclFreelist, /* If true, deserialize free-list */ u32 *aCkpt, Snapshot **ppSnap ){ int rc = LSM_OK; Snapshot *pNew; pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, 
sizeof(Snapshot), &rc); if( rc==LSM_OK ){ int nFree; int nCopy; int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL]; int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE; pNew->iId = lsmCheckpointId(aCkpt, 0); pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK]; rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel); /* Make a copy of the append-list */ nCopy = sizeof(u32) * LSM_APPLIST_SZ; memcpy(pNew->aiAppend, &aCkpt[CKPT_HDR_SIZE+CKPT_LOGPTR_SIZE], nCopy); /* Copy the free-list */ if( bInclFreelist ){ pNew->nFreelistOvfl = aCkpt[CKPT_HDR_OVFL]; nFree = aCkpt[iIn++]; if( nFree ){ pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc( pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc ); if( rc==LSM_OK ){ int i; for(i=0; i<nFree; i++){ FreelistEntry *p = &pNew->freelist.aEntry[i]; p->iBlk = aCkpt[iIn++]; p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1]; iIn += 2; } pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree; } } } } if( rc!=LSM_OK ){ lsmFreeSnapshot(pDb->pEnv, pNew); pNew = 0; } *ppSnap = pNew; return rc; } /* ** Connection pDb must be the worker connection in order to call this ** function. It returns true if the database already contains the maximum ** number of levels or false otherwise. ** ** This is used when flushing the in-memory tree to disk. If the database ** is already full, then the caller should invoke lsm_work() or similar ** until it is not full before creating a new level by flushing the in-memory ** tree to disk. Limiting the number of levels in the database ensures that ** the records describing them always fit within the checkpoint blob. */ int lsmDatabaseFull(lsm_db *pDb){ Level *p; int nRhs = 0; assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) ); assert( pDb->pWorker ); for(p=pDb->pWorker->pLevel; p; p=p->pNext){ nRhs += (p->nRight ? p->nRight : 1); } return (nRhs >= LSM_MAX_RHS_SEGMENTS); } /* ** The connection passed as the only argument is currently the worker ** connection. 
Some work has been performed on the database by the connection, ** but no new snapshot has been written into shared memory. ** ** This function updates the shared-memory worker and client snapshots with ** the new snapshot produced by the work performed by pDb. ** ** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM ** error code is returned. */ int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush, int nOvfl){ Snapshot *pSnap = pDb->pWorker; ShmHeader *pShm = pDb->pShmhdr; void *p = 0; int n = 0; int rc; rc = ckptExportSnapshot(pDb, nOvfl, bFlush, pSnap->iId+1, 1, &p, &n); if( rc!=LSM_OK ) return rc; assert( ckptChecksumOk((u32 *)p) ); assert( n<=LSM_META_PAGE_SIZE ); memcpy(pShm->aWorker, p, n); lsmShmBarrier(pDb); memcpy(pShm->aClient, p, n); lsmFree(pDb->pEnv, p); return LSM_OK; } int lsmCheckpointSynced(lsm_db *pDb, i64 *piId){ int rc = LSM_OK; const int nAttempt = 3; int i; for(i=0; i<nAttempt; i++){ MetaPage *pPg; u32 iMeta; iMeta = pDb->pShmhdr->iMetaPage; rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg); if( rc==LSM_OK ){ int nCkpt; int nData; u8 *aData; aData = lsmFsMetaPageData(pPg, &nData); assert( nData==LSM_META_PAGE_SIZE ); nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]); if( nCkpt<(LSM_META_PAGE_SIZE/sizeof(u32)) ){ u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc); if( aCopy ){ memcpy(aCopy, aData, nCkpt*sizeof(u32)); ckptChangeEndianness(aCopy, nCkpt); if( ckptChecksumOk(aCopy) ){ *piId = lsmCheckpointId(aCopy, 0); } lsmFree(pDb->pEnv, aCopy); } } lsmFsMetaPageRelease(pPg); } if( rc!=LSM_OK || pDb->pShmhdr->iMetaPage==iMeta ) break; } return (rc==LSM_OK && i==3) ? LSM_BUSY : LSM_OK; } /* ** Return the checkpoint-id of the checkpoint array passed as the first ** argument to this function. If the second argument is true, then assume ** that the checkpoint is made up of 32-bit big-endian integers. If it ** is false, assume that the integers are in machine byte order. 
*/ i64 lsmCheckpointId(u32 *aCkpt, int bDisk){ i64 iId; if( bDisk ){ u8 *aData = (u8 *)aCkpt; iId = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32); iId += ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4])); }else{ iId = ((i64)aCkpt[CKPT_HDR_ID_MSW] << 32) + (i64)aCkpt[CKPT_HDR_ID_LSW]; } return iId; } i64 lsmCheckpointLogOffset(u32 *aCkpt){ return ((i64)aCkpt[CKPT_HDR_LO_MSW] << 32) + (i64)aCkpt[CKPT_HDR_LO_LSW]; } int lsmCheckpointPgsz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_PGSZ]; } int lsmCheckpointBlksz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_BLKSZ]; } void lsmCheckpointLogoffset( u32 *aCkpt, DbLog *pLog ){ u32 iOffMSB = aCkpt[CKPT_HDR_LO_MSW]; u32 iOffLSB = aCkpt[CKPT_HDR_LO_LSW]; pLog->aRegion[2].iStart = (((i64)iOffMSB) << 32) + ((i64)iOffLSB); pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1]; pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2]; } void lsmCheckpointZeroLogoffset(lsm_db *pDb){ u32 nCkpt; nCkpt = pDb->aSnapshot[CKPT_HDR_NCKPT]; assert( nCkpt>CKPT_HDR_NCKPT ); assert( nCkpt==pDb->pShmhdr->aClient[CKPT_HDR_NCKPT] ); assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aClient, nCkpt*sizeof(u32)) ); assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aWorker, nCkpt*sizeof(u32)) ); pDb->aSnapshot[CKPT_HDR_LO_MSW] = 0; pDb->aSnapshot[CKPT_HDR_LO_LSW] = 0; ckptChecksum(pDb->aSnapshot, nCkpt, &pDb->aSnapshot[nCkpt-2], &pDb->aSnapshot[nCkpt-1] ); memcpy(pDb->pShmhdr->aClient, pDb->aSnapshot, nCkpt*sizeof(u32)); memcpy(pDb->pShmhdr->aWorker, pDb->aSnapshot, nCkpt*sizeof(u32)); } |
Changes to src/lsm_file.c.
︙ | ︙ | |||
29 30 31 32 33 34 35 | ** exist - since it would always overlap with the meta pages. If the ** page-size is (say) 512 bytes, then the first usable page in the database ** is page 33. ** ** It is assumed that the first two meta pages and the data that follows ** them are located on different disk sectors. So that if a power failure ** while writing to a meta page there is no risk of damage to the other | | > | | 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | ** exist - since it would always overlap with the meta pages. If the ** page-size is (say) 512 bytes, then the first usable page in the database ** is page 33. ** ** It is assumed that the first two meta pages and the data that follows ** them are located on different disk sectors. So that if a power failure ** while writing to a meta page there is no risk of damage to the other ** meta page or any other part of the database file. TODO: This may need ** to be revisited. ** ** Blocks: ** ** The database file is also divided into blocks. The default block size is ** 2MB. When writing to the database file, an attempt is made to write data ** in contiguous block-sized chunks. ** ** The first and last page on each block are special in that they are 4 ** bytes smaller than all other pages. This is because the last four bytes ** of space on the first and last pages of each block are reserved for ** pointers to other blocks (i.e. a 32-bit block number). ** ** Runs: ** ** A run is a sequence of pages that the upper layer uses to store a ** sorted array of database keys (and accompanying data - values, FC ** pointers and so on). Given a page within a run, it is possible to |
︙ | ︙ | |||
73 74 75 76 77 78 79 80 81 82 83 84 85 86 | ** THE LOG FILE ** ** This file opens and closes the log file. But it does not contain any ** logic related to the log file format. Instead, it exports the following ** functions that are used by the code in lsm_log.c to read and write the ** log file: ** ** lsmFsWriteLog ** lsmFsSyncLog ** lsmFsReadLog ** lsmFsTruncateLog ** lsmFsCloseAndDeleteLog ** */ | > | 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | ** THE LOG FILE ** ** This file opens and closes the log file. But it does not contain any ** logic related to the log file format. Instead, it exports the following ** functions that are used by the code in lsm_log.c to read and write the ** log file: ** ** lsmFsOpenLog ** lsmFsWriteLog ** lsmFsSyncLog ** lsmFsReadLog ** lsmFsTruncateLog ** lsmFsCloseAndDeleteLog ** */ |
︙ | ︙ | |||
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | ** ** In non-mmap() mode, this list is an LRU list of cached pages with nRef==0. */ struct FileSystem { lsm_db *pDb; /* Database handle that owns this object */ lsm_env *pEnv; /* Environment pointer */ char *zDb; /* Database file name */ int nMetasize; /* Size of meta pages in bytes */ int nPagesize; /* Database page-size in bytes */ int nBlocksize; /* Database block-size in bytes */ /* r/w file descriptors for both files. */ lsm_file *fdDb; /* Database file */ lsm_file *fdLog; /* Log file */ /* mmap() mode things */ int bUseMmap; /* True to use mmap() to access db file */ void *pMap; /* Current mapping of database file */ i64 nMap; /* Bytes mapped at pMap */ | > > | 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | ** ** In non-mmap() mode, this list is an LRU list of cached pages with nRef==0. */ struct FileSystem { lsm_db *pDb; /* Database handle that owns this object */ lsm_env *pEnv; /* Environment pointer */ char *zDb; /* Database file name */ char *zLog; /* Database file name */ int nMetasize; /* Size of meta pages in bytes */ int nPagesize; /* Database page-size in bytes */ int nBlocksize; /* Database block-size in bytes */ /* r/w file descriptors for both files. */ LsmFile *pLsmFile; lsm_file *fdDb; /* Database file */ lsm_file *fdLog; /* Log file */ /* mmap() mode things */ int bUseMmap; /* True to use mmap() to access db file */ void *pMap; /* Current mapping of database file */ i64 nMap; /* Bytes mapped at pMap */ |
︙ | ︙ | |||
189 190 191 192 193 194 195 | ** lsmEnvSync() ** lsmEnvSectorSize() ** lsmEnvClose() ** lsmEnvTruncate() ** lsmEnvUnlink() ** lsmEnvRemap() */ | | | 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | ** lsmEnvSync() ** lsmEnvSectorSize() ** lsmEnvClose() ** lsmEnvTruncate() ** lsmEnvUnlink() ** lsmEnvRemap() */ int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){ return pEnv->xOpen(pEnv, zFile, ppNew); } static int lsmEnvRead( lsm_env *pEnv, lsm_file *pFile, lsm_i64 iOff, void *pRead, |
︙ | ︙ | |||
216 217 218 219 220 221 222 | } static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xSync(pFile); } static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xSectorSize(pFile); } | | > > > > > > > > > > > > > > > > > > > > > > > > > > | | > | 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | } static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xSync(pFile); } static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xSectorSize(pFile); } int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xClose(pFile); } static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){ return pEnv->xTruncate(pFile, nByte); } static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){ return pEnv->xUnlink(pEnv, zDel); } static int lsmEnvRemap( lsm_env *pEnv, lsm_file *pFile, i64 szMin, void **ppMap, i64 *pszMap ){ return pEnv->xRemap(pFile, szMin, ppMap, pszMap); } int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){ if( pFile==0 ) return LSM_OK; return pEnv->xLock(pFile, iLock, eLock); } int lsmEnvShmMap( lsm_env *pEnv, lsm_file *pFile, int iChunk, int sz, void **ppOut ){ return pEnv->xShmMap(pFile, iChunk, sz, ppOut); } void lsmEnvShmBarrier(lsm_env *pEnv){ return pEnv->xShmBarrier(); } void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){ return pEnv->xShmUnmap(pFile, bDel); } /* ** Write the contents of string buffer pStr into the log file, starting at ** offset iOff. */ int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){ assert( pFS->fdLog ); return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n); } /* ** fsync() the log file. 
*/
int lsmFsSyncLog(FileSystem *pFS){
  lsm_file *pLog = pFS->fdLog;    /* Log file handle; must already be open */
  assert( pLog );
  return lsmEnvSync(pFS->pEnv, pLog);
}

/*
** Read nRead bytes of data starting at offset iOff of the log file. Append
** the results to string buffer pStr.
**
** If the buffer cannot be extended, the error is returned and the log file
** is not read. Otherwise pStr->n is advanced by nRead and the result of
** the read is returned.
*/
int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
  int rc;                         /* Return code */

  assert( pFS->fdLog );
  rc = lsmStringExtend(pStr, nRead);
  if( rc!=LSM_OK ) return rc;

  /* Read directly into the space just reserved at the end of the buffer */
  rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
  pStr->n += nRead;
  return rc;
}
 |
︙ | ︙ | |||
308 309 310 311 312 313 314 | static lsm_file *fsOpenFile( FileSystem *pFS, /* File system object */ int bLog, /* True for log, false for db */ int *pRc /* IN/OUT: Error code */ ){ lsm_file *pFile = 0; if( *pRc==LSM_OK ){ | < < < < < | | | | > > > > > > > > > > > > > > > | > > > | > > | | | > > | < | 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 | static lsm_file *fsOpenFile( FileSystem *pFS, /* File system object */ int bLog, /* True for log, false for db */ int *pRc /* IN/OUT: Error code */ ){ lsm_file *pFile = 0; if( *pRc==LSM_OK ){ *pRc = lsmEnvOpen(pFS->pEnv, (bLog ? pFS->zLog : pFS->zDb), &pFile); } return pFile; } /* ** If it is not already open, this function opens the log file. It returns ** LSM_OK if successful (or if the log file was already open) or an LSM ** error code otherwise. ** ** The log file must be opened before any of the following may be called: ** ** lsmFsWriteLog ** lsmFsSyncLog ** lsmFsReadLog */ int lsmFsOpenLog(FileSystem *pFS){ int rc = LSM_OK; if( 0==pFS->fdLog ){ pFS->fdLog = fsOpenFile(pFS, 1, &rc); } return rc; } /* ** Open a connection to a database stored within the file-system (the ** "system of files"). */ int lsmFsOpen(lsm_db *pDb, const char *zDb){ FileSystem *pFS; int rc = LSM_OK; int nDb = strlen(zDb); int nByte; assert( pDb->pFS==0 ); assert( pDb->pWorker==0 && pDb->pClient==0 ); nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1; pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc); if( pFS ){ pFS->zDb = (char *)&pFS[1]; pFS->zLog = &pFS->zDb[nDb+1]; pFS->nPagesize = LSM_PAGE_SIZE; pFS->nBlocksize = LSM_BLOCK_SIZE; pFS->nMetasize = 4 * 1024; pFS->pDb = pDb; pFS->pEnv = pDb->pEnv; /* Make a copy of the database and log file names. 
*/ memcpy(pFS->zDb, zDb, nDb+1); memcpy(pFS->zLog, zDb, nDb); memcpy(&pFS->zLog[nDb], "-log", 5); /* Allocate the hash-table here. At some point, it should be changed ** so that it can grow dynamicly. */ pFS->nCacheMax = 2048; pFS->nHash = 4096; pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc); pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc); /* Open the database file */ pFS->fdDb = fsOpenFile(pFS, 0, &rc); if( rc!=LSM_OK ){ lsmFsClose(pFS); pFS = 0; } } |
︙ | ︙ | |||
381 382 383 384 385 386 387 | Page *pNext = pPg->pLruNext; if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData); lsmFree(pEnv, pPg); pPg = pNext; } if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb ); | > > > > > | | > | > | 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 | Page *pNext = pPg->pLruNext; if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData); lsmFree(pEnv, pPg); pPg = pNext; } if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb ); if( pFS->fdLog ){ if( lsmDbMultiProc(pFS->pDb) ){ lsmDbDeferredClose(pFS->pDb, pFS->fdLog, pFS->pLsmFile); pFS->pLsmFile = 0; }else{ lsmEnvClose(pFS->pEnv, pFS->fdLog ); } } lsmFree(pEnv, pFS->pLsmFile); lsmFree(pEnv, pFS->apHash); lsmFree(pEnv, pFS); } } /* ** Allocate a buffer and populate it with the output of the xFileid() |
︙ | ︙ | |||
623 624 625 626 627 628 629 630 631 632 633 634 635 | FileSystem *pFS, i64 iSz, int *pRc ){ if( *pRc==LSM_OK && iSz>pFS->nMap ){ Page *pFix; int rc; rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap); if( rc==LSM_OK ){ u8 *aData = (u8 *)pFS->pMap; for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){ pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)]; } | > < | 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 | FileSystem *pFS, i64 iSz, int *pRc ){ if( *pRc==LSM_OK && iSz>pFS->nMap ){ Page *pFix; int rc; u8 *aOld = pFS->pMap; rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap); if( rc==LSM_OK ){ u8 *aData = (u8 *)pFS->pMap; for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){ pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)]; } lsmSortedRemap(pFS->pDb); } *pRc = rc; } } /* |
︙ | ︙ | |||
779 780 781 782 783 784 785 | Snapshot *pSnapshot, Segment *pIgnore, /* Ignore this run when searching */ int iBlk ){ int rc = LSM_OK; /* Return code */ int iFirst; /* First page on block iBlk */ int iLast; /* Last page on block iBlk */ | < | > | < | | < < > > | 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 | Snapshot *pSnapshot, Segment *pIgnore, /* Ignore this run when searching */ int iBlk ){ int rc = LSM_OK; /* Return code */ int iFirst; /* First page on block iBlk */ int iLast; /* Last page on block iBlk */ Level *pLevel; /* Used to iterate through levels */ int iIn; /* Used to iterate through append points */ int iOut = 0; /* Used to output append points */ u32 *aApp = pSnapshot->aiAppend; iFirst = fsFirstPageOnBlock(pFS, iBlk); iLast = fsLastPageOnBlock(pFS, iBlk); /* Check if any other run in the snapshot has a start or end page ** within this block. If there is such a run, return early. */ for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){ if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){ return LSM_OK; } } for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){ if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){ aApp[iOut++] = aApp[iIn]; } } while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0; if( rc==LSM_OK ){ rc = lsmBlockFree(pFS->pDb, iBlk); } return rc; } |
︙ | ︙ | |||
931 932 933 934 935 936 937 | iPg++; } } return fsPageGet(pFS, iPg, 0, ppNext); } | | < < < | < < < < < < < < < | < < | < | < < | < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | | 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 | iPg++; } } return fsPageGet(pFS, iPg, 0, ppNext); } static Pgno findAppendPoint(FileSystem *pFS){ int i; u32 *aiAppend = pFS->pDb->pWorker->aiAppend; u32 iRet = 0; for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){ if( (iRet = aiAppend[i]) ) aiAppend[i] = 0; } return iRet; } /* ** Append a page to file iFile. Return a reference to it. lsmFsPageWrite() ** has already been called on the returned reference. */ int lsmFsSortedAppend( FileSystem *pFS, Snapshot *pSnapshot, Segment *p, Page **ppOut ){ int rc = LSM_OK; Page *pPg = 0; *ppOut = 0; int iApp = 0; int iNext = 0; int iPrev = p->iLast; if( iPrev==0 ){ iApp = findAppendPoint(pFS); }else if( fsIsLast(pFS, iPrev) ){ Page *pLast = 0; rc = fsPageGet(pFS, iPrev, 0, &pLast); if( rc!=LSM_OK ) return rc; iApp = lsmGetU32(&pLast->aData[pFS->nPagesize-4]); lsmFsPageRelease(pLast); }else{ |
︙ | ︙ | |||
1131 1132 1133 1134 1135 1136 1137 | if( rc==LSM_OK ){ int iPg = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]); int iBlk = fsPageToBlock(pFS, iPg); lsmBlockRefree(pFS->pDb, iBlk); lsmFsPageRelease(pLast); } }else{ | > > > > | > > > | 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 | if( rc==LSM_OK ){ int iPg = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]); int iBlk = fsPageToBlock(pFS, iPg); lsmBlockRefree(pFS->pDb, iBlk); lsmFsPageRelease(pLast); } }else{ int i; u32 *aiAppend = pFS->pDb->pWorker->aiAppend; for(i=0; i<LSM_APPLIST_SZ; i++){ if( aiAppend[i]==0 ){ aiAppend[i] = p->iLast+1; break; } } } } return rc; } /* ** Obtain a reference to page number iPg. |
︙ | ︙ | |||
1399 1400 1401 1402 1403 1404 1405 | ** eventually free the string using lsmFree(). ** ** If an error occurs, *pzOut is set to NULL and an LSM error code returned. */ int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut){ int rc = LSM_OK; Snapshot *pWorker; /* Worker snapshot */ | < > > > | > | 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 | ** eventually free the string using lsmFree(). ** ** If an error occurs, *pzOut is set to NULL and an LSM error code returned. */ int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut){ int rc = LSM_OK; Snapshot *pWorker; /* Worker snapshot */ Segment *pArray = 0; /* Array to report on */ Level *pLvl; /* Used to iterate through db levels */ int bUnlock = 0; *pzOut = 0; if( iFirst==0 ) return LSM_ERROR; /* Obtain the worker snapshot */ pWorker = pDb->pWorker; if( !pWorker ){ rc = lsmBeginWork(pDb); if( rc!=LSM_OK ) return rc; pWorker = pDb->pWorker; bUnlock = 1; } /* Search for the array that starts on page iFirst */ for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pArray==0; pLvl=pLvl->pNext){ if( 0==(pArray = startsWith(&pLvl->lhs, iFirst)) ){ int i; for(i=0; i<pLvl->nRight; i++){ |
︙ | ︙ | |||
1447 1448 1449 1450 1451 1452 1453 | lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk)); } lsmStringAppendf(&str, " %d", pArray->iLast); *pzOut = str.z; } | > > | > < | > | < < < < | | | | | | | | | | | | | | | | | | | < < | | | > > > < | > > > > > | > | | < | | | | > > | > > > > | | | > | | > > < | 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 | lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk)); } lsmStringAppendf(&str, " %d", pArray->iLast); *pzOut = str.z; } if( bUnlock ){ int rcwork = LSM_BUSY; lsmFinishWork(pDb, 0, 0, &rcwork); } return rc; } /* ** Helper function for lsmFsIntegrityCheck() */ static void checkBlocks( FileSystem *pFS, Segment *pSeg, int bExtra, /* If true, count the "next" block if any */ int nUsed, u8 *aUsed ){ if( pSeg ){ if( pSeg && pSeg->nSize>0 ){ const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); int iBlk; int iLastBlk; iBlk = fsPageToBlock(pFS, pSeg->iFirst); iLastBlk = fsPageToBlock(pFS, pSeg->iLast); while( iBlk ){ assert( iBlk<=nUsed ); /* assert( aUsed[iBlk-1]==0 ); */ aUsed[iBlk-1] = 1; if( iBlk!=iLastBlk ){ fsBlockNext(pFS, iBlk, &iBlk); }else{ iBlk = 0; } } if( bExtra && (pSeg->iLast % nPagePerBlock)==0 ){ fsBlockNext(pFS, iLastBlk, &iBlk); aUsed[iBlk-1] = 1; } } } } /* ** This function checks that all blocks in the database file are accounted ** for. 
For each block, exactly one of the following must be true:
**
**   + the block is part of a sorted run, or
**   + the block is on the free-block list
**
** This function also checks that there are no references to blocks with
** out-of-range block numbers.
**
** If no errors are found, non-zero is returned. If an error is found, an
** assert() fails. Non-zero is also returned if the check cannot be run
** because a memory allocation or the load of the overflow free-list fails.
*/
int lsmFsIntegrityCheck(lsm_db *pDb){
  int i;
  int j;
  Freelist freelist = {0, 0, 0};  /* Overflow free-list loaded from the LSM */
  FileSystem *pFS = pDb->pFS;
  u8 *aUsed;                      /* aUsed[iBlk-1] is set once iBlk is seen */
  Level *pLevel;
  Snapshot *pWorker = pDb->pWorker;
  int nBlock = pWorker->nBlock;

  aUsed = lsmMallocZero(pDb->pEnv, nBlock);
  if( aUsed==0 ){
    /* Malloc has failed. Since this function is only called within debug
    ** builds, this probably means the user is running an OOM injection test.
    ** Regardless, it will not be possible to run the integrity-check at this
    ** time, so assume the database is Ok and return non-zero. */
    return 1;
  }

  /* Mark every block used by a segment of any level */
  for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
    int iRhs;
    checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
    for(iRhs=0; iRhs<pLevel->nRight; iRhs++){
      checkBlocks(pFS, &pLevel->aRhs[iRhs], 0, nBlock, aUsed);
    }
  }

  if( pWorker->nFreelistOvfl ){
    int rc = lsmCheckpointOverflowLoad(pDb, &freelist);
    assert( rc==LSM_OK || rc==LSM_NOMEM );
    if( rc!=LSM_OK ){
      /* Release everything allocated so far before giving up. The list may
      ** have been partially populated before the failure. */
      lsmFree(pDb->pEnv, aUsed);
      lsmFree(pDb->pEnv, freelist.aEntry);
      return 1;
    }
  }

  /* Mark every free block, first from the in-memory free-list and then from
  ** the overflow list. No block may already be marked. */
  for(j=0; j<2; j++){
    Freelist *pFreelist = (j==0) ? &pWorker->freelist : &freelist;

    for(i=0; i<pFreelist->nEntry; i++){
      u32 iBlk = pFreelist->aEntry[i].iBlk;
      assert( iBlk<=nBlock );
      assert( aUsed[iBlk-1]==0 );
      aUsed[iBlk-1] = 1;
    }
  }

  /* Every block must now be accounted for */
  for(i=0; i<nBlock; i++) assert( aUsed[i]==1 );

  lsmFree(pDb->pEnv, aUsed);
  lsmFree(pDb->pEnv, freelist.aEntry);
  return 1;
}
 |
Changes to src/lsm_log.c.
︙ | ︙ | |||
300 301 302 303 304 305 306 | ** is assumed that the caller is holding the client-mutex when it is ** called. ** ** Before returning, this function allocates the LogWriter object that ** will be used to write to the log file during the write transaction. ** LSM_OK is returned if no error occurs, otherwise an LSM error code. */ | | < | | 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 | ** is assumed that the caller is holding the client-mutex when it is ** called. ** ** Before returning, this function allocates the LogWriter object that ** will be used to write to the log file during the write transaction. ** LSM_OK is returned if no error occurs, otherwise an LSM error code. */ int lsmLogBegin(lsm_db *pDb){ int rc = LSM_OK; LogWriter *pNew; LogRegion *aReg; if( pDb->bUseLog==0 ) return LSM_OK; rc = lsmFsOpenLog(pDb->pFS); pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc); if( pNew ){ lsmStringInit(&pNew->buf, pDb->pEnv); rc = lsmStringExtend(&pNew->buf, 2); } if( rc!=LSM_OK ){ assert( pNew==0 || pNew->buf.z==0 ); |
︙ | ︙ | |||
342 343 344 345 346 347 348 | ** ** 2) Region 1 is zero bytes in size and region 2 occurs earlier in the ** file than region 0. In this case, append data to region 2, but ** remember to jump over region 1 if required. ** ** 3) Region 2 is the last in the file. Append to it. */ | | | | | 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 | ** ** 2) Region 1 is zero bytes in size and region 2 occurs earlier in the ** file than region 0. In this case, append data to region 2, but ** remember to jump over region 1 if required. ** ** 3) Region 2 is the last in the file. Append to it. */ aReg = &pDb->treehdr.log.aRegion[0]; assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart ); assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart ); pNew->cksum0 = pDb->treehdr.log.cksum0; pNew->cksum1 = pDb->treehdr.log.cksum1; if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=pDb->nLogSz ){ /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP ** into the log file in this case. Pad it out to 8 bytes using a PAD2 ** record so that the checksums can be updated immediately. */ u8 aJump[] = { LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00 |
︙ | ︙ | |||
399 400 401 402 403 404 405 | ** or false otherwise. The caller must hold the client-mutex to call ** this function. ** ** A call to this function deletes the LogWriter object allocated by ** lsmLogBegin(). If the transaction is being committed, the shared state ** in *pLog is updated before returning. */ | | > < > | 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 | ** or false otherwise. The caller must hold the client-mutex to call ** this function. ** ** A call to this function deletes the LogWriter object allocated by ** lsmLogBegin(). If the transaction is being committed, the shared state ** in *pLog is updated before returning. */ void lsmLogEnd(lsm_db *pDb, int bCommit){ DbLog *pLog; LogWriter *p; if( pDb->bUseLog==0 ) return; p = pDb->pLogWriter; pLog = &pDb->treehdr.log; if( bCommit ){ pLog->aRegion[2].iEnd = p->iOff; pLog->cksum0 = p->cksum0; pLog->cksum1 = p->cksum1; if( p->iRegion1End ){ /* This happens when the transaction had to jump over some other |
︙ | ︙ | |||
432 433 434 435 436 437 438 | /* ** This function is called after a checkpoint is synced into the database ** file. The checkpoint specifies that the log starts at offset iOff. ** The shared state in *pLog is updated to reflect the fact that space ** in the log file that occurs logically before offset iOff may now ** be reused. */ | | > < | 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 | /* ** This function is called after a checkpoint is synced into the database ** file. The checkpoint specifies that the log starts at offset iOff. ** The shared state in *pLog is updated to reflect the fact that space ** in the log file that occurs logically before offset iOff may now ** be reused. */ void lsmLogCheckpoint(lsm_db *pDb, lsm_i64 iOff){ DbLog *pLog = &pDb->treehdr.log; int iRegion; for(iRegion=0; iRegion<3; iRegion++){ LogRegion *p = &pLog->aRegion[iRegion]; if( iOff>=p->iStart && iOff<=p->iEnd ) break; p->iStart = 0; p->iEnd = 0; } |
︙ | ︙ | |||
723 724 725 726 727 728 729 | if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0; } /* ** TODO: Thread safety of this function? */ int lsmLogStructure(lsm_db *pDb, char **pzVal){ | | | 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 | if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0; } /* ** TODO: Thread safety of this function? */ int lsmLogStructure(lsm_db *pDb, char **pzVal){ DbLog *pLog = &pDb->treehdr.log; *pzVal = lsmMallocPrintf(pDb->pEnv, "%d %d %d %d %d %d", (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd, (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd, (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd ); return (*pzVal ? LSM_OK : LSM_NOMEM_BKPT); |
︙ | ︙ | |||
885 886 887 888 889 890 891 | /* ** Recover the contents of the log file. */ int lsmLogRecover(lsm_db *pDb){ LsmString buf1; /* Key buffer */ LsmString buf2; /* Value buffer */ LogReader reader; /* Log reader object */ | | | > > > | | 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 | /* ** Recover the contents of the log file. */ int lsmLogRecover(lsm_db *pDb){ LsmString buf1; /* Key buffer */ LsmString buf2; /* Value buffer */ LogReader reader; /* Log reader object */ int rc = LSM_OK; /* Return code */ int nCommit = 0; /* Number of transactions to recover */ int iPass; int nJump = 0; /* Number of LSM_LOG_JUMP records in pass 0 */ DbLog *pLog; rc = lsmFsOpenLog(pDb->pFS); if( rc!=LSM_OK ) return rc; lsmTreeInit(pDb); pLog = &pDb->treehdr.log; lsmCheckpointLogoffset(pDb->pShmhdr->aWorker, pLog); logReaderInit(pDb, pLog, 1, &reader); lsmStringInit(&buf1, pDb->pEnv); lsmStringInit(&buf2, pDb->pEnv); /* The outer for() loop runs at most twice. The first iteration is to ** count the number of committed transactions in the log. The second ** iterates through those transactions and updates the in-memory tree |
︙ | ︙ | |||
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 | if( rc==LSM_OK && iPass==0 ){ if( nCommit==0 ){ if( pLog->aRegion[2].iStart==0 ){ iPass = 1; }else{ pLog->aRegion[2].iStart = 0; iPass = -1; } } logReaderInit(pDb, pLog, 0, &reader); nCommit = nCommit * -1; } } | > | 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 | if( rc==LSM_OK && iPass==0 ){ if( nCommit==0 ){ if( pLog->aRegion[2].iStart==0 ){ iPass = 1; }else{ pLog->aRegion[2].iStart = 0; iPass = -1; lsmCheckpointZeroLogoffset(pDb); } } logReaderInit(pDb, pLog, 0, &reader); nCommit = nCommit * -1; } } |
︙ | ︙ |
Changes to src/lsm_main.c.
︙ | ︙ | |||
37 38 39 40 41 42 43 | /* If there is at least one cursor or a write transaction open, the database ** handle must be holding a pointer to a client snapshot. And the reverse ** - if there are no open cursors and no write transactions then there must ** not be a client snapshot. */ assert( (pDb->pCsr!=0 || pDb->nTransOpen>0)==(pDb->pClient!=0) ); | < < < | 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | /* If there is at least one cursor or a write transaction open, the database ** handle must be holding a pointer to a client snapshot. And the reverse ** - if there are no open cursors and no write transactions then there must ** not be a client snapshot. */ assert( (pDb->pCsr!=0 || pDb->nTransOpen>0)==(pDb->pClient!=0) ); assert( pDb->nTransOpen>=0 ); } #else # define assert_db_state(x) #endif /* ** The default key-compare function. |
︙ | ︙ | |||
80 81 82 83 84 85 86 87 | pDb->bAutowork = 1; pDb->eSafety = LSM_SAFETY_NORMAL; pDb->xCmp = xCmp; pDb->nLogSz = LSM_DEFAULT_LOG_SIZE; pDb->nDfltPgsz = LSM_PAGE_SIZE; pDb->nDfltBlksz = LSM_BLOCK_SIZE; pDb->nMerge = LSM_DEFAULT_NMERGE; pDb->bUseLog = 1; | > | > < < < < < < < < < < < < < < < < < < < < < > > | | < < < < < < < < < < < < < < < < | < < | < < | < < < < | < < < < | < < < | < < < < < < < < < < < < < < | 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | pDb->bAutowork = 1; pDb->eSafety = LSM_SAFETY_NORMAL; pDb->xCmp = xCmp; pDb->nLogSz = LSM_DEFAULT_LOG_SIZE; pDb->nDfltPgsz = LSM_PAGE_SIZE; pDb->nDfltBlksz = LSM_BLOCK_SIZE; pDb->nMerge = LSM_DEFAULT_NMERGE; pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES; pDb->bUseLog = 1; pDb->iReader = -1; pDb->bMultiProc = 1; return LSM_OK; } lsm_env *lsm_get_env(lsm_db *pDb){ assert( pDb->pEnv ); return pDb->pEnv; } /* ** If database handle pDb is currently holding a client snapshot, but does ** not have any open cursors or write transactions, release it. */ static void dbReleaseClientSnapshot(lsm_db *pDb){ if( pDb->nTransOpen==0 && pDb->pCsr==0 ){ lsmFinishReadTrans(pDb); } } static int dbAutoWork(lsm_db *pDb, int nUnit){ int rc = LSM_OK; /* Return code */ assert( pDb->pWorker==0 ); assert( pDb->bAutowork ); assert( nUnit>0 ); /* If one is required, run a checkpoint. */ #if 0 rc = lsmCheckpointWrite(pDb); #endif rc = lsmBeginWork(pDb); if( rc==LSM_OK ) rc = lsmSortedAutoWork(pDb, nUnit); if( pDb->pWorker && pDb->pWorker->pLevel ){ lsmFinishWork(pDb, 0, -1, &rc); }else{ int rcdummy = LSM_BUSY; lsmFinishWork(pDb, 0, 0, &rcdummy); } return rc; } static int getFullpathname( lsm_env *pEnv, const char *zRel, char **pzAbs |
︙ | ︙ | |||
234 235 236 237 238 239 240 | ** than one purpose - to open both the database and log files, and ** perhaps to unlink the log file during disconnection. An absolute ** path is required to ensure that the correct files are operated ** on even if the application changes the cwd. */ rc = getFullpathname(pDb->pEnv, zFilename, &zFull); assert( rc==LSM_OK || zFull==0 ); | | | | > > > > | < > > | < | > | | | > > | | > | > > > | < < > > | | > | < | > | < < | < < < < | 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | ** than one purpose - to open both the database and log files, and ** perhaps to unlink the log file during disconnection. An absolute ** path is required to ensure that the correct files are operated ** on even if the application changes the cwd. */ rc = getFullpathname(pDb->pEnv, zFilename, &zFull); assert( rc==LSM_OK || zFull==0 ); /* Open the database file. */ if( rc==LSM_OK ){ rc = lsmFsOpen(pDb, zFull); } /* Connect to the database */ if( rc==LSM_OK ){ rc = lsmDbDatabaseConnect(pDb, zFilename); } /* Configure the file-system connection with the page-size and block-size ** of this database. Even if the database file is zero bytes in size ** on disk, these values have been set in shared-memory by now, and so are ** guaranteed not to change during the lifetime of this connection. */ if( rc==LSM_OK && LSM_OK==(rc = lsmCheckpointLoad(pDb)) ){ lsmFsSetPageSize(pDb->pFS, lsmCheckpointPgsz(pDb->aSnapshot)); lsmFsSetBlockSize(pDb->pFS, lsmCheckpointBlksz(pDb->aSnapshot)); } lsmFree(pDb->pEnv, zFull); } return rc; } /* ** This function flushes the contents of the in-memory tree to disk. 
It ** returns LSM_OK if successful, or an error code otherwise. */ int lsmFlushToDisk(lsm_db *pDb){ int rc = LSM_OK; /* Return code */ int nOvfl = 0; /* Number of free-list entries in LSM */ /* Must not hold the worker snapshot when this is called. */ assert( pDb->pWorker==0 ); rc = lsmBeginWork(pDb); /* Save the position of each open cursor belonging to pDb. */ if( rc==LSM_OK ){ rc = lsmSaveCursors(pDb); } if( rc==LSM_OK && pDb->bAutowork ){ rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT); } while( rc==LSM_OK && lsmDatabaseFull(pDb) ){ rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT); } /* Write the contents of the in-memory tree into the database file and ** update the worker snapshot accordingly. Then flush the contents of ** the db file to disk too. No calls to fsync() are made here - just ** write(). */ if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, &nOvfl); if( rc==LSM_OK ) lsmTreeClear(pDb); lsmFinishWork(pDb, 1, nOvfl, &rc); /* Restore the position of any open cursors */ if( rc==LSM_OK && pDb->pCsr ){ lsmFreeSnapshot(pDb->pEnv, pDb->pClient); pDb->pClient = 0; rc = lsmCheckpointLoad(pDb); if( rc==LSM_OK ){ rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient); } if( rc==LSM_OK ){ rc = lsmRestoreCursors(pDb); } } #if 0 if( rc==LSM_OK ) lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "flush"); #endif return rc; } int lsm_close(lsm_db *pDb){ int rc = LSM_OK; if( pDb ){ assert_db_state(pDb); if( pDb->pCsr || pDb->nTransOpen ){ rc = LSM_MISUSE_BKPT; }else{ lsmDbDatabaseRelease(pDb); lsmFsClose(pDb->pFS); lsmFree(pDb->pEnv, pDb->aTrans); lsmFree(pDb->pEnv, pDb); } } return rc; |
︙ | ︙ | |||
419 420 421 422 423 424 425 426 427 428 429 430 431 432 | case LSM_CONFIG_NMERGE: { int *piVal = va_arg(ap, int *); if( *piVal>1 ) pDb->nMerge = *piVal; *piVal = pDb->nMerge; break; } default: rc = LSM_MISUSE; break; } va_end(ap); | > > > > > > > > > > > > > > > > > > > > > > | 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 | case LSM_CONFIG_NMERGE: { int *piVal = va_arg(ap, int *); if( *piVal>1 ) pDb->nMerge = *piVal; *piVal = pDb->nMerge; break; } case LSM_CONFIG_MAX_FREELIST: { int *piVal = va_arg(ap, int *); if( *piVal>=2 && *piVal<=LSM_MAX_FREELIST_ENTRIES ){ pDb->nMaxFreelist = *piVal; } *piVal = pDb->nMaxFreelist; break; } case LSM_CONFIG_MULTIPLE_PROCESSES: { int *piVal = va_arg(ap, int *); if( pDb->pDatabase ){ /* If lsm_open() has been called, this is a read-only parameter. ** Set the output variable to true if this connection is currently ** in multi-process mode. */ *piVal = lsmDbMultiProc(pDb); }else{ pDb->bMultiProc = *piVal = (*piVal!=0); } break; } default: rc = LSM_MISUSE; break; } va_end(ap); |
︙ | ︙ | |||
444 445 446 447 448 449 450 | char **pzOut /* OUT: Nul-terminated string (tcl list) */ ){ Level *pTopLevel = 0; /* Top level of snapshot to report on */ int rc = LSM_OK; Level *p; LsmString s; Snapshot *pWorker; /* Worker snapshot */ | | > > | > > > | > | 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 | char **pzOut /* OUT: Nul-terminated string (tcl list) */ ){ Level *pTopLevel = 0; /* Top level of snapshot to report on */ int rc = LSM_OK; Level *p; LsmString s; Snapshot *pWorker; /* Worker snapshot */ int bUnlock = 0; /* Obtain the worker snapshot */ pWorker = pDb->pWorker; if( !pWorker ){ rc = lsmBeginWork(pDb); if( rc!=LSM_OK ) return rc; pWorker = pDb->pWorker; bUnlock = 1; } /* Format the contents of the snapshot as text */ pTopLevel = lsmDbSnapshotLevel(pWorker); lsmStringInit(&s, pDb->pEnv); for(p=pTopLevel; rc==LSM_OK && p; p=p->pNext){ int i; lsmStringAppendf(&s, "%s{", (s.n ? " " : "")); lsmAppendSegmentList(&s, "", &p->lhs); for(i=0; rc==LSM_OK && i<p->nRight; i++){ lsmAppendSegmentList(&s, " ", &p->aRhs[i]); } lsmStringAppend(&s, "}", 1); } rc = s.n>=0 ? LSM_OK : LSM_NOMEM; /* Release the snapshot and return */ if( bUnlock ){ int rcdummy = LSM_BUSY; lsmFinishWork(pDb, 0, 0, &rcdummy); } *pzOut = s.z; return rc; } int lsm_info(lsm_db *pDb, int eParam, ...){ int rc = LSM_OK; va_list ap; |
︙ | ︙ | |||
543 544 545 546 547 548 549 | if( pDb->nTransOpen==0 ){ bCommit = 1; rc = lsm_begin(pDb, 1); } if( rc==LSM_OK ){ | < | | < | 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 | if( pDb->nTransOpen==0 ){ bCommit = 1; rc = lsm_begin(pDb, 1); } if( rc==LSM_OK ){ rc = lsmLogWrite(pDb, (void *)pKey, nKey, (void *)pVal, nVal); } lsmSortedSaveTreeCursors(pDb); if( rc==LSM_OK ){ int pgsz = lsmFsPageSize(pDb->pFS); int nQuant = LSM_AUTOWORK_QUANT * pgsz; int nBefore; int nAfter; int nDiff; if( nQuant>pDb->nTreeLimit ){ nQuant = pDb->nTreeLimit; } nBefore = lsmTreeSize(pDb); rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal); nAfter = lsmTreeSize(pDb); nDiff = (nAfter/nQuant) - (nBefore/nQuant); if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){ rc = dbAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT); } } /* If a transaction was opened at the start of this function, commit it. |
︙ | ︙ | |||
737 738 739 740 741 742 743 | if( rc==LSM_OK && pDb->nTransOpen==0 ){ rc = lsmBeginWriteTrans(pDb); } if( rc==LSM_OK ){ for(i=pDb->nTransOpen; i<iLevel; i++){ | | > > | > > > > > > | 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 | if( rc==LSM_OK && pDb->nTransOpen==0 ){ rc = lsmBeginWriteTrans(pDb); } if( rc==LSM_OK ){ for(i=pDb->nTransOpen; i<iLevel; i++){ lsmTreeMark(pDb, &pDb->aTrans[i].tree); lsmLogTell(pDb, &pDb->aTrans[i].log); } pDb->nTransOpen = iLevel; } } return rc; } int lsm_commit(lsm_db *pDb, int iLevel){ int bFlush = 0; int rc = LSM_OK; assert_db_state( pDb ); /* A value less than zero means close the innermost nested transaction. */ if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1); if( iLevel<pDb->nTransOpen ){ if( iLevel==0 ){ /* Commit the transaction to disk. */ if( lsmTreeSize(pDb)>pDb->nTreeLimit ){ lsmTreeEndTransaction(pDb, 1); bFlush = 1; rc = lsmFlushToDisk(pDb); } if( rc==LSM_OK ) rc = lsmLogCommit(pDb); if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){ rc = lsmFsSyncLog(pDb->pFS); } lsmFinishWriteTrans(pDb, (rc==LSM_OK)); } pDb->nTransOpen = iLevel; } dbReleaseClientSnapshot(pDb); if( pDb->bAutowork && bFlush && rc==LSM_OK ){ rc = lsmCheckpointWrite(pDb); } return rc; } int lsm_rollback(lsm_db *pDb, int iLevel){ int rc = LSM_OK; assert_db_state( pDb ); |
︙ | ︙ |
Changes to src/lsm_mem.c.
︙ | ︙ | |||
105 106 107 108 109 110 111 | lsmFree(pEnv, p); }else{ pRet = lsmReallocOrFree(pEnv, p, N); if( !pRet ) *pRc = LSM_NOMEM_BKPT; } return pRet; } | < | 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | lsmFree(pEnv, p); }else{ pRet = lsmReallocOrFree(pEnv, p, N); if( !pRet ) *pRc = LSM_NOMEM_BKPT; } return pRet; } char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){ int nByte; char *zRet; nByte = strlen(zIn); zRet = lsmMalloc(pEnv, nByte+1); if( zRet ){ |
︙ | ︙ |
Changes to src/lsm_shared.c.
︙ | ︙ | |||
11 12 13 14 15 16 17 | ************************************************************************* ** ** Utilities used to help multiple LSM clients to coexist within the ** same process space. */ #include "lsmInt.h" | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < > < < < < | | < < < < < | < | | < | < < | < < < < | < | < < < < < < < | | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | ************************************************************************* ** ** Utilities used to help multiple LSM clients to coexist within the ** same process space. */ #include "lsmInt.h" /* ** Global data. All global variables used by code in this file are grouped ** into the following structure instance. ** ** pDatabase: ** Linked list of all Database objects allocated within this process. ** This list may not be traversed without holding the global mutex (see ** functions enterGlobalMutex() and leaveGlobalMutex()). */ static struct SharedData { Database *pDatabase; /* Linked list of all Database objects */ } gShared; /* ** Database structure. There is one such structure for each distinct ** database accessed by this process. They are stored in the singly linked ** list starting at global variable gShared.pDatabase. Database objects are ** reference counted. Once the number of connections to the associated ** database drops to zero, they are removed from the linked list and deleted. 
*/
struct Database {
  /* Fields guarded by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
  char *zName;                    /* Canonical path to database file */
  void *pId;                      /* Database id (file inode) */
  int nId;                        /* Size of pId in bytes */
  int nDbRef;                     /* Number of associated lsm_db handles */
  Database *pDbNext;              /* Next Database structure in global list */

  /* Fields guarded by the local mutex (pClientMutex): */
  lsm_file *pFile;                /* Used for locks/shm in multi-proc mode */
  LsmFile *pLsmFile;              /* List of deferred closes */
  lsm_mutex *pClientMutex;        /* Protects the apShmChunk[] and pConn */
  int nShmChunk;                  /* Number of entries in apShmChunk[] array */
  void **apShmChunk;              /* Array of "shared" memory regions */
  lsm_db *pConn;                  /* List of connections to this db. */
};

/*
** Functions to enter and leave the global mutex. This mutex is used
** to protect the global linked-list headed at gShared.pDatabase.
**
** enterGlobalMutex() looks up the static LSM_MUTEX_GLOBAL mutex and, only
** if that lookup succeeds, enters it. The result of the lookup is returned
** to the caller either way.
*/
static int enterGlobalMutex(lsm_env *pEnv){
  lsm_mutex *pGlobal;
  int rc;

  rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
  if( rc!=LSM_OK ) return rc;

  lsmMutexEnter(pEnv, pGlobal);
  return rc;
}
︙ | ︙ | |||
225 226 227 228 229 230 231 | } static void assertNotInFreelist(Freelist *p, int iBlk){ int i; for(i=0; i<p->nEntry; i++){ assert( p->aEntry[i].iBlk!=iBlk ); } } | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | } static void assertNotInFreelist(Freelist *p, int iBlk){ int i; for(i=0; i<p->nEntry; i++){ assert( p->aEntry[i].iBlk!=iBlk ); } } #else # define assertNotInFreelist(x,y) #endif /* ** Append an entry to the free-list. */ int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){ /* Assert that this is not an attempt to insert a duplicate block number */ assertNotInFreelist(p, iBlk); /* Extend the space allocated for the freelist, if required */ assert( p->nAlloc>=p->nEntry ); if( p->nAlloc==p->nEntry ){ |
︙ | ︙ | |||
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 | /* Append the new entry to the freelist */ p->aEntry[p->nEntry].iBlk = iBlk; p->aEntry[p->nEntry].iId = iId; p->nEntry++; return LSM_OK; } /* ** Remove the first entry of the free-list. */ static void flRemoveEntry0(Freelist *p){ int nNew = p->nEntry - 1; assert( nNew>=0 ); memmove(&p->aEntry[0], &p->aEntry[1], sizeof(FreelistEntry) * nNew); p->nEntry = nNew; } /* | > > > > > > > > > > > > | > > > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 | /* Append the new entry to the freelist */ p->aEntry[p->nEntry].iBlk = iBlk; p->aEntry[p->nEntry].iId = iId; p->nEntry++; return LSM_OK; } static int flInsertEntry(lsm_env *pEnv, Freelist *p, int iBlk){ int rc; rc = lsmFreelistAppend(pEnv, p, iBlk, 1); if( rc==LSM_OK ){ memmove(&p->aEntry[1], &p->aEntry[0], sizeof(FreelistEntry)*(p->nEntry-1)); p->aEntry[0].iBlk = iBlk; p->aEntry[0].iId = 1; } return rc; } /* ** Remove the first entry of the free-list. 
*/
static void flRemoveEntry0(Freelist *p){
  int nNew = p->nEntry - 1;
  assert( nNew>=0 );
  /* Shift the remaining nNew entries down by one slot. */
  memmove(&p->aEntry[0], &p->aEntry[1], sizeof(FreelistEntry) * nNew);
  p->nEntry = nNew;
}

/*
** This function frees all resources held by the Database structure passed
** as the only argument: the client mutex, the shared file handle (if one
** is open) and the structure itself. The caller must hold the global
** mutex. Apart from the assert(), passing a NULL pointer is a no-op.
*/
static void freeDatabase(lsm_env *pEnv, Database *p){
  assert( holdingGlobalMutex(pEnv) );
  if( p ){
    /* Free the mutexes */
    lsmMutexDel(pEnv, p->pClientMutex);

    /* Close the shared file handle, if there is one */
    if( p->pFile ){
      lsmEnvClose(pEnv, p->pFile);
    }

    /* Free the memory allocated for the Database struct itself */
    lsmFree(pEnv, p);
  }
}

/*
** Disconnect database handle pDb from the shared-memory region.
**
** If the exclusive lock on DMS2 can be obtained, this function flushes
** the in-memory tree (when it is non-empty), writes a checkpoint and, if
** both succeed, deletes the log file and unmaps the shared memory.
**
** In all cases the DMS2 and DMS1 locks are released and pDb->pShmhdr is
** cleared before returning. Errors are not reported to the caller.
*/
static void doDbDisconnect(lsm_db *pDb){
  int rc;

  /* Block for an exclusive lock on DMS1. This lock serializes all calls
  ** to doDbConnect() and doDbDisconnect() across all processes.  */
  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
  if( rc==LSM_OK ){

    /* Try an exclusive lock on DMS2. If successful, this is the last
    ** connection to the database. In this case flush the contents of the
    ** in-memory tree to disk and write a checkpoint.  */
    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
    if( rc==LSM_OK ){
      /* Flush the in-memory tree, if required. If there is data to flush,
      ** this will create a new client snapshot in Database.pClient. The
      ** checkpoint (serialization) of this snapshot may be written to disk
      ** by the following block.  */
      rc = lsmTreeLoadHeader(pDb);
      if( rc==LSM_OK && lsmTreeSize(pDb)>0 ){
        rc = lsmFlushToDisk(pDb);
      }

      /* Write a checkpoint to disk. */
      if( rc==LSM_OK ){
        rc = lsmCheckpointWrite(pDb);
      }

      /* If the checkpoint was written successfully, delete the log file */
      if( rc==LSM_OK && pDb->pFS ){
        Database *p = pDb->pDatabase;
        lsmFsCloseAndDeleteLog(pDb->pFS);
        /* Only unmap when a shared file handle is in use */
        if( p->pFile ) lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1);
      }
    }
  }

  /* Release both locks regardless of what happened above. */
  lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
  pDb->pShmhdr = 0;
}

/*
** Connect database handle pDb to the shared-memory region.
**
** If this turns out to be the only connection (the exclusive DMS2 lock is
** obtained), the shared-memory header is zeroed and checkpoint and log
** recovery are run. On success the handle is left holding a shared lock
** on DMS2 and LSM_OK is returned.
**
** If locking or recovery fails, pDb->pShmhdr is reset to 0 and an LSM
** error code is returned. The DMS1 lock is never held on return.
*/
static int doDbConnect(lsm_db *pDb){
  int rc;

  /* Obtain a pointer to the shared-memory header */
  assert( pDb->pShmhdr==0 );
  rc = lsmShmChunk(pDb, 0, (void **)&pDb->pShmhdr);
  if( rc!=LSM_OK ) return rc;

  /* Block for an exclusive lock on DMS1. This lock serializes all calls
  ** to doDbConnect() and doDbDisconnect() across all processes.  */
  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
  if( rc!=LSM_OK ){
    pDb->pShmhdr = 0;
    return rc;
  }

  /* Try an exclusive lock on DMS2. If successful, this is the first and
  ** only connection to the database. In this case initialize the
  ** shared-memory and run log file recovery.  */
  rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
  if( rc==LSM_OK ){
    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
    rc = lsmCheckpointRecover(pDb);
    if( rc==LSM_OK ){
      rc = lsmLogRecover(pDb);
    }
  }else if( rc==LSM_BUSY ){
    /* Some other connection holds DMS2, so this is not the first
    ** connection. That is not an error. */
    rc = LSM_OK;
  }

  /* Take a shared lock on DMS2. This lock "cannot" fail, as connections
  ** may only hold an exclusive lock on DMS2 if they first hold an exclusive
  ** lock on DMS1. And this connection is currently holding the exclusive
  ** lock on DMS1.  */
  if( rc==LSM_OK ){
    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
  }

  /* If anything went wrong, unlock DMS2. Unlock DMS1 in any case. */
  if( rc!=LSM_OK ){
    lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
    pDb->pShmhdr = 0;
  }
  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
  return rc;
}

/*
** Return a reference to the shared Database handle for the database
** identified by canonical path zName. 
If this is the first connection to ** the named database, a new Database object is allocated. Otherwise, a ** pointer to an existing object is returned. ** ** If successful, *ppDatabase is set to point to the shared Database ** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL ** and an LSM error code returned. ** ** Each successful call to this function should be (eventually) matched ** by a call to lsmDbDatabaseRelease(). */ int lsmDbDatabaseConnect( lsm_db *pDb, /* Database handle */ const char *zName /* Path to db file */ ){ lsm_env *pEnv = pDb->pEnv; int rc; /* Return code */ Database *p = 0; /* Pointer returned via *ppDatabase */ int nId = 0; |
︙ | ︙ | |||
380 381 382 383 384 385 386 | } /* If no suitable Database object was found, allocate a new one. */ if( p==0 ){ int nName = strlen(zName); p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nId+nName+1, &rc); | < < < < < < | | | < > | > | | > | > > > > > > > > < | | < < | < < < < < | > | | | | > > > > > > > > > > > < < | < < | < < | < | | | | | < | < | < < | < < | | | < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | > | < > | > | < < | < < | < < < | < > | | < < < < | < > > | < < < < | | < < | < | | < | < | < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | < < < < < < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | < < | > > < < | < < < < < < < < > | > > | > < | < > | < | | | > > > > > > > > | < | | < < < < < < > > > > | > | > | | > > | > | < < < < | | < > > > | > > | > > | | < < | | | | | > | > | | < < | > | | < < | < | < > | < < < > | > > > > | | | < < | > | < | | < < < < | | < > | > > | | | | | < | | | < < | < | < < < < | < | > > < < | < < > | > | > > | > > > > > > > > > > > > > > > > | | | > > | < < < < | > | < > | > < < | | < < < | | < | > | > > > | > > > > > > > > | < > | | < < | | < < < < < > < < | | < < < < < < < < < < < < < < < | < | > > | > > > > < < < < < < < < < | < < < | | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 
622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 
1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 | } /* If no suitable Database object was found, allocate a new one. */ if( p==0 ){ int nName = strlen(zName); p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nId+nName+1, &rc); /* Allocate the mutex */ if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pClientMutex); /* If no error has occurred, fill in other fields and link the new ** Database structure into the global list starting at ** gShared.pDatabase. Otherwise, if an error has occurred, free any ** resources allocated and return without linking anything new into ** the gShared.pDatabase list. */ if( rc==LSM_OK ){ p->zName = (char *)&p[1]; memcpy((void *)p->zName, zName, nName+1); p->pId = (void *)&p->zName[nName+1]; memcpy(p->pId, pId, nId); p->nId = nId; p->pDbNext = gShared.pDatabase; gShared.pDatabase = p; } /* If running in multi-process mode, open the shared fd */ if( rc==LSM_OK && pDb->bMultiProc ){ rc = lsmEnvOpen(pDb->pEnv, p->zName, &p->pFile); } if( rc!=LSM_OK ){ freeDatabase(pEnv, p); p = 0; } } if( p ) p->nDbRef++; leaveGlobalMutex(pEnv); if( p ){ lsmMutexEnter(pDb->pEnv, p->pClientMutex); pDb->pNext = p->pConn; p->pConn = pDb; lsmMutexLeave(pDb->pEnv, p->pClientMutex); } } lsmFree(pEnv, pId); pDb->pDatabase = p; if( rc==LSM_OK ){ rc = doDbConnect(pDb); } return rc; } /* ** Release a reference to a Database object obtained from ** lsmDbDatabaseConnect(). There should be exactly one call to this function ** for each successful call to Find(). 
*/ void lsmDbDatabaseRelease(lsm_db *pDb){ Database *p = pDb->pDatabase; if( p ){ lsm_db **ppDb; if( pDb->pShmhdr ){ doDbDisconnect(pDb); } lsmMutexEnter(pDb->pEnv, p->pClientMutex); for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext)); *ppDb = pDb->pNext; lsmMutexLeave(pDb->pEnv, p->pClientMutex); enterGlobalMutex(pDb->pEnv); p->nDbRef--; if( p->nDbRef==0 ){ Database **pp; /* Remove the Database structure from the linked list. */ for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext)); *pp = p->pDbNext; /* Free the Database object and shared memory buffers. */ if( p->pFile==0 ){ int i; for(i=0; i<p->nShmChunk; i++){ lsmFree(pDb->pEnv, p->apShmChunk[i]); } }else{ LsmFile *pIter; LsmFile *pNext; for(pIter=p->pLsmFile; pIter; pIter=pNext){ pNext = pIter->pNext; lsmEnvClose(pDb->pEnv, pIter->pFile); lsmFree(pDb->pEnv, pIter); } } lsmFree(pDb->pEnv, p->apShmChunk); freeDatabase(pDb->pEnv, p); } leaveGlobalMutex(pDb->pEnv); } } Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){ return pSnapshot->pLevel; } void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){ pSnap->pLevel = pLevel; } /* ** Allocate a new database file block to write data to, either by extending ** the database file or by recycling a free-list entry. The worker snapshot ** must be held in order to call this function. ** ** If successful, *piBlk is set to the block number allocated and LSM_OK is ** returned. Otherwise, *piBlk is zeroed and an lsm error code returned. */ int lsmBlockAllocate(lsm_db *pDb, int *piBlk){ Snapshot *p = pDb->pWorker; Freelist *pFree; /* Database free list */ int iRet = 0; /* Block number of allocated block */ int rc = LSM_OK; assert( pDb->pWorker ); pFree = &p->freelist; if( pFree->nEntry>0 ){ /* The first block on the free list was freed as part of the work done ** to create the snapshot with id iFree. So, we can reuse this block if ** snapshot iFree or later has been checkpointed and all currently ** active clients are reading from snapshot iFree or later. 
*/ i64 iFree = pFree->aEntry[0].iId; int bInUse = 0; /* The "is in use" bit */ rc = lsmLsmInUse(pDb, iFree, &bInUse); /* The "has been checkpointed" bit */ if( rc==LSM_OK && bInUse==0 ){ i64 iId = 0; rc = lsmCheckpointSynced(pDb, &iId); if( rc!=LSM_OK || iId<iFree ) bInUse = 1; if( rc==LSM_BUSY ) rc = LSM_OK; } if( rc==LSM_OK && bInUse==0 ){ iRet = pFree->aEntry[0].iBlk; flRemoveEntry0(pFree); assert( iRet!=0 ); } } /* If no block was allocated from the free-list, allocate one at the ** end of the file. */ if( rc==LSM_OK && iRet==0 ){ iRet = ++pDb->pWorker->nBlock; } *piBlk = iRet; return LSM_OK; } /* ** Free a database block. The worker snapshot must be held in order to call ** this function. ** ** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. ** LSM_NOMEM). */ int lsmBlockFree(lsm_db *pDb, int iBlk){ Snapshot *p = pDb->pWorker; assert( lsmShmAssertWorker(pDb) ); /* TODO: Should assert() that lsmCheckpointOverflow() has not been called */ return lsmFreelistAppend(pDb->pEnv, &p->freelist, iBlk, p->iId); } /* ** Refree a database block. The worker snapshot must be held in order to call ** this function. ** ** Refreeing is required when a block is allocated using lsmBlockAllocate() ** but then not used. This function is used to push the block back onto ** the freelist. Refreeing a block is different from freeing is, as a refreed ** block may be reused immediately. Whereas a freed block can not be reused ** until (at least) after the next checkpoint. */ int lsmBlockRefree(lsm_db *pDb, int iBlk){ int rc = LSM_OK; /* Return code */ Snapshot *p = pDb->pWorker; if( iBlk==p->nBlock ){ p->nBlock--; }else{ rc = flInsertEntry(pDb->pEnv, &p->freelist, iBlk); } return rc; } /* ** If required, copy a database checkpoint from shared memory into the ** database itself. ** ** The WORKER lock must not be held when this is called. This is because ** this function may indirectly call fsync(). 
And the WORKER lock should ** not be held that long (in case it is required by a client flushing an ** in-memory tree to disk). */ int lsmCheckpointWrite(lsm_db *pDb){ int rc; /* Return Code */ assert( pDb->pWorker==0 ); assert( 1 || pDb->pClient==0 ); assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) ); rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0); if( rc!=LSM_OK ) return rc; rc = lsmCheckpointLoad(pDb); if( rc==LSM_OK ){ ShmHeader *pShm = pDb->pShmhdr; int bDone = 0; /* True if checkpoint is already stored */ /* Check if this checkpoint has already been written to the database ** file. If so, set variable bDone to true. */ if( pShm->iMetaPage ){ MetaPage *pPg; /* Meta page */ u8 *aData; /* Meta-page data buffer */ int nData; /* Size of aData[] in bytes */ i64 iCkpt; /* Id of checkpoint just loaded */ i64 iDisk; /* Id of checkpoint already stored in db */ iCkpt = lsmCheckpointId(pDb->aSnapshot, 0); rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg); if( rc==LSM_OK ){ aData = lsmFsMetaPageData(pPg, &nData); iDisk = lsmCheckpointId((u32 *)aData, 1); lsmFsMetaPageRelease(pPg); } bDone = (iDisk>=iCkpt); } if( rc==LSM_OK && bDone==0 ){ int iMeta = (pShm->iMetaPage % 2) + 1; rc = lsmFsSyncDb(pDb->pFS); if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta); if( rc==LSM_OK ) rc = lsmFsSyncDb(pDb->pFS); if( rc==LSM_OK ) pShm->iMetaPage = iMeta; } } /* If no error has occured, then the snapshot currently in pDb->aSnapshot ** has been synced to disk. This means it may be possible to wrap the ** log file. Obtain the WRITER lock and update the relevent tree-header ** fields to reflect this. 
*/ if( rc==LSM_OK ){ u64 iLogoff = lsmCheckpointLogOffset(pDb->aSnapshot); if( pDb->nTransOpen==0 ){ rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); } if( rc==LSM_OK ){ rc = lsmTreeLoadHeader(pDb); if( rc==LSM_OK ) lsmLogCheckpoint(pDb, iLogoff); if( rc==LSM_OK ) lsmTreeEndTransaction(pDb, 1); if( rc==LSM_BUSY ) rc = LSM_OK; if( pDb->nTransOpen==0 ){ rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); } } if( rc==LSM_BUSY ) rc = LSM_OK; } lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0); return rc; } int lsmBeginWork(lsm_db *pDb){ int rc; /* Attempt to take the WORKER lock */ rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0); /* Deserialize the current worker snapshot */ if( rc==LSM_OK ){ rc = lsmCheckpointLoadWorker(pDb); if( pDb->pWorker ) pDb->pWorker->pDatabase = pDb->pDatabase; } return rc; } void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){ if( p ){ lsmSortedFreeLevel(pEnv, p->pLevel); lsmFree(pEnv, p->freelist.aEntry); lsmFree(pEnv, p); } } /* ** Argument bFlush is true if the contents of the in-memory tree has just ** been flushed to disk. The significance of this is that once the snapshot ** created to hold the updated state of the database is synced to disk, log ** file space can be recycled. */ void lsmFinishWork(lsm_db *pDb, int bFlush, int nOvfl, int *pRc){ /* If no error has occurred, serialize the worker snapshot and write ** it to shared memory. */ if( *pRc==LSM_OK ){ *pRc = lsmCheckpointSaveWorker(pDb, bFlush, nOvfl); } if( pDb->pWorker ){ lsmFreeSnapshot(pDb->pEnv, pDb->pWorker); pDb->pWorker = 0; } lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0); } /* ** Called when recovery is finished. */ int lsmFinishRecovery(lsm_db *pDb){ lsmTreeEndTransaction(pDb, 1); return LSM_OK; } /* ** Begin a read transaction. This function is a no-op if the connection ** passed as the only argument already has an open read transaction. 
*/ int lsmBeginReadTrans(lsm_db *pDb){ const int MAX_READLOCK_ATTEMPTS = 5; int rc = LSM_OK; /* Return code */ int iAttempt = 0; assert( pDb->pWorker==0 ); assert( (pDb->pClient!=0)==(pDb->iReader>=0) ); while( rc==LSM_OK && pDb->pClient==0 && (iAttempt++)<MAX_READLOCK_ATTEMPTS ){ assert( pDb->pCsr==0 && pDb->nTransOpen==0 ); /* Load the in-memory tree header. */ rc = lsmTreeLoadHeader(pDb); /* Load the database snapshot */ if( rc==LSM_OK ){ rc = lsmCheckpointLoad(pDb); } /* Take a read-lock on the tree and snapshot just loaded. Then check ** that the shared-memory still contains the same values. If so, proceed. ** Otherwise, relinquish the read-lock and retry the whole procedure ** (starting with loading the in-memory tree header). */ if( rc==LSM_OK ){ ShmHeader *pShm = pDb->pShmhdr; i64 iTree = pDb->treehdr.iTreeId; i64 iSnap = lsmCheckpointId(pDb->aSnapshot, 0); rc = lsmReadlock(pDb, iSnap, iTree); if( rc==LSM_OK ){ if( (i64)pShm->hdr1.iTreeId==iTree && pShm->hdr1.iTransId==pDb->treehdr.iTransId && lsmCheckpointId(pShm->aClient, 0)==iSnap ){ /* Read lock has been successfully obtained. Deserialize the ** checkpoint just loaded. TODO: This will be removed after ** lsm_sorted.c is changed to work directly from the serialized ** version of the snapshot. */ rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient); assert( (rc==LSM_OK)==(pDb->pClient!=0) ); }else{ rc = lsmReleaseReadlock(pDb); } } if( rc==LSM_BUSY ) rc = LSM_OK; } } if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY; return rc; } /* ** Close the currently open read transaction. */ void lsmFinishReadTrans(lsm_db *pDb){ Snapshot *pClient = pDb->pClient; /* Worker connections should not be closing read transactions. And ** read transactions should only be closed after all cursors and write ** transactions have been closed. Finally pClient should be non-NULL ** only iff pDb->iReader>=0. 
*/ assert( pDb->pWorker==0 ); assert( pDb->pCsr==0 && pDb->nTransOpen==0 ); if( pClient ){ lsmFreeSnapshot(pDb->pEnv, pDb->pClient); pDb->pClient = 0; } if( pDb->iReader>=0 ) lsmReleaseReadlock(pDb); assert( (pDb->pClient!=0)==(pDb->iReader>=0) ); } /* ** Open a write transaction. */ int lsmBeginWriteTrans(lsm_db *pDb){ int rc; /* Return code */ ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */ assert( pDb->nTransOpen==0 ); /* If there is no read-transaction open, open one now. */ rc = lsmBeginReadTrans(pDb); /* Attempt to take the WRITER lock */ if( rc==LSM_OK ){ rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); } /* If the previous writer failed mid-transaction, run emergency rollback. */ if( rc==LSM_OK && pShm->bWriter ){ /* TODO: This! */ assert( 0 ); rc = LSM_CORRUPT_BKPT; } /* Check that this connection is currently reading from the most recent ** version of the database. If not, return LSM_BUSY. */ if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){ rc = LSM_BUSY; } if( rc==LSM_OK ){ rc = lsmLogBegin(pDb); } /* If everything was successful, set the "transaction-in-progress" flag ** and return LSM_OK. Otherwise, if some error occurred, relinquish the ** WRITER lock and return an error code. */ if( rc==LSM_OK ){ pShm->bWriter = 1; pDb->treehdr.iTransId++; }else{ lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb); } return rc; } /* ** End the current write transaction. The connection is left with an open ** read transaction. It is an error to call this if there is no open write ** transaction. ** ** If the transaction was committed, then a commit record has already been ** written into the log file when this function is called. Or, if the ** transaction was rolled back, both the log file and in-memory tree ** structure have already been restored. In either case, this function ** merely releases locks and other resources held by the write-transaction. 
** ** LSM_OK is returned if successful, or an LSM error code otherwise. */ int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){ lsmLogEnd(pDb, bCommit); lsmTreeEndTransaction(pDb, bCommit); lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); return LSM_OK; } /* ** Return non-zero if the caller is holding the client mutex. */ #ifdef LSM_DEBUG int lsmHoldingClientMutex(lsm_db *pDb){ return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex); } #endif /* ** Obtain a read-lock on database version identified by the combination ** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or ** an LSM error code otherwise. */ int lsmReadlock(lsm_db *db, i64 iLsm, i64 iTree){ ShmHeader *pShm = db->pShmhdr; int i; int rc = LSM_OK; assert( db->iReader<0 ); /* Search for an exact match. */ for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){ ShmReader *p = &pShm->aReader[i]; if( p->iLsmId==iLsm && p->iTreeId==iTree ){ rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iTree ){ db->iReader = i; }else if( rc==LSM_BUSY ){ rc = LSM_OK; } } } /* Try to obtain a write-lock on each slot, in order. If successful, set ** the slot values to iLsm/iTree. 
*/ for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){ rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0); if( rc==LSM_BUSY ){ rc = LSM_OK; }else{ ShmReader *p = &pShm->aReader[i]; p->iLsmId = iLsm; p->iTreeId = iTree; rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); if( rc==LSM_OK ) db->iReader = i; } } /* Search for any usable slot */ for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){ ShmReader *p = &pShm->aReader[i]; if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){ rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); if( rc==LSM_OK ){ if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){ db->iReader = i; } }else if( rc==LSM_BUSY ){ rc = LSM_OK; } } } return rc; } static int isInUse(lsm_db *db, i64 iLsm, i64 iTree, int *pbInUse){ ShmHeader *pShm = db->pShmhdr; int i; int rc = LSM_OK; for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){ ShmReader *p = &pShm->aReader[i]; if( p->iLsmId && p->iTreeId && (p->iTreeId<=iTree || p->iLsmId<=iLsm) ){ rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0); if( rc==LSM_OK ){ p->iTreeId = p->iLsmId = 0; lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0); } } } if( rc==LSM_BUSY ){ *pbInUse = 1; return LSM_OK; } *pbInUse = 0; return rc; } int lsmTreeInUse(lsm_db *db, u32 iTreeId, int *pbInUse){ if( db->treehdr.iTreeId==iTreeId ){ *pbInUse = 1; return LSM_OK; } return isInUse(db, 0, (i64)iTreeId, pbInUse); } int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){ if( db->pClient && db->pClient->iId<=iLsmId ){ *pbInUse = 1; return LSM_OK; } return isInUse(db, iLsmId, 0, pbInUse); } /* ** Release the read-lock currently held by connection db. */ int lsmReleaseReadlock(lsm_db *db){ int rc = LSM_OK; if( db->iReader>=0 ){ rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0); db->iReader = -1; } return rc; } /* ** This function may only be called after a successful call to ** lsmDbDatabaseConnect(). 
It returns true if the connection is in ** multi-process mode, or false otherwise. */ int lsmDbMultiProc(lsm_db *pDb){ return pDb->pDatabase && (pDb->pDatabase->pFile!=0); } void lsmDbDeferredClose(lsm_db *pDb, lsm_file *pFile, LsmFile *pLsmFile){ Database *p = pDb->pDatabase; lsm_env *pEnv = pDb->pEnv; lsmMutexEnter(pEnv, p->pClientMutex); pLsmFile->pFile = pFile; pLsmFile->pNext = p->pLsmFile; p->pLsmFile = pLsmFile; lsmMutexLeave(pEnv, p->pClientMutex); } /************************************************************************* ************************************************************************** ************************************************************************** ************************************************************************** ************************************************************************** *************************************************************************/ /* ** Retrieve a pointer to shared-memory chunk iChunk. Chunks are numbered ** starting from 0 (i.e. the header chunk is chunk 0). 
*/ int lsmShmChunk(lsm_db *db, int iChunk, void **ppData){ int rc = LSM_OK; void *pRet = 0; Database *p = db->pDatabase; lsm_env *pEnv = db->pEnv; /* Enter the client mutex */ assert( iChunk>=0 ); lsmMutexEnter(pEnv, p->pClientMutex); if( iChunk>=p->nShmChunk ){ int nNew = iChunk+1; void **apNew; apNew = (void **)lsmRealloc(pEnv, p->apShmChunk, sizeof(void*) * nNew); if( apNew==0 ){ rc = LSM_NOMEM_BKPT; }else{ memset(&apNew[p->nShmChunk], 0, sizeof(void*) * (nNew-p->nShmChunk)); p->apShmChunk = apNew; p->nShmChunk = nNew; } } if( rc==LSM_OK && p->apShmChunk[iChunk]==0 ){ void *pChunk = 0; if( p->pFile==0 ){ /* Single process mode */ pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc); }else{ /* Multi-process mode */ rc = lsmEnvShmMap(pEnv, p->pFile, iChunk, LSM_SHM_CHUNK_SIZE, &pChunk); } p->apShmChunk[iChunk] = pChunk; } if( rc==LSM_OK ){ pRet = p->apShmChunk[iChunk]; } /* Release the client mutex */ lsmMutexLeave(pEnv, p->pClientMutex); *ppData = pRet; return rc; } /* ** Attempt to obtain the lock identified by the iLock and bExcl parameters. ** If successful, return LSM_OK. If the lock cannot be obtained because ** there exists some other conflicting lock, return LSM_BUSY. If some other ** error occurs, return an LSM error code. ** ** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER, ** or else a value returned by the LSM_LOCK_READER macro. */ int lsmShmLock( lsm_db *db, int iLock, int eOp, /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */ int bBlock /* True for a blocking lock */ ){ lsm_db *pIter; const u32 me = (1 << (iLock-1)); const u32 ms = (1 << (iLock+16-1)); int rc = LSM_OK; Database *p = db->pDatabase; assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) ); assert( iLock<=16 ); assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL ); /* Check for a no-op. Proceed only if this is not one of those. 
*/ if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0) || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms) || (eOp==LSM_LOCK_EXCL && (db->mLock & me)==0) ){ int nExcl = 0; /* Number of connections holding EXCLUSIVE */ int nShared = 0; /* Number of connections holding SHARED */ lsmMutexEnter(db->pEnv, p->pClientMutex); /* Figure out the locks currently held by this process on iLock, not ** including any held by connection db. */ for(pIter=p->pConn; pIter; pIter=pIter->pNext){ assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 ); if( pIter!=db ){ if( pIter->mLock & me ){ nExcl++; }else if( pIter->mLock & ms ){ nShared++; } } } assert( nExcl==0 || nExcl==1 ); assert( nExcl==0 || nShared==0 ); assert( nExcl==0 || (db->mLock & (me|ms))==0 ); switch( eOp ){ case LSM_LOCK_UNLOCK: if( nShared==0 ){ lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_UNLOCK); } db->mLock &= ~(me|ms); break; case LSM_LOCK_SHARED: if( nExcl ){ rc = LSM_BUSY; }else{ if( nShared==0 ){ rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_SHARED); } db->mLock |= ms; db->mLock &= ~me; } break; default: assert( eOp==LSM_LOCK_EXCL ); if( nExcl || nShared ){ rc = LSM_BUSY; }else{ rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_EXCL); db->mLock |= (me|ms); } break; } lsmMutexLeave(db->pEnv, p->pClientMutex); } return rc; } #ifdef LSM_DEBUG int shmLockType(lsm_db *db, int iLock){ const u32 me = (1 << (iLock-1)); const u32 ms = (1 << (iLock+16-1)); if( db->mLock & me ) return LSM_LOCK_EXCL; if( db->mLock & ms ) return LSM_LOCK_SHARED; return LSM_LOCK_UNLOCK; } /* ** The arguments passed to this function are similar to those passed to ** the lsmShmLock() function. However, instead of obtaining a new lock ** this function returns true if the specified connection already holds ** (or does not hold) such a lock, depending on the value of eOp. As ** follows: ** ** (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock ** (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock. 
** (eOp==LSM_LOCK_EXCL) -> true if db has an EXCLUSIVE lock on iLock. */ int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){ int ret; int eHave; assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) ); assert( iLock<=16 ); assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL ); eHave = shmLockType(db, iLock); switch( eOp ){ case LSM_LOCK_UNLOCK: ret = (eHave==LSM_LOCK_UNLOCK); break; case LSM_LOCK_SHARED: ret = (eHave!=LSM_LOCK_UNLOCK); break; case LSM_LOCK_EXCL: ret = (eHave==LSM_LOCK_EXCL); break; default: assert( !"bad eOp value passed to lsmShmAssertLock()" ); break; } return ret; } int lsmShmAssertWorker(lsm_db *db){ return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker; } /* ** This function does not contribute to library functionality, and is not ** included in release builds. It is intended to be called from within ** an interactive debugger. ** ** When called, this function prints a single line of human readable output ** to stdout describing the locks currently held by the connection. For ** example: ** ** (gdb) call print_db_locks(pDb) ** (shared on dms2) (exclusive on writer) */ void print_db_locks(lsm_db *db){ int iLock; for(iLock=0; iLock<16; iLock++){ int bOne = 0; const char *azLock[] = {0, "shared", "exclusive"}; const char *azName[] = { 0, "dms1", "dms2", "writer", "worker", "checkpointer", "reader0", "reader1", "reader2", "reader3", "reader4", "reader5" }; int eHave = shmLockType(db, iLock); if( azLock[eHave] ){ printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]); bOne = 1; } } printf("\n"); } void print_all_db_locks(lsm_db *db){ lsm_db *p; for(p=db->pDatabase->pConn; p; p=p->pNext){ printf("%s connection %p ", ((p==db)?"*":""), p); print_db_locks(p); } } #endif void lsmShmBarrier(lsm_db *db){ lsmEnvShmBarrier(db->pEnv); } |
Changes to src/lsm_sorted.c.
︙ | ︙ | |||
260 261 262 263 264 265 266 | int nTree; int *aTree; BtreeCursor *pBtCsr; Snapshot *pSnap; /* Used by cursors flushing the in-memory tree only */ | | | 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | int nTree; int *aTree; BtreeCursor *pBtCsr; Snapshot *pSnap; /* Used by cursors flushing the in-memory tree only */ int *pnOvfl; /* Number of free-list entries to store */ void *pSystemVal; /* Pointer to buffer to free */ }; #define CURSOR_DATA_TREE 0 #define CURSOR_DATA_SYSTEM 1 #define CURSOR_DATA_SEGMENT 2 |
︙ | ︙ | |||
284 285 286 287 288 289 290 | ** flushing the in-memory tree to disk - the new free-list and levels record ** are flushed along with it. ** ** CURSOR_AT_FREELIST ** This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually ** pointing at a free list. ** | < < < < < | 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | ** flushing the in-memory tree to disk - the new free-list and levels record ** are flushed along with it. ** ** CURSOR_AT_FREELIST ** This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually ** pointing at a free list. ** ** CURSOR_IGNORE_SYSTEM ** If set, this cursor ignores system keys. ** ** CURSOR_NEXT_OK ** Set if it is Ok to call lsm_csr_next(). ** ** CURSOR_PREV_OK ** Set if it is Ok to call lsm_csr_prev(). */ #define CURSOR_IGNORE_DELETE 0x00000001 #define CURSOR_NEW_SYSTEM 0x00000002 #define CURSOR_AT_FREELIST 0x00000004 #define CURSOR_IGNORE_SYSTEM 0x00000010 #define CURSOR_NEXT_OK 0x00000020 #define CURSOR_PREV_OK 0x00000040 typedef struct MergeWorker MergeWorker; typedef struct Hierarchy Hierarchy; |
︙ | ︙ | |||
483 484 485 486 487 488 489 490 491 492 493 494 495 496 | static int pageGetFlags(u8 *aData, int nData){ return (int)lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]); } static u8 *pageGetCell(u8 *aData, int nData, int iCell){ return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])]; } /* ** Return the decoded (possibly relative) pointer value stored in cell ** iCell from page aData/nData. */ static int pageGetRecordPtr(u8 *aData, int nData, int iCell){ int iRet; /* Return value */ | > > > > > > > > > | 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 | static int pageGetFlags(u8 *aData, int nData){ return (int)lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]); } static u8 *pageGetCell(u8 *aData, int nData, int iCell){ return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])]; } /* ** Return the number of cells on page pPg. */ static int pageObjGetNRec(Page *pPg){ int nData; u8 *aData = lsmFsPageData(pPg, &nData); return pageGetNRec(aData, nData); } /* ** Return the decoded (possibly relative) pointer value stored in cell ** iCell from page aData/nData. */ static int pageGetRecordPtr(u8 *aData, int nData, int iCell){ int iRet; /* Return value */ |
︙ | ︙ | |||
565 566 567 568 569 570 571 572 573 574 575 576 577 578 | u8 *aData; int nData; u8 *aCell; int eType; aData = fsPageData(pPg, &nData); assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) ); aCell = pageGetCell(aData, nData, iKey); eType = *aCell++; aCell += lsmVarintGet32(aCell, piPtr); if( eType==0 ){ int rc; | > | 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 | u8 *aData; int nData; u8 *aCell; int eType; aData = fsPageData(pPg, &nData); assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) ); assert( iKey>=0 && iKey<pageGetNRec(aData, nData) ); aCell = pageGetCell(aData, nData, iKey); eType = *aCell++; aCell += lsmVarintGet32(aCell, piPtr); if( eType==0 ){ int rc; |
︙ | ︙ | |||
597 598 599 600 601 602 603 | static int btreeCursorLoadKey(BtreeCursor *pCsr){ int rc = LSM_OK; if( pCsr->iPg<0 ){ pCsr->pKey = 0; pCsr->nKey = 0; pCsr->eType = 0; }else{ | > > > > | | | | | | > > > > > | 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 | static int btreeCursorLoadKey(BtreeCursor *pCsr){ int rc = LSM_OK; if( pCsr->iPg<0 ){ pCsr->pKey = 0; pCsr->nKey = 0; pCsr->eType = 0; }else{ int iPg; for(iPg=pCsr->iPg; iPg>=0; iPg--){ int iCell = pCsr->aPg[pCsr->iPg].iCell; if( iCell>=0 ){ int dummy; rc = pageGetBtreeKey( pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell, &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob ); pCsr->eType |= SORTED_SEPARATOR; break; } } if( iPg<0 ) rc = LSM_CORRUPT_BKPT; } return rc; } static int btreeCursorPtr(u8 *aData, int nData, int iCell){ int nCell; |
︙ | ︙ | |||
822 823 824 825 826 827 828 | /* Populate any other aPg[] array entries */ if( rc==LSM_OK && nDepth>1 ){ Blob blob = {0,0,0}; void *pSeek; int nSeek; int iTopicSeek; int dummy; | < > | > > > > > > > > > | | | > | 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 | /* Populate any other aPg[] array entries */ if( rc==LSM_OK && nDepth>1 ){ Blob blob = {0,0,0}; void *pSeek; int nSeek; int iTopicSeek; int dummy; int iPg = 0; int iLoad = pCsr->pSeg->iRoot; Page *pPg = pCsr->aPg[nDepth-1].pPage; if( pageObjGetNRec(pPg)==0 ){ /* This can happen when pPg is the right-most leaf in the b-tree. ** In this case, set the iTopicSeek/pSeek/nSeek key to a value ** greater than any real key. */ assert( iCell==-1 ); iTopicSeek = 1000; pSeek = 0; nSeek = 0; }else{ rc = pageGetBtreeKey(pPg, 0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob ); } do { Page *pPg; rc = lsmFsDbPageGet(pCsr->pFS, iLoad, &pPg); assert( rc==LSM_OK || pPg==0 ); if( rc==LSM_OK ){ u8 *aData; /* Buffer containing page data */ |
︙ | ︙ | |||
1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 | pCsr->aPtr[0].pSeg = &pLevel->lhs; pCsr->nPtr = nPtr; for(i=0; i<pLevel->nRight; i++){ pCsr->aPtr[i+1].pSeg = &pLevel->aRhs[i]; } } return rc; } static int levelCursorInitRun( lsm_db *pDb, Segment *pSeg, | > > > > | 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 | pCsr->aPtr[0].pSeg = &pLevel->lhs; pCsr->nPtr = nPtr; for(i=0; i<pLevel->nRight; i++){ pCsr->aPtr[i+1].pSeg = &pLevel->aRhs[i]; } } if( nPtr>1 && pLevel->pSplitKey==0 ){ lsmSortedSplitkey(pDb, pLevel, &rc); } return rc; } static int levelCursorInitRun( lsm_db *pDb, Segment *pSeg, |
︙ | ︙ | |||
1539 1540 1541 1542 1543 1544 1545 | ){ int iRet; if( pLeft->pPg==0 ){ iRet = 1; }else if( pRight->pPg==0 ){ iRet = 0; }else{ | > > | | | 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 | ){ int iRet; if( pLeft->pPg==0 ){ iRet = 1; }else if( pRight->pPg==0 ){ iRet = 0; }else{ int res = rtTopic(pLeft->eType) - rtTopic(pRight->eType); if( res==0 ){ res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey); } if( res==0 || (res<0 && bLargest==0) || (res>0 && bLargest) ){ iRet = 0; }else{ iRet = 1; } } return iRet; |
︙ | ︙ | |||
1971 1972 1973 1974 1975 1976 1977 | pCsr->pNext = pDb->pCsr; pDb->pCsr = pCsr; } } if( rc==LSM_OK ){ if( useTree ){ | < | 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 | pCsr->pNext = pDb->pCsr; pDb->pCsr = pCsr; } } if( rc==LSM_OK ){ if( useTree ){ rc = lsmTreeCursorNew(pDb, &pCsr->pTreeCsr); } pCsr->pDb = pDb; pCsr->pSnap = pSnap; pCsr->xCmp = pDb->xCmp; if( bUserOnly ){ pCsr->flags |= CURSOR_IGNORE_SYSTEM; |
︙ | ︙ | |||
2024 2025 2026 2027 2028 2029 2030 | } /* ** If the free-block list is not empty, then have this cursor visit a key ** with (a) the system bit set, and (b) the key "F" and (c) a value blob ** containing the entire serialized free-block list. */ | | > | 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 | } /* ** If the free-block list is not empty, then have this cursor visit a key ** with (a) the system bit set, and (b) the key "F" and (c) a value blob ** containing the entire serialized free-block list. */ static void multiCursorVisitFreelist(MultiCursor *pCsr, int *pnOvfl){ assert( pCsr ); pCsr->pnOvfl = pnOvfl; pCsr->flags |= CURSOR_NEW_SYSTEM; } /* ** Allocate a new cursor to read the database (the in-memory tree and all ** levels). If successful, set *ppCsr to point to the new cursor object ** and return SQLITE4_OK. Otherwise, set *ppCsr to NULL and return an |
︙ | ︙ | |||
2114 2115 2116 2117 2118 2119 2120 | case CURSOR_DATA_SYSTEM: if( pCsr->flags & CURSOR_AT_FREELIST ){ pKey = (void *)"FREELIST"; nKey = 8; eType = SORTED_SYSTEM_WRITE; } | < < < < < | 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 | case CURSOR_DATA_SYSTEM: if( pCsr->flags & CURSOR_AT_FREELIST ){ pKey = (void *)"FREELIST"; nKey = 8; eType = SORTED_SYSTEM_WRITE; } break; default: { int iSeg = iKey - CURSOR_DATA_SEGMENT; if( iSeg==pCsr->nSegCsr && pCsr->pBtCsr ){ pKey = pCsr->pBtCsr->pKey; nKey = pCsr->pBtCsr->nKey; |
︙ | ︙ | |||
2156 2157 2158 2159 2160 2161 2162 | lsmTreeCursorValue(pCsr->pTreeCsr, ppVal, pnVal); }else{ *ppVal = 0; *pnVal = 0; } }else if( iVal==CURSOR_DATA_SYSTEM ){ if( pCsr->flags & CURSOR_AT_FREELIST ){ | | > | | | < < < < < | > > > > < < < | < < < < < < < < < | > | | < < < > > | | > > > | 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 | lsmTreeCursorValue(pCsr->pTreeCsr, ppVal, pnVal); }else{ *ppVal = 0; *pnVal = 0; } }else if( iVal==CURSOR_DATA_SYSTEM ){ if( pCsr->flags & CURSOR_AT_FREELIST ){ void *aVal; int nVal; assert( pCsr->pSystemVal==0 ); rc = lsmCheckpointOverflow(pCsr->pDb, &aVal, &nVal, pCsr->pnOvfl); *ppVal = pCsr->pSystemVal = aVal; *pnVal = nVal; }else{ *ppVal = 0; *pnVal = 0; } }else if( iVal-CURSOR_DATA_SEGMENT<pCsr->nSegCsr && segmentCursorValid(&pCsr->aSegCsr[iVal-CURSOR_DATA_SEGMENT]) ){ segmentCursorValue(&pCsr->aSegCsr[iVal-CURSOR_DATA_SEGMENT], ppVal, pnVal); }else{ *ppVal = 0; *pnVal = 0; } assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) ); return rc; } int lsmSortedLoadFreelist( lsm_db *pDb, /* Database handle (must be worker) */ void **ppVal, /* OUT: Blob containing LSM free-list */ int *pnVal /* OUT: Size of *ppVal blob in bytes */ ){ MultiCursor *pCsr = 0; /* Cursor used to retreive free-list */ int rc; /* Return Code */ assert( pDb->pWorker ); assert( *ppVal==0 && *pnVal==0 ); rc = multiCursorAllocate(pDb, 1, &pCsr); if( rc==LSM_OK ){ rc = lsmMCursorLast(pCsr); if( rc==LSM_OK && pCsr->eType==SORTED_SYSTEM_WRITE && pCsr->key.nData==8 && 0==memcmp(pCsr->key.pData, "FREELIST", 8) ){ void *pVal; int nVal; /* Value read from database */ rc = lsmMCursorValue(pCsr, &pVal, &nVal); if( rc==LSM_OK ){ *ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc); if( *ppVal 
){ memcpy(*ppVal, pVal, nVal); *pnVal = nVal; } } } lsmMCursorClose(pCsr); } return rc; } static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){ int i1; int i2; int iRes; |
︙ | ︙ | |||
2423 2424 2425 2426 2427 2428 2429 | int iPtr = 0; if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE; assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE ); assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 ); assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 ); | < | 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 | int iPtr = 0; if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE; assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE ); assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 ); assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 ); pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK); lsmTreeCursorSeek(pCsr->pTreeCsr, pKey, nKey, &res); switch( eESeek ){ case LSM_SEEK_EQ: if( res!=0 ){ lsmTreeCursorReset(pCsr->pTreeCsr); |
︙ | ︙ | |||
2552 2553 2554 2555 2556 2557 2558 | if( iKey==CURSOR_DATA_TREE ){ if( bReverse ){ rc = lsmTreeCursorPrev(pCsr->pTreeCsr); }else{ rc = lsmTreeCursorNext(pCsr->pTreeCsr); } }else if( iKey==CURSOR_DATA_SYSTEM ){ | | < < | < < < < | 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 | if( iKey==CURSOR_DATA_TREE ){ if( bReverse ){ rc = lsmTreeCursorPrev(pCsr->pTreeCsr); }else{ rc = lsmTreeCursorNext(pCsr->pTreeCsr); } }else if( iKey==CURSOR_DATA_SYSTEM ){ assert( pCsr->flags & CURSOR_AT_FREELIST ); assert( pCsr->flags & CURSOR_NEW_SYSTEM ); assert( bReverse==0 ); pCsr->flags &= ~CURSOR_AT_FREELIST; }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nSegCsr) ){ assert( bReverse==0 && pCsr->pBtCsr ); rc = btreeCursorNext(pCsr->pBtCsr); }else{ LevelCursor *pLevel = &pCsr->aSegCsr[iKey-CURSOR_DATA_SEGMENT]; rc = segmentCursorAdvance(pLevel, bReverse); } |
︙ | ︙ | |||
3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 | static int mergeWorkerDone(MergeWorker *pMW){ return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr); } static void sortedFreeLevel(lsm_env *pEnv, Level *p){ if( p ){ lsmFree(pEnv, p->pMerge); lsmFree(pEnv, p->aRhs); lsmFree(pEnv, p); } } static void sortedInvokeWorkHook(lsm_db *pDb){ if( pDb->xWork ){ pDb->xWork(pDb, pDb->pWorkCtx); } } | > | | | > > | > | | | < < < | < < | 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 | static int mergeWorkerDone(MergeWorker *pMW){ return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr); } static void sortedFreeLevel(lsm_env *pEnv, Level *p){ if( p ){ lsmFree(pEnv, p->pSplitKey); lsmFree(pEnv, p->pMerge); lsmFree(pEnv, p->aRhs); lsmFree(pEnv, p); } } static void sortedInvokeWorkHook(lsm_db *pDb){ if( pDb->xWork ){ pDb->xWork(pDb, pDb->pWorkCtx); } } static int sortedNewToplevel( lsm_db *pDb, /* Connection handle */ int bTree, /* True to store contents of in-memory tree */ int *pnOvfl /* OUT: Number of free-list entries stored */ ){ int rc = LSM_OK; /* Return Code */ MultiCursor *pCsr = 0; Level *pNext = 0; /* The current top level */ Level *pNew; /* The new level itself */ Segment *pDel = 0; /* Delete separators from this segment */ int iLeftPtr = 0; assert( pnOvfl ); /* Allocate the new level structure to write to. */ pNext = lsmDbSnapshotLevel(pDb->pWorker); pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc); /* Create a cursor to gather the data required by the new segment. The new ** segment contains everything in the tree and pointers to the next segment ** in the database (if any). 
*/ if( rc==LSM_OK ){ rc = multiCursorNew(pDb, pDb->pWorker, bTree, 0, &pCsr); if( rc==LSM_OK ){ pNew->pNext = pNext; lsmDbSnapshotSetLevel(pDb->pWorker, pNew); } if( rc==LSM_OK ){ if( pNext ){ assert( pNext->pMerge==0 || pNext->nRight>0 ); if( pNext->pMerge==0 ){ if( pNext->lhs.iRoot ){ rc = multiCursorAddLevel(pCsr, pNext, MULTICURSOR_ADDLEVEL_LHS_SEP); if( rc==LSM_OK ){ pDel = &pNext->lhs; } } iLeftPtr = pNext->lhs.iFirst; } }else{ /* The new level will be the only level in the LSM. There is no reason ** to write out delete keys in this case. */ multiCursorIgnoreDelete(pCsr); } } if( rc==LSM_OK ){ multiCursorVisitFreelist(pCsr, pnOvfl); multiCursorReadSeparators(pCsr); } } if( rc!=LSM_OK ){ lsmMCursorClose(pCsr); }else{ Merge merge; /* Merge object used to create new level */ |
︙ | ︙ | |||
3534 3535 3536 3537 3538 3539 3540 | while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){ rc = mergeWorkerStep(&mergeworker); } mergeWorkerShutdown(&mergeworker, &rc); pNew->pMerge = 0; } | < | 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 | while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){ rc = mergeWorkerStep(&mergeworker); } mergeWorkerShutdown(&mergeworker, &rc); pNew->pMerge = 0; } /* Link the new level into the top of the tree. */ if( rc==LSM_OK ){ if( pDel ){ pDel->iRoot = 0; } }else{ |
︙ | ︙ | |||
3569 3570 3571 3572 3573 3574 3575 | ** ** In both cases, the connection hold a worker snapshot reference. In ** the first, the connection also holds the in-memory tree write-version. ** In the second, no in-memory tree version reference is held at all. */ int lsmSortedFlushTree( lsm_db *pDb, /* Connection handle */ | | < < < < | | < | | < | < | < < < < | 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 | ** ** In both cases, the connection hold a worker snapshot reference. In ** the first, the connection also holds the in-memory tree write-version. ** In the second, no in-memory tree version reference is held at all. */ int lsmSortedFlushTree( lsm_db *pDb, /* Connection handle */ int *pnOvfl /* OUT: Number of free-list entries written */ ){ int rc; assert( pDb->pWorker ); /* If there is nothing to do, return early. */ if( lsmTreeSize(pDb)==0 && lsmCheckpointOverflowRequired(pDb)==0 ){ *pnOvfl = 0; return LSM_OK; } rc = sortedNewToplevel(pDb, 1, pnOvfl); assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) ); #if 0 lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "tree flush"); #endif return rc; } /* ** The nMerge levels in the LSM beginning with pLevel consist of a ** left-hand-side segment only. Replace these levels with a single new ** level consisting of a new empty segment on the left-hand-side and the |
︙ | ︙ | |||
3641 3642 3643 3644 3645 3646 3647 | Level *p = pLevel; Level **pp; pNew->nRight = nMerge; pNew->iAge = pLevel->iAge+1; for(i=0; i<nMerge; i++){ pNext = p->pNext; pNew->aRhs[i] = p->lhs; | | | 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 | Level *p = pLevel; Level **pp; pNew->nRight = nMerge; pNew->iAge = pLevel->iAge+1; for(i=0; i<nMerge; i++){ pNext = p->pNext; pNew->aRhs[i] = p->lhs; sortedFreeLevel(pDb->pEnv, p); p = pNext; } /* Replace the old levels with the new. */ pTopLevel = lsmDbSnapshotLevel(pDb->pWorker); pNew->pNext = p; for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext)); |
︙ | ︙ | |||
3796 3797 3798 3799 3800 3801 3802 | int nRemaining = nWork; /* Units of work to do before returning */ Snapshot *pWorker = pDb->pWorker; assert( lsmFsIntegrityCheck(pDb) ); assert( pWorker ); if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK; | < | 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 | int nRemaining = nWork; /* Units of work to do before returning */ Snapshot *pWorker = pDb->pWorker; assert( lsmFsIntegrityCheck(pDb) ); assert( pWorker ); if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK; while( nRemaining>0 ){ Level *pLevel; Level *pTopLevel = lsmDbSnapshotLevel(pWorker); /* Find the longest contiguous run of levels not currently undergoing a ** merge with the same age in the structure. Or the level being merged |
︙ | ︙ | |||
3937 3938 3939 3940 3941 3942 3943 | /* Clean up the MergeWorker object initialized above. If no error ** has occurred, invoke the work-hook to inform the application that ** the database structure has changed. */ mergeWorkerShutdown(&mergeworker, &rc); if( rc==LSM_OK ) sortedInvokeWorkHook(pDb); #if 0 | | | 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 | /* Clean up the MergeWorker object initialized above. If no error ** has occurred, invoke the work-hook to inform the application that ** the database structure has changed. */ mergeWorkerShutdown(&mergeworker, &rc); if( rc==LSM_OK ) sortedInvokeWorkHook(pDb); #if 0 lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work"); #endif } } if( pnWrite ){ *pnWrite = (nWork - nRemaining); |
︙ | ︙ | |||
4034 4035 4036 4037 4038 4039 4040 | /* ** Perform work to merge database segments together. */ int lsm_work(lsm_db *pDb, int flags, int nPage, int *pnWrite){ int rc = LSM_OK; /* Return code */ /* This function may not be called if pDb has an open read or write | | < < > > | > > | > > | | < < < < > | | | | | > > > > > > > | | 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 | /* ** Perform work to merge database segments together. */ int lsm_work(lsm_db *pDb, int flags, int nPage, int *pnWrite){ int rc = LSM_OK; /* Return code */ /* This function may not be called if pDb has an open read or write ** transaction. Return LSM_MISUSE if an application attempts this. */ if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT; /* If the FLUSH flag is set, try to flush the contents of the in-memory ** tree to disk. */ if( (flags & LSM_WORK_FLUSH) ){ rc = lsmBeginWriteTrans(pDb); if( rc==LSM_OK ){ rc = lsmFlushToDisk(pDb); lsmFinishWriteTrans(pDb, 1); lsmFinishReadTrans(pDb); } } if( rc==LSM_OK && nPage>0 ){ int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0); int nWrite = 0; int nOvfl = -1; assert( pDb->pWorker==0 ); rc = lsmBeginWork(pDb); if( rc==LSM_OK ){ rc = sortedWork(pDb, nPage, bOptimize, &nWrite); } if( rc==LSM_OK && nWrite ){ rc = lsmSortedFlushDb(pDb); if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){ rc = sortedNewToplevel(pDb, 0, &nOvfl); } } if( nWrite ){ lsmFinishWork(pDb, 0, nOvfl, &rc); }else{ int rcdummy = LSM_BUSY; lsmFinishWork(pDb, 0, 0, &rcdummy); } assert( pDb->pWorker==0 ); if( pnWrite ) *pnWrite = nWrite; }else if( pnWrite ){ *pnWrite = 0; } /* If the LSM_WORK_CHECKPOINT flag is specified and one is available, ** write a checkpoint out to disk. */ |
︙ | ︙ | |||
4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 | int lsmInfoPageDump(lsm_db *pDb, Pgno iPg, int bHex, char **pzOut){ int rc = LSM_OK; /* Return code */ Snapshot *pWorker; /* Worker snapshot */ Snapshot *pRelease = 0; /* Snapshot to release */ Page *pPg = 0; /* Handle for page iPg */ int i, j; /* Loop counters */ const int perLine = 16; /* Bytes per line in the raw hex dump */ *pzOut = 0; if( iPg==0 ) return LSM_ERROR; /* Obtain the worker snapshot */ pWorker = pDb->pWorker; if( !pWorker ){ | > > > > | > > | 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 | int lsmInfoPageDump(lsm_db *pDb, Pgno iPg, int bHex, char **pzOut){ int rc = LSM_OK; /* Return code */ Snapshot *pWorker; /* Worker snapshot */ Snapshot *pRelease = 0; /* Snapshot to release */ Page *pPg = 0; /* Handle for page iPg */ int i, j; /* Loop counters */ const int perLine = 16; /* Bytes per line in the raw hex dump */ int bEndWork = 0; *pzOut = 0; if( iPg==0 ) return LSM_ERROR; /* Obtain the worker snapshot */ #if 0 pWorker = pDb->pWorker; if( !pWorker ){ rc = lsmBeginWork(pDb); if( rc!=LSM_OK ) return rc; pWorker = pDb->pWorker; bEndWork = 1; } #endif rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg); if( rc==LSM_OK ){ Blob blob = {0, 0, 0, 0}; int nKeyWidth = 0; LsmString str; int nRec; |
︙ | ︙ | |||
4369 4370 4371 4372 4373 4374 4375 | } *pzOut = str.z; sortedBlobFree(&blob); lsmFsPageRelease(pPg); } | < | 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 | } *pzOut = str.z; sortedBlobFree(&blob); lsmFsPageRelease(pPg); } return rc; } void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){ assert( pDb->xLog ); if( pRun && pRun->iFirst ){ char *zSeg; |
︙ | ︙ | |||
4408 4409 4410 4411 4412 4413 4414 | int bKeys, /* Output the keys from each segment */ int bVals, /* Output the values from each segment */ const char *zWhy /* Caption to print near top of dump */ ){ Snapshot *pDump = pSnap; Level *pTopLevel; | < | < < < | 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 | int bKeys, /* Output the keys from each segment */ int bVals, /* Output the values from each segment */ const char *zWhy /* Caption to print near top of dump */ ){ Snapshot *pDump = pSnap; Level *pTopLevel; assert( pSnap ); pTopLevel = lsmDbSnapshotLevel(pDump); if( pDb->xLog && pTopLevel ){ Level *pLevel; int iLevel = 0; lsmLogMessage(pDb, LSM_OK, "Database structure (%s)", zWhy); |
︙ | ︙ | |||
4479 4480 4481 4482 4483 4484 4485 | sortedDumpSegment(pDb, &pLevel->lhs, bVals); for(i=0; i<pLevel->nRight; i++){ sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals); } } } } | < < < < | 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 | sortedDumpSegment(pDb, &pLevel->lhs, bVals); for(i=0; i<pLevel->nRight; i++){ sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals); } } } } } void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){ Level *pNext; Level *p; for(p=pLevel; p; p=pNext){ |
︙ | ︙ |
Changes to src/lsm_tree.c.
︙ | ︙ | |||
46 47 48 49 50 51 52 | ** ** To reduce this overhead, the data structure used for a tree node is ** designed so that it may be edited in place exactly once without ** affecting existing users. In other words, the node structure is capable ** of storing two separate versions of the node at the same time. ** When a node is to be edited, if the node structure already contains ** two versions, a copy is made as in the append-only approach. Or, if | | | 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | ** ** To reduce this overhead, the data structure used for a tree node is ** designed so that it may be edited in place exactly once without ** affecting existing users. In other words, the node structure is capable ** of storing two separate versions of the node at the same time. ** When a node is to be edited, if the node structure already contains ** two versions, a copy is made as in the append-only approach. Or, if ** it only contains a single version, it is edited in place. ** ** This reduces the overhead so that, roughly, one new node structure ** must be allocated for each write (on top of those allocations that ** would have been required by a non-MVCC tree). Logic: Assume that at ** any time, 50% of nodes in the tree already contain 2 versions. When ** a new entry is written to a node, there is a 50% chance that a copy ** of the node will be required. And a 25% chance that a copy of its |
︙ | ︙ | |||
91 92 93 94 95 96 97 | typedef struct TreeKey TreeKey; typedef struct TreeNode TreeNode; typedef struct TreeLeaf TreeLeaf; typedef struct NodeVersion NodeVersion; /* | | > > > > < < > > > | | > | | | < | < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | < < < < | < < < < | < < < < < < < < < < < < > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 | typedef struct TreeKey TreeKey; typedef struct TreeNode TreeNode; typedef struct TreeLeaf TreeLeaf; typedef struct NodeVersion NodeVersion; /* ** Container for a key-value pair. Within the *-shm file, each key/value ** pair is stored in a single allocation (which may not actually be ** contiguous in memory). 
Layout is the TreeKey structure, followed by ** the nKey bytes of key blob, followed by the nValue bytes of value blob ** (if nValue is non-negative). */ struct TreeKey { int nKey; /* Size of pKey in bytes */ int nValue; /* Size of pValue. Or negative. */ }; #define TK_KEY(p) ((void *)&(p)[1]) #define TK_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey)) /* ** A single tree node. A node structure may contain up to 3 key/value ** pairs. Internal (non-leaf) nodes have up to 4 children. ** ** TODO: Update the format of this to be more compact. Get it working ** first though... */ struct TreeNode { u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ /* The following fields are present for interior nodes only, not leaves. */ u32 aiChildPtr[4]; /* Array of pointers to child nodes */ /* The extra child pointer slot. */ u32 iV2; /* Transaction number of v2 */ u8 iV2Child; /* apChild[] entry replaced by pV2Ptr */ u32 iV2Ptr; /* Substitute pointer */ }; struct TreeLeaf { u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ }; typedef struct TreeBlob TreeBlob; struct TreeBlob { int n; u8 *a; }; /* ** Cursor for searching a tree structure. ** ** If a cursor does not point to any element (a.k.a. EOF), then the ** TreeCursor.iNode variable is set to a negative value. Otherwise, the ** cursor currently points to key aiCell[iNode] on node apTreeNode[iNode]. ** ** Entries in the apTreeNode[] and aiCell[] arrays contain the node and ** index of the TreeNode.apChild[] pointer followed to descend to the ** current element. Hence apTreeNode[0] always contains the root node of ** the tree. 
*/ struct TreeCursor { lsm_db *pDb; /* Database handle for this cursor */ int iNode; /* Cursor points at apTreeNode[iNode] */ TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree */ u8 aiCell[MAX_DEPTH]; /* Current position in tree */ TreeKey *pSave; /* Saved key */ TreeBlob blob; /* Dynamic storage for a key */ }; /* ** A value guaranteed to be larger than the largest possible transaction ** id (TreeHeader.iTransId). */ #define WORKING_VERSION (1<<30) static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){ if( n>p->n ){ lsmFree(pDb->pEnv, p->a); p->a = lsmMallocRc(pDb->pEnv, n, pRc); p->n = n; } return (p->a==0); } static void tblobFree(lsm_db *pDb, TreeBlob *p){ lsmFree(pDb->pEnv, p->a); } /*********************************************************************** ** Start of IntArray methods. */ /* ** Append value iVal to the contents of IntArray *p. Return LSM_OK if ** successful, or LSM_NOMEM if an OOM condition is encountered. */ static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){ assert( p->nArray<=p->nAlloc ); if( p->nArray>=p->nAlloc ){ u32 *aNew; int nNew = p->nArray ? p->nArray*2 : 128; aNew = lsmRealloc(pEnv, p->aArray, nNew*sizeof(u32)); if( !aNew ) return LSM_NOMEM_BKPT; p->aArray = aNew; p->nAlloc = nNew; } p->aArray[p->nArray++] = iVal; return LSM_OK; } /* ** Zero the IntArray object. */ static void intArrayFree(lsm_env *pEnv, IntArray *p){ lsmFree(pEnv, p->aArray); memset(p, 0, sizeof(IntArray)); } /* ** Return the number of entries currently in the int-array object. */ static int intArraySize(IntArray *p){ return p->nArray; } /* ** Return a copy of the iIdx'th entry in the int-array. */ static u32 intArrayEntry(IntArray *p, int iIdx){ return p->aArray[iIdx]; } /* ** Truncate the int-array so that all but the first nVal values are ** discarded. */ static void intArrayTruncate(IntArray *p, int nVal){ p->nArray = nVal; } /* End of IntArray methods. 
***********************************************************************/ /* ** The pointer passed as the first argument points to an interior node, ** not a leaf. This function returns the offset of the iCell'th child ** sub-tree of the node. */ static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){ assert( iCell>=0 && iCell<=array_size(p->aiChildPtr) ); if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Child ) return p->iV2Ptr; return p->aiChildPtr[iCell]; } /* ** Given an offset within the *-shm file, return the associated chunk number. */ static int treeOffsetToChunk(u32 iOff){ assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); return (int)(iOff>>15); } /* ** Return a pointer to the mapped memory location associated with *-shm ** file offset iPtr. */ static void *treeShmptr(lsm_db *pDb, u32 iPtr, int *pRc){ /* TODO: This will likely be way too slow. If it is, chunks should be ** cached as part of the db handle. */ if( iPtr && *pRc==0 ){ int rc; void *pChunk; rc = lsmShmChunk(pDb, treeOffsetToChunk(iPtr), &pChunk); if( rc==LSM_OK ){ return &((u8 *)pChunk)[iPtr & (LSM_SHM_CHUNK_SIZE-1)]; } *pRc = rc; } return 0; } static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){ int rcdummy = LSM_OK; return (ShmChunk *)treeShmptr(pDb, iChunk*LSM_SHM_CHUNK_SIZE, &rcdummy); } /* Values for the third argument to treeShmkey(). 
*/ #define TK_LOADKEY 1 #define TK_LOADVAL 2 static TreeKey *treeShmkey( lsm_db *pDb, /* Database handle */ u32 iPtr, /* Shmptr to TreeKey struct */ int eLoad, /* Either zero or a TREEKEY_LOADXXX value */ TreeBlob *pBlob, /* Used if dynamic memory is required */ int *pRc /* IN/OUT: Error code */ ){ TreeKey *pRet; assert( eLoad==TK_LOADKEY || eLoad==TK_LOADVAL ); pRet = (TreeKey *)treeShmptr(pDb, iPtr, pRc); if( pRet ){ int nReq; /* Bytes of space required at pRet */ int nAvail; /* Bytes of space available at pRet */ nReq = sizeof(TreeKey) + pRet->nKey; if( eLoad==TK_LOADVAL && pRet->nValue>0 ){ nReq += pRet->nValue; } assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1)); if( nAvail<nReq ){ if( tblobGrow(pDb, pBlob, nReq, pRc)==0 ){ int nLoad = 0; while( *pRc==LSM_OK ){ ShmChunk *pChunk; void *p = treeShmptr(pDb, iPtr, pRc); int n = LSM_MIN(nAvail, nReq-nLoad); memcpy(&pBlob->a[nLoad], p, n); nLoad += n; if( nLoad==nReq ) break; pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr)); assert( pChunk ); iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR; nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR; } } pRet = (TreeKey *)(pBlob->a); } } return pRet; } #if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT) void assert_leaf_looks_ok(TreeNode *pNode){ assert( pNode->apKey[1] ); } |
︙ | ︙ | |||
243 244 245 246 247 248 249 250 251 | } } #else # define assert_tree_looks_ok(x,y) #endif #ifdef LSM_DEBUG static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){ int i; | > > > > > > > > | > > > | | > > > > | | | < < < < | < < < < < < | < | < | | < < < < | | < | < < < < < < < < < < < < | < | < < | < | < < < < | | | < < | < < < < < < | > | | | | | | | | < < < < < | < | < | | | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | < < < | < < < | < < < | > | < | < < < < | > | > | > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | > | > > | | < < < | > | | > | > > | > | < > > > > > | < | | | | > > > | | | | | > > > > > > > > > | | < < | < | < < < | < < < | | < < < < < < < < < < | | | | | | | | | | | | | | | | > > > | | | | | | | | | | | | | | | | | | | | | > | | | | > > | < | | < < < < | | | | > | | | < | > | | | | | < | | | | | | | | | > | | | | | | | | | > | > | | | > > > > > > > > > > > > > > > > > > > > > > > > > > < < < > > > < > | > | < | < < < < < < < | < < < | | < < | | < < < < | > | > | | | > | | | | | | > | | > > | > | < < | < < < < < < < < < > | > < | | | < > | > | | > > | | > > | > > | > | | > | | | | | > | | > < | | > | > | > | > | | | | < < | > | > < | | > | > | > | > > | | | < < | > | | > > | > > | > > > > > | | < | | | | > | > > | | | > | > | < > | | > | | > > > > > | > > > > > > > > > > > > > > > | | > | | < < < | | < < < < | < | > | < | | | < < | < < < < < < < < < < < < < < < < < | > | | | | | | > | > > > > > > > > > | | < < < < < < < < < < < < < | | < > | | | > | < | < | | | < | < > | | < | < < | < < < < < < | > > > | < < > > < < > | > > | < > | > | | < 
< > > | < < < < | | > < < > > > | | < < | < > > > | > > > | < < | > | | | | < < | < < < < > > | < < < | | > > > > > | < < < < > > < < < < < < < | > | | < < > | < < | < | < < | | | 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 
817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 
1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 | } } #else # define assert_tree_looks_ok(x,y) #endif #ifdef LSM_DEBUG /* ** Pointer pBlob points to a buffer containing a blob of binary data ** nBlob bytes long. Append the contents of this blob to *pStr, with ** each octet represented by a 2-digit hexadecimal number. 
For example, ** if the input blob is three bytes in size and contains {0x01, 0x44, 0xFF}, ** then "0144ff" is appended to *pStr. */ static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){ int i; lsmStringExtend(pStr, nBlob*2); if( pStr->nAlloc==0 ) return; for(i=0; i<nBlob; i++){ u8 c = ((u8*)pBlob)[i]; if( c>='a' && c<='z' ){ pStr->z[pStr->n++] = c; }else{ pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf]; pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf]; } } pStr->z[pStr->n] = 0; } /* ** Append nIndent space (0x20) characters to string *pStr. */ static void lsmAppendIndent(LsmString *pStr, int nIndent){ int i; lsmStringExtend(pStr, nIndent); for(i=0; i<nIndent; i++) lsmStringAppend(pStr, " ", 1); } void dump_node_contents( lsm_db *pDb, u32 iNode, /* Print out hte contents of this node */ int nIndent, /* Number of spaces indentation */ int nHeight /* Height: (0==leaf) (1==parent-of-leaf) */ ){ int i; int rc = LSM_OK; LsmString s; TreeNode *pNode; TreeBlob b = {0, 0}; /* Append the nIndent bytes of space to string s. */ lsmStringInit(&s, pDb->pEnv); if( nIndent ) lsmAppendIndent(&s, nIndent); pNode = (TreeNode *)treeShmptr(pDb, iNode, &rc); /* Append each key to string s. */ for(i=0; i<3; i++){ u32 iPtr = pNode->aiKeyPtr[i]; if( iPtr ){ TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i], TK_LOADKEY, &b, &rc); lsmAppendStrBlob(&s, TK_KEY(pKey), pKey->nKey); lsmStringAppend(&s, " ", -1); } } printf("%s\n", s.z); lsmStringClear(&s); for(i=0; i<4 && nHeight>0; i++){ u32 iPtr = getChildPtr(pNode, pDb->treehdr.iTransId, i); if( iPtr ){ dump_node_contents(pDb, iPtr, nIndent + 2, nHeight-1); } } tblobFree(pDb, &b); } void dump_tree_contents(lsm_db *pDb, const char *zCaption){ printf("\n%s\n", zCaption); if( pDb->treehdr.iRoot ){ dump_node_contents(pDb, pDb->treehdr.iRoot, 0, pDb->treehdr.nHeight-1); } fflush(stdout); } #endif /* ** Initialize a cursor object, the space for which has already been ** allocated. 
*/
static void treeCursorInit(lsm_db *pDb, TreeCursor *pCsr){
  memset(pCsr, 0, sizeof(TreeCursor));
  pCsr->pDb = pDb;
  pCsr->iNode = -1;               /* -1 means "not pointing at any entry" */
}

/*
** Return a pointer to the mapping of the TreeKey object that the cursor
** is pointing to. The key is loaded with the TK_LOADVAL flag, using
** *pBlob as the buffer argument passed to treeShmkey(). Errors are
** reported via *pRc.
**
** The cursor must have pCsr->iNode>=0 when this is called.
*/
static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){
  return (TreeKey *)treeShmkey(pCsr->pDb,
      pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]],
      TK_LOADVAL, pBlob, pRc
  );
}

/*
** Save the current position of tree cursor pCsr.
**
** If no position is already saved, a pointer to the current key (if the
** cursor points at an entry) is stored in pCsr->pSave and the cursor is
** then marked as not pointing at any node (iNode==-1). Returns LSM_OK or
** the error code produced while loading the key.
*/
int lsmTreeCursorSave(TreeCursor *pCsr){
  int rc = LSM_OK;
  if( pCsr->pSave==0 ){
    int iNode = pCsr->iNode;
    if( iNode>=0 ){
      pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc);
    }
    pCsr->iNode = -1;
  }
  return rc;
}

/*
** Restore the position of a saved tree cursor.
**
** If a position is saved, it is cleared. Then, if pRes is not NULL, the
** cursor is re-seeked to the saved key and *pRes receives the result of
** that seek (see lsmTreeCursorSeek()). If pRes is NULL the saved position
** is simply discarded. If no position is saved this is a no-op and *pRes
** is left untouched.
*/
static int treeCursorRestore(TreeCursor *pCsr, int *pRes){
  int rc = LSM_OK;
  if( pCsr->pSave ){
    TreeKey *pKey = pCsr->pSave;
    pCsr->pSave = 0;
    if( pRes ){
      rc = lsmTreeCursorSeek(pCsr, TK_KEY(pKey), pKey->nKey, pRes);
    }
  }
  return rc;
}

/*
** Allocate nByte bytes of space within the *-shm file and return the
** offset of the allocated space. If bAlign is true, the returned offset
** is a multiple of 4.
**
** This is a no-op that returns 0 if *pRc is not LSM_OK when the function
** is called. If lsmTreeInUse() fails, its error code is stored in *pRc
** and 0 is returned.
**
** nByte must not exceed the usable size of one chunk
** (LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR).
*/
static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){
  u32 iRet = 0;
  if( *pRc==LSM_OK ){
    const static int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE;
    const static int CHUNK_HDR = LSM_SHM_CHUNK_HDR;
    u32 iWrite;                   /* Current write offset */
    u32 iEof;                     /* End of current chunk */
    int iChunk;                   /* Current chunk */

    assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) );

    /* Check if there is enough space on the current chunk to fit the
    ** new allocation. If not, link in a new chunk and put the new
    ** allocation at the start of it.  */
    iWrite = pDb->treehdr.iWrite;
    if( bAlign ){
      /* Round iWrite up to the next multiple of 4 */
      iWrite = (iWrite + 3) & ~0x0003;
      assert( (iWrite % 4)==0 );
    }

    assert( iWrite );
    /* Use (iWrite-1) so that a write offset sitting exactly on a chunk
    ** boundary is attributed to the chunk that was just filled. */
    iChunk = treeOffsetToChunk(iWrite-1);
    iEof = (iChunk+1) * CHUNK_SIZE;
    assert( iEof>=iWrite && (iEof-iWrite)<CHUNK_SIZE );
    if( (iWrite+nByte)>iEof ){
      ShmChunk *pHdr;           /* Header of chunk just finished (iChunk) */
      ShmChunk *pFirst;         /* Header of chunk treehdr.iFirst */
      int iNext = 0;            /* Next chunk */
      int rc;

      /* Check if the chunk at the start of the linked list is still in
      ** use. If not, reuse it. If so, allocate a new chunk by appending
      ** to the *-shm file.  */
      if( pDb->treehdr.iFirst!=iChunk ){
        int bInUse;
        pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst);
        rc = lsmTreeInUse(pDb, pFirst->iLastTree, &bInUse);
        if( rc!=LSM_OK ){
          *pRc = rc;
          return 0;
        }
        if( bInUse==0 ){
          /* Unlink the first chunk from the list so it can be reused */
          iNext = pDb->treehdr.iFirst;
          pDb->treehdr.iFirst = pFirst->iNext;
          pFirst->iNext = 0;
          pFirst->iLastTree = 0;
          assert( pDb->treehdr.iFirst );
          assert( pFirst->iLastTree<pDb->treehdr.iTreeId );
        }
      }
      /* No reusable chunk was found. Take a brand new chunk number. */
      if( iNext==0 ) iNext = pDb->treehdr.nChunk++;

      /* Set the header values for the chunk just finished */
      /* NOTE(review): pHdr is dereferenced without checking the result of
      ** treeShmptr() or *pRc - confirm the mapping cannot fail here. */
      pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE, pRc);
      pHdr->iLastTree = pDb->treehdr.iTreeId;
      pHdr->iNext = iNext;

      /* Advance to the next chunk */
      iWrite = iNext * CHUNK_SIZE + CHUNK_HDR;
    }

    /* Allocate space at iWrite. */
    iRet = iWrite;
    pDb->treehdr.iWrite = iWrite + nByte;
    pDb->treehdr.nByte += nByte;
  }
  return iRet;
}

/*
** Allocate and zero nByte bytes of space within the *-shm file. 
*/ static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){ u32 iPtr; void *p; iPtr = treeShmalloc(pDb, 1, nByte, pRc); p = treeShmptr(pDb, iPtr, pRc); if( p ){ assert( *pRc==LSM_OK ); memset(p, 0, nByte); *piPtr = iPtr; } return p; } static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){ return treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc); } static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){ return treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc); } static TreeKey *newTreeKey( lsm_db *pDb, u32 *piPtr, void *pKey, int nKey, /* Key data */ void *pVal, int nVal, /* Value data (or nVal<0 for delete) */ int *pRc ){ TreeKey *p; u32 iPtr; int nRem; u8 *a; int n; #if 0 nRem = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0); *piPtr = iPtr = treeShmalloc(pDb, 1, nRem, pRc); p = treeShmptr(pDb, iPtr, pRc); if( *pRc ) return 0; p->nKey = nKey; p->nValue = nVal; memcpy(&p[1], pKey, nKey); if( nVal>0 ) memcpy(((u8 *)&p[1]) + nKey, pVal, nVal); return p; #endif /* Allocate space for the TreeKey structure itself */ *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc); p = treeShmptr(pDb, iPtr, pRc); if( *pRc ) return 0; p->nKey = nKey; p->nValue = nVal; /* Allocate and populate the space required for the key and value. 
*/ n = nRem = nKey; a = (u8 *)pKey; while( a ){ while( nRem>0 ){ u8 *aAlloc; int nAlloc; u32 iWrite; iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1)); iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR); nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), nRem); aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc), pRc); if( aAlloc==0 ) break; memcpy(aAlloc, &a[n-nRem], nAlloc); nRem -= nAlloc; } a = pVal; n = nRem = nVal; pVal = 0; } if( *pRc ) return 0; #if 0 printf("store: %d %s\n", (int)iPtr, (char *)pKey); #endif return p; } static TreeNode *copyTreeNode( lsm_db *pDb, TreeNode *pOld, u32 *piNew, int *pRc ){ TreeNode *pNew; pNew = newTreeNode(pDb, piNew, pRc); if( pNew ){ memcpy(pNew->aiKeyPtr, pOld->aiKeyPtr, sizeof(pNew->aiKeyPtr)); memcpy(pNew->aiChildPtr, pOld->aiChildPtr, sizeof(pNew->aiChildPtr)); if( pOld->iV2 ) pNew->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr; } return pNew; } static TreeNode *copyTreeLeaf( lsm_db *pDb, TreeLeaf *pOld, u32 *piNew, int *pRc ){ TreeLeaf *pNew; pNew = newTreeLeaf(pDb, piNew, pRc); if( pNew ){ memcpy(pNew, pOld, sizeof(TreeLeaf)); } return (TreeNode *)pNew; } /* ** The tree cursor passed as the second argument currently points to an ** internal node (not a leaf). Specifically, to a sub-tree pointer. This ** function replaces the sub-tree that the cursor currently points to ** with sub-tree pNew. ** ** The sub-tree may be replaced either by writing the "v2 data" on the ** internal node, or by allocating a new TreeNode structure and then ** calling this function on the parent of the internal node. */ static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){ int rc = LSM_OK; if( pCsr->iNode<0 ){ /* iNew is the new root node */ pDb->treehdr.iRoot = iNew; }else{ /* If this node already has version 2 content, allocate a copy and ** update the copy with the new pointer value. Otherwise, store the ** new pointer as v2 data within the current node structure. 
*/ TreeNode *p; /* The node to be modified */ int iChildPtr; /* apChild[] entry to modify */ p = pCsr->apTreeNode[pCsr->iNode]; iChildPtr = pCsr->aiCell[pCsr->iNode]; if( p->iV2 ){ /* The "allocate new TreeNode" option */ u32 iCopy; TreeNode *pCopy; pCopy = copyTreeNode(pDb, p, &iCopy, &rc); if( pCopy ){ assert( rc==LSM_OK ); pCopy->aiChildPtr[iChildPtr] = iNew; pCsr->iNode--; rc = treeUpdatePtr(pDb, pCsr, iCopy); } }else{ /* The "v2 data" option */ u32 iPtr; assert( pDb->treehdr.iTransId>0 ); if( pCsr->iNode ){ iPtr = getChildPtr( pCsr->apTreeNode[pCsr->iNode-1], pDb->treehdr.iTransId, pCsr->aiCell[pCsr->iNode-1] ); }else{ iPtr = pDb->treehdr.iRoot; } rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr); if( rc==LSM_OK ){ p->iV2 = pDb->treehdr.iTransId; p->iV2Child = (u8)iChildPtr; p->iV2Ptr = iNew; } } } return rc; } /* ** Cursor pCsr points at a node that is part of pTree. This function ** inserts a new key and optionally child node pointer into that node. ** ** The position into which the new key and pointer are inserted is ** determined by the iSlot parameter. The new key will be inserted to ** the left of the key currently stored in apKey[iSlot]. Or, if iSlot is ** greater than the index of the rightmost key in the node. ** ** Pointer pLeftPtr points to a child tree that contains keys that are ** smaller than pTreeKey. */ static int treeInsert( lsm_db *pDb, /* Database handle */ TreeCursor *pCsr, /* Cursor indicating path to insert at */ u32 iLeftPtr, /* Left child pointer */ u32 iTreeKey, /* Location of key to insert */ u32 iRightPtr, /* Right child pointer */ int iSlot /* Position to insert key into */ ){ int rc = LSM_OK; TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; /* Check if the node is currently full. If so, split pNode in two and ** call this function recursively to add a key to the parent. Otherwise, ** insert the new key directly into pNode. 
*/ assert( pNode->aiKeyPtr[1] ); if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){ u32 iLeft; TreeNode *pLeft; /* New left-hand sibling node */ u32 iRight; TreeNode *pRight; /* New right-hand sibling node */ pLeft = newTreeNode(pDb, &iLeft, &rc); pRight = newTreeNode(pDb, &iRight, &rc); if( rc ) return rc; pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0); pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0]; pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1); pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2); pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2]; pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3); if( pCsr->iNode==0 ){ /* pNode is the root of the tree. Grow the tree by one level. */ u32 iRoot; TreeNode *pRoot; /* New root node */ pRoot = newTreeNode(pDb, &iRoot, &rc); pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1]; pRoot->aiChildPtr[1] = iLeft; pRoot->aiChildPtr[2] = iRight; pDb->treehdr.iRoot = iRoot; pDb->treehdr.nHeight++; }else{ pCsr->iNode--; rc = treeInsert(pDb, pCsr, iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode] ); } assert( pLeft->iV2==0 ); assert( pRight->iV2==0 ); switch( iSlot ){ case 0: pLeft->aiKeyPtr[0] = iTreeKey; pLeft->aiChildPtr[0] = iLeftPtr; if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr; break; case 1: pLeft->aiChildPtr[3] = (iRightPtr ? iRightPtr : pLeft->aiChildPtr[2]); pLeft->aiKeyPtr[2] = iTreeKey; pLeft->aiChildPtr[2] = iLeftPtr; break; case 2: pRight->aiKeyPtr[0] = iTreeKey; pRight->aiChildPtr[0] = iLeftPtr; if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr; break; case 3: pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]); pRight->aiKeyPtr[2] = iTreeKey; pRight->aiChildPtr[2] = iLeftPtr; break; } }else{ TreeNode *pNew; u32 *piKey; u32 *piChild; u32 iStore = 0; u32 iNew = 0; int i; /* Allocate a new version of node pNode. 
*/ pNew = newTreeNode(pDb, &iNew, &rc); if( rc ) return rc; piKey = pNew->aiKeyPtr; piChild = pNew->aiChildPtr; for(i=0; i<iSlot; i++){ if( pNode->aiKeyPtr[i] ){ *(piKey++) = pNode->aiKeyPtr[i]; *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i); } } *piKey++ = iTreeKey; *piChild++ = iLeftPtr; iStore = iRightPtr; for(i=iSlot; i<3; i++){ if( pNode->aiKeyPtr[i] ){ *(piKey++) = pNode->aiKeyPtr[i]; *(piChild++) = iStore ? iStore : getChildPtr(pNode, WORKING_VERSION, i); iStore = 0; } } if( iStore ){ *piChild = iStore; }else{ *piChild = getChildPtr(pNode, WORKING_VERSION, (pNode->aiKeyPtr[2] ? 3 : 2) ); } pCsr->iNode--; rc = treeUpdatePtr(pDb, pCsr, iNew); } return rc; } static int treeInsertLeaf( lsm_db *pDb, /* Database handle */ TreeCursor *pCsr, /* Cursor structure */ u32 iTreeKey, /* Key pointer to insert */ int iSlot /* Insert key to the left of this */ ){ int rc = LSM_OK; /* Return code */ TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode]; TreeLeaf *pNew; u32 iNew; assert( iSlot>=0 && iSlot<=4 ); assert( pCsr->iNode>0 ); assert( pLeaf->aiKeyPtr[1] ); pCsr->iNode--; pNew = newTreeLeaf(pDb, &iNew, &rc); if( pNew ){ if( pLeaf->aiKeyPtr[0] && pLeaf->aiKeyPtr[2] ){ /* The leaf is full. Split it in two. */ TreeLeaf *pRight; u32 iRight; pRight = newTreeLeaf(pDb, &iRight, &rc); if( pRight ){ assert( rc==LSM_OK ); pNew->aiKeyPtr[1] = pLeaf->aiKeyPtr[0]; pRight->aiKeyPtr[1] = pLeaf->aiKeyPtr[2]; switch( iSlot ){ case 0: pNew->aiKeyPtr[0] = iTreeKey; break; case 1: pNew->aiKeyPtr[2] = iTreeKey; break; case 2: pRight->aiKeyPtr[0] = iTreeKey; break; case 3: pRight->aiKeyPtr[2] = iTreeKey; break; } rc = treeInsert(pDb, pCsr, iNew, pLeaf->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode] ); } }else{ int iOut = 0; int i; for(i=0; i<4; i++){ if( i==iSlot ) pNew->aiKeyPtr[iOut++] = iTreeKey; if( i<3 && pLeaf->aiKeyPtr[i] ){ pNew->aiKeyPtr[iOut++] = pLeaf->aiKeyPtr[i]; } } rc = treeUpdatePtr(pDb, pCsr, iNew); } } return rc; } /* ** Empty the contents of the in-memory tree. 
*/ void lsmTreeClear(lsm_db *pDb){ pDb->treehdr.iTreeId++; pDb->treehdr.iTransId = 1; pDb->treehdr.iRoot = 0; pDb->treehdr.nHeight = 0; pDb->treehdr.nByte = 0; } /* ** This function is called during recovery to initialize the ** tree header. Only the database connections private copy of the tree-header ** is initialized here - it will be copied into shared memory if log file ** recovery is successful. */ void lsmTreeInit(lsm_db *pDb){ pDb->treehdr.iTransId = 1; pDb->treehdr.iFirst = 1; pDb->treehdr.nChunk = 2; pDb->treehdr.iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR; pDb->treehdr.iTreeId = 1; } /* ** Insert a new entry into the in-memory tree. ** ** If the value of the 5th parameter, nVal, is negative, then a delete-marker ** is inserted into the tree. In this case the value pointer, pVal, must be ** NULL. */ int lsmTreeInsert( lsm_db *pDb, /* Database handle */ void *pKey, /* Pointer to key data */ int nKey, /* Size of key data in bytes */ void *pVal, /* Pointer to value data (or NULL) */ int nVal /* Bytes in value data (or -ve for delete) */ ){ int rc = LSM_OK; /* Return Code */ TreeKey *pTreeKey; /* New key-value being inserted */ int nTreeKey; /* Number of bytes allocated at pTreeKey */ u32 iTreeKey; u8 *a; TreeHeader *pHdr = &pDb->treehdr; assert( nVal>=0 || pVal==0 ); assert_tree_looks_ok(LSM_OK, pTree); #if 0 dump_tree_contents(pDb, "before"); #endif /* Allocate and populate a new key-value pair structure */ pTreeKey = newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc); if( rc!=LSM_OK ) return rc; if( pHdr->iRoot==0 ){ /* The tree is completely empty. Add a new root node and install ** (pKey/nKey) as the middle entry. Even though it is a leaf at the ** moment, use newTreeNode() to allocate the node (i.e. allocate enough ** space for the fields used by interior nodes). This is because the ** treeInsert() routine may convert this node to an interior node. 
*/ TreeNode *pRoot = newTreeNode(pDb, &pHdr->iRoot, &rc); if( rc==LSM_OK ){ assert( pHdr->nHeight==0 ); pRoot->aiKeyPtr[1] = iTreeKey; pHdr->nHeight = 1; } }else{ TreeCursor csr; int res; /* Seek to the leaf (or internal node) that the new key belongs on */ treeCursorInit(pDb, &csr); lsmTreeCursorSeek(&csr, pKey, nKey, &res); if( res==0 ){ /* The search found a match within the tree. */ TreeNode *pNew; u32 iNew; TreeNode *pNode = csr.apTreeNode[csr.iNode]; int iCell = csr.aiCell[csr.iNode]; /* Create a copy of this node */ if( (csr.iNode>0 && csr.iNode==(pHdr->nHeight-1)) ){ pNew = copyTreeLeaf(pDb, (TreeLeaf *)pNode, &iNew, &rc); }else{ pNew = copyTreeNode(pDb, pNode, &iNew, &rc); } if( rc==LSM_OK ){ /* Modify the value in the new version */ pNew->aiKeyPtr[iCell] = iTreeKey; /* Change the pointer in the parent (if any) to point at the new ** TreeNode */ csr.iNode--; treeUpdatePtr(pDb, &csr, iNew); } }else{ /* The cursor now points to the leaf node into which the new entry should ** be inserted. There may or may not be a free slot within the leaf for ** the new key-value pair. ** ** iSlot is set to the index of the key within pLeaf that the new key ** should be inserted to the left of (or to a value 1 greater than the ** index of the rightmost key if the new key is larger than all keys ** currently stored in the node). */ int iSlot = csr.aiCell[csr.iNode] + (res<0); if( csr.iNode==0 ){ rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot); }else{ rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot); } } tblobFree(pDb, &csr.blob); } #if 0 dump_tree_contents(pDb, "after"); #endif assert_tree_looks_ok(rc, pTree); return rc; } /* ** Return, in bytes, the amount of memory currently used by the tree ** structure. */ int lsmTreeSize(lsm_db *pDb){ return pDb->treehdr.nByte; } /* ** Open a cursor on the in-memory tree pTree. 
*/ int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **ppCsr){ TreeCursor *pCsr; *ppCsr = pCsr = lsmMalloc(pDb->pEnv, sizeof(TreeCursor)); if( pCsr ){ treeCursorInit(pDb, pCsr); return LSM_OK; } return LSM_NOMEM_BKPT; } /* ** Close an in-memory tree cursor. */ void lsmTreeCursorDestroy(TreeCursor *pCsr){ if( pCsr ){ tblobFree(pCsr->pDb, &pCsr->blob); lsmFree(pCsr->pDb->pEnv, pCsr); } } void lsmTreeCursorReset(TreeCursor *pCsr){ pCsr->iNode = -1; pCsr->pSave = 0; } #ifndef NDEBUG static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey){ TreeKey *p; int cmp = 0; int rc = LSM_OK; assert( pCsr->iNode>=0 ); p = csrGetKey(pCsr, &pCsr->blob, &rc); if( p ){ cmp = pCsr->pDb->xCmp(TK_KEY(p), p->nKey, pKey, nKey); } return cmp; } #endif /* ** Attempt to seek the cursor passed as the first argument to key (pKey/nKey) ** in the tree structure. If an exact match for the key is found, leave the ** cursor pointing to it and set *pRes to zero before returning. If an ** exact match cannot be found, do one of the following: ** ** * Leave the cursor pointing to the smallest element in the tree that ** is larger than the key and set *pRes to +1, or ** ** * Leave the cursor pointing to the largest element in the tree that ** is smaller than the key and set *pRes to -1, or ** ** * If the tree is empty, leave the cursor at EOF and set *pRes to -1. */ int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){ int rc = LSM_OK; /* Return code */ lsm_db *pDb = pCsr->pDb; TreeHeader *pHdr = &pCsr->pDb->treehdr; int (*xCmp)(void *, int, void *, int) = pDb->xCmp; u32 iNodePtr; /* Location of current node in search */ /* Discard any saved position data */ treeCursorRestore(pCsr, 0); iNodePtr = pDb->treehdr.iRoot; if( iNodePtr==0 ){ /* Either an error occurred or the tree is completely empty. 
*/ assert( rc!=LSM_OK || pDb->treehdr.iRoot==0 ); *pRes = -1; pCsr->iNode = -1; }else{ TreeBlob b = {0, 0}; int res = 0; /* Result of comparison function */ int iNode = -1; while( iNodePtr ){ TreeNode *pNode; /* Node at location iNodePtr */ int iTest; /* Index of second key to test (0 or 2) */ TreeKey *pTreeKey; /* Key to compare against */ pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); iNode++; pCsr->apTreeNode[iNode] = pNode; /* Compare (pKey/nKey) with the key in the middle slot of B-tree node ** pNode. The middle slot is never empty. If the comparison is a match, ** then the search is finished. Break out of the loop. */ pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TK_LOADKEY, &b, &rc); if( rc!=LSM_OK ) break; res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); if( res==0 ){ pCsr->aiCell[iNode] = 1; break; } /* Based on the results of the previous comparison, compare (pKey/nKey) ** to either the left or right key of the B-tree node, if such a key ** exists. */ iTest = (res>0 ? 
0 : 2); pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[iTest], TK_LOADKEY, &b, &rc); if( rc ) break; if( pTreeKey==0 ){ iTest = 1; }else{ res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); if( res==0 ){ pCsr->aiCell[iNode] = iTest; break; } } if( iNode<(pHdr->nHeight-1) ){ iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iTest + (res<0)); }else{ iNodePtr = 0; } pCsr->aiCell[iNode] = iTest + (iNodePtr && (res<0)); } *pRes = res; pCsr->iNode = iNode; tblobFree(pDb, &b); } /* assert() that *pRes has been set properly */ #ifndef NDEBUG if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){ int cmp = treeCsrCompare(pCsr, pKey, nKey); assert( *pRes==cmp || (*pRes ^ cmp)>0 ); } #endif return rc; } int lsmTreeCursorNext(TreeCursor *pCsr){ #ifndef NDEBUG TreeKey *pK1; TreeBlob key1 = {0, 0}; #endif lsm_db *pDb = pCsr->pDb; const int iLeaf = pDb->treehdr.nHeight-1; int iCell; int rc = LSM_OK; TreeNode *pNode; /* Restore the cursor position, if required */ int iRestore = 0; treeCursorRestore(pCsr, &iRestore); if( iRestore>0 ) return LSM_OK; /* Save a pointer to the current key. This is used in an assert() at the ** end of this function - to check that the 'next' key really is larger ** than the current key. */ #ifndef NDEBUG pK1 = csrGetKey(pCsr, &key1, &rc); if( rc!=LSM_OK ) return rc; #endif assert( lsmTreeCursorValid(pCsr) ); assert( pCsr->aiCell[pCsr->iNode]<3 ); pNode = pCsr->apTreeNode[pCsr->iNode]; iCell = ++pCsr->aiCell[pCsr->iNode]; /* If the current node is not a leaf, and the current cell has sub-tree ** associated with it, descend to the left-most key on the left-most ** leaf of the sub-tree. 
*/ if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){ do { u32 iNodePtr; pCsr->iNode++; iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell); pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); pCsr->apTreeNode[pCsr->iNode] = pNode; iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0); }while( pCsr->iNode < iLeaf ); } /* Otherwise, the next key is found by following pointer up the tree ** until there is a key immediately to the right of the pointer followed ** to reach the sub-tree containing the current key. */ else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){ while( (--pCsr->iNode)>=0 ){ iCell = pCsr->aiCell[pCsr->iNode]; if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; } } #ifndef NDEBUG if( pCsr->iNode>=0 ){ TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)>0 ); } tblobFree(pDb, &key1); #endif return rc; } int lsmTreeCursorPrev(TreeCursor *pCsr){ #ifndef NDEBUG TreeKey *pK1; TreeBlob key1 = {0, 0}; #endif lsm_db *pDb = pCsr->pDb; const int iLeaf = pDb->treehdr.nHeight-1; int iCell; int rc = LSM_OK; TreeNode *pNode; /* Restore the cursor position, if required */ int iRestore = 0; treeCursorRestore(pCsr, &iRestore); if( iRestore<0 ) return LSM_OK; /* Save a pointer to the current key. This is used in an assert() at the ** end of this function - to check that the 'next' key really is smaller ** than the current key. */ #ifndef NDEBUG pK1 = csrGetKey(pCsr, &key1, &rc); if( rc!=LSM_OK ) return rc; #endif assert( lsmTreeCursorValid(pCsr) ); pNode = pCsr->apTreeNode[pCsr->iNode]; iCell = pCsr->aiCell[pCsr->iNode]; assert( iCell>=0 && iCell<3 ); /* If the current node is not a leaf, and the current cell has sub-tree ** associated with it, descend to the right-most key on the right-most ** leaf of the sub-tree. 
*/ if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){ do { u32 iNodePtr; pCsr->iNode++; iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell); pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); if( rc!=LSM_OK ) break; pCsr->apTreeNode[pCsr->iNode] = pNode; iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf); pCsr->aiCell[pCsr->iNode] = iCell; }while( pCsr->iNode < iLeaf ); } /* Otherwise, the next key is found by following pointer up the tree until ** there is a key immediately to the left of the pointer followed to reach ** the sub-tree containing the current key. */ else{ do { iCell = pCsr->aiCell[pCsr->iNode]-1; if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; }while( (--pCsr->iNode)>=0 ); pCsr->aiCell[pCsr->iNode] = iCell; } #ifndef NDEBUG if( pCsr->iNode>=0 ){ TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)<0 ); } tblobFree(pDb, &key1); #endif return rc; } /* ** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the ** in-memory tree. */ int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){ lsm_db *pDb = pCsr->pDb; TreeHeader *pHdr = &pDb->treehdr; int rc = LSM_OK; u32 iNodePtr; pCsr->iNode = -1; /* Discard any saved position data */ treeCursorRestore(pCsr, 0); iNodePtr = pHdr->iRoot; while( iNodePtr ){ int iCell; TreeNode *pNode; pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); if( rc ) break; if( bLast ){ iCell = ((pNode->aiKeyPtr[2]==0) ? 2 : 3); }else{ iCell = ((pNode->aiKeyPtr[0]==0) ? 
1 : 0); } pCsr->iNode++; pCsr->apTreeNode[pCsr->iNode] = pNode; if( pCsr->iNode<pHdr->nHeight-1 ){ iNodePtr = getChildPtr(pNode, pHdr->iTransId, iCell); }else{ iNodePtr = 0; } pCsr->aiCell[pCsr->iNode] = iCell - (iNodePtr==0 && bLast); } return rc; } int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey){ TreeKey *pTreeKey; int rc = LSM_OK; assert( lsmTreeCursorValid(pCsr) ); pTreeKey = pCsr->pSave; if( !pTreeKey ){ pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); } if( rc==LSM_OK ){ *pnKey = pTreeKey->nKey; *ppKey = (void *)&pTreeKey[1]; } return rc; } int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){ int res = 0; int rc; rc = treeCursorRestore(pCsr, &res); if( res==0 ){ TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); if( rc==LSM_OK ){ *pnVal = pTreeKey->nValue; if( pTreeKey->nValue>=0 ){ *ppVal = TK_VAL(pTreeKey); }else{ *ppVal = 0; } } }else{ *ppVal = 0; *pnVal = 0; } return rc; } /* ** Return true if the cursor currently points to a valid entry. */ int lsmTreeCursorValid(TreeCursor *pCsr){ return (pCsr && (pCsr->pSave || pCsr->iNode>=0)); } /* ** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a ** pointer to the same TreeMark structure may be used to roll the tree ** contents back to their current state. */ void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){ pMark->iRoot = pDb->treehdr.iRoot; pMark->nHeight = pDb->treehdr.nHeight; pMark->iWrite = pDb->treehdr.iWrite; pMark->nChunk = pDb->treehdr.nChunk; pMark->iFirst = pDb->treehdr.iFirst; pMark->iRollback = intArraySize(&pDb->rollback); } /* ** Roll back to mark pMark. Structure *pMark should have been previously ** populated by a call to lsmTreeMark(). */ void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){ int rcdummy = LSM_OK; int iIdx; int nIdx; u32 iNext; ShmChunk *pChunk; u32 iChunk; /* Revert all required v2 pointers. 
*/ nIdx = intArraySize(&pDb->rollback); for(iIdx = pMark->iRollback; iIdx<nIdx; iIdx++){ TreeNode *pNode; pNode = treeShmptr(pDb, intArrayEntry(&pDb->rollback, iIdx), &rcdummy); assert( pNode && rcdummy==LSM_OK ); pNode->iV2 = 0; pNode->iV2Child = 0; pNode->iV2Ptr = 0; } intArrayTruncate(&pDb->rollback, pMark->iRollback); /* Restore the free-chunk list */ assert( pMark->iWrite!=0 ); iChunk = treeOffsetToChunk(pMark->iWrite-1); pChunk = treeShmChunk(pDb, iChunk); iNext = pChunk->iNext; pChunk->iNext = 0; assert( iNext==0 || pDb->treehdr.iFirst==pMark->iFirst || iNext==pMark->iFirst ); pDb->treehdr.iFirst = pMark->iFirst; while( iNext ){ iChunk = iNext; pChunk = treeShmChunk(pDb, iChunk); iNext = pChunk->iNext; if( iChunk<pMark->nChunk ){ pChunk->iNext = pDb->treehdr.iFirst; pChunk->iLastTree = 0; } } /* Restore the tree-header fields */ pDb->treehdr.iRoot = pMark->iRoot; pDb->treehdr.nHeight = pMark->nHeight; pDb->treehdr.iWrite = pMark->iWrite; pDb->treehdr.nChunk = pMark->nChunk; } static void treeHeaderChecksum( TreeHeader *pHdr, u32 *aCksum ){ u32 cksum1 = 0x12345678; u32 cksum2 = 0x9ABCDEF0; u32 *a = (u32 *)pHdr; int i; assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) ); assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 ); for(i=0; i<(offsetof(TreeHeader, aCksum) / sizeof(u32)); i+=2){ cksum1 += a[i]; cksum2 += (cksum1 + a[i+1]); } aCksum[0] = cksum1; aCksum[1] = cksum2; } /* ** Return true if the checksum stored in TreeHeader object *pHdr is ** consistent with the contents of its other fields. */ static int treeHeaderChecksumOk(TreeHeader *pHdr){ u32 aCksum[2]; treeHeaderChecksum(pHdr, aCksum); return (0==memcmp(aCksum, pHdr->aCksum, sizeof(aCksum))); } /* ** Load the in-memory tree header from shared-memory into pDb->treehdr. ** If the header cannot be loaded, return LSM_BUSY. 
*/ int lsmTreeLoadHeader(lsm_db *pDb){ while( 1 ){ int rc; ShmHeader *pShm = pDb->pShmhdr; memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader)); if( treeHeaderChecksumOk(&pDb->treehdr) ) return LSM_OK; rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); if( rc==LSM_BUSY ){ usleep(50); }else{ if( rc==LSM_OK ){ if( treeHeaderChecksumOk(&pShm->hdr1)==0 ){ memcpy(&pShm->hdr1, &pShm->hdr2, sizeof(TreeHeader)); } memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader)); lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); if( treeHeaderChecksumOk(&pDb->treehdr)==0 ){ rc = LSM_CORRUPT_BKPT; } } return rc; } } } /* ** This function is called to conclude a transaction. If argument bCommit ** is true, the transaction is committed. Otherwise it is rolled back. */ int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){ ShmHeader *pShm = pDb->pShmhdr; if( bCommit ){ treeHeaderChecksum(&pDb->treehdr, pDb->treehdr.aCksum); memcpy(&pShm->hdr2, &pDb->treehdr, sizeof(TreeHeader)); lsmShmBarrier(pDb); memcpy(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)); } pShm->bWriter = 0; intArrayFree(pDb->pEnv, &pDb->rollback); return LSM_OK; } /* ** Begin a new transaction. */ int lsmTreeBeginTransaction(lsm_db *pDb){ pDb->treehdr.iTransId++; return LSM_OK; } |
Changes to src/lsm_unix.c.
︙ | ︙ | |||
32 33 34 35 36 37 38 | #include <stdio.h> #include <ctype.h> #include <unistd.h> #include <errno.h> #include <sys/mman.h> | < | > | > | | > > > > > > > > > > > > > > | 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 | #include <stdio.h> #include <ctype.h> #include <unistd.h> #include <errno.h> #include <sys/mman.h> #include "lsmInt.h" /* ** An open file is an instance of the following object */ typedef struct PosixFile PosixFile; struct PosixFile { lsm_env *pEnv; /* The run-time environment */ const char *zName; /* Full path to file */ int fd; /* The open file descriptor */ int shmfd; /* Shared memory file-descriptor */ void *pMap; /* Pointer to mapping of file fd */ off_t nMap; /* Size of mapping at pMap in bytes */ int nShm; /* Number of entries in array apShm[] */ void **apShm; /* Array of 32K shared memory segments */ }; static int lsm_ioerr(void){ return LSM_IOERR; } static char *posixShmFile(PosixFile *p){ char *zShm; int nName = strlen(p->zName); zShm = (char *)lsmMalloc(p->pEnv, nName+4+1); if( zShm ){ memcpy(zShm, p->zName, nName); memcpy(&zShm[nName], "-shm", 5); } return zShm; } static int lsmPosixOsOpen( lsm_env *pEnv, const char *zFile, lsm_file **ppFile ){ int rc = LSM_OK; PosixFile *p; p = lsm_malloc(pEnv, sizeof(PosixFile)); if( p==0 ){ rc = LSM_NOMEM; }else{ memset(p, 0, sizeof(PosixFile)); p->zName = zFile; p->pEnv = pEnv; p->fd = open(zFile, O_RDWR|O_CREAT, 0644); if( p->fd<0 ){ lsm_free(pEnv, p); p = 0; rc = lsm_ioerr(); } |
︙ | ︙ | |||
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | prc = fstat(p->fd, &buf); if( prc!=0 ) return LSM_IOERR_BKPT; memcpy(pBuf, &buf.st_dev, sizeof(buf.st_dev)); memcpy(&(((u8 *)pBuf)[sizeof(buf.st_dev)]), &buf.st_ino, sizeof(buf.st_ino)); return LSM_OK; } static int lsmPosixOsClose(lsm_file *pFile){ PosixFile *p = (PosixFile *)pFile; if( p->pMap ) munmap(p->pMap, p->nMap); close(p->fd); lsm_free(p->pEnv, p); return LSM_OK; } | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > < < < < < | 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 | prc = fstat(p->fd, &buf); if( prc!=0 ) return LSM_IOERR_BKPT; memcpy(pBuf, &buf.st_dev, sizeof(buf.st_dev)); memcpy(&(((u8 *)pBuf)[sizeof(buf.st_dev)]), &buf.st_ino, sizeof(buf.st_ino)); return LSM_OK; } static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){ int prc = unlink(zFile); return prc ? 
LSM_IOERR_BKPT : LSM_OK; } int lsmPosixOsLock(lsm_file *pFile, int iLock, int eType){ int rc = LSM_OK; PosixFile *p = (PosixFile *)pFile; static const short aType[3] = { F_UNLCK, F_RDLCK, F_WRLCK }; struct flock lock; assert( aType[LSM_LOCK_UNLOCK]==F_UNLCK ); assert( aType[LSM_LOCK_SHARED]==F_RDLCK ); assert( aType[LSM_LOCK_EXCL]==F_WRLCK ); assert( eType>=0 && eType<array_size(aType) ); assert( iLock>0 && iLock<=16 ); memset(&lock, 0, sizeof(lock)); lock.l_whence = SEEK_SET; lock.l_len = 1; lock.l_type = aType[eType]; lock.l_start = (4096-iLock); if( fcntl(p->fd, F_SETLK, &lock) ){ int e = errno; if( e==EACCES || e==EAGAIN ){ rc = LSM_BUSY; }else{ rc = LSM_IOERR; } } return LSM_OK; } int lsmPosixOsShmMap(lsm_file *pFile, int iChunk, int sz, void **ppShm){ PosixFile *p = (PosixFile *)pFile; *ppShm = 0; assert( sz==LSM_SHM_CHUNK_SIZE ); if( iChunk>=p->nShm ){ int i; void **apNew; int nNew = iChunk+1; off_t nReq = nNew * LSM_SHM_CHUNK_SIZE; struct stat sStat; /* If the shared-memory file has not been opened, open it now. */ if( p->shmfd<=0 ){ char *zShm = posixShmFile(p); if( !zShm ) return LSM_NOMEM_BKPT; p->shmfd = open(zShm, O_RDWR|O_CREAT, 0644); lsmFree(p->pEnv, zShm); if( p->shmfd<0 ){ return LSM_IOERR_BKPT; } } /* If the shared-memory file is not large enough to contain the ** requested chunk, cause it to grow. 
*/ if( fstat(p->shmfd, &sStat) ){ return LSM_IOERR_BKPT; } if( sStat.st_size<nReq ){ if( ftruncate(p->shmfd, nReq) ){ return LSM_IOERR_BKPT; } } apNew = (void **)lsmRealloc(p->pEnv, p->apShm, sizeof(void *) * nNew); if( !apNew ) return LSM_NOMEM_BKPT; for(i=p->nShm; i<nNew; i++){ apNew[i] = 0; } p->apShm = apNew; p->nShm = nNew; } if( p->apShm[iChunk]==0 ){ p->apShm[iChunk] = mmap(0, LSM_SHM_CHUNK_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, p->shmfd, iChunk*LSM_SHM_CHUNK_SIZE ); if( p->apShm[iChunk]==0 ) return LSM_IOERR; } *ppShm = p->apShm[iChunk]; return LSM_OK; } void lsmPosixOsShmBarrier(void){ } int lsmPosixOsShmUnmap(lsm_file *pFile, int bDelete){ PosixFile *p = (PosixFile *)pFile; if( p->shmfd>0 ){ int i; for(i=0; i<p->nShm; i++){ if( p->apShm[i] ){ munmap(p->apShm[i], LSM_SHM_CHUNK_SIZE); p->apShm[i] = 0; } } close(p->shmfd); p->shmfd = 0; if( bDelete ){ char *zShm = posixShmFile(p); if( zShm ) unlink(zShm); } } return LSM_OK; } static int lsmPosixOsClose(lsm_file *pFile){ PosixFile *p = (PosixFile *)pFile; lsmPosixOsShmUnmap(pFile, 0); if( p->pMap ) munmap(p->pMap, p->nMap); close(p->fd); lsm_free(p->pEnv, p); return LSM_OK; } /**************************************************************************** ** Memory allocation routines. */ #define ROUND8(x) (((x)+7)&~7) #define BLOCK_HDR_SIZE ROUND8( sizeof(sqlite4_size_t) ) static void *lsmPosixOsMalloc(lsm_env *pEnv, int N){ |
︙ | ︙ | |||
528 529 530 531 532 533 534 535 536 537 538 539 540 541 | lsmPosixOsTruncate, /* xTruncate */ lsmPosixOsSync, /* xSync */ lsmPosixOsSectorSize, /* xSectorSize */ lsmPosixOsRemap, /* xRemap */ lsmPosixOsFileid, /* xFileid */ lsmPosixOsClose, /* xClose */ lsmPosixOsUnlink, /* xUnlink */ /***** memory allocation *********/ 0, /* pMemCtx */ lsmPosixOsMalloc, /* xMalloc */ lsmPosixOsRealloc, /* xRealloc */ lsmPosixOsFree, /* xFree */ lsmPosixOsMSize, /* xSize */ /***** mutexes *********************/ | > > > > | 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 | lsmPosixOsTruncate, /* xTruncate */ lsmPosixOsSync, /* xSync */ lsmPosixOsSectorSize, /* xSectorSize */ lsmPosixOsRemap, /* xRemap */ lsmPosixOsFileid, /* xFileid */ lsmPosixOsClose, /* xClose */ lsmPosixOsUnlink, /* xUnlink */ lsmPosixOsLock, /* xLock */ lsmPosixOsShmMap, /* xShmMap */ lsmPosixOsShmBarrier, /* xShmBarrier */ lsmPosixOsShmUnmap, /* xShmUnmap */ /***** memory allocation *********/ 0, /* pMemCtx */ lsmPosixOsMalloc, /* xMalloc */ lsmPosixOsRealloc, /* xRealloc */ lsmPosixOsFree, /* xFree */ lsmPosixOsMSize, /* xSize */ /***** mutexes *********************/ |
︙ | ︙ |
Changes to test/attach.test.
︙ | ︙ | |||
20 21 22 23 24 25 26 | ifcapable !attach { finish_test return } for {set i 2} {$i<=15} {incr i} { | | < | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | ifcapable !attach { finish_test return } for {set i 2} {$i<=15} {incr i} { db_delete test$i.db } do_test attach-1.1 { execsql { CREATE TABLE t1(a,b); INSERT INTO t1 VALUES(1,2); INSERT INTO t1 VALUES(3,4); |
︙ | ︙ |
Added test/ckpt1.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | # 2012 August 29 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #*********************************************************************** # The tests in this file focus on testing that very large checkpoints # (those that occur when the database contains an unusually large number # of levels or free blocks) are handled correctly. # set testdir [file dirname $argv0] source $testdir/tester.tcl set testprefix ckpt1 # Check that lsm_config(AUTOWORK) seems to be connected to something. 
#
# Setting autowork to 0 or 1 is expected to return the value just set.
# Passing -1 is expected to return the value established by the preceding
# call (tests 1.3 and 1.5) - presumably a query that leaves the setting
# unchanged; confirm against the lsm_config() implementation.
do_test 1.1 { sqlite4_lsm_config db main autowork 0 } 0
do_test 1.2 { sqlite4_lsm_config db main autowork 1 } 1
do_test 1.3 { sqlite4_lsm_config db main autowork -1 } 1
do_test 1.4 { sqlite4_lsm_config db main autowork 0 } 0
do_test 1.5 { sqlite4_lsm_config db main autowork -1 } 0

# Number of close/reopen/insert cycles run by test 2.1.
set nLevel 200
do_execsql_test 2.0 {
  CREATE TABLE t1(a INTEGER PRIMARY KEY, b INTEGER UNIQUE)
}

# Each iteration closes and reopens the database, disables autowork and
# inserts a single row. Afterwards the table must contain $nLevel rows and
# pass the integrity check.
# NOTE(review): presumably each cycle leaves one more level in the database
# because autowork is off - confirm.
do_test 2.1 {
  for {set i 1} {$i <= $nLevel} {incr i} {
    db close
    sqlite4 db test.db
    sqlite4_lsm_config db main autowork 0
    db eval { INSERT INTO t1 VALUES($i, $i || $i) }
  }
  db eval {
    SELECT count(*) FROM t1;
    PRAGMA integrity_check;
  }
} [list $nLevel ok]

#-------------------------------------------------------------------------
# The point of this test is to add a large number of blocks to the
# free-block list and check that this doesn't seem to cause any
# obvious problems.
#
# Test 3.0 starts from a fresh database file opened with a block size of
# 65536 bytes (set through the URI parameter).
do_test 3.0 {
  db close
  forcedelete test.db
  sqlite4 db file:test.db?lsm_block_size=65536
  execsql {
    CREATE TABLE t1(a PRIMARY KEY, b);
    CREATE INDEX i1 ON t1(b);
  }
} {}

# Insert one row, then double the table 16 times. The trailing SQL comments
# give the row count after each statement (65536 rows in total).
do_execsql_test 3.1 {
  INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100));
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 2
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 4
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 8
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 16
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 32
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 64
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 128
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 256
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 512
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 1K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 2K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 4K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 8K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 16K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 32K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 64K
}

# Run the "-optimize" work command, then check that no rows were lost.
do_test 3.2 {
  sqlite4_lsm_work db main -optimize 1000000
  execsql { SELECT count(*) FROM t1 }
} {65536}

# The row count must be unchanged after closing and reopening the database.
do_test 3.3 {
  db close
  sqlite4 db test.db
  execsql { SELECT count(*) FROM t1 }
} {65536}

# One more insert followed by another "-optimize" pass must also succeed.
do_test 3.4 {
  execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) }
  sqlite4_lsm_work db main -optimize 1000000
  execsql { SELECT count(*) FROM t1 }
} {65537}

finish_test |
Changes to test/manydb.test.
︙ | ︙ | |||
15 16 17 18 19 20 21 | # # $Id: manydb.test,v 1.4 2008/11/21 00:10:35 aswift Exp $ set testdir [file dirname $argv0] source $testdir/tester.tcl set N 300 | < < < < < < < | < < < | | 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | # # $Id: manydb.test,v 1.4 2008/11/21 00:10:35 aswift Exp $ set testdir [file dirname $argv0] source $testdir/tester.tcl set N 300 set num_fd_per_openwrite_db 4 # First test how many file descriptors are available for use. To open a # database for writing SQLite requires 3 file descriptors (the database, the # journal and the directory). set filehandles {} catch { for {set i 0} {$i<($N * $num_fd_per_openwrite_db)} {incr i} { lappend filehandles [open testfile.1 w] } } foreach fd $filehandles { close $fd } catch { |
︙ | ︙ |
Changes to test/permutations.test.
︙ | ︙ | |||
129 130 131 132 133 134 135 | # quick # full # lappend ::testsuitelist xxx test_suite "src4" -prefix "" -description { } -files { | > | > > | 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | # quick # full # lappend ::testsuitelist xxx test_suite "src4" -prefix "" -description { } -files { simple.test log1.test log2.test log3.test csr1.test ckpt1.test aggerror.test attach.test autoindex1.test badutf.test between.test bigrow.test |
︙ | ︙ |
Changes to test/test_lsm.c.
︙ | ︙ | |||
33 34 35 36 37 38 39 40 41 42 43 44 45 46 | int iVal; } aParam[] = { { "log-size", LSM_CONFIG_LOG_SIZE }, { "safety", LSM_CONFIG_SAFETY }, { "write-buffer", LSM_CONFIG_WRITE_BUFFER }, { "mmap", LSM_CONFIG_MMAP }, { "page-size", LSM_CONFIG_PAGE_SIZE }, { 0, 0 } }; const char *zDb; /* objv[1] as a string */ const char *zName; /* objv[2] as a string */ int iParam; /* Second argument for lsm_config() */ int iConfig = -1; /* Third argument for lsm_config() */ | > | 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | int iVal; } aParam[] = { { "log-size", LSM_CONFIG_LOG_SIZE }, { "safety", LSM_CONFIG_SAFETY }, { "write-buffer", LSM_CONFIG_WRITE_BUFFER }, { "mmap", LSM_CONFIG_MMAP }, { "page-size", LSM_CONFIG_PAGE_SIZE }, { "autowork", LSM_CONFIG_AUTOWORK }, { 0, 0 } }; const char *zDb; /* objv[1] as a string */ const char *zName; /* objv[2] as a string */ int iParam; /* Second argument for lsm_config() */ int iConfig = -1; /* Third argument for lsm_config() */ |
︙ | ︙ |
Changes to test/tester.tcl.
︙ | ︙ | |||
17 18 19 20 21 22 23 24 25 26 27 28 29 30 | # The commands provided by the code in this file to help with creating # test cases are as follows: # # Commands to manipulate the db and the file-system at a high level: # # copy_file FROM TO # delete_file FILENAME # drop_all_tables ?DB? # forcecopy FROM TO # forcedelete FILENAME # # Test the capability of the SQLite version built into the interpreter to # determine if a specific test can be run: # | > | 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | # The commands provided by the code in this file to help with creating # test cases are as follows: # # Commands to manipulate the db and the file-system at a high level: # # copy_file FROM TO # delete_file FILENAME # db_delete DBNAME # drop_all_tables ?DB? # forcecopy FROM TO # forcedelete FILENAME # # Test the capability of the SQLite version built into the interpreter to # determine if a specific test can be run: # |
︙ | ︙ | |||
355 356 357 358 359 360 361 362 363 364 365 366 | # If the --binarylog option was specified, create the logging VFS. This # call installs the new VFS as the default for all SQLite connections. # if {$cmdlinearg(binarylog)} { vfslog new binarylog {} vfslog.bin } } # Create a test database # proc reset_db {} { catch {db close} | > > > > > > > > > > | < | 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 | # If the --binarylog option was specified, create the logging VFS. This # call installs the new VFS as the default for all SQLite connections. # if {$cmdlinearg(binarylog)} { vfslog new binarylog {} vfslog.bin } } # Delete all files associated with LSM database $file. That is: # # ${file} # ${file}-log # ${file}-shm # proc db_delete {file} { forcedelete $file $file-shm $file-log } # Create a test database # proc reset_db {} { catch {db close} db_delete test.db sqlite4 db ./test.db set ::DB [sqlite4_connection_pointer db] if {[info exists ::SETUP_SQL]} { db eval $::SETUP_SQL } } reset_db |
︙ | ︙ | |||
1032 1033 1034 1035 1036 1037 1038 | # Delete the files test.db and test2.db, then execute the TCL and # SQL (in that order) to prepare for the test case. do_test $testname.$n.1 { set ::sqlite_io_error_pending 0 catch {db close} catch {db2 close} | | < | < | 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 | # Delete the files test.db and test2.db, then execute the TCL and # SQL (in that order) to prepare for the test case. do_test $testname.$n.1 { set ::sqlite_io_error_pending 0 catch {db close} catch {db2 close} catch {db_delete test.db} catch {db_delete test2.db} set ::DB [sqlite4 db test.db; sqlite4_connection_pointer db] sqlite4_extended_result_codes $::DB $::ioerropts(-erc) if {[info exists ::ioerropts(-tclprep)]} { eval $::ioerropts(-tclprep) } if {[info exists ::ioerropts(-sqlprep)]} { execsql $::ioerropts(-sqlprep) |
︙ | ︙ | |||
1464 1465 1466 1467 1468 1469 1470 | db36231 close hexio_write test.db 28 $A hexio_write test.db 92 $B return "" } proc db_save {} { | | | | | 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 | db36231 close hexio_write test.db 28 $A hexio_write test.db 92 $B return "" } proc db_save {} { db_delete sv_test.db foreach f [glob -nocomplain test.db*] { set f2 "sv_$f" forcecopy $f $f2 } } proc db_save_and_close {} { db_save catch { db close } return "" } proc db_restore {} { db_delete test.db foreach f2 [glob -nocomplain sv_test.db*] { set f [string range $f2 3 end] forcecopy $f2 $f } } proc db_restore_and_reopen {{dbfile test.db}} { catch { db close } db_restore sqlite4 db $dbfile } proc db_delete_and_reopen {{file test.db}} { catch { db close } db_delete $file sqlite4 db $file } # Do an SQL statement. Append the search count to the end of the result. # proc count {sql} { kvwrap reset |
︙ | ︙ |
Changes to tool/lsmview.tcl.
︙ | ︙ | |||
140 141 142 143 144 145 146 | $C bind $tid <1> [list segment_callback $C $maintag $segment] $C bind $tid <Enter> [list segment_info $C $segment] $C bind $tid <Leave> [list segment_info $C {}] } proc segment_info {C segment} { set w $C | | > > > | 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | $C bind $tid <1> [list segment_callback $C $maintag $segment] $C bind $tid <Enter> [list segment_info $C $segment] $C bind $tid <Leave> [list segment_info $C {}] } proc segment_info {C segment} { set w $C while {[winfo class $w]!="Frame"} { set w [winfo parent $w] if {$w==""} return } set w $w.info if {$segment==""} { $w config -text "" } else { foreach {iFirst iLast iRoot nSize} $segment break $w config -text "first: $iFirst last: $iLast\nroot: $iRoot size: $nSize" } |
︙ | ︙ |
Added www/shm.wiki.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 | <title>Multi-process LSM Notes</title> <nowiki> <p> Notes on the changes required for LSM to allow connections from multiple 
processes. In other words, notes to do with the contents of the *-shm file and the way they are accessed and manipulated. <h2>Contents of shared memory</h2> <p> Like SQLite 3 WAL mode, LSM uses a *-shm file. It uses the same "dead man switch" mechanism to ensure it is always initialized to zero when the first client connects. <p> The *-shm file contains: <ol> <li> A flag indicating whether or not the *-shm file has been initialized (log file recovered into in-memory tree, header fields loaded etc.) <li> The meta-page number to which a checkpoint was last successfully written. <li> The client snapshot. <li> The worker snapshot. <li> The in-memory tree. This takes up most of the space in the file. </ol> <p> The client and worker snapshots are in the same format as those stored in the header of the database file itself. <p> Sometimes data from the meta-page identified by the header field is required. For example it is necessary to know the id of the last checkpointed snapshot in order to determine which free blocks are safe to reuse. The associated log file offset is also required to determine when the log file may be wrapped. These quantities are read directly from the meta-page in the database itself as required. <h2>File locks</h2> <p> LSM uses the same ideas as SQLite in WAL mode. Both SHARED and EXCLUSIVE locks are required. There are three exclusive locks: <ul> <li> WRITER: Required to write to in-memory tree and its log file. <li> WORKER: Required to write to body of database file. <li> CHECKPOINTER: Required to write to database file header. </ul> <p> Only one client may hold each of these locks at one time. In other words, each of the above is implemented as an exclusive lock on a range of bytes in the file. <p> There are also N separate locks held by readers. These locks also work like WAL locks in that they are a combination of a lock and a value. In WAL mode the value is a 32-bit integer. For LSM, it will be two 64-bit integers - an in-memory tree id and a snapshot id. 
<h2>Memory allocation</h2> <p> Within the *-shm file, memory is allocated in 32KB chunks. <p> The first chunk of the file is the header chunk. It contains: <ol> <li> The client snapshot (4KB) <li> The worker snapshot (4KB) <li> The "initialized" flag (4 bytes) <li> The meta-page number containing the last checkpoint written (4 bytes) <li> The in-memory tree headers (see below). </ol> <p> The second and subsequent chunks are used to store the in-memory tree data. <p> The in-memory tree structure is essentially an append-only rb-tree with some modifications to reduce the amount of data written. Multiple trees will sometimes be present in the file, to cope with circumstances like the following: <ul> <li> Writer builds tree A. <li> Reader takes a read lock on tree A. <li> Tree A is flushed to the db. <li> Writer begins building tree B. <li> Reader continues reading from tree A. </ul> <p> In this case, the chunks used by tree A may not be reused until after the active read transaction has concluded. <p> Each chunk begins with three 32-bit integer fields: <ul> <li> Id of first tree for which data is stored on the chunk, <li> Id of last tree for which data is stored on the chunk, <li> Chunk number of chunk written after this one (or zero, if this is the most recently written chunk). </ul> <p> The third field described above links all tree chunks in the file, in-use or otherwise, into a single list. To allocate a new chunk, a writer first checks if the chunk at the head of the list can be recycled. If so, it moves it to the end of the list and begins writing to it. Otherwise, it allocates a new chunk at the end of the file, appends that to the list and continues writing. <p><b>Crash recovery: But, what happens if a writer crashes while writing a transaction to the database?</b> <p>If a writer crashes during a write transaction, readers can often continue as normal. However, the next writer must roll back any changes made to the db before it can commence a new transaction. 
Or, if a writer fails when updating the in-memory tree header, it may not be possible for readers to continue. This is resolved by having one reader become a writer, restore the db, then "commit" the empty transaction. <p> The pattern used by a writer is: <ol> <li> Obtain WRITER lock. This is a barrier operation (on Linux, an fcntl(F_SETLK)). <li> Update shared memory region. <li> Release WRITER lock. Another barrier (on Linux, another F_SETLK). </ol> <p> Or, if a failure occurs during step 2, the unlock operation is done automatically by the OS. Either way, assume that the unlock is also a barrier (see Documentation/memory-barriers.txt in kernel source tree). It can therefore be assumed that from the point of view of the subsequent writer, all writes to the shared memory region completed by the failed writer appear to have been performed in order - there is no need to worry that the hardware has reordered the writes made by the failed writer. The compiler may reorder them, of course, but this should be easy enough to avoid. <p> Also assumed is that 32-bit writes are atomic, in the sense that it is not possible for a failure in a writer process to result in some bits of a 32-bit word being updated and some remaining in their original state. <p> Crashes are then managed by the following: <ul> <li>When a write transaction is opened, a flag is set in the in-memory tree header. This indicates that a transaction is underway. The same flag is cleared right before the WRITER lock is released to commit or roll back the transaction. <li>When a recyclable chunk is moved from the start of the linked list to the end, the first thing done is that the "first tree" field is updated. Then the "last tree". Then the header pointer is set to point to the next element in the list. <li>If the header flag is already set when the writer grabs the WRITER lock, then a crash must have occurred. In this case the free-list must be recovered. 
<li>Recovering the free list involves two steps: First a linear scan of the current tree to identify those chunks in use (and also for another reason, see below). Second, a scan of the remainder of the file checking the "first tree" field of all chunks that either belong to an earlier tree or appear to belong to the current tree but are not linked in anywhere. Based on this, the new writer can rebuild the free-list. </ul> <h2>In-memory tree format</h2> <p> Header fields: <ul> <li> 32-bits: Tree id (incremented for each new tree). <li> 32-bits: Transaction id (incremented for each new transaction). <li> 32-bits: Pointer to head of tree (an offset within the *-shm file). <li> 32-bits: Height of tree. <li> 64-bits: Last checkpoint id for which log file space has already been reclaimed. <li> DbLog structure (see lsmInt.h). <li> 32-bits: Header checksum 1. <li> 32-bits: Header checksum 2. </ul> <p> There are two copies of the in-memory tree header. Both stored on the *-shm header chunk. Copy 1 and copy 2. <p> To commit a transaction, a writer does the following: <ol> <li> Updates copy 2 of the header, <li> Invokes a memory barrier, <li> Updates copy 1 of the header, <li> Clears the "transaction in progress flag", <li> Drops the WRITER lock. </ol> <p> To open a read transaction, the reader: <ol> <li> Reads copy 1 of the header. <li> If the checksum fails, attempt to obtain the WRITER lock. If successful, do the equivalent of opening and committing an empty transaction (see below). Either way, return to 1 and attempt to reread the in-memory tree header. If copy 1 cannot be read within some reasonable amount of time...? <li> Read the client snapshot from shared memory. If the checksum fails, attempt to obtain the WORKER lock. If successful, copy the worker snapshot over the client snapshot and drop the WORKER lock. Successful or otherwise, attempt to reread the snapshot. If this cannot be completed within some reasonable amount of time...? 
<li> Grab a read-lock corresponding to the tree id and snapshot ids just read (note: assume that this is a memory barrier). <li> Check that the shared memory tree header and client snapshot still contain the ids for which the lock was obtained. If not, drop the lock and go back to step 1. </ol> <p>To open a write transaction, the writer: <ol> <li> Opens a read transaction, if one is not already open. <li> Obtain the WRITER lock. <li> Check the "transaction in progress" flag. If it is set, perform the emergency rollback and freelist recovery, then clear the flag. <li> Check that copy 1 of the header still matches the copy read when the read transaction was opened. If not, drop the lock and return LSM_BUSY. <li> Set the "transaction in progress" flag. </ol> <p> Emergency rollback and recovery: <ol> <li> If the checksum of copy 1 of the header fails, replace it with the contents of copy 2. <li> Iterate through the entire tree, rolling back any nodes with transaction ids that indicate they require it. Record the blocks occupied by the current tree. <li> Scan through the entire *-shm memory file, inspecting the "first tree" fields of each chunk. </ol> <p> Large values or keys may overflow chunks. <h2>Client and worker snapshots</h2> <p> The client and worker snapshots stored in the *-shm file use the same format as the checkpoint written to the database file. Except, they are always in native byte order. Each is stored in a dedicated 4KB slot, as in the database file. A client must hold the WORKER lock to modify either of the two snapshots. <p> To work on the database file, a worker performs the following: <ol> <li> Obtain the WORKER lock. <li> Copies the worker snapshot from the shared-memory region into heap memory and verifies that the checksum computes. <li> If the checksum of the worker snapshot does not compute, copy the client snapshot over the top of the worker and reload it. If the checksum still does not compute, return LSM_CORRUPT. 
<li> Perform some merging work on the database. Generate a new worker snapshot. Write it over the top of the old. <li> Optionally, copy the new worker snapshot over the top of the client snapshot. TODO: Copying the worker snapshot into the client slot makes the worker read-only.... Currently, LSM distinguishes between read-only and read-write worker snapshots. But that would mean an extra flag in shared-memory. Perhaps it's better to consider all worker snapshots to be read-only. Or, change the format slightly to include a "read-write" flag that can be set for those snapshots not copied into the client slot. UPDATE: Current code already treats all worker snapshots as read-only. <li> Release the WORKER lock. </ol> <p> To checkpoint a snapshot: <ol> <li> Obtain the CHECKPOINTER lock. <li> Read the client snapshot. <li> Sync the database file. <li> Write the client snapshot into the appropriate meta-page (based on the "last checkpoint slot" field in the *-shm header). <li> Sync the database file. <li> Update the "last checkpoint slot" field. <li> Drop the CHECKPOINTER lock. </ol> |