/* ** Copyright (c) 2001 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the GNU General Public ** License as published by the Free Software Foundation; either ** version 2 of the License, or (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ** General Public License for more details. ** ** You should have received a copy of the GNU General Public ** License along with this library; if not, write to the ** Free Software Foundation, Inc., 59 Temple Place - Suite 330, ** Boston, MA 02111-1307, USA. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ************************************************************************* ** $Id: pg.c,v 1.4 2001/01/25 01:45:41 drh Exp $ */ #include #include #include #include #include #include "sqliteInt.h" #include "pg.h" /* ** Uncomment the following for a debug trace */ #if 1 # define TRACE(X) printf X; fflush(stdout); #endif /* ** Hash table sizes */ #define J_HASH_SIZE 127 /* Size of the journal page hash table */ #define PG_HASH_SIZE 349 /* Size of the database page hash table */ /* ** Forward declaration of structure */ typedef struct Pghdr Pghdr; /* ** All information about a single paging file is contained in an ** instance of the following structure. */ struct Pgr { int fdMain; /* The main database file */ char *zMain; /* Name of the database file */ int fdJournal; /* The journal file */ char *zJournal; /* Name of the journal file */ int nMemPg; /* Number of memory-resident pages */ int nJPg; /* Number of pages in the journal */ int nDbPg; /* Number of pages in the database */ int nRefPg; /* Number of pages currently in use */ Pghdr *pLru, *pMru; /* Least and most recently used mem-page */ Pghdr *pJidx; /* List of journal index pages */ Pghdr *pAll; /* All pages, except journal index pages */ u32 aJHash[J_HASH_SIZE]; /* Journal page hash table */ Pghdr *aPgHash[PG_HASH_SIZE]; /* Mem-page hash table */ }; /* ** Each memory-resident page of the paging file has a header which ** is an instance of the following structure. */ struct Pghdr { Pgr *p; /* Pointer back to the Pgr structure */ int nRef; /* Number of references to this page */ int isDirty; /* TRUE if needs to be written to disk */ u32 dbpgno; /* Page number in the database file */ u32 jpgno; /* Page number in the journal file */ Pghdr *pNx; /* Next page on a list of them all */ Pghdr *pLru; /* Less recently used pages */ Pghdr *pMru; /* More recently used pages */ Pghdr *pNxHash; /* Next with same dbpgno hash */ Pghdr *pPvHash; /* Previous with the same dbpgno hash */ }; /* ** For a memory-resident page, the page data comes immediately after ** the page header. The following macros can be used to change a ** pointer to a page header into a pointer to the data, or vice ** versa. */ #define PG_TO_DATA(X) ((void*)&(X)[1]) #define DATA_TO_PG(X) (&((Pghdr*)(X))[-1]) /* ** The number of in-memory pages that we accumulate before trying ** to reuse older pages when new ones are requested. */ #define MX_MEM_PAGE 100 /* ** The number of journal data pages that come between consecutive ** journal index pages. */ #define N_J_DATAPAGE (SQLITE_PAGE_SIZE/(2*sizeof(u32))) /* ** An index page in the journal consists of an array of N_J_DATAPAGE ** of the following structures. There is one instance of the following ** structure for each of the N_J_DATAPAGE data pages that follow the ** index. ** ** Let the journal page number that a JidxEntry describes be J. Then ** the JidxEntry.dbpgno field is the page of the database file that ** corresponds to the J page in the journal. The JidxEntry.next_jpgno ** field hold the number of another journal page that contains ** a database file page with the same hash as JidxEntry.dbpgno. ** ** All information is written to the journal index in big-endian ** notation. */ typedef struct JidxEntry JidxEntry; struct JidxEntry { char dbpgno[sizeof(u32)]; /* Database page number for this entry */ char next_jpgno[sizeof(u32)]; /* Next entry with same hash on dbpgno */ }; /* ** Read a page from a file into memory. Return SQLITE_OK if successful. ** The "pgno" parameter tells where in the file to read the page. ** The first page is 1. Files do not contain a page 0 since a page ** number of 0 is used to indicate "no such page". */ static int sqlitePgRead(int fd, char *zBuf, u32 pgno){ int got = 0; int amt; assert( pgno>0 ); assert( fd>=0 ); lseek(fd, SEEK_SET, (pgno-1)*SQLITE_PAGE_SIZE); while( got0 ); assert( fd>=0 ); lseek(fd, SEEK_SET, (pgno-1)*SQLITE_PAGE_SIZE); while( done>= 8; p[2] = v & 0xff; v >>= 8; p[1] = v & 0xff; v >>= 8; p[0] = v & 0xff; } /* ** Check the hash table for an in-memory page. Return a pointer to ** the page header if found. Return NULL if the page is not in memory. */ static Pghdr *sqlitePgFind(Pgr *p, u32 pgno){ int h; Pghdr *pPg; if( pgno==0 ) return 0; h = pgno % PG_HASH_SIZE; for(pPg = p->aPgHash[h]; pPg; pPg=pPg->pNxHash){ if( pPg->dbpgno==pgno ) return pPg; } TRACE(("PG: data page %u is %#x\n", pgno, (u32)pPg)); return 0; } /* ** Locate and return an index page from the journal. ** ** The first page of a journal is the primary index. Additional ** index pages are called secondary indices. Index pages appear ** in the journal as often as needed. (If SQLITE_PAGE_SIZE==1024, ** then there are 1024/sizeof(int)*2 = 128 database between each ** pair of index pages.) Journal index pages are not hashed and ** do no appear on the Pgr.pAll list. Index pages are on the ** Pgr.pJidx list only. Index pages have Pghdr.dbpgno==0. ** ** If the requested index page is not already in memory, then a ** new memory page is created to hold the index. ** ** This routine will return a NULL pointer if we run out of memory. */ static Pghdr *sqlitePgFindJidx(Pgr *p, u32 pgno){ Pghdr *pPg; assert( pgno % (N_J_DATAPAGE+1) == 1 ); for(pPg=p->pJidx; pPg; pPg=pPg->pNx){ if( pPg->jpgno==pgno ){ TRACE(("PG: found j-index %u at %#x\n", pgno, (u32)pPg)); return pPg; } } pPg = sqliteMalloc( sizeof(Pghdr)+SQLITE_PAGE_SIZE ); if( pPg==0 ) return 0; pPg->jpgno = pgno; pPg->pNx = p->pJidx; p->pJidx = pPg; sqlitePgRead(p->fdJournal, PG_TO_DATA(pPg), pgno); TRACE(("PG: create j-index %u at %#x\n", pgno, (u32)pPg)); return pPg; } /* ** Look in the journal to see if the given database page is stored ** in the journal. If it is, return its journal page number. If ** not, return 0. */ static u32 sqlitePgJournalPageNumber(Pgr *p, u32 dbpgno){ u32 jpgno; if( dbpgno==0 ) return 0; jpgno = p->aJHash[dbpgno % J_HASH_SIZE]; while( jpgno!=0 ){ int idx_num; /* Which journal index describes page jpgno */ int ipgno; /* Page number for the journal index */ int idx_slot; /* Which entry in index idx_num describes jpgno */ Pghdr *pIdxPg; /* The index page for jpgno */ JidxEntry *aIdx; /* The data for the index page */ idx_num = (jpgno - 1)/(N_J_DATAPAGE + 1); idx_slot = (jpgno - 1) % (N_J_DATAPAGE + 1) - 2; ipgno = idx_num * (N_J_DATAPAGE + 1) + 1; if( ipgno>p->nJPg ){ jpgno = 0; break; } pIdxPg = sqlitePgFindJidx(p, ipgno); assert( pIdxPg!=0 ); aIdx = PG_TO_DATA(pIdxPg); if( dbpgno==sqlitePgGetInt(aIdx[idx_slot].dbpgno) ){ break; } jpgno = sqlitePgGetInt(aIdx[idx_slot].next_jpgno); } return jpgno; } /* ** Make a page not dirty by writing it to the journal. */ static int sqlitePgMakeClean(Pghdr *pPg){ Pgr *p = pPg->p; int rc; assert( pPg->isDirty ); assert( p->fdJournal>=0 ); if( pPg->jpgno==0 ){ int jpgno; /* A newly allocate page in the journal */ int idx_num; /* Which journal index describes page jpgno */ int idx_slot; /* Which entry in index idx_num describes jpgno */ Pghdr *pIdxPg; /* The index page for jpgno */ JidxEntry *aIdx; /* The data for the index page */ int h; /* The hash value for pPg->dbpgno */ jpgno = p->nJPg + 1; if( jpgno % (N_J_DATAPAGE + 1) == 1 ){ jpgno++; } idx_num = (jpgno - 1)/(N_J_DATAPAGE + 1); idx_slot = (jpgno - 1) % (N_J_DATAPAGE + 1) - 2; pIdxPg = sqlitePgFindJidx(p, idx_num * (N_J_DATAPAGE + 1) + 1); assert( pIdxPg!=0 ); aIdx = PG_TO_DATA(pIdxPg); sqlitePgPutInt(pPg->dbpgno, aIdx[idx_slot].dbpgno); h = pPg->dbpgno % J_HASH_SIZE; sqlitePgPutInt(p->aJHash[h], aIdx[idx_slot].next_jpgno); p->aJHash[h] = jpgno; p->nJPg = jpgno; pPg->jpgno = jpgno; TRACE(("PG: assign d-page %u to j-page %u\n", jpgno, pPg->dbpgno)); } rc = sqlitePgWrite(p->fdJournal, PG_TO_DATA(pPg), pPg->jpgno); if( rc==SQLITE_OK ){ pPg->isDirty = 0; } return rc; } /* ** Find the number of pages in the given file by measuring the size ** of the file. Return 0 if there is any problem. */ static int sqlitePgPageCount(int fd){ struct stat statbuf; if( fstat(fd, &statbuf)!=0 ) return 0; return statbuf.st_size/SQLITE_PAGE_SIZE; } /* ** This routine reads the journal and transfers pages from the ** journal to the database. */ static int sqlitePgJournalPlayback(Pgr *p){ Pghdr *pPg; JidxEntry *aIdx; int nJpg; int jpgno = 1; int i; int dbpgno; int rc; char idx[SQLITE_PAGE_SIZE]; char pgbuf[SQLITE_PAGE_SIZE]; assert( p->fdJournal>=0 ); nJpg = sqlitePgPageCount(p->fdJournal); while( jpgno<=nJpg ){ if( !sqlitePgRead(p->fdJournal, idx, jpgno++) ) break; aIdx = (JidxEntry*)idx; for(i=0; ifdMain, PG_TO_DATA(pPg), dbpgno); TRACE(("PG: commit j-page %u to d-page %u from memory\n",jpgno,dbpgno)); }else{ rc = sqlitePgRead(p->fdJournal, pgbuf, jpgno); if( rc!=SQLITE_OK ){ return rc; } rc = sqlitePgWrite(p->fdMain, pgbuf, dbpgno); TRACE(("PG: commit j-page %u to d-page %u from disk\n",jpgno,dbpgno)); } jpgno++; if( rc!=SQLITE_OK ){ return rc; } } } TRACE(("PG: commit complete. deleting the journal.\n")); fsync(p->fdMain); close(p->fdJournal); p->fdJournal = -1; unlink(p->zJournal); for(pPg=p->pAll; pPg; pPg=pPg->pNx){ pPg->isDirty = 0; pPg->jpgno = 0; } while( (pPg = p->pJidx)!=0 ){ p->pAll = pPg->pNx; sqliteFree(pPg); } return SQLITE_OK; } /* ** Remove the given page from the LRU list. */ static void sqlitePgUnlinkLru(Pghdr *pPg){ Pgr *p = pPg->p; if( pPg->pLru ){ pPg->pLru->pMru = pPg->pLru; } if( pPg->pMru ){ pPg->pMru->pLru = pPg->pMru; } if( p->pLru==pPg ){ p->pLru = pPg->pLru; } if( p->pMru==pPg ){ p->pMru = pPg->pMru; } pPg->pLru = pPg->pMru = 0; } /* ** Open the database file and make *ppPgr pointer to a structure describing it. ** Return SQLITE_OK on success or an error code if there is a failure. ** ** If there was an unfinished commit, complete it before returnning. */ int sqlitePgOpen(const char *zFilename, Pgr **ppPgr){ Pgr *p; int n; n = strlen(zFilename); p = sqliteMalloc( sizeof(*p) + n*2 + 4 ); if( p==0 ){ *ppPgr = 0; return SQLITE_NOMEM; } p->zMain = (char*)&p[1]; strcpy(p->zMain, zFilename); p->zJournal = &p->zMain[n+1]; strcpy(p->zJournal, p->zMain); p->zJournal[n] = '~'; p->zJournal[n+1] = 0; p->fdJournal = -1; p->fdMain = open(p->zMain, O_CREAT|O_RDWR, 0600); if( p->fdMain<0 ){ *ppPgr = 0; sqliteFree(p); return SQLITE_PERM; } p->nDbPg = sqlitePgPageCount(p->fdMain); if( access(p->zJournal, R_OK)==0 ){ sqlitePgJournalPlayback(p); } *ppPgr = p; return SQLITE_OK; } /* ** Close the database file. Any outstanding transactions are abandoned. */ int sqlitePgClose(Pgr *p){ Pghdr *pPg; if( p->fdMain ) close(p->fdMain); if( p->fdJournal ) close(p->fdJournal); unlink(p->zJournal); while( (pPg = p->pAll)!=0 ){ p->pAll = pPg->pNx; sqliteFree(pPg); } while( (pPg = p->pJidx)!=0 ){ p->pAll = pPg->pNx; sqliteFree(pPg); } sqliteFree(p); return SQLITE_OK; } /* ** Begin a new transaction. Return SQLITE_OK on success or an error ** code if something goes wrong. */ int sqlitePgBeginTransaction(Pgr *p){ assert( p->fdJournal<0 ); if( p->nRefPg>0 ){ /* release the read lock */ } /* write lock the database */ p->fdJournal = open(p->zJournal, O_CREAT|O_EXCL|O_RDWR, 0600); if( p->fdJournal<0 ){ return SQLITE_PERM; } p->nJPg = 0; TRACE(("PG: begin transaction\n")); return SQLITE_OK; } /* ** Commit the current transaction. Return SQLITE_OK or an error code. */ int sqlitePgCommit(Pgr *p){ Pghdr *pPrimaryIdx = 0; Pghdr *pPg; int rc; for(pPg=p->pAll; pPg; pPg=pPg->pNx){ if( pPg->isDirty ){ rc = sqlitePgMakeClean(pPg); if( rc!=SQLITE_OK ){ return rc; } } } for(pPg=p->pJidx; pPg; pPg=pPg->pNx){ if( pPg->jpgno==1 ){ pPrimaryIdx = pPg; }else{ TRACE(("PG: writing j-index %u\n", pPg->jpgno)); rc = sqlitePgMakeClean(pPg); if( rc!=SQLITE_OK ){ return rc; } } } assert( pPrimaryIdx!=0 ); fsync(p->fdJournal); TRACE(("PG: writing j-index %u\n", pPrimaryIdx->jpgno)); rc = sqlitePgMakeClean(pPrimaryIdx); if( rc!=SQLITE_OK ){ return rc; } fsync(p->fdJournal); rc = sqlitePgJournalPlayback(p); if( rc!=SQLITE_OK ){ return rc; } /* remove write lock from database */ if( p->nRefPg>0 ){ /* acquire read lock on database */ } return SQLITE_OK; } /* ** Abandon the current transaction. */ int sqlitePgRollback(Pgr *p){ Pghdr *pPg; TRACE(("PG: begin rollback\n")); for(pPg=p->pAll; pPg; pPg=pPg->pNx){ if( pPg->isDirty || pPg->jpgno!=0 ){ pPg->isDirty = 0; pPg->jpgno = 0; if( pPg->nRef>0 ){ TRACE(("PG: reloading d-page %u\n", pPg->dbpgno)); sqlitePgRead(p->fdMain, PG_TO_DATA(pPg), pPg->dbpgno); }else{ sqlitePgUnlinkLru(pPg); } } } close(p->fdJournal); p->fdJournal = -1; unlink(p->zJournal); while( (pPg = p->pJidx)!=0 ){ p->pAll = pPg->pNx; sqliteFree(pPg); } p->nDbPg = sqlitePgPageCount(p->fdMain); /* remove write lock from database */ if( p->nRefPg>0 ){ /* acquire read lock on database */ } return SQLITE_OK; } /* ** Get a page from the database. Return a pointer to the data for that ** page. ** ** A NULL pointer will be returned if we run out of memory. */ int sqlitePgGet(Pgr *p, u32 pgno, void **ppData){ Pghdr *pPg; int h; pPg = sqlitePgFind(p, pgno); if( pPg ){ pPg->nRef++; if( pPg->nRef==1 ){ sqlitePgUnlinkLru(pPg); TRACE(("PG: d-page %u pulled from cache\n", pgno)); } p->nRefPg++; if( p->nRefPg==1 ){ /* Acquire a read lock */ } *ppData = PG_TO_DATA(pPg); return SQLITE_OK; } if( p->nMemPgpLru==0 ){ pPg = sqliteMalloc( sizeof(Pghdr) + SQLITE_PAGE_SIZE ); if( pPg==0 ) return SQLITE_NOMEM; p->nMemPg++; pPg->pNx = p->pAll; p->pAll = pPg; pPg->p = p; TRACE(("PG: new page %d created.\n", p->nMemPg)); }else{ int rc; pPg = p->pLru; if( pPg->isDirty ){ rc = sqlitePgMakeClean(pPg); if( rc!=SQLITE_OK ) return rc; } sqlitePgUnlinkLru(pPg); h = pPg->dbpgno % PG_HASH_SIZE; if( pPg->pPvHash ){ pPg->pPvHash->pNxHash = pPg->pNxHash; }else{ assert( p->aPgHash[h]==pPg ); p->aPgHash[h] = pPg->pNxHash; } if( pPg->pNxHash ){ pPg->pNxHash->pPvHash = pPg->pPvHash; } TRACE(("PG: recycling d-page %u to d-page %u\n", pPg->dbpgno, pgno)); } pPg->dbpgno = pgno; if( pgno>p->nDbPg ){ p->nDbPg = pgno; } h = pgno % PG_HASH_SIZE; pPg->pPvHash = 0; pPg->pNxHash = p->aPgHash[h]; if( pPg->pNxHash ){ pPg->pNxHash->pPvHash = pPg; } p->aPgHash[h] = pPg; pPg->jpgno = sqlitePgJournalPageNumber(p, pgno); if( pPg->jpgno!=0 ){ TRACE(("PG: reading d-page %u content from j-page %u\n", pgno, pPg->jpgno)); sqlitePgRead(p->fdJournal, PG_TO_DATA(pPg), pPg->jpgno); }else if( pPg->dbpgno!=0 ){ TRACE(("PG: reading d-page %u from database\n", pgno)); sqlitePgRead(p->fdMain, PG_TO_DATA(pPg), pPg->dbpgno); }else{ TRACE(("PG: reading zero page\n"); memset(PG_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE); } pPg->isDirty = 0; pPg->nRef = 1; p->nRefPg++; if( p->nRefPg==1 ){ /* Acquire a read lock */ } *ppData = PG_TO_DATA(pPg); return SQLITE_OK; } /* ** Release a reference to a database data page. */ int sqlitePgUnref(void *pData){ Pghdr *pPg = DATA_TO_PG(pData); pPg->nRef--; assert( pPg->nRef>=0 ); if( pPg->nRef==0 ){ Pgr *p = pPg->p; pPg->pMru = 0; pPg->pLru = p->pLru; p->pLru = pPg; TRACE(("PG: d-page %u is unused\n", pPg->dbpgno)); p->nRefPg--; if( p->nRefPg==0 ){ /* Release the read lock */ } } return SQLITE_OK; } /* ** The database page in the argument has been modified. Write it back ** to the database file on the next commit. */ int sqlitePgTouch(void *pD){ Pghdr *pPg = DATA_TO_PG(pD); assert( pPg->p->fdJournal>=0 ); if( pPg->isDirty==0 ){ pPg->isDirty = 1; TRACE(("PG: d-page %u is dirty\n", pPg->dbpgno)); } return SQLITE_OK; } /* ** Return the number of the first unused page at the end of the ** database file. */ int sqlitePgCount(Pgr *p, u32 *pPgno){ *pPgno = p->nDbPg; return SQLITE_OK; } /* ** Return the page number associated with the given page. */ u32 sqlitePgNum(void *pD){ Pghdr *pPg = DATA_TO_PG(pD); return pPg->dbpgno; }