/*
** Copyright (c) 2001 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public
** License as published by the Free Software Foundation; either
** version 2 of the License, or (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** General Public License for more details.
**
** You should have received a copy of the GNU General Public
** License along with this library; if not, write to the
** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
** Boston, MA 02111-1307, USA.
**
** Author contact information:
** drh@hwaci.com
** http://www.hwaci.com/drh/
**
*************************************************************************
** $Id: pg.c,v 1.1 2001/01/15 22:51:11 drh Exp $
*/
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "sqliteInt.h"
#include "pg.h"
/*
** Uncomment the following for a debug trace
*/
#if 1
# define TRACE(X) printf X; fflush(stdout);
#endif
#ifndef SQLITE_IOERR
# define SQLITE_IOERR SQLITE_ERROR
#endif
/*
** Hash table sizes
*/
#define J_HASH_SIZE 127 /* Size of the journal page hash table */
#define PG_HASH_SIZE 349 /* Size of the database page hash table */
/*
** Forward declaration of structure
*/
typedef struct Pghdr Pghdr;
/*
** All information about a single paging file is contained in an
** instance of the following structure.
*/
struct Pgr {
int fdMain; /* The main database file */
char *zMain; /* Name of the database file */
int fdJournal; /* The journal file */
char *zJournal; /* Name of the journal file */
int nMemPg; /* Number of memory-resident pages */
int nJPg; /* Number of pages in the journal */
int nDbPg; /* Number of pages in the database */
int nRefPg; /* Number of pages currently in use */
Pghdr *pLru, *pMru; /* Least and most recently used mem-page */
Pghdr *pJidx; /* List of journal index pages */
Pghdr *pAll; /* All pages, except journal index pages */
u32 aJHash[J_HASH_SIZE]; /* Journal page hash table */
Pghdr *aPgHash[PG_HASH_SIZE]; /* Mem-page hash table */
};
/*
** Each memory-resident page of the paging file has a header which
** is an instance of the following structure.
*/
struct Pghdr {
Pgr *p; /* Pointer back to the Pgr structure */
int nRef; /* Number of references to this page */
int isDirty; /* TRUE if needs to be written to disk */
u32 dbpgno; /* Page number in the database file */
u32 jpgno; /* Page number in the journal file */
Pghdr *pNx; /* Next page on a list of them all */
Pghdr *pLru; /* Less recently used pages */
Pghdr *pMru; /* More recently used pages */
Pghdr *pNxHash; /* Next with same dbpgno hash */
Pghdr *pPvHash; /* Previous with the same dbpgno hash */
};
/*
** For a memory-resident page, the page data comes immediately after
** the page header. The following macros can be used to change a
** pointer to a page header into a pointer to the data, or vice
** versa.
*/
#define PG_TO_DATA(X) ((void*)&(X)[1])
#define DATA_TO_PG(X) (&((Pghdr*)(X))[-1])
/*
** The number of in-memory pages that we accumulate before trying
** to reuse older pages when new ones are requested.
*/
#define MX_MEM_PAGE 100
/*
** The number of journal data pages that come between consecutive
** journal index pages.
*/
#define N_J_DATAPAGE (SQLITE_PAGE_SIZE/(2*sizeof(u32)))
/*
** An index page in the journal consists of an array of N_J_DATAPAGE
** of the following structures. There is one instance of the following
** structure for each of the N_J_DATAPAGE data pages that follow the
** index.
**
** Let the journal page number that a JidxEntry describes be J. Then
** the JidxEntry.dbpgno field is the page of the database file that
** corresponds to the J page in the journal. The JidxEntry.next_jpgno
** field hold the number of another journal page that contains
** a database file page with the same hash as JidxEntry.dbpgno.
**
** All information is written to the journal index in big-endian
** notation.
*/
typedef struct JidxEntry JidxEntry;
struct JidxEntry {
char dbpgno[sizeof(u32)]; /* Database page number for this entry */
char next_jpgno[sizeof(u32)]; /* Next entry with same hash on dbpgno */
};
/*
** Read a page from a file into memory. Return SQLITE_OK if successful.
** The "pgno" parameter tells where in the file to read the page.
** The first page is 1. Files do not contain a page 0 since a page
** number of 0 is used to indicate "no such page".
*/
static int sqlitePgRead(int fd, char *zBuf, u32 pgno){
int got = 0;
int amt;
assert( pgno>0 );
assert( fd>=0 );
lseek(fd, SEEK_SET, (pgno-1)*SQLITE_PAGE_SIZE);
while( got<SQLITE_PAGE_SIZE ){
amt = read(fd, &zBuf[got], SQLITE_PAGE_SIZE - got);
if( amt<=0 ){
memset(&zBuf[got], 0, SQLITE_PAGE_SIZE - got);
return amt==0 ? SQLITE_OK : SQLITE_IOERR;
}
got += amt;
}
return SQLITE_OK;
}
/*
** Read a page from a file into memory. Return SQLITE_OK if successful.
** The "pgno" parameter tells where in the file to write the page.
** The first page is 1. Files do not contain a page 0 since a page
** number of 0 is used to indicate "no such page".
*/
static int sqlitePgWrite(int fd, char *zBuf, u32 pgno){
int done = 0;
int amt;
assert( pgno>0 );
assert( fd>=0 );
lseek(fd, SEEK_SET, (pgno-1)*SQLITE_PAGE_SIZE);
while( done<SQLITE_PAGE_SIZE ){
amt = write(fd, &zBuf[done], SQLITE_PAGE_SIZE - done);
if( amt<=0 ) return SQLITE_IOERR;
done += amt;
}
return SQLITE_OK;
}
/*
** Turn four bytes into an integer. The first byte is always the
** most significant 8 bits.
*/
static u32 sqlitePgGetInt(const char *p){
return ((p[0]&0xff)<<24) | ((p[1]&0xff)<<16) | ((p[2]&0xff)<<8) | (p[3]&0xff);
}
/*
** Turn an integer into 4 bytes. The first byte is always the
** most significant 8 bits.
*/
static void sqlitePgPutInt(u32 v, char *p){
p[3] = v & 0xff;
v >>= 8;
p[2] = v & 0xff;
v >>= 8;
p[1] = v & 0xff;
v >>= 8;
p[0] = v & 0xff;
}
/*
** Check the hash table for an in-memory page. Return a pointer to
** the page header if found. Return NULL if the page is not in memory.
*/
static Pghdr *sqlitePgFind(Pgr *p, u32 pgno){
int h;
Pghdr *pPg;
if( pgno==0 ) return 0;
h = pgno % PG_HASH_SIZE;
for(pPg = p->aPgHash[h]; pPg; pPg=pPg->pNxHash){
if( pPg->dbpgno==pgno ) return pPg;
}
TRACE(("PG: data page %u is %#x\n", pgno, (u32)pPg));
return 0;
}
/*
** Locate and return an index page from the journal.
**
** The first page of a journal is the primary index. Additional
** index pages are called secondary indices. Index pages appear
** in the journal as often as needed. (If SQLITE_PAGE_SIZE==1024,
** then there are 1024/sizeof(int)*2 = 128 database between each
** pair of index pages.) Journal index pages are not hashed and
** do no appear on the Pgr.pAll list. Index pages are on the
** Pgr.pJidx list only. Index pages have Pghdr.dbpgno==0.
**
** If the requested index page is not already in memory, then a
** new memory page is created to hold the index.
**
** This routine will return a NULL pointer if we run out of memory.
*/
static Pghdr *sqlitePgFindJidx(Pgr *p, u32 pgno){
Pghdr *pPg;
assert( pgno % (N_J_DATAPAGE+1) == 1 );
for(pPg=p->pJidx; pPg; pPg=pPg->pNx){
if( pPg->jpgno==pgno ){
TRACE(("PG: found j-index %u at %#x\n", pgno, (u32)pPg));
return pPg;
}
}
pPg = sqliteMalloc( sizeof(Pghdr)+SQLITE_PAGE_SIZE );
if( pPg==0 ) return 0;
pPg->jpgno = pgno;
pPg->pNx = p->pJidx;
p->pJidx = pPg;
sqlitePgRead(p->fdJournal, PG_TO_DATA(pPg), pgno);
TRACE(("PG: create j-index %u at %#x\n", pgno, (u32)pPg));
return pPg;
}
/*
** Look in the journal to see if the given database page is stored
** in the journal. If it is, return its journal page number. If
** not, return 0.
*/
static u32 sqlitePgJournalPageNumber(Pgr *p, u32 dbpgno){
u32 jpgno;
assert( dbpgno>0 );
jpgno = p->aJHash[dbpgno % J_HASH_SIZE];
while( jpgno!=0 ){
int idx_num; /* Which journal index describes page jpgno */
int ipgno; /* Page number for the journal index */
int idx_slot; /* Which entry in index idx_num describes jpgno */
Pghdr *pIdxPg; /* The index page for jpgno */
JidxEntry *aIdx; /* The data for the index page */
idx_num = (jpgno - 1)/(N_J_DATAPAGE + 1);
idx_slot = (jpgno - 1) % (N_J_DATAPAGE + 1) - 2;
ipgno = idx_num * (N_J_DATAPAGE + 1) + 1;
if( ipgno>p->nJPg ){
jpgno = 0;
break;
}
pIdxPg = sqlitePgFindJidx(p, ipgno);
assert( pIdxPg!=0 );
aIdx = PG_TO_DATA(pIdxPg);
if( dbpgno==sqlitePgGetInt(aIdx[idx_slot].dbpgno) ){
break;
}
jpgno = sqlitePgGetInt(aIdx[idx_slot].next_jpgno);
}
return jpgno;
}
/*
** Make a page not dirty by writing it to the journal.
*/
static int sqlitePgMakeClean(Pghdr *pPg){
Pgr *p = pPg->p;
int rc;
assert( pPg->isDirty );
assert( p->fdJournal>=0 );
if( pPg->jpgno==0 ){
int jpgno; /* A newly allocate page in the journal */
int idx_num; /* Which journal index describes page jpgno */
int idx_slot; /* Which entry in index idx_num describes jpgno */
Pghdr *pIdxPg; /* The index page for jpgno */
JidxEntry *aIdx; /* The data for the index page */
int h; /* The hash value for pPg->dbpgno */
jpgno = p->nJPg + 1;
if( jpgno % (N_J_DATAPAGE + 1) == 1 ){
jpgno++;
}
idx_num = (jpgno - 1)/(N_J_DATAPAGE + 1);
idx_slot = (jpgno - 1) % (N_J_DATAPAGE + 1) - 2;
pIdxPg = sqlitePgFindJidx(p, idx_num * (N_J_DATAPAGE + 1) + 1);
assert( pIdxPg!=0 );
aIdx = PG_TO_DATA(pIdxPg);
sqlitePgPutInt(pPg->dbpgno, aIdx[idx_slot].dbpgno);
h = pPg->dbpgno % J_HASH_SIZE;
sqlitePgPutInt(p->aJHash[h], aIdx[idx_slot].next_jpgno);
p->aJHash[h] = jpgno;
p->nJPg = jpgno;
pPg->jpgno = jpgno;
TRACE(("PG: assign d-page %u to j-page %u\n", jpgno, pPg->dbpgno));
}
rc = sqlitePgWrite(p->fdJournal, PG_TO_DATA(pPg), pPg->jpgno);
if( rc==SQLITE_OK ){
pPg->isDirty = 0;
}
return rc;
}
/*
** Find the number of pages in the given file by measuring the size
** of the file. Return 0 if there is any problem.
*/
static int sqlitePgPageCount(int fd){
struct stat statbuf;
if( fstat(fd, &statbuf)!=0 ) return 0;
return statbuf.st_size/SQLITE_PAGE_SIZE;
}
/*
** This routine reads the journal and transfers pages from the
** journal to the database.
*/
static int sqlitePgJournalPlayback(Pgr *p){
Pghdr *pPg;
JidxEntry *aIdx;
int nJpg;
int jpgno = 1;
int i;
int dbpgno;
int rc;
char idx[SQLITE_PAGE_SIZE];
char pgbuf[SQLITE_PAGE_SIZE];
assert( p->fdJournal>=0 );
nJpg = sqlitePgPageCount(p->fdJournal);
while( jpgno<=nJpg ){
if( !sqlitePgRead(p->fdJournal, idx, jpgno++) ) break;
aIdx = (JidxEntry*)idx;
for(i=0; i<N_J_DATAPAGE; i++){
dbpgno = sqlitePgGetInt(&idx[i]);
if( dbpgno==0 ){
jpgno = nJpg+1;
break;
}
pPg = sqlitePgFind(p, dbpgno);
if( pPg ){
rc = sqlitePgWrite(p->fdMain, PG_TO_DATA(pPg), dbpgno);
TRACE(("PG: commit j-page %u to d-page %u from memory\n",jpgno,dbpgno));
}else{
rc = sqlitePgRead(p->fdJournal, pgbuf, jpgno);
if( rc!=SQLITE_OK ){
return rc;
}
rc = sqlitePgWrite(p->fdMain, pgbuf, dbpgno);
TRACE(("PG: commit j-page %u to d-page %u from disk\n",jpgno,dbpgno));
}
jpgno++;
if( rc!=SQLITE_OK ){
return rc;
}
}
}
TRACE(("PG: commit complete. deleting the journal.\n"));
fsync(p->fdMain);
close(p->fdJournal);
p->fdJournal = -1;
unlink(p->zJournal);
for(pPg=p->pAll; pPg; pPg=pPg->pNx){
pPg->isDirty = 0;
pPg->jpgno = 0;
}
while( (pPg = p->pJidx)!=0 ){
p->pAll = pPg->pNx;
sqliteFree(pPg);
}
return SQLITE_OK;
}
/*
** Remove the given page from the LRU list.
*/
static void sqlitePgUnlinkLru(Pghdr *pPg){
Pgr *p = pPg->p;
if( pPg->pLru ){
pPg->pLru->pMru = pPg->pLru;
}
if( pPg->pMru ){
pPg->pMru->pLru = pPg->pMru;
}
if( p->pLru==pPg ){
p->pLru = pPg->pLru;
}
if( p->pMru==pPg ){
p->pMru = pPg->pMru;
}
pPg->pLru = pPg->pMru = 0;
}
/*
** Open the database file and make *ppPgr pointer to a structure describing it.
** Return SQLITE_OK on success or an error code if there is a failure.
**
** If there was an unfinished commit, complete it before returnning.
*/
int sqlitePgOpen(const char *zFilename, Pgr **ppPgr){
Pgr *p;
int n;
n = strlen(zFilename);
p = sqliteMalloc( sizeof(*p) + n*2 + 4 );
if( p==0 ){
*ppPgr = 0;
return SQLITE_NOMEM;
}
p->zMain = (char*)&p[1];
strcpy(p->zMain, zFilename);
p->zJournal = &p->zMain[n+1];
strcpy(p->zJournal, p->zMain);
p->zJournal[n] = '~';
p->zJournal[n+1] = 0;
p->fdJournal = -1;
p->fdMain = open(p->zMain, O_CREAT|O_RDWR, 0600);
if( p->fdMain<0 ){
*ppPgr = 0;
sqliteFree(p);
return SQLITE_PERM;
}
p->nDbPg = sqlitePgPageCount(p->fdMain);
if( access(p->zJournal, R_OK)==0 ){
sqlitePgJournalPlayback(p);
}
*ppPgr = p;
return SQLITE_OK;
}
/*
** Close the database file. Any outstanding transactions are abandoned.
*/
int sqlitePgClose(Pgr *p){
Pghdr *pPg;
if( p->fdMain ) close(p->fdMain);
if( p->fdJournal ) close(p->fdJournal);
unlink(p->zJournal);
while( (pPg = p->pAll)!=0 ){
p->pAll = pPg->pNx;
sqliteFree(pPg);
}
while( (pPg = p->pJidx)!=0 ){
p->pAll = pPg->pNx;
sqliteFree(pPg);
}
sqliteFree(p);
return SQLITE_OK;
}
/*
** Begin a new transaction. Return SQLITE_OK on success or an error
** code if something goes wrong.
*/
int sqlitePgBeginTransaction(Pgr *p){
assert( p->fdJournal<0 );
if( p->nRefPg>0 ){
/* release the read lock */
}
/* write lock the database */
p->fdJournal = open(p->zJournal, O_CREAT|O_EXCL|O_RDWR, 0600);
if( p->fdJournal<0 ){
return SQLITE_PERM;
}
p->nJPg = 0;
TRACE(("PG: begin transaction\n"));
return SQLITE_OK;
}
/*
** Commit the current transaction. Return SQLITE_OK or an error code.
*/
int sqlitePgCommit(Pgr *p){
Pghdr *pPrimaryIdx = 0;
Pghdr *pPg;
int rc;
for(pPg=p->pAll; pPg; pPg=pPg->pNx){
if( pPg->isDirty ){
rc = sqlitePgMakeClean(pPg);
if( rc!=SQLITE_OK ){
return rc;
}
}
}
for(pPg=p->pJidx; pPg; pPg=pPg->pNx){
if( pPg->jpgno==1 ){
pPrimaryIdx = pPg;
}else{
TRACE(("PG: writing j-index %u\n", pPg->jpgno));
rc = sqlitePgMakeClean(pPg);
if( rc!=SQLITE_OK ){
return rc;
}
}
}
assert( pPrimaryIdx!=0 );
fsync(p->fdJournal);
TRACE(("PG: writing j-index %u\n", pPrimaryIdx->jpgno));
rc = sqlitePgMakeClean(pPrimaryIdx);
if( rc!=SQLITE_OK ){
return rc;
}
fsync(p->fdJournal);
rc = sqlitePgJournalPlayback(p);
if( rc!=SQLITE_OK ){
return rc;
}
/* remove write lock from database */
if( p->nRefPg>0 ){
/* acquire read lock on database */
}
return SQLITE_OK;
}
/*
** Abandon the current transaction.
*/
int sqlitePgRollback(Pgr *p){
Pghdr *pPg;
TRACE(("PG: begin rollback\n"));
for(pPg=p->pAll; pPg; pPg=pPg->pNx){
if( pPg->isDirty || pPg->jpgno!=0 ){
pPg->isDirty = 0;
pPg->jpgno = 0;
if( pPg->nRef>0 ){
TRACE(("PG: reloading d-page %u\n", pPg->dbpgno));
sqlitePgRead(p->fdMain, PG_TO_DATA(pPg), pPg->dbpgno);
}else{
sqlitePgUnlinkLru(pPg);
}
}
}
close(p->fdJournal);
p->fdJournal = -1;
unlink(p->zJournal);
while( (pPg = p->pJidx)!=0 ){
p->pAll = pPg->pNx;
sqliteFree(pPg);
}
p->nDbPg = sqlitePgPageCount(p->fdMain);
/* remove write lock from database */
if( p->nRefPg>0 ){
/* acquire read lock on database */
}
return SQLITE_OK;
}
/*
** Get a page from the database. Return a pointer to the data for that
** page.
**
** A NULL pointer will be returned if we run out of memory.
*/
int sqlitePgGet(Pgr *p, u32 pgno, void **ppData){
Pghdr *pPg;
int h;
pPg = sqlitePgFind(p, pgno);
if( pPg ){
pPg->nRef++;
if( pPg->nRef==1 ){
sqlitePgUnlinkLru(pPg);
TRACE(("PG: d-page %u pulled from cache\n", pgno));
}
p->nRefPg++;
if( p->nRefPg==1 ){
/* Acquire a read lock */
}
*ppData = PG_TO_DATA(pPg);
return SQLITE_OK;
}
if( p->nMemPg<MX_MEM_PAGE || p->pLru==0 ){
pPg = sqliteMalloc( sizeof(Pghdr) + SQLITE_PAGE_SIZE );
if( pPg==0 ) return SQLITE_NOMEM;
p->nMemPg++;
pPg->pNx = p->pAll;
p->pAll = pPg;
pPg->p = p;
TRACE(("PG: new page %d created.\n", p->nMemPg));
}else{
int rc;
pPg = p->pLru;
if( pPg->isDirty ){
rc = sqlitePgMakeClean(pPg);
if( rc!=SQLITE_OK ) return rc;
}
sqlitePgUnlinkLru(pPg);
h = pPg->dbpgno % PG_HASH_SIZE;
if( pPg->pPvHash ){
pPg->pPvHash->pNxHash = pPg->pNxHash;
}else{
assert( p->aPgHash[h]==pPg );
p->aPgHash[h] = pPg->pNxHash;
}
if( pPg->pNxHash ){
pPg->pNxHash->pPvHash = pPg->pPvHash;
}
TRACE(("PG: recycling d-page %u to d-page %u\n", pPg->dbpgno, pgno));
}
pPg->dbpgno = pgno;
if( pgno>p->nDbPg ){
p->nDbPg = pgno;
}
h = pgno % PG_HASH_SIZE;
pPg->pPvHash = 0;
pPg->pNxHash = p->aPgHash[h];
if( pPg->pNxHash ){
pPg->pNxHash->pPvHash = pPg;
}
p->aPgHash[h] = pPg;
pPg->jpgno = sqlitePgJournalPageNumber(p, pgno);
if( pPg->jpgno!=0 ){
TRACE(("PG: reading d-page %u content from j-page %u\n", pgno, pPg->jpgno));
sqlitePgRead(p->fdJournal, PG_TO_DATA(pPg), pPg->jpgno);
}else{
TRACE(("PG: reading d-page %u from database\n", pgno));
sqlitePgRead(p->fdMain, PG_TO_DATA(pPg), pPg->dbpgno);
}
pPg->isDirty = 0;
pPg->nRef = 1;
p->nRefPg++;
if( p->nRefPg==1 ){
/* Acquire a read lock */
}
*ppData = PG_TO_DATA(pPg);
return SQLITE_OK;
}
/*
** Release a reference to a database data page.
*/
int sqlitePgUnref(void *pData){
Pghdr *pPg = DATA_TO_PG(pData);
pPg->nRef--;
assert( pPg->nRef>=0 );
if( pPg->nRef==0 ){
Pgr *p = pPg->p;
pPg->pMru = 0;
pPg->pLru = p->pLru;
p->pLru = pPg;
TRACE(("PG: d-page %u is unused\n", pPg->dbpgno));
p->nRefPg--;
if( p->nRefPg==0 ){
/* Release the read lock */
}
}
return SQLITE_OK;
}
/*
** The database page in the argument has been modified. Write it back
** to the database file on the next commit.
*/
int sqlitePgTouch(void *pD){
Pghdr *pPg = DATA_TO_PG(pD);
assert( pPg->p->fdJournal>=0 );
if( pPg->isDirty==0 ){
pPg->isDirty = 1;
TRACE(("PG: d-page %u is dirty\n", pPg->dbpgno));
}
return SQLITE_OK;
}
/*
** Return the number of the first unused page at the end of the
** database file.
*/
int sqlitePgAlloc(Pgr *p, int *pPgno){
*pPgno = p->nDbPg;
return SQLITE_OK;
}