SQLite

Artifact [eb34521ec1]
Login

Artifact eb34521ec18454941b0823c6ce5c0e3c9394e733:


/*
** Copyright (c) 2001 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public
** License as published by the Free Software Foundation; either
** version 2 of the License, or (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
** General Public License for more details.
** 
** You should have received a copy of the GNU General Public
** License along with this library; if not, write to the
** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
** Boston, MA  02111-1307, USA.
**
** Author contact information:
**   drh@hwaci.com
**   http://www.hwaci.com/drh/
**
*************************************************************************
** $Id: pg.c,v 1.1 2001/01/15 22:51:11 drh Exp $
*/
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "sqliteInt.h"
#include "pg.h"

/*
** Uncomment the following for a debug trace
*/
#if 1
# define TRACE(X)  printf X; fflush(stdout);
#endif  

#ifndef SQLITE_IOERR
# define SQLITE_IOERR SQLITE_ERROR
#endif

/*
** Hash table sizes
*/
#define J_HASH_SIZE  127  /* Size of the journal page hash table */
#define PG_HASH_SIZE 349  /* Size of the database page hash table */

/*
** Forward declaration of structure
*/
typedef struct Pghdr Pghdr;

/*
** All information about a single paging file is contained in an
** instance of the following structure.
*/
struct Pgr {
  int fdMain;                    /* The main database file */
  char *zMain;                   /* Name of the database file */
  int fdJournal;                 /* The journal file */
  char *zJournal;                /* Name of the journal file */
  int nMemPg;                    /* Number of memory-resident pages */
  int nJPg;                      /* Number of pages in the journal */
  int nDbPg;                     /* Number of pages in the database */
  int nRefPg;                    /* Number of pages currently in use */
  Pghdr *pLru, *pMru;            /* Least and most recently used mem-page */
  Pghdr *pJidx;                  /* List of journal index pages */
  Pghdr *pAll;                   /* All pages, except journal index pages */
  u32 aJHash[J_HASH_SIZE];       /* Journal page hash table */
  Pghdr *aPgHash[PG_HASH_SIZE];  /* Mem-page hash table */
};

/*
** Each memory-resident page of the paging file has a header which
** is an instance of the following structure.
*/
struct Pghdr {
  Pgr *p;            /* Pointer back to the Pgr structure */
  int nRef;          /* Number of references to this page */
  int isDirty;       /* TRUE if needs to be written to disk */
  u32 dbpgno;        /* Page number in the database file */
  u32 jpgno;         /* Page number in the journal file */
  Pghdr *pNx;        /* Next page on a list of them all */
  Pghdr *pLru;       /* Less recently used pages */
  Pghdr *pMru;       /* More recently used pages */
  Pghdr *pNxHash;    /* Next with same dbpgno hash */
  Pghdr *pPvHash;    /* Previous with the same dbpgno hash */
};

/*
** For a memory-resident page, the page data comes immediately after
** the page header.  The following macros can be used to change a 
** pointer to a page header into a pointer to the data, or vice
** versa.
*/
#define PG_TO_DATA(X)  ((void*)&(X)[1])
#define DATA_TO_PG(X)  (&((Pghdr*)(X))[-1])

/*
** The number of in-memory pages that we accumulate before trying
** to reuse older pages when new ones are requested.
*/
#define MX_MEM_PAGE  100

/*
** The number of journal data pages that come between consecutive 
** journal index pages.
*/
#define N_J_DATAPAGE  (SQLITE_PAGE_SIZE/(2*sizeof(u32)))

/*
** An index page in the journal consists of an array of N_J_DATAPAGE
** of the following structures.  There is one instance of the following
** structure for each of the N_J_DATAPAGE data pages that follow the
** index.
**
** Let the journal page number that a JidxEntry describes be J.  Then
** the JidxEntry.dbpgno field is the page of the database file that
** corresponds to the J page in the journal.  The JidxEntry.next_jpgno
** field hold the number of another journal page that contains
** a database file page with the same hash as JidxEntry.dbpgno.
**
** All information is written to the journal index in big-endian
** notation.
*/
typedef struct JidxEntry JidxEntry;
struct JidxEntry {
  char dbpgno[sizeof(u32)];        /* Database page number for this entry */
  char next_jpgno[sizeof(u32)];    /* Next entry with same hash on dbpgno */
};

/*
** Read a page from a file into memory.  Return SQLITE_OK if successful.
** The "pgno" parameter tells where in the file to read the page.
** The first page is 1.  Files do not contain a page 0 since a page
** number of 0 is used to indicate "no such page".
*/
static int sqlitePgRead(int fd, char *zBuf, u32 pgno){
  int got = 0;
  int amt;

  assert( pgno>0 );
  assert( fd>=0 );
  lseek(fd, SEEK_SET, (pgno-1)*SQLITE_PAGE_SIZE);
  while( got<SQLITE_PAGE_SIZE ){
    amt = read(fd, &zBuf[got], SQLITE_PAGE_SIZE - got);
    if( amt<=0 ){
      memset(&zBuf[got], 0, SQLITE_PAGE_SIZE - got);
      return amt==0 ? SQLITE_OK : SQLITE_IOERR;
    }
    got += amt;
  }
  return SQLITE_OK;
}

/*
** Read a page from a file into memory.  Return SQLITE_OK if successful.
** The "pgno" parameter tells where in the file to write the page.
** The first page is 1.  Files do not contain a page 0 since a page
** number of 0 is used to indicate "no such page".
*/
static int sqlitePgWrite(int fd, char *zBuf, u32 pgno){
  int done = 0;
  int amt;

  assert( pgno>0 );
  assert( fd>=0 );
  lseek(fd, SEEK_SET, (pgno-1)*SQLITE_PAGE_SIZE);
  while( done<SQLITE_PAGE_SIZE ){
    amt = write(fd, &zBuf[done], SQLITE_PAGE_SIZE - done);
    if( amt<=0 ) return SQLITE_IOERR;
    done += amt;
  }
  return SQLITE_OK;
}

/*
** Turn four bytes into an integer.  The first byte is always the
** most significant 8 bits.
*/
static u32 sqlitePgGetInt(const char *p){
  return ((p[0]&0xff)<<24) | ((p[1]&0xff)<<16) | ((p[2]&0xff)<<8) | (p[3]&0xff);
}

/*
** Turn an integer into 4 bytes.  The first byte is always the
** most significant 8 bits.
*/
static void sqlitePgPutInt(u32 v, char *p){
  p[3] = v & 0xff;
  v >>= 8;
  p[2] = v & 0xff;
  v >>= 8;
  p[1] = v & 0xff;
  v >>= 8;
  p[0] = v & 0xff;
}

/*
** Check the hash table for an in-memory page.  Return a pointer to
** the page header if found.  Return NULL if the page is not in memory.
*/
static Pghdr *sqlitePgFind(Pgr *p, u32 pgno){
  int h;
  Pghdr *pPg;

  if( pgno==0 ) return 0;
  h = pgno % PG_HASH_SIZE;
  for(pPg = p->aPgHash[h]; pPg; pPg=pPg->pNxHash){
    if( pPg->dbpgno==pgno ) return pPg;
  }
  TRACE(("PG: data page %u is %#x\n", pgno, (u32)pPg));
  return 0;
}

/*
** Locate and return an index page from the journal.
**
** The first page of a journal is the primary index.  Additional
** index pages are called secondary indices.  Index pages appear
** in the journal as often as needed.  (If SQLITE_PAGE_SIZE==1024,
** then there are 1024/sizeof(int)*2 = 128 database between each
** pair of index pages.)  Journal index pages are not hashed and
** do no appear on the Pgr.pAll list.  Index pages are on the
** Pgr.pJidx list only.  Index pages have Pghdr.dbpgno==0.
**
** If the requested index page is not already in memory, then a
** new memory page is created to hold the index.
**
** This routine will return a NULL pointer if we run out of memory.
*/
static Pghdr *sqlitePgFindJidx(Pgr *p, u32 pgno){
  Pghdr *pPg;

  assert( pgno % (N_J_DATAPAGE+1) == 1 );
  for(pPg=p->pJidx; pPg; pPg=pPg->pNx){
    if( pPg->jpgno==pgno ){
      TRACE(("PG: found j-index %u at %#x\n", pgno, (u32)pPg));
      return pPg;
    }
  }
  pPg = sqliteMalloc( sizeof(Pghdr)+SQLITE_PAGE_SIZE );
  if( pPg==0 ) return 0;
  pPg->jpgno = pgno;
  pPg->pNx = p->pJidx;
  p->pJidx = pPg;
  sqlitePgRead(p->fdJournal, PG_TO_DATA(pPg), pgno);
  TRACE(("PG: create j-index %u at %#x\n", pgno, (u32)pPg));
  return pPg;
}

/*
** Look in the journal to see if the given database page is stored
** in the journal.  If it is, return its journal page number.  If
** not, return 0.
*/
static u32 sqlitePgJournalPageNumber(Pgr *p, u32 dbpgno){
  u32 jpgno;
  
  assert( dbpgno>0 );
  jpgno = p->aJHash[dbpgno % J_HASH_SIZE];
  while( jpgno!=0 ){
    int idx_num;     /* Which journal index describes page jpgno */
    int ipgno;       /* Page number for the journal index */
    int idx_slot;    /* Which entry in index idx_num describes jpgno */
    Pghdr *pIdxPg;   /* The index page for jpgno */
    JidxEntry *aIdx; /* The data for the index page */

    idx_num = (jpgno - 1)/(N_J_DATAPAGE + 1);
    idx_slot = (jpgno - 1) % (N_J_DATAPAGE + 1) - 2;
    ipgno = idx_num * (N_J_DATAPAGE + 1) + 1;
    if( ipgno>p->nJPg ){
      jpgno = 0;
      break;
    }
    pIdxPg = sqlitePgFindJidx(p, ipgno);
    assert( pIdxPg!=0 );
    aIdx = PG_TO_DATA(pIdxPg);
    if( dbpgno==sqlitePgGetInt(aIdx[idx_slot].dbpgno) ){
      break;
    }
    jpgno = sqlitePgGetInt(aIdx[idx_slot].next_jpgno);
  }
  return jpgno;
}

/*
** Make a page not dirty by writing it to the journal.
*/
static int sqlitePgMakeClean(Pghdr *pPg){
  Pgr *p = pPg->p;
  int rc;

  assert( pPg->isDirty );
  assert( p->fdJournal>=0 );
  if( pPg->jpgno==0 ){
    int jpgno;       /* A newly allocate page in the journal */
    int idx_num;     /* Which journal index describes page jpgno */
    int idx_slot;    /* Which entry in index idx_num describes jpgno */
    Pghdr *pIdxPg;   /* The index page for jpgno */
    JidxEntry *aIdx; /* The data for the index page */
    int h;           /* The hash value for pPg->dbpgno */

    jpgno = p->nJPg + 1;
    if( jpgno % (N_J_DATAPAGE + 1) == 1 ){
      jpgno++;
    }
    idx_num = (jpgno - 1)/(N_J_DATAPAGE + 1);
    idx_slot = (jpgno - 1) % (N_J_DATAPAGE + 1) - 2;
    pIdxPg = sqlitePgFindJidx(p, idx_num * (N_J_DATAPAGE + 1) + 1);
    assert( pIdxPg!=0 );
    aIdx = PG_TO_DATA(pIdxPg);
    sqlitePgPutInt(pPg->dbpgno, aIdx[idx_slot].dbpgno);
    h = pPg->dbpgno % J_HASH_SIZE;
    sqlitePgPutInt(p->aJHash[h], aIdx[idx_slot].next_jpgno);
    p->aJHash[h] = jpgno;
    p->nJPg = jpgno;
    pPg->jpgno = jpgno;
    TRACE(("PG: assign d-page %u to j-page %u\n", jpgno, pPg->dbpgno));
  }
  rc = sqlitePgWrite(p->fdJournal, PG_TO_DATA(pPg), pPg->jpgno);
  if( rc==SQLITE_OK ){
    pPg->isDirty = 0;
  }
  return rc;
}

/*
** Find the number of pages in the given file by measuring the size
** of the file.  Return 0 if there is any problem.
*/
static int sqlitePgPageCount(int fd){
  struct stat statbuf;
  if( fstat(fd, &statbuf)!=0 ) return 0;
  return statbuf.st_size/SQLITE_PAGE_SIZE;
}

/*
** This routine reads the journal and transfers pages from the
** journal to the database.
*/
static int sqlitePgJournalPlayback(Pgr *p){
  Pghdr *pPg;
  JidxEntry *aIdx;
  int nJpg;
  int jpgno = 1;
  int i;
  int dbpgno;
  int rc;
  char idx[SQLITE_PAGE_SIZE];
  char pgbuf[SQLITE_PAGE_SIZE];
  
  assert( p->fdJournal>=0 );
  nJpg = sqlitePgPageCount(p->fdJournal);
  while( jpgno<=nJpg ){
    if( !sqlitePgRead(p->fdJournal, idx, jpgno++) ) break;
    aIdx = (JidxEntry*)idx;
    for(i=0; i<N_J_DATAPAGE; i++){
      dbpgno = sqlitePgGetInt(&idx[i]);
      if( dbpgno==0 ){
        jpgno = nJpg+1;
        break;
      }
      pPg = sqlitePgFind(p, dbpgno);
      if( pPg ){
        rc = sqlitePgWrite(p->fdMain, PG_TO_DATA(pPg), dbpgno);
        TRACE(("PG: commit j-page %u to d-page %u from memory\n",jpgno,dbpgno));
      }else{
        rc = sqlitePgRead(p->fdJournal, pgbuf, jpgno);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        rc = sqlitePgWrite(p->fdMain, pgbuf, dbpgno);
        TRACE(("PG: commit j-page %u to d-page %u from disk\n",jpgno,dbpgno));
      }
      jpgno++;
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }
  TRACE(("PG: commit complete. deleting the journal.\n"));
  fsync(p->fdMain);
  close(p->fdJournal);
  p->fdJournal = -1;
  unlink(p->zJournal);
  for(pPg=p->pAll; pPg; pPg=pPg->pNx){
    pPg->isDirty = 0;
    pPg->jpgno = 0;
  }
  while( (pPg = p->pJidx)!=0 ){
    p->pAll = pPg->pNx;
    sqliteFree(pPg);
  }
  return SQLITE_OK;
}

/*
** Remove the given page from the LRU list.
*/
static void sqlitePgUnlinkLru(Pghdr *pPg){
  Pgr *p = pPg->p;
  if( pPg->pLru ){
    pPg->pLru->pMru = pPg->pLru;
  }
  if( pPg->pMru ){
    pPg->pMru->pLru = pPg->pMru;
  }
  if( p->pLru==pPg ){
    p->pLru = pPg->pLru;
  }
  if( p->pMru==pPg ){
    p->pMru = pPg->pMru;
  }
  pPg->pLru = pPg->pMru = 0;
}

/*
** Open the database file and make *ppPgr pointer to a structure describing it.
** Return SQLITE_OK on success or an error code if there is a failure.
**
** If there was an unfinished commit, complete it before returnning.
*/
int sqlitePgOpen(const char *zFilename, Pgr **ppPgr){
  Pgr *p;
  int n;

  n = strlen(zFilename);
  p = sqliteMalloc( sizeof(*p) + n*2 + 4 );
  if( p==0 ){
    *ppPgr = 0;
    return SQLITE_NOMEM;
  }
  p->zMain = (char*)&p[1];
  strcpy(p->zMain, zFilename);
  p->zJournal = &p->zMain[n+1];
  strcpy(p->zJournal, p->zMain);
  p->zJournal[n] = '~';
  p->zJournal[n+1] = 0;
  p->fdJournal = -1;
  p->fdMain = open(p->zMain, O_CREAT|O_RDWR, 0600);
  if( p->fdMain<0 ){
    *ppPgr = 0;
    sqliteFree(p);
    return SQLITE_PERM;
  }
  p->nDbPg = sqlitePgPageCount(p->fdMain);
  if( access(p->zJournal, R_OK)==0 ){
    sqlitePgJournalPlayback(p);
  }
  *ppPgr = p;
  return SQLITE_OK;
}

/*
** Close the database file.  Any outstanding transactions are abandoned.
*/
int sqlitePgClose(Pgr *p){
  Pghdr *pPg;

  if( p->fdMain ) close(p->fdMain);
  if( p->fdJournal ) close(p->fdJournal);
  unlink(p->zJournal);
  while( (pPg = p->pAll)!=0 ){
    p->pAll = pPg->pNx;
    sqliteFree(pPg);
  }
  while( (pPg = p->pJidx)!=0 ){
    p->pAll = pPg->pNx;
    sqliteFree(pPg);
  }
  sqliteFree(p);
  return SQLITE_OK;
}

/*
** Begin a new transaction.  Return SQLITE_OK on success or an error
** code if something goes wrong.
*/
int sqlitePgBeginTransaction(Pgr *p){
  assert( p->fdJournal<0 );
  if( p->nRefPg>0 ){
     /* release the read lock */
  }
  /* write lock the database */
  p->fdJournal = open(p->zJournal, O_CREAT|O_EXCL|O_RDWR, 0600);
  if( p->fdJournal<0 ){
    return SQLITE_PERM;
  }
  p->nJPg = 0;
  TRACE(("PG: begin transaction\n"));
  return SQLITE_OK;
}

/*
** Commit the current transaction.  Return SQLITE_OK or an error code.
*/
int sqlitePgCommit(Pgr *p){
  Pghdr *pPrimaryIdx = 0;
  Pghdr *pPg;
  int rc;

  for(pPg=p->pAll; pPg; pPg=pPg->pNx){
    if( pPg->isDirty ){
      rc = sqlitePgMakeClean(pPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }
  for(pPg=p->pJidx; pPg; pPg=pPg->pNx){
    if( pPg->jpgno==1 ){
      pPrimaryIdx = pPg;
    }else{
      TRACE(("PG: writing j-index %u\n", pPg->jpgno));
      rc = sqlitePgMakeClean(pPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }
  assert( pPrimaryIdx!=0 );
  fsync(p->fdJournal);
  TRACE(("PG: writing j-index %u\n", pPrimaryIdx->jpgno));
  rc = sqlitePgMakeClean(pPrimaryIdx);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  fsync(p->fdJournal);
  rc = sqlitePgJournalPlayback(p);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  /* remove write lock from database */
  if( p->nRefPg>0 ){
    /* acquire read lock on database */
  }
  return SQLITE_OK;
}

/*
** Abandon the current transaction.
*/
int sqlitePgRollback(Pgr *p){
  Pghdr *pPg;

  TRACE(("PG: begin rollback\n"));
  for(pPg=p->pAll; pPg; pPg=pPg->pNx){
    if( pPg->isDirty || pPg->jpgno!=0 ){
      pPg->isDirty = 0;
      pPg->jpgno = 0;
      if( pPg->nRef>0 ){
        TRACE(("PG: reloading d-page %u\n", pPg->dbpgno));
        sqlitePgRead(p->fdMain, PG_TO_DATA(pPg), pPg->dbpgno);
      }else{
        sqlitePgUnlinkLru(pPg);
      }
    }
  }
  close(p->fdJournal);
  p->fdJournal = -1;
  unlink(p->zJournal);
  while( (pPg = p->pJidx)!=0 ){
    p->pAll = pPg->pNx;
    sqliteFree(pPg);
  }
  p->nDbPg = sqlitePgPageCount(p->fdMain);
  /* remove write lock from database */
  if( p->nRefPg>0 ){
    /* acquire read lock on database */
  }
  return SQLITE_OK;
}

/*
** Get a page from the database.  Return a pointer to the data for that
** page.
**
** A NULL pointer will be returned if we run out of memory.
*/
int sqlitePgGet(Pgr *p, u32 pgno, void **ppData){
  Pghdr *pPg;
  int h;

  pPg = sqlitePgFind(p, pgno);
  if( pPg ){
    pPg->nRef++;
    if( pPg->nRef==1 ){
      sqlitePgUnlinkLru(pPg);
      TRACE(("PG: d-page %u pulled from cache\n", pgno));
    }
    p->nRefPg++;
    if( p->nRefPg==1 ){
      /* Acquire a read lock */
    }
    *ppData = PG_TO_DATA(pPg);
    return SQLITE_OK;
  }
  if( p->nMemPg<MX_MEM_PAGE || p->pLru==0 ){
    pPg = sqliteMalloc( sizeof(Pghdr) + SQLITE_PAGE_SIZE );
    if( pPg==0 ) return SQLITE_NOMEM;
    p->nMemPg++;
    pPg->pNx = p->pAll;
    p->pAll = pPg;
    pPg->p = p;
    TRACE(("PG: new page %d created.\n", p->nMemPg));
  }else{
    int rc;
    pPg = p->pLru;
    if( pPg->isDirty ){
      rc = sqlitePgMakeClean(pPg);
      if( rc!=SQLITE_OK ) return rc;
    }
    sqlitePgUnlinkLru(pPg);
    h = pPg->dbpgno % PG_HASH_SIZE;
    if( pPg->pPvHash ){
      pPg->pPvHash->pNxHash = pPg->pNxHash;
    }else{
      assert( p->aPgHash[h]==pPg );
      p->aPgHash[h] = pPg->pNxHash;
    }
    if( pPg->pNxHash ){
      pPg->pNxHash->pPvHash = pPg->pPvHash;
    }
    TRACE(("PG: recycling d-page %u to d-page %u\n", pPg->dbpgno, pgno));
  }
  pPg->dbpgno = pgno;
  if( pgno>p->nDbPg ){
    p->nDbPg = pgno;
  }
  h = pgno % PG_HASH_SIZE;
  pPg->pPvHash = 0;
  pPg->pNxHash = p->aPgHash[h];
  if( pPg->pNxHash ){
    pPg->pNxHash->pPvHash = pPg;
  }
  p->aPgHash[h] = pPg;
  pPg->jpgno = sqlitePgJournalPageNumber(p, pgno);
  if( pPg->jpgno!=0 ){
    TRACE(("PG: reading d-page %u content from j-page %u\n", pgno, pPg->jpgno));
    sqlitePgRead(p->fdJournal, PG_TO_DATA(pPg), pPg->jpgno);
  }else{
    TRACE(("PG: reading d-page %u from database\n", pgno));
    sqlitePgRead(p->fdMain, PG_TO_DATA(pPg), pPg->dbpgno);
  }
  pPg->isDirty = 0;
  pPg->nRef = 1;
  p->nRefPg++;
  if( p->nRefPg==1 ){
    /* Acquire a read lock */
  }
  *ppData = PG_TO_DATA(pPg);
  return SQLITE_OK;
}

/*
** Release a reference to a database data page.
*/
int sqlitePgUnref(void *pData){
  Pghdr *pPg = DATA_TO_PG(pData);
  pPg->nRef--;
  assert( pPg->nRef>=0 );
  if( pPg->nRef==0 ){
    Pgr *p = pPg->p;
    pPg->pMru = 0;
    pPg->pLru = p->pLru;
    p->pLru = pPg;
    TRACE(("PG: d-page %u is unused\n", pPg->dbpgno));
    p->nRefPg--;
    if( p->nRefPg==0 ){
      /* Release the read lock */
    }
  }
  return SQLITE_OK;
}

/*
** The database page in the argument has been modified.  Write it back
** to the database file on the next commit.
*/
int sqlitePgTouch(void *pD){
  Pghdr *pPg = DATA_TO_PG(pD);
  assert( pPg->p->fdJournal>=0 );
  if( pPg->isDirty==0 ){
    pPg->isDirty = 1;
    TRACE(("PG: d-page %u is dirty\n", pPg->dbpgno));
  }
  return SQLITE_OK;
}

/*
** Return the number of the first unused page at the end of the
** database file.
*/
int sqlitePgAlloc(Pgr *p, int *pPgno){
  *pPgno = p->nDbPg;
  return SQLITE_OK;
}