/ Check-in [d04fa3a1]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Refactor PLWriter to remove owned buffer. DLCollector (Document List Collector) now handles the case where PLWriter (Position List Writer) needed a local buffer. Change to using the associated DLWriter (Document List Writer) buffer, which reduces the number of memory copies needed in doclist processing, and brings PLWriter operation in line with DLWriter operation. (CVS 3707)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: d04fa3a13a84f49074c673b8ee2fb6541da061b5
User & Date: shess 2007-03-22 00:14:29
Context
2007-03-22
15:20
Call sqlite3_free() instead of free() to release a buffer allocated by sqlite3_vmprintf() in test_async.c (test suite bug only). (CVS 3708) check-in: b078f09b user: danielk1977 tags: trunk
00:14
Refactor PLWriter to remove owned buffer. DLCollector (Document List Collector) now handles the case where PLWriter (Position List Writer) needed a local buffer. Change to using the associated DLWriter (Document List Writer) buffer, which reduces the number of memory copies needed in doclist processing, and brings PLWriter operation in line with DLWriter operation. (CVS 3707) check-in: d04fa3a1 user: shess tags: trunk
2007-03-20
23:52
Refactor PLWriter in preparation for buffered-document change. Currently, PLWriter (Position List Writer) creates a locally-owned DataBuffer to write into. This is necessary to support doclist collection during tokenization, where there is no obvious buffer to write output to, but is not necessary for the other users of PLWriter. This change adds a DLCollector (Doc List Collector) structure to handle the tokenization case.

Also fix a potential memory leak in writeZeroSegment(). In case of error from leafWriterStep(), the DataBuffer dl was being leaked. (CVS 3706) check-in: 1b9918e2 user: shess tags: trunk

Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts2/fts2.c.

   686    686   /* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
   687    687   ** always appends to the buffer and does not own it.
   688    688   **
   689    689   ** dlwInit - initialize to write a given type doclistto a buffer.
   690    690   ** dlwDestroy - clear the writer's memory.  Does not free buffer.
   691    691   ** dlwAppend - append raw doclist data to buffer.
   692    692   ** dlwAdd - construct doclist element and append to buffer.
          693  +**    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
   693    694   */
   694    695   typedef struct DLWriter {
   695    696     DocListType iType;
   696    697     DataBuffer *b;
   697    698     sqlite_int64 iPrevDocid;
   698    699   } DLWriter;
   699    700   
................................................................................
   747    748       dataBufferAppend2(pWriter->b, c, nFirstNew,
   748    749                         pData+nFirstOld, nData-nFirstOld);
   749    750     }else{
   750    751       dataBufferAppend(pWriter->b, c, nFirstNew);
   751    752     }
   752    753     pWriter->iPrevDocid = iLastDocid;
   753    754   }
   754         -static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid,
   755         -                   const char *pPosList, int nPosList){
          755  +static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
   756    756     char c[VARINT_MAX];
   757    757     int n = putVarint(c, iDocid-pWriter->iPrevDocid);
   758    758   
   759    759     assert( pWriter->iPrevDocid<iDocid );
   760         -  assert( pPosList==0 || pWriter->iType>DL_DOCIDS );
          760  +  assert( pWriter->iType==DL_DOCIDS );
   761    761   
   762    762     dataBufferAppend(pWriter->b, c, n);
   763         -
   764         -  if( pWriter->iType>DL_DOCIDS ){
   765         -    n = putVarint(c, 0);
   766         -    if( nPosList>0 ){
   767         -      dataBufferAppend2(pWriter->b, pPosList, nPosList, c, n);
   768         -    }else{
   769         -      dataBufferAppend(pWriter->b, c, n);
   770         -    }
   771         -  }
   772    763     pWriter->iPrevDocid = iDocid;
   773    764   }
   774    765   
   775    766   /*******************************************************************/
   776    767   /* PLReader is used to read data from a document's position list.  As
   777    768   ** the caller steps through the list, data is cached so that varints
   778    769   ** only need to be decoded once.
................................................................................
   850    841       pReader->iEndOffset = pReader->iStartOffset+i;
   851    842     }
   852    843     assert( n<=pReader->nData );
   853    844     pReader->pData += n;
   854    845     pReader->nData -= n;
   855    846   }
   856    847   
   857         -static void plrInit(PLReader *pReader, DocListType iType,
   858         -                    const char *pData, int nData){
   859         -  pReader->pData = pData;
   860         -  pReader->nData = nData;
   861         -  pReader->iType = iType;
          848  +static void plrInit(PLReader *pReader, DLReader *pDLReader){
          849  +  pReader->pData = dlrPosData(pDLReader);
          850  +  pReader->nData = dlrPosDataLen(pDLReader);
          851  +  pReader->iType = pDLReader->iType;
   862    852     pReader->iColumn = 0;
   863    853     pReader->iPosition = 0;
   864    854     pReader->iStartOffset = 0;
   865    855     pReader->iEndOffset = 0;
   866    856     plrStep(pReader);
   867    857   }
   868    858   static void plrDestroy(PLReader *pReader){
   869    859     SCRAMBLE(pReader);
   870    860   }
   871    861   
   872    862   /*******************************************************************/
   873    863   /* PLWriter is used in constructing a document's position list.  As a
   874    864   ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
          865  +** PLWriter writes to the associated DLWriter's buffer.
   875    866   **
   876    867   ** plwInit - init for writing a document's poslist.
   877         -** plwReset - reset the writer for a new document.
   878    868   ** plwDestroy - clear a writer.
   879         -** plwNew - malloc storage and initialize it.
   880         -** plwDelete - clear and free storage.
   881         -** plwDlwAdd - append the docid and poslist to a doclist writer.
   882    869   ** plwAdd - append position and offset information.
          870  +** plwTerminate - add any necessary doclist terminator.
          871  +**
          872  +** Calling plwAdd() after plwTerminate() may result in a corrupt
          873  +** doclist.
   883    874   */
   884         -/* TODO(shess) PLWriter is used in two ways.  fulltextUpdate() uses it
   885         -** in construction of a new doclist.  docListTrim() and mergePosList()
   886         -** use it when trimming.  In the former case, it wants to own the
   887         -** DataBuffer, in the latter it's possible it could encode into a
   888         -** pre-existing DataBuffer.
          875  +/* TODO(shess) Until we've written the second item, we can cache the
          876  +** first item's information.  Then we'd have three states:
          877  +**
          878  +** - initialized with docid, no positions.
          879  +** - docid and one position.
          880  +** - docid and multiple positions.
          881  +**
          882  +** Only the last state needs to actually write to dlw->b, which would
          883  +** be an improvement in the DLCollector case.
   889    884   */
   890    885   typedef struct PLWriter {
   891         -  DataBuffer b;
          886  +  DLWriter *dlw;
   892    887   
   893         -  sqlite_int64 iDocid;
   894         -  DocListType iType;
   895    888     int iColumn;    /* the last column written */
   896    889     int iPos;       /* the last position written */
   897    890     int iOffset;    /* the last start offset written */
   898    891   } PLWriter;
   899    892   
   900         -static void plwDlwAdd(PLWriter *pWriter, DLWriter *dlWriter){
   901         -  dlwAdd(dlWriter, pWriter->iDocid, pWriter->b.pData, pWriter->b.nData);
   902         -}
          893  +/* TODO(shess) In the case where the parent is reading these values
          894  +** from a PLReader, we could optimize to a copy if that PLReader has
          895  +** the same type as pWriter.
          896  +*/
   903    897   static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
   904    898                      int iStartOffset, int iEndOffset){
   905    899     /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
   906    900     ** iStartOffsetDelta, and iEndOffsetDelta.
   907    901     */
   908    902     char c[5*VARINT_MAX];
   909    903     int n = 0;
   910    904   
   911         -  if( pWriter->iType==DL_DOCIDS ) return;
          905  +  /* Ban plwAdd() after plwTerminate(). */
          906  +  assert( pWriter->iPos!=-1 );
          907  +
          908  +  if( pWriter->dlw->iType==DL_DOCIDS ) return;
   912    909   
   913    910     if( iColumn!=pWriter->iColumn ){
   914    911       n += putVarint(c+n, POS_COLUMN);
   915    912       n += putVarint(c+n, iColumn);
   916    913       pWriter->iColumn = iColumn;
   917    914       pWriter->iPos = 0;
   918    915       pWriter->iOffset = 0;
   919    916     }
   920    917     assert( iPos>=pWriter->iPos );
   921    918     n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
   922    919     pWriter->iPos = iPos;
   923         -  if( pWriter->iType==DL_POSITIONS_OFFSETS ){
          920  +  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
   924    921       assert( iStartOffset>=pWriter->iOffset );
   925    922       n += putVarint(c+n, iStartOffset-pWriter->iOffset);
   926    923       pWriter->iOffset = iStartOffset;
   927    924       assert( iEndOffset>=iStartOffset );
   928    925       n += putVarint(c+n, iEndOffset-iStartOffset);
   929    926     }
   930         -  dataBufferAppend(&pWriter->b, c, n);
          927  +  dataBufferAppend(pWriter->dlw->b, c, n);
   931    928   }
   932         -static void plwReset(PLWriter *pWriter,
   933         -                     sqlite_int64 iDocid, DocListType iType){
   934         -  dataBufferReset(&pWriter->b);
   935         -  pWriter->iDocid = iDocid;
   936         -  pWriter->iType = iType;
          929  +static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
          930  +  char c[VARINT_MAX];
          931  +  int n;
          932  +
          933  +  pWriter->dlw = dlw;
          934  +
          935  +  assert( iDocid>pWriter->dlw->iPrevDocid );
          936  +  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
          937  +  dataBufferAppend(pWriter->dlw->b, c, n);
          938  +  pWriter->dlw->iPrevDocid = iDocid;
          939  +
   937    940     pWriter->iColumn = 0;
   938    941     pWriter->iPos = 0;
   939    942     pWriter->iOffset = 0;
   940    943   }
   941         -static void plwInit(PLWriter *pWriter, sqlite_int64 iDocid, DocListType iType){
   942         -  dataBufferInit(&pWriter->b, 0);
   943         -  plwReset(pWriter, iDocid, iType);
          944  +/* TODO(shess) Should plwDestroy() also terminate the doclist?  But
          945  +** then plwDestroy() would no longer be just a destructor, it would
          946  +** also be doing work, which isn't consistent with the overall idiom.
          947  +** Another option would be for plwAdd() to always append any necessary
          948  +** terminator, so that the output is always correct.  But that would
          949  +** add incremental work to the common case with the only benefit being
          950  +** API elegance.  Punt for now.
          951  +*/
          952  +static void plwTerminate(PLWriter *pWriter){
          953  +  if( pWriter->dlw->iType>DL_DOCIDS ){
          954  +    char c[VARINT_MAX];
          955  +    int n = putVarint(c, POS_END);
          956  +    dataBufferAppend(pWriter->dlw->b, c, n);
          957  +  }
          958  +#ifndef NDEBUG
          959  +  /* Mark as terminated for assert in plwAdd(). */
          960  +  pWriter->iPos = -1;
          961  +#endif
   944    962   }
   945    963   static void plwDestroy(PLWriter *pWriter){
   946         -  dataBufferDestroy(&pWriter->b);
   947    964     SCRAMBLE(pWriter);
   948    965   }
   949    966   
   950    967   /*******************************************************************/
   951    968   /* DLCollector wraps PLWriter and DLWriter to provide a
   952    969   ** dynamically-allocated doclist area to use during tokenization.
   953    970   **
   954    971   ** dlcNew - malloc up and initialize a collector.
   955    972   ** dlcDelete - destroy a collector and all contained items.
   956    973   ** dlcAddPos - append position and offset information.
   957    974   ** dlcAddDoclist - add the collected doclist to the given buffer.
   958    975   */
   959    976   typedef struct DLCollector {
          977  +  DataBuffer b;
          978  +  DLWriter dlw;
   960    979     PLWriter plw;
   961    980   } DLCollector;
   962    981   
          982  +/* TODO(shess) This could also be done by calling plwTerminate() and
          983  +** dataBufferAppend().  I tried that, expecting nominal performance
          984  +** differences, but it seemed to pretty reliably be worth 1% to code
          985  +** it this way.  I suspect it's the incremental malloc overhead (some
          986  +** percentage of the plwTerminate() calls will cause a realloc), so
          987  +** this might be worth revisiting if the DataBuffer implementation
          988  +** changes.
          989  +*/
   963    990   static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
   964         -  DLWriter dlw;
   965         -  dlwInit(&dlw, pCollector->plw.iType, b);
   966         -  plwDlwAdd(&pCollector->plw, &dlw);
   967         -  dlwDestroy(&dlw);
          991  +  if( pCollector->dlw.iType>DL_DOCIDS ){
          992  +    char c[VARINT_MAX];
          993  +    int n = putVarint(c, POS_END);
          994  +    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
          995  +  }else{
          996  +    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
          997  +  }
   968    998   }
   969    999   static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
   970   1000                         int iStartOffset, int iEndOffset){
   971   1001     plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
   972   1002   }
   973   1003   
   974   1004   static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
   975   1005     DLCollector *pCollector = malloc(sizeof(DLCollector));
   976         -  plwInit(&pCollector->plw, iDocid, iType);
         1006  +  dataBufferInit(&pCollector->b, 0);
         1007  +  dlwInit(&pCollector->dlw, iType, &pCollector->b);
         1008  +  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
   977   1009     return pCollector;
   978   1010   }
   979   1011   static void dlcDelete(DLCollector *pCollector){
   980   1012     plwDestroy(&pCollector->plw);
         1013  +  dlwDestroy(&pCollector->dlw);
         1014  +  dataBufferDestroy(&pCollector->b);
   981   1015     SCRAMBLE(pCollector);
   982   1016     free(pCollector);
   983   1017   }
   984   1018   
   985   1019   
   986   1020   /* Copy the doclist data of iType in pData/nData into *out, trimming
   987   1021   ** unnecessary data as we go.  Only columns matching iColumn are
   988         -** copied, all columns copied if iColimn is -1.  Elements with no
         1022  +** copied, all columns copied if iColumn is -1.  Elements with no
   989   1023   ** matching columns are dropped.  The output is an iOutType doclist.
         1024  +*/
         1025  +/* NOTE(shess) This code is only valid after all doclists are merged.
         1026  +** If this is run before merges, then doclist items which represent
         1027  +** deletion will be trimmed, and will thus not effect a deletion
         1028  +** during the merge.
   990   1029   */
   991   1030   static void docListTrim(DocListType iType, const char *pData, int nData,
   992   1031                           int iColumn, DocListType iOutType, DataBuffer *out){
   993   1032     DLReader dlReader;
   994   1033     DLWriter dlWriter;
   995         -  PLWriter plWriter;
   996   1034   
   997   1035     assert( iOutType<=iType );
   998   1036   
   999   1037     dlrInit(&dlReader, iType, pData, nData);
  1000   1038     dlwInit(&dlWriter, iOutType, out);
  1001         -  plwInit(&plWriter, 0, iOutType);
  1002   1039   
  1003   1040     while( !dlrAtEnd(&dlReader) ){
  1004   1041       PLReader plReader;
         1042  +    PLWriter plWriter;
  1005   1043       int match = 0;
  1006   1044   
  1007         -    plrInit(&plReader, dlReader.iType,
  1008         -            dlrPosData(&dlReader), dlrPosDataLen(&dlReader));
  1009         -    plwReset(&plWriter, dlrDocid(&dlReader), iOutType);
         1045  +    plrInit(&plReader, &dlReader);
  1010   1046   
  1011   1047       while( !plrAtEnd(&plReader) ){
  1012   1048         if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
  1013         -        match = 1;
         1049  +        if( !match ){
         1050  +          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
         1051  +          match = 1;
         1052  +        }
  1014   1053           plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
  1015   1054                  plrStartOffset(&plReader), plrEndOffset(&plReader));
  1016   1055         }
  1017   1056         plrStep(&plReader);
  1018   1057       }
  1019         -    if( match ) plwDlwAdd(&plWriter, &dlWriter);
         1058  +    if( match ){
         1059  +      plwTerminate(&plWriter);
         1060  +      plwDestroy(&plWriter);
         1061  +    }
  1020   1062   
  1021   1063       plrDestroy(&plReader);
  1022   1064       dlrStep(&dlReader);
  1023   1065     }
  1024         -  plwDestroy(&plWriter);
  1025   1066     dlwDestroy(&dlWriter);
  1026   1067     dlrDestroy(&dlReader);
  1027   1068   }
  1028   1069   
  1029   1070   /* Used by docListMerge() to keep doclists in the ascending order by
  1030   1071   ** docid, then ascending order by age (so the newest comes first).
  1031   1072   */
................................................................................
  1168   1209     PLReader left, right;
  1169   1210     PLWriter writer;
  1170   1211     int match = 0;
  1171   1212   
  1172   1213     assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  1173   1214     assert( pOut->iType!=DL_POSITIONS_OFFSETS );
  1174   1215   
  1175         -  plrInit(&left, pLeft->iType, dlrPosData(pLeft), dlrPosDataLen(pLeft));
  1176         -  plrInit(&right, pRight->iType, dlrPosData(pRight), dlrPosDataLen(pRight));
  1177         -  plwInit(&writer, dlrDocid(pLeft), pOut->iType);
         1216  +  plrInit(&left, pLeft);
         1217  +  plrInit(&right, pRight);
  1178   1218   
  1179   1219     while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
  1180   1220       if( plrColumn(&left)<plrColumn(&right) ){
  1181   1221         plrStep(&left);
  1182   1222       }else if( plrColumn(&left)>plrColumn(&right) ){
  1183   1223         plrStep(&right);
  1184   1224       }else if( plrPosition(&left)+1<plrPosition(&right) ){
  1185   1225         plrStep(&left);
  1186   1226       }else if( plrPosition(&left)+1>plrPosition(&right) ){
  1187   1227         plrStep(&right);
  1188   1228       }else{
  1189         -      match = 1;
         1229  +      if( !match ){
         1230  +        plwInit(&writer, pOut, dlrDocid(pLeft));
         1231  +        match = 1;
         1232  +      }
  1190   1233         plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
  1191   1234         plrStep(&left);
  1192   1235         plrStep(&right);
  1193   1236       }
  1194   1237     }
  1195   1238   
  1196         -  /* TODO(shess) We could remember the output position, encode the
  1197         -  ** docid, then encode the poslist directly into the output.  If no
  1198         -  ** match, we back out to the stored output position.  This would
  1199         -  ** also reduce the malloc count.
  1200         -  */
  1201         -  if( match ) plwDlwAdd(&writer, pOut);
         1239  +  if( match ){
         1240  +    plwTerminate(&writer);
         1241  +    plwDestroy(&writer);
         1242  +  }
  1202   1243   
  1203   1244     plrDestroy(&left);
  1204   1245     plrDestroy(&right);
  1205         -  plwDestroy(&writer);
  1206   1246   }
  1207   1247   
  1208   1248   /* We have two doclists with positions:  pLeft and pRight.
  1209   1249   ** Write the phrase intersection of these two doclists into pOut.
  1210   1250   **
  1211   1251   ** A phrase intersection means that two documents only match
  1212   1252   ** if pLeft.iPos+1==pRight.iPos.
................................................................................
  1268   1308   
  1269   1309     while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
  1270   1310       if( dlrDocid(&left)<dlrDocid(&right) ){
  1271   1311         dlrStep(&left);
  1272   1312       }else if( dlrDocid(&right)<dlrDocid(&left) ){
  1273   1313         dlrStep(&right);
  1274   1314       }else{
  1275         -      dlwAdd(&writer, dlrDocid(&left), 0, 0);
         1315  +      dlwAdd(&writer, dlrDocid(&left));
  1276   1316         dlrStep(&left);
  1277   1317         dlrStep(&right);
  1278   1318       }
  1279   1319     }
  1280   1320   
  1281   1321     dlrDestroy(&left);
  1282   1322     dlrDestroy(&right);
................................................................................
  1306   1346   
  1307   1347     dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  1308   1348     dlrInit(&right, DL_DOCIDS, pRight, nRight);
  1309   1349     dlwInit(&writer, DL_DOCIDS, pOut);
  1310   1350   
  1311   1351     while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
  1312   1352       if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
  1313         -      dlwAdd(&writer, dlrDocid(&left), 0, 0);
         1353  +      dlwAdd(&writer, dlrDocid(&left));
  1314   1354         dlrStep(&left);
  1315   1355       }else if( dlrAtEnd(&left) || dlrDocid(&right)<dlrDocid(&left) ){
  1316         -      dlwAdd(&writer, dlrDocid(&right), 0, 0);
         1356  +      dlwAdd(&writer, dlrDocid(&right));
  1317   1357         dlrStep(&right);
  1318   1358       }else{
  1319         -      dlwAdd(&writer, dlrDocid(&left), 0, 0);
         1359  +      dlwAdd(&writer, dlrDocid(&left));
  1320   1360         dlrStep(&left);
  1321   1361         dlrStep(&right);
  1322   1362       }
  1323   1363     }
  1324   1364   
  1325   1365     dlrDestroy(&left);
  1326   1366     dlrDestroy(&right);
................................................................................
  1350   1390     dlwInit(&writer, DL_DOCIDS, pOut);
  1351   1391   
  1352   1392     while( !dlrAtEnd(&left) ){
  1353   1393       while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
  1354   1394         dlrStep(&right);
  1355   1395       }
  1356   1396       if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
  1357         -      dlwAdd(&writer, dlrDocid(&left), 0, 0);
         1397  +      dlwAdd(&writer, dlrDocid(&left));
  1358   1398       }
  1359   1399       dlrStep(&left);
  1360   1400     }
  1361   1401   
  1362   1402     dlrDestroy(&left);
  1363   1403     dlrDestroy(&right);
  1364   1404     dlwDestroy(&writer);