SQLite

Check-in [1b9918e207]

Overview
Comment: Refactor PLWriter in preparation for buffered-document change. Currently, PLWriter (Position List Writer) creates a locally-owned DataBuffer to write into. This is necessary to support doclist collection during tokenization, where there is no obvious buffer to write output to, but is not necessary for the other users of PLWriter. This change adds a DLCollector (Doc List Collector) structure to handle the tokenization case.

Also fix a potential memory leak in writeZeroSegment(). In case of error from leafWriterStep(), the DataBuffer dl was being leaked. (CVS 3706)
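The leak follows a common goto-cleanup pattern. A minimal sketch of the before and after shapes (condensed from the writeZeroSegment() hunk later on this page; surrounding declarations omitted):

  /* Before: an error from leafWriterStep() jumps past the cleanup,
  ** so the memory owned by dl is never released. */
  dataBufferInit(&dl, 0);
  for(i=0; i<n; i++){
    rc = leafWriterStep(v, &writer,
                        pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
    if( rc!=SQLITE_OK ) goto err;      /* skips dataBufferDestroy(&dl) */
  }
  dataBufferDestroy(&dl);              /* reached only on success */
 err:
  free(pData);

  /* After: the destroy moves below the label, so the success and
  ** error paths both release the buffer exactly once. */
 err:
  dataBufferDestroy(&dl);
  free(pData);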

SHA1: 1b9918e20767aebc9c1e7523027139e5fbc12688
User & Date: shess 2007-03-20 23:52:38.000
Context
2007-03-22 00:14
Refactor PLWriter to remove owned buffer. DLCollector (Document List Collector) now handles the case where PLWriter (Position List Writer) needed a local buffer. Change to using the associated DLWriter (Document List Writer) buffer, which reduces the number of memory copies needed in doclist processing, and brings PLWriter operation in line with DLWriter operation. (CVS 3707) (check-in: d04fa3a13a user: shess tags: trunk)
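The copy that this follow-up check-in removes can be illustrated with a generic sketch (the encodePoslist() helper and the buffers below are illustrative only, not fts2 code):

  /* Owned-buffer style: encode positions into a scratch buffer, then
  ** copy the result into the destination, paying for the data twice. */
  DataBuffer scratch;
  dataBufferInit(&scratch, 0);
  encodePoslist(&scratch);                                /* hypothetical encoder */
  dataBufferAppend(&dest, scratch.pData, scratch.nData);  /* second copy */
  dataBufferDestroy(&scratch);

  /* Shared-buffer style: encode straight into the destination, which
  ** is what writing through the DLWriter's buffer achieves. */
  encodePoslist(&dest);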
2007-03-20 23:52
[This check-in] Refactor PLWriter in preparation for buffered-document change; also fix a potential memory leak in writeZeroSegment(). (CVS 3706) (check-in: 1b9918e207 user: shess tags: trunk)

2007-03-19 17:44
Modify the interface to the pager sub-system in preparation for performing IO in blocks based on sector-size, not database page-size. (CVS 3705) (check-in: 7dc7658887 user: danielk1977 tags: trunk)
Changes
Changes to ext/fts2/fts2.c.
Old (lines 938-963):

  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
static void plwInit(PLWriter *pWriter, sqlite_int64 iDocid, DocListType iType){
  dataBufferInit(&pWriter->b, 0);
  plwReset(pWriter, iDocid, iType);
}
static PLWriter *plwNew(sqlite_int64 iDocid, DocListType iType){
  PLWriter *pWriter = malloc(sizeof(PLWriter));
  plwInit(pWriter, iDocid, iType);
  return pWriter;
}

static void plwDestroy(PLWriter *pWriter){
  dataBufferDestroy(&pWriter->b);
  SCRAMBLE(pWriter);
}
static void plwDelete(PLWriter *pWriter){
  plwDestroy(pWriter);
  free(pWriter);
}

/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go.  Only columns matching iColumn are
** copied, all columns copied if iColimn is -1.  Elements with no
** matching columns are dropped.  The output is an iOutType doclist.

New (lines 938-989):

  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
static void plwInit(PLWriter *pWriter, sqlite_int64 iDocid, DocListType iType){
  dataBufferInit(&pWriter->b, 0);
  plwReset(pWriter, iDocid, iType);
}
static void plwDestroy(PLWriter *pWriter){
  dataBufferDestroy(&pWriter->b);
  SCRAMBLE(pWriter);
}

/*******************************************************************/
/* DLCollector wraps PLWriter and DLWriter to provide a
** dynamically-allocated doclist area to use during tokenization.
**
** dlcNew - malloc up and initialize a collector.
** dlcDelete - destroy a collector and all contained items.
** dlcAddPos - append position and offset information.
** dlcAddDoclist - add the collected doclist to the given buffer.
*/
typedef struct DLCollector {
  PLWriter plw;
} DLCollector;

static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  DLWriter dlw;
  dlwInit(&dlw, pCollector->plw.iType, b);
  plwDlwAdd(&pCollector->plw, &dlw);
  dlwDestroy(&dlw);
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}

static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
  DLCollector *pCollector = malloc(sizeof(DLCollector));
  plwInit(&pCollector->plw, iDocid, iType);
  return pCollector;
}
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  SCRAMBLE(pCollector);
  free(pCollector);
}

/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go.  Only columns matching iColumn are
** copied, all columns copied if iColimn is -1.  Elements with no
** matching columns are dropped.  The output is an iOutType doclist.
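Read together, the four dlc* functions form a small lifecycle. A sketch of the intended calling sequence (mirroring the tokenization and writeZeroSegment() hunks below; error handling omitted):

  DLCollector *p = dlcNew(iDocid, DL_DEFAULT);   /* one collector per term */
  dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
  /* ... repeated for each position the tokenizer reports ... */

  DataBuffer doclist;
  dataBufferInit(&doclist, 0);
  dlcAddDoclist(p, &doclist);     /* serialize the collected positions */
  /* ... pass doclist.pData/doclist.nData to leafWriterStep() ... */
  dataBufferDestroy(&doclist);
  dlcDelete(p);                   /* frees the PLWriter and the collector */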
Old (lines 3529-3557):

  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    PLWriter *p;

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    p = fts2HashFind(terms, pToken, nTokenBytes);
    if( p==NULL ){
      p = plwNew(iDocid, DL_DEFAULT);
      fts2HashInsert(terms, pToken, nTokenBytes, p);
    }
    if( iColumn>=0 ){
      plwAdd(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*

New (lines 3555-3583):

  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DLCollector *p;

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    p = fts2HashFind(terms, pToken, nTokenBytes);
    if( p==NULL ){
      p = dlcNew(iDocid, DL_DEFAULT);
      fts2HashInsert(terms, pToken, nTokenBytes, p);
    }
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
Old (lines 5041-5055):

}

/****************************************************************/
/* Used to hold hashtable data for sorting. */
typedef struct TermData {
  const char *pTerm;
  int nTerm;
  PLWriter *pWriter;
} TermData;

/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
** for equal, >0 for greater-than).
*/
static int termDataCmp(const void *av, const void *bv){
  const TermData *a = (const TermData *)av;

New (lines 5067-5081):

}

/****************************************************************/
/* Used to hold hashtable data for sorting. */
typedef struct TermData {
  const char *pTerm;
  int nTerm;
  DLCollector *pCollector;
} TermData;

/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
** for equal, >0 for greater-than).
*/
static int termDataCmp(const void *av, const void *bv){
  const TermData *a = (const TermData *)av;
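The diff context truncates termDataCmp() after its first statement. A comparator "in strcmp fashion" over counted strings conventionally continues along these lines (a sketch of the pattern, not necessarily the exact body):

  static int termDataCmp(const void *av, const void *bv){
    const TermData *a = (const TermData *)av;
    const TermData *b = (const TermData *)bv;
    int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
    int c = memcmp(a->pTerm, b->pTerm, n);
    if( c!=0 ) return c;
    /* Equal prefixes: the shorter term sorts first. */
    return a->nTerm - b->nTerm;
  }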
Old (lines 5077-5118):

  n = fts2HashCount(pTerms);
  pData = malloc(n*sizeof(TermData));

  for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
    assert( i<n );
    pData[i].pTerm = fts2HashKey(e);
    pData[i].nTerm = fts2HashKeysize(e);
    pData[i].pWriter = fts2HashData(e);
  }
  assert( i==n );

  /* TODO(shess) Should we allow user-defined collation sequences,
  ** here?  I think we only need that once we support prefix searches.
  */
  if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);

  /* TODO(shess) Refactor so that we can write directly to the segment
  ** DataBuffer, as happens for segment merges.
  */
  leafWriterInit(0, idx, &writer);
  dataBufferInit(&dl, 0);
  for(i=0; i<n; i++){
    DLWriter dlw;
    dataBufferReset(&dl);
    dlwInit(&dlw, DL_DEFAULT, &dl);
    plwDlwAdd(pData[i].pWriter, &dlw);
    rc = leafWriterStep(v, &writer,
                        pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
    dlwDestroy(&dlw);
    if( rc!=SQLITE_OK ) goto err;
  }
  dataBufferDestroy(&dl);
  rc = leafWriterFinalize(v, &writer);

 err:
  free(pData);
  leafWriterDestroy(&writer);
  return rc;
}

/* This function implements the xUpdate callback; it's the top-level entry
 * point for inserting, deleting or updating a row in a full-text table. */

New (lines 5103-5141):

  n = fts2HashCount(pTerms);
  pData = malloc(n*sizeof(TermData));

  for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
    assert( i<n );
    pData[i].pTerm = fts2HashKey(e);
    pData[i].nTerm = fts2HashKeysize(e);
    pData[i].pCollector = fts2HashData(e);
  }
  assert( i==n );

  /* TODO(shess) Should we allow user-defined collation sequences,
  ** here?  I think we only need that once we support prefix searches.
  */
  if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);

  /* TODO(shess) Refactor so that we can write directly to the segment
  ** DataBuffer, as happens for segment merges.
  */
  leafWriterInit(0, idx, &writer);
  dataBufferInit(&dl, 0);
  for(i=0; i<n; i++){
    dataBufferReset(&dl);
    dlcAddDoclist(pData[i].pCollector, &dl);
    rc = leafWriterStep(v, &writer,
                        pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
    if( rc!=SQLITE_OK ) goto err;
  }
  rc = leafWriterFinalize(v, &writer);

 err:
  dataBufferDestroy(&dl);
  free(pData);
  leafWriterDestroy(&writer);
  return rc;
}

/* This function implements the xUpdate callback; it's the top-level entry
 * point for inserting, deleting or updating a row in a full-text table. */
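For orientation, SQLite's virtual-table xUpdate interface encodes the three operations in its argument vector. A simplified sketch of the standard dispatch (the signature is the documented sqlite3 xUpdate shape; the function name follows fts2's fulltext* convention and the bodies are condensed away):

  static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg,
                            sqlite3_value **ppArg, sqlite_int64 *pRowid){
    if( nArg==1 ){
      /* DELETE: ppArg[0] holds the rowid of the row to remove. */
    }else if( sqlite3_value_type(ppArg[0])==SQLITE_NULL ){
      /* INSERT: ppArg[1] is the requested rowid (or NULL to assign one)
      ** and ppArg[2..] are the column values. */
    }else{
      /* UPDATE: ppArg[0] is the existing rowid, ppArg[1] the new one. */
    }
    return SQLITE_OK;
  }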
Old (lines 5154-5168):

    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
  }

  if( rc==SQLITE_OK ) rc = writeZeroSegment(v, &terms);

  /* clean up */
  for(e=fts2HashFirst(&terms); e; e=fts2HashNext(e)){
    plwDelete(fts2HashData(e));
  }
  fts2HashClear(&terms);

  return rc;
}

/*

New (lines 5177-5191):

    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
  }

  if( rc==SQLITE_OK ) rc = writeZeroSegment(v, &terms);

  /* clean up */
  for(e=fts2HashFirst(&terms); e; e=fts2HashNext(e)){
    dlcDelete(fts2HashData(e));
  }
  fts2HashClear(&terms);

  return rc;
}

/*