Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add the NEAR operator to fts3. (CVS 4502) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
aef7720e0bb49d52332ddebe6f698feb |
User & Date: | danielk1977 2007-10-22 18:02:20.000 |
Context
2007-10-23
| ||
08:17 | Fix an error message in the tcl interface. (CVS 4503) (check-in: 2449e08069 user: danielk1977 tags: trunk) | |
2007-10-22
| ||
18:02 | Add the NEAR operator to fts3. (CVS 4502) (check-in: aef7720e0b user: danielk1977 tags: trunk) | |
2007-10-21
| ||
22:59 | We need an extra define to activate OS/2 semaphores for compiling/linking. (CVS 4501) (check-in: 0604dace0e user: pweilbacher tags: trunk) | |
Changes
Changes to ext/fts3/fts3.c.
︙ | ︙ | |||
284 285 286 287 288 289 290 | #include <string.h> #include <ctype.h> #include "fts3.h" #include "fts3_hash.h" #include "fts3_tokenizer.h" #include "sqlite3.h" | > | | > | 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | #include <string.h> #include <ctype.h> #include "fts3.h" #include "fts3_hash.h" #include "fts3_tokenizer.h" #include "sqlite3.h" #ifndef SQLITE_CORE #include "sqlite3ext.h" SQLITE_EXTENSION_INIT1 #endif /* TODO(shess) MAN, this thing needs some refactoring. At minimum, it ** would be nice to order the file better, perhaps something along the ** lines of: ** ** - utility functions |
︙ | ︙ | |||
307 308 309 310 311 312 313 314 315 316 317 318 319 320 | #if 0 # define TRACE(A) printf A; fflush(stdout) #else # define TRACE(A) #endif /* It is not safe to call isspace(), tolower(), or isalnum() on ** hi-bit-set characters. This is the same solution used in the ** tokenizer. */ /* TODO(shess) The snippet-generation code should be using the ** tokenizer-generated tokens rather than doing its own local ** tokenization. | > > > > > | 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | #if 0 # define TRACE(A) printf A; fflush(stdout) #else # define TRACE(A) #endif /* ** Default span for NEAR operators. */ #define SQLITE_FTS3_DEFAULT_NEAR_PARAM 10 /* It is not safe to call isspace(), tolower(), or isalnum() on ** hi-bit-set characters. This is the same solution used in the ** tokenizer. */ /* TODO(shess) The snippet-generation code should be using the ** tokenizer-generated tokens rather than doing its own local ** tokenization. |
︙ | ︙ | |||
1361 1362 1363 1364 1365 1366 1367 | } dlrDestroy(&left); dlrDestroy(&right); dlwDestroy(&writer); } | > > > > | > > > > > | | < < < | | > | > > | > > > > | > | < < > | | | | > | > > > | > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > | > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 | } dlrDestroy(&left); dlrDestroy(&right); dlwDestroy(&writer); } /* ** This function is used as part of the implementation of phrase and ** NEAR matching. ** ** pLeft and pRight are DLReaders positioned to the same docid in ** lists of type DL_POSITION. This function writes an entry to the ** DLWriter pOut for each position in pRight that is less than ** (nNear+1) greater (but not equal to or smaller) than a position ** in pLeft. For example, if nNear is 0, and the positions contained ** by pLeft and pRight are: ** ** pLeft: 5 10 15 20 ** pRight: 6 9 17 21 ** ** then the docid is added to pOut. If pOut is of type DL_POSITIONS, ** then a positionids "6" and "21" are also added to pOut. ** ** If boolean argument isSaveLeft is true, then positionids are copied ** from pLeft instead of pRight. In the example above, the positions "5" ** and "20" would be added instead of "6" and "21". */ static void posListPhraseMerge( DLReader *pLeft, DLReader *pRight, int nNear, int isSaveLeft, DLWriter *pOut ){ PLReader left, right; PLWriter writer; int match = 0; assert( dlrDocid(pLeft)==dlrDocid(pRight) ); assert( pOut->iType!=DL_POSITIONS_OFFSETS ); plrInit(&left, pLeft); plrInit(&right, pRight); while( !plrAtEnd(&left) && !plrAtEnd(&right) ){ if( plrColumn(&left)<plrColumn(&right) ){ plrStep(&left); }else if( plrColumn(&left)>plrColumn(&right) ){ plrStep(&right); }else if( plrPosition(&left)>=plrPosition(&right) ){ plrStep(&right); }else{ if( (plrPosition(&right)-plrPosition(&left))<=(nNear+1) ){ if( !match ){ plwInit(&writer, pOut, dlrDocid(pLeft)); match = 1; } if( !isSaveLeft ){ plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0); }else{ plwAdd(&writer, plrColumn(&left), plrPosition(&left), 0, 0); } plrStep(&right); }else{ plrStep(&left); } } } if( match ){ plwTerminate(&writer); plwDestroy(&writer); } plrDestroy(&left); plrDestroy(&right); } /* ** Compare the values pointed to by the PLReaders passed as arguments. ** Return -1 if the value pointed to by pLeft is considered less than ** the value pointed to by pRight, +1 if it is considered greater ** than it, or 0 if it is equal. i.e. ** ** (*pLeft - *pRight) ** ** A PLReader that is in the EOF condition is considered greater than ** any other. If neither argument is in EOF state, the return value of ** plrColumn() is used. If the plrColumn() values are equal, the ** comparison is on the basis of plrPosition(). */ static int plrCompare(PLReader *pLeft, PLReader *pRight){ assert(!plrAtEnd(pLeft) || !plrAtEnd(pRight)); if( plrAtEnd(pRight) || plrAtEnd(pLeft) ){ return (plrAtEnd(pRight) ? -1 : 1); } if( plrColumn(pLeft)!=plrColumn(pRight) ){ return ((plrColumn(pLeft)<plrColumn(pRight)) ? -1 : 1); } if( plrPosition(pLeft)!=plrPosition(pRight) ){ return ((plrPosition(pLeft)<plrPosition(pRight)) ? -1 : 1); } return 0; } /* We have two doclists with positions: pLeft and pRight. Depending ** on the value of the nNear parameter, perform either a phrase ** intersection (if nNear==0) or a NEAR intersection (if nNear>0) ** and write the results into pOut. ** ** A phrase intersection means that two documents only match ** if pLeft.iPos+1==pRight.iPos. ** ** A NEAR intersection means that two documents only match if ** (abs(pLeft.iPos-pRight.iPos)<nNear). ** ** If a NEAR intersection is requested, then the nPhrase argument should ** be passed the number of tokens in the two operands to the NEAR operator ** combined. For example: ** ** Query syntax nPhrase ** ------------------------------------ ** "A B C" NEAR "D E" 5 ** A NEAR B 2 ** ** iType controls the type of data written to pOut. If iType is ** DL_POSITIONS, the positions are those from pRight. */ static void docListPhraseMerge( const char *pLeft, int nLeft, const char *pRight, int nRight, int nNear, /* 0 for a phrase merge, non-zero for a NEAR merge */ int nPhrase, /* Number of tokens in left+right operands to NEAR */ DocListType iType, /* Type of doclist to write to pOut */ DataBuffer *pOut /* Write the combined doclist here */ ){ DLReader left, right; DLWriter writer; if( nLeft==0 || nRight==0 ) return; assert( iType!=DL_POSITIONS_OFFSETS ); dlrInit(&left, DL_POSITIONS, pLeft, nLeft); dlrInit(&right, DL_POSITIONS, pRight, nRight); dlwInit(&writer, iType, pOut); while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ if( dlrDocid(&left)<dlrDocid(&right) ){ dlrStep(&left); }else if( dlrDocid(&right)<dlrDocid(&left) ){ dlrStep(&right); }else{ if( nNear==0 ){ posListPhraseMerge(&left, &right, 0, 0, &writer); }else{ /* This case occurs when two terms (simple terms or phrases) are * connected by a NEAR operator, span (nNear+1). i.e. * * '"terrible company" NEAR widget' */ DataBuffer one = {0, 0, 0}; DataBuffer two = {0, 0, 0}; DLWriter dlwriter2; DLReader dr1 = {0, 0, 0, 0, 0}; DLReader dr2 = {0, 0, 0, 0, 0}; dlwInit(&dlwriter2, iType, &one); posListPhraseMerge(&right, &left, nNear-3+nPhrase, 1, &dlwriter2); dlwInit(&dlwriter2, iType, &two); posListPhraseMerge(&left, &right, nNear-1, 0, &dlwriter2); if( one.nData) dlrInit(&dr1, iType, one.pData, one.nData); if( two.nData) dlrInit(&dr2, iType, two.pData, two.nData); if( !dlrAtEnd(&dr1) || !dlrAtEnd(&dr2) ){ PLReader pr1 = {0}; PLReader pr2 = {0}; PLWriter plwriter; plwInit(&plwriter, &writer, dlrDocid(dlrAtEnd(&dr1)?&dr2:&dr1)); if( one.nData ) plrInit(&pr1, &dr1); if( two.nData ) plrInit(&pr2, &dr2); while( !plrAtEnd(&pr1) || !plrAtEnd(&pr2) ){ int iCompare = plrCompare(&pr1, &pr2); switch( iCompare ){ case -1: plwCopy(&plwriter, &pr1); plrStep(&pr1); break; case 1: plwCopy(&plwriter, &pr2); plrStep(&pr2); break; case 0: plwCopy(&plwriter, &pr1); plrStep(&pr1); plrStep(&pr2); break; } } plwTerminate(&plwriter); } dataBufferDestroy(&one); dataBufferDestroy(&two); } dlrStep(&left); dlrStep(&right); } } dlrDestroy(&left); dlrDestroy(&right); |
︙ | ︙ | |||
1656 1657 1658 1659 1660 1661 1662 | /* end utility functions */ /* Forward reference */ typedef struct fulltext_vtab fulltext_vtab; /* A single term in a query is represented by an instances of | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 | /* end utility functions */ /* Forward reference */ typedef struct fulltext_vtab fulltext_vtab; /* A single term in a query is represented by an instances of ** the following structure. Each word which may match against ** document content is a term. Operators, like NEAR or OR, are ** not terms. Query terms are organized as a flat list stored ** in the Query.pTerms array. ** ** If the QueryTerm.nPhrase variable is non-zero, then the QueryTerm ** is the first in a contiguous string of terms that are either part ** of the same phrase, or connected by the NEAR operator. ** ** If the QueryTerm.nNear variable is non-zero, then the token is followed ** by a NEAR operator with span set to (nNear-1). For example, the ** following query: ** ** The QueryTerm.iPhrase variable stores the index of the token within ** it's phrase, indexed starting at 1, or 1 if the token is not part ** of any phrase. ** ** For example, the data structure used to represent the following query: ** ** ... MATCH 'sqlite NEAR/5 google NEAR/2 "search engine"' ** ** is: ** ** {nPhrase=4, iPhrase=1, nNear=6, pTerm="sqlite"}, ** {nPhrase=0, iPhrase=1, nNear=3, pTerm="google"}, ** {nPhrase=0, iPhrase=1, nNear=0, pTerm="search"}, ** {nPhrase=0, iPhrase=2, nNear=0, pTerm="engine"}, ** ** compiling the FTS3 syntax to Query structures is done by the parseQuery() ** function. */ typedef struct QueryTerm { short int nPhrase; /* How many following terms are part of the same phrase */ short int iPhrase; /* This is the i-th term of a phrase. */ short int iColumn; /* Column of the index that must match this term */ signed char nNear; /* term followed by a NEAR operator with span=(nNear-1) */ signed char isOr; /* this term is preceded by "OR" */ signed char isNot; /* this term is preceded by "-" */ signed char isPrefix; /* this term is followed by "*" */ char *pTerm; /* text of the term. '\000' terminated. malloced */ int nTerm; /* Number of bytes in pTerm[] */ } QueryTerm; |
︙ | ︙ | |||
1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 | * */ typedef struct Query { fulltext_vtab *pFts; /* The full text index */ int nTerms; /* Number of terms in the query */ QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */ int nextIsOr; /* Set the isOr flag on the next inserted term */ int nextColumn; /* Next word parsed must be in this column */ int dfltColumn; /* The default column */ } Query; /* ** An instance of the following structure keeps track of generated ** matching-word offset information and snippets. */ typedef struct Snippet { int nMatch; /* Total number of matches */ int nAlloc; /* Space allocated for aMatch[] */ struct snippetMatch { /* One entry for each matching term */ char snStatus; /* Status flag for use while constructing snippets */ short int iCol; /* The column that contains the match */ short int iTerm; /* The index in Query.pTerms[] of the matching term */ short int nByte; /* Number of bytes in the term */ int iStart; /* The offset to the first character of the term */ } *aMatch; /* Points to space obtained from malloc */ char *zOffset; /* Text rendering of aMatch[] */ int nOffset; /* strlen(zOffset) */ char *zSnippet; /* Snippet text */ int nSnippet; /* strlen(zSnippet) */ | > > | 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 | * */ typedef struct Query { fulltext_vtab *pFts; /* The full text index */ int nTerms; /* Number of terms in the query */ QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */ int nextIsOr; /* Set the isOr flag on the next inserted term */ int nextIsNear; /* Set the isOr flag on the next inserted term */ int nextColumn; /* Next word parsed must be in this column */ int dfltColumn; /* The default column */ } Query; /* ** An instance of the following structure keeps track of generated ** matching-word offset information and snippets. */ typedef struct Snippet { int nMatch; /* Total number of matches */ int nAlloc; /* Space allocated for aMatch[] */ struct snippetMatch { /* One entry for each matching term */ char snStatus; /* Status flag for use while constructing snippets */ short int iCol; /* The column that contains the match */ short int iTerm; /* The index in Query.pTerms[] of the matching term */ int iToken; /* The index of the matching document token */ short int nByte; /* Number of bytes in the term */ int iStart; /* The offset to the first character of the term */ } *aMatch; /* Points to space obtained from malloc */ char *zOffset; /* Text rendering of aMatch[] */ int nOffset; /* strlen(zOffset) */ char *zSnippet; /* Snippet text */ int nSnippet; /* strlen(zSnippet) */ |
︙ | ︙ | |||
2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 | } /* ** Append a single entry to the p->aMatch[] log. */ static void snippetAppendMatch( Snippet *p, /* Append the entry to this snippet */ int iCol, int iTerm, /* The column and query term */ int iStart, int nByte /* Offset and size of the match */ ){ int i; struct snippetMatch *pMatch; if( p->nMatch+1>=p->nAlloc ){ p->nAlloc = p->nAlloc*2 + 10; p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); if( p->aMatch==0 ){ p->nMatch = 0; p->nAlloc = 0; return; } } i = p->nMatch++; pMatch = &p->aMatch[i]; pMatch->iCol = iCol; pMatch->iTerm = iTerm; pMatch->iStart = iStart; pMatch->nByte = nByte; } /* ** Sizing information for the circular buffer used in snippetOffsetsOfColumn() */ | > > | 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 | } /* ** Append a single entry to the p->aMatch[] log. */ static void snippetAppendMatch( Snippet *p, /* Append the entry to this snippet */ int iCol, int iTerm, /* The column and query term */ int iToken, /* Matching token in document */ int iStart, int nByte /* Offset and size of the match */ ){ int i; struct snippetMatch *pMatch; if( p->nMatch+1>=p->nAlloc ){ p->nAlloc = p->nAlloc*2 + 10; p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); if( p->aMatch==0 ){ p->nMatch = 0; p->nAlloc = 0; return; } } i = p->nMatch++; pMatch = &p->aMatch[i]; pMatch->iCol = iCol; pMatch->iTerm = iTerm; pMatch->iToken = iToken; pMatch->iStart = iStart; pMatch->nByte = nByte; } /* ** Sizing information for the circular buffer used in snippetOffsetsOfColumn() */ |
︙ | ︙ | |||
3038 3039 3040 3041 3042 3043 3044 | assert( aTerm[i].nTerm<=nToken ); if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue; if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue; match |= 1<<i; if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){ for(j=aTerm[i].iPhrase-1; j>=0; j--){ int k = (iRotor-j) & FTS3_ROTOR_MASK; | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 | assert( aTerm[i].nTerm<=nToken ); if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue; if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue; match |= 1<<i; if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){ for(j=aTerm[i].iPhrase-1; j>=0; j--){ int k = (iRotor-j) & FTS3_ROTOR_MASK; snippetAppendMatch(pSnippet, iColumn, i-j, iPos-j, iRotorBegin[k], iRotorLen[k]); } } } prevMatch = match<<1; iRotor++; } pTModule->xClose(pTCursor); } /* ** Remove entries from the pSnippet structure to account for the NEAR ** operator. When this is called, pSnippet contains the list of token ** offsets produced by treating all NEAR operators as AND operators. ** This function removes any entries that should not be present after ** accounting for the NEAR restriction. For example, if the queried ** document is: ** ** "A B C D E A" ** ** and the query is: ** ** A NEAR/0 E ** ** then when this function is called the Snippet contains token offsets ** 0, 4 and 5. This function removes the "0" entry (because the first A ** is not near enough to an E). */ static void trimSnippetOffsetsForNear(Query *pQuery, Snippet *pSnippet){ int ii; int iDir = 1; while(iDir>-2) { assert( iDir==1 || iDir==-1 ); for(ii=0; ii<pSnippet->nMatch; ii++){ int jj; int nNear; struct snippetMatch *pMatch = &pSnippet->aMatch[ii]; QueryTerm *pQueryTerm = &pQuery->pTerms[pMatch->iTerm]; if( (pMatch->iTerm+iDir)<0 || (pMatch->iTerm+iDir)>=pQuery->nTerms ){ continue; } nNear = pQueryTerm->nNear; if( iDir<0 ){ nNear = pQueryTerm[-1].nNear; } if( pMatch->iTerm>=0 && nNear ){ int isOk = 0; int iNextTerm = pMatch->iTerm+iDir; int iPrevTerm = iNextTerm; int iEndToken; int iStartToken; if( iDir<0 ){ int nPhrase = 1; iStartToken = pMatch->iToken; while( (pMatch->iTerm+nPhrase)<pQuery->nTerms && pQuery->pTerms[pMatch->iTerm+nPhrase].iPhrase>1 ){ nPhrase++; } iEndToken = iStartToken + nPhrase - 1; }else{ iEndToken = pMatch->iToken; iStartToken = pMatch->iToken+1-pQueryTerm->iPhrase; } while( pQuery->pTerms[iNextTerm].iPhrase>1 ){ iNextTerm--; } while( (iPrevTerm+1)<pQuery->nTerms && pQuery->pTerms[iPrevTerm+1].iPhrase>1 ){ iPrevTerm++; } for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){ struct snippetMatch *p = &pSnippet->aMatch[jj]; if( p->iCol==pMatch->iCol && (( p->iTerm==iNextTerm && p->iToken>iEndToken && p->iToken<=iEndToken+nNear ) || ( p->iTerm==iPrevTerm && p->iToken<iStartToken && p->iToken>=iStartToken-nNear ))){ isOk = 1; } } if( !isOk ){ for(jj=1-pQueryTerm->iPhrase; jj<=0; jj++){ pMatch[jj].iTerm = -1; } ii = -1; iDir = 1; } } } iDir -= 2; } } /* ** Compute all offsets for the current row of the query. ** If the offsets have already been computed, this routine is a no-op. */ static void snippetAllOffsets(fulltext_cursor *p){ int nColumn; |
︙ | ︙ | |||
3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 | for(i=iFirst; i<=iLast; i++){ const char *zDoc; int nDoc; zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); nDoc = sqlite3_column_bytes(p->pStmt, i+1); snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); } } /* ** Convert the information in the aMatch[] array of the snippet ** into the string zOffset[0..nOffset-1]. */ static void snippetOffsetText(Snippet *p){ int i; int cnt = 0; StringBuffer sb; char zBuf[200]; if( p->zOffset ) return; initStringBuffer(&sb); for(i=0; i<p->nMatch; i++){ struct snippetMatch *pMatch = &p->aMatch[i]; | > > > > > > > > | | | | | > | 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 | for(i=iFirst; i<=iLast; i++){ const char *zDoc; int nDoc; zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); nDoc = sqlite3_column_bytes(p->pStmt, i+1); snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); } trimSnippetOffsetsForNear(&p->q, &p->snippet); } /* ** Convert the information in the aMatch[] array of the snippet ** into the string zOffset[0..nOffset-1]. */ static void snippetOffsetText(Snippet *p){ int i; int cnt = 0; StringBuffer sb; char zBuf[200]; if( p->zOffset ) return; initStringBuffer(&sb); for(i=0; i<p->nMatch; i++){ struct snippetMatch *pMatch = &p->aMatch[i]; if( pMatch->iTerm>=0 ){ /* If snippetMatch.iTerm is less than 0, then the match was ** discarded as part of processing the NEAR operator (see the ** trimSnippetOffsetsForNear() function for details). Ignore ** it in this case */ zBuf[0] = ' '; sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte); append(&sb, zBuf); cnt++; } } p->zOffset = stringBufferData(&sb); p->nOffset = stringBufferLength(&sb); } /* ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set |
︙ | ︙ | |||
3346 3347 3348 3349 3350 3351 3352 | ** is the first term of a phrase query, go ahead and evaluate the phrase ** query and return the doclist for the entire phrase query. ** ** The resulting DL_DOCIDS doclist is stored in pResult, which is ** overwritten. */ static int docListOfTerm( | | | | | | > > > > > > > > > > > > | > | 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 | ** is the first term of a phrase query, go ahead and evaluate the phrase ** query and return the doclist for the entire phrase query. ** ** The resulting DL_DOCIDS doclist is stored in pResult, which is ** overwritten. */ static int docListOfTerm( fulltext_vtab *v, /* The full text index */ int iColumn, /* column to restrict to. No restriction if >=nColumn */ QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */ DataBuffer *pResult /* Write the result here */ ){ DataBuffer left, right, new; int i, rc; /* No phrase search if no position info. */ assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS ); /* This code should never be called with buffered updates. */ assert( v->nPendingData<0 ); dataBufferInit(&left, 0); rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix, (0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS), &left); if( rc ) return rc; for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){ /* If this token is connected to the next by a NEAR operator, and ** the next token is the start of a phrase, then set nPhraseRight ** to the number of tokens in the phrase. Otherwise leave it at 1. */ int nPhraseRight = 1; while( (i+nPhraseRight)<=pQTerm->nPhrase && pQTerm[i+nPhraseRight].nNear==0 ){ nPhraseRight++; } dataBufferInit(&right, 0); rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pQTerm[i].isPrefix, DL_POSITIONS, &right); if( rc ){ dataBufferDestroy(&left); return rc; } dataBufferInit(&new, 0); docListPhraseMerge(left.pData, left.nData, right.pData, right.nData, pQTerm[i-1].nNear, pQTerm[i-1].iPhrase + nPhraseRight, ((i<pQTerm->nPhrase) ? DL_POSITIONS : DL_DOCIDS), &new); dataBufferDestroy(&left); dataBufferDestroy(&right); left = new; } *pResult = left; return SQLITE_OK; } |
︙ | ︙ | |||
3466 3467 3468 3469 3470 3471 3472 | if( rc!=SQLITE_OK ) break; if( !inPhrase && pSegment[iEnd]==':' && (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){ pQuery->nextColumn = iCol; continue; } | | | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 | if( rc!=SQLITE_OK ) break; if( !inPhrase && pSegment[iEnd]==':' && (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){ pQuery->nextColumn = iCol; continue; } if( !inPhrase && pQuery->nTerms>0 && nToken==2 && pSegment[iBegin+0]=='O' && pSegment[iBegin+1]=='R' ){ pQuery->nextIsOr = 1; continue; } if( !inPhrase && pQuery->nTerms>0 && !pQuery->nextIsOr && nToken==4 && pSegment[iBegin+0]=='N' && pSegment[iBegin+1]=='E' && pSegment[iBegin+2]=='A' && pSegment[iBegin+3]=='R' ){ QueryTerm *pTerm = &pQuery->pTerms[pQuery->nTerms-1]; if( (iBegin+6)<nSegment && pSegment[iBegin+4] == '/' && pSegment[iBegin+5]>='0' && pSegment[iBegin+5]<='9' ){ pTerm->nNear = (pSegment[iBegin+5] - '0'); nToken += 2; if( pSegment[iBegin+6]>='0' && pSegment[iBegin+6]<=9 ){ pTerm->nNear = pTerm->nNear * 10 + (pSegment[iBegin+6] - '0'); iEnd++; } pModule->xNext(pCursor, &pToken, &nToken, &iBegin, &iEnd, &iPos); } else { pTerm->nNear = SQLITE_FTS3_DEFAULT_NEAR_PARAM; } pTerm->nNear++; continue; } queryAdd(pQuery, pToken, nToken); if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ pQuery->pTerms[pQuery->nTerms-1].isNot = 1; } if( iEnd<nSegment && pSegment[iEnd]=='*' ){ pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1; } |
︙ | ︙ | |||
3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 | fulltext_vtab *v, /* The fulltext index */ const char *zInput, /* Input text of the query string */ int nInput, /* Size of the input text */ int dfltColumn, /* Default column of the index to match against */ Query *pQuery /* Write the parse results here. */ ){ int iInput, inPhrase = 0; if( zInput==0 ) nInput = 0; if( nInput<0 ) nInput = strlen(zInput); pQuery->nTerms = 0; pQuery->pTerms = NULL; pQuery->nextIsOr = 0; pQuery->nextColumn = dfltColumn; | > > | 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 | fulltext_vtab *v, /* The fulltext index */ const char *zInput, /* Input text of the query string */ int nInput, /* Size of the input text */ int dfltColumn, /* Default column of the index to match against */ Query *pQuery /* Write the parse results here. */ ){ int iInput, inPhrase = 0; int ii; QueryTerm *aTerm; if( zInput==0 ) nInput = 0; if( nInput<0 ) nInput = strlen(zInput); pQuery->nTerms = 0; pQuery->pTerms = NULL; pQuery->nextIsOr = 0; pQuery->nextColumn = dfltColumn; |
︙ | ︙ | |||
3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 | } if( inPhrase ){ /* unmatched quote */ queryClear(pQuery); return SQLITE_ERROR; } return SQLITE_OK; } /* TODO(shess) Refactor the code to remove this forward decl. */ static int flushPendingTerms(fulltext_vtab *v); /* Perform a full-text query using the search expression in | > > > > > > > > > > > > > > > | 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 | } if( inPhrase ){ /* unmatched quote */ queryClear(pQuery); return SQLITE_ERROR; } /* Modify the values of the QueryTerm.nPhrase variables to account for ** the NEAR operator. For the purposes of QueryTerm.nPhrase, phrases ** and tokens connected by the NEAR operator are handled as a single ** phrase. See comments above the QueryTerm structure for details. */ aTerm = pQuery->pTerms; for(ii=0; ii<pQuery->nTerms; ii++){ if( aTerm[ii].nNear || aTerm[ii].nPhrase ){ while (aTerm[ii+aTerm[ii].nPhrase].nNear) { aTerm[ii].nPhrase += (1 + aTerm[ii+aTerm[ii].nPhrase+1].nPhrase); } } } return SQLITE_OK; } /* TODO(shess) Refactor the code to remove this forward decl. */ static int flushPendingTerms(fulltext_vtab *v); /* Perform a full-text query using the search expression in |
︙ | ︙ |
Added test/fts3near.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | # 2007 October 15 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #************************************************************************* # # $Id: fts3near.test,v 1.1 2007/10/22 18:02:20 danielk1977 Exp $ # set testdir [file dirname $argv0] source $testdir/tester.tcl # If SQLITE_ENABLE_FTS3 is defined, omit this file. ifcapable !fts3 { finish_test return } db eval { CREATE VIRTUAL TABLE t1 USING fts3(content); INSERT INTO t1(content) VALUES('one three four five'); INSERT INTO t1(content) VALUES('two three four five'); INSERT INTO t1(content) VALUES('one two three four five'); } do_test fts3near-1.1 { execsql {SELECT docid FROM t1 WHERE content MATCH 'one NEAR/0 three'} } {1} do_test fts3near-1.2 { execsql {SELECT docid FROM t1 WHERE content MATCH 'one NEAR/1 two'} } {3} do_test fts3near-1.3 { execsql {SELECT docid FROM t1 WHERE content MATCH 'one NEAR/1 three'} } {1 3} do_test fts3near-1.4 { execsql {SELECT docid FROM t1 WHERE content MATCH 'three NEAR/1 one'} } {1 3} do_test fts3near-1.5 { execsql {SELECT docid FROM t1 WHERE content MATCH '"one two" NEAR/1 five'} } {} do_test fts3near-1.6 { execsql {SELECT docid FROM t1 WHERE content MATCH '"one two" NEAR/2 five'} } {3} do_test fts3near-1.7 { execsql {SELECT docid FROM t1 WHERE content MATCH 'one NEAR four'} } {1 3} do_test fts3near-1.8 { execsql {SELECT docid FROM t1 WHERE content MATCH 'four NEAR three'} } {1 2 3} do_test fts3near-1.9 { execsql {SELECT docid FROM t1 WHERE content MATCH '"four five" NEAR/0 three'} } {1 2 3} do_test fts3near-1.10 { execsql {SELECT docid FROM t1 WHERE content MATCH '"four five" NEAR/2 one'} } {1 3} do_test fts3near-1.11 { execsql {SELECT docid FROM t1 WHERE content MATCH '"four five" NEAR/1 one'} } {1} do_test fts3near-1.12 { execsql {SELECT docid FROM t1 WHERE content MATCH 'five NEAR/1 "two three"'} } {2 3} do_test fts3near-1.13 { execsql {SELECT docid FROM t1 WHERE content MATCH 'one NEAR five'} } {1 3} # Output format of the offsets() function: # # <column number> <term number> <starting offset> <number of bytes> # db eval { INSERT INTO t1(content) VALUES('A X B C D A B'); } do_test fts3near-2.1 { execsql { SELECT offsets(t1) FROM t1 WHERE content MATCH 'A NEAR/0 B' } } {{0 0 10 1 0 1 12 1}} do_test fts3near-2.2 { execsql { SELECT offsets(t1) FROM t1 WHERE content MATCH 'B NEAR/0 A' } } {{0 1 10 1 0 0 12 1}} do_test fts3near-2.3 { execsql { SELECT offsets(t1) FROM t1 WHERE content MATCH '"C D" NEAR/0 A' } } {{0 0 6 1 0 1 8 1 0 2 10 1}} do_test fts3near-2.4 { execsql { SELECT offsets(t1) FROM t1 WHERE content MATCH 'A NEAR/0 "C D"' } } {{0 1 6 1 0 2 8 1 0 0 10 1}} do_test fts3near-2.5 { execsql { SELECT offsets(t1) FROM t1 WHERE content MATCH 'A NEAR A' } } {{0 0 0 1 0 1 0 1 0 0 10 1 0 1 10 1}} do_test fts3near-2.6 { execsql { INSERT INTO t1 VALUES('A A A'); SELECT offsets(t1) FROM t1 WHERE content MATCH 'A NEAR/2 A'; } } [list [list 0 0 0 1 0 1 0 1 0 0 2 1 0 1 2 1 0 0 4 1 0 1 4 1]] do_test fts3near-2.7 { execsql { DELETE FROM t1; INSERT INTO t1 VALUES('A A A A'); SELECT offsets(t1) FROM t1 WHERE content MATCH 'A NEAR A NEAR A'; } } [list [list \ 0 0 0 1 0 1 0 1 0 2 0 1 0 0 2 1 \ 0 1 2 1 0 2 2 1 0 0 4 1 0 1 4 1 \ 0 2 4 1 0 0 6 1 0 1 6 1 0 2 6 1 \ ]] db eval { DELETE FROM t1; INSERT INTO t1(content) VALUES( 'one two three two four six three six nine four eight twelve' ); } do_test fts3near-3.1 { execsql {SELECT offsets(t1) FROM t1 WHERE content MATCH 'three NEAR/1 one'} } {{0 1 0 3 0 0 8 5}} do_test fts3near-3.2 { execsql {SELECT offsets(t1) FROM t1 WHERE content MATCH 'one NEAR/1 three'} } {{0 0 0 3 0 1 8 5}} do_test fts3near-3.3 { execsql {SELECT offsets(t1) FROM t1 WHERE content MATCH 'three NEAR/1 two'} } {{0 1 4 3 0 0 8 5 0 1 14 3}} do_test fts3near-3.4 { execsql {SELECT offsets(t1) FROM t1 WHERE content MATCH 'three NEAR/2 two'} } {{0 1 4 3 0 0 8 5 0 1 14 3 0 0 27 5}} do_test fts3near-3.5 { execsql {SELECT offsets(t1) FROM t1 WHERE content MATCH 'two NEAR/2 three'} } {{0 0 4 3 0 1 8 5 0 0 14 3 0 1 27 5}} do_test fts3near-3.6 { execsql { SELECT offsets(t1) FROM t1 WHERE content MATCH 'three NEAR/0 "two four"' } } {{0 0 8 5 0 1 14 3 0 2 18 4}} do_test fts3near-3.7 { execsql { SELECT offsets(t1) FROM t1 WHERE content MATCH '"two four" NEAR/0 three'} } {{0 2 8 5 0 0 14 3 0 1 18 4}} db eval { INSERT INTO t1(content) VALUES(' This specification defines Cascading Style Sheets, level 2 (CSS2). CSS2 is a style sheet language that allows authors and users to attach style (e.g., fonts, spacing, and aural cues) to structured documents (e.g., HTML documents and XML applications). By separating the presentation style of documents from the content of documents, CSS2 simplifies Web authoring and site maintenance. CSS2 builds on CSS1 (see [CSS1]) and, with very few exceptions, all valid CSS1 style sheets are valid CSS2 style sheets. CSS2 supports media-specific style sheets so that authors may tailor the presentation of their documents to visual browsers, aural devices, printers, braille devices, handheld devices, etc. This specification also supports content positioning, downloadable fonts, table layout, features for internationalization, automatic counters and numbering, and some properties related to user interface. ') } do_test fts3near-4.1 { execsql { SELECT snippet(t1) FROM t1 WHERE content MATCH 'specification NEAR supports' } } {{<b>...</b> devices, handheld devices, etc. This <b>specification</b> also <b>supports</b> content positioning, downloadable fonts, <b>...</b>}} finish_test |