Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add support for NEAR queries to fts5.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ed403fecf2dec234e66a14a88c9e31ca60234864
User & Date: dan 2012-12-28 20:01:01.340
Context
2012-12-29
09:56
Add support for the AND, OR and NOT operators to fts5. check-in: c2efd983b0 user: dan tags: trunk
2012-12-28
20:01
Add support for NEAR queries to fts5. check-in: ed403fecf2 user: dan tags: trunk
18:57
Add support for phrase queries to fts5. check-in: 0780ef9305 user: dan tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/fts5.c.
206
207
208
209
210
211
212




213
214
215
216
217
218
219
      bRet = 0;
    }
  }

  p->iList = i;
  return bRet;
}





static void fts5InstanceListAppend(
  InstanceList *p,                /* Instance list to append to */
  int iCol,                       /* Column of new entry */
  int iWeight,                    /* Weight of new entry */
  int iOff                        /* Offset of new entry */
){







>
>
>
>







206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
      bRet = 0;
    }
  }

  p->iList = i;
  return bRet;
}

static int fts5InstanceListEof(InstanceList *p){
  return (p->iList>=p->nList);
}

static void fts5InstanceListAppend(
  InstanceList *p,                /* Instance list to append to */
  int iCol,                       /* Column of new entry */
  int iWeight,                    /* Weight of new entry */
  int iOff                        /* Offset of new entry */
){
1580
1581
1582
1583
1584
1585
1586
1587



















































































































1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615


1616
1617

1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649

1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663

1664



1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
      *pbEof = 1;
      return 0;
    }
  }

  return (p->iCol==pFirst->iCol && p->iOff==iReq);
}




















































































































/*
** This function tests if the cursors embedded in the Fts5Str object
** currently point to a match for the entire string. If so, *pbMatch
** is set to true before returning.
**
** If the cursors do not point to a match, then *piAdvance is set to
** the index of the individual cursor that should be advanced before
** retrying this function. Or, if one or more of the cursors are at EOF
** (implying that there will be no further matches), *piAdvance is set
** to a negative value.
*/
static int fts5StringIsMatch(
  Fts5Cursor *pCsr,               /* Cursor that owns this string */
  Fts5Str *pStr,                  /* String expression to test */
  int *pbMatch,                   /* OUT: True for a match, false otherwise */
  int *piAdvance                  /* OUT: Token to advance before retrying */
){
  const u8 *aPk1 = 0;
  int nPk1 = 0;
  int rc = SQLITE4_OK;
  int i;

  *pbMatch = 0;
  *piAdvance = 0;
  pStr->nList = 0;

  rc = fts5TokenPk(&pStr->aToken[0], &aPk1, &nPk1);
  for(i=1; rc==SQLITE4_OK && i<pStr->nToken; i++){


    const u8 *aPk = 0;
    int nPk = 0;

    rc = fts5TokenPk(&pStr->aToken[i], &aPk, &nPk);
    if( rc==SQLITE4_OK ){
      int res = fts5KeyCompare(aPk1, nPk1, aPk, nPk);
      if( res<0 ){
        return SQLITE4_OK;
      }
      if( res>0 ){
        *piAdvance = i;
        return SQLITE4_OK;
      }
    }
  }

  if( pStr->nToken>1 ){
    /* At this point, it is established that all of the token cursors in the
    ** string point to an entry with the same primary key. Now synthesize a
    ** position list for the entire string. */
    int bEof = 0;
    int nByte = sizeof(InstanceList) * pStr->nToken;
    InstanceList *aIn;
    InstanceList out;

    memset(&out, 0, sizeof(InstanceList));

    aIn = (InstanceList *)sqlite4DbMallocZero(pCsr->db, nByte);
    if( !aIn ) rc = SQLITE4_NOMEM;
    for(i=0; rc==SQLITE4_OK && i<pStr->nToken; i++){
      const u8 *aData;
      int nData;
      rc = sqlite4KVCursorData(pStr->aToken[i].pCsr, 0, -1, &aData, &nData);
      if( rc==SQLITE4_OK ){
        fts5InstanceListInit((u8 *)aData, nData, &aIn[i]);

        fts5InstanceListNext(&aIn[i]);
      }
    }

    /* Allocate the output list */
    if( rc==SQLITE4_OK ){
      int nReq = aIn[0].nList;
      if( nReq<=pStr->nListAlloc ){
        out.aList = pStr->aList;
        out.nList = pStr->nListAlloc;
      }else{
        pStr->aList = out.aList = sqlite4DbMallocZero(pCsr->db, nReq*2);
        pStr->nListAlloc = out.nList = nReq*2;
        if( out.aList==0 ) rc = SQLITE4_NOMEM;

      }



    }

    while( rc==SQLITE4_OK && bEof==0 ){
      for(i=1; i<pStr->nToken; i++){
        int bMatch = fts5TokenAdvanceToMatch(&aIn[i], &aIn[0], i, &bEof);
        if( bMatch==0 || bEof ) break;
      }
      if( i==pStr->nToken ){
        /* Record a match here */
        fts5InstanceListAppend(&out, aIn[0].iCol, aIn[0].iWeight, aIn[0].iOff);
        *pbMatch = 1;
      }
      bEof = fts5InstanceListNext(&aIn[0]);
    }

    pStr->nList = out.iList;
    sqlite4DbFree(pCsr->db, aIn);
  }else{
    *pbMatch = 1;
  }

  return rc;
}

static int fts5StringAdvanceToMatch(Fts5Cursor *pCsr, Fts5Str *pStr){
  int rc;
  do {
    int bMatch;
    int iAdvance;
    rc = fts5StringIsMatch(pCsr, pStr, &bMatch, &iAdvance);
    if( rc!=SQLITE4_OK || bMatch ) break;
    rc = sqlite4KVCursorNext(pStr->aToken[iAdvance].pCsr);
  }while( rc==SQLITE4_OK );
  return rc;
}

static int fts5StringAdvance(Fts5Cursor *pCsr, Fts5Str *pStr){
  int rc = sqlite4KVCursorNext(pStr->aToken[0].pCsr);
  if( rc==SQLITE4_OK ) rc = fts5StringAdvanceToMatch(pCsr, pStr);
  return rc;
}

static int fts5PhraseAdvanceToMatch(Fts5Cursor *pCsr, Fts5Phrase *pPhrase){
  return fts5StringAdvanceToMatch(pCsr, &pPhrase->aStr[0]);
}

static int fts5PhraseAdvance(Fts5Cursor *pCsr, Fts5Phrase *pPhrase){
  return fts5StringAdvance(pCsr, &pPhrase->aStr[0]);
}

int sqlite4Fts5Next(Fts5Cursor *pCsr){
  Fts5Phrase *pPhrase;
  int rc;

  assert( pCsr->pExpr->pRoot->eType==TOKEN_PRIMITIVE );








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
|


|
|
|
<
<

|

|

|







|
<

|
|
>
>
|
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
<
<
<
|
<
|
<
<
|
<
<
<
<
<
>
|
|
|
|
<
|
|
<
<
<
<
<
<
<
>
|
>
>
>
|
<
<
<
<
<
|
<
<
<
|
<
<
<
<
<
<
<
<
<
<



|



|
|

|




|
|
|

<
<
<
<
<
<
<
<







1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714


1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728

1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752





1753

1754


1755





1756
1757
1758
1759
1760

1761
1762







1763
1764
1765
1766
1767
1768





1769



1770










1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789








1790
1791
1792
1793
1794
1795
1796
      *pbEof = 1;
      return 0;
    }
  }

  return (p->iCol==pFirst->iCol && p->iOff==iReq);
}

static int fts5StringFindInstances(Fts5Cursor *pCsr, Fts5Str *pStr){
  int i;
  int rc = SQLITE4_OK;
  int bEof = 0;
  int nByte = sizeof(InstanceList) * pStr->nToken;
  InstanceList *aIn;
  InstanceList out;

  pStr->nList = 0;
  memset(&out, 0, sizeof(InstanceList));

  aIn = (InstanceList *)sqlite4DbMallocZero(pCsr->db, nByte);
  if( !aIn ) rc = SQLITE4_NOMEM;
  for(i=0; rc==SQLITE4_OK && i<pStr->nToken; i++){
    const u8 *aData;
    int nData;
    rc = sqlite4KVCursorData(pStr->aToken[i].pCsr, 0, -1, &aData, &nData);
    if( rc==SQLITE4_OK ){
      fts5InstanceListInit((u8 *)aData, nData, &aIn[i]);
      fts5InstanceListNext(&aIn[i]);
    }
  }

  /* Allocate the output list */
  if( rc==SQLITE4_OK ){
    int nReq = aIn[0].nList;
    if( nReq<=pStr->nListAlloc ){
      out.aList = pStr->aList;
      out.nList = pStr->nListAlloc;
    }else{
      pStr->aList = out.aList = sqlite4DbMallocZero(pCsr->db, nReq*2);
      pStr->nListAlloc = out.nList = nReq*2;
      if( out.aList==0 ) rc = SQLITE4_NOMEM;
    }
  }

  while( rc==SQLITE4_OK && bEof==0 ){
    for(i=1; i<pStr->nToken; i++){
      int bMatch = fts5TokenAdvanceToMatch(&aIn[i], &aIn[0], i, &bEof);
      if( bMatch==0 || bEof ) break;
    }
    if( i==pStr->nToken ){
      /* Record a match here */
      fts5InstanceListAppend(&out, aIn[0].iCol, aIn[0].iWeight, aIn[0].iOff);
    }
    bEof = fts5InstanceListNext(&aIn[0]);
  }

  pStr->nList = out.iList;
  sqlite4DbFree(pCsr->db, aIn);

  return rc;
}

static int fts5IsNear(InstanceList *p1, InstanceList *p2, int nNear){
  if( p1->iCol==p2->iCol && p1->iOff<p2->iOff && (p1->iOff+nNear)>=p2->iOff ){
    return 1;
  }
  return 0;
}

static int fts5StringNearTrim(
  Fts5Cursor *pCsr,               /* Cursor object that owns both strings */
  Fts5Str *pTrim,                 /* Trim this instance list */
  Fts5Str *pNext,                 /* According to this one */
  int nNear
){
  if( pNext->nList==0 ){
    pTrim->nList = 0;
  }else{
    int bEof = 0;
    int nTrail = nNear + (pNext->nToken-1) + 1;
    int nLead = nNear + (pTrim->nToken-1) + 1;

    InstanceList trail;
    InstanceList lead;
    InstanceList in;
    InstanceList out;

    fts5InstanceListInit(pNext->aList, pNext->nList, &trail);
    fts5InstanceListInit(pNext->aList, pNext->nList, &lead);
    fts5InstanceListInit(pTrim->aList, pTrim->nList, &in);
    fts5InstanceListInit(pTrim->aList, pTrim->nList, &out);
    fts5InstanceListNext(&trail);
    fts5InstanceListNext(&lead);
    fts5InstanceListNext(&in);

    while( bEof==0 ){
      /* Check if the current position is a match */
      if( fts5IsNear(&trail, &in, nTrail) 
       || fts5IsNear(&in, &lead, nLead)
      ){
        fts5InstanceListAppend(&out, in.iCol, in.iWeight, in.iOff);
        bEof = fts5InstanceListNext(&in);
      }else{
        if( fts5InstanceListEof(&trail)==0
         && (trail.iCol<in.iCol || trail.iOff>=in.iOff) 
        ){
          fts5InstanceListNext(&trail);
        }
        else if( (lead.iList<lead.nList)
         && (lead.iCol<in.iCol || lead.iOff<=in.iOff)
        ){
          fts5InstanceListNext(&trail);
        }else{
          bEof = fts5InstanceListNext(&in);
        }
      }
    }

    pTrim->nList = out.iList;
  }
  return SQLITE4_OK;
}

/*
** This function tests if the cursors embedded in the Fts5Phrase object
** currently point to a match for the entire phrase. If so, *pbMatch
** is set to true before returning.
**
** If the cursors do not point to a match, then *ppAdvance is set to
** the token of the individual cursor that should be advanced before
** retrying this function.


*/
static int fts5PhraseIsMatch(
  Fts5Cursor *pCsr,               /* Cursor that owns this string */
  Fts5Phrase *pPhrase,            /* Phrase to test */
  int *pbMatch,                   /* OUT: True for a match, false otherwise */
  Fts5Token **ppAdvance           /* OUT: Token to advance before retrying */
){
  const u8 *aPk1 = 0;
  int nPk1 = 0;
  int rc = SQLITE4_OK;
  int i;

  *pbMatch = 0;
  *ppAdvance = &pPhrase->aStr[0].aToken[0];


  rc = fts5TokenPk(*ppAdvance, &aPk1, &nPk1);
  for(i=0; rc==SQLITE4_OK && i<pPhrase->nStr; i++){
    int j;
    for(j=(i==0); j<pPhrase->aStr[i].nToken; j++){
      const u8 *aPk = 0;
      int nPk = 0;
      Fts5Token *pToken = &pPhrase->aStr[i].aToken[j];
      rc = fts5TokenPk(pToken, &aPk, &nPk);
      if( rc==SQLITE4_OK ){
        int res = fts5KeyCompare(aPk1, nPk1, aPk, nPk);
        if( res<0 ){
          return SQLITE4_OK;
        }
        if( res>0 ){
          *ppAdvance = pToken;
          return SQLITE4_OK;
        }
      }
    }
  }

  /* At this point, it is established that all of the token cursors in the
  ** phrase point to an entry with the same primary key. Now figure out if





  ** the various string constraints are met. Along the way, synthesize a 

  ** position list for each Fts5Str object.  */


  for(i=0; rc==SQLITE4_OK && i<pPhrase->nStr; i++){





    Fts5Str *pStr = &pPhrase->aStr[i];
    rc = fts5StringFindInstances(pCsr, pStr);
  }

  /* Trim the instance lists according to any NEAR constraints.  */

  for(i=1; rc==SQLITE4_OK && i<pPhrase->nStr; i++){
    int n = pPhrase->aiNear[i-1];







    rc = fts5StringNearTrim(pCsr, &pPhrase->aStr[i], &pPhrase->aStr[i-1], n);
  }
  for(i=pPhrase->nStr-1; rc==SQLITE4_OK && i>0; i--){
    int n = pPhrase->aiNear[i-1];
    rc = fts5StringNearTrim(pCsr, &pPhrase->aStr[i-1], &pPhrase->aStr[i], n);
  }









  *pbMatch = (pPhrase->aStr[0].nList>0);










  return rc;
}

static int fts5PhraseAdvanceToMatch(Fts5Cursor *pCsr, Fts5Phrase *pPhrase){
  int rc;
  do {
    int bMatch;
    Fts5Token *pAdvance = 0;
    rc = fts5PhraseIsMatch(pCsr, pPhrase, &bMatch, &pAdvance);
    if( rc!=SQLITE4_OK || bMatch ) break;
    rc = sqlite4KVCursorNext(pAdvance->pCsr);
  }while( rc==SQLITE4_OK );
  return rc;
}

static int fts5PhraseAdvance(Fts5Cursor *pCsr, Fts5Phrase *pPhrase){
  int rc = sqlite4KVCursorNext(pPhrase->aStr[0].aToken[0].pCsr);
  if( rc==SQLITE4_OK ) rc = fts5PhraseAdvanceToMatch(pCsr, pPhrase);
  return rc;








}

int sqlite4Fts5Next(Fts5Cursor *pCsr){
  Fts5Phrase *pPhrase;
  int rc;

  assert( pCsr->pExpr->pRoot->eType==TOKEN_PRIMITIVE );
Changes to test/fts5query1.test.
50
51
52
53
54
55
56

























57
58
59
  1 {SELECT x FROM t1 WHERE t1 MATCH 't'}   {2 3}
  2 {SELECT x FROM t1 WHERE t1 MATCH 'abc'} {}
  3 {SELECT x FROM t1 WHERE t1 MATCH 't+h'} {3}
  4 {SELECT x FROM t1 WHERE t1 MATCH 't+o'} {}
} {
  do_execsql_test 2.$tn $stmt $res
}


























finish_test








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
  1 {SELECT x FROM t1 WHERE t1 MATCH 't'}   {2 3}
  2 {SELECT x FROM t1 WHERE t1 MATCH 'abc'} {}
  3 {SELECT x FROM t1 WHERE t1 MATCH 't+h'} {3}
  4 {SELECT x FROM t1 WHERE t1 MATCH 't+o'} {}
} {
  do_execsql_test 2.$tn $stmt $res
}

do_execsql_test 3.0 {
  DROP TABLE t1;
  CREATE TABLE t1(x PRIMARY KEY, y);
  CREATE INDEX i1 ON t1 USING fts5();

  INSERT INTO t1 VALUES(1, 'a b c d e f g h i j k l m n o p q r s t u');
  INSERT INTO t1 VALUES(2, 'a e i o u b c d f g h j k l m n p q r s t');
  INSERT INTO t1 VALUES(3, 'b c d f g h j k l m n p q r s t v w x y z');
  INSERT INTO t1 VALUES(4, 'a e i o u');
}

foreach {tn stmt res} {
  1 {SELECT x FROM t1 WHERE t1 MATCH 'a NEAR/5 i'}     {2 4}
  2 {SELECT x FROM t1 WHERE t1 MATCH 'a NEAR/3 b'}     {1}
  3 {SELECT x FROM t1 WHERE t1 MATCH 'a NEAR/2 d'}     {1}
  4 {SELECT x FROM t1 WHERE t1 MATCH 'a NEAR/2 e'}     {2 4}
  5 {SELECT x FROM t1 WHERE t1 MATCH 'a NEAR/3 e'}     {1 2 4}
  6 {SELECT x FROM t1 WHERE t1 MATCH 'b+c NEAR/2 g+h'} {2 3}
  7 {SELECT x FROM t1 WHERE t1 MATCH 'b+c NEAR/3 g+h'} {1 2 3}
  8 {SELECT x FROM t1 WHERE t1 MATCH 'b+c NEAR/2 g+h+j'} {2 3}
  9 {SELECT x FROM t1 WHERE t1 MATCH 'b+c+d NEAR/1 g+h'} {2 3}
} {
  do_execsql_test 3.$tn $stmt $res
}

finish_test