/ Check-in [0e91a6a5]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improve test coverage of fts5_tokenize.c.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: 0e91a6a520f040b8902da6a1a4d9107dc66c0ea3
User & Date: dan 2015-05-20 09:27:51
Context
2015-05-22
06:08
Improve test coverage of fts5_unicode2.c. check-in: fea8a4db user: dan tags: fts5
2015-05-20
09:27
Improve test coverage of fts5_tokenize.c. check-in: 0e91a6a5 user: dan tags: fts5
2015-05-19
19:37
Add tests for fts5 tokenizers. check-in: 4f90ba20 user: dan tags: fts5
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts5/fts5_tokenize.c.

662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
....
1163
1164
1165
1166
1167
1168
1169

1170
1171
1172
1173
1174
1175
1176
1177
    }
    return ((mask & 0x0007)==0x0005);
  }
}

/* porter rule condition: (m > 1 and (*S or *T)) */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  return nStem>0
      && (zStem[nStem-1]=='s' || zStem[nStem-1]=='t')
      && fts5Porter_MGt1(zStem, nStem);
}

/* porter rule condition: (*v*) */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i;
  for(i=0; i<nStem; i++){
................................................................................

  /* Steps 2 through 4. */
  fts5PorterStep2(aBuf, &nBuf);
  fts5PorterStep3(aBuf, &nBuf);
  fts5PorterStep4(aBuf, &nBuf);

  /* Step 5a. */

  if( nBuf>0 && aBuf[nBuf-1]=='e' ){
    if( fts5Porter_MGt1(aBuf, nBuf-1) 
     || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
    ){
      nBuf--;
    }
  }








|
|







 







>
|







662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
....
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
    }
    return ((mask & 0x0007)==0x0005);
  }
}

/* porter rule condition: (m > 1 and (*S or *T)) */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  assert( nStem>0 );
  return (zStem[nStem-1]=='s' || zStem[nStem-1]=='t') 
      && fts5Porter_MGt1(zStem, nStem);
}

/* porter rule condition: (*v*) */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i;
  for(i=0; i<nStem; i++){
................................................................................

  /* Steps 2 through 4. */
  fts5PorterStep2(aBuf, &nBuf);
  fts5PorterStep3(aBuf, &nBuf);
  fts5PorterStep4(aBuf, &nBuf);

  /* Step 5a. */
  assert( nBuf>0 );
  if( aBuf[nBuf-1]=='e' ){
    if( fts5Porter_MGt1(aBuf, nBuf-1) 
     || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
    ){
      nBuf--;
    }
  }

Added ext/fts5/test/fts5porter2.test.

































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 porter stemmer implementation.
#
# These are extra tests added to those in fts5porter.test in order to
# improve test coverage of the porter stemmer implementation.
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5porter2

set test_vocab {
  tion          tion
  ation         ation
  vation        vation
  avation       avat
  vion          vion
  ion           ion
  relational    relat
  relation      relat
  relate        relat
  zzz           zzz
  ii            ii
  iiing         ii
  xtional       xtional
  xenci         xenci
  xlogi         xlogi
  realization   realiz
  realize       realiz
  xization      xizat
  capitalism    capit
  talism        talism
  xiveness      xive
  xfulness      xful
  xousness      xous
  xical         xical
  xicate        xicat
  xicity        xiciti
  ies           ie
  eed           e
  eing           e
  s             s
}

set i 0
foreach {in out} $test_vocab {
  do_test "1.$i.($in -> $out)" {
    lindex [sqlite3_fts5_tokenize db porter $in] 0
  } $out
  incr i
}


finish_test

Changes to ext/fts5/test/fts5tokenizer.test.

205
206
207
208
209
210
211


212





























213
214
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }

































finish_test








>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }

#-------------------------------------------------------------------------
# Test the 'separators' option with the unicode61 tokenizer.
#
do_execsql_test 8.1 {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  );
  INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
} {
  brown dog fox jumped lazy over quick the
}

do_execsql_test 8.2 [subst {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
  );
  INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01' 
                     || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
  );
  INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
}] [subst {
  brown dog fox jumped lazy over quick the \u0E08 \u0E09
}]

finish_test

Changes to ext/fts5/test/fts5unicode2.test.

65
66
67
68
69
70
71






72
73
74
75
76
77
78
...
221
222
223
224
225
226
227




228
229
230
231
232
233
234
...
248
249
250
251
252
253
254









255
256
257
258
259
260
261

# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"







#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
................................................................................
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }




} {}

do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
................................................................................
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}











#-------------------------------------------------------------------------

breakpoint
do_unicode_token_test3 5.1 {tokenchars {}} {
  sqlite3_reset sqlite3_column_int







>
>
>
>
>
>







 







>
>
>
>







 







>
>
>
>
>
>
>
>
>







65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
...
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
...
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280

# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"

do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \
    "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"

do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \
    "abc abc def def"

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
................................................................................
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }

  execsql "CREATE VIRTUAL TABLE t8 USING fts5(
      a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"
  )"
} {}

do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
................................................................................
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.4 {
  sqlite3_exec_hex db {
    CREATE VIRTUAL TABLE t9 USING fts5(a, b, 
      tokenize="unicode61 separators '%C09004'"
    );
    INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');
  }
} {0 {}}


#-------------------------------------------------------------------------

breakpoint
do_unicode_token_test3 5.1 {tokenchars {}} {
  sqlite3_reset sqlite3_column_int