/ Artifact Content
Login

Artifact 6aeb5e8061ffc0ff9a5299f27beaee3b2b4b8b336d4f107262bca338bea8f8e9:


# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the built-in fts5 tokenizers. 
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5tokenizer

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}


do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  DROP TABLE ft1;
}
do_execsql_test 1.1 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
  DROP TABLE ft1;
}
do_execsql_test 1.2 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
  DROP TABLE ft1;
}
do_execsql_test 1.3 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
  DROP TABLE ft1;
}
do_execsql_test 1.4 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
  DROP TABLE ft1;
}

do_catchsql_test 1.5 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
} {1 {no such tokenizer: nosuch}}

do_catchsql_test 1.6 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
} {1 {error in tokenizer constructor}}

do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}
do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
do_execsql_test 2.3 { 
  SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding' 
} 1

proc tcl_create {args} { 
  set ::targs $args
  error "failed" 
}
sqlite3_fts5_create_tokenizer db tcl tcl_create

foreach {tn directive expected} {
  1 {tokenize='tcl a b c'}             {a b c}
  2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f}
  3 {tokenize="tcl 'g' 'h' 'i'"}       {g h i}
  4 {tokenize = tcl}                   {}
} {
  do_catchsql_test 3.$tn.1 "
    CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
  " {1 {error in tokenizer constructor}}
  do_test 3.$tn.2 { set ::targs } $expected
}

do_catchsql_test 4.1 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
} {1 {parse error in "tokenize = tcl abc"}}
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {unrecognized column option: y}}

#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
foreach {tn tokenizer} {1 ascii 2 unicode61} {
  reset_db
  set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }
  foreach {tn2 token res} {
    1 abc 1     2 def 1     3 ghi 1    4 jkl {}
    5 mno {}    6 pqr {}    7 stu {}   8 jkl.mno,pqr:stu 1
    9 vw  1
  } {
    do_execsql_test 5.$tn.2.$tn2 "
      SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
    " $res
  }
}

#-------------------------------------------------------------------------
# Miscellaneous tests for the ascii tokenizer.
#
# 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the
#        'separators' option. But unicode61 does not.
#
# 5.2.*: An option without an argument is an error.
#

do_test 5.1.1 {
  execsql "
    CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
    INSERT INTO a1 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' } 
} {}

do_test 5.1.2 {
  execsql "
    CREATE VIRTUAL TABLE a2 USING fts5(
        x, tokenize=`unicode61 separators '\u1234'`);
    INSERT INTO a2 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' } 
} {1}

do_catchsql_test 5.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 5.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE 
# correctly.
#

proc test_token_cb {varname token iStart iEnd} {
  upvar $varname var
  lappend var $token
  if {[llength $var]==3} { return "SQLITE_DONE" }
  return "SQLITE_OK"
}

proc tokenize {cmd} {
  set res [list]
  $cmd xTokenize [$cmd xColumnText 0] [list test_token_cb res]
  set res
}
sqlite3_fts5_create_function db tokenize tokenize

do_execsql_test 6.0 {
  CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
  INSERT INTO x1 VALUES('q w e r t y');
  INSERT INTO x1 VALUES('y t r e w q');
  SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
} {
  {q w e} {y t r}
}

do_execsql_test 6.1 {
  CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
  INSERT INTO x2 VALUES('q w e r t y');
  INSERT INTO x2 VALUES('y t r e w q');
  SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
} {
  {q w e} {y t r}
}


#-------------------------------------------------------------------------
# Miscellaneous tests for the unicode tokenizer.
#
do_catchsql_test 6.1 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 2'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Porter tokenizer with very large tokens.
#
set a [string repeat a 100]
set b [string repeat b 500]
set c [string repeat c 1000]
do_execsql_test 7.0 {
  CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
  INSERT INTO e5 VALUES($a || ' ' || $b);
  INSERT INTO e5 VALUES($b || ' ' || $c);
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }

#-------------------------------------------------------------------------
# Test the 'separators' option with the unicode61 tokenizer.
#
do_execsql_test 8.1 {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  );
  INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
} {
  brown dog fox jumped lazy over quick the
}

do_execsql_test 8.2 [subst {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
  );
  INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01' 
                     || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
  );
  INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
}] [subst {
  brown dog fox jumped lazy over quick the \u0E08 \u0E09
}]

# Test that the porter tokenizer correctly passes arguments through to
# its parent tokenizer.
do_execsql_test 8.3 {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  );
  INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
} {
  brown dog fox jump lazi over quick the
}

#-------------------------------------------------------------------------
# Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
# implementation.
#
reset_db
proc tcl_create {args} { return "tcl_tokenize" }
sqlite3_fts5_create_tokenizer db tcl tcl_create
set ::flags [list]
proc tcl_tokenize {tflags text} {
  lappend ::flags $tflags
  foreach {w iStart iEnd} [fts5_tokenize_split $text] {
    sqlite3_fts5_token $w $iStart $iEnd
  }
}

do_execsql_test 9.1.1 {
  CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
  INSERT INTO t1 VALUES('abc');
  INSERT INTO t1 VALUES('xyz');
} {}
do_test 9.1.2 { set ::flags } {document document}

set ::flags [list]
do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc}
do_test 9.2.2 { set ::flags } {query}

set ::flags [list]
do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc}
do_test 9.3.2 { set ::flags } {prefixquery}

set ::flags [list]
do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {}
do_test 9.4.2 { set ::flags } {prefixquery}

set ::flags [list]
do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {}
do_test 9.5.2 { set ::flags } {query}


finish_test