SQLite

Check-in [b7b7bde9]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Prevent fts5 tokenizer unicode61 from considering '\0' to be a token character, even if other characters of class "Cc" are. See forum thread 09609d7e22 for details.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: b7b7bde9b7a03665e3691c6d51118965f216d2dfb1617f138b9f9e60e418ed2f
User & Date: dan 2020-10-26 13:24:36
Original Comment: Prevent fts5 tokenizer unicode61 from considering '\0' to be a token characters, even if other characters of class "Cc" are.
References
2020-11-26
20:13
Update mkunicode.tcl to match the change erroneously made to the machine-generated file fts5_unicode2.c in [b7b7bde9]. (check-in: 326d579d user: dan tags: trunk)
Context
2020-10-26
16:22
Ensure that the table argument passed to Tcl_GetIndexFromObjStruct() in the sessions module test code is declared "static". (check-in: 80eba105 user: dan tags: trunk)
13:24
Prevent fts5 tokenizer unicode61 from considering '\0' to be a token character, even if other characters of class "Cc" are. See forum thread 09609d7e22 for details. (check-in: b7b7bde9 user: dan tags: trunk)
2020-10-22
18:50
Minor tweaks to query planning weights so that when STAT4 is enabled and functioning, a full table scan is more likely to be selected if that seems like the fastest solution. Only do this when STAT4 info is available because an error has a large potential downside. (check-in: 0e7e113d user: drh tags: trunk)
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts5/fts5_unicode2.c.

769
770
771
772
773
774
775

776
    int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
    int n = (aFts5UnicodeData[iTbl] >> 5) + i;
    for(; i<128 && i<n; i++){
      aAscii[i] = (u8)bToken;
    }
    iTbl++;
  }

}







>

769
770
771
772
773
774
775
776
777
    int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
    int n = (aFts5UnicodeData[iTbl] >> 5) + i;
    for(; i<128 && i<n; i++){
      aAscii[i] = (u8)bToken;
    }
    iTbl++;
  }
  aAscii[0] = 0;                  /* 0x00 is never a token character */
}

Changes to ext/fts5/test/fts5tok1.test.

106
107
108
109
110
111
112



































113
114
115
  CREATE VIRTUAL TABLE tX USING fts5tokenize(nosuchtokenizer);
} {1 {vtable constructor failed: tX}}

do_catchsql_test 2.1 {
  CREATE VIRTUAL TABLE t4 USING fts5tokenize;
  SELECT * FROM t4;
} {1 {SQL logic error}}





































finish_test







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
  CREATE VIRTUAL TABLE tX USING fts5tokenize(nosuchtokenizer);
} {1 {vtable constructor failed: tX}}

# Create an fts5tokenize virtual table with no module arguments and then
# query it. The expected outcome is error code 1 with the message
# "SQL logic error".
do_catchsql_test 2.1 {
  CREATE VIRTUAL TABLE t4 USING fts5tokenize;
  SELECT * FROM t4;
} {1 {SQL logic error}}

#-------------------------------------------------------------------------
# Embedded 0x00 characters.
#
# Each group of tests below inserts the value 'abc' || char(0) || 'def'
# into an fts5 table and then verifies that:
#
#   3.x.0 - the fts5vocab 'instance' table reports "abc" and "def" as two
#           separate terms, i.e. the 0x00 byte was not treated as a token
#           character;
#   3.x.1 - the stored column value still contains the 0x00 byte;
#   3.x.2 - the FTS index passes the 'integrity-check' command.
#
# 3.1.* creates the table without a tokenize= option. 3.2.* uses unicode61
# with an explicit category list that includes "Cc", to check that 0x00 is
# still excluded from tokens even when other "Cc" characters are included.
#
reset_db
do_execsql_test 3.1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(z);
  CREATE VIRTUAL TABLE tt USING fts5vocab(t1, 'instance');
  INSERT INTO t1 VALUES('abc' || char(0) || 'def');
  SELECT * FROM tt;
} { abc 1 z 0 def 1 z 1 }
do_execsql_test 3.1.1 {
  SELECT hex(z) FROM t1;
} {61626300646566}
do_execsql_test 3.1.2 {
  INSERT INTO t1(t1) VALUES('integrity-check');
} {}

do_execsql_test 3.2.0 {
  CREATE VIRTUAL TABLE t2 USING fts5(z,
      tokenize="unicode61 categories 'L* N* Co Cc'"
  );
  CREATE VIRTUAL TABLE tu USING fts5vocab(t2, 'instance');

  INSERT INTO t2 VALUES('abc' || char(0) || 'def');
  SELECT * FROM tu;
} { abc 1 z 0 def 1 z 1 }

# The next two tests must target t2 (the table using the "Cc" category
# list), not t1, otherwise they merely repeat 3.1.1 and 3.1.2.
do_execsql_test 3.2.1 {
  SELECT hex(z) FROM t2;
} {61626300646566}

do_execsql_test 3.2.2 {
  INSERT INTO t2(t2) VALUES('integrity-check');
} {}


finish_test