SQLite

Check-in [c94595a6]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix the initial-prefix optimization for the REGEXP extension such that it works even if the prefix contains characters that require a 3-byte UTF8 encoding. This should fix the problem reported by forum post 96692f8ba5.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: c94595a6e15490b432f099fefbe2429fa19287f7bdc86332cba0fd1e08f65bd6
User & Date: drh 2022-07-03 14:25:47
Context
2022-07-03
18:12
Enhance the REGEXP extension so that the end-of-input indicator ("$") is allowed to occur on one branch of an OR ("|"). Forum post 0107d5d40dd273e2, second issue. (check-in: 3c04d21e user: drh tags: trunk)
14:32
Fix the initial-prefix optimization for the REGEXP extension such that it works even if the prefix contains characters that require a 3-byte UTF8 encoding. This should fix the problem reported by forum post 96692f8ba5. (check-in: 7a32cccc user: drh tags: branch-3.39)
14:25
Fix the initial-prefix optimization for the REGEXP extension such that it works even if the prefix contains characters that require a 3-byte UTF8 encoding. This should fix the problem reported by forum post 96692f8ba5. (check-in: c94595a6 user: drh tags: trunk)
11:16
Improved comment on sqlite3VdbeSwap(). No changes to code. (check-in: 6a8e4fb7 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/misc/regexp.c.
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
      unsigned x = pRe->aArg[i];
      if( x<=127 ){
        pRe->zInit[j++] = (unsigned char)x;
      }else if( x<=0xfff ){
        pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else if( x<=0xffff ){
        pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12));
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else{
        break;
      }
    }
    if( j>0 && pRe->zInit[j-1]==0 ) j--;







|







681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
      unsigned x = pRe->aArg[i];
      if( x<=127 ){
        pRe->zInit[j++] = (unsigned char)x;
      }else if( x<=0xfff ){
        pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else if( x<=0xffff ){
        pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12));
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else{
        break;
      }
    }
    if( j>0 && pRe->zInit[j-1]==0 ) j--;
Changes to test/regexp1.test.
235
236
237
238
239
240
241






















242
         'abc$¢€xyz' REGEXP '^abc[\u0024\u00A2\u20AC]{3}xyz$',
         'abc$¢€xyz' REGEXP '^abc[\x24][\xa2\u20ac]+xyz$'
} {1 1 1}
do_execsql_test regexp1-2.22 {
  SELECT 'abc$¢€xyz' REGEXP '^abc[^\u0025-X][^ -\u007f][^\u20ab]xyz$'
} {1}























finish_test







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
         'abc$¢€xyz' REGEXP '^abc[\u0024\u00A2\u20AC]{3}xyz$',
         'abc$¢€xyz' REGEXP '^abc[\x24][\xa2\u20ac]+xyz$'
} {1 1 1}
do_execsql_test regexp1-2.22 {
  SELECT 'abc$¢€xyz' REGEXP '^abc[^\u0025-X][^ -\u007f][^\u20ab]xyz$'
} {1}

# 2022-07-03
# https://sqlite.org/forum/forumpost/96692f8ba5
# The REGEXP extension mishandles the prefix search optimization when
# the prefix contains 3-byte UTF8 characters.
#
reset_db
load_static_extension db regexp
do_execsql_test regexp1-3.1 {
  CREATE TABLE t1(id INTEGER PRIMARY KEY, a TEXT);
  INSERT INTO t1(id, a) VALUES(1, '日本語');
  SELECT a, hex(a), length(a) FROM t1;
} {日本語 E697A5E69CACE8AA9E 3}
do_execsql_test regexp1-3.2 {
  SELECT * FROM t1 WHERE a='日本語';
} {1 日本語}
do_execsql_test regexp1-3.3 {
  SELECT * FROM t1 WHERE a LIKE '日本語';
} {1 日本語}
do_execsql_test regexp1-3.4 {
  SELECT * FROM t1 wHERE a REGEXP '日本語';
} {1 日本語}

finish_test