SQLite

Check-in [abb18f61]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix corner cases in UTF8 handling in the REGEXP extension. Forum post 3ffe058b04.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: abb18f61c5cec0f524acc41453b4c06b61c5af51ff46417588837fc0c3967288
User & Date: drh 2022-11-17 19:24:39
Context
2022-12-26
16:03
Fix corner cases in UTF8 handling in the REGEXP extension. Forum post 3ffe058b04. (check-in: 93e68b39 user: drh tags: branch-3.40)
2022-11-18
17:50
Add the SQLITE_DBCONFIG_LENIENT_JSON configuration option. Modify the built-in JSON routines such that when this setting is active, arguments that that ought to be JSON but still give a reasonable result (ex: NULL) rather than raising an error. (Leaf check-in: 186db57d user: drh tags: lenient-json)
15:22
shell.c.in: on non-Windows platforms, check for $XDG_CONFIG_HOME/sqlite3/sqliterc before ~/.sqliterc, per request in forum post 7a16582b1e403c81. (check-in: 17065d09 user: stephan tags: trunk)
2022-11-17
19:24
Fix corner cases in UTF8 handling in the REGEXP extension. Forum post 3ffe058b04. (check-in: abb18f61 user: drh tags: trunk)
14:40
Use the log10() and log2() functions from the standard C library to implement the equivalent SQL functions, in the hope that this will prevent reported precision problems. See forum post cfceb1230bdcfd84 and the surrounding thread. (check-in: 7c572d02 user: drh tags: trunk)
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/misc/regexp.c.

181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
      c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
      if( c<0x80 ) c = 0xfffd;
    }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
           && (p->z[p->i+1]&0xc0)==0x80 ){
      c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
      p->i += 2;
      if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
    }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
           && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
      c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
                       | (p->z[p->i+2]&0x3f);
      p->i += 3;
      if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
    }else{
      c = 0xfffd;







|







181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
      c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
      if( c<0x80 ) c = 0xfffd;
    }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
           && (p->z[p->i+1]&0xc0)==0x80 ){
      c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
      p->i += 2;
      if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
    }else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80
           && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
      c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
                       | (p->z[p->i+2]&0x3f);
      p->i += 3;
      if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
    }else{
      c = 0xfffd;
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
  }

  /* The following is a performance optimization.  If the regex begins with
  ** ".*" (if the input regex lacks an initial "^") and afterwards there are
  ** one or more matching characters, enter those matching characters into
  ** zInit[].  The re_match() routine can then search ahead in the input 
  ** string looking for the initial match without having to run the whole
  ** regex engine over the string.  Do not worry able trying to match
  ** unicode characters beyond plane 0 - those are very rare and this is
  ** just an optimization. */
  if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){
    for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
      unsigned x = pRe->aArg[i];
      if( x<=127 ){
        pRe->zInit[j++] = (unsigned char)x;
      }else if( x<=0xfff ){
        pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else if( x<=0xffff ){
        pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12));
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else{







|





|

|







708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
  }

  /* The following is a performance optimization.  If the regex begins with
  ** ".*" (if the input regex lacks an initial "^") and afterwards there are
  ** one or more matching characters, enter those matching characters into
  ** zInit[].  The re_match() routine can then search ahead in the input 
  ** string looking for the initial match without having to run the whole
  ** regex engine over the string.  Do not worry about trying to match
  ** unicode characters beyond plane 0 - those are very rare and this is
  ** just an optimization. */
  if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){
    for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
      unsigned x = pRe->aArg[i];
      if( x<=0x7f ){
        pRe->zInit[j++] = (unsigned char)x;
      }else if( x<=0x7ff ){
        pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else if( x<=0xffff ){
        pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12));
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
        pRe->zInit[j++] = 0x80 | (x&0x3f);
      }else{

Changes to test/regexp1.test.

299
300
301
302
303
304
305



























306
307
308
do_execsql_test regexp1-6.4 {SELECT 'foo' REGEXP '(^[a-z]+)$';} {1}
do_execsql_test regexp1-6.5 {SELECT 'foo' REGEXP '(^[a-z]+$)';} {1}
do_execsql_test regexp1-6.6 {SELECT 'abc' REGEXP '(^abc|def)';} {1}
do_execsql_test regexp1-6.7 {SELECT 'xabc' REGEXP '(^abc|def)';} {0}
do_execsql_test regexp1-6.8 {SELECT 'def' REGEXP '(^abc|def)';} {1}
do_execsql_test regexp1-6.9 {SELECT 'xdef' REGEXP '(^abc|def)';} {1}






























finish_test







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
do_execsql_test regexp1-6.4 {SELECT 'foo' REGEXP '(^[a-z]+)$';} {1}
do_execsql_test regexp1-6.5 {SELECT 'foo' REGEXP '(^[a-z]+$)';} {1}
do_execsql_test regexp1-6.6 {SELECT 'abc' REGEXP '(^abc|def)';} {1}
do_execsql_test regexp1-6.7 {SELECT 'xabc' REGEXP '(^abc|def)';} {0}
do_execsql_test regexp1-6.8 {SELECT 'def' REGEXP '(^abc|def)';} {1}
do_execsql_test regexp1-6.9 {SELECT 'xdef' REGEXP '(^abc|def)';} {1}

# 2022-11-17
# https://sqlite.org/forum/forumpost/3ffe058b04
#
do_execsql_test regexp1-7.1 {
  SELECT char(0x61,0x7ff,0x62) REGEXP char(0x7ff);
} 1
do_execsql_test regexp1-7.2 {
  SELECT char(0x61,0x800,0x62) REGEXP char(0x800);
} 1
do_execsql_test regexp1-7.3 {
  SELECT char(0x61,0xabc,0x62) REGEXP char(0xabc);
} 1
do_execsql_test regexp1-7.4 {
  SELECT char(0x61,0xfff,0x62) REGEXP char(0xfff);
} 1
do_execsql_test regexp1-7.5 {
  SELECT char(0x61,0x1000,0x62) REGEXP char(0x1000);
} 1
do_execsql_test regexp1-7.10 {
  SELECT char(0x61,0xffff,0x62) REGEXP char(0xffff);
} 1
do_execsql_test regexp1-7.11 {
  SELECT char(0x61,0x10000,0x62) REGEXP char(0x10000);
} 1
do_execsql_test regexp1-7.12 {
  SELECT char(0x61,0x10ffff,0x62) REGEXP char(0x10ffff);
} 1


finish_test