Two UTF-8 errors in Regexp

(1) By jleedev on 2022-11-16 02:59:55 [source]

First, the UTF-8 decoding routine has an off-by-one error when a character in the range U+10000..U+10FFFF is found at the end of the string:

 % ./sqlite3 --version 
3.40.0 2022-11-14 19:42:01 6ee57a2e4e3399481100c40da0229d2d33cbe3290016185c8a60839d14b209f6
 % ./sqlite3 
sqlite> SELECT regexp_bytecode(char(0x1f4a9));
INIT     efbfbdefbfbdefbfbdefbfbd
ANYSTAR     0
MATCH    65533
MATCH    65533
MATCH    65533
MATCH    65533
ACCEPT      0
sqlite> SELECT regexp_bytecode('('||char(0x1f4a9)||')');
ANYSTAR     0
MATCH    128169
ACCEPT      0

Second, the UTF-8 encoding routine that fills zInit emits malformed byte sequences when a character in the range U+0800..U+0FFF appears in that prefix (causing it to reject good matches):

sqlite> SELECT regexp_bytecode('\u0800');
INIT     e080  # should be e0a080
ANYSTAR     0
MATCH    2048
ACCEPT      0
sqlite> SELECT regexp_bytecode('\u0fff');
INIT     ffbf  # should be e0bfbf
ANYSTAR     0
MATCH    4095
ACCEPT      0

Patch:

diff --git c/ext/misc/regexp.c i/ext/misc/regexp.c
index d0c8ee5cf..50a8826b4 100644
--- c/ext/misc/regexp.c
+++ i/ext/misc/regexp.c
@@ -185,7 +185,7 @@ static unsigned re_next_char(ReInput *p){
       c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
       p->i += 2;
       if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
-    }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
+    }else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80
            && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
       c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
                        | (p->z[p->i+2]&0x3f);
@@ -720,7 +720,7 @@ static const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
       unsigned x = pRe->aArg[i];
       if( x<=127 ){
         pRe->zInit[j++] = (unsigned char)x;
-      }else if( x<=0xfff ){
+      }else if( x<=0x7ff ){
         pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
         pRe->zInit[j++] = 0x80 | (x&0x3f);
       }else if( x<=0xffff ){

(2) By Larry Brasfield (larrybr) on 2022-11-17 19:30:39 in reply to 1 [link] [source]

Thanks for reporting this. Fixed here.