Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Fix corner cases in UTF8 handling in the REGEXP extension. Forum post 3ffe058b04. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: |
abb18f61c5cec0f524acc41453b4c06b |
User & Date: | drh 2022-11-17 19:24:39 |
Context
2022-12-26
| ||
16:03 | Fix corner cases in UTF8 handling in the REGEXP extension. Forum post 3ffe058b04. (check-in: 93e68b39 user: drh tags: branch-3.40) | |
2022-11-18
| ||
17:50 | Add the SQLITE_DBCONFIG_LENIENT_JSON configuration option. Modify the built-in JSON routines such that when this setting is active, arguments that ought to be JSON but are not will still give a reasonable result (ex: NULL) rather than raising an error. (Leaf check-in: 186db57d user: drh tags: lenient-json) | |
15:22 | shell.c.in: on non-Windows platforms, check for $XDG_CONFIG_HOME/sqlite3/sqliterc before ~/.sqliterc, per request in forum post 7a16582b1e403c81. (check-in: 17065d09 user: stephan tags: trunk) | |
2022-11-17
| ||
19:24 | Fix corner cases in UTF8 handling in the REGEXP extension. Forum post 3ffe058b04. (check-in: abb18f61 user: drh tags: trunk) | |
14:40 | Use the log10() and log2() functions from the standard C library to implement the equivalent SQL functions, in the hope that this will prevent reported precision problems. See forum post cfceb1230bdcfd84 and the surrounding thread. (check-in: 7c572d02 user: drh tags: trunk) | |
Changes
Changes to ext/misc/regexp.c.
︙ | ︙ | |||
181 182 183 184 185 186 187 | c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f); if( c<0x80 ) c = 0xfffd; }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80 && (p->z[p->i+1]&0xc0)==0x80 ){ c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f); p->i += 2; if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd; | | | 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f); if( c<0x80 ) c = 0xfffd; }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80 && (p->z[p->i+1]&0xc0)==0x80 ){ c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f); p->i += 2; if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd; }else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80 && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){ c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6) | (p->z[p->i+2]&0x3f); p->i += 3; if( c<=0xffff || c>0x10ffff ) c = 0xfffd; }else{ c = 0xfffd; |
︙ | ︙ | |||
708 709 710 711 712 713 714 | } /* The following is a performance optimization. If the regex begins with ** ".*" (if the input regex lacks an initial "^") and afterwards there are ** one or more matching characters, enter those matching characters into ** zInit[]. The re_match() routine can then search ahead in the input ** string looking for the initial match without having to run the whole | | | | | 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 | } /* The following is a performance optimization. If the regex begins with ** ".*" (if the input regex lacks an initial "^") and afterwards there are ** one or more matching characters, enter those matching characters into ** zInit[]. The re_match() routine can then search ahead in the input ** string looking for the initial match without having to run the whole ** regex engine over the string. Do not worry about trying to match ** unicode characters beyond plane 0 - those are very rare and this is ** just an optimization. */ if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){ for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ unsigned x = pRe->aArg[i]; if( x<=0x7f ){ pRe->zInit[j++] = (unsigned char)x; }else if( x<=0x7ff ){ pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6)); pRe->zInit[j++] = 0x80 | (x&0x3f); }else if( x<=0xffff ){ pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12)); pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); pRe->zInit[j++] = 0x80 | (x&0x3f); }else{ |
︙ | ︙ |
Changes to test/regexp1.test.
︙ | ︙ | |||
299 300 301 302 303 304 305 306 307 308 | do_execsql_test regexp1-6.4 {SELECT 'foo' REGEXP '(^[a-z]+)$';} {1} do_execsql_test regexp1-6.5 {SELECT 'foo' REGEXP '(^[a-z]+$)';} {1} do_execsql_test regexp1-6.6 {SELECT 'abc' REGEXP '(^abc|def)';} {1} do_execsql_test regexp1-6.7 {SELECT 'xabc' REGEXP '(^abc|def)';} {0} do_execsql_test regexp1-6.8 {SELECT 'def' REGEXP '(^abc|def)';} {1} do_execsql_test regexp1-6.9 {SELECT 'xdef' REGEXP '(^abc|def)';} {1} finish_test | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | do_execsql_test regexp1-6.4 {SELECT 'foo' REGEXP '(^[a-z]+)$';} {1} do_execsql_test regexp1-6.5 {SELECT 'foo' REGEXP '(^[a-z]+$)';} {1} do_execsql_test regexp1-6.6 {SELECT 'abc' REGEXP '(^abc|def)';} {1} do_execsql_test regexp1-6.7 {SELECT 'xabc' REGEXP '(^abc|def)';} {0} do_execsql_test regexp1-6.8 {SELECT 'def' REGEXP '(^abc|def)';} {1} do_execsql_test regexp1-6.9 {SELECT 'xdef' REGEXP '(^abc|def)';} {1} # 2022-11-17 # https://sqlite.org/forum/forumpost/3ffe058b04 # do_execsql_test regexp1-7.1 { SELECT char(0x61,0x7ff,0x62) REGEXP char(0x7ff); } 1 do_execsql_test regexp1-7.2 { SELECT char(0x61,0x800,0x62) REGEXP char(0x800); } 1 do_execsql_test regexp1-7.3 { SELECT char(0x61,0xabc,0x62) REGEXP char(0xabc); } 1 do_execsql_test regexp1-7.4 { SELECT char(0x61,0xfff,0x62) REGEXP char(0xfff); } 1 do_execsql_test regexp1-7.5 { SELECT char(0x61,0x1000,0x62) REGEXP char(0x1000); } 1 do_execsql_test regexp1-7.10 { SELECT char(0x61,0xffff,0x62) REGEXP char(0xffff); } 1 do_execsql_test regexp1-7.11 { SELECT char(0x61,0x10000,0x62) REGEXP char(0x10000); } 1 do_execsql_test regexp1-7.12 { SELECT char(0x61,0x10ffff,0x62) REGEXP char(0x10ffff); } 1 finish_test |