SQLite

Check-in [4b4f33d7]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix a problem with LIKE and GLOB processing in utf-16be databases in cases where the utf-8 encoding of a character ends with the byte 0xBF. Forum post d7b90d92ffbfc61f
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 4b4f33d791fe4318c4597bee7d2f9e486ed223e731982af470f5cc0dbdc600fc
User & Date: dan 2025-01-31 14:52:36
Original Comment: Fix a problem with LIKE and GLOB processing in utf-16be databases in cases where the utf-8 encoding of a character ends with the byte 0xBF.
References
2025-02-14
10:28
Fix test cases added for [4b4f33d791fe4318] so that they also work when compiling with ICU support. Forum post 2ca8a09a7e (check-in: 5964616d user: drh tags: trunk)
2025-01-31
20:18
Tweaks to [4b4f33d791fe4318] to make it easier to test. (check-in: 2567298f user: drh tags: trunk)
Context
2025-01-31
18:33
Remove an assert() that is not true if a trace-callback is deregistered while there are active statements. Forum post 3547aa1078510686. (check-in: 0ecfc4d0 user: dan tags: trunk)
14:58
Fix a problem with LIKE and GLOB processing in utf-16be databases in cases where the utf-8 encoding of a character ends with the byte 0xBF. (check-in: 9d59f8a5 user: drh tags: branch-3.48)
14:52
Fix a problem with LIKE and GLOB processing in utf-16be databases in cases where the utf-8 encoding of a character ends with the byte 0xBF. Forum post d7b90d92ffbfc61f (check-in: 4b4f33d7 user: dan tags: trunk)
14:52
If any errors occur while processing sqlite_dbpage changes, cancel pending truncate operations. (check-in: 6138e0dc user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/whereexpr.c.
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235


236
237
238
239
240
241
242
    sqlite3VdbeSetVarmask(pParse->pVdbe, iCol);
    assert( pRight->op==TK_VARIABLE || pRight->op==TK_REGISTER );
  }else if( op==TK_STRING ){
    assert( !ExprHasProperty(pRight, EP_IntValue) );
     z = (u8*)pRight->u.zToken;
  }
  if( z ){
    /* Count the number of prefix bytes prior to the first wildcard.
    ** or U+fffd character.  If the underlying database has a UTF16LE
    ** encoding, then only consider ASCII characters.  Note that the
    ** encoding of z[] is UTF8 - we are dealing with only UTF8 here in
    ** this code, but the database engine itself might be processing
    ** content using a different encoding. */
    cnt = 0;
    while( (c=z[cnt])!=0 && c!=wc[0] && c!=wc[1] && c!=wc[2] ){
      cnt++;
      if( c==wc[3] && z[cnt]>0 && z[cnt]<0x80 ){
        cnt++;
      }else if( c>=0x80 ){
        const u8 *z2 = z+cnt-1;
        if( sqlite3Utf8Read(&z2)==0xfffd || ENC(db)==SQLITE_UTF16LE ){


          cnt--;
          break;
        }else{
          cnt = (int)(z2-z);
        }
      }
    }







|
|
|
|
|
|







|
>
>







215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
    sqlite3VdbeSetVarmask(pParse->pVdbe, iCol);
    assert( pRight->op==TK_VARIABLE || pRight->op==TK_REGISTER );
  }else if( op==TK_STRING ){
    assert( !ExprHasProperty(pRight, EP_IntValue) );
     z = (u8*)pRight->u.zToken;
  }
  if( z ){
    /* Count the number of prefix bytes prior to the first wildcard,
    ** U+fffd character, or malformed utf-8. If the underlying database
    ** has a UTF16LE encoding, then only consider ASCII characters.  Note that
    ** the encoding of z[] is UTF8 - we are dealing with only UTF8 here in this
    ** code, but the database engine itself might be processing content using a
    ** different encoding. */
    cnt = 0;
    while( (c=z[cnt])!=0 && c!=wc[0] && c!=wc[1] && c!=wc[2] ){
      cnt++;
      if( c==wc[3] && z[cnt]>0 && z[cnt]<0x80 ){
        cnt++;
      }else if( c>=0x80 ){
        const u8 *z2 = z+cnt-1;
        if( sqlite3Utf8Read(&z2)==0xfffd || c==0xFF   /* bad utf-8 */
         || ENC(db)==SQLITE_UTF16LE 
        ){
          cnt--;
          break;
        }else{
          cnt = (int)(z2-z);
        }
      }
    }
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399



1400




1401
1402
1403
1404
1405
1406
1407
      for(i=0; (c = pStr1->u.zToken[i])!=0; i++){
        pStr1->u.zToken[i] = sqlite3Toupper(c);
        pStr2->u.zToken[i] = sqlite3Tolower(c);
      }
    }

    if( !db->mallocFailed ){
      u8 c, *pC;       /* Last character before the first wildcard */
      pC = (u8*)&pStr2->u.zToken[sqlite3Strlen30(pStr2->u.zToken)-1];
      c = *pC;
      if( noCase ){
        /* The point is to increment the last character before the first
        ** wildcard.  But if we increment '@', that will push it into the
        ** alphabetic range where case conversions will mess up the
        ** inequality.  To avoid this, make sure to also run the full
        ** LIKE on all candidate expressions by clearing the isComplete flag
        */
        if( c=='A'-1 ) isComplete = 0;
        c = sqlite3UpperToLower[c];
      }



      *pC = c + 1;




    }
    zCollSeqName = noCase ? "NOCASE" : sqlite3StrBINARY;
    pNewExpr1 = sqlite3ExprDup(db, pLeft, 0);
    pNewExpr1 = sqlite3PExpr(pParse, TK_GE,
           sqlite3ExprAddCollateString(pParse,pNewExpr1,zCollSeqName),
           pStr1);
    transferJoinMarkings(pNewExpr1, pExpr);







|

<







|
|

>
>
>
|
>
>
>
>







1382
1383
1384
1385
1386
1387
1388
1389
1390

1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
      for(i=0; (c = pStr1->u.zToken[i])!=0; i++){
        pStr1->u.zToken[i] = sqlite3Toupper(c);
        pStr2->u.zToken[i] = sqlite3Tolower(c);
      }
    }

    if( !db->mallocFailed ){
      u8 *pC;       /* Last character before the first wildcard */
      pC = (u8*)&pStr2->u.zToken[sqlite3Strlen30(pStr2->u.zToken)-1];

      if( noCase ){
        /* The point is to increment the last character before the first
        ** wildcard.  But if we increment '@', that will push it into the
        ** alphabetic range where case conversions will mess up the
        ** inequality.  To avoid this, make sure to also run the full
        ** LIKE on all candidate expressions by clearing the isComplete flag
        */
        if( *pC=='A'-1 ) isComplete = 0;
        *pC = sqlite3UpperToLower[*pC];
      }

      /* Increment the value of the last utf8 character in the prefix. */
      while( *pC==0xBF && pC>(u8*)pStr2->u.zToken ){
        *pC = 0x80;
        pC--;
      }
      assert( *pC!=0xFF );        /* isLikeOrGlob() guarantees this */
      (*pC)++;
    }
    zCollSeqName = noCase ? "NOCASE" : sqlite3StrBINARY;
    pNewExpr1 = sqlite3ExprDup(db, pLeft, 0);
    pNewExpr1 = sqlite3PExpr(pParse, TK_GE,
           sqlite3ExprAddCollateString(pParse,pNewExpr1,zCollSeqName),
           pStr1);
    transferJoinMarkings(pNewExpr1, pExpr);
Changes to test/like3.test.
270
271
272
273
274
275
276
277















































































278

do_eqp_test like3-6.240 {
  SELECT * FROM t2 WHERE path LIKE 'a%' ESCAPE '\';
} {
  QUERY PLAN
  `--SEARCH t2 USING INDEX t2path2 (path>? AND path<?)
}
}
















































































finish_test









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
do_eqp_test like3-6.240 {
  SELECT * FROM t2 WHERE path LIKE 'a%' ESCAPE '\';
} {
  QUERY PLAN
  `--SEARCH t2 USING INDEX t2path2 (path>? AND path<?)
}
}

#-------------------------------------------------------------------------

ifcapable utf16 {
  reset_db
  do_execsql_test like3-7.0 {
    PRAGMA encoding = 'UTF-16be';
  
    CREATE TABLE Example(word TEXT NOT NULL);
    CREATE INDEX Example_word on Example(word);
  
    INSERT INTO Example VALUES(char(0x307F));
  }
  
  do_execsql_test like3-7.1 {
    SELECT char(0x307F)=='み';
  } {1}
  
  do_execsql_test like3-7.1 {
    SELECT * FROM Example WHERE word GLOB 'み*'
  } {み}
  
  do_execsql_test like3-7.2 {
    SELECT * FROM Example WHERE word >= char(0x307F) AND word < char(0x3080);
  } {み}
}

#-------------------------------------------------------------------------
reset_db

foreach enc {
  UTF-8
  UTF-16le 
  UTF-16be
} {
  foreach {tn expr} {
    1 "CAST (X'FF' AS TEXT)"
    2 "CAST (X'FFBF' AS TEXT)"
    3 "CAST (X'FFBFBF' AS TEXT)"
    4 "CAST (X'FFBFBFBF' AS TEXT)"

    5 "'abc' || CAST (X'FF' AS TEXT)"
    6 "'def' || CAST (X'FFBF' AS TEXT)"
    7 "'ghi' || CAST (X'FFBFBF' AS TEXT)"
    8 "'jkl' || CAST (X'FFBFBFBF' AS TEXT)"
  } {
    reset_db
    execsql "PRAGMA encoding = '$enc'"
    do_execsql_test like3-8.$tn.0 {
      CREATE TABLE t1(x);
    }
  
    do_execsql_test like3-8.$tn.1 {
      PRAGMA encoding
    } $enc
  
    do_execsql_test like3-8.$tn.1 "
      INSERT INTO t1 VALUES( $expr )
    "
  
    do_execsql_test like3-8.$tn.2 {
      SELECT typeof(x) FROM t1
    } {text}
  
    set x [db one {SELECT x || '%' FROM t1}]
  
    do_execsql_test like3-8.$tn.3 {
      SELECT rowid FROM t1 WHERE x LIKE $x
    } 1
  
    do_execsql_test like3-8.$tn.4 {
      CREATE INDEX i1 ON t1(x);
    }
  
    do_execsql_test like3-8.$tn.5 {
      SELECT rowid FROM t1 WHERE x LIKE $x
    } 1
  }
}

finish_test