/ Check-in [6c8ad279]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:A new approach for UTF-8 translation. (CVS 4004)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 6c8ad2790eaede90b3f1ef62614e667178b2a8c4
User & Date: drh 2007-05-15 11:55:09
Context
2007-05-15
13:27
Clarification on the best practices for using the _bytes() APIs. Change sqlite3_value_blob() to force the representation to be purely a BLOB and not a dual BLOB/String. Ticket #2360. (CVS 4005) check-in: cf2dd45b user: drh tags: trunk
11:55
A new approach for UTF-8 translation. (CVS 4004) check-in: 6c8ad279 user: drh tags: trunk
09:00
Do not allow bytes 0x80 through 0xBF to be the first character of an identifer because no valid UTF-8 character can begin with those bytes. If we allowed an identifier to begin with one of those bytes, then the substr() function in ALTER TABLE will not work correctly. (CVS 4003) check-in: 25281042 user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/func.c.

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
..
99
100
101
102
103
104
105
106




107
108
109
110
111
112
113
...
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179



180
181
182
183
184
185
186
...
189
190
191
192
193
194
195
196
197

198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
...
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
...
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
...
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
** This file contains the C functions that implement various SQL
** functions of SQLite.  
**
** There is only one exported symbol in this file - the function
** sqliteRegisterBuildinFunctions() found at the bottom of the file.
** All other code has file scope.
**
** $Id: func.c,v 1.156 2007/05/15 01:13:47 drh Exp $
*/
#include "sqliteInt.h"
#include <ctype.h>
/* #include <math.h> */
#include <stdlib.h>
#include <assert.h>
#include "vdbeInt.h"
................................................................................
    case SQLITE_FLOAT: {
      sqlite3_result_int(context, sqlite3_value_bytes(argv[0]));
      break;
    }
    case SQLITE_TEXT: {
      const unsigned char *z = sqlite3_value_text(argv[0]);
      if( z==0 ) return;
      for(len=0; *z; z++){ if( (0xc0&*z)!=0x80 ) len++; }




      sqlite3_result_int(context, len);
      break;
    }
    default: {
      sqlite3_result_null(context);
      break;
    }
................................................................................
static void substrFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  const unsigned char *z;
  const unsigned char *z2;
  int i;
  int len;
  int p0type;
  i64 p1, p2;

  assert( argc==3 );
  p0type = sqlite3_value_type(argv[0]);
  if( p0type==SQLITE_BLOB ){
    len = sqlite3_value_bytes(argv[0]);
    z = sqlite3_value_blob(argv[0]);
    if( z==0 ) return;
  }else{
    z = sqlite3_value_text(argv[0]);
    if( z==0 ) return;
    for(len=0, z2=z; *z2; z2++){ if( (0xc0&*z2)!=0x80 ) len++; }



  }
  p1 = sqlite3_value_int(argv[1]);
  p2 = sqlite3_value_int(argv[2]);
  if( p1<0 ){
    p1 += len;
    if( p1<0 ){
      p2 += p1;
................................................................................
  }else if( p1>0 ){
    p1--;
  }
  if( p1+p2>len ){
    p2 = len-p1;
  }
  if( p0type!=SQLITE_BLOB ){
    for(i=0; i<p1 && z[i]; i++){
      if( (z[i]&0xc0)==0x80 ) p1++;

    }
    while( z[i] && (z[i]&0xc0)==0x80 ){ i++; p1++; }
    for(; i<p1+p2 && z[i]; i++){
      if( (z[i]&0xc0)==0x80 ) p2++;
    }
    while( z[i] && (z[i]&0xc0)==0x80 ){ i++; p2++; }
    if( p2<0 ) p2 = 0;
    sqlite3_result_text(context, (char*)&z[p1], p2, SQLITE_TRANSIENT);
  }else{
    if( p2<0 ) p2 = 0;
    sqlite3_result_blob(context, (char*)&z[p1], p2, SQLITE_TRANSIENT);
  }
}

/*
................................................................................
** case.  Thus  'a' LIKE 'A' would be true. */
static const struct compareInfo likeInfoNorm = { '%', '_',   0, 1 };
/* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator
** is case sensitive causing 'a' LIKE 'A' to be false */
static const struct compareInfo likeInfoAlt = { '%', '_',   0, 0 };

/*
** X is a pointer to the first byte of a UTF-8 character.  Increment
** X so that it points to the next character.  This only works right
** if X points to a well-formed UTF-8 string.
*/
#define sqliteNextChar(X)  while( (0xc0&*++(X))==0x80 ){}
#define sqliteCharVal(X)   sqlite3ReadUtf8(X)


/*
** Compare two UTF-8 strings for equality where the first string can
** potentially be a "glob" expression.  Return true (1) if they
** are the same and false (0) if they are different.
**
** Globbing rules:
................................................................................
  int prevEscape = 0;     /* True if the previous character was 'escape' */

  while( (c = *zPattern)!=0 ){
    if( !prevEscape && c==matchAll ){
      while( (c=zPattern[1]) == matchAll || c == matchOne ){
        if( c==matchOne ){
          if( *zString==0 ) return 0;
          sqliteNextChar(zString);
        }
        zPattern++;
      }
      if( c && esc && sqlite3ReadUtf8(&zPattern[1])==esc ){
        u8 const *zTemp = &zPattern[1];
        sqliteNextChar(zTemp);
        c = *zTemp;
      }
      if( c==0 ) return 1;
      if( c==matchSet ){
        assert( esc==0 );   /* This is GLOB, not LIKE */
        while( *zString && patternCompare(&zPattern[1],zString,pInfo,esc)==0 ){
          sqliteNextChar(zString);
        }
        return *zString!=0;
      }else{
        while( (c2 = *zString)!=0 ){
          if( noCase ){
            c2 = sqlite3UpperToLower[c2];
            c = sqlite3UpperToLower[c];
            while( c2 != 0 && c2 != c ){ c2 = sqlite3UpperToLower[*++zString]; }
          }else{
            while( c2 != 0 && c2 != c ){ c2 = *++zString; }
          }
          if( c2==0 ) return 0;
          if( patternCompare(&zPattern[1],zString,pInfo,esc) ) return 1;
          sqliteNextChar(zString);
        }
        return 0;
      }
    }else if( !prevEscape && c==matchOne ){
      if( *zString==0 ) return 0;
      sqliteNextChar(zString);
      zPattern++;
    }else if( c==matchSet ){
      int prior_c = 0;
      assert( esc==0 );    /* This only occurs for GLOB, not LIKE */
      seen = 0;
      invert = 0;
      c = sqliteCharVal(zString);
      if( c==0 ) return 0;
      c2 = *++zPattern;
      if( c2=='^' ){ invert = 1; c2 = *++zPattern; }
      if( c2==']' ){
        if( c==']' ) seen = 1;
        c2 = *++zPattern;
      }
      while( (c2 = sqliteCharVal(zPattern))!=0 && c2!=']' ){
        if( c2=='-' && zPattern[1]!=']' && zPattern[1]!=0 && prior_c>0 ){
          zPattern++;
          c2 = sqliteCharVal(zPattern);
          if( c>=prior_c && c<=c2 ) seen = 1;
          prior_c = 0;
        }else if( c==c2 ){
          seen = 1;
          prior_c = c2;
        }else{
          prior_c = c2;
        }
        sqliteNextChar(zPattern);
      }
      if( c2==0 || (seen ^ invert)==0 ) return 0;
      sqliteNextChar(zString);
      zPattern++;
    }else if( esc && !prevEscape && sqlite3ReadUtf8(zPattern)==esc){
      prevEscape = 1;
      sqliteNextChar(zPattern);
    }else{
      if( noCase ){
        if( sqlite3UpperToLower[c] != sqlite3UpperToLower[*zString] ) return 0;
      }else{
        if( c != *zString ) return 0;
      }
      zPattern++;
................................................................................
    azChar = azOne;
    zCharSet = 0;
  }else if( (zCharSet = sqlite3_value_text(argv[1]))==0 ){
    return;
  }else{
    const unsigned char *z;
    for(z=zCharSet, nChar=0; *z; nChar++){
      sqliteNextChar(z);
    }
    if( nChar>0 ){
      azChar = sqlite3_malloc( nChar*(sizeof(char*)+1) );
      if( azChar==0 ){
        return;
      }
      aLen = (unsigned char*)&azChar[nChar];
      for(z=zCharSet, nChar=0; *z; nChar++){
        azChar[nChar] = z;
        sqliteNextChar(z);
        aLen[nChar] = z - azChar[nChar];
      }
    }
  }
  if( nChar>0 ){
    flags = (int)sqlite3_user_data(context);
    if( flags & 1 ){







|







 







|
>
>
>
>







 







<













|
>
>
>







 







|
|
>

<
|
|

<
<
|







 







|
|
|
|
|
|
|







 







|





|






|













|





|






|







|


|








|


|



|







 







|









|







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
..
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
...
162
163
164
165
166
167
168

169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
...
195
196
197
198
199
200
201
202
203
204
205

206
207
208


209
210
211
212
213
214
215
216
...
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
...
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
...
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
** This file contains the C functions that implement various SQL
** functions of SQLite.  
**
** There is only one exported symbol in this file - the function
** sqliteRegisterBuildinFunctions() found at the bottom of the file.
** All other code has file scope.
**
** $Id: func.c,v 1.157 2007/05/15 11:55:09 drh Exp $
*/
#include "sqliteInt.h"
#include <ctype.h>
/* #include <math.h> */
#include <stdlib.h>
#include <assert.h>
#include "vdbeInt.h"
................................................................................
    case SQLITE_FLOAT: {
      sqlite3_result_int(context, sqlite3_value_bytes(argv[0]));
      break;
    }
    case SQLITE_TEXT: {
      const unsigned char *z = sqlite3_value_text(argv[0]);
      if( z==0 ) return;
      len = 0;
      while( *z ){
        len++;
        SQLITE_SKIP_UTF8(z);
      }
      sqlite3_result_int(context, len);
      break;
    }
    default: {
      sqlite3_result_null(context);
      break;
    }
................................................................................
static void substrFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  const unsigned char *z;
  const unsigned char *z2;

  int len;
  int p0type;
  i64 p1, p2;

  assert( argc==3 );
  p0type = sqlite3_value_type(argv[0]);
  if( p0type==SQLITE_BLOB ){
    len = sqlite3_value_bytes(argv[0]);
    z = sqlite3_value_blob(argv[0]);
    if( z==0 ) return;
  }else{
    z = sqlite3_value_text(argv[0]);
    if( z==0 ) return;
    len = 0;
    for(z2=z; *z2; len++){
      SQLITE_SKIP_UTF8(z2);
    }
  }
  p1 = sqlite3_value_int(argv[1]);
  p2 = sqlite3_value_int(argv[2]);
  if( p1<0 ){
    p1 += len;
    if( p1<0 ){
      p2 += p1;
................................................................................
  }else if( p1>0 ){
    p1--;
  }
  if( p1+p2>len ){
    p2 = len-p1;
  }
  if( p0type!=SQLITE_BLOB ){
    while( *z && p1 ){
      SQLITE_SKIP_UTF8(z);
      p1--;
    }

    for(z2=z; *z2 && p2; p2--){
      SQLITE_SKIP_UTF8(z2);
    }


    sqlite3_result_text(context, (char*)z, z2-z, SQLITE_TRANSIENT);
  }else{
    if( p2<0 ) p2 = 0;
    sqlite3_result_blob(context, (char*)&z[p1], p2, SQLITE_TRANSIENT);
  }
}

/*
................................................................................
** case.  Thus  'a' LIKE 'A' would be true. */
static const struct compareInfo likeInfoNorm = { '%', '_',   0, 1 };
/* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator
** is case sensitive causing 'a' LIKE 'A' to be false */
static const struct compareInfo likeInfoAlt = { '%', '_',   0, 0 };

/*
** Read a single UTF-8 character and return its value.
*/
u32 sqlite3ReadUtf8(const unsigned char *z){
  u32 c;
  SQLITE_READ_UTF8(z, c);
  return c;
}

/*
** Compare two UTF-8 strings for equality where the first string can
** potentially be a "glob" expression.  Return true (1) if they
** are the same and false (0) if they are different.
**
** Globbing rules:
................................................................................
  int prevEscape = 0;     /* True if the previous character was 'escape' */

  while( (c = *zPattern)!=0 ){
    if( !prevEscape && c==matchAll ){
      while( (c=zPattern[1]) == matchAll || c == matchOne ){
        if( c==matchOne ){
          if( *zString==0 ) return 0;
          SQLITE_SKIP_UTF8(zString);
        }
        zPattern++;
      }
      if( c && esc && sqlite3ReadUtf8(&zPattern[1])==esc ){
        u8 const *zTemp = &zPattern[1];
        SQLITE_SKIP_UTF8(zTemp);
        c = *zTemp;
      }
      if( c==0 ) return 1;
      if( c==matchSet ){
        assert( esc==0 );   /* This is GLOB, not LIKE */
        while( *zString && patternCompare(&zPattern[1],zString,pInfo,esc)==0 ){
          SQLITE_SKIP_UTF8(zString);
        }
        return *zString!=0;
      }else{
        while( (c2 = *zString)!=0 ){
          if( noCase ){
            c2 = sqlite3UpperToLower[c2];
            c = sqlite3UpperToLower[c];
            while( c2 != 0 && c2 != c ){ c2 = sqlite3UpperToLower[*++zString]; }
          }else{
            while( c2 != 0 && c2 != c ){ c2 = *++zString; }
          }
          if( c2==0 ) return 0;
          if( patternCompare(&zPattern[1],zString,pInfo,esc) ) return 1;
          SQLITE_SKIP_UTF8(zString);
        }
        return 0;
      }
    }else if( !prevEscape && c==matchOne ){
      if( *zString==0 ) return 0;
      SQLITE_SKIP_UTF8(zString);
      zPattern++;
    }else if( c==matchSet ){
      int prior_c = 0;
      assert( esc==0 );    /* This only occurs for GLOB, not LIKE */
      seen = 0;
      invert = 0;
      c = sqlite3ReadUtf8(zString);
      if( c==0 ) return 0;
      c2 = *++zPattern;
      if( c2=='^' ){ invert = 1; c2 = *++zPattern; }
      if( c2==']' ){
        if( c==']' ) seen = 1;
        c2 = *++zPattern;
      }
      while( (c2 = sqlite3ReadUtf8(zPattern))!=0 && c2!=']' ){
        if( c2=='-' && zPattern[1]!=']' && zPattern[1]!=0 && prior_c>0 ){
          zPattern++;
          c2 = sqlite3ReadUtf8(zPattern);
          if( c>=prior_c && c<=c2 ) seen = 1;
          prior_c = 0;
        }else if( c==c2 ){
          seen = 1;
          prior_c = c2;
        }else{
          prior_c = c2;
        }
        SQLITE_SKIP_UTF8(zPattern);
      }
      if( c2==0 || (seen ^ invert)==0 ) return 0;
      SQLITE_SKIP_UTF8(zString);
      zPattern++;
    }else if( esc && !prevEscape && sqlite3ReadUtf8(zPattern)==esc){
      prevEscape = 1;
      SQLITE_SKIP_UTF8(zPattern);
    }else{
      if( noCase ){
        if( sqlite3UpperToLower[c] != sqlite3UpperToLower[*zString] ) return 0;
      }else{
        if( c != *zString ) return 0;
      }
      zPattern++;
................................................................................
    azChar = azOne;
    zCharSet = 0;
  }else if( (zCharSet = sqlite3_value_text(argv[1]))==0 ){
    return;
  }else{
    const unsigned char *z;
    for(z=zCharSet, nChar=0; *z; nChar++){
      SQLITE_SKIP_UTF8(z);
    }
    if( nChar>0 ){
      azChar = sqlite3_malloc( nChar*(sizeof(char*)+1) );
      if( azChar==0 ){
        return;
      }
      aLen = (unsigned char*)&azChar[nChar];
      for(z=zCharSet, nChar=0; *z; nChar++){
        azChar[nChar] = z;
        SQLITE_SKIP_UTF8(z);
        aLen[nChar] = z - azChar[nChar];
      }
    }
  }
  if( nChar>0 ){
    flags = (int)sqlite3_user_data(context);
    if( flags & 1 ){

Changes to src/sqliteInt.h.

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
....
1531
1532
1533
1534
1535
1536
1537

























































1538
1539
1540
1541
1542
1543
1544
....
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** Internal interface definitions for SQLite.
**
** @(#) $Id: sqliteInt.h,v 1.566 2007/05/12 12:08:51 drh Exp $
*/
#ifndef _SQLITEINT_H_
#define _SQLITEINT_H_
#include "limits.h"


#if defined(SQLITE_TCL) || defined(TCLSH)
................................................................................

/*
 * This global flag is set for performance testing of triggers. When it is set
 * SQLite will perform the overhead of building new and old trigger references 
 * even when no triggers exist
 */
extern int sqlite3_always_code_trigger_setup;


























































/*
** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production
** builds) or a function call (for debugging).  If it is a function call,
** it allows the operator to set a breakpoint at the spot where database
** corruption is first detected.
*/
................................................................................
int sqlite3FixTriggerStep(DbFixer*, TriggerStep*);
int sqlite3AtoF(const char *z, double*);
char *sqlite3_snprintf(int,char*,const char*,...);
int sqlite3GetInt32(const char *, int*);
int sqlite3FitsIn64Bits(const char *);
int sqlite3Utf16ByteLen(const void *pData, int nChar);
int sqlite3Utf8CharLen(const char *pData, int nByte);
int sqlite3ReadUtf8(const unsigned char *);
int sqlite3PutVarint(unsigned char *, u64);
int sqlite3GetVarint(const unsigned char *, u64 *);
int sqlite3GetVarint32(const unsigned char *, u32 *);
int sqlite3VarintLen(u64 v);
void sqlite3IndexAffinityStr(Vdbe *, Index *);
void sqlite3TableAffinityStr(Vdbe *, Table *);
char sqlite3CompareAffinity(Expr *pExpr, char aff2);







|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|







7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
....
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
....
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** Internal interface definitions for SQLite.
**
** @(#) $Id: sqliteInt.h,v 1.567 2007/05/15 11:55:09 drh Exp $
*/
#ifndef _SQLITEINT_H_
#define _SQLITEINT_H_
#include "limits.h"


#if defined(SQLITE_TCL) || defined(TCLSH)
................................................................................

/*
 * This global flag is set for performance testing of triggers. When it is set
 * SQLite will perform the overhead of building new and old trigger references 
 * even when no triggers exist
 */
extern int sqlite3_always_code_trigger_setup;

/*
** A lookup table used by the SQLITE_READ_UTF8 macro.  The definition
** is in utf.c.
*/
extern const unsigned char sqlite3UtfTrans1[];

/*
** Macros for reading UTF8 characters.
**
** SQLITE_READ_UTF8(x,c) reads a single UTF8 value out of x and writes
** that value into c.  The type of x must be unsigned char*.  The type
** of c must be unsigned int.
**
** SQLITE_SKIP_UTF8(x) advances x forward by one character.  The type of
** x must be unsigned char*.
**
** Notes On Invalid UTF-8:
**
**  *  These macros never allow a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  These macros never allow a UTF16 surragate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xe000 then it is rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  These routines accept an infinite number of different UTF8 encodings
**     for unicode values 0x80 and greater.  They do not change over-length
**     encodings to 0xfffd as some systems recommend.
** 
*/
#define SQLITE_READ_UTF8(zIn, c) {                     \
  c = *(zIn++);                                        \
  if( c>=0xc0 ){                                       \
    c = sqlite3UtfTrans1[c-0xc0];                      \
    while( (*zIn & 0xc0)==0x80 ){                      \
      c = (c<<6) + (0x3f & *(zIn++));                  \
    }                                                  \
    if( c<0x80                                         \
        || (c&0xFFFFF800)==0xD800                      \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }    \
  }                                                    \
}
#define SQLITE_SKIP_UTF8(zIn) {                        \
  if( (*(zIn++))>=0xc0 ){                              \
    while( (*zIn & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
}




/*
** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production
** builds) or a function call (for debugging).  If it is a function call,
** it allows the operator to set a breakpoint at the spot where database
** corruption is first detected.
*/
................................................................................
int sqlite3FixTriggerStep(DbFixer*, TriggerStep*);
int sqlite3AtoF(const char *z, double*);
char *sqlite3_snprintf(int,char*,const char*,...);
int sqlite3GetInt32(const char *, int*);
int sqlite3FitsIn64Bits(const char *);
int sqlite3Utf16ByteLen(const void *pData, int nChar);
int sqlite3Utf8CharLen(const char *pData, int nByte);
u32 sqlite3ReadUtf8(const unsigned char *);
int sqlite3PutVarint(unsigned char *, u64);
int sqlite3GetVarint(const unsigned char *, u64 *);
int sqlite3GetVarint32(const unsigned char *, u32 *);
int sqlite3VarintLen(u64 v);
void sqlite3IndexAffinityStr(Vdbe *, Index *);
void sqlite3TableAffinityStr(Vdbe *, Table *);
char sqlite3CompareAffinity(Expr *pExpr, char aff2);

Changes to src/utf.c.

8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
..
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
...
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
...
453
454
455
456
457
458
459
460
461
462

463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
...
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
...
613
614
615
616
617
618
619

620
621
622
623
624
625
626
627
628
629
630
631
632

633
634
635
636
637
638
639
640
641
642

643
644
645
646
647
648
649
650
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8, 
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.46 2007/05/10 17:23:12 drh Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
................................................................................
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
**
** Handling of malformed strings:
**
** SQLite accepts and processes malformed strings without an error wherever
** possible. However this is not possible when converting between UTF-8 and
** UTF-16.
**
** When converting malformed UTF-8 strings to UTF-16, one instance of the
** replacement character U+FFFD for each byte that cannot be interpeted as
** part of a valid unicode character.
**
** When converting malformed UTF-16 strings to UTF-8, one instance of the
** replacement character U+FFFD for each pair of bytes that cannot be
** interpeted as part of a valid unicode character.
**
** This file contains the following public routines:
**
** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
** sqlite3Utf16ByteLen()     - Calculate byte-length of a void* UTF16 string.
** sqlite3Utf8CharLen()      - Calculate char-length of a char* UTF8 string.
** sqlite3Utf8LikeCompare()  - Do a LIKE match given two UTF8 char* strings.
**
*/
#include "sqliteInt.h"
#include <assert.h>
#include "vdbeInt.h"

/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
*/
const int sqlite3one = 1;

/*
** This table maps from the first byte of a UTF-8 character to the number
** of trailing bytes expected. A value '4' indicates that the table key
** is not a legal first byte for a UTF-8 character.
*/

static const u8 xtra_utf8_bytes[256]  = {
/* 0xxxxxxx */
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,

/* 10wwwwww */
4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,

/* 110yyyyy */
1, 1, 1, 1, 1, 1, 1, 1,     1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,     1, 1, 1, 1, 1, 1, 1, 1,

/* 1110zzzz */
2, 2, 2, 2, 2, 2, 2, 2,     2, 2, 2, 2, 2, 2, 2, 2,

/* 11110yyy */
3, 3, 3, 3, 3, 3, 3, 3,     4, 4, 4, 4, 4, 4, 4, 4,
};

/*
** This table maps from the number of trailing bytes in a UTF-8 character
** to an integer constant that is effectively calculated for each character
** read by a naive implementation of a UTF-8 character reader. The code
** in the READ_UTF8 macro explains things best.
*/
static const int xtra_utf8_bits[] =  {
  0,
  12416,          /* (0xC0 << 6) + (0x80) */
  925824,         /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
  63447168        /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
};

/*
** If a UTF-8 character contains N bytes extra bytes (N bytes follow
** the initial byte so that the total character length is N+1) then
** masking the character with utf8_mask[N] must produce a non-zero
** result.  Otherwise, we have an (illegal) overlong encoding.
*/
static const int utf_mask[] = {
  0x00000000,
  0xffffff80,
  0xfffff800,
  0xffff0000,
};

#define READ_UTF8(zIn, c) { \
  int xtra;                                            \
  c = *(zIn)++;                                        \
  xtra = xtra_utf8_bytes[c];                           \
  switch( xtra ){                                      \
    case 4: c = (int)0xFFFD; break;                    \
    case 3: c = (c<<6) + *(zIn)++;                     \
    case 2: c = (c<<6) + *(zIn)++;                     \
    case 1: c = (c<<6) + *(zIn)++;                     \
    c -= xtra_utf8_bits[xtra];                         \
    if( (utf_mask[xtra]&c)==0                          \
        || (c&0xFFFFF800)==0xD800                      \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }    \
  }                                                    \
}
int sqlite3ReadUtf8(const unsigned char *z){
  int c;
  READ_UTF8(z, c);
  return c;
}

#define SKIP_UTF8(zIn) {                               \
  zIn += (xtra_utf8_bytes[*(u8 *)zIn] + 1);            \
}

#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (c&0xFF);                                \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + ((c>>6)&0x1F);                    \
................................................................................
  }
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, c); 
        WRITE_UTF16LE(z, c);
      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, c); 
        WRITE_UTF16BE(z, c);
      }
    }
    pMem->n = z - zOut;
    *z++ = 0;
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
................................................................................
/*
** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in pZ up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte of pZ (or up to 
** the first 0x00, whichever comes first).
*/
int sqlite3Utf8CharLen(const char *z, int nByte){
  int r = 0;
  const char *zTerm;

  if( nByte>=0 ){
    zTerm = &z[nByte];
  }else{
    zTerm = (const char *)(-1);
  }
  assert( z<=zTerm );
  while( *z!=0 && z<zTerm ){
    SKIP_UTF8(z);
    r++;
  }
  return r;
}

#ifndef SQLITE_OMIT_UTF16
/*
................................................................................
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  int c;

  while(1){
    READ_UTF8(zIn, c);
    if( c==0 ) break;
    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
  }
  *zOut = 0;
  return zOut - zStart;
................................................................................
  int n;
  unsigned int c;

  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = z-zBuf;

    z = zBuf;
    READ_UTF8(z, c);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16LE(z, i);
    n = z-zBuf;

    z = zBuf;
    READ_UTF16LE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16BE(z, i);
    n = z-zBuf;

    z = zBuf;
    READ_UTF16BE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
#endif /* SQLITE_OMIT_UTF16 */







|







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<












|
|
<

>
|
|
|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|






|







 







|

|
>



|



|







 







|







 







>

|











>










>








8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
..
30
31
32
33
34
35
36























37
38
39
40
41
42
43
44
45
46
47
48
49
50

51
52
53
54
55
56
57
58
59
60


















61



















































62
63
64
65
66
67
68
...
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
...
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
...
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
...
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8, 
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.47 2007/05/15 11:55:09 drh Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
................................................................................
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**























*/
#include "sqliteInt.h"
#include <assert.h>
#include "vdbeInt.h"

/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
*/
const int sqlite3one = 1;

/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.

*/
const unsigned char sqlite3UtfTrans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,


















};




















































#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (c&0xFF);                                \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + ((c>>6)&0x1F);                    \
................................................................................
  }
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        SQLITE_READ_UTF8(zIn, c); 
        WRITE_UTF16LE(z, c);
      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        SQLITE_READ_UTF8(zIn, c); 
        WRITE_UTF16BE(z, c);
      }
    }
    pMem->n = z - zOut;
    *z++ = 0;
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
................................................................................
/*
** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in pZ up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte of pZ (or up to 
** the first 0x00, whichever comes first).
*/
int sqlite3Utf8CharLen(const char *zIn, int nByte){
  int r = 0;
  const u8 *z = (const u8*)zIn;
  const u8 *zTerm;
  if( nByte>=0 ){
    zTerm = &z[nByte];
  }else{
    zTerm = (const u8*)(-1);
  }
  assert( z<=zTerm );
  while( *z!=0 && z<zTerm ){
    SQLITE_SKIP_UTF8(z);
    r++;
  }
  return r;
}

#ifndef SQLITE_OMIT_UTF16
/*
................................................................................
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  int c;

  while(1){
    SQLITE_READ_UTF8(zIn, c);
    if( c==0 ) break;
    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
  }
  *zOut = 0;
  return zOut - zStart;
................................................................................
  int n;
  unsigned int c;

  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = z-zBuf;
    z[0] = 0;
    z = zBuf;
    SQLITE_READ_UTF8(z, c);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16LE(z, i);
    n = z-zBuf;
    z[0] = 0;
    z = zBuf;
    READ_UTF16LE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16BE(z, i);
    n = z-zBuf;
    z[0] = 0;
    z = zBuf;
    READ_UTF16BE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
#endif /* SQLITE_OMIT_UTF16 */