/ Check-in [71260ff7]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Add code to convert between the various supported unicode encoding schemes. Untested at this point. (CVS 1315)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 71260ff7f7030f56c292b43f83a213c65c9a184e
User & Date: danielk1977 2004-05-06 23:37:52
Context
2004-05-07
01:50
Fix compilation problem in test5.c (CVS 1318) check-in: 49c3c86c user: danielk1977 tags: trunk
2004-05-06
23:37
Add code to convert between the various supported unicode encoding schemes. Untested at this point. (CVS 1315) check-in: 71260ff7 user: danielk1977 tags: trunk
2004-05-04
17:27
Update test3.c to work with the new btree.c API. (CVS 1314) check-in: bfb3234d user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to main.mk.

    51     51   # This is how we compile
    52     52   #
    53     53   TCCX = $(TCC) $(OPTS) $(THREADSAFE) $(USLEEP) -I. -I$(TOP)/src
    54     54   
    55     55   # Object files for the SQLite library.
    56     56   #
    57     57   LIBOBJ = hash.o os.o pager.o random.o \
    58         -         util.o tclsqlite.o
           58  +         util.o tclsqlite.o utf.o
    59     59   
    60     60   LIBOBJ_ORIG = attach.o auth.o btree.o btree_rb.o build.o copy.o date.o delete.o \
    61     61            expr.o func.o hash.o insert.o \
    62     62            main.o opcodes.o os.o pager.o parse.o pragma.o printf.o random.o \
    63     63            select.o table.o tokenize.o trigger.o update.o util.o \
    64     64            vacuum.o vdbe.o vdbeaux.o where.o tclsqlite.o
    65     65   
................................................................................
   117    117   
   118    118   # Source code to the test files.
   119    119   #
   120    120   TESTSRC = \
   121    121     $(TOP)/src/os.c \
   122    122     $(TOP)/src/pager.c \
   123    123     $(TOP)/src/test2.c \
          124  +  $(TOP)/src/test5.c \
   124    125     $(TOP)/src/md5.c
   125    126   
   126    127   TESTSRC_ORIG = \
   127    128     $(TOP)/src/btree.c \
   128    129     $(TOP)/src/func.c \
   129    130     $(TOP)/src/os.c \
   130    131     $(TOP)/src/pager.c \
................................................................................
   225    226   	  awk '{printf "#define %-30s %3d\n", $$2, ++cnt}' >>opcodes.h
   226    227   
   227    228   os.o:	$(TOP)/src/os.c $(HDR)
   228    229   	$(TCCX) -c $(TOP)/src/os.c
   229    230   
   230    231   parse.o:	parse.c $(HDR)
   231    232   	$(TCCX) -c parse.c
          233  +
          234  +utf.o:	$(TOP)/src/utf.c $(HDR)
          235  +	$(TCCX) -c $(TOP)/src/utf.c
   232    236   
   233    237   parse.h:	parse.c
   234    238   
   235    239   parse.c:	$(TOP)/src/parse.y lemon
   236    240   	cp $(TOP)/src/parse.y .
   237    241   	./lemon parse.y
   238    242   

Changes to src/sqliteInt.h.

     7      7   **    May you do good and not evil.
     8      8   **    May you find forgiveness for yourself and forgive others.
     9      9   **    May you share freely, never taking more than you give.
    10     10   **
    11     11   *************************************************************************
    12     12   ** Internal interface definitions for SQLite.
    13     13   **
    14         -** @(#) $Id: sqliteInt.h,v 1.221 2004/04/26 14:10:22 drh Exp $
           14  +** @(#) $Id: sqliteInt.h,v 1.222 2004/05/06 23:37:53 danielk1977 Exp $
    15     15   */
    16     16   #include "config.h"
    17     17   #include "sqlite.h"
    18     18   #include "hash.h"
    19     19   #include "parse.h"
    20     20   #include <stdio.h>
    21     21   #include <stdlib.h>
................................................................................
  1264   1264   int sqliteFixSelect(DbFixer*, Select*);
  1265   1265   int sqliteFixExpr(DbFixer*, Expr*);
  1266   1266   int sqliteFixExprList(DbFixer*, ExprList*);
  1267   1267   int sqliteFixTriggerStep(DbFixer*, TriggerStep*);
  1268   1268   double sqliteAtoF(const char *z, const char **);
  1269   1269   char *sqlite_snprintf(int,char*,const char*,...);
  1270   1270   int sqliteFitsIn32Bits(const char *);
         1271  +
         1272  +unsigned char *sqlite3utf16to8(const void *pData, int N);
         1273  +void *sqlite3utf8to16be(const unsigned char *pIn, int N);
         1274  +void *sqlite3utf8to16le(const unsigned char *pIn, int N);
         1275  +void sqlite3utf16to16le(void *pData, int N);
         1276  +void sqlite3utf16to16be(void *pData, int N);
         1277  +

Changes to src/tclsqlite.c.

     7      7   **    May you do good and not evil.
     8      8   **    May you find forgiveness for yourself and forgive others.
     9      9   **    May you share freely, never taking more than you give.
    10     10   **
    11     11   *************************************************************************
    12     12   ** A TCL Interface to SQLite
    13     13   **
    14         -** $Id: tclsqlite.c,v 1.60 2004/04/26 14:10:22 drh Exp $
           14  +** $Id: tclsqlite.c,v 1.61 2004/05/06 23:37:53 danielk1977 Exp $
    15     15   */
    16     16   #ifndef NO_TCL     /* Omit this whole file if TCL is unavailable */
    17     17   
    18     18   #include "sqliteInt.h"
    19     19   #include "tcl.h"
    20     20   #include <stdlib.h>
    21     21   #include <string.h>
................................................................................
  1204   1204     /* Sqlite_Init(interp); */
  1205   1205   #ifdef SQLITE_TEST
  1206   1206     {
  1207   1207       extern int Sqlitetest1_Init(Tcl_Interp*);
  1208   1208       extern int Sqlitetest2_Init(Tcl_Interp*);
  1209   1209       extern int Sqlitetest3_Init(Tcl_Interp*);
  1210   1210       extern int Sqlitetest4_Init(Tcl_Interp*);
         1211  +    extern int Sqlitetest5_Init(Tcl_Interp*);
  1211   1212       extern int Md5_Init(Tcl_Interp*);
  1212   1213       /* Sqlitetest1_Init(interp); */
  1213   1214       Sqlitetest2_Init(interp);
  1214   1215       /* Sqlitetest3_Init(interp); */
  1215   1216       /* Sqlitetest4_Init(interp); */
         1217  +    Sqlitetest5_Init(interp);
  1216   1218       Md5_Init(interp);
  1217   1219     }
  1218   1220   #endif
  1219   1221     if( argc>=2 ){
  1220   1222       int i;
  1221   1223       Tcl_SetVar(interp,"argv0",argv[1],TCL_GLOBAL_ONLY);
  1222   1224       Tcl_SetVar(interp,"argv", "", TCL_GLOBAL_ONLY);

Added src/test5.c.

            1  +/*
            2  +** 2001 September 15
            3  +**
            4  +** The author disclaims copyright to this source code.  In place of
            5  +** a legal notice, here is a blessing:
            6  +**
            7  +**    May you do good and not evil.
            8  +**    May you find forgiveness for yourself and forgive others.
            9  +**    May you share freely, never taking more than you give.
           10  +**
           11  +*************************************************************************
           12  +** Code for testing the utf.c module in SQLite.  This code
           13  +** is not included in the SQLite library.  It is used for automated
           14  +** testing of the SQLite library.
           15  +**
           16  +** $Id: 
           17  +*/
           18  +#include "sqliteInt.h"
           19  +#include "tcl.h"
           20  +#include <stdlib.h>
           21  +#include <string.h>
           22  +
           23  +/*
           24  +** Return the number of bytes up to and including the first \u0000 
           25  +** character in the UTF-16 string pZ.
           26  +*/
           27  +static int utf16_length(const unsigned char *pZ){
           28  +  const unsigned char *pC1 = pZ;
           29  +  const unsigned char *pC2 = pZ+1;
           30  +  while( *pC1 || *pC2 ){
           31  +    pC1 += 2;
           32  +    pC2 += 2;
           33  +  }
           34  +  return (pC1-pZ)+2;
           35  +}
           36  +
           37  +static int sqlite_utf8to16le(
           38  +  void * clientData,
           39  +  Tcl_Interp *interp,
           40  +  int objc,
           41  +  Tcl_Obj *CONST objv[]
           42  +){
           43  +  unsigned char *out;
           44  +  unsigned char *in;
           45  +  Tcl_Obj *res;
           46  +
           47  +  if( objc!=2 ){
           48  +    Tcl_AppendResult(interp, "wrong # args: should be \"",
           49  +        Tcl_GetStringFromObj(objv[0], 0), "<utf-8 encoded-string>", 0);
           50  +    return TCL_ERROR;
           51  +  }
           52  +
           53  +  in = Tcl_GetByteArrayFromObj(objv[1], 0);
           54  +  out = (unsigned char *)sqlite3utf8to16le(in, -1);
           55  +  res = Tcl_NewByteArrayObj(out, utf16_length(ret));
           56  +  sqliteFree(out);
           57  +
           58  +  Tcl_SetObjResult(interp, res);
           59  +
           60  +  return TCL_OK;
           61  +}
           62  +
           63  +static int sqlite_utf8to16be(
           64  +  void * clientData,
           65  +  Tcl_Interp *interp,
           66  +  int objc,
           67  +  Tcl_Obj *CONST objv[]
           68  +){
           69  +  unsigned char *out;
           70  +  unsigned char *in;
           71  +  Tcl_Obj *res;
           72  +
           73  +  if( objc!=2 ){
           74  +    Tcl_AppendResult(interp, "wrong # args: should be \"",
           75  +        Tcl_GetStringFromObj(objv[0], 0), "<utf-8 encoded-string>", 0);
           76  +    return TCL_ERROR;
           77  +  }
           78  +
           79  +  in = Tcl_GetByteArrayFromObj(objv[1], 0);
           80  +  out = (unsigned char *)sqlite3utf8to16be(in, -1);
           81  +  res = Tcl_NewByteArrayObj(out, utf16_length(ret));
           82  +  sqliteFree(out);
           83  +
           84  +  Tcl_SetObjResult(interp, res);
           85  +
           86  +  return TCL_OK;
           87  +}
           88  +
           89  +static int sqlite_utf16to16le(
           90  +  void * clientData,
           91  +  Tcl_Interp *interp,
           92  +  int objc,
           93  +  Tcl_Obj *CONST objv[]
           94  +){
           95  +  unsigned char *out;
           96  +  unsigned char *in;
           97  +  int in_len;
           98  +  Tcl_Obj *res;
           99  +
          100  +  if( objc!=2 ){
          101  +    Tcl_AppendResult(interp, "wrong # args: should be \"",
          102  +        Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0);
          103  +    return TCL_ERROR;
          104  +  }
          105  +
          106  +  in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
          107  +  out = (unsigned char *)sqliteMalloc(in_len);
          108  +  memcpy(out, in, in_len);
          109  +  
          110  +  sqlite3utf16to16le(out, -1);
          111  +  res = Tcl_NewByteArrayObj(out, utf16_length(ret));
          112  +  sqliteFree(out);
          113  +
          114  +  Tcl_SetObjResult(interp, res);
          115  +
          116  +  return TCL_OK;
          117  +}
          118  +
          119  +static int sqlite_utf16to16be(
          120  +  void * clientData,
          121  +  Tcl_Interp *interp,
          122  +  int objc,
          123  +  Tcl_Obj *CONST objv[]
          124  +){
          125  +  unsigned char *out;
          126  +  unsigned char *in;
          127  +  int in_len;
          128  +  Tcl_Obj *res;
          129  +
          130  +  if( objc!=2 ){
          131  +    Tcl_AppendResult(interp, "wrong # args: should be \"",
          132  +        Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0);
          133  +    return TCL_ERROR;
          134  +  }
          135  +
          136  +  in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
          137  +  out = (unsigned char *)sqliteMalloc(in_len);
          138  +  memcpy(out, in, in_len);
          139  +  
          140  +  sqlite3utf16to16be(out, -1);
          141  +  res = Tcl_NewByteArrayObj(out, utf16_length(ret));
          142  +  sqliteFree(out);
          143  +
          144  +  Tcl_SetObjResult(interp, res);
          145  +
          146  +  return TCL_OK;
          147  +}
          148  +
          149  +static int sqlite_utf16to8(
          150  +  void * clientData,
          151  +  Tcl_Interp *interp,
          152  +  int objc,
          153  +  Tcl_Obj *CONST objv[]
          154  +){
          155  +  unsigned char *out;
          156  +  unsigned char *in;
          157  +  Tcl_Obj *res;
          158  +
          159  +  if( objc!=2 ){
          160  +    Tcl_AppendResult(interp, "wrong # args: should be \"",
          161  +        Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0);
          162  +    return TCL_ERROR;
          163  +  }
          164  +
          165  +  in = Tcl_GetByteArrayFromObj(objv[1], 0);
          166  +  out = sqlite3utf16to8(in, -1);
          167  +  res = Tcl_NewByteArrayObj(out, strlen(ret));
          168  +  sqliteFree(out);
          169  +
          170  +  Tcl_SetObjResult(interp, res);
          171  +
          172  +  return TCL_OK;
          173  +}
          174  +
          175  +
          176  +/*
          177  +** Register commands with the TCL interpreter.
          178  +*/
          179  +int Sqlitetest5_Init(Tcl_Interp *interp){
          180  +  static struct {
          181  +    char *zName;
          182  +    Tcl_CmdProc *xProc;
          183  +  } aCmd[] = {
          184  +    { "sqlite_utf16to8",         (Tcl_CmdProc*)sqlite_utf16to8    },
          185  +    { "sqlite_utf8to16le",       (Tcl_CmdProc*)sqlite_utf8to16le  },
          186  +    { "sqlite_utf8to16be",       (Tcl_CmdProc*)sqlite_utf8to16be  },
          187  +    { "sqlite_utf16to16le",      (Tcl_CmdProc*)sqlite_utf16to16le },
          188  +    { "sqlite_utf16to16be",      (Tcl_CmdProc*)sqlite_utf16to16be }
          189  +  };
          190  +  int i;
          191  +  for(i=0; i<sizeof(aCmd)/sizeof(aCmd[0]); i++){
          192  +    Tcl_CreateCommand(interp, aCmd[i].zName, aCmd[i].xProc, 0, 0);
          193  +  }
          194  +
          195  +  return TCL_OK;
          196  +}

Changes to src/utf.c.

     8      8   **    May you find forgiveness for yourself and forgive others.
     9      9   **    May you share freely, never taking more than you give.
    10     10   **
    11     11   *************************************************************************
    12     12   ** This file contains routines used to translate between UTF-8, 
    13     13   ** UTF-16, UTF-16BE, and UTF-16LE.
    14     14   **
    15         -** $Id: utf.c,v 1.1 2004/05/04 15:00:47 drh Exp $
           15  +** $Id: utf.c,v 1.2 2004/05/06 23:37:53 danielk1977 Exp $
    16     16   **
    17     17   ** Notes on UTF-8:
    18     18   **
    19     19   **   Byte-0    Byte-1    Byte-2    Byte-3    Value
    20     20   **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
    21     21   **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
    22     22   **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
................................................................................
    24     24   **
    25     25   **
    26     26   ** Notes on UTF-16:  (with wwww+1==uuuuu)
    27     27   **
    28     28   **      Word-0            Word-1             Value
    29     29   **  110110wwwwxxxxxx 110111yyyyyyyyyy        000uuuuu xxxxxxyy yyyyyyyy
    30     30   **  xxxxxxxxyyyyyyyy                         00000000 xxxxxxxx yyyyyyyy
           31  +**
    31     32   **
    32     33   ** BOM or Byte Order Mark:
    33     34   **     0xff 0xfe   little-endian utf-16 follows
    34     35   **     0xfe 0xff   big-endian utf-16 follows
           36  +**
           37  +**
           38  +** Handling of malformed strings:
           39  +**
           40  +** SQLite accepts and processes malformed strings without an error wherever
           41  +** possible. However this is not possible when converting between UTF-8 and
           42  +** UTF-16.
           43  +**
           44  +** When converting malformed UTF-8 strings to UTF-16, one instance of the
           45  +** replacement character U+FFFD is emitted for each byte that cannot be
           46  +** interpreted as part of a valid unicode character.
           47  +**
           48  +** When converting malformed UTF-16 strings to UTF-8, one instance of the
           49  +** replacement character U+FFFD is emitted for each pair of bytes that
           50  +** cannot be interpreted as part of a valid unicode character.
           51  +*/
           52  +
           53  +#include <assert.h>
           54  +#include <unistd.h>
           55  +#include "sqliteInt.h"
           56  +
           57  +typedef struct UtfString UtfString;
           58  +struct UtfString {
           59  +  unsigned char *pZ;    /* Raw string data */
           60  +  int n;                /* Allocated length of pZ in bytes */
           61  +  int c;                /* Number of pZ bytes already read or written */
           62  +};
           63  +
           64  +/* TODO: Implement this macro in os.h. It should be 1 on big-endian
           65  +** machines, and 0 on little-endian.
           66  +*/
           67  +#define SQLITE3_NATIVE_BIGENDIAN 0
           68  +
           69  +#if SQLITE3_NATIVE_BIGENDIAN == 1
           70  +#define BOM_BIGENDIAN 0x0000FFFE
           71  +#define BOM_LITTLEENDIAN 0x0000FEFF
           72  +#else
           73  +#define BOM_BIGENDIAN 0x0000FEFF
           74  +#define BOM_LITTLEENDIAN 0x0000FFFE
           75  +#endif
           76  +
           77  +/*
           78  +** These two macros are used to interpret the first two bytes of the 
           79  +** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
           80  +** interpretation, LE16() for little-endian.
           81  +*/
           82  +#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
           83  +#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))
           84  +
           85  +/*
           86  +** READ_16 interprets the first two bytes of the unsigned char array pZ 
           87  +** as a 16-bit unsigned int. If big_endian is non-zero the interpretation
           88  +** is big-endian, otherwise little-endian.
           89  +*/
           90  +#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ))
           91  +
           92  +/*
           93  +** Read the BOM from the start of *pStr, if one is present. Return zero
           94  +** for little-endian, non-zero for big-endian. If no BOM is present, return
           95  +** the machine's native byte order.
           96  +**
           97  +** Return values:
           98  +**     1 -> big-endian string
           99  +**     0 -> little-endian string
          100  +*/
          101  +static int readUtf16Bom(UtfString *pStr){
          102  +  /* The BOM must be the first thing read from the string */
          103  +  assert( pStr->c==0 );
          104  +
          105  +  /* If the string data consists of 1 byte or less, the BOM will make no
          106  +  ** difference anyway. In this case just fall through to the default case
          107  +  ** and return the native byte-order for this machine.
          108  +  **
          109  +  ** Otherwise, check the first 2 bytes of the string to see if a BOM is
          110  +  ** present.
          111  +  */
          112  +  if( pStr->n>1 ){
          113  +    u32 bom = BE16(pStr->pZ);
          114  +    if( bom==BOM_BIGENDIAN ){
          115  +      pStr->c = 2;
          116  +      return 1;
          117  +    }
          118  +    if( bom==BOM_LITTLEENDIAN ){
          119  +      pStr->c = 2;
          120  +      return 0;
          121  +    }
          122  +  }
          123  +
          124  +  return SQLITE3_NATIVE_BIGENDIAN;
          125  +}
          126  +
          127  +
          128  +/*
          129  +** Read a single unicode character from the UTF-8 encoded string *pStr. The
          130  +** value returned is a unicode scalar value. In the case of malformed
          131  +** strings, the unicode replacement character U+FFFD may be returned.
          132  +*/
          133  +static u32 readUtf8(UtfString *pStr){
          134  +  struct Utf8TblRow {
          135  +    u8 b1_mask;
          136  +    u8 b1_masked_val;
          137  +    u8 b1_value_mask;
          138  +    int trailing_bytes;
          139  +  };
          140  +  static const struct Utf8TblRow utf8tbl[] = {
          141  +    { 0x80, 0x00, 0x7F, 0 },
          142  +    { 0xE0, 0xC0, 0x1F, 1 },
          143  +    { 0xF0, 0xE0, 0x0F, 2 },
          144  +    { 0xF8, 0xF0, 0x0E, 3 },
          145  +    { 0, 0, 0, 0}
          146  +  };
          147  +
          148  +  u8 b1;       /* First byte of the potentially multi-byte utf-8 character */
          149  +  u32 ret = 0; /* Return value */
          150  +  int ii;
          151  +  struct Utf8TblRow const *pRow;
          152  +
          153  +  pRow = &(utf8tbl[0]);
          154  +
          155  +  b1 = pStr->pZ[pStr->c];
          156  +  pStr->c++;
          157  +  while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
          158  +    pRow++;
          159  +  }
          160  +  if( !pRow->b1_mask ){
          161  +    return 0xFFFD;
          162  +  }
          163  +  
          164  +  ret = (u32)(b1&pRow->b1_value_mask);
          165  +  for( ii=0; ii<pRow->trailing_bytes; ii++ ){
          166  +    u8 b = pStr->pZ[pStr->c+ii];
          167  +    if( (b&0xC0)!=0x80 ){
          168  +      return 0xFFFD;
          169  +    }
          170  +    ret = (ret<<6) + (u32)(b&0x3F);
          171  +  }
          172  +  
          173  +  pStr->c += pRow->trailing_bytes;
          174  +  return ret;
          175  +}
          176  +
          177  +/*
          178  +** Write the unicode character 'code' to the string pStr using UTF-8
          179  +** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
          180  +*/
          181  +static int writeUtf8(UtfString *pStr, u32 code){
          182  +  struct Utf8WriteTblRow {
          183  +    u32 max_code;
          184  +    int trailing_bytes;
          185  +    u8 b1_and_mask;
          186  +    u8 b1_or_mask;
          187  +  };
          188  +  static const struct Utf8WriteTblRow utf8tbl[] = {
          189  +    {0x0000007F, 0, 0x7F, 0x00},
          190  +    {0x000007FF, 1, 0xDF, 0xC0},
          191  +    {0x0000FFFF, 2, 0xEF, 0xE0},
          192  +    {0x0010FFFF, 3, 0xF7, 0xF0},
          193  +    {0x00000000, 0, 0x00, 0x00}
          194  +  };
          195  +  static const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
          196  +
          197  +  while( code<=pRow->max_code ){
          198  +    assert( pRow->max_code );
          199  +    pRow++;
          200  +  }
          201  +
          202  +  /* Ensure there is enough room left in the output buffer to write
          203  +  ** this UTF-8 character. 
          204  +  */
          205  +  assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
          206  +
          207  +  /* Write the UTF-8 encoded character to pStr. All cases below are
          208  +  ** intentionally fall-through.
          209  +  */
          210  +  switch( pRow->trailing_bytes ){
          211  +    case 3:
          212  +      pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
          213  +      code = code>>6;
          214  +    case 2:
          215  +      pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
          216  +      code = code>>6;
          217  +    case 1:
          218  +      pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
          219  +      code = code>>6;
          220  +    case 0:
          221  +      pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
          222  +  }
          223  +  pStr->c += (pRow->trailing_bytes + 1);
          224  +
          225  +  return 0;
          226  +}
          227  +
          228  +/*
          229  +** Read a single unicode character from the UTF-16 encoded string *pStr. The
          230  +** value returned is a unicode scalar value. In the case of malformed
          231  +** strings, the unicode replacement character U+FFFD may be returned.
          232  +**
          233  +** If big_endian is true, the string is assumed to be UTF-16BE encoded.
          234  +** Otherwise, it is UTF-16LE encoded.
          235  +*/
          236  +static u32 readUtf16(UtfString *pStr, int big_endian){
          237  +  u32 code_point;   /* the first code-point in the character */
          238  +
          239  +  /* If there is only one byte of data left in the string, return the 
          240  +  ** replacement character.
          241  +  */
          242  +  if( (pStr->n-pStr->c)==1 ){
          243  +    pStr->c++;
          244  +    return (int)0xFFFD;
          245  +  }
          246  +
          247  +  code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
          248  +  pStr->c += 2;
          249  +
          250  +  /* If this is a non-surrogate code-point, just cast it to an int and
          251  +  ** return the code-point value.
          252  +  */
          253  +  if( code_point<0xD800 || code_point>0xE000 ){
          254  +    return code_point;
          255  +  }
          256  +
          257  +  /* If this is a trailing surrogate code-point, then the string is
          258  +  ** malformed; return the replacement character.
          259  +  */
          260  +  if( code_point>0xDBFF ){
          261  +    return 0xFFFD;
          262  +  }
          263  +
          264  +  /* The code-point just read is a leading surrogate code-point. If there
          265  +  ** is not enough data left or the next code-point is not a trailing
          266  +  ** surrogate, return the replacement character.
          267  +  */
          268  +  if( (pStr->n-pStr->c)>1 ){
          269  +    u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
          270  +    if( code_point2<0xDC00 || code_point>0xDFFF ){
          271  +      return 0xFFFD;
          272  +    }
          273  +    pStr->c += 2;
          274  +
          275  +    return ( 
          276  +        (((code_point&0x03C0)+0x0040)<<16) +   /* uuuuu */
          277  +        ((code_point&0x003F)<<10) +            /* xxxxxx */
          278  +        (code_point2&0x03FF)                   /* yy yyyyyyyy */
          279  +    );
          280  +
          281  +  }else{
          282  +    return (int)0xFFFD;
          283  +  }
          284  +  
          285  +  /* not reached */
          286  +}
          287  +
          288  +static int writeUtf16(UtfString *pStr, int code, int big_endian){
          289  +  int bytes;
          290  +  unsigned char *hi_byte;
          291  +  unsigned char *lo_byte;
          292  +
          293  +  bytes = (code>0x0000FFFF?4:2);
          294  +
          295  +  /* Ensure there is enough room left in the output buffer to write
          296  +  ** this UTF-16 character.
          297  +  */
          298  +  assert( (pStr->n-pStr->c)>=bytes );
          299  +  
          300  +  /* Initialise hi_byte and lo_byte to point at the locations into which
          301  +  ** the MSB and LSB of the (first) 16-bit unicode code-point written for
          302  +  ** this character.
          303  +  */
          304  +  hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
          305  +  lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
          306  +
          307  +  if( bytes==2 ){
          308  +    *hi_byte = (u8)((code&0x0000FF00)>>8);
          309  +    *lo_byte = (u8)(code&0x000000FF);
          310  +  }else{
          311  +    u32 wrd;
          312  +    wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
          313  +    *hi_byte = (u8)((wrd&0x0000FF00)>>8);
          314  +    *lo_byte = (u8)(wrd&0x000000FF);
          315  +
          316  +    wrd = (code&0x000003FF)|0x0000DC00;
          317  +    *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
          318  +    *(lo_byte+2) = (u8)(wrd&0x000000FF);
          319  +  }
          320  +
          321  +  pStr->c += bytes;
          322  +  
          323  +  return 0;
          324  +}
          325  +
          326  +/*
          327  +** Return the number of bytes up to (but not including) the first \u0000
          328  +** character in the UTF-16 string pZ.
    35    329   */
          330  +static int utf16Bytelen(const unsigned char *pZ){
          331  +  const unsigned char *pC1 = pZ;
          332  +  const unsigned char *pC2 = pZ+1;
          333  +  while( *pC1 || *pC2 ){
          334  +    pC1 += 2;
          335  +    pC2 += 2;
          336  +  }
          337  +  return pC1-pZ;
          338  +}
    36    339   
    37    340   /*
    38    341   ** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
    39    342   ** "BOM") into a UTF-8 string.  The UTF-8 string is written into space 
    40         -** obtained from sqlit3Malloc() and must be released by the calling function.
          343  +** obtained from sqlite3Malloc() and must be released by the calling function.
    41    344   **
    42    345   ** The parameter N is the number of bytes in the UTF-16 string.  If N is
    43    346   ** negative, the entire string up to the first \u0000 character is translated.
    44    347   **
    45    348   ** The returned UTF-8 string is always \000 terminated.
    46    349   */
    47    350   unsigned char *sqlite3utf16to8(const void *pData, int N){
    48         -  unsigned char *in = (unsigned char *)pData;
          351  +  UtfString in;
          352  +  UtfString out;
          353  +  int big_endian;
          354  +
          355  +  out.pZ = 0;
          356  +
          357  +  in.pZ = (unsigned char *)pData;
          358  +  in.n = N;
          359  +  in.c = 0;
          360  +
          361  +  if( in.n<0 ){
          362  +    in.n = utf16Bytelen(in.pZ);
          363  +  }
          364  +
          365  +  /* A UTF-8 encoding of a unicode string can require at most 1.5 times as
          366  +  ** much space to store as the same string encoded using UTF-16. Allocate
          367  +  ** this now.
          368  +  */
          369  +  out.n = (in.n*1.5) + 1;
          370  +  out.pZ = sqliteMalloc(in.n);
          371  +  if( !out.pZ ){
          372  +    return 0;
          373  +  }
          374  +  out.c = 0;
          375  +
          376  +  big_endian = readUtf16Bom(&in);
          377  +  while( in.c<in.n ){
          378  +    writeUtf8(&out, readUtf16(&in, big_endian));
          379  +  }
          380  +
          381  +  /* Add the NULL-terminator character */
          382  +  assert( out.c<out.n );
          383  +  out.pZ[out.c] = 0x00;
          384  +
          385  +  return out.pZ;
          386  +}
          387  +
          388  +static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){
          389  +  UtfString in;
          390  +  UtfString out;
          391  +
          392  +  in.pZ = (unsigned char *)pIn;
          393  +  in.n = N;
          394  +  in.c = 0;
          395  +
          396  +  if( in.n<0 ){
          397  +    in.n = strlen(in.pZ);
          398  +  }
          399  +
          400  +  /* A UTF-16 encoding of a unicode string can require at most twice as
          401  +  ** much space to store as the same string encoded using UTF-8. Allocate
          402  +  ** this now.
          403  +  */
          404  +  out.n = (in.n*2) + 2;
          405  +  out.pZ = sqliteMalloc(in.n);
          406  +  if( !out.pZ ){
          407  +    return 0;
          408  +  }
          409  +  out.c = 0;
          410  +
          411  +  while( in.c<in.n ){
          412  +    writeUtf16(&out, readUtf8(&in), big_endian);
          413  +  }
          414  +
          415  +  /* Add the NULL-terminator character */
          416  +  assert( (out.c+1)<out.n );
          417  +  out.pZ[out.c] = 0x00;
          418  +  out.pZ[out.c+1] = 0x00;
          419  +
          420  +  return out.pZ;
          421  +}
          422  +
          423  +/*
          424  +** Translate UTF-8 to UTF-16BE or UTF-16LE
          425  +*/
          426  +void *sqlite3utf8to16be(const unsigned char *pIn, int N){
          427  +  return utf8toUtf16(pIn, N, 1);
          428  +}
          429  +
          430  +void *sqlite3utf8to16le(const unsigned char *pIn, int N){
          431  +  return utf8toUtf16(pIn, N, 0);
          432  +}
          433  +
          434  +/* 
          435  +** This routine does the work for sqlite3utf16to16le() and
          436  +** sqlite3utf16to16be(). If big_endian is 1 the input string is
          437  +** transformed in place to UTF-16BE encoding. If big_endian is 0 then
          438  +** the input is transformed to UTF-16LE.
          439  +**
          440  +** Unless the first two bytes of the input string is a BOM, the input is
          441  +** assumed to be UTF-16 encoded using the machines native byte ordering.
          442  +*/
          443  +static void utf16to16(void *pData, int N, int big_endian){
          444  +  UtfString inout;
          445  +  inout.pZ = (unsigned char *)pData;
          446  +  inout.c = 0;
          447  +  inout.n = N;
          448  +
          449  +  if( inout.n<0 ){
          450  +    inout.n = utf16Bytelen(inout.pZ);
          451  +  }
          452  +
          453  +  if( readUtf16Bom(&inout)!=big_endian ){
          454  +    swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c);
          455  +  }else if( inout.c ){
          456  +    memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
          457  +  }
    49    458   }
    50    459   
    51    460   /*
    52    461   ** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE
    53    462   ** string.  The conversion occurs in-place.  The output overwrites the
    54    463   ** input.  N bytes are converted.  If N is negative everything is converted
    55    464   ** up to the first \u0000 character.
    56    465   **
    57    466   ** If the native byte order is little-endian and there is no BOM, then
    58    467   ** this routine is a no-op.  If there is a BOM at the start of the string,
    59    468   ** it is removed.
          469  +**
          470  +** Translation from UTF-16LE to UTF-16BE and back again is accomplished
          471  +** using the library function swab().
    60    472   */
    61    473   void sqlite3utf16to16le(void *pData, int N){
    62         -}
    63         -void sqlite3utf16to16be(void *pData, int N){
          474  +  utf16to16(pData, N, 0);
    64    475   }
    65    476   
    66    477   /*
          478  +** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE
          479  +** string.  The conversion occurs in-place.  The output overwrites the
          480  +** input.  N bytes are converted.  If N is negative everything is converted
          481  +** up to the first \u0000 character.
          482  +**
          483  +** If the native byte order is little-endian and there is no BOM, then
          484  +** this routine is a no-op.  If there is a BOM at the start of the string,
          485  +** it is removed.
          486  +**
    67    487   ** Translation from UTF-16LE to UTF-16BE and back again is accomplished
    68    488   ** using the library function swab().
    69    489   */
    70         -
    71         -/*
    72         -** Translate UTF-8 to UTF-16BE or UTF-16LE
    73         -*/
    74         -void *sqlite3utf8to16be(const unsigned char *pIn, int N){
          490  +void sqlite3utf16to16be(void *pData, int N){
          491  +  utf16to16(pData, N, 1);
    75    492   }
    76         -void *sqlite3utf8to16le(const unsigned char *pIn, int N){
    77         -}
          493  +