SQLite

Check-in [b29a81b4fb]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add interface to configure SQLite to use ICU collation functions. (CVS 3936)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: b29a81b4fbb926fa09186340342848b9fe589033
User & Date: danielk1977 2007-05-07 11:53:14.000
Context
2007-05-07
13:11
Fix typo in Makefile.in. Ticket #2343 (CVS 3937) (check-in: db51f59a7b user: drh tags: trunk)
11:53
Add interface to configure SQLite to use ICU collation functions. (CVS 3936) (check-in: b29a81b4fb user: danielk1977 tags: trunk)
11:24
Change sqlite3_snprintf() so that it does not write a zero-terminator if the buffer size argument is less than 1. Ticket #2341. Added documentation about the sqlite3_snprintf() function. (CVS 3935) (check-in: f3ae4ac5fe user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/icu/icu.c.
17
18
19
20
21
22
23

24
25
26
27
28
29
30
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)

#include <unicode/utypes.h>
#include <unicode/uregex.h>
#include <unicode/ustring.h>


#include <assert.h>
#include "sqlite3.h"

#ifndef SQLITE_CORE
  #include "sqlite3ext.h"
  SQLITE_EXTENSION_INIT1







>







17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)

#include <unicode/utypes.h>
#include <unicode/uregex.h>
#include <unicode/ustring.h>
#include <unicode/ucol.h>

#include <assert.h>
#include "sqlite3.h"

#ifndef SQLITE_CORE
  #include "sqlite3ext.h"
  SQLITE_EXTENSION_INIT1
46
47
48
49
50
51
52


















53
54
55
56
57
58
59
}

/*
** LIKE operator.
**
** http://unicode.org/reports/tr21/tr21-5.html#Caseless_Matching
*/



















/*
** Function to delete compiled regexp objects. Registered as
** a destructor function with sqlite3_set_auxdata().
*/
static void icuRegexpDelete(void *p){
  URegularExpression *pExpr = (URegularExpression *)p;







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
}

/*
** LIKE operator.
**
** http://unicode.org/reports/tr21/tr21-5.html#Caseless_Matching
*/

/*
** This function is called when an ICU function called from within
** the implementation of an SQL scalar function returns an error.
**
** The scalar function context passed as the first argument is 
** loaded with an error message based on the following two args.
*/
static void icuFunctionError(
  sqlite3_context *pCtx,       /* SQLite scalar function context */
  const char *zName,           /* Name of ICU function that failed */
  UErrorCode e                 /* Error code returned by ICU function */
){
  char zBuf[128];
  sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
  zBuf[127] = '\0';
  sqlite3_result_error(pCtx, zBuf, -1);
}

/*
** Function to delete compiled regexp objects. Registered as
** a destructor function with sqlite3_set_auxdata().
*/
static void icuRegexpDelete(void *p){
  URegularExpression *pExpr = (URegularExpression *)p;
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
    }
    pExpr = uregex_open(zPattern, -1, 0, 0, &status);

    if( U_SUCCESS(status) ){
      sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
    }else{
      assert(!pExpr);
      sqlite3_result_error(p, "Error compiling regular expression", -1);
      return;
    }
  }

  /* Configure the text that the regular expression operates on. */
  uregex_setText(pExpr, zString, -1, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_result_error(p, "Error configuring regular expression", -1);
    return;
  }

  /* Attempt the match */
  res = uregex_matches(pExpr, 0, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_result_error(p, "Error matching regular expression", -1);
    return;
  }

  /* Set the text that the regular expression operates on to a NULL
  ** pointer. This is not really necessary, but it is tidier than 
  ** leaving the regular expression object configured with an invalid
  ** pointer after this function returns.







|







|






|







119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
    }
    pExpr = uregex_open(zPattern, -1, 0, 0, &status);

    if( U_SUCCESS(status) ){
      sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
    }else{
      assert(!pExpr);
      icuFunctionError(p, "uregex_open", status);
      return;
    }
  }

  /* Configure the text that the regular expression operates on. */
  uregex_setText(pExpr, zString, -1, &status);
  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "uregex_setText", status);
    return;
  }

  /* Attempt the match */
  res = uregex_matches(pExpr, 0, &status);
  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "uregex_matches", status);
    return;
  }

  /* Set the text that the regular expression operates on to a NULL
  ** pointer. This is not really necessary, but it is tidier than 
  ** leaving the regular expression object configured with an invalid
  ** pointer after this function returns.
186
187
188
189
190
191
192
193
194
195
196
197
198

















































































199
200
201
202
203
204
205
  if( sqlite3_user_data(p) ){
    u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
  }else{
    u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
  }

  if( !U_SUCCESS(status) ){
    sqlite3_result_error(p, "Error converting case", -1);
    return;
  }

  sqlite3_result_text16(p, zOutput, -1, xFree);
}


















































































/*
** Register the ICU extension functions with database db.
*/
int sqlite3IcuInit(sqlite3 *db){
  struct IcuScalar {
    const char *zName;                        /* Function name */







|





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
  if( sqlite3_user_data(p) ){
    u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
  }else{
    u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
  }

  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
    return;
  }

  sqlite3_result_text16(p, zOutput, -1, xFree);
}

/*
** Collation sequence destructor function. The pCtx argument points to
** a UCollator structure previously allocated using ucol_open().
*/
static void icuCollationDel(void *pCtx){
  UCollator *p = (UCollator *)pCtx;
  ucol_close(p);
}

/*
** Collation sequence comparison function. The pCtx argument points to
** a UCollator structure previously allocated using ucol_open().
*/
static int icuCollationColl(
  void *pCtx,
  int nLeft,
  const void *zLeft,
  int nRight,
  const void *zRight
){
  UCollationResult res;
  UCollator *p = (UCollator *)pCtx;
  res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
  switch( res ){
    case UCOL_LESS:    return -1;
    case UCOL_GREATER: return +1;
    case UCOL_EQUAL:   return 0;
  }
  assert(!"Bad return value from ucol_strcoll()");
  return 0;
}

/*
** Implementation of the scalar function icu_load_collation().
**
** This scalar function is used to add ICU collation based collation 
** types to an SQLite database connection. It is intended to be called
** as follows:
**
**     SELECT icu_load_collation(<locale>, <collation-name>);
**
** Where <locale> is a string containing an ICU locale identifier (i.e.
** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
** collation sequence to create.
*/
static void icuLoadCollation(
  sqlite3_context *p, 
  int nArg, 
  sqlite3_value **apArg
){
  sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
  UErrorCode status = U_ZERO_ERROR;
  const char *zLocale;      /* Locale identifier - (eg. "jp_JP") */
  const char *zName;        /* SQL Collation sequence name (eg. "japanese") */
  UCollator *pUCollator;    /* ICU library collation object */
  int rc;                   /* Return code from sqlite3_create_collation_x() */

  assert(nArg==2);
  zLocale = (const char *)sqlite3_value_text(apArg[0]);
  zName = (const char *)sqlite3_value_text(apArg[1]);

  if( !zLocale || !zName ){
    return;
  }

  pUCollator = ucol_open(zLocale, &status);
  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "ucol_open", status);
    return;
  }
  assert(p);

  rc = sqlite3_create_collation_x(db, zName, SQLITE_UTF16, (void *)pUCollator, 
      icuCollationColl, icuCollationDel
  );
  if( rc!=SQLITE_OK ){
    ucol_close(pUCollator);
    sqlite3_result_error(p, "Error registering collation function", -1);
  }
}

/*
** Register the ICU extension functions with database db.
*/
int sqlite3IcuInit(sqlite3 *db){
  struct IcuScalar {
    const char *zName;                        /* Function name */
215
216
217
218
219
220
221


222
223
224
225
226
227
228
    {"upper",  1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
    {"upper",  2, SQLITE_UTF16, (void*)1, icuCaseFunc16},

    {"lower",  1, SQLITE_UTF8,         0, icuCaseFunc16},
    {"lower",  2, SQLITE_UTF8,         0, icuCaseFunc16},
    {"upper",  1, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
    {"upper",  2, SQLITE_UTF8,  (void*)1, icuCaseFunc16},


  };

  int rc = SQLITE_OK;
  int i;

  for(i=0; rc==SQLITE_OK && i<(sizeof(scalars)/sizeof(struct IcuScalar)); i++){
    struct IcuScalar *p = &scalars[i];







>
>







315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
    {"upper",  1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
    {"upper",  2, SQLITE_UTF16, (void*)1, icuCaseFunc16},

    {"lower",  1, SQLITE_UTF8,         0, icuCaseFunc16},
    {"lower",  2, SQLITE_UTF8,         0, icuCaseFunc16},
    {"upper",  1, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
    {"upper",  2, SQLITE_UTF8,  (void*)1, icuCaseFunc16},

    {"icu_load_collation",  2, SQLITE_UTF8, (void*)db, icuLoadCollation},
  };

  int rc = SQLITE_OK;
  int i;

  for(i=0; rc==SQLITE_OK && i<(sizeof(scalars)/sizeof(struct IcuScalar)); i++){
    struct IcuScalar *p = &scalars[i];
Added test/icu.test.












































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# 2007 May 1
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# $Id: icu.test,v 1.1 2007/05/07 11:53:14 danielk1977 Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

ifcapable !icu {
  finish_test
  return
}

# Create a table to work with.
#
execsql {CREATE TABLE test1(i1 int, i2 int, r1 real, r2 real, t1 text, t2 text)}
execsql {INSERT INTO test1 VALUES(1,2,1.1,2.2,'hello','world')}
proc test_expr {name settings expr result} {
  do_test $name [format {
    db one {
      BEGIN; 
      UPDATE test1 SET %s; 
      SELECT %s FROM test1; 
      ROLLBACK;
    }
  } $settings $expr] $result
}

# Tests of the REGEXP operator.
#
test_expr icu-1.1 {i1='hello'} {i1 REGEXP 'hello'}  1
test_expr icu-1.2 {i1='hello'} {i1 REGEXP '.ello'}  1
test_expr icu-1.3 {i1='hello'} {i1 REGEXP '.ell'}   0
test_expr icu-1.4 {i1='hello'} {i1 REGEXP '.ell.*'} 1
test_expr icu-1.5 {i1=NULL}    {i1 REGEXP '.ell.*'} {}

# Some non-ascii characters with defined case mappings
#
set ::EGRAVE "\xC8"
set ::egrave "\xE8"

set ::OGRAVE "\xD2"
set ::ograve "\xF2"

# That German letter that looks a bit like a B. The
# upper-case version of which is "SS" (two characters).
#
set ::szlig "\xDF" 

# Tests of the upper()/lower() functions.
#
test_expr icu-2.1 {i1='HellO WorlD'} {upper(i1)} {HELLO WORLD}
test_expr icu-2.2 {i1='HellO WorlD'} {lower(i1)} {hello world}
test_expr icu-2.3 {i1=$::egrave} {lower(i1)}     $::egrave
test_expr icu-2.4 {i1=$::egrave} {upper(i1)}     $::EGRAVE
test_expr icu-2.5 {i1=$::ograve} {lower(i1)}     $::ograve
test_expr icu-2.6 {i1=$::ograve} {upper(i1)}     $::OGRAVE
test_expr icu-2.3 {i1=$::EGRAVE} {lower(i1)}     $::egrave
test_expr icu-2.4 {i1=$::EGRAVE} {upper(i1)}     $::EGRAVE
test_expr icu-2.5 {i1=$::OGRAVE} {lower(i1)}     $::ograve
test_expr icu-2.6 {i1=$::OGRAVE} {upper(i1)}     $::OGRAVE

test_expr icu-2.7 {i1=$::szlig} {upper(i1)}      "SS"
test_expr icu-2.8 {i1='SS'} {lower(i1)}          "ss"

# In turkish (locale="tr_TR"), the lower case version of I
# is "small dotless i" (code point 0x131 (decimal 305)).
#
set ::small_dotless_i "\u0131"
test_expr icu-3.1 {i1='I'} {lower(i1)}           "i"
test_expr icu-3.2 {i1='I'} {lower(i1, 'tr_tr')}  $::small_dotless_i
test_expr icu-3.3 {i1='I'} {lower(i1, 'en_AU')}  "i"

#--------------------------------------------------------------------
# Test the collation sequence function.
#
do_test icu-4.1 {
  execsql {
    CREATE TABLE fruit(name);
    INSERT INTO fruit VALUES('plum');
    INSERT INTO fruit VALUES('cherry');
    INSERT INTO fruit VALUES('apricot');
    INSERT INTO fruit VALUES('peach');
    INSERT INTO fruit VALUES('chokecherry');
    INSERT INTO fruit VALUES('yamot');
  }
} {}
do_test icu-4.2 {
  execsql {
    SELECT icu_load_collation('en_US', 'AmericanEnglish');
    SELECT icu_load_collation('lt_LT', 'Lithuanian');
  }
  execsql {
    SELECT name FROM fruit ORDER BY name COLLATE AmericanEnglish ASC;
  }
} {apricot cherry chokecherry peach plum yamot}


# Test collation using Lithuanian rules. In the Lithuanian
# alphabet, "y" comes right after "i".
#
do_test icu-4.3 {
  execsql {
    SELECT name FROM fruit ORDER BY name COLLATE Lithuanian ASC;
  }
} {apricot cherry chokecherry yamot peach plum}

finish_test