SQLite

Check-in [e240d467e6]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add documentation for tokenizer api to fts5.h. Also add a script to extract extension API docs and format them as html.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: e240d467e60b7755486aae5e8b0824f7c741f852
User & Date: dan 2014-08-25 19:58:54.559
Context
2014-11-15
20:07
Fix the customization interfaces so that they match the documentation. (check-in: fba0b5fc7e user: dan tags: fts5)
2014-08-25
19:58
Add documentation for tokenizer api to fts5.h. Also add a script to extract extension API docs and format them as html. (check-in: e240d467e6 user: dan tags: fts5)
2014-08-18
19:30
Add an "automerge=0" mode that disables auto-merging and falls back to fts4-style crisis merges. (check-in: 2397404e15 user: dan tags: fts5)
Changes
Unified Diff Ignore Whitespace Patch
Added ext/fts5/extract_api_docs.tcl.






































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#
# 2014 August 24
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#--------------------------------------------------------------------------
#
# This script extracts the documentation for the API used by fts5 auxiliary 
# functions from header file fts5.h. It outputs html text on stdout that
# is included in the documentation on the web.
# 

# Locate fts5.h in the same directory as this script and load its entire
# contents into the global variable $data.
set input_file [file join [file dirname [info script]] fts5.h]
set fd [open $input_file r]
set data [read $fd]
close $fd


# Argument $data is the entire text of the fts5.h file. This function 
# extracts the definition of the Fts5ExtensionApi structure from it and
# returns a key/value list of structure member names and definitions. i.e.
#
#   iVersion {int iVersion} xUserData {void *(*xUserData)(Fts5Context*)} ...
#
# An error is raised if $data contains no Fts5ExtensionApi definition.
#
proc get_struct_members {data} {

  # Extract the structure definition from the fts5.h file. The match must
  # be non-greedy: fts5.h contains other structures that follow this one,
  # and a greedy match would run on to the last "};" in the file and pull
  # their members (e.g. fts5_tokenizer.xTokenize) into the result.
  if {![regexp "struct Fts5ExtensionApi {(.*?)};" $data -> defn]} {
    error "definition of struct Fts5ExtensionApi not found"
  }

  # Remove all comments from the structure definition
  regsub -all {/[*].*?[*]/} $defn {} defn2

  set res [list]
  foreach member [split $defn2 {;}] {
    set member [string trim $member]
    if {$member eq ""} continue

    # For a plain data member the name is the last word of the declaration.
    # [lindex] may throw if the declaration is not a well-formed Tcl list,
    # hence the [catch]. Reset $name first so that a failure here cannot
    # silently reuse the name of the previous member.
    set name ""
    catch { set name [lindex $member end] }

    # For a function pointer member, the name is whatever appears inside
    # the first "(*...)" group. This overrides the guess made above.
    regexp {.*?[(][*]([^)]*)[)]} $member -> name

    if {$name eq ""} continue
    lappend res $name $member
  }

  return $res
}

# Argument $data is the entire text of the fts5.h file and $names is a list
# of Fts5ExtensionApi member names. This function extracts the text between
# the "EXTENSION API FUNCTIONS" marker and the end of the comment containing
# it, and returns it as a key/value list of section headers and section
# bodies. If the first word of a header line is one of $names, the header
# is reduced to that name. Otherwise the trimmed header line is used.
#
# An empty list is returned if the documentation comment contains no
# sections. An error is raised if the marker is not present in $data.
#
proc get_struct_docs {data names} {
  # Extract the documentation comment text from the fts5.h file.
  if {![regexp {EXTENSION API FUNCTIONS(.*?)[*]/} $data -> docs]} {
    error "\"EXTENSION API FUNCTIONS\" documentation not found"
  }

  set res            [list]
  set current_doc    ""
  set current_header ""

  foreach line [split $docs "\n"] {
    # Strip the leading run of "*" characters (the comment decoration).
    regsub {[*]*} $line {} line
    if {[regexp {^  } $line]} {
      # Lines indented by two or more spaces are section body text.
      append current_doc "$line\n"
    } elseif {[string trim $line]==""} {
      # Blank lines are preserved, but only once a section has started.
      if {$current_header!=""} { append current_doc "\n" }
    } else {
      # Anything else is a header line. Flush the previous section, if any.
      if {$current_doc != ""} {
        lappend res $current_header $current_doc
        set current_doc ""
      }
      set subject n/a
      regexp {^ *([[:alpha:]]*)} $line -> subject
      if {[lsearch -exact $names $subject]>=0} {
        set current_header $subject
      } else {
        set current_header [string trim $line]
      }
    }
  }

  if {$current_doc != ""} {
    lappend res $current_header $current_doc
  }

  return $res
}

# Initialize global array M as a map from Fts5ExtensionApi member name
# to member definition. i.e.
#
#   iVersion  -> {int iVersion}
#   xUserData -> {void *(*xUserData)(Fts5Context*)}
#   ...
#
array set M [get_struct_members $data]

# Global list D holds alternating section names and documentation text.
# Section names are usually (always?) names of structure members.
#
set D [get_struct_docs $data [array names M]]

foreach {hdr docs} $D {
  # When the section is named after a structure member, use the member's
  # full declaration as the heading instead of the bare name.
  if {[info exists M($hdr)]} {
    set hdr $M($hdr)
  }
  puts "<h3><pre>  $hdr</pre></h3>"

  # $mode is the name of the currently open HTML element ("p" or "code"),
  # or an empty string if no element is open. bEmpty is set here but is not
  # read anywhere in this script.
  set mode ""
  set bEmpty 1
  foreach line [split [string trim $docs] "\n"] {
    if {[string trim $line] eq ""} {
      # A blank line terminates the current paragraph or code block.
      if {$mode ne ""} {
        puts "</$mode>"
      }
      set mode ""
    } elseif {$mode eq ""} {
      # First line of a new block. Five or more leading spaces selects
      # <code>, anything else selects <p>.
      set mode [expr {[regexp {^     } $line] ? "code" : "p"}]
      puts "<$mode>"
    }
    puts $line
  }
  if {$mode ne ""} {
    puts "</$mode>"
  }
}







Changes to ext/fts5/fts5.h.
38
39
40
41
42
43
44
45
46

47
48
49
50
51
52
53
54
55
56
57
58
59
  Fts5Context *pFts,              /* First arg to pass to pApi functions */
  sqlite3_context *pCtx,          /* Context for returning result/error */
  int nVal,                       /* Number of values in apVal[] array */
  sqlite3_value **apVal           /* Array of trailing arguments */
);

/*
** xUserData(pFts):
**

**   Return a copy of the context pointer the extension function was 
**   registered with.
**
**
** xColumnTotalSize(pFts, iCol, pnToken):
**
**   Returns the total number of tokens in column iCol, considering all
**   rows in the FTS5 table.
**
**
** xColumnCount:
**   Returns the number of columns in the FTS5 table.
**







|

>





<







38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

53
54
55
56
57
58
59
  Fts5Context *pFts,              /* First arg to pass to pApi functions */
  sqlite3_context *pCtx,          /* Context for returning result/error */
  int nVal,                       /* Number of values in apVal[] array */
  sqlite3_value **apVal           /* Array of trailing arguments */
);

/*
** EXTENSION API FUNCTIONS
**
** xUserData(pFts):
**   Return a copy of the context pointer the extension function was 
**   registered with.
**
**
** xColumnTotalSize(pFts, iCol, pnToken):

**   Returns the total number of tokens in column iCol, considering all
**   rows in the FTS5 table.
**
**
** xColumnCount:
**   Returns the number of columns in the FTS5 table.
**
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
**   At EOF, a non-zero value is returned and output variable iPos set to -1.
**
** xTokenize:
**   Tokenize text using the tokenizer belonging to the FTS5 table.
**
**
** xQueryPhrase(pFts5, iPhrase, pUserData, xCallback):
**
**   This API function is used to query the FTS table for phrase iPhrase
**   of the current query. Specifically, a query equivalent to:
**
**       ... FROM ftstable WHERE ftstable MATCH $p ORDER BY DESC
**
**   with $p set to a phrase equivalent to the phrase iPhrase of the
**   current query is executed. For each row visited, the callback function







<







79
80
81
82
83
84
85

86
87
88
89
90
91
92
**   At EOF, a non-zero value is returned and output variable iPos set to -1.
**
** xTokenize:
**   Tokenize text using the tokenizer belonging to the FTS5 table.
**
**
** xQueryPhrase(pFts5, iPhrase, pUserData, xCallback):

**   This API function is used to query the FTS table for phrase iPhrase
**   of the current query. Specifically, a query equivalent to:
**
**       ... FROM ftstable WHERE ftstable MATCH $p ORDER BY DESC
**
**   with $p set to a phrase equivalent to the phrase iPhrase of the
**   current query is executed. For each row visited, the callback function
175
176
177
178
179
180
181



















































































182
183

#define FTS5_POS2COLUMN(iPos) (int)(iPos >> 32)
#define FTS5_POS2OFFSET(iPos) (int)(iPos & 0xFFFFFFFF)

/* 
** CUSTOM AUXILIARY FUNCTIONS
*************************************************************************/



















































































#endif /* _FTS5_H */








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

#define FTS5_POS2COLUMN(iPos) (int)(iPos >> 32)
#define FTS5_POS2OFFSET(iPos) (int)(iPos & 0xFFFFFFFF)

/* 
** CUSTOM AUXILIARY FUNCTIONS
*************************************************************************/

/*************************************************************************
** CUSTOM TOKENIZERS
**
** Applications may also register custom tokenizer types. A tokenizer 
** is registered by providing fts5 with a populated instance of the 
** following structure. The structure methods are expected to function
** as follows:
**
** xCreate:
**   This function is used to allocate and initialize a tokenizer instance.
**   A tokenizer instance is required to actually tokenize text.
**
**   The first argument passed to this function is a copy of the (void*)
**   pointer provided by the application when the fts5_tokenizer object
**   was registered with SQLite. The second and third arguments are an
**   array of nul-terminated strings containing the tokenizer arguments,
**   if any, specified as part of the CREATE VIRTUAL TABLE statement used
**   to create the fts5 table.
**
**   The final argument is an output variable. If successful, (*ppOut) 
**   should be set to point to the new tokenizer handle and SQLITE_OK
**   returned. If an error occurs, some value other than SQLITE_OK should
**   be returned. In this case, fts5 assumes that the final value of *ppOut 
**   is undefined.
**
** xDelete:
**   This function is invoked to delete a tokenizer handle previously
**   allocated using xCreate(). Fts5 guarantees that this function will
**   be invoked exactly once for each successful call to xCreate().
**
** xTokenize:
**   This function is expected to tokenize the nText byte string indicated 
**   by argument pText. pText may not be nul-terminated. The first argument
**   passed to this function is a pointer to an Fts5Tokenizer object returned 
**   by an earlier call to xCreate().
**
**   For each token in the input string, the supplied callback xToken() must
**   be invoked. The first argument to it should be a copy of the pointer
**   passed as the second argument to xTokenize(). The next two arguments
**   are a pointer to a buffer containing the token text, and the size of
**   the token in bytes. The 4th and 5th arguments are the byte offsets of
**   the first byte of and first byte immediately following the text from 
**   which the token is derived within the input. The final argument is the
**   token position - the total number of tokens that appear before this one 
**   in the input buffer.
**
**   The xToken() callback must be invoked with non-decreasing values of
**   the iPos parameter.
**
**   If an xToken() callback returns any value other than SQLITE_OK, then
**   the tokenization should be abandoned and the xTokenize() method should
**   immediately return a copy of the xToken() return value. Or, if the
**   input buffer is exhausted, xTokenize() should return SQLITE_OK. Finally,
**   if an error occurs with the xTokenize() implementation itself, it
**   may abandon the tokenization and return any error code other than
**   SQLITE_OK or SQLITE_DONE.
**
*/
/*
** fts5_tokenizer is the method table an application populates in order to
** register a custom tokenizer type. Fts5Tokenizer is the type of a tokenizer
** handle produced by xCreate(); it is only forward-declared here (no
** definition is visible in this part of the header).
*/
typedef struct fts5_tokenizer fts5_tokenizer;
typedef struct Fts5Tokenizer Fts5Tokenizer;

struct fts5_tokenizer {
  /* Allocate and initialize a tokenizer instance. The first argument is a
  ** copy of the (void*) pointer supplied when the fts5_tokenizer object was
  ** registered; azArg/nArg are the tokenizer arguments from the CREATE
  ** VIRTUAL TABLE statement. On success, set *ppOut to the new handle and
  ** return SQLITE_OK. On error, return some other value. */
  int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut);

  /* Delete a tokenizer handle previously allocated by xCreate(). */
  void (*xDelete)(Fts5Tokenizer*);

  /* Tokenize the nText byte string pText (which may not be nul-terminated)
  ** using a handle returned by xCreate(). xToken() must be invoked once for
  ** each token, with non-decreasing iPos values. If xToken() returns
  ** anything other than SQLITE_OK, tokenization is abandoned and that
  ** value is returned. */
  int (*xTokenize)(Fts5Tokenizer*, 
      void *pCtx,
      const char *pText, int nText, 
      int (*xToken)(
        void *pCtx,         /* Copy of 2nd argument to xTokenize() */
        const char *pToken, /* Pointer to buffer containing token */
        int nToken,         /* Size of token in bytes */
        int iStart,         /* Byte offset of token within input text */
        int iEnd,           /* Byte offset of end of token within input text */
        int iPos            /* Position of token in input (first token is 0) */
      )
  );
};

/*
** END OF CUSTOM TOKENIZERS
*************************************************************************/

#endif /* _FTS5_H */