/*
** 2011 Jun 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file is not part of the production FTS code. It is only used for
** testing. It contains a Tcl command that can be used to test whether a
** document matches an FTS NEAR expression.
**
** As of March 2012, it also contains a version 1 tokenizer used for testing
** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
*/

#include <tcl.h>
#include <string.h>
#include <assert.h>

#if defined(SQLITE_TEST)
#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)

/* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
#include "fts3Int.h"

#define NM_MAX_TOKEN 12

typedef struct NearPhrase NearPhrase;
typedef struct NearDocument NearDocument;
typedef struct NearToken NearToken;

struct NearDocument {
  int nToken;                     /* Number of tokens in the document */
  NearToken *aToken;              /* Token array */
};

struct NearToken {
  int n;                          /* Length of token in bytes */
  const char *z;                  /* Pointer to token string */
};

struct NearPhrase {
  int nNear;                      /* Preceding NEAR value */
  int nToken;                     /* Number of tokens in this phrase */
  NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
};

/*
** Return true if phrase p matches the sequence of document tokens
** beginning at aToken[0]. A phrase token that ends in '*' matches any
** document token of which it is a prefix.
*/
static int nm_phrase_match(
  NearPhrase *p,
  NearToken *aToken
){
  int ii;

  for(ii=0; ii<p->nToken; ii++){
    NearToken *pToken = &p->aToken[ii];
    if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
      if( aToken[ii].n<(pToken->n-1) ) return 0;
      if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
    }else{
      if( aToken[ii].n!=pToken->n ) return 0;
      if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
    }
  }

  return 1;
}

static int nm_near_chain(
  int iDir,                       /* Direction to iterate through aPhrase[] */
  NearDocument *pDoc,             /* Document to match against */
  int iPos,                       /* Position at which iPhrase was found */
  int nPhrase,                    /* Size of phrase array */
  NearPhrase *aPhrase,            /* Phrase array */
  int iPhrase                     /* Index of phrase found */
){
  int iStart;
  int iStop;
  int ii;
  int nNear;
  int iPhrase2;
  NearPhrase *p;
  NearPhrase *pPrev;

  assert( iDir==1 || iDir==-1 );

  if( iDir==1 ){
    if( (iPhrase+1)==nPhrase ) return 1;
    nNear = aPhrase[iPhrase+1].nNear;
  }else{
    if( iPhrase==0 ) return 1;
    nNear = aPhrase[iPhrase].nNear;
  }
  pPrev = &aPhrase[iPhrase];
  iPhrase2 = iPhrase+iDir;
  p = &aPhrase[iPhrase2];

  iStart = iPos - nNear - p->nToken;
  iStop = iPos + nNear + pPrev->nToken;

  if( iStart<0 ) iStart = 0;
  if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;

  for(ii=iStart; ii<=iStop; ii++){
    if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
      if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
    }
  }

  return 0;
}

static int nm_match_count(
  NearDocument *pDoc,             /* Document to match against */
  int nPhrase,                    /* Size of phrase array */
  NearPhrase *aPhrase,            /* Phrase array */
  int iPhrase                     /* Index of phrase to count matches for */
){
  int nOcc = 0;
  int ii;
  NearPhrase *p = &aPhrase[iPhrase];

  for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
    if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
      /* Test forward NEAR chain (i>iPhrase) */
      if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;

      /* Test reverse NEAR chain (i<iPhrase) */
      if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;

      /* This is a real match. Increment the counter. */
      nOcc++;
    }
  }

  return nOcc;
}
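/*
** An illustrative example of the chaining logic above (the input values
** here are chosen for exposition, not taken from any test script).
** Consider the expression "a NEAR/1 b" - i.e. aPhrase[0] is "a" and
** aPhrase[1] is "b" with nNear==1 - matched against the four-token
** document "x a y b":
**
**   *  nm_match_count() finds phrase "a" at token position 1.
**
**   *  nm_near_chain(1, ...) then searches for "b" at token positions
**      iPos-nNear-1 through iPos+nNear+1, i.e. -1 through 3, clamped to
**      the document bounds as 0 through 3, and finds it at position 3.
**
**   *  "b" is the final phrase, so the chain succeeds and the occurrence
**      is counted: a single token ("y") separates the phrases, which is
**      within the NEAR/1 limit.
*/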
/*
** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
**
** DOCUMENT is a Tcl list of document tokens. EXPR is a Tcl list
** containing phrases at even indexes and NEAR distances at odd indexes,
** e.g. {{one two} 10 {three}}. The command returns true if the document
** matches the NEAR expression. If the -phrasecountvar option is passed,
** the named Tcl variable is set to a list containing the number of times
** each phrase matches.
*/
static int fts3_near_match_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  int nTotal = 0;
  int rc;
  int ii;
  int nPhrase;
  NearPhrase *aPhrase = 0;
  NearDocument doc = {0, 0};
  Tcl_Obj **apDocToken;
  Tcl_Obj *pRet;
  Tcl_Obj *pPhrasecount = 0;

  Tcl_Obj **apExprToken;
  int nExprToken;

  UNUSED_PARAMETER(clientData);

  /* Must have 3 or more arguments. */
  if( objc<3 || (objc%2)==0 ){
    Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
    rc = TCL_ERROR;
    goto near_match_out;
  }

  for(ii=3; ii<objc; ii+=2){
    enum NM_enum { NM_PHRASECOUNTS };
    struct TestnmSubcmd {
      char *zName;
      enum NM_enum eOpt;
    } aOpt[] = {
      { "-phrasecountvar", NM_PHRASECOUNTS },
      { 0, 0 }
    };
    int iOpt;
    if( Tcl_GetIndexFromObjStruct(
        interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
    ){
      return TCL_ERROR;
    }

    switch( aOpt[iOpt].eOpt ){
      case NM_PHRASECOUNTS:
        pPhrasecount = objv[ii+1];
        break;
    }
  }

  /* Parse the document into an array of NearToken structures */
  rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
  if( rc!=TCL_OK ) goto near_match_out;
  doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
  for(ii=0; ii<doc.nToken; ii++){
    doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
  }

  /* Parse the expression */
  rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
  if( rc!=TCL_OK ) goto near_match_out;
  nPhrase = (nExprToken + 1) / 2;
  aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
  memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
  for(ii=0; ii<nPhrase; ii++){
    Tcl_Obj *pPhrase = apExprToken[ii*2];
    Tcl_Obj **apToken;
    int nToken;
    int jj;

    rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
    if( rc!=TCL_OK ) goto near_match_out;
    if( nToken>NM_MAX_TOKEN ){
      Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
      rc = TCL_ERROR;
      goto near_match_out;
    }
    for(jj=0; jj<nToken; jj++){
      NearToken *pT = &aPhrase[ii].aToken[jj];
      pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
    }
    aPhrase[ii].nToken = nToken;
  }
  for(ii=1; ii<nPhrase; ii++){
    Tcl_Obj *pNear = apExprToken[2*ii-1];
    int nNear;
    rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
    if( rc!=TCL_OK ) goto near_match_out;
    aPhrase[ii].nNear = nNear;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  for(ii=0; ii<nPhrase; ii++){
    int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
    Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
    nTotal += nOcc;
  }
  if( pPhrasecount ){
    Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
  }
  Tcl_DecrRefCount(pRet);
  Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));

 near_match_out:
  ckfree((char *)aPhrase);
  ckfree((char *)doc.aToken);
  return rc;
}

/*
** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
**
** Normally, FTS uses hard-coded values to determine the minimum doclist
** size eligible for incremental loading, and the size of the chunks loaded
** when a doclist is incrementally loaded. This command allows the built-in
** values to be overridden for testing purposes.
**
** If present, the first argument is the chunksize in bytes to load doclists
** in. The second argument is the minimum doclist size in bytes to use
** incremental loading with.
**
** Whether or not the arguments are present, this command returns a list of
** two integers - the initial chunksize and threshold when the command is
** invoked. This can be used to restore the default behavior after running
** tests. For example:
**
**    # Override incr-load settings for testing:
**    set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
**
**    .... run tests ....
**
**    # Restore initial incr-load settings:
**    eval fts3_configure_incr_load $cfg
*/
static int fts3_configure_incr_load_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
#ifdef SQLITE_ENABLE_FTS3
  extern int test_fts3_node_chunksize;
  extern int test_fts3_node_chunk_threshold;
  Tcl_Obj *pRet;

  if( objc!=1 && objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
    return TCL_ERROR;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  Tcl_ListObjAppendElement(
      interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
  Tcl_ListObjAppendElement(
      interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));

  if( objc==3 ){
    int iArg1;
    int iArg2;
    if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
     || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
    ){
      Tcl_DecrRefCount(pRet);
      return TCL_ERROR;
    }
    test_fts3_node_chunksize = iArg1;
    test_fts3_node_chunk_threshold = iArg2;
  }

  Tcl_SetObjResult(interp, pRet);
  Tcl_DecrRefCount(pRet);
#endif
  UNUSED_PARAMETER(clientData);
  return TCL_OK;
}

#ifdef SQLITE_ENABLE_FTS3
/**************************************************************************
** Beginning of test tokenizer code.
**
** For language 0, this tokenizer is similar to the default 'simple'
** tokenizer. For other languages L, the following:
**
**   * Odd numbered languages are case-sensitive. Even numbered
**     languages are not.
**
**   * Language ids 100 or greater are considered an error.
**
** The implementation assumes that the input contains only ASCII characters
** (i.e. those that may be encoded in UTF-8 using a single byte).
*/
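/*
** For example (inputs chosen here for illustration; they do not come from
** the FTS test scripts): tokenizing the string "The Quick fox" with
** language id 0 (even, hence case-folding) produces the tokens "the",
** "quick" and "fox". With language id 1 (odd, hence case-sensitive) it
** produces "The", "Quick" and "fox". Requesting language id 100 or
** greater causes the xLanguage() method below to return SQLITE_ERROR.
*/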
typedef struct test_tokenizer {
  sqlite3_tokenizer base;
} test_tokenizer;

typedef struct test_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *aInput;          /* Input being tokenized */
  int nInput;                  /* Size of the input in bytes */
  int iInput;                  /* Current offset in aInput */
  int iToken;                  /* Index of next token to be returned */
  char *aBuffer;               /* Buffer containing current token */
  int nBuffer;                 /* Number of bytes allocated at aBuffer */
  int iLangid;                 /* Configured language id */
} test_tokenizer_cursor;

static int testTokenizerCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  test_tokenizer *pNew;
  UNUSED_PARAMETER(argc);
  UNUSED_PARAMETER(argv);

  pNew = sqlite3_malloc(sizeof(test_tokenizer));
  if( !pNew ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(test_tokenizer));

  *ppTokenizer = (sqlite3_tokenizer *)pNew;
  return SQLITE_OK;
}

static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
  test_tokenizer *p = (test_tokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}

static int testTokenizerOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *pInput, int nBytes,        /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  int rc = SQLITE_OK;                    /* Return code */
  test_tokenizer_cursor *pCsr;           /* New cursor object */

  UNUSED_PARAMETER(pTokenizer);

  pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
  if( pCsr==0 ){
    rc = SQLITE_NOMEM;
  }else{
    memset(pCsr, 0, sizeof(test_tokenizer_cursor));
    pCsr->aInput = pInput;
    if( nBytes<0 ){
      pCsr->nInput = (int)strlen(pInput);
    }else{
      pCsr->nInput = nBytes;
    }
  }

  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return rc;
}

static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  sqlite3_free(pCsr->aBuffer);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

static int testIsTokenChar(char c){
  return (c>='a' && c<='z') || (c>='A' && c<='Z');
}
static int testTolower(char c){
  char ret = c;
  if( ret>='A' && ret<='Z' ) ret = ret - ('A'-'a');
  return ret;
}

static int testTokenizerNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by testTokenizerOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  int rc = SQLITE_OK;
  const char *p;
  const char *pEnd;

  p = &pCsr->aInput[pCsr->iInput];
  pEnd = &pCsr->aInput[pCsr->nInput];

  /* Skip past any white-space */
  assert( p<=pEnd );
  while( p<pEnd && testIsTokenChar(*p)==0 ) p++;

  if( p==pEnd ){
    rc = SQLITE_DONE;
  }else{
    /* Advance to the end of the token */
    const char *pToken = p;
    int nToken;
    while( p<pEnd && testIsTokenChar(*p) ) p++;
    nToken = (int)(p-pToken);

    /* Copy the token into the buffer */
    if( nToken>pCsr->nBuffer ){
      sqlite3_free(pCsr->aBuffer);
      pCsr->aBuffer = sqlite3_malloc(nToken);
      pCsr->nBuffer = nToken;    /* Record the allocation size for reuse */
    }
    if( pCsr->aBuffer==0 ){
      rc = SQLITE_NOMEM;
    }else{
      int i;

      if( pCsr->iLangid & 0x00000001 ){
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
      }else{
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]);
      }
      pCsr->iToken++;
      pCsr->iInput = (int)(p - pCsr->aInput);

      *ppToken = pCsr->aBuffer;
      *pnBytes = nToken;
      *piStartOffset = (int)(pToken - pCsr->aInput);
      *piEndOffset = (int)(p - pCsr->aInput);
      *piPosition = pCsr->iToken;
    }
  }

  return rc;
}

static int testTokenizerLanguage(
  sqlite3_tokenizer_cursor *pCursor,
  int iLangid
){
  int rc = SQLITE_OK;
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  pCsr->iLangid = iLangid;
  if( pCsr->iLangid>=100 ){
    rc = SQLITE_ERROR;
  }
  return rc;
}
#endif
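/*
** Tclcmd: fts3_test_tokenizer
**
** Return a pointer to the sqlite3_tokenizer_module structure implemented
** by the test tokenizer above, packed into a Tcl byte-array. The intent
** (sketched here; the exact registration SQL lives in the test scripts)
** is for a script to hand this pointer to the fts3_tokenizer() scalar
** function, along the lines of:
**
**    set ptr [fts3_test_tokenizer]
**    db eval {SELECT fts3_tokenizer('testtokenizer', $ptr)}
*/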
static int fts3_test_tokenizer_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
#ifdef SQLITE_ENABLE_FTS3
  static const sqlite3_tokenizer_module testTokenizerModule = {
    1,
    testTokenizerCreate,
    testTokenizerDestroy,
    testTokenizerOpen,
    testTokenizerClose,
    testTokenizerNext,
    testTokenizerLanguage
  };
  const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
  if( objc!=1 ){
    Tcl_WrongNumArgs(interp, 1, objv, "");
    return TCL_ERROR;
  }
  Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
    (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
  ));
#endif
  UNUSED_PARAMETER(clientData);
  return TCL_OK;
}

/*
** Tclcmd: fts3_test_varint INTEGER
**
** Encode INTEGER as an FTS varint, then decode it again using both the
** 64-bit and, for small enough values, the 32-bit decode routines, and
** check that the value and the encoded size round-trip correctly.
*/
static int fts3_test_varint_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
#ifdef SQLITE_ENABLE_FTS3
  char aBuf[24];
  int rc;
  Tcl_WideInt w, w2;
  int nByte, nByte2;

  if( objc!=2 ){
    Tcl_WrongNumArgs(interp, 1, objv, "INTEGER");
    return TCL_ERROR;
  }

  rc = Tcl_GetWideIntFromObj(interp, objv[1], &w);
  if( rc!=TCL_OK ) return rc;

  nByte = sqlite3Fts3PutVarint(aBuf, w);
  nByte2 = sqlite3Fts3GetVarint(aBuf, &w2);
  if( w!=w2 || nByte!=nByte2 ){
    char *zErr = sqlite3_mprintf("error testing %lld", w);
    Tcl_ResetResult(interp);
    Tcl_AppendResult(interp, zErr, 0);
    sqlite3_free(zErr);        /* Tcl_AppendResult() copies the string */
    return TCL_ERROR;
  }

  if( w<=2147483647 && w>=0 ){
    int i;
    nByte2 = fts3GetVarint32(aBuf, &i);
    if( (int)w!=i || nByte!=nByte2 ){
      char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w);
      Tcl_ResetResult(interp);
      Tcl_AppendResult(interp, zErr, 0);
      sqlite3_free(zErr);      /* Tcl_AppendResult() copies the string */
      return TCL_ERROR;
    }
  }
#endif
  UNUSED_PARAMETER(clientData);
  return TCL_OK;
}

/*
** End of tokenizer code.
**************************************************************************/

int Sqlitetestfts3_Init(Tcl_Interp *interp){
  Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
  Tcl_CreateObjCommand(interp,
      "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
  );
  Tcl_CreateObjCommand(
      interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
  );
  Tcl_CreateObjCommand(
      interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0
  );
  return TCL_OK;
}

#endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
#endif /* ifdef SQLITE_TEST */