/*
** This C program extracts all "words" from an input document and adds them
** to an SQLite database.  A "word" is any contiguous sequence of alphabetic
** characters.  All digits, punctuation, and whitespace characters are
** word separators.  The database stores a single entry for each distinct
** word together with a count of the number of occurrences of that word.
** A fresh database is created automatically on each run.
**
**     wordcount DATABASE INPUTFILE
**
** The INPUTFILE name can be omitted, in which case input is taken from
** standard input.
**
** Options:
**
**     --without-rowid      Use a WITHOUT ROWID table to store the words.
**     --insert             Use INSERT mode (the default)
**     --replace            Use REPLACE mode
**     --select             Use SELECT mode
**     --update             Use UPDATE mode
**     --delete             Use DELETE mode
**     --query              Use QUERY mode
**     --nocase             Add the NOCASE collating sequence to the words.
**     --trace              Enable sqlite3_trace() output.
**     --summary            Show summary information on the collected data.
**     --stats              Show sqlite3_status() results at the end.
**     --pagesize NNN       Use a page size of NNN
**     --cachesize NNN      Use a cache size of NNN
**     --commit NNN         Commit after every NNN operations
**     --nosync             Use PRAGMA synchronous=OFF
**     --journal MMMM       Use PRAGMA journal_mode=MMMM
**
** Modes:
**
** Insert mode means:
**    (1) INSERT OR IGNORE INTO wordcount VALUES($new,1)
**    (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new -- if (1) is a noop
**
** Update mode means:
**    (1) INSERT OR IGNORE INTO wordcount VALUES($new,0)
**    (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new
**
** Replace mode means:
**    (1) REPLACE INTO wordcount
**        VALUES($new,ifnull((SELECT cnt FROM wordcount WHERE word=$new),0)+1);
**
** Select mode means:
**    (1) SELECT 1 FROM wordcount WHERE word=$new
**    (2) INSERT INTO wordcount VALUES($new,1) -- if (1) returns nothing
**    (3) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new -- if (1) returns TRUE
**
** Delete mode means:
**    (1) DELETE FROM wordcount WHERE word=$new
**
** Query mode means:
**    (1) SELECT cnt FROM wordcount WHERE word=$new
**
** Note that delete mode and query mode are only useful for preexisting
** databases.  The wordcount table is created using IF NOT EXISTS so this
** utility can be run multiple times on the same database file.  The
** --without-rowid, --nocase, and --pagesize parameters are only effective
** when creating a new database and are harmless no-ops on preexisting
** databases.
**
******************************************************************************
**
** Compile as follows:
**
**    gcc -I. wordcount.c sqlite3.c -ldl -lpthread
**
** Or:
**
**    gcc -I. 
-DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
**        wordcount.c sqlite3.c
*/
/* NOTE(review): the header names that should follow each #include below are
** missing from this copy of the file; only "sqlite3.h" survived.  The visible
** code uses printf/vfprintf/stderr/fclose, exit, and va_list/va_start/va_end,
** so the corresponding standard headers are presumably among the missing
** ones.  TODO: restore the include list from a pristine copy of the source.
*/
#include #include #include #include #include #include "sqlite3.h"

/*
** Print an error message and exit.
**
** zMsg is a printf-style format string.  It and the variadic arguments that
** follow it are passed to vfprintf() and written to stderr.  The process is
** then terminated with exit(1), so this routine never returns to its caller.
*/
static void fatal_error(const char *zMsg, ...){
  va_list ap;
  va_start(ap, zMsg);
  vfprintf(stderr, zMsg, ap);
  va_end(ap);
  exit(1);
}

/*
** The sqlite3_trace() callback function.
**
** Writes the SQL text zSql to standard output followed by ";" and a newline.
** The NotUsed argument is not referenced.
*/
static void traceCallback(void *NotUsed, const char *zSql){
  printf("%s;\n", zSql);
}

/* An sqlite3_exec() callback that prints results on standard output,
** each column separated by a single space.
**
** Every result row starts with the prefix "--".
*/
static int printResult(void *NotUsed, int nArg, char **azArg, char **azNm){
  int i;
  printf("--");
  /* NOTE(review): this copy of the file is damaged at this point.  The text
  ** that belongs between "for(i=0; i" and "0 && (nOp%commitInterval)==0" is
  ** missing.  That span presumably held the rest of this callback, the
  ** helpers referenced further down (checksumStep, checksumFinalize), and
  ** the start of the enclosing function of the code below, including the
  ** declarations of db, in, nOp, commitInterval, iMode, sumCnt, rc, iCur,
  ** iHiwtr and the prepared statements.  TODO: restore it from a pristine
  ** copy of the source before attempting to build.
  */
  for(i=0; i0 && (nOp%commitInterval)==0 ){
        /* Reached when nOp is a multiple of commitInterval (the left-hand
        ** side of the condition is garbled; presumably it tests that the
        ** interval is positive).  Ends the current transaction and
        ** immediately begins a new one.
        */
        sqlite3_exec(db, "COMMIT; BEGIN IMMEDIATE", 0, 0, 0);
      }
    }
  }

  /* Commit the transaction that is still open after the processing above. */
  sqlite3_exec(db, "COMMIT", 0, 0, 0);

  /* The input stream is closed only when zFileToRead is set.  Presumably
  ** "in" is standard input otherwise; the code that opens it is not
  ** visible in this copy.
  */
  if( zFileToRead ) fclose(in);

  /* Release the prepared statements used while processing the input. */
  sqlite3_finalize(pInsert);
  sqlite3_finalize(pUpdate);
  sqlite3_finalize(pSelect);
  sqlite3_finalize(pDelete);

  /* In query mode, report the accumulated sumCnt and then, for comparison,
  ** sum(cnt*cnt) computed over the whole wordcount table.  The double-check
  ** line is printed only if the statement prepares and yields a row.  The
  ** pSelect handle finalized above is reused for this one-off query and is
  ** finalized again afterwards.  (How sumCnt is accumulated is not visible
  ** in this copy.)
  */
  if( iMode==MODE_QUERY ){
    printf("sum of cnt: %lld\n", sumCnt);
    rc = sqlite3_prepare_v2(db,"SELECT sum(cnt*cnt) FROM wordcount",
                            -1, &pSelect, 0);
    if( rc==SQLITE_OK && sqlite3_step(pSelect)==SQLITE_ROW ){
      printf("double-check: %lld\n", sqlite3_column_int64(pSelect, 0));
    }
    sqlite3_finalize(pSelect);
  }

  /* Summary output.  Register an SQL function named "checksum" whose step
  ** and finalize callbacks are checksumStep and checksumFinalize (not
  ** visible in this copy), then run a batch of report queries.  Each result
  ** row is printed through printResult.  The batch ends with
  ** PRAGMA integrity_check.
  */
  if( showSummary ){
    sqlite3_create_function(db, "checksum", -1, SQLITE_UTF8, 0,
                            0, checksumStep, checksumFinalize);
    sqlite3_exec(db,
      "SELECT 'count(*): ', count(*) FROM wordcount;\n"
      "SELECT 'sum(cnt): ', sum(cnt) FROM wordcount;\n"
      "SELECT 'max(cnt): ', max(cnt) FROM wordcount;\n"
      "SELECT 'avg(cnt): ', avg(cnt) FROM wordcount;\n"
      "SELECT 'sum(cnt=1):', sum(cnt=1) FROM wordcount;\n"
      "SELECT 'top 10: ', group_concat(word, ', ') FROM "
      "(SELECT word FROM wordcount ORDER BY cnt DESC, word LIMIT 10);\n"
      "SELECT 'checksum: ', checksum(word, cnt) FROM "
      "(SELECT word, cnt FROM wordcount ORDER BY word);\n"
      "PRAGMA integrity_check;\n",
      printResult, 0, 0);
  }

  /* Database connection statistics, printed after the prepared statements
  ** have been finalized.  Each sqlite3_db_status() call fills iCur and
  ** iHiwtr; depending on the counter, the line that follows prints iCur,
  ** iHiwtr, or both.  The cache hit/miss/write queries pass 1 as the final
  ** argument, the others pass 0.
  */
  if( showStats ){
    sqlite3_db_status(db, SQLITE_DBSTATUS_LOOKASIDE_USED, &iCur, &iHiwtr, 0);
    printf("-- Lookaside Slots Used: %d (max %d)\n", iCur,iHiwtr);
    sqlite3_db_status(db, SQLITE_DBSTATUS_LOOKASIDE_HIT, &iCur, &iHiwtr, 0);
    printf("-- Successful lookasides: %d\n", iHiwtr);
    sqlite3_db_status(db, SQLITE_DBSTATUS_LOOKASIDE_MISS_SIZE, &iCur,&iHiwtr,0);
    printf("-- Lookaside size faults: %d\n", iHiwtr);
    sqlite3_db_status(db, SQLITE_DBSTATUS_LOOKASIDE_MISS_FULL, &iCur,&iHiwtr,0);
    printf("-- Lookaside OOM faults: %d\n", iHiwtr);
    sqlite3_db_status(db, SQLITE_DBSTATUS_CACHE_USED, &iCur, &iHiwtr, 0);
    printf("-- Pager Heap Usage: %d bytes\n", iCur);
    sqlite3_db_status(db, SQLITE_DBSTATUS_CACHE_HIT, &iCur, &iHiwtr, 1);
    printf("-- Page cache hits: %d\n", iCur);
    sqlite3_db_status(db, SQLITE_DBSTATUS_CACHE_MISS, &iCur, &iHiwtr, 1);
    printf("-- Page cache misses: %d\n", iCur);
    sqlite3_db_status(db, SQLITE_DBSTATUS_CACHE_WRITE, &iCur, &iHiwtr, 1);
    printf("-- Page cache writes: %d\n", iCur);
    sqlite3_db_status(db, SQLITE_DBSTATUS_SCHEMA_USED, &iCur, &iHiwtr, 0);
    printf("-- Schema Heap Usage: %d bytes\n", iCur);
    sqlite3_db_status(db, SQLITE_DBSTATUS_STMT_USED, &iCur, &iHiwtr, 0);
    printf("-- Statement Heap Usage: %d bytes\n", iCur);
  }

  sqlite3_close(db);

  /* Global memory usage statistics printed after the database connection
  ** has closed.  Memory usage should be zero at this point.
  */
  if( showStats ){
    sqlite3_status(SQLITE_STATUS_MEMORY_USED, &iCur, &iHiwtr, 0);
    printf("-- Memory Used (bytes): %d (max %d)\n", iCur,iHiwtr);
    sqlite3_status(SQLITE_STATUS_MALLOC_COUNT, &iCur, &iHiwtr, 0);
    printf("-- Outstanding Allocations: %d (max %d)\n", iCur,iHiwtr);
    sqlite3_status(SQLITE_STATUS_PAGECACHE_OVERFLOW, &iCur, &iHiwtr, 0);
    printf("-- Pcache Overflow Bytes: %d (max %d)\n", iCur,iHiwtr);
    sqlite3_status(SQLITE_STATUS_SCRATCH_OVERFLOW, &iCur, &iHiwtr, 0);
    printf("-- Scratch Overflow Bytes: %d (max %d)\n", iCur,iHiwtr);
    sqlite3_status(SQLITE_STATUS_MALLOC_SIZE, &iCur, &iHiwtr, 0);
    printf("-- Largest Allocation: %d bytes\n",iHiwtr);
    sqlite3_status(SQLITE_STATUS_PAGECACHE_SIZE, &iCur, &iHiwtr, 0);
    printf("-- Largest Pcache Allocation: %d bytes\n",iHiwtr);
    sqlite3_status(SQLITE_STATUS_SCRATCH_SIZE, &iCur, &iHiwtr, 0);
    printf("-- Largest Scratch Allocation: %d bytes\n", iHiwtr);
  }
  /* Normal completion.  The header of the enclosing function is not visible
  ** in this copy (presumably main()).
  */
  return 0;
}