/ Check-in [7c15d762]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:The ".import" command of the shell, and the csv virtual table extension both ignore a single UTF-8 BOM at the beginning of their input.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 7c15d762d99c2e3e534cd35dfe25ddcd317637eb1f2655fd24c2dd5f9d5a7613
User & Date: drh 2017-06-26 18:42:23
Context
2017-06-26
21:08
Add the -withoutnulls option to the "db eval" method in the TCL interface. check-in: 18f0616e user: drh tags: trunk
18:42
The ".import" command of the shell, and the csv virtual table extension both ignore a single UTF-8 BOM at the beginning of their input. check-in: 7c15d762 user: drh tags: trunk
2017-06-24
19:21
Disable shell tests for the .schema command if virtual tables are not available. check-in: c8186874 user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/misc/csv.c.

    73     73   typedef struct CsvReader CsvReader;
    74     74   struct CsvReader {
    75     75     FILE *in;              /* Read the CSV text from this input stream */
    76     76     char *z;               /* Accumulated text for a field */
    77     77     int n;                 /* Number of bytes in z */
    78     78     int nAlloc;            /* Space allocated for z[] */
    79     79     int nLine;             /* Current line number */
           80  +  int bNotFirst;         /* True if prior text has been seen */
    80     81     char cTerm;            /* Character that terminated the most recent field */
    81     82     size_t iIn;            /* Next unread character in the input buffer */
    82     83     size_t nIn;            /* Number of characters in the input buffer */
    83     84     char *zIn;             /* The input buffer */
    84     85     char zErr[CSV_MXERR];  /* Error message */
    85     86   };
    86     87   
................................................................................
    87     88   /* Initialize a CsvReader object */
    88     89   static void csv_reader_init(CsvReader *p){
    89     90     p->in = 0;
    90     91     p->z = 0;
    91     92     p->n = 0;
    92     93     p->nAlloc = 0;
    93     94     p->nLine = 0;
           95  +  p->bNotFirst = 0;
    94     96     p->nIn = 0;
    95     97     p->zIn = 0;
    96     98     p->zErr[0] = 0;
    97     99   }
    98    100   
    99    101   /* Close and reset a CsvReader object */
   100    102   static void csv_reader_reset(CsvReader *p){
................................................................................
   247    249           }
   248    250         }
   249    251         if( csv_append(p, (char)c) ) return 0;
   250    252         ppc = pc;
   251    253         pc = c;
   252    254       }
   253    255     }else{
          256  +    /* If this is the first field being parsed and it begins with the
          257  +    ** UTF-8 BOM  (0xEF BB BF) then skip the BOM */
          258  +    if( (c&0xff)==0xef && p->bNotFirst==0 ){
          259  +      csv_append(p, c);
          260  +      c = csv_getc(p);
          261  +      if( (c&0xff)==0xbb ){
          262  +        csv_append(p, c);
          263  +        c = csv_getc(p);
          264  +        if( (c&0xff)==0xbf ){
          265  +          p->bNotFirst = 1;
          266  +          p->n = 0;
          267  +          return csv_read_one_field(p);
          268  +        }
          269  +      }
          270  +    }
   254    271       while( c>',' || (c!=EOF && c!=',' && c!='\n') ){
   255    272         if( csv_append(p, (char)c) ) return 0;
   256    273         c = csv_getc(p);
   257    274       }
   258    275       if( c=='\n' ){
   259    276         p->nLine++;
   260    277         if( p->n>0 && p->z[p->n-1]=='\r' ) p->n--;
   261    278       }
   262    279       p->cTerm = (char)c;
   263    280     }
   264    281     if( p->z ) p->z[p->n] = 0;
          282  +  p->bNotFirst = 1;
   265    283     return p->z;
   266    284   }
   267    285   
   268    286   
   269    287   /* Forward references to the various virtual table methods implemented
   270    288   ** in this file. */
   271    289   static int csvtabCreate(sqlite3*, void*, int, const char*const*, 

Changes to src/shell.c.

  3818   3818   struct ImportCtx {
  3819   3819     const char *zFile;  /* Name of the input file */
  3820   3820     FILE *in;           /* Read the CSV text from this input stream */
  3821   3821     char *z;            /* Accumulated text for a field */
  3822   3822     int n;              /* Number of bytes in z */
  3823   3823     int nAlloc;         /* Space allocated for z[] */
  3824   3824     int nLine;          /* Current line number */
         3825  +  int bNotFirst;      /* True if one or more bytes already read */
  3825   3826     int cTerm;          /* Character that terminated the most recent field */
  3826   3827     int cColSep;        /* The column separator character.  (Usually ",") */
  3827   3828     int cRowSep;        /* The row separator character.  (Usually "\n") */
  3828   3829   };
  3829   3830   
  3830   3831   /* Append a single byte to z[] */
  3831   3832   static void import_append_char(ImportCtx *p, int c){
................................................................................
  3897   3898           break;
  3898   3899         }
  3899   3900         import_append_char(p, c);
  3900   3901         ppc = pc;
  3901   3902         pc = c;
  3902   3903       }
  3903   3904     }else{
         3905  +    /* If this is the first field being parsed and it begins with the
         3906  +    ** UTF-8 BOM  (0xEF BB BF) then skip the BOM */
         3907  +    if( (c&0xff)==0xef && p->bNotFirst==0 ){
         3908  +      import_append_char(p, c);
         3909  +      c = fgetc(p->in);
         3910  +      if( (c&0xff)==0xbb ){
         3911  +        import_append_char(p, c);
         3912  +        c = fgetc(p->in);
         3913  +        if( (c&0xff)==0xbf ){
         3914  +          p->bNotFirst = 1;
         3915  +          p->n = 0;
         3916  +          return csv_read_one_field(p);
         3917  +        }
         3918  +      }
         3919  +    }
  3904   3920       while( c!=EOF && c!=cSep && c!=rSep ){
  3905   3921         import_append_char(p, c);
  3906   3922         c = fgetc(p->in);
  3907   3923       }
  3908   3924       if( c==rSep ){
  3909   3925         p->nLine++;
  3910   3926         if( p->n>0 && p->z[p->n-1]=='\r' ) p->n--;
  3911   3927       }
  3912   3928       p->cTerm = c;
  3913   3929     }
  3914   3930     if( p->z ) p->z[p->n] = 0;
         3931  +  p->bNotFirst = 1;
  3915   3932     return p->z;
  3916   3933   }
  3917   3934   
  3918   3935   /* Read a single field of ASCII delimited text.
  3919   3936   **
  3920   3937   **   +  Input comes from p->in.
  3921   3938   **   +  Store results in p->z of length p->n.  Space to hold p->z comes

Changes to test/shell5.test.

   179    179     set res [catchcmd "test.db" {.import shell5.csv t1
   180    180   SELECT COUNT(*) FROM t1;}]
   181    181   } {0 7}
   182    182   
   183    183   do_test shell5-1.4.10.2 {
   184    184     catchcmd "test.db" {SELECT b FROM t1 WHERE a='7';}
   185    185   } {0 {Now is the time for all good men to come to the aid of their country.}}
          186  +
          187  +# import file with 2 rows, 2 columns and an initial BOM
          188  +#
          189  +do_test shell5-1.4.11 {
          190  +  set in [open shell5.csv wb]
          191  +  puts $in "\xef\xbb\xbf2|3"
          192  +  puts $in "4|5"
          193  +  close $in
          194  +  set res [catchcmd "test.db" {CREATE TABLE t2(x INT, y INT);
          195  +.import shell5.csv t2
          196  +.mode quote
          197  +.header on
          198  +SELECT * FROM t2;}]
          199  + string map {\n | \n\r |} $res
          200  +} {0 {'x','y'|2,3|4,5}}
          201  +
          202  +# import file with 2 rows, 2 columns or text with an initial BOM
          203  +#
          204  +do_test shell5-1.4.12 {
          205  +  set in [open shell5.csv wb]
          206  +  puts $in "\xef\xbb\xbf\"two\"|3"
          207  +  puts $in "4|5"
          208  +  close $in
          209  +  set res [catchcmd "test.db" {DELETE FROM t2;
          210  +.import shell5.csv t2
          211  +.mode quote
          212  +.header on
          213  +SELECT * FROM t2;}]
          214  + string map {\n | \n\r |} $res
          215  +} {0 {'x','y'|'two',3|4,5}}
   186    216   
   187    217   # check importing very long field
   188    218   do_test shell5-1.5.1 {
   189    219     set str [string repeat X 999]
   190    220     set in [open shell5.csv w]
   191    221     puts $in "8|$str"
   192    222     close $in
................................................................................
   206    236     for {set i 1} {$i<$cols} {incr i} {
   207    237       append data "$i|"
   208    238     }
   209    239     append data "$cols"
   210    240     set in [open shell5.csv w]
   211    241     puts $in $data
   212    242     close $in
   213         -  set res [catchcmd "test.db" {.import shell5.csv t2
          243  +  set res [catchcmd "test.db" {DROP TABLE IF EXISTS t2;
          244  +.import shell5.csv t2
   214    245   SELECT COUNT(*) FROM t2;}]
   215    246   } {0 1}
   216    247   
   217    248   # try importing a large number of rows
   218    249   set rows 9999
   219    250   do_test shell5-1.7.1 {
   220    251     set in [open shell5.csv w]