10
10
import com .github .lwhite1 .tablesaw .columns .Column ;
11
11
import com .google .common .annotations .VisibleForTesting ;
12
12
import com .google .common .collect .Lists ;
13
+ import org .apache .commons .lang3 .StringUtils ;
13
14
14
15
import javax .annotation .Nullable ;
15
16
import javax .annotation .concurrent .Immutable ;
24
25
import java .time .format .DateTimeParseException ;
25
26
import java .util .ArrayList ;
26
27
import java .util .List ;
28
+ import java .util .concurrent .CopyOnWriteArrayList ;
27
29
import java .util .function .Predicate ;
28
30
31
+ import static com .github .lwhite1 .tablesaw .api .ColumnType .*;
32
+
29
33
/**
30
34
* Static utility class that builds Tables from Comma Separated Value (CSV) files.
31
35
* <p>
@@ -120,7 +124,7 @@ public static Table read(ColumnType types[], boolean header, int[] wanted, char
120
124
121
125
/**
122
126
* Returns a Relation constructed from a CSV File with the given file name
123
- *
127
+ * <p>
124
128
* The {@code fileName} is used as the initial table name for the new table
125
129
*
126
130
* @param types An array of the types of columns in the file, in the order they appear
@@ -178,6 +182,54 @@ public static Table read(ColumnType types[], boolean header, char columnSeparato
178
182
}
179
183
return table ;
180
184
}
185
+ /**
186
+ * Returns a Relation constructed from a CSV File with the given file name
187
+ * <p>
188
+ * The @code{fileName} is used as the initial table name for the new table
189
+ *
190
+ * @param types An array of the types of columns in the file, in the order they appear
191
+ * @param header Is the first row in the file a header?
192
+ * @param columnSeparator the delimiter
193
+ * @param fileName The fully specified file name. It is used to provide a default name for the table
194
+ * @return A Relation containing the data in the csv file.
195
+ * @throws IOException
196
+ */
197
+ public static Table headerOnly (ColumnType types [], boolean header , char columnSeparator , String fileName )
198
+ throws IOException {
199
+
200
+ CsvMapper mapper = new CsvMapper ();
201
+ CsvSchema schema = CsvSchema .builder ().setColumnSeparator (columnSeparator ).build ();
202
+ mapper .enable (CsvParser .Feature .WRAP_AS_ARRAY );
203
+ File csvFile = new File (fileName );
204
+ Table table ;
205
+ try (MappingIterator <String []> it = mapper .reader (String [].class ).with (schema ).readValues (csvFile )) {
206
+
207
+ String [] columnNames ;
208
+ List <String > headerRow ;
209
+ if (header ) {
210
+ headerRow = Lists .newArrayList (it .next ());
211
+ columnNames = selectColumnNames (headerRow , types );
212
+ } else {
213
+ columnNames = makeColumnNames (types );
214
+ headerRow = Lists .newArrayList (columnNames );
215
+ }
216
+
217
+ table = new Table (nameMaker (fileName ));
218
+ for (int x = 0 ; x < types .length ; x ++) {
219
+ if (types [x ] != ColumnType .SKIP ) {
220
+ Column newColumn = TypeUtils .newColumn (headerRow .get (x ), types [x ]);
221
+ table .addColumn (newColumn );
222
+ }
223
+ }
224
+ int [] columnIndexes = new int [columnNames .length ];
225
+ for (int i = 0 ; i < columnIndexes .length ; i ++) {
226
+ // get the index in the original table, which includes skipped fields
227
+ columnIndexes [i ] = headerRow .indexOf (columnNames [i ]);
228
+ }
229
+ it .close ();
230
+ }
231
+ return table ;
232
+ }
181
233
182
234
/**
183
235
* Provides placeholder column names for when the file read has no header
@@ -209,19 +261,108 @@ private static String nameMaker(String path) {
209
261
return p .getFileName ().toString ();
210
262
}
211
263
264
+ /**
265
+ * Returns the structure of the table given by {@code csvFileName} as detected by analysis of a sample of the data
266
+ * @throws IOException
267
+ */
268
+ public static Table detectedColumnTypes (String csvFileName , boolean header , char delimiter ) throws IOException {
269
+ Table t = CsvReader .headerOnly (
270
+ CsvReader .detectColumnTypes (csvFileName , header , delimiter ), header , delimiter , csvFileName );
271
+ return t .structure ();
272
+ }
273
+
274
+
275
+ /**
276
+ * Returns a string representation of the file types in file {@code csvFilename},
277
+ * as determined by the type-detection algorithm
278
+ *
279
+ * This method is intended to help analysts quickly fix any erroneous types, by printing out the types in a format
280
+ * such that they can be edited to correct any mistakes, and used in an array literal
281
+ *
282
+ * For example:
283
+ *
284
+ * LOCAL_DATE, // 0 date
285
+ * SHORT_INT, // 1 approval
286
+ * CATEGORY, // 2 who
287
+ *
288
+ * Note that the types are array separated, and that the index position and the column name are printed such that they
289
+ * would be interpreted as comments if you paste the output into an array:
290
+ *
291
+ * ColumnType[] types = {
292
+ * LOCAL_DATE, // 0 date
293
+ * SHORT_INT, // 1 approval
294
+ * CATEGORY, // 2 who
295
+ * }
296
+ * @throws IOException
297
+ */
298
+ public static String printColumnTypes (String csvFileName , boolean header , char delimiter ) throws IOException {
299
+ Table t = CsvReader .headerOnly (
300
+ CsvReader .detectColumnTypes (csvFileName , header , delimiter ), header , delimiter , csvFileName );
301
+ Table structure = t .structure ();
302
+
303
+ StringBuilder buf = new StringBuilder ();
304
+
305
+ //buf.append(structure.name()).append('\n');
306
+ Column typeCol = structure .column ("Column Type" );
307
+ Column indxCol = structure .column ("Index" );
308
+ Column nameCol = structure .column ("Column Name" );
309
+
310
+ // add the column headers
311
+ /*
312
+ buf.append(StringUtils.rightPad(StringUtils.defaultString(typeCol.name()), typeCol.columnWidth()));
313
+ buf.append(" ");
314
+ buf.append(StringUtils.rightPad(StringUtils.defaultString(indxCol.name()), indxCol.columnWidth()));
315
+ buf.append(' ');
316
+ buf.append(StringUtils.rightPad(StringUtils.defaultString(nameCol.name()), nameCol.columnWidth()));
317
+ buf.append('\n');
318
+ */
319
+
320
+ int typeColIndex = structure .columnIndex (typeCol );
321
+ int indxColIndex = structure .columnIndex (indxCol );
322
+ int nameColIndex = structure .columnIndex (nameCol );
323
+
324
+ int typeColWidth = typeCol .columnWidth ();
325
+ int indxColWidth = indxCol .columnWidth ();
326
+ int nameColWidth = nameCol .columnWidth ();
327
+
328
+ for (int r = 0 ; r < structure .rowCount (); r ++) {
329
+ String cell = StringUtils .rightPad (structure .get (typeColIndex , r ) + "," , typeColWidth );
330
+ buf .append (cell );
331
+ buf .append (" // " );
332
+
333
+ cell = StringUtils .rightPad (structure .get (indxColIndex , r ), indxColWidth );
334
+ buf .append (cell );
335
+ buf .append (' ' );
336
+
337
+ cell = StringUtils .rightPad (structure .get (nameColIndex , r ), nameColWidth );
338
+ buf .append (cell );
339
+ buf .append (' ' );
340
+
341
+ buf .append ('\n' );
342
+ }
343
+ return buf .toString ();
344
+ }
345
+
212
346
public static Table read (String fileName ) throws IOException {
213
347
ColumnType [] columnTypes = detectColumnTypes (fileName , true , ',' );
214
348
return read (columnTypes , true , fileName );
215
349
}
216
350
351
+ public static Table read (String fileName , boolean header ) throws IOException {
352
+ ColumnType [] columnTypes = detectColumnTypes (fileName , header , ',' );
353
+ return read (columnTypes , true , fileName );
354
+ }
355
+
356
+ public static Table read (String fileName , boolean header , char delimiter ) throws IOException {
357
+ ColumnType [] columnTypes = detectColumnTypes (fileName , header , delimiter );
358
+ return read (columnTypes , true , fileName );
359
+ }
360
+
217
361
@ VisibleForTesting
218
362
static ColumnType [] detectColumnTypes (String file , boolean header , char delimiter )
219
363
throws IOException {
220
364
221
- int linesToSkip = header ? 1 : 2 ;
222
- final int maxRows = 100 ;
223
- // Read the first 100 rows and guess
224
- // TODO(lwhite): Could we read the last 100 rows to double check?
365
+ int linesToSkip = header ? 1 : 0 ;
225
366
226
367
// to hold the results
227
368
List <ColumnType > columnTypes = new ArrayList <>();
@@ -232,18 +373,32 @@ static ColumnType[] detectColumnTypes(String file, boolean header, char delimite
232
373
int rowCount = 0 ; // make sure we don't go over maxRows
233
374
try (CSVReader reader = new CSVReader (new FileReader (file ), delimiter , '"' , linesToSkip )) {
234
375
String [] nextLine ;
235
- while ((nextLine = reader .readNext ()) != null && rowCount < maxRows ) {
236
- int columnNumber = 0 ;
237
- for (String field : nextLine ) {
238
- if (rowCount == 0 ) {
376
+ int nextRow = 0 ;
377
+ while ((nextLine = reader .readNext ()) != null ) {
378
+
379
+ // initialize the arrays to hold the strings. we don't know how many we need until we read the first row
380
+ if (rowCount == 0 ) {
381
+ for (int i = 0 ; i < nextLine .length ; i ++) {
239
382
columnData .add (new ArrayList <>());
240
- //continue; // TODO(lwhite): Better way to handle header
241
383
}
242
- columnData .get (columnNumber ).add (field );
243
- columnNumber ++;
384
+ }
385
+ int columnNumber = 0 ;
386
+ if (rowCount >= linesToSkip ) {
387
+ if (rowCount == nextRow ) {
388
+ // System.out.println(nextRow);
389
+ for (String field : nextLine ) {
390
+ columnData .get (columnNumber ).add (field );
391
+ columnNumber ++;
392
+ }
393
+ // System.out.println(columnData.get(0).size());
394
+ }
395
+ }
396
+ if (rowCount == nextRow ) {
397
+ nextRow = nextRow (nextRow );
244
398
}
245
399
rowCount ++;
246
400
}
401
+ // System.out.println(columnData.get(0).size());
247
402
}
248
403
249
404
// now detect
@@ -255,36 +410,95 @@ static ColumnType[] detectColumnTypes(String file, boolean header, char delimite
255
410
return columnTypes .toArray (new ColumnType [columnTypes .size ()]);
256
411
}
257
412
413
+ private static int nextRow (int nextRow ) {
414
+ if (nextRow < 100 ) {
415
+ return nextRow + 1 ;
416
+ }
417
+ if (nextRow < 1000 ) {
418
+ return nextRow + 10 ;
419
+ }
420
+ if (nextRow < 10_000 ) {
421
+ return nextRow + 100 ;
422
+ }
423
+ if (nextRow < 100_000 ) {
424
+ return nextRow + 1000 ;
425
+ }
426
+ if (nextRow < 1_000_000 ) {
427
+ return nextRow + 10_000 ;
428
+ }
429
+ if (nextRow < 10_000_000 ) {
430
+ return nextRow + 100_000 ;
431
+ }
432
+ if (nextRow < 100_000_000 ) {
433
+ return nextRow + 1_000_000 ;
434
+ }
435
+ return nextRow + 10_000_000 ;
436
+ }
437
+
258
438
private static ColumnType detectType (List <String > valuesList ) {
259
439
260
- ColumnType [] types = ColumnType .values ();
440
+ // Types to choose from. When more than one would work, we pick the first of the options
441
+ ColumnType [] typeArray = // we leave out category, as that is the default type
442
+ {LOCAL_DATE_TIME , LOCAL_TIME , LOCAL_DATE , BOOLEAN , SHORT_INT , INTEGER , LONG_INT , FLOAT };
443
+
444
+ CopyOnWriteArrayList <ColumnType > typeCandidates = new CopyOnWriteArrayList <>(typeArray );
445
+
261
446
for (String s : valuesList ) {
262
- if (isLocalDateTime .test (s )) {
263
- return ColumnType .LOCAL_DATE_TIME ;
447
+ if (typeCandidates .contains (LOCAL_DATE_TIME )) {
448
+ if (!isLocalDateTime .test (s )) {
449
+ typeCandidates .remove (LOCAL_DATE_TIME );
450
+ }
264
451
}
265
- if (isLocalTime .test (s )) {
266
- return ColumnType .LOCAL_TIME ;
452
+ if (typeCandidates .contains (LOCAL_TIME )) {
453
+ if (!isLocalTime .test (s )) {
454
+ typeCandidates .remove (LOCAL_TIME );
455
+ }
267
456
}
268
- if (isLocalDate .test (s )) {
269
- return ColumnType .LOCAL_DATE ;
457
+ if (typeCandidates .contains (LOCAL_DATE )) {
458
+ if (!isLocalDate .test (s )) {
459
+ typeCandidates .remove (LOCAL_DATE );
460
+ }
270
461
}
271
- if (isBoolean .test (s )) {
272
- return ColumnType .BOOLEAN ;
462
+ if (typeCandidates .contains (BOOLEAN )) {
463
+ if (!isBoolean .test (s )) {
464
+ typeCandidates .remove (BOOLEAN );
465
+ }
273
466
}
274
- if (isShort .test (s )){
275
- return ColumnType .SHORT_INT ;
467
+ if (typeCandidates .contains (SHORT_INT )) {
468
+ if (!isShort .test (s )) {
469
+ typeCandidates .remove (SHORT_INT );
470
+ }
276
471
}
277
- if (isInteger .test (s )) {
278
- return ColumnType .INTEGER ;
472
+ if (typeCandidates .contains (INTEGER )) {
473
+ if (!isInteger .test (s )) {
474
+ typeCandidates .remove (INTEGER );
475
+ }
279
476
}
280
- if (isLong .test (s )) {
281
- return ColumnType .LONG_INT ;
477
+ if (typeCandidates .contains (LONG_INT )) {
478
+ if (!isLong .test (s )) {
479
+ typeCandidates .remove (LONG_INT );
480
+ }
282
481
}
283
- if (isFloat .test (s )) {
284
- return ColumnType .FLOAT ;
482
+ if (typeCandidates .contains (FLOAT )) {
483
+ if (!isFloat .test (s )) {
484
+ typeCandidates .remove (FLOAT );
485
+ }
285
486
}
286
487
}
287
- return ColumnType .CATEGORY ;
488
+ return selectType (typeCandidates );
489
+ }
490
+
491
+ /**
492
+ * Returns the selected candidate for a column of data, by picking the first value in the given list
493
+ * @param typeCandidates a possibly empty list of candidates. This list should be sorted in order of preference
494
+ */
495
+ private static ColumnType selectType (List <ColumnType > typeCandidates ) {
496
+ if (typeCandidates .isEmpty ()) {
497
+ return CATEGORY ;
498
+ }
499
+ else {
500
+ return typeCandidates .get (0 );
501
+ }
288
502
}
289
503
290
504
private static java .util .function .Predicate <String > isBoolean = s ->
0 commit comments