Skip to content

Commit d820da2

Browse files
committed
finished up work on auto-detection of columns
1 parent 95d6715 commit d820da2

File tree

4 files changed

+272
-65
lines changed

4 files changed

+272
-65
lines changed

src/main/java/com/github/lwhite1/tablesaw/io/CsvReader.java

Lines changed: 244 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import com.github.lwhite1.tablesaw.columns.Column;
1111
import com.google.common.annotations.VisibleForTesting;
1212
import com.google.common.collect.Lists;
13+
import org.apache.commons.lang3.StringUtils;
1314

1415
import javax.annotation.Nullable;
1516
import javax.annotation.concurrent.Immutable;
@@ -24,8 +25,11 @@
2425
import java.time.format.DateTimeParseException;
2526
import java.util.ArrayList;
2627
import java.util.List;
28+
import java.util.concurrent.CopyOnWriteArrayList;
2729
import java.util.function.Predicate;
2830

31+
import static com.github.lwhite1.tablesaw.api.ColumnType.*;
32+
2933
/**
3034
* Static utility class that Builds Tables from Comma Separated Value (CSV) files.
3135
* <p>
@@ -120,7 +124,7 @@ public static Table read(ColumnType types[], boolean header, int[] wanted, char
120124

121125
/**
122126
* Returns a Relation constructed from a CSV File with the given file name
123-
*
127+
* <p>
124128
* The @code{fileName} is used as the initial table name for the new table
125129
*
126130
* @param types An array of the types of columns in the file, in the order they appear
@@ -178,6 +182,54 @@ public static Table read(ColumnType types[], boolean header, char columnSeparato
178182
}
179183
return table;
180184
}
185+
/**
186+
* Returns a Relation constructed from a CSV File with the given file name
187+
* <p>
188+
* The @code{fileName} is used as the initial table name for the new table
189+
*
190+
* @param types An array of the types of columns in the file, in the order they appear
191+
* @param header Is the first row in the file a header?
192+
* @param columnSeparator the delimiter
193+
* @param fileName The fully specified file name. It is used to provide a default name for the table
194+
* @return A Relation containing the data in the csv file.
195+
* @throws IOException
196+
*/
197+
public static Table headerOnly(ColumnType types[], boolean header, char columnSeparator, String fileName)
198+
throws IOException {
199+
200+
CsvMapper mapper = new CsvMapper();
201+
CsvSchema schema = CsvSchema.builder().setColumnSeparator(columnSeparator).build();
202+
mapper.enable(CsvParser.Feature.WRAP_AS_ARRAY);
203+
File csvFile = new File(fileName);
204+
Table table;
205+
try (MappingIterator<String[]> it = mapper.reader(String[].class).with(schema).readValues(csvFile)) {
206+
207+
String[] columnNames;
208+
List<String> headerRow;
209+
if (header) {
210+
headerRow = Lists.newArrayList(it.next());
211+
columnNames = selectColumnNames(headerRow, types);
212+
} else {
213+
columnNames = makeColumnNames(types);
214+
headerRow = Lists.newArrayList(columnNames);
215+
}
216+
217+
table = new Table(nameMaker(fileName));
218+
for (int x = 0; x < types.length; x++) {
219+
if (types[x] != ColumnType.SKIP) {
220+
Column newColumn = TypeUtils.newColumn(headerRow.get(x), types[x]);
221+
table.addColumn(newColumn);
222+
}
223+
}
224+
int[] columnIndexes = new int[columnNames.length];
225+
for (int i = 0; i < columnIndexes.length; i++) {
226+
// get the index in the original table, which includes skipped fields
227+
columnIndexes[i] = headerRow.indexOf(columnNames[i]);
228+
}
229+
it.close();
230+
}
231+
return table;
232+
}
181233

182234
/**
183235
* Provides placeholder column names for when the file read has no header
@@ -209,19 +261,108 @@ private static String nameMaker(String path) {
209261
return p.getFileName().toString();
210262
}
211263

264+
/**
265+
* Returns the structure of the table given by {@code csvFileName} as detected by analysis of a sample of the data
266+
* @throws IOException
267+
*/
268+
public static Table detectedColumnTypes(String csvFileName, boolean header, char delimiter) throws IOException {
269+
Table t = CsvReader.headerOnly(
270+
CsvReader.detectColumnTypes(csvFileName, header, delimiter), header, delimiter, csvFileName);
271+
return t.structure();
272+
}
273+
274+
275+
/**
276+
* Returns a string representation of the file types in file {@code csvFilename},
277+
* as determined by the type-detection algorithm
278+
*
279+
* This method is intended to help analysts quickly fix any erroneous types, by printing out the types in a format
280+
* such that they can be edited to correct any mistakes, and used in an array literal
281+
*
282+
* For example:
283+
*
284+
* LOCAL_DATE, // 0 date
285+
* SHORT_INT, // 1 approval
286+
* CATEGORY, // 2 who
287+
*
288+
* Note that the types are array separated, and that the index position and the column name are printed such that they
289+
* would be interpreted as comments if you paste the output into an array:
290+
*
291+
* ColumnType[] types = {
292+
* LOCAL_DATE, // 0 date
293+
* SHORT_INT, // 1 approval
294+
* CATEGORY, // 2 who
295+
* }
296+
* @throws IOException
297+
*/
298+
public static String printColumnTypes(String csvFileName, boolean header, char delimiter) throws IOException {
299+
Table t = CsvReader.headerOnly(
300+
CsvReader.detectColumnTypes(csvFileName, header, delimiter), header, delimiter, csvFileName);
301+
Table structure = t.structure();
302+
303+
StringBuilder buf = new StringBuilder();
304+
305+
//buf.append(structure.name()).append('\n');
306+
Column typeCol = structure.column("Column Type");
307+
Column indxCol = structure.column("Index");
308+
Column nameCol = structure.column("Column Name");
309+
310+
// add the column headers
311+
/*
312+
buf.append(StringUtils.rightPad(StringUtils.defaultString(typeCol.name()), typeCol.columnWidth()));
313+
buf.append(" ");
314+
buf.append(StringUtils.rightPad(StringUtils.defaultString(indxCol.name()), indxCol.columnWidth()));
315+
buf.append(' ');
316+
buf.append(StringUtils.rightPad(StringUtils.defaultString(nameCol.name()), nameCol.columnWidth()));
317+
buf.append('\n');
318+
*/
319+
320+
int typeColIndex = structure.columnIndex(typeCol);
321+
int indxColIndex = structure.columnIndex(indxCol);
322+
int nameColIndex = structure.columnIndex(nameCol);
323+
324+
int typeColWidth = typeCol.columnWidth();
325+
int indxColWidth = indxCol.columnWidth();
326+
int nameColWidth = nameCol.columnWidth();
327+
328+
for (int r = 0; r < structure.rowCount(); r++) {
329+
String cell = StringUtils.rightPad(structure.get(typeColIndex, r) + ",", typeColWidth);
330+
buf.append(cell);
331+
buf.append(" // ");
332+
333+
cell = StringUtils.rightPad(structure.get(indxColIndex, r), indxColWidth);
334+
buf.append(cell);
335+
buf.append(' ');
336+
337+
cell = StringUtils.rightPad(structure.get(nameColIndex, r), nameColWidth);
338+
buf.append(cell);
339+
buf.append(' ');
340+
341+
buf.append('\n');
342+
}
343+
return buf.toString();
344+
}
345+
212346
public static Table read(String fileName) throws IOException {
213347
ColumnType[] columnTypes = detectColumnTypes(fileName, true, ',');
214348
return read(columnTypes, true, fileName);
215349
}
216350

351+
public static Table read(String fileName, boolean header) throws IOException {
352+
ColumnType[] columnTypes = detectColumnTypes(fileName, header, ',');
353+
return read(columnTypes, true, fileName);
354+
}
355+
356+
public static Table read(String fileName, boolean header, char delimiter) throws IOException {
357+
ColumnType[] columnTypes = detectColumnTypes(fileName, header, delimiter);
358+
return read(columnTypes, true, fileName);
359+
}
360+
217361
@VisibleForTesting
218362
static ColumnType[] detectColumnTypes(String file, boolean header, char delimiter)
219363
throws IOException {
220364

221-
int linesToSkip = header ? 1 : 2;
222-
final int maxRows = 100;
223-
// Read the first 100 rows and guess
224-
// TODO(lwhite): Could we read the last 100 rows to double check?
365+
int linesToSkip = header ? 1 : 0;
225366

226367
// to hold the results
227368
List<ColumnType> columnTypes = new ArrayList<>();
@@ -232,18 +373,32 @@ static ColumnType[] detectColumnTypes(String file, boolean header, char delimite
232373
int rowCount = 0; // make sure we don't go over maxRows
233374
try (CSVReader reader = new CSVReader(new FileReader(file), delimiter, '"', linesToSkip)) {
234375
String[] nextLine;
235-
while ((nextLine = reader.readNext()) != null && rowCount < maxRows) {
236-
int columnNumber = 0;
237-
for (String field : nextLine) {
238-
if (rowCount == 0) {
376+
int nextRow = 0;
377+
while ((nextLine = reader.readNext()) != null) {
378+
379+
// initialize the arrays to hold the strings. we don't know how many we need until we read the first row
380+
if (rowCount == 0) {
381+
for (int i = 0; i < nextLine.length; i++) {
239382
columnData.add(new ArrayList<>());
240-
//continue; // TODO(lwhite): Better way to handle header
241383
}
242-
columnData.get(columnNumber).add(field);
243-
columnNumber ++;
384+
}
385+
int columnNumber = 0;
386+
if (rowCount >= linesToSkip) {
387+
if (rowCount == nextRow) {
388+
// System.out.println(nextRow);
389+
for (String field : nextLine) {
390+
columnData.get(columnNumber).add(field);
391+
columnNumber++;
392+
}
393+
// System.out.println(columnData.get(0).size());
394+
}
395+
}
396+
if (rowCount == nextRow) {
397+
nextRow = nextRow(nextRow);
244398
}
245399
rowCount++;
246400
}
401+
// System.out.println(columnData.get(0).size());
247402
}
248403

249404
// now detect
@@ -255,36 +410,95 @@ static ColumnType[] detectColumnTypes(String file, boolean header, char delimite
255410
return columnTypes.toArray(new ColumnType[columnTypes.size()]);
256411
}
257412

413+
private static int nextRow(int nextRow) {
414+
if (nextRow < 100) {
415+
return nextRow + 1;
416+
}
417+
if (nextRow < 1000) {
418+
return nextRow + 10;
419+
}
420+
if (nextRow < 10_000) {
421+
return nextRow + 100;
422+
}
423+
if (nextRow < 100_000) {
424+
return nextRow + 1000;
425+
}
426+
if (nextRow < 1_000_000) {
427+
return nextRow + 10_000;
428+
}
429+
if (nextRow < 10_000_000) {
430+
return nextRow + 100_000;
431+
}
432+
if (nextRow < 100_000_000) {
433+
return nextRow + 1_000_000;
434+
}
435+
return nextRow + 10_000_000;
436+
}
437+
258438
private static ColumnType detectType(List<String> valuesList) {
259439

260-
ColumnType[] types = ColumnType.values();
440+
// Types to choose from. When more than one would work, we pick the first of the options
441+
ColumnType[] typeArray = // we leave out category, as that is the default type
442+
{LOCAL_DATE_TIME, LOCAL_TIME, LOCAL_DATE, BOOLEAN, SHORT_INT, INTEGER, LONG_INT, FLOAT};
443+
444+
CopyOnWriteArrayList<ColumnType> typeCandidates = new CopyOnWriteArrayList<>(typeArray);
445+
261446
for (String s : valuesList) {
262-
if (isLocalDateTime.test(s)) {
263-
return ColumnType.LOCAL_DATE_TIME;
447+
if (typeCandidates.contains(LOCAL_DATE_TIME)) {
448+
if (!isLocalDateTime.test(s)) {
449+
typeCandidates.remove(LOCAL_DATE_TIME);
450+
}
264451
}
265-
if (isLocalTime.test(s)) {
266-
return ColumnType.LOCAL_TIME;
452+
if (typeCandidates.contains(LOCAL_TIME)) {
453+
if (!isLocalTime.test(s)) {
454+
typeCandidates.remove(LOCAL_TIME);
455+
}
267456
}
268-
if (isLocalDate.test(s)) {
269-
return ColumnType.LOCAL_DATE;
457+
if (typeCandidates.contains(LOCAL_DATE)) {
458+
if (!isLocalDate.test(s)) {
459+
typeCandidates.remove(LOCAL_DATE);
460+
}
270461
}
271-
if (isBoolean.test(s)) {
272-
return ColumnType.BOOLEAN;
462+
if (typeCandidates.contains(BOOLEAN)) {
463+
if (!isBoolean.test(s)) {
464+
typeCandidates.remove(BOOLEAN);
465+
}
273466
}
274-
if (isShort.test(s)){
275-
return ColumnType.SHORT_INT;
467+
if (typeCandidates.contains(SHORT_INT)) {
468+
if (!isShort.test(s)) {
469+
typeCandidates.remove(SHORT_INT);
470+
}
276471
}
277-
if (isInteger.test(s)) {
278-
return ColumnType.INTEGER;
472+
if (typeCandidates.contains(INTEGER)) {
473+
if (!isInteger.test(s)) {
474+
typeCandidates.remove(INTEGER);
475+
}
279476
}
280-
if (isLong.test(s)) {
281-
return ColumnType.LONG_INT;
477+
if (typeCandidates.contains(LONG_INT)) {
478+
if (!isLong.test(s)) {
479+
typeCandidates.remove(LONG_INT);
480+
}
282481
}
283-
if (isFloat.test(s)) {
284-
return ColumnType.FLOAT;
482+
if (typeCandidates.contains(FLOAT)) {
483+
if (!isFloat.test(s)) {
484+
typeCandidates.remove(FLOAT);
485+
}
285486
}
286487
}
287-
return ColumnType.CATEGORY;
488+
return selectType(typeCandidates);
489+
}
490+
491+
/**
492+
* Returns the selected candidate for a column of data, by picking the first value in the given list
493+
* @param typeCandidates a possibly empty list of candidates. This list should be sorted in order of preference
494+
*/
495+
private static ColumnType selectType(List<ColumnType> typeCandidates) {
496+
if (typeCandidates.isEmpty()) {
497+
return CATEGORY;
498+
}
499+
else {
500+
return typeCandidates.get(0);
501+
}
288502
}
289503

290504
private static java.util.function.Predicate<String> isBoolean = s ->

0 commit comments

Comments
 (0)