jtablesaw
diff --git a/‎src/main/java/com/github/lwhite1/tablesaw/io/CsvReader.java
Lines changed: 244 additions & 30 deletions b/‎src/main/java/com/github/lwhite1/tablesaw/io/CsvReader.java
Lines changed: 244 additions & 30 deletions
@@ -10,6 +10,7 @@
 import com.github.lwhite1.tablesaw.columns.Column;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Lists;
+import org.apache.commons.lang3.StringUtils;
 
 import javax.annotation.Nullable;
 import javax.annotation.concurrent.Immutable;
@@ -24,8 +25,11 @@
 import java.time.format.DateTimeParseException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.function.Predicate;
 
+import static com.github.lwhite1.tablesaw.api.ColumnType.*;
+
 /**
  * Static utility class that Builds Tables from Comma Separated Value (CSV) files.
  * <p>
@@ -120,7 +124,7 @@ public static Table read(ColumnType types[], boolean header, int[] wanted, char
 
   /**
    * Returns a Relation constructed from a CSV File with the given file name
-   *
+   * <p>
    * The @code{fileName} is used as the initial table name for the new table
    *
    * @param types           An array of the types of columns in the file, in the order they appear
@@ -178,6 +182,54 @@ public static Table read(ColumnType types[], boolean header, char columnSeparato
     }
     return table;
   }
+/**
+   * Returns a Relation constructed from a CSV File with the given file name
+   * <p>
+   * The @code{fileName} is used as the initial table name for the new table
+   *
+   * @param types           An array of the types of columns in the file, in the order they appear
+   * @param header          Is the first row in the file a header?
+   * @param columnSeparator the delimiter
+   * @param fileName        The fully specified file name. It is used to provide a default name for the table
+   * @return A Relation containing the data in the csv file.
+   * @throws IOException
+   */
+  public static Table headerOnly(ColumnType types[], boolean header, char columnSeparator, String fileName)
+      throws IOException {
+
+    CsvMapper mapper = new CsvMapper();
+    CsvSchema schema = CsvSchema.builder().setColumnSeparator(columnSeparator).build();
+    mapper.enable(CsvParser.Feature.WRAP_AS_ARRAY);
+    File csvFile = new File(fileName);
+    Table table;
+    try (MappingIterator<String[]> it = mapper.reader(String[].class).with(schema).readValues(csvFile)) {
+
+      String[] columnNames;
+      List<String> headerRow;
+      if (header) {
+        headerRow = Lists.newArrayList(it.next());
+        columnNames = selectColumnNames(headerRow, types);
+      } else {
+        columnNames = makeColumnNames(types);
+        headerRow = Lists.newArrayList(columnNames);
+      }
+
+      table = new Table(nameMaker(fileName));
+      for (int x = 0; x < types.length; x++) {
+        if (types[x] != ColumnType.SKIP) {
+          Column newColumn = TypeUtils.newColumn(headerRow.get(x), types[x]);
+          table.addColumn(newColumn);
+        }
+      }
+      int[] columnIndexes = new int[columnNames.length];
+      for (int i = 0; i < columnIndexes.length; i++) {
+        // get the index in the original table, which includes skipped fields
+        columnIndexes[i] = headerRow.indexOf(columnNames[i]);
+      }
+      it.close();
+    }
+    return table;
+  }
 
   /**
    * Provides placeholder column names for when the file read has no header
@@ -209,19 +261,108 @@ private static String nameMaker(String path) {
     return p.getFileName().toString();
   }
 
+  /**
+   * Returns the structure of the table given by {@code csvFileName} as detected by analysis of a sample of the data
+   * @throws IOException
+   */
+  public static Table detectedColumnTypes(String csvFileName, boolean header, char delimiter) throws IOException {
+    Table t = CsvReader.headerOnly(
+        CsvReader.detectColumnTypes(csvFileName, header, delimiter), header, delimiter, csvFileName);
+    return t.structure();
+  }
+
+
+    /**
+     * Returns a string representation of the file types in file {@code csvFilename},
+     * as determined by the type-detection algorithm
+     *
+     * This method is intended to help analysts quickly fix any erroneous types, by printing out the types in a format
+     * such that they can be edited to correct any mistakes, and used in an array literal
+     *
+     * For example:
+     *
+     * LOCAL_DATE, // 0     date
+     * SHORT_INT,  // 1     approval
+     * CATEGORY,   // 2     who
+     *
+     * Note that the types are array separated, and that the index position and the column name are printed such that they
+     * would be interpreted as comments if you paste the output into an array:
+     *
+     * ColumnType[] types = {
+     *   LOCAL_DATE, // 0     date
+     *   SHORT_INT,  // 1     approval
+     *   CATEGORY,   // 2     who
+     * }
+     * @throws IOException
+     */
+  public static String printColumnTypes(String csvFileName, boolean header, char delimiter) throws IOException {
+    Table t = CsvReader.headerOnly(
+        CsvReader.detectColumnTypes(csvFileName, header, delimiter), header, delimiter, csvFileName);
+    Table structure = t.structure();
+
+    StringBuilder buf = new StringBuilder();
+
+    //buf.append(structure.name()).append('\n');
+    Column typeCol = structure.column("Column Type");
+    Column indxCol = structure.column("Index");
+    Column nameCol = structure.column("Column Name");
+
+    // add the column headers
+/*
+    buf.append(StringUtils.rightPad(StringUtils.defaultString(typeCol.name()), typeCol.columnWidth()));
+    buf.append("    ");
+    buf.append(StringUtils.rightPad(StringUtils.defaultString(indxCol.name()), indxCol.columnWidth()));
+    buf.append(' ');
+    buf.append(StringUtils.rightPad(StringUtils.defaultString(nameCol.name()), nameCol.columnWidth()));
+    buf.append('\n');
+*/
+
+    int typeColIndex = structure.columnIndex(typeCol);
+    int indxColIndex = structure.columnIndex(indxCol);
+    int nameColIndex = structure.columnIndex(nameCol);
+
+    int typeColWidth = typeCol.columnWidth();
+    int indxColWidth = indxCol.columnWidth();
+    int nameColWidth = nameCol.columnWidth();
+
+    for (int r = 0; r < structure.rowCount(); r++) {
+      String cell = StringUtils.rightPad(structure.get(typeColIndex, r) + ",", typeColWidth);
+      buf.append(cell);
+      buf.append(" // ");
+
+      cell = StringUtils.rightPad(structure.get(indxColIndex, r), indxColWidth);
+      buf.append(cell);
+      buf.append(' ');
+
+      cell = StringUtils.rightPad(structure.get(nameColIndex, r), nameColWidth);
+      buf.append(cell);
+      buf.append(' ');
+
+      buf.append('\n');
+    }
+    return buf.toString();
+  }
+
   public static Table read(String fileName) throws IOException {
     ColumnType[] columnTypes = detectColumnTypes(fileName, true, ',');
     return read(columnTypes, true, fileName);
   }
 
+  public static Table read(String fileName, boolean header) throws IOException {
+    ColumnType[] columnTypes = detectColumnTypes(fileName, header, ',');
+    return read(columnTypes, true, fileName);
+  }
+
+  public static Table read(String fileName, boolean header, char delimiter) throws IOException {
+    ColumnType[] columnTypes = detectColumnTypes(fileName, header, delimiter);
+    return read(columnTypes, true, fileName);
+  }
+
   @VisibleForTesting
   static ColumnType[] detectColumnTypes(String file, boolean header, char delimiter)
       throws IOException {
 
-    int linesToSkip = header ? 1 : 2;
-    final int maxRows = 100;
-    // Read the first 100 rows and guess
-    // TODO(lwhite): Could we read the last 100 rows to double check?
+    int linesToSkip = header ? 1 : 0;
 
     // to hold the results
     List<ColumnType> columnTypes = new ArrayList<>();
@@ -232,18 +373,32 @@ static ColumnType[] detectColumnTypes(String file, boolean header, char delimite
     int rowCount = 0; // make sure we don't go over maxRows
     try (CSVReader reader = new CSVReader(new FileReader(file), delimiter, '"', linesToSkip)) {
       String[] nextLine;
-      while ((nextLine = reader.readNext()) != null && rowCount < maxRows) {
-        int columnNumber = 0;
-        for (String field : nextLine) {
-          if (rowCount == 0) {
+      int nextRow = 0;
+      while ((nextLine = reader.readNext()) != null) {
+
+        // initialize the arrays to hold the strings. we don't know how many we need until we read the first row
+        if (rowCount == 0) {
+          for (int i = 0; i < nextLine.length; i++) {
             columnData.add(new ArrayList<>());
-            //continue; // TODO(lwhite): Better way to handle header
           }
-          columnData.get(columnNumber).add(field);
-          columnNumber ++;
+        }
+        int columnNumber = 0;
+        if (rowCount >= linesToSkip) {
+          if (rowCount == nextRow) {
+           // System.out.println(nextRow);
+            for (String field : nextLine) {
+              columnData.get(columnNumber).add(field);
+              columnNumber++;
+            }
+           // System.out.println(columnData.get(0).size());
+          }
+        }
+        if (rowCount == nextRow) {
+          nextRow = nextRow(nextRow);
         }
         rowCount++;
       }
+      // System.out.println(columnData.get(0).size());
     }
 
     // now detect
@@ -255,36 +410,95 @@ static ColumnType[] detectColumnTypes(String file, boolean header, char delimite
     return columnTypes.toArray(new ColumnType[columnTypes.size()]);
   }
 
+  private static int nextRow(int nextRow) {
+    if (nextRow < 100) {
+      return nextRow + 1;
+    }
+    if (nextRow < 1000) {
+      return nextRow + 10;
+    }
+    if (nextRow < 10_000) {
+      return nextRow + 100;
+    }
+    if (nextRow < 100_000) {
+      return nextRow + 1000;
+    }
+    if (nextRow < 1_000_000) {
+      return nextRow + 10_000;
+    }
+    if (nextRow < 10_000_000) {
+      return nextRow + 100_000;
+    }
+    if (nextRow < 100_000_000) {
+      return nextRow + 1_000_000;
+    }
+    return nextRow + 10_000_000;
+  }
+
   private static ColumnType detectType(List<String> valuesList) {
 
-    ColumnType[] types = ColumnType.values();
+    // Types to choose from. When more than one would work, we pick the first of the options
+    ColumnType[] typeArray =  // we leave out category, as that is the default type
+        {LOCAL_DATE_TIME, LOCAL_TIME, LOCAL_DATE, BOOLEAN, SHORT_INT, INTEGER, LONG_INT, FLOAT};
+
+    CopyOnWriteArrayList<ColumnType> typeCandidates = new CopyOnWriteArrayList<>(typeArray);
+
     for (String s : valuesList) {
-      if (isLocalDateTime.test(s)) {
-        return ColumnType.LOCAL_DATE_TIME;
+      if (typeCandidates.contains(LOCAL_DATE_TIME)) {
+        if (!isLocalDateTime.test(s)) {
+          typeCandidates.remove(LOCAL_DATE_TIME);
+        }
       }
-      if (isLocalTime.test(s)) {
-        return ColumnType.LOCAL_TIME;
+      if (typeCandidates.contains(LOCAL_TIME)) {
+        if (!isLocalTime.test(s)) {
+          typeCandidates.remove(LOCAL_TIME);
+        }
       }
-      if (isLocalDate.test(s)) {
-        return ColumnType.LOCAL_DATE;
+      if (typeCandidates.contains(LOCAL_DATE)) {
+        if (!isLocalDate.test(s)) {
+          typeCandidates.remove(LOCAL_DATE);
+        }
       }
-      if (isBoolean.test(s)) {
-        return ColumnType.BOOLEAN;
+      if (typeCandidates.contains(BOOLEAN)) {
+        if (!isBoolean.test(s)) {
+          typeCandidates.remove(BOOLEAN);
+        }
       }
-      if (isShort.test(s)){
-        return ColumnType.SHORT_INT;
+      if (typeCandidates.contains(SHORT_INT)) {
+        if (!isShort.test(s)) {
+          typeCandidates.remove(SHORT_INT);
+        }
       }
-      if (isInteger.test(s)) {
-        return ColumnType.INTEGER;
+      if (typeCandidates.contains(INTEGER)) {
+        if (!isInteger.test(s)) {
+          typeCandidates.remove(INTEGER);
+        }
       }
-      if (isLong.test(s)) {
-        return ColumnType.LONG_INT;
+      if (typeCandidates.contains(LONG_INT)) {
+        if (!isLong.test(s)) {
+          typeCandidates.remove(LONG_INT);
+        }
       }
-      if (isFloat.test(s)) {
-        return ColumnType.FLOAT;
+      if (typeCandidates.contains(FLOAT)) {
+        if (!isFloat.test(s)) {
+          typeCandidates.remove(FLOAT);
+        }
       }
     }
-    return ColumnType.CATEGORY;
+    return selectType(typeCandidates);
+  }
+
+  /**
+   * Returns the selected candidate for a column of data, by picking the first value in the given list
+   * @param   typeCandidates a possibly empty list of candidates. This list should be sorted in order of preference
+   */
+  private static ColumnType selectType(List<ColumnType> typeCandidates) {
+    if (typeCandidates.isEmpty()) {
+      return CATEGORY;
+    }
+    else {
+      return typeCandidates.get(0);
+    }
   }
 
   private static java.util.function.Predicate<String> isBoolean = s ->