Skip to content

Commit 771a5da

Browse files
committed
fixed missing value handling in csv data type detection
1 parent 912e14b commit 771a5da

File tree

5 files changed

+49
-13
lines changed

5 files changed

+49
-13
lines changed

src/main/java/com/github/lwhite1/tablesaw/columns/BooleanColumn.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,15 @@ public static BooleanColumn create(String name) {
2828
return new BooleanColumn(name);
2929
}
3030

31+
public static BooleanColumn create(String name, int size, RoaringBitmap values) {
32+
BooleanColumn booleanColumn = new BooleanColumn(name, size);
33+
IntIterator intIterator = values.getIntIterator();
34+
while(intIterator.hasNext()){
35+
booleanColumn.set(intIterator.next(), true);
36+
}
37+
return booleanColumn;
38+
}
39+
3140
public static BooleanColumn create(String name, int rowSize) {
3241
return new BooleanColumn(name, rowSize);
3342
}

src/main/java/com/github/lwhite1/tablesaw/columns/FloatColumn.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ public void addCell(String object) {
268268
*/
269269
public static float convert(String stringValue) {
270270
if (Strings.isNullOrEmpty(stringValue) || TypeUtils.MISSING_INDICATORS.contains(stringValue)) {
271-
return Float.NaN;
271+
return MISSING_VALUE;
272272
}
273273
Matcher matcher = COMMA_PATTERN.matcher(stringValue);
274274
return Float.parseFloat(matcher.replaceAll(""));

src/main/java/com/github/lwhite1/tablesaw/io/CsvReader.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import com.github.lwhite1.tablesaw.api.Table;
1010
import com.github.lwhite1.tablesaw.columns.Column;
1111
import com.google.common.annotations.VisibleForTesting;
12+
import com.google.common.base.Strings;
1213
import com.google.common.collect.Lists;
1314
import org.apache.commons.lang3.StringUtils;
1415

@@ -457,7 +458,11 @@ private static ColumnType detectType(List<String> valuesList) {
457458

458459
CopyOnWriteArrayList<ColumnType> typeCandidates = new CopyOnWriteArrayList<>(typeArray);
459460

461+
460462
for (String s : valuesList) {
463+
if (Strings.isNullOrEmpty(s) || TypeUtils.MISSING_INDICATORS.contains(s)) {
464+
continue;
465+
}
461466
if (typeCandidates.contains(LOCAL_DATE_TIME)) {
462467
if (!isLocalDateTime.test(s)) {
463468
typeCandidates.remove(LOCAL_DATE_TIME);
@@ -516,7 +521,7 @@ private static ColumnType selectType(List<ColumnType> typeCandidates) {
516521
}
517522

518523
private static java.util.function.Predicate<String> isBoolean = s ->
519-
TypeUtils.TRUE_STRINGS.contains(s) || TypeUtils.FALSE_STRINGS.contains(s);
524+
TypeUtils.TRUE_STRINGS_FOR_DETECTION.contains(s) || TypeUtils.FALSE_STRINGS_FOR_DETECTION.contains(s);
520525

521526
private static Predicate<String> isLong = new Predicate<String>() {
522527

src/main/java/com/github/lwhite1/tablesaw/io/TypeUtils.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,17 @@ private TypeUtils() {
3939
public static final List<String> TRUE_STRINGS =
4040
Arrays.asList("T", "t", "Y", "y", "TRUE", "true", "1");
4141

42+
public static final List<String> TRUE_STRINGS_FOR_DETECTION =
43+
Arrays.asList("T", "t", "Y", "y", "TRUE", "true");
44+
4245
// These Strings will convert to false booleans
4346
public static final List<String> FALSE_STRINGS =
4447
Arrays.asList("F", "f", "N", "n", "FALSE", "false", "0");
4548

49+
// These Strings are recognized as false booleans during column type detection (omits "0")
50+
public static final List<String> FALSE_STRINGS_FOR_DETECTION =
51+
Arrays.asList("F", "f", "N", "n", "FALSE", "false");
52+
4653
// Formats that we accept in parsing dates from strings
4754
// TODO: Add more types, especially dates with month names spelled-out fully.
4855
private static final DateTimeFormatter dtf1 = DateTimeFormatter.ofPattern("yyyyMMdd");

src/test/java/com/github/lwhite1/tablesaw/integration/AirlineDelays2.java

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,57 @@
11
package com.github.lwhite1.tablesaw.integration;
22

3-
import com.github.lwhite1.tablesaw.api.Table;
43
import com.github.lwhite1.tablesaw.api.ColumnType;
5-
import com.github.lwhite1.tablesaw.store.StorageManager;
4+
import com.github.lwhite1.tablesaw.api.Table;
5+
import com.github.lwhite1.tablesaw.columns.BooleanColumn;
6+
import com.github.lwhite1.tablesaw.io.CsvReader;
67
import com.google.common.base.Stopwatch;
78

89
import java.util.concurrent.TimeUnit;
910

11+
import static com.github.lwhite1.tablesaw.api.QueryHelper.column;
12+
import static java.lang.System.out;
13+
1014
/**
1115
*
1216
*/
1317
public class AirlineDelays2 {
1418

15-
private static Table flights2015;
19+
private static Table flt2007;
1620

1721
public static void main(String[] args) throws Exception {
1822

1923
new AirlineDelays2();
2024
Stopwatch stopwatch = Stopwatch.createStarted();
21-
Table sorted = flights2015.sortAscendingOn("ORIGIN", "UNIQUE_CARRIER");
25+
Table sorted = flt2007.sortAscendingOn("ORIGIN", "UniqueCarrier");
2226
System.out.println("Sorting " + stopwatch.elapsed(TimeUnit.SECONDS));
2327
System.out.println(sorted.first(1000).print());
2428
System.exit(0);
2529
}
2630

2731
private AirlineDelays2() throws Exception {
2832
Stopwatch stopwatch = Stopwatch.createStarted();
29-
System.out.println("loading");
33+
out.println("loading");
34+
out.println(CsvReader.printColumnTypes("/Users/larrywhite/Downloads/flight delays/2007.csv", true, ','));
35+
flt2007 = Table.create("/Users/larrywhite/Downloads/flight delays/2007.csv");
3036

31-
flights2015 = StorageManager.readTable("bigdata/3f07b9bf-053f-4f9b-9dff-9d354835b276");
37+
out.println(flt2007.first(5).print());
3238

33-
System.out.println(String.format("loaded %d records in %d seconds",
34-
flights2015.rowCount(),
39+
out.println(String.format("loaded %d records in %d seconds",
40+
flt2007.rowCount(),
3541
(int) stopwatch.elapsed(TimeUnit.SECONDS)));
3642

37-
out(flights2015.shape());
38-
out(flights2015.columnNames().toString());
39-
out(flights2015.first(10).print());
43+
out(flt2007.shape());
44+
45+
Table ord = flt2007.selectWhere(column("Origin").isEqualTo("ORD"));
46+
47+
BooleanColumn delayed = new BooleanColumn("Delayed?", ord.floatColumn("DepDelay").isGreaterThan(15), ord.rowCount());
48+
ord.addColumn(delayed);
49+
50+
out("total flights: " + ord.rowCount());
51+
out("delayed flights: " + delayed.countTrue());
52+
53+
out(flt2007.columnNames().toString());
54+
out(flt2007.first(10).print());
4055
}
4156

4257
private static void out(Object obj) {

0 commit comments

Comments
 (0)