Merge with master; fix DuckDB group-by and enum creation

Tmonster · Tmonster · commit cf1c9f7757dd · 2023-01-17T09:40:52.000Z
diff --git a/_control/data.csv b/_control/data.csv
@@ -1,25 +1,2 @@
 task,data,nrow,k,na,sort,active
-groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1
-groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1
-groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1
-groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1
-groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1
-groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1
-groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1
-groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1
-groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1
-groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1
 groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1
-groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1
-groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1
-groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1
-groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1
-join,J1_1e7_NA_0_0,1e7,NA,0,0,1
-join,J1_1e7_NA_5_0,1e7,NA,5,0,1
-join,J1_1e7_NA_0_1,1e7,NA,0,1,1
-join,J1_1e8_NA_0_0,1e8,NA,0,0,1
-join,J1_1e8_NA_5_0,1e8,NA,5,0,1
-join,J1_1e8_NA_0_1,1e8,NA,0,1,1
-join,J1_1e9_NA_0_0,1e9,NA,0,0,1
-join,J1_1e9_NA_5_0,1e9,NA,5,0,1
-join,J1_1e9_NA_0_1,1e9,NA,0,1,1
diff --git a/duckdb/groupby-duckdb.R b/duckdb/groupby-duckdb.R
@@ -40,13 +40,13 @@ invisible(dbExecute(con, sprintf("COPY y FROM '%s' (AUTO_DETECT TRUE)", src_grp)
 
 # if there are no nulls (which our enums can't handle), make enums
 if (!uses_NAs) {
-  invisible(dbExecute(con, sprintf("CREATE TYPE id1ENUM AS ENUM (SELECT id1 FROM y)", src_grp)))
-  invisible(dbExecute(con, sprintf("CREATE TYPE id2ENUM AS ENUM (SELECT id2 FROM y)", src_grp)))
-  invisible(dbExecute(con, sprintf("CREATE TYPE id3ENUM AS ENUM (SELECT id3 FROM y)", src_grp)))
+  invisible(dbExecute(con, "CREATE TYPE id1ENUM AS ENUM (SELECT id1 FROM y)"))
+  invisible(dbExecute(con, "CREATE TYPE id2ENUM AS ENUM (SELECT id2 FROM y)"))
+  invisible(dbExecute(con, "CREATE TYPE id3ENUM AS ENUM (SELECT id3 FROM y)"))
 
   invisible(dbExecute(con, "CREATE TABLE x(id1 id1ENUM, id2 id2ENUM, id3 id3ENUM, id4 INT, id5 INT, id6 INT, v1 INT, v2 INT, v3 FLOAT)"))
   invisible(dbExecute(con, sprintf("INSERT INTO x (SELECT * FROM y)"))
-   invisible(dbExecute(con, "DROP TABLE IF EXISTS y"))
+  invisible(dbExecute(con, "DROP TABLE IF EXISTS y"))
 } else {
   # otherwise rename y
   invisible(dbExecute(con, "ALTER TABLE y RENAME TO x"))
diff --git a/pandas/groupby-pandas.py b/pandas/groupby-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 print("# groupby-pandas.py", flush=True)
 
diff --git a/pandas/join-pandas.py b/pandas/join-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 print("# join-pandas.py", flush=True)
 
diff --git a/pandas/read-pandas.py b/pandas/read-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 print("# read-pandas.py")
 
diff --git a/pandas/sort-pandas.py b/pandas/sort-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 print("# sort-pandas.py")
 
diff --git a/path.env b/path.env
@@ -1,7 +1,3 @@
 export JULIA_HOME=/opt/julia-1.6.1
-export PATH=$PATH:$JULIA_HOME/bin
 export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
-export JULIA_HOME=/opt/julia-1.6.1
 export PATH=$PATH:$JULIA_HOME/bin
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
diff --git a/polars/join-polars.py b/polars/join-polars.py
@@ -27,19 +27,14 @@
 print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[2] + ", " + y_data_name[2], flush=True)
 
 with pl.StringCache():
-  x = pl.read_csv(src_jn_x, dtype={"id1":pl.Int32, "id2":pl.Int32, "id3":pl.Int32, "v1":pl.Float64})
-  x["id4"] = x["id4"].cast(pl.Categorical)
-  x["id5"] = x["id5"].cast(pl.Categorical)
-  x["id6"] = x["id6"].cast(pl.Categorical)
-  small = pl.read_csv(src_jn_y[0], dtype={"id1":pl.Int32, "v2":pl.Float64})
-  small["id4"] = small["id4"].cast(pl.Categorical)
-  medium = pl.read_csv(src_jn_y[1], dtype={"id1":pl.Int32, "id2":pl.Int32, "v2":pl.Float64})
-  medium["id4"] = medium["id4"].cast(pl.Categorical)
-  medium["id5"] = medium["id5"].cast(pl.Categorical)
-  big = pl.read_csv(src_jn_y[2], dtype={"id1":pl.Int32, "id2":pl.Int32, "id3":pl.Int32, "v2":pl.Float64})
-  big["id4"] = big["id4"].cast(pl.Categorical)
-  big["id5"] = big["id5"].cast(pl.Categorical)
-  big["id6"] = big["id6"].cast(pl.Categorical)
+    x = pl.read_csv(src_jn_x, dtypes={"id1":pl.Int32, "id2":pl.Int32, "id3":pl.Int32, "v1":pl.Float64})
+    x = x.with_columns([x["id4"].cast(pl.Categorical),x["id5"].cast(pl.Categorical),x["id6"].cast(pl.Categorical)])
+    small = pl.read_csv(src_jn_y[0], dtypes={"id1":pl.Int32, "v2":pl.Float64})
+    small = small.with_columns([small["id4"].cast(pl.Categorical)])
+    medium = pl.read_csv(src_jn_y[1], dtypes={"id1":pl.Int32, "id2":pl.Int32, "v2":pl.Float64})
+    medium = medium.with_columns([medium["id4"].cast(pl.Categorical), medium["id5"].cast(pl.Categorical)])
+    big = pl.read_csv(src_jn_y[2], dtypes={"id1":pl.Int32, "id2":pl.Int32, "id3":pl.Int32, "v2":pl.Float64})
+    big = big.with_columns([big["id4"].cast(pl.Categorical), big["id5"].cast(pl.Categorical), big["id6"].cast(pl.Categorical)])
 
 print(len(x), flush=True)
 print(len(small), flush=True)
diff --git a/run.conf b/run.conf
@@ -1,7 +1,7 @@
 # task, used in init-setup-iteration.R
-export RUN_TASKS="groupby join"
+export RUN_TASKS="groupby"
 # solution, used in init-setup-iteration.R
-export RUN_SOLUTIONS="data.table dplyr pandas pydatatable spark dask polars duckdb"
+export RUN_SOLUTIONS="duckdb"
 
  # juliadf cudf clickhouse"
 

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-#!/usr/bin/env python`
	`1`	`+#!/usr/bin/env python3`
`2`	`2`
`3`	`3`	`print("# groupby-pandas.py", flush=True)`
`4`	`4`