Merge branch 'ibis-project:main' into fix-all-unique-value-when-scale

ibis-project · Sep 17, 2024 · 72eb5b3
2 parents d6446c7 + 6dce35e
commit 72eb5b3
Show file tree

Hide file tree

Showing 21 changed files with 622 additions and 489 deletions.
diff --git a/docs/_quarto.yml b/docs/_quarto.yml
@@ -214,9 +214,9 @@ quartodoc:
             name: Temporal feature extraction
             desc: Feature extraction for temporal columns
           contents:
-            - ExpandDateTime
             - ExpandDate
             - ExpandTime
+            - ExpandTimestamp
 
         - kind: page
           path: steps-other

diff --git a/docs/index.qmd b/docs/index.qmd
@@ -11,8 +11,9 @@ hide-description: true
 
 - Preprocess your data at scale on any [Ibis](https://ibis-project.org/)-supported
   backend.
-- Compose [`Recipe`](/reference/core.html#ibis_ml.Recipe)s with other scikit-learn
-  estimators using
+- Compose
+  [`Recipe`](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)s
+  with other scikit-learn estimators using
   [`Pipeline`](https://scikit-learn.org/stable/modules/compose.html#pipeline-chaining-estimators)s.
 - Seamlessly integrate with [scikit-learn](https://scikit-learn.org/stable/),
   [XGBoost](https://xgboost.readthedocs.io/en/stable/python/sklearn_estimator.html), and

diff --git a/docs/reference/support-matrix/step_config.yml b/docs/reference/support-matrix/step_config.yml
@@ -90,7 +90,30 @@ ExpandDate:
         components:
           - doy
 
-ExpandDateTime:
+ExpandTime:
+  configurations:
+    - name: h
+      config:
+        inputs: time
+        components:
+          - hour
+    - name: m
+      config:
+        inputs: time
+        components:
+          - minute
+    - name: s
+      config:
+        inputs: time
+        components:
+          - second
+    - name: ms
+      config:
+        inputs: time
+        components:
+          - millisecond
+
+ExpandTimestamp:
   configurations:
     - name: ms
       config:
@@ -137,26 +160,3 @@ ExpandDateTime:
         inputs: timestamp
         components:
           - doy
-
-ExpandTime:
-  configurations:
-    - name: h
-      config:
-        inputs: time
-        components:
-          - hour
-    - name: m
-      config:
-        inputs: time
-        components:
-          - minute
-    - name: s
-      config:
-        inputs: time
-        components:
-          - second
-    - name: ms
-      config:
-        inputs: time
-        components:
-          - millisecond
diff --git a/docs/tutorial/pytorch.qmd b/docs/tutorial/pytorch.qmd
@@ -102,7 +102,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -122,44 +122,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),

diff --git a/docs/tutorial/scikit-learn.qmd b/docs/tutorial/scikit-learn.qmd
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),

diff --git a/docs/tutorial/xgboost.qmd b/docs/tutorial/xgboost.qmd
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),