diff --git a/docs/_quarto.yml b/docs/_quarto.yml index a9df3b8..cb8d33c 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -214,9 +214,9 @@ quartodoc: name: Temporal feature extraction desc: Feature extraction for temporal columns contents: - - ExpandDateTime - ExpandDate - ExpandTime + - ExpandTimestamp - kind: page path: steps-other diff --git a/docs/index.qmd b/docs/index.qmd index 6da7806..02751fb 100644 --- a/docs/index.qmd +++ b/docs/index.qmd @@ -11,8 +11,9 @@ hide-description: true - Preprocess your data at scale on any [Ibis](https://ibis-project.org/)-supported backend. -- Compose [`Recipe`](/reference/core.html#ibis_ml.Recipe)s with other scikit-learn - estimators using +- Compose + [`Recipe`](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)s + with other scikit-learn estimators using [`Pipeline`](https://scikit-learn.org/stable/modules/compose.html#pipeline-chaining-estimators)s. - Seamlessly integrate with [scikit-learn](https://scikit-learn.org/stable/), [XGBoost](https://xgboost.readthedocs.io/en/stable/python/sklearn_estimator.html), and diff --git a/docs/reference/support-matrix/step_config.yml b/docs/reference/support-matrix/step_config.yml index 59bd883..4db4a25 100644 --- a/docs/reference/support-matrix/step_config.yml +++ b/docs/reference/support-matrix/step_config.yml @@ -90,7 +90,30 @@ ExpandDate: components: - doy -ExpandDateTime: +ExpandTime: + configurations: + - name: h + config: + inputs: time + components: + - hour + - name: m + config: + inputs: time + components: + - minute + - name: s + config: + inputs: time + components: + - second + - name: ms + config: + inputs: time + components: + - millisecond + +ExpandTimestamp: configurations: - name: ms config: @@ -137,26 +160,3 @@ ExpandDateTime: inputs: timestamp components: - doy - -ExpandTime: - configurations: - - name: h - config: - inputs: time - components: - - hour - - name: m - config: - inputs: time - components: - - minute - - name: s - config: - inputs: time - components: - - second - - name: ms - config: - inputs: time - components: - - millisecond diff --git a/docs/tutorial/pytorch.qmd b/docs/tutorial/pytorch.qmd index 94a09dd..24a55a4 100644 --- a/docs/tutorial/pytorch.qmd +++ b/docs/tutorial/pytorch.qmd @@ -102,7 +102,7 @@ flight_data = ( "time_hour", ) # Exclude missing data - .dropna() + .drop_null() ) flight_data ``` @@ -122,44 +122,24 @@ To get started, let's split this single dataset into two: a _training_ set and a Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.) ```{python} -flight_data_with_unique_key = flight_data.mutate( - unique_key=ibis.literal(",").join( - [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)] - ) -) -flight_data_with_unique_key -``` - -```{python} -# FIXME(deepyaman): Proposed key isn't unique for actual departure date. 
-flight_data_with_unique_key.group_by("unique_key").mutate( - cnt=flight_data_with_unique_key.count() -)[ibis._.cnt > 1] -``` - -```{python} -import random - -# Fix the random numbers by setting the seed -# This enables the analysis to be reproducible when random numbers are used -random.seed(222) - -# Put 3/4 of the data into the training set -random_key = str(random.getrandbits(256)) -data_split = flight_data_with_unique_key.mutate( - train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3 -) +import ibis_ml as ml # Create data frames for the two sets: -train_data = data_split[data_split.train].drop("unique_key", "train") -test_data = data_split[~data_split.train].drop("unique_key", "train") +train_data, test_data = ml.train_test_split( + flight_data, + unique_key=["carrier", "flight", "date"], + # Put 3/4 of the data into the training set + test_size=0.25, + num_buckets=4, + # Fix the random numbers by setting the seed + # This enables the analysis to be reproducible when random numbers are used + random_seed=222, +) ``` ## Create features ```{python} -import ibis_ml as ml - flights_rec = ml.Recipe( ml.ExpandDate("date", components=["dow", "month"]), ml.Drop("date"), diff --git a/docs/tutorial/scikit-learn.qmd b/docs/tutorial/scikit-learn.qmd index cefdf96..30f1958 100644 --- a/docs/tutorial/scikit-learn.qmd +++ b/docs/tutorial/scikit-learn.qmd @@ -101,7 +101,7 @@ flight_data = ( "time_hour", ) # Exclude missing data - .dropna() + .drop_null() ) flight_data ``` @@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.) ```{python} -flight_data_with_unique_key = flight_data.mutate( - unique_key=ibis.literal(",").join( - [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)] - ) -) -flight_data_with_unique_key -``` - -```{python} -# FIXME(deepyaman): Proposed key isn't unique for actual departure date. 
-flight_data_with_unique_key.group_by("unique_key").mutate( - cnt=flight_data_with_unique_key.count() -)[ibis._.cnt > 1] -``` - -```{python} -import random - -# Fix the random numbers by setting the seed -# This enables the analysis to be reproducible when random numbers are used -random.seed(222) - -# Put 3/4 of the data into the training set -random_key = str(random.getrandbits(256)) -data_split = flight_data_with_unique_key.mutate( - train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3 -) +import ibis_ml as ml # Create data frames for the two sets: -train_data = data_split[data_split.train].drop("unique_key", "train") -test_data = data_split[~data_split.train].drop("unique_key", "train") +train_data, test_data = ml.train_test_split( + flight_data, + unique_key=["carrier", "flight", "date"], + # Put 3/4 of the data into the training set + test_size=0.25, + num_buckets=4, + # Fix the random numbers by setting the seed + # This enables the analysis to be reproducible when random numbers are used + random_seed=222, +) ``` ## Create features ```{python} -import ibis_ml as ml - flights_rec = ml.Recipe( ml.ExpandDate("date", components=["dow", "month"]), ml.Drop("date"), diff --git a/docs/tutorial/xgboost.qmd b/docs/tutorial/xgboost.qmd index 8f53aff..4ec77ad 100644 --- a/docs/tutorial/xgboost.qmd +++ b/docs/tutorial/xgboost.qmd @@ -101,7 +101,7 @@ flight_data = ( "time_hour", ) # Exclude missing data - .dropna() + .drop_null() ) flight_data ``` @@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.) ```{python} -flight_data_with_unique_key = flight_data.mutate( - unique_key=ibis.literal(",").join( - [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)] - ) -) -flight_data_with_unique_key -``` - -```{python} -# FIXME(deepyaman): Proposed key isn't unique for actual departure date. 
-flight_data_with_unique_key.group_by("unique_key").mutate( - cnt=flight_data_with_unique_key.count() -)[ibis._.cnt > 1] -``` - -```{python} -import random - -# Fix the random numbers by setting the seed -# This enables the analysis to be reproducible when random numbers are used -random.seed(222) - -# Put 3/4 of the data into the training set -random_key = str(random.getrandbits(256)) -data_split = flight_data_with_unique_key.mutate( - train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3 -) +import ibis_ml as ml # Create data frames for the two sets: -train_data = data_split[data_split.train].drop("unique_key", "train") -test_data = data_split[~data_split.train].drop("unique_key", "train") +train_data, test_data = ml.train_test_split( + flight_data, + unique_key=["carrier", "flight", "date"], + # Put 3/4 of the data into the training set + test_size=0.25, + num_buckets=4, + # Fix the random numbers by setting the seed + # This enables the analysis to be reproducible when random numbers are used + random_seed=222, +) ``` ## Create features ```{python} -import ibis_ml as ml - flights_rec = ml.Recipe( ml.ExpandDate("date", components=["dow", "month"]), ml.Drop("date"), diff --git a/examples/Preprocess your data with recipes.ipynb b/examples/Preprocess your data with recipes.ipynb index 6c4a4b7..87393d2 100644 --- a/examples/Preprocess your data with recipes.ipynb +++ b/examples/Preprocess your data with recipes.ipynb @@ -243,16 +243,16 @@ "┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n", "│ timeint64stringstringint64int64stringdateint64timestamp(6) │\n", "├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n", - "│ 10:45:0067EWR ORD 120719UA 2013-02-1402013-02-14 15:00:00 │\n", - "│ 10:48:00373LGA FLL 1791076B6 2013-02-1402013-02-14 15:00:00 │\n", - "│ 10:48:00764EWR IAH 2071400UA 2013-02-1402013-02-14 15:00:00 │\n", - "│ 10:51:002044LGA MIA 1711096DL 2013-02-1402013-02-14 16:00:00 │\n", - "│ 10:51:002171LGA DCA 40214US 2013-02-1402013-02-14 16:00:00 │\n", - "│ 10:57:001275JFK SLC 2861990DL 2013-02-1402013-02-14 16:00:00 │\n", - "│ 10:57:00366LGA STL 135888WN 2013-02-1402013-02-14 16:00:00 │\n", - "│ 10:57:001550EWR SFO 3382565UA 2013-02-1402013-02-14 15:00:00 │\n", - "│ 10:58:004694EWR MKE 113725EV 2013-02-1402013-02-14 15:00:00 │\n", - "│ 10:58:001647LGA ATL 117762DL 2013-02-1402013-02-14 16:00:00 │\n", + "│ 05:57:00461LGA ATL 100762DL 2013-06-2602013-06-26 10:00:00 │\n", + "│ 05:58:004424EWR RDU 63416EV 2013-06-2602013-06-26 10:00:00 │\n", + "│ 05:58:006177EWR IAD 45212EV 2013-06-2602013-06-26 10:00:00 │\n", + "│ 06:00:00731LGA DTW 78502DL 2013-06-2602013-06-26 10:00:00 │\n", + "│ 06:01:00684EWR LAX 3162454UA 2013-06-2602013-06-26 10:00:00 │\n", + "│ 06:01:00301LGA ORD 164733AA 2013-06-2612013-06-26 10:00:00 │\n", + "│ 06:01:001837LGA MIA 1481096AA 2013-06-2602013-06-26 10:00:00 │\n", + "│ 06:01:001279LGA MEM 128963DL 2013-06-2602013-06-26 10:00:00 │\n", + "│ 06:02:001691JFK LAX 3092475UA 2013-06-2602013-06-26 10:00:00 │\n", + "│ 06:04:001447JFK CLT 75541US 2013-06-2602013-06-26 10:00:00 │\n", "│ │\n", "└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘\n", "\n" @@ -263,16 +263,16 @@ "┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n", "│ \u001b[2mtime\u001b[0m │ \u001b[2mint64\u001b[0m 
│ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mtimestamp(6)\u001b[0m │\n", "├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n", - "│ \u001b[35m10:45:00\u001b[0m │ \u001b[1;36m67\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mORD \u001b[0m │ \u001b[1;36m120\u001b[0m │ \u001b[1;36m719\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 15:00:00\u001b[0m │\n", - "│ \u001b[35m10:48:00\u001b[0m │ \u001b[1;36m373\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mFLL \u001b[0m │ \u001b[1;36m179\u001b[0m │ \u001b[1;36m1076\u001b[0m │ \u001b[32mB6 \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 15:00:00\u001b[0m │\n", - "│ \u001b[35m10:48:00\u001b[0m │ \u001b[1;36m764\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAH \u001b[0m │ \u001b[1;36m207\u001b[0m │ \u001b[1;36m1400\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 15:00:00\u001b[0m │\n", - "│ \u001b[35m10:51:00\u001b[0m │ \u001b[1;36m2044\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mMIA \u001b[0m │ \u001b[1;36m171\u001b[0m │ \u001b[1;36m1096\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 16:00:00\u001b[0m │\n", - "│ \u001b[35m10:51:00\u001b[0m │ \u001b[1;36m2171\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mDCA \u001b[0m │ \u001b[1;36m40\u001b[0m │ \u001b[1;36m214\u001b[0m │ \u001b[32mUS \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 16:00:00\u001b[0m │\n", - "│ \u001b[35m10:57:00\u001b[0m │ \u001b[1;36m1275\u001b[0m │ \u001b[32mJFK \u001b[0m │ \u001b[32mSLC \u001b[0m │ \u001b[1;36m286\u001b[0m │ \u001b[1;36m1990\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 16:00:00\u001b[0m │\n", - "│ \u001b[35m10:57:00\u001b[0m │ \u001b[1;36m366\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mSTL \u001b[0m │ \u001b[1;36m135\u001b[0m │ \u001b[1;36m888\u001b[0m │ \u001b[32mWN \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 16:00:00\u001b[0m │\n", - "│ \u001b[35m10:57:00\u001b[0m │ \u001b[1;36m1550\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mSFO \u001b[0m │ \u001b[1;36m338\u001b[0m │ \u001b[1;36m2565\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 15:00:00\u001b[0m │\n", - "│ \u001b[35m10:58:00\u001b[0m │ \u001b[1;36m4694\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mMKE \u001b[0m │ \u001b[1;36m113\u001b[0m │ \u001b[1;36m725\u001b[0m │ \u001b[32mEV \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 15:00:00\u001b[0m │\n", - "│ \u001b[35m10:58:00\u001b[0m │ \u001b[1;36m1647\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mATL \u001b[0m │ \u001b[1;36m117\u001b[0m │ \u001b[1;36m762\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-02-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-14 16:00:00\u001b[0m │\n", + "│ \u001b[35m05:57:00\u001b[0m │ \u001b[1;36m461\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mATL \u001b[0m │ \u001b[1;36m100\u001b[0m │ \u001b[1;36m762\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m 
│ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m05:58:00\u001b[0m │ \u001b[1;36m4424\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mRDU \u001b[0m │ \u001b[1;36m63\u001b[0m │ \u001b[1;36m416\u001b[0m │ \u001b[32mEV \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m05:58:00\u001b[0m │ \u001b[1;36m6177\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAD \u001b[0m │ \u001b[1;36m45\u001b[0m │ \u001b[1;36m212\u001b[0m │ \u001b[32mEV \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m06:00:00\u001b[0m │ \u001b[1;36m731\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mDTW \u001b[0m │ \u001b[1;36m78\u001b[0m │ \u001b[1;36m502\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m684\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mLAX \u001b[0m │ \u001b[1;36m316\u001b[0m │ \u001b[1;36m2454\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m301\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mORD \u001b[0m │ \u001b[1;36m164\u001b[0m │ \u001b[1;36m733\u001b[0m │ \u001b[32mAA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m1\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m1837\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mMIA \u001b[0m │ \u001b[1;36m148\u001b[0m │ \u001b[1;36m1096\u001b[0m │ \u001b[32mAA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m1279\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mMEM \u001b[0m │ \u001b[1;36m128\u001b[0m │ \u001b[1;36m963\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m06:02:00\u001b[0m │ \u001b[1;36m1691\u001b[0m │ \u001b[32mJFK \u001b[0m │ \u001b[32mLAX \u001b[0m │ \u001b[1;36m309\u001b[0m │ \u001b[1;36m2475\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", + "│ \u001b[35m06:04:00\u001b[0m │ \u001b[1;36m1447\u001b[0m │ \u001b[32mJFK \u001b[0m │ \u001b[32mCLT \u001b[0m │ \u001b[1;36m75\u001b[0m │ \u001b[1;36m541\u001b[0m │ \u001b[32mUS \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │\n", "│ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │\n", "└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘" ] @@ -307,7 +307,7 @@ " \"time_hour\",\n", " )\n", " # Exclude missing data\n", - " .dropna()\n", + " .drop_null()\n", ")\n", "flight_data" ] @@ -376,147 +376,29 @@ { "cell_type": "code", "execution_count": 8, - "id": "732624f4-a2af-4c6e-b29d-4fb7cb5fc99e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n",
-       "┃ dep_time  flight  origin  dest    air_time  distance  carrier  date        arr_delay  time_hour            unique_key         ┃\n",
-       "┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n",
-       "│ timeint64stringstringint64int64stringdateint64timestamp(6)string             │\n",
-       "├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n",
-       "│ 05:57:00461LGA   ATL   100762DL     2013-06-2602013-06-26 10:00:00DL,461,2013-06-26  │\n",
-       "│ 05:58:004424EWR   RDU   63416EV     2013-06-2602013-06-26 10:00:00EV,4424,2013-06-26 │\n",
-       "│ 05:58:006177EWR   IAD   45212EV     2013-06-2602013-06-26 10:00:00EV,6177,2013-06-26 │\n",
-       "│ 06:00:00731LGA   DTW   78502DL     2013-06-2602013-06-26 10:00:00DL,731,2013-06-26  │\n",
-       "│ 06:01:00684EWR   LAX   3162454UA     2013-06-2602013-06-26 10:00:00UA,684,2013-06-26  │\n",
-       "│ 06:01:00301LGA   ORD   164733AA     2013-06-2612013-06-26 10:00:00AA,301,2013-06-26  │\n",
-       "│ 06:01:001837LGA   MIA   1481096AA     2013-06-2602013-06-26 10:00:00AA,1837,2013-06-26 │\n",
-       "│ 06:01:001279LGA   MEM   128963DL     2013-06-2602013-06-26 10:00:00DL,1279,2013-06-26 │\n",
-       "│ 06:02:001691JFK   LAX   3092475UA     2013-06-2602013-06-26 10:00:00UA,1691,2013-06-26 │\n",
-       "│ 06:04:001447JFK   CLT   75541US     2013-06-2602013-06-26 10:00:00US,1447,2013-06-26 │\n",
-       "│                   │\n",
-       "└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘\n",
-       "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mdep_time\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mflight\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1morigin\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdest\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mair_time\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdistance\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcarrier\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1marr_delay\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtime_hour\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1munique_key\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mtime\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mtimestamp(6)\u001b[0m │ \u001b[2mstring\u001b[0m │\n", - "├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n", - "│ \u001b[35m05:57:00\u001b[0m │ \u001b[1;36m461\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mATL \u001b[0m │ \u001b[1;36m100\u001b[0m │ \u001b[1;36m762\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mDL,461,2013-06-26 \u001b[0m │\n", - "│ \u001b[35m05:58:00\u001b[0m │ \u001b[1;36m4424\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mRDU \u001b[0m │ \u001b[1;36m63\u001b[0m │ \u001b[1;36m416\u001b[0m │ \u001b[32mEV \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mEV,4424,2013-06-26\u001b[0m │\n", - "│ \u001b[35m05:58:00\u001b[0m │ \u001b[1;36m6177\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAD \u001b[0m │ \u001b[1;36m45\u001b[0m │ \u001b[1;36m212\u001b[0m │ \u001b[32mEV \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mEV,6177,2013-06-26\u001b[0m │\n", - "│ \u001b[35m06:00:00\u001b[0m │ \u001b[1;36m731\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mDTW \u001b[0m │ \u001b[1;36m78\u001b[0m │ \u001b[1;36m502\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mDL,731,2013-06-26 \u001b[0m │\n", - "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m684\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mLAX \u001b[0m │ \u001b[1;36m316\u001b[0m │ \u001b[1;36m2454\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mUA,684,2013-06-26 \u001b[0m │\n", - "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m301\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mORD \u001b[0m │ \u001b[1;36m164\u001b[0m │ \u001b[1;36m733\u001b[0m │ \u001b[32mAA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m1\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ 
\u001b[32mAA,301,2013-06-26 \u001b[0m │\n", - "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m1837\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mMIA \u001b[0m │ \u001b[1;36m148\u001b[0m │ \u001b[1;36m1096\u001b[0m │ \u001b[32mAA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mAA,1837,2013-06-26\u001b[0m │\n", - "│ \u001b[35m06:01:00\u001b[0m │ \u001b[1;36m1279\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mMEM \u001b[0m │ \u001b[1;36m128\u001b[0m │ \u001b[1;36m963\u001b[0m │ \u001b[32mDL \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mDL,1279,2013-06-26\u001b[0m │\n", - "│ \u001b[35m06:02:00\u001b[0m │ \u001b[1;36m1691\u001b[0m │ \u001b[32mJFK \u001b[0m │ \u001b[32mLAX \u001b[0m │ \u001b[1;36m309\u001b[0m │ \u001b[1;36m2475\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mUA,1691,2013-06-26\u001b[0m │\n", - "│ \u001b[35m06:04:00\u001b[0m │ \u001b[1;36m1447\u001b[0m │ \u001b[32mJFK \u001b[0m │ \u001b[32mCLT \u001b[0m │ \u001b[1;36m75\u001b[0m │ \u001b[1;36m541\u001b[0m │ \u001b[32mUS \u001b[0m │ \u001b[35m2013-06-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-06-26 10:00:00\u001b[0m │ \u001b[32mUS,1447,2013-06-26\u001b[0m │\n", - "│ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │\n", - "└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "flight_data_with_unique_key = flight_data.mutate(\n", - " unique_key=ibis.literal(\",\").join(\n", - " [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]\n", - " )\n", - ")\n", - "flight_data_with_unique_key" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c9cd58ce-dc2d-4e4e-8b4a-51100fe1182c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n",
-       "┃ dep_time  flight  origin  dest    air_time  distance  carrier  date        arr_delay  time_hour            unique_key          cnt   ┃\n",
-       "┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n",
-       "│ timeint64stringstringint64int64stringdateint64timestamp(6)stringint64 │\n",
-       "├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n",
-       "│ 19:59:001022EWR   IAH   1671400UA     2013-09-1402013-09-14 23:00:00UA,1022,2013-09-142 │\n",
-       "│ 20:00:001022EWR   IAH   1861400UA     2013-09-1402013-09-14 00:00:00UA,1022,2013-09-142 │\n",
-       "│ 19:12:001023LGA   ORD   112733UA     2013-05-2902013-05-29 23:00:00UA,1023,2013-05-292 │\n",
-       "│ 21:16:001023EWR   IAH   1751400UA     2013-05-2902013-05-29 01:00:00UA,1023,2013-05-292 │\n",
-       "│ 15:18:001052EWR   IAH   1741400UA     2013-08-2702013-08-27 19:00:00UA,1052,2013-08-272 │\n",
-       "│ 21:22:001052EWR   IAH   1731400UA     2013-08-2702013-08-27 01:00:00UA,1052,2013-08-272 │\n",
-       "│ 18:39:001053EWR   CLE   72404UA     2013-12-2002013-12-20 23:00:00UA,1053,2013-12-202 │\n",
-       "│ 19:27:001053EWR   CLE   69404UA     2013-12-2002013-12-20 00:00:00UA,1053,2013-12-202 │\n",
-       "│ 20:16:001071EWR   BQN   1961585UA     2013-02-2602013-02-26 01:00:00UA,1071,2013-02-262 │\n",
-       "│ 17:20:001071EWR   PHX   2812133UA     2013-02-2602013-02-26 22:00:00UA,1071,2013-02-262 │\n",
-       "│  │\n",
-       "└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘\n",
-       "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mdep_time\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mflight\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1morigin\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdest\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mair_time\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdistance\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcarrier\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1marr_delay\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtime_hour\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1munique_key\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcnt\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n", - "│ \u001b[2mtime\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mtimestamp(6)\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │\n", - "├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n", - "│ \u001b[35m19:59:00\u001b[0m │ \u001b[1;36m1022\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAH \u001b[0m │ \u001b[1;36m167\u001b[0m │ \u001b[1;36m1400\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-09-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-09-14 23:00:00\u001b[0m │ \u001b[32mUA,1022,2013-09-14\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m20:00:00\u001b[0m │ \u001b[1;36m1022\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAH \u001b[0m │ \u001b[1;36m186\u001b[0m │ \u001b[1;36m1400\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-09-14\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-09-14 00:00:00\u001b[0m │ \u001b[32mUA,1022,2013-09-14\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m19:12:00\u001b[0m │ \u001b[1;36m1023\u001b[0m │ \u001b[32mLGA \u001b[0m │ \u001b[32mORD \u001b[0m │ \u001b[1;36m112\u001b[0m │ \u001b[1;36m733\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-05-29\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-05-29 23:00:00\u001b[0m │ \u001b[32mUA,1023,2013-05-29\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m21:16:00\u001b[0m │ \u001b[1;36m1023\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAH \u001b[0m │ \u001b[1;36m175\u001b[0m │ \u001b[1;36m1400\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-05-29\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-05-29 01:00:00\u001b[0m │ \u001b[32mUA,1023,2013-05-29\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m15:18:00\u001b[0m │ \u001b[1;36m1052\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAH \u001b[0m │ \u001b[1;36m174\u001b[0m │ \u001b[1;36m1400\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-08-27\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-08-27 19:00:00\u001b[0m │ \u001b[32mUA,1052,2013-08-27\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m21:22:00\u001b[0m │ 
\u001b[1;36m1052\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mIAH \u001b[0m │ \u001b[1;36m173\u001b[0m │ \u001b[1;36m1400\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-08-27\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-08-27 01:00:00\u001b[0m │ \u001b[32mUA,1052,2013-08-27\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m18:39:00\u001b[0m │ \u001b[1;36m1053\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mCLE \u001b[0m │ \u001b[1;36m72\u001b[0m │ \u001b[1;36m404\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-12-20\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-12-20 23:00:00\u001b[0m │ \u001b[32mUA,1053,2013-12-20\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m19:27:00\u001b[0m │ \u001b[1;36m1053\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mCLE \u001b[0m │ \u001b[1;36m69\u001b[0m │ \u001b[1;36m404\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-12-20\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-12-20 00:00:00\u001b[0m │ \u001b[32mUA,1053,2013-12-20\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m20:16:00\u001b[0m │ \u001b[1;36m1071\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mBQN \u001b[0m │ \u001b[1;36m196\u001b[0m │ \u001b[1;36m1585\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-02-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-26 01:00:00\u001b[0m │ \u001b[32mUA,1071,2013-02-26\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[35m17:20:00\u001b[0m │ \u001b[1;36m1071\u001b[0m │ \u001b[32mEWR \u001b[0m │ \u001b[32mPHX \u001b[0m │ \u001b[1;36m281\u001b[0m │ \u001b[1;36m2133\u001b[0m │ \u001b[32mUA \u001b[0m │ \u001b[35m2013-02-26\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[35m2013-02-26 22:00:00\u001b[0m │ \u001b[32mUA,1071,2013-02-26\u001b[0m │ \u001b[1;36m2\u001b[0m │\n", - "│ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │\n", - "└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# FIXME(deepyaman): Proposed key isn't unique for actual departure date.\n", - "flight_data_with_unique_key.group_by(\"unique_key\").mutate(\n", - " cnt=flight_data_with_unique_key.count()\n", - ")[ibis._.cnt > 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, "id": "6be459de-73cd-4d6e-a195-41b9e5c481a6", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "outputs": [], "source": [ - "import random\n", - "\n", - "# Fix the random numbers by setting the seed\n", - "# This enables the analysis to be reproducible when random numbers are used\n", - "random.seed(222)\n", - "\n", - "# Put 3/4 of the data into the training set\n", - "random_key = str(random.getrandbits(256))\n", - "data_split = flight_data_with_unique_key.mutate(\n", - " train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3\n", - ")\n", + "import ibis_ml as ml\n", "\n", "# Create data frames for the two sets:\n", - "train_data = data_split[data_split.train].drop(\"unique_key\", \"train\")\n", - "test_data = data_split[~data_split.train].drop(\"unique_key\", \"train\")" + "train_data, test_data = ml.train_test_split(\n", + " flight_data,\n", + " 
unique_key=[\"carrier\", \"flight\", \"date\"],\n", + " # Put 3/4 of the data into the training set\n", + " test_size=0.25,\n", + " num_buckets=4,\n", + " # Fix the random numbers by setting the seed\n", + " # This enables the analysis to be reproducible when random numbers are used\n", + " random_seed=222,\n", + ")" ] }, { @@ -529,13 +411,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "a223b57d-31b7-4ad1-88fd-a216de7da01a", "metadata": {}, "outputs": [], "source": [ - "import ibis_ml as ml\n", - "\n", "flights_rec = ml.Recipe(\n", " ml.ExpandDate(\"date\", components=[\"dow\", \"month\"]),\n", " ml.Drop(\"date\"),\n", @@ -561,14 +441,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "161b43a0-a3fc-4da3-a5ab-810b234bae32", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "042b3b33d16f421f9c6242642a03c55b", + "model_id": "a80c19c4c8664af0b2e916e2cf36158a", "version_major": 2, "version_minor": 0 }, @@ -576,7 +456,7 @@ "RadioButtons(description='Library:', index=2, options=('scikit-learn', 'XGBoost', 'skorch (PyTorch)'), value='…" ] }, - "execution_count": 12, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -610,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "dc04f24e-c8cb-4580-b502-a9410c64a126", "metadata": {}, "outputs": [], @@ -668,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "id": "42ac1426-0561-4a8b-a949-127b2b0c4f01", "metadata": {}, "outputs": [ @@ -678,16 +558,16 @@ "text": [ " epoch train_loss valid_acc valid_loss dur\n", "------- ------------ ----------- ------------ ------\n", - " 1 \u001b[36m4.4971\u001b[0m \u001b[32m0.8388\u001b[0m \u001b[35m2.5698\u001b[0m 1.0492\n", - " 2 \u001b[36m4.4671\u001b[0m 0.8388 2.5698 1.0529\n", - " 3 \u001b[36m4.4625\u001b[0m 0.8388 2.5698 1.0129\n", - " 4 \u001b[36m4.4451\u001b[0m 0.8388 2.5698 1.0246\n", - " 5 4.4488 0.8388 2.5698 1.0251\n", - " 6 4.4553 0.8388 2.5698 0.9891\n", - " 7 4.4630 0.8388 2.5698 1.0836\n", - " 8 4.4847 0.8388 2.5698 1.2162\n", - " 9 4.4798 0.8388 2.5698 1.2594\n", - " 10 4.4799 0.8388 2.5698 1.0920\n" + " 1 \u001b[36m2.4584\u001b[0m \u001b[32m0.8386\u001b[0m \u001b[35m2.5725\u001b[0m 0.9928\n", + " 2 \u001b[36m2.4424\u001b[0m 0.8386 2.5725 0.8958\n", + " 3 \u001b[36m2.4395\u001b[0m 0.8386 2.5725 0.9216\n", + " 4 2.4404 0.8386 2.5725 0.8905\n", + " 5 2.4411 0.8386 2.5725 0.8881\n", + " 6 2.4434 0.8386 2.5725 0.8884\n", + " 7 2.4442 0.8386 2.5725 0.9096\n", + " 8 \u001b[36m2.4391\u001b[0m 0.8386 2.5725 1.0850\n", + " 9 2.4432 0.8386 2.5725 0.9073\n", + " 10 \u001b[36m2.4354\u001b[0m 0.8386 2.5725 0.9601\n" ] }, { @@ -1143,7 +1023,7 @@ " DropZeroVariance(everything(), tolerance=0.0001),\n", " MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())),\n", " MutateAt(timestamp(), _.epoch_seconds()),\n", - " Cast(numeric(), 'float32'))
ExpandDate(cols(('date',)), components=['dow', 'month'])
Drop(cols(('date',)))
TargetEncode(nominal(), smooth=0.0)
DropZeroVariance(everything(), tolerance=0.0001)
MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute()))
MutateAt(timestamp(), _.epoch_seconds())
Cast(numeric(), 'float32')
<class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n",
+       "       Cast(numeric(), 'float32'))
ExpandDate(cols(('date',)), components=['dow', 'month'])
Drop(cols(('date',)))
TargetEncode(nominal(), smooth=0.0)
DropZeroVariance(everything(), tolerance=0.0001)
MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute()))
MutateAt(timestamp(), _.epoch_seconds())
Cast(numeric(), 'float32')
<class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n",
        "  module_=MyModule(\n",
        "    (dense0): Linear(in_features=10, out_features=10, bias=True)\n",
        "    (nonlin): ReLU()\n",
@@ -1178,7 +1058,7 @@
        "))])"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1201,17 +1081,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 13,
    "id": "be3ff129-d56f-4441-acbc-da7d6cd93d19",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0.8385534190130481"
+       "0.8390849833968762"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
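
All three tutorials now delegate the split to `ml.train_test_split`, which hashes a unique key into a fixed number of buckets and routes each row to train or test. A rough sketch of that idea, assuming hash-bucket semantics like the hand-rolled version it replaces (the helper name `split_by_hash` is hypothetical, not the library's internals):

```python
import ibis

# Hypothetical sketch of a deterministic hash-bucket split; assumes
# semantics similar to ml.train_test_split, not its actual internals.
def split_by_hash(table, unique_key, num_buckets=4, test_buckets=1, random_seed=222):
    # Join the key columns into one string, salt with the seed, and
    # hash into num_buckets buckets.
    key = ibis.literal(",").join([table[c].cast("string") for c in unique_key])
    bucket = (key + str(random_seed)).hash().abs() % num_buckets
    # With 4 buckets and 1 test bucket, ~3/4 of rows land in train.
    return table.filter(bucket >= test_buckets), table.filter(bucket < test_buckets)
```
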
diff --git a/ibis_ml/__init__.py b/ibis_ml/__init__.py
index 5fa7ac0..f422c64 100644
--- a/ibis_ml/__init__.py
+++ b/ibis_ml/__init__.py
@@ -1,6 +1,6 @@
 """IbisML is a library for building scalable ML pipelines using Ibis."""
 
-__version__ = "0.1.2"
+__version__ = "0.1.3"
 
 import pprint
 
@@ -44,11 +44,12 @@ def _auto_patch_skorch() -> None:
         return
 
     import ibis.expr.types as ir
-    import numpy as np
 
     old_fit = skorch.net.NeuralNet.fit
 
     def fit(self, X, y=None, **fit_params):
+        import numpy as np
+
         if isinstance(y, ir.Column):
             y = np.asarray(y)
 
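
Moving the `numpy` import inside `fit` keeps `import ibis_ml` light: the dependency is only paid when training actually happens. A minimal sketch of the resulting patch pattern (`old_fit` is the captured original, as in the hunk above):

```python
import ibis.expr.types as ir
import skorch

old_fit = skorch.net.NeuralNet.fit

def fit(self, X, y=None, **fit_params):
    import numpy as np  # deferred: only needed once fit() runs

    if isinstance(y, ir.Column):
        y = np.asarray(y)  # materialize the Ibis column for skorch
    return old_fit(self, X, y, **fit_params)

skorch.net.NeuralNet.fit = fit
```
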
diff --git a/ibis_ml/core.py b/ibis_ml/core.py
index 72f314a..6d21b9d 100644
--- a/ibis_ml/core.py
+++ b/ibis_ml/core.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import copy
+import inspect
 import os
 import pprint
 from collections import defaultdict
@@ -12,14 +13,14 @@
 import ibis
 import ibis.expr.operations as ops
 import ibis.expr.types as ir
-import numpy as np
-import pandas as pd
-import pyarrow as pa
 from ibis.common.dispatch import lazy_singledispatch
 
 if TYPE_CHECKING:
     import dask.dataframe as dd
+    import numpy as np
+    import pandas as pd
     import polars as pl
+    import pyarrow as pa
     import xgboost as xgb
     from sklearn.utils._estimator_html_repr import _VisualBlock
 
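
Heavy dependencies move under `TYPE_CHECKING`, so they are imported for annotations only; runtime code imports them lazily inside the functions that need them. The pattern in miniature:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers; no import cost at runtime.
    import pandas as pd

def first_row(df: pd.DataFrame) -> dict:
    import pandas as pd  # deferred until the function is actually called

    assert isinstance(df, pd.DataFrame)
    return df.iloc[0].to_dict()
```
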
@@ -45,6 +46,9 @@ def _ibis_table_to_numpy(table: ir.Table) -> np.ndarray:
 
 def _y_as_dataframe(y: Any) -> pd.DataFrame:
     """Coerce `y` to a pandas dataframe"""
+    import numpy as np
+    import pandas as pd
+
     if isinstance(y, pd.DataFrame):
         return y
     elif isinstance(y, pd.Series):
@@ -144,8 +148,11 @@ def _(X, y=None, maintain_order=False):
     return table, tuple(y.columns), None
 
 
-@normalize_table.register(pd.DataFrame)
+@normalize_table.register("pd.DataFrame")
 def _(X, y=None, maintain_order=False):
+    import numpy as np
+    import pandas as pd
+
     if y is not None:
         y = _y_as_dataframe(y)
         table = pd.concat([X, y], axis=1)
@@ -162,8 +169,11 @@ def _(X, y=None, maintain_order=False):
     return ibis.memtable(table), targets, index
 
 
-@normalize_table.register(np.ndarray)
+@normalize_table.register("np.ndarray")
 def _(X, y=None, maintain_order=False):
+    import numpy as np
+    import pandas as pd
+
     X = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[-1])])
     if y is not None:
         y = _y_as_dataframe(y)
@@ -181,8 +191,11 @@ def _(X, y=None, maintain_order=False):
     return ibis.memtable(table), targets, index
 
 
-@normalize_table.register(pa.Table)
+@normalize_table.register("pa.Table")
 def _(X, y=None, maintain_order=False):
+    import numpy as np
+    import pyarrow as pa
+
     if y is not None:
         if isinstance(y, (pa.ChunkedArray, pa.Array)):
             y = pa.Table.from_pydict({"y": y})
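
Registering `normalize_table` implementations by type *name* complements the lazy imports: `lazy_singledispatch` resolves the string only when a matching object is dispatched, so pandas, numpy, and pyarrow never load just to set up the registry. A small sketch of the same idiom:

```python
from ibis.common.dispatch import lazy_singledispatch

@lazy_singledispatch
def nrows(obj):
    raise NotImplementedError(f"no handler for {type(obj)}")

# The string is resolved lazily; pandas is imported only if a
# DataFrame is actually passed to nrows().
@nrows.register("pd.DataFrame")
def _(df):
    return len(df)
```
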
@@ -246,6 +259,8 @@ def get_categories(self, column: str) -> pa.Array | None:
         return self.categories.get(column)
 
     def set_categories(self, column: str, values: pa.Array | list[Any]) -> None:
+        import pyarrow as pa
+
         self.categories[column] = pa.array(values)
 
     def drop_categories(self, column: str) -> None:
@@ -255,6 +270,8 @@ def drop_categories(self, column: str) -> None:
 def _categorize_wrap_reader(
     reader: pa.RecordBatchReader, categories: dict[str, pa.Array]
 ) -> Iterable[pa.RecordBatch]:
+    import pyarrow as pa
+
     for batch in reader:
         out = {}
         for name, col in zip(batch.schema.names, batch.columns):
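
`_categorize_wrap_reader` re-attaches dictionary categories to each batch streamed out of a `RecordBatchReader`. A simplified, single-batch sketch of what the loop does (illustrative; the helper above handles the full stream):

```python
import pyarrow as pa

categories = {"color": pa.array(["red", "green", "blue"])}
batch = pa.record_batch({"color": pa.array([0, 2, 1], type=pa.int8())})

out = {}
for name, col in zip(batch.schema.names, batch.columns):
    if name in categories:
        # Reinterpret the integer codes as a dictionary-encoded column.
        col = pa.DictionaryArray.from_arrays(col, categories[name])
    out[name] = col
repaired = pa.RecordBatch.from_pydict(out)
```
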
@@ -290,6 +307,102 @@ def categorize(df: pd.DataFrame, categories: dict[str, list[Any]]) -> pd.DataFra
 
 
 class Step:
+    @classmethod
+    def _get_param_names(cls) -> list[str]:
+        """Get parameter names for the estimator.
+
+        Notes
+        -----
+        Copied from [1]_.
+
+        References
+        ----------
+        .. [1] https://github.com/scikit-learn/scikit-learn/blob/ab2f539/sklearn/base.py#L148-L173
+        """
+        # fetch the constructor or the original constructor before
+        # deprecation wrapping if any
+        init = getattr(cls.__init__, "deprecated_original", cls.__init__)
+        if init is object.__init__:
+            # No explicit constructor to introspect
+            return []
+
+        # introspect the constructor arguments to find the model parameters
+        # to represent
+        init_signature = inspect.signature(init)
+        # Consider the constructor parameters excluding 'self'
+        parameters = [
+            p
+            for p in init_signature.parameters.values()
+            if p.name != "self" and p.kind != p.VAR_KEYWORD
+        ]
+        for p in parameters:
+            if p.kind == p.VAR_POSITIONAL:
+                raise RuntimeError(
+                    "scikit-learn estimators should always "
+                    "specify their parameters in the signature"
+                    " of their __init__ (no varargs)."
+                    f" {cls} with constructor {init_signature} doesn't "
+                    " follow this convention."
+                )
+        # Extract and sort argument names excluding 'self'
+        return sorted([p.name for p in parameters])
+
+    def _get_params(self) -> dict[str, Any]:
+        """Get parameters for this step.
+
+        Returns
+        -------
+        params : dict
+            Parameter names mapped to their values.
+
+        Notes
+        -----
+        Derived from [1]_.
+
+        References
+        ----------
+        .. [1] https://github.com/scikit-learn/scikit-learn/blob/626b460/sklearn/base.py#L145-L167
+        """
+        return {key: getattr(self, key) for key in self._get_param_names()}
+
+    def _set_params(self, **params):
+        """Set the parameters of this step.
+
+        Parameters
+        ----------
+        **params : dict
+            Step parameters.
+
+        Returns
+        -------
+        self : object
+            Step class instance.
+
+        Notes
+        -----
+        Derived from [1]_.
+
+        References
+        ----------
+        .. [1] https://github.com/scikit-learn/scikit-learn/blob/74016ab/sklearn/base.py#L214-L256
+        """
+        if not params:
+            # Simple optimization to gain speed (inspect is slow)
+            return self
+
+        valid_params = self._get_param_names()
+
+        for key, value in params.items():
+            if key not in valid_params:
+                raise ValueError(
+                    f"Invalid parameter {key!r} for step {self}. "
+                    f"Valid parameters are: {valid_params!r}."
+                )
+
+            setattr(self, key, value)
+
+        return self
+
     def __repr__(self) -> str:
         return pprint.pformat(self)
 
@@ -373,7 +486,7 @@ def _name_estimators(estimators):
 
 class Recipe:
     def __init__(self, *steps: Step):
-        self.steps = steps
+        self.steps = list(steps)
         self._output_format = "default"
 
     def __repr__(self):
@@ -385,16 +498,16 @@ def output_format(self) -> Literal["default", "pandas", "pyarrow", "polars"]:
         return self._output_format
 
     def get_params(self, deep=True) -> dict[str, Any]:
-        """Get parameters for this estimator.
+        """Get parameters for this recipe.
 
         Returns the parameters given in the constructor as well as the
-        estimators contained within the `steps` of the `Recipe`.
+        steps contained within the `Recipe`.
 
         Parameters
         ----------
         deep : bool, default=True
-            If True, will return the parameters for this estimator and
-            contained subobjects that are estimators.
+            If True, will return the parameters for this recipe and
+            contained steps.
 
         Returns
         -------
@@ -413,18 +526,77 @@ def get_params(self, deep=True) -> dict[str, Any]:
         if not deep:
             return out
 
-        estimators = _name_estimators(self.steps)
-        out.update(estimators)
+        steps = _name_estimators(self.steps)
+        out.update(steps)
 
-        for name, estimator in estimators:
-            if hasattr(estimator, "get_params"):
-                for key, value in estimator.get_params(deep=True).items():
-                    out[f"{name}__{key}"] = value
+        for name, step in steps:
+            for key, value in step._get_params().items():  # noqa: SLF001
+                out[f"{name}__{key}"] = value
         return out
 
-    def set_params(self, **kwargs):
-        if "steps" in kwargs:
-            self.steps = kwargs.get("steps")
+    def set_params(self, **params):
+        """Set the parameters of this recipe.
+
+        Valid parameter keys can be listed with ``get_params()``. Note that
+        you can directly set the parameters of the steps contained in
+        `steps`.
+
+        Parameters
+        ----------
+        **params : dict
+            Parameters of this recipe or parameters of steps contained
+            in `steps`. Parameters of the steps may be set using the step
+            name and the parameter name separated by a '__'.
+
+        Returns
+        -------
+        self : object
+            Recipe class instance.
+
+        Notes
+        -----
+        Derived from [1]_ and [2]_.
+
+        References
+        ----------
+        .. [1] https://github.com/scikit-learn/scikit-learn/blob/ff1c6f3/sklearn/utils/metaestimators.py#L51-L70
+        .. [2] https://github.com/scikit-learn/scikit-learn/blob/74016ab/sklearn/base.py#L214-L256
+        """
+        if not params:
+            # Simple optimization to gain speed (inspect is slow)
+            return self
+
+        # Ensure strict ordering of parameter setting:
+        # 1. All steps
+        if "steps" in params:
+            self.steps = params.pop("steps")
+
+        # 2. Replace individual steps with any step objects passed by name
+        estimator_name_indexes = {
+            x: i for i, x in enumerate(name for name, _ in _name_estimators(self.steps))
+        }
+        for name in list(params):
+            if "__" not in name and name in estimator_name_indexes:
+                self.steps[estimator_name_indexes[name]] = params.pop(name)
+
+        # 3. Step parameters and other initialisation arguments
+        valid_params = self.get_params(deep=True)
+
+        nested_params = defaultdict(dict)  # grouped by prefix
+        for key, value in params.items():
+            # partition() tolerates keys without "__"; such keys fail the
+            # validity check below instead of raising an unpacking error
+            key, _sep, sub_key = key.partition("__")
+            if key not in valid_params:
+                raise ValueError(
+                    f"Invalid parameter {key!r} for recipe {self}. "
+                    f"Valid parameters are: ['steps']."
+                )
+
+            nested_params[key][sub_key] = value
+
+        for key, sub_params in nested_params.items():
+            valid_params[key]._set_params(**sub_params)  # noqa: SLF001
+
+        return self
 
     def set_output(
         self,
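
Together, `get_params`/`set_params` let a `Recipe` participate in sklearn tooling such as `GridSearchCV`, including the nested `step__parameter` convention. A usage sketch (assuming step names are the lowercased class names produced by `_name_estimators`):

```python
import ibis_ml as ml

rec = ml.Recipe(
    ml.ExpandDate(ml.date(), components=["dow", "month"]),
    ml.Drop(ml.date()),
)

# Nested keys address a step's own parameters...
rec.set_params(expanddate__components=["dow", "month", "year"])

# ...and bare step names replace whole steps, which is why
# Recipe.__init__ now stores steps as a mutable list.
rec.set_params(drop=ml.Drop(ml.timestamp()))
```
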
@@ -433,8 +605,6 @@ def set_output(
     ) -> Recipe:
         """Set output type returned by `transform`.
 
-        This is part of the standard Scikit-Learn API.
-
         Parameters
         ----------
         transform : {"default", "pandas"}, default=None
@@ -620,6 +790,8 @@ def _categorize_pandas(self, df: pd.DataFrame) -> pd.DataFrame:
         return df
 
     def _categorize_pyarrow(self, table: pa.Table) -> pa.Table:
+        import pyarrow as pa
+
         if not self.metadata_.categories:
             return table
 
@@ -645,6 +817,8 @@ def _categorize_dask_dataframe(self, ddf: dd.DataFrame) -> dd.DataFrame:
     def _categorize_pyarrow_batches(
         self, reader: pa.RecordBatchReader
     ) -> pa.RecordBatchReader:
+        import pyarrow as pa
+
         if not self.metadata_.categories:
             return reader
 
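
`set_output` keeps its sklearn-compatible behavior even though the docstring no longer advertises it. A quick sketch of how it is assumed to interact with `output_format` (mirroring sklearn's `set_output` API):

```python
import ibis_ml as ml

rec = ml.Recipe(ml.Cast(ml.numeric(), "float32"))

# transform="pandas" asks transform() to return pandas objects;
# output_format reflects the current setting.
rec.set_output(transform="pandas")
print(rec.output_format)  # "pandas"
```
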
diff --git a/ibis_ml/steps/__init__.py b/ibis_ml/steps/__init__.py
index 61f8156..73c5d72 100644
--- a/ibis_ml/steps/__init__.py
+++ b/ibis_ml/steps/__init__.py
@@ -6,7 +6,7 @@
 from ibis_ml.steps._impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
 from ibis_ml.steps._select_features import DropZeroVariance
 from ibis_ml.steps._standardize import ScaleMinMax, ScaleStandard
-from ibis_ml.steps._temporal import ExpandDate, ExpandDateTime, ExpandTime
+from ibis_ml.steps._temporal import ExpandDate, ExpandTime, ExpandTimestamp
 
 __all__ = (
     "Cast",
@@ -16,8 +16,8 @@
     "Drop",
     "DropZeroVariance",
     "ExpandDate",
-    "ExpandDateTime",
     "ExpandTime",
+    "ExpandTimestamp",
     "FillNA",
     "HandleUnivariateOutliers",
     "ImputeMean",
diff --git a/ibis_ml/steps/_common.py b/ibis_ml/steps/_common.py
index 0c8b67c..ec8ef59 100644
--- a/ibis_ml/steps/_common.py
+++ b/ibis_ml/steps/_common.py
@@ -135,6 +135,11 @@ def __init__(
         self.expr = expr
         self.named_exprs = named_exprs
 
+    @classmethod
+    def _get_param_names(cls) -> list[str]:
+        """Get parameter names for the estimator."""
+        return ["expr", "inputs", "named_exprs"]
+
     def _repr(self) -> Iterable[tuple[str, Any]]:
         yield ("", self.inputs)
         if self.expr is not None:
@@ -191,11 +196,15 @@ def __init__(
         self.exprs = exprs
         self.named_exprs = named_exprs
 
+    @classmethod
+    def _get_param_names(cls) -> list[str]:
+        """Get parameter names for the estimator."""
+        return ["exprs", "named_exprs"]
+
     def _repr(self) -> Iterable[tuple[str, Any]]:
         for expr in self.exprs:
             yield "", expr
-        for name, expr in self.named_exprs.items():
-            yield name, expr
+        yield from self.named_exprs.items()
 
     def is_fitted(self):
         return True
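
`MutateAt` and `Mutate` accept `*exprs`/`**named_exprs`, which the generic signature introspection cannot represent (it raises on varargs and skips `**kwargs`), hence the explicit `_get_param_names` overrides. The limitation in miniature:

```python
import inspect

class Mutate:  # same signature shape as the steps above
    def __init__(self, *exprs, **named_exprs):
        self.exprs = exprs
        self.named_exprs = named_exprs

kinds = {
    p.name: p.kind.name
    for p in inspect.signature(Mutate.__init__).parameters.values()
    if p.name != "self"
}
print(kinds)  # {'exprs': 'VAR_POSITIONAL', 'named_exprs': 'VAR_KEYWORD'}
# Neither kind survives the sklearn-style introspection, so the steps
# pin their parameter names explicitly.
```
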
diff --git a/ibis_ml/steps/_discretize.py b/ibis_ml/steps/_discretize.py
index b40ad1b..50ae92a 100644
--- a/ibis_ml/steps/_discretize.py
+++ b/ibis_ml/steps/_discretize.py
@@ -4,7 +4,6 @@
 
 import ibis
 import ibis.expr.types as ir
-import numpy as np
 
 from ibis_ml.core import Metadata, Step
 from ibis_ml.select import SelectionType, selector
@@ -94,6 +93,8 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
     def _fit_uniform_strategy(
         self, table: ir.Table, columns: list[str]
     ) -> dict[str, list[float]]:
+        import numpy as np
+
         aggs = []
         for col_name in columns:
             col = table[col_name]
@@ -117,6 +118,8 @@ def _fit_uniform_strategy(
     def _fit_quantile_strategy(
         self, table: ir.Table, columns: list[str]
     ) -> dict[str, list[float]]:
+        import numpy as np
+
         aggs = []
         percentiles = np.linspace(0, 1, self.n_bins + 1)
         for col_name in columns:
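
The quantile strategy builds its cut points from `n_bins + 1` evenly spaced probabilities, whose sample quantiles become the bin edges; `numpy` is now imported inside the fitting methods rather than at module load. The edge computation in isolation:

```python
import numpy as np

n_bins = 4
percentiles = np.linspace(0, 1, n_bins + 1)  # [0.   0.25 0.5  0.75 1.  ]
data = np.array([1.0, 2.0, 2.5, 3.0, 10.0])
print(np.quantile(data, percentiles))  # [ 1.   2.   2.5  3.  10. ]
```
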
diff --git a/ibis_ml/steps/_handle_outliers.py b/ibis_ml/steps/_handle_outliers.py
index 883375d..140e18a 100644
--- a/ibis_ml/steps/_handle_outliers.py
+++ b/ibis_ml/steps/_handle_outliers.py
@@ -142,7 +142,11 @@ def transform_table(self, table: ir.Table) -> ir.Table:
                     (
                         (table[col_name] >= stat["lower_bound"])
                         & (table[col_name] <= stat["upper_bound"])
-                        | (table[col_name].isnull() | table[col_name].isnan())  # noqa: PD003
+                        | (
+                            table[col_name].isnull()  # noqa: PD003
+                            | (
+                                table[col_name].isnan()
+                                if table[col_name].type().is_floating()
+                                else False
+                            )
+                        )
                     )
                     for col_name, stat in self.stats_.items()
                 ]
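
The guard matters because `.isnan()` only exists for floating-point columns; building the NaN clause unconditionally would fail on integer inputs. One way to express the type-aware predicate (a sketch, not the step's exact code):

```python
import ibis

t = ibis.table({"x": "float64"}, name="t")
col = t.x

# Keep in-range rows plus nulls; add the NaN clause only for
# floating-point columns, where .isnan() is defined.
keep = ((col >= 0.0) & (col <= 100.0)) | col.isnull()
if col.type().is_floating():
    keep = keep | col.isnan()
```
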
diff --git a/ibis_ml/steps/_temporal.py b/ibis_ml/steps/_temporal.py
index eee700f..83e7a8c 100644
--- a/ibis_ml/steps/_temporal.py
+++ b/ibis_ml/steps/_temporal.py
@@ -13,8 +13,8 @@
 _DOCS_PAGE_NAME = "temporal-feature-extraction"
 
 
-class ExpandDateTime(Step):
-    """A step for expanding date and time columns into one or more features.
+class ExpandDate(Step):
+    """A step for expanding date columns into one or more features.
 
     New features will be named ``{input_column}_{component}``. For example, if
     expanding a ``"year"`` component from column ``"x"``, the feature column
@@ -23,9 +23,9 @@ class ExpandDateTime(Step):
     Parameters
     ----------
     inputs
-        A selection of date and time columns to expand into new features.
+        A selection of date columns to expand into new features.
     components
-        A sequence of date or time components to expand. Options include
+        A sequence of components to expand. Options include
 
         - ``day``: the day of the month as a numeric value
         - ``week``: the week of the year as a numeric value
@@ -33,43 +33,30 @@ class ExpandDateTime(Step):
         - ``year``: the year as a numeric value
         - ``dow``: the day of the week as a categorical value
         - ``doy``: the day of the year as a numeric value
-        - ``hour``: the hour as a numeric value
-        - ``minute``: the minute as a numeric value
-        - ``second``: the second as a numeric value
-        - ``millisecond``: the millisecond as a numeric value
 
-        Defaults to ``["dow", "month", "year", "hour", "minute", "second"]``.
+        Defaults to ``["dow", "month", "year"]``.
 
     Examples
     --------
     >>> import ibis_ml as ml
 
-    Expand date and time columns using the default components
+    Expand date columns using the default components
 
-    >>> step = ml.ExpandDateTime(ml.timestamp())
+    >>> step = ml.ExpandDate(ml.date())
 
-    Expand specific columns using specific components for date and time
+    Expand specific columns using specific components
 
-    >>> step = ml.ExpandDateTime(["x", "y"], ["day", "year", "hour"])
+    >>> step = ml.ExpandDate(["x", "y"], ["day", "year"])
     """
 
     def __init__(
         self,
         inputs: SelectionType,
-        components: list[
-            Literal[
-                "day",
-                "week",
-                "month",
-                "year",
-                "dow",
-                "doy",
-                "hour",
-                "minute",
-                "second",
-                "millisecond",
-            ]
-        ] = ("dow", "month", "year", "hour", "minute", "second"),
+        components: Sequence[Literal["day", "week", "month", "year", "dow", "doy"]] = (
+            "dow",
+            "month",
+            "year",
+        ),
     ):
         self.inputs = selector(inputs)
         self.components = list(components)
@@ -80,7 +67,6 @@ def _repr(self) -> Iterable[tuple[str, Any]]:
 
     def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         columns = self.inputs.select_columns(table, metadata)
-
         if "month" in self.components:
             for col in columns:
                 metadata.set_categories(
@@ -114,12 +100,10 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
                         "Sunday",
                     ],
                 )
-
         self.columns_ = columns
 
     def transform_table(self, table: ir.Table) -> ir.Table:
         new_cols = []
-
         for name in self.columns_:
             col = table[name]
             for comp in self.components:
@@ -135,7 +119,65 @@ def transform_table(self, table: ir.Table) -> ir.Table:
                     feat = col.day_of_week.index()
                 elif comp == "doy":
                     feat = col.day_of_year()
-                elif comp == "hour":
+                new_cols.append(feat.name(f"{name}_{comp}"))
+        return table.mutate(new_cols)
+
+
+class ExpandTime(Step):
+    """A step for expanding time columns into one or more features.
+
+    New features will be named ``{input_column}_{component}``. For example, if
+    expanding an ``"hour"`` component from column ``"x"``, the feature column
+    would be named ``"x_hour"``.
+
+    Parameters
+    ----------
+    inputs
+        A selection of time columns to expand into new features.
+    components
+        A sequence of components to expand. Options include ``hour``,
+        ``minute``, ``second``, and ``millisecond``.
+
+        Defaults to ``["hour", "minute", "second"]``.
+
+    Examples
+    --------
+    >>> import ibis_ml as ml
+
+    Expand time columns using the default components
+
+    >>> step = ml.ExpandTime(ml.time())
+
+    Expand specific columns using specific components
+
+    >>> step = ml.ExpandTime(["x", "y"], ["hour", "minute"])
+    """
+
+    def __init__(
+        self,
+        inputs: SelectionType,
+        components: Sequence[Literal["hour", "minute", "second", "millisecond"]] = (
+            "hour",
+            "minute",
+            "second",
+        ),
+    ):
+        self.inputs = selector(inputs)
+        self.components = list(components)
+
+    def _repr(self) -> Iterable[tuple[str, Any]]:
+        yield ("", self.inputs)
+        yield ("components", self.components)
+
+    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
+        self.columns_ = self.inputs.select_columns(table, metadata)
+
+    def transform_table(self, table: ir.Table) -> ir.Table:
+        new_cols = []
+        for name in self.columns_:
+            col = table[name]
+            for comp in self.components:
+                if comp == "hour":
                     feat = col.hour()
                 elif comp == "minute":
                     feat = col.minute()
@@ -144,12 +186,11 @@ def transform_table(self, table: ir.Table) -> ir.Table:
                 elif comp == "millisecond":
                     feat = col.millisecond()
                 new_cols.append(feat.name(f"{name}_{comp}"))
-
         return table.mutate(new_cols)
 
 
-class ExpandDate(Step):
-    """A step for expanding date columns into one or more features.
+class ExpandTimestamp(Step):
+    """A step for expanding timestamp columns into one or more features.
 
     New features will be named ``{input_column}_{component}``. For example, if
     expanding a ``"year"`` component from column ``"x"``, the feature column
@@ -158,9 +199,9 @@ class ExpandDate(Step):
     Parameters
     ----------
     inputs
-        A selection of date columns to expand into new features.
+        A selection of timestamp columns to expand into new features.
     components
-        A sequence of components to expand. Options include
+        A sequence of date or time components to expand. Options include
 
         - ``day``: the day of the month as a numeric value
         - ``week``: the week of the year as a numeric value
@@ -168,30 +209,43 @@ class ExpandDate(Step):
         - ``year``: the year as a numeric value
         - ``dow``: the day of the week as a categorical value
         - ``doy``: the day of the year as a numeric value
+        - ``hour``: the hour as a numeric value
+        - ``minute``: the minute as a numeric value
+        - ``second``: the second as a numeric value
+        - ``millisecond``: the millisecond as a numeric value
 
-        Defaults to ``["dow", "month", "year"]``.
+        Defaults to ``["dow", "month", "year", "hour", "minute", "second"]``.
 
     Examples
     --------
     >>> import ibis_ml as ml
 
-    Expand date columns using the default components
+    Expand timestamp columns using the default components
 
-    >>> step = ml.ExpandDate(ml.date())
+    >>> step = ml.ExpandTimestamp(ml.timestamp())
 
     Expand specific columns using specific components
 
-    >>> step = ml.ExpandDate(["x", "y"], ["day", "year"])
+    >>> step = ml.ExpandTimestamp(["x", "y"], ["day", "year", "hour"])
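+
+    This produces feature columns like ``x_day``, ``x_year``, and ``x_hour``.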
     """
 
     def __init__(
         self,
         inputs: SelectionType,
-        components: Sequence[Literal["day", "week", "month", "year", "dow", "doy"]] = (
-            "dow",
-            "month",
-            "year",
-        ),
+        components: Sequence[
+            Literal[
+                "day",
+                "week",
+                "month",
+                "year",
+                "dow",
+                "doy",
+                "hour",
+                "minute",
+                "second",
+                "millisecond",
+            ]
+        ] = ("dow", "month", "year", "hour", "minute", "second"),
     ):
         self.inputs = selector(inputs)
         self.components = list(components)
@@ -202,6 +256,7 @@ def _repr(self) -> Iterable[tuple[str, Any]]:
 
     def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         columns = self.inputs.select_columns(table, metadata)
+
         if "month" in self.components:
             for col in columns:
                 metadata.set_categories(
@@ -235,10 +290,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
                         "Sunday",
                     ],
                 )
+
         self.columns_ = columns
 
     def transform_table(self, table: ir.Table) -> ir.Table:
         new_cols = []
+
         for name in self.columns_:
             col = table[name]
             for comp in self.components:
@@ -254,65 +311,7 @@ def transform_table(self, table: ir.Table) -> ir.Table:
                     feat = col.day_of_week.index()
                 elif comp == "doy":
                     feat = col.day_of_year()
-                new_cols.append(feat.name(f"{name}_{comp}"))
-        return table.mutate(new_cols)
-
-
-class ExpandTime(Step):
-    """A step for expanding time columns into one or more features.
-
-    New features will be named ``{input_column}_{component}``. For example, if
-    expanding an ``"hour"`` component from column ``"x"``, the feature column
-    would be named ``"x_hour"``.
-
-    Parameters
-    ----------
-    inputs
-        A selection of time columns to expand into new features.
-    components
-        A sequence of components to expand. Options include ``hour``,
-        ``minute``, ``second``, and ``millisecond``.
-
-        Defaults to ``["hour", "minute", "second"]``.
-
-    Examples
-    --------
-    >>> import ibis_ml as ml
-
-    Expand time columns using the default components
-
-    >>> step = ml.ExpandTime(ml.time())
-
-    Expand specific columns using specific components
-
-    >>> step = ml.ExpandTime(["x", "y"], ["hour", "minute"])
-    """
-
-    def __init__(
-        self,
-        inputs: SelectionType,
-        components: Sequence[Literal["hour", "minute", "second", "millisecond"]] = (
-            "hour",
-            "minute",
-            "second",
-        ),
-    ):
-        self.inputs = selector(inputs)
-        self.components = list(components)
-
-    def _repr(self) -> Iterable[tuple[str, Any]]:
-        yield ("", self.inputs)
-        yield ("components", self.components)
-
-    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
-        self.columns_ = self.inputs.select_columns(table, metadata)
-
-    def transform_table(self, table: ir.Table) -> ir.Table:
-        new_cols = []
-        for name in self.columns_:
-            col = table[name]
-            for comp in self.components:
-                if comp == "hour":
+                elif comp == "hour":
                     feat = col.hour()
                 elif comp == "minute":
                     feat = col.minute()
@@ -321,4 +320,5 @@ def transform_table(self, table: ir.Table) -> ir.Table:
                 elif comp == "millisecond":
                     feat = col.millisecond()
                 new_cols.append(feat.name(f"{name}_{comp}"))
+
         return table.mutate(new_cols)
diff --git a/ibis_ml/utils/_split.py b/ibis_ml/utils/_split.py
index 8b157f6..5290637 100644
--- a/ibis_ml/utils/_split.py
+++ b/ibis_ml/utils/_split.py
@@ -98,6 +98,7 @@ def train_test_split(
         }
     )
 
-    return table[table[train_flag]].drop([combined_key, train_flag]), table[
-        ~table[train_flag]
-    ].drop([combined_key, train_flag])
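+    # use the explicit .filter() API rather than boolean-mask indexing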
+    return (
+        table.filter(table[train_flag]).drop([combined_key, train_flag]),
+        table.filter(~table[train_flag]).drop([combined_key, train_flag]),
+    )
diff --git a/pyproject.toml b/pyproject.toml
index 09fc216..d15cd07 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ dynamic = ["version", "description"]
 dask = ["dask[dataframe]"]
 doc = [
   "clickhouse-connect",
-  "ibis-framework[bigquery,clickhouse,dask,datafusion,druid,duckdb,examples,exasol,flink,impala,mssql,mysql,oracle,pandas,polars,postgres,pyspark,risingwave,snowflake,sqlite,trino]",
+  "ibis-framework[bigquery,clickhouse,dask,datafusion,druid,duckdb,examples,exasol,flink,impala,mssql,mysql,oracle,pandas,polars,postgres,pyspark,risingwave,snowflake,sqlite,trino] >=9.1",
   "itables",
   "jupyter",
   "quartodoc", 
@@ -25,7 +25,7 @@ doc = [
   "torch",
   "xgboost",
 ]
-dev = ["ibis-framework[duckdb,examples]", "pytest", "pytest-cov", "scikit-learn", "skorch", "torch", "xgboost"]
+dev = ["ibis-framework[duckdb,examples]", "polars", "pytest", "pytest-cov", "scikit-learn", "skorch", "torch", "xgboost"]
 
 [project.urls]
 Home = "https://ibis-project.github.io/ibis-ml/"
diff --git a/tests/test_common.py b/tests/test_common.py
index 5ff5abd..0119d62 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -34,6 +34,7 @@ def test_mutate_at_expr():
     res = step.transform_table(t)
     sol = t.mutate(x=_.x.abs(), y=_.y.abs())
     assert res.equals(sol)
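+    # parameter names come back sorted, not in constructor order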
+    assert list(step._get_params()) == ["expr", "inputs", "named_exprs"]  # noqa: SLF001
 
 
 def test_mutate_at_named_exprs():
@@ -44,6 +45,7 @@ def test_mutate_at_named_exprs():
     res = step.transform_table(t)
     sol = t.mutate(x=_.x.abs(), y=_.y.abs(), x_log=_.x.log(), y_log=_.y.log())
     assert res.equals(sol)
+    assert list(step._get_params()) == ["expr", "inputs", "named_exprs"]  # noqa: SLF001
 
 
 def test_mutate():
@@ -54,3 +56,4 @@ def test_mutate():
     res = step.transform_table(t)
     sol = t.mutate(_.x.abs().name("x_abs"), y_log=lambda t: t.y.log())
     assert res.equals(sol)
+    assert list(step._get_params()) == ["exprs", "named_exprs"]  # noqa: SLF001
diff --git a/tests/test_core.py b/tests/test_core.py
index b16cf59..70f0f78 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,3 +1,5 @@
+from unittest.mock import patch
+
 import ibis
 import ibis.expr.types as ir
 import numpy as np
@@ -207,7 +209,8 @@ def test_can_use_in_sklearn_pipeline():
 
     # get/set params works
     params = p.get_params()
-    p.set_params(**params)
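+    # updates addressed as recipe__<step>__<param> must reach steps nested in the recipe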
+    p.set_params(**params | {"recipe__scalestandard__inputs": ml.numeric()})
+    assert p["recipe"].steps[1].inputs == ml.numeric()
 
     # fit and predict work
     p.fit(X, y)
@@ -365,6 +368,62 @@ def test_errors_nicely_if_not_fitted(table, method):
         getattr(r, method)(table)
 
 
+def test_get_params():
+    rec = ml.Recipe(ml.ExpandTimestamp(ml.timestamp()))
+
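+    # deep=True flattens nested step parameters into "<step>__<param>" keys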
+    assert "expandtimestamp__components" in rec.get_params(deep=True)
+    assert "expandtimestamp__components" not in rec.get_params(deep=False)
+
+
+def test_set_params():
+    rec = ml.Recipe(ml.ExpandTimestamp(ml.timestamp()))
+
+    # Nonexistent parameter in step
+    with pytest.raises(
+        ValueError,
+        match="Invalid parameter 'nonexistent_param' for step ExpandTimestamp",
+    ):
+        rec.set_params(expandtimestamp__nonexistent_param=True)
+
+    # Nonexistent step name on the recipe
+    with pytest.raises(
+        ValueError, match="Invalid parameter 'expanddatetime' for recipe Recipe"
+    ):
+        rec.set_params(expanddatetime__nonexistent_param=True)
+
+
+def test_set_params_passes_all_parameters():
+    # Make sure all parameters are passed together to the nested
+    # step's _set_params.
+    rec = ml.Recipe(ml.ExpandTimestamp(ml.timestamp()))
+    with patch.object(ml.ExpandTimestamp, "_set_params") as mock_set_params:
+        rec.set_params(
+            expandtimestamp__inputs=["x", "y"],
+            expandtimestamp__components=["day", "year", "hour"],
+        )
+
+    mock_set_params.assert_called_once_with(
+        inputs=["x", "y"], components=["day", "year", "hour"]
+    )
+
+
+def test_set_params_updates_valid_params():
+    # Check that set_params tries to set `replacement_mutateat.inputs`, not
+    # `original_mutateat.inputs`.
+    original_mutateat = ml.MutateAt("dep_time", ibis._.hour() * 60 + ibis._.minute())  # noqa: SLF001
+    rec = ml.Recipe(
+        original_mutateat,
+        ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()),  # noqa: SLF001
+    )
+    replacement_mutateat = ml.MutateAt("arr_time", ibis._.hour() * 60 + ibis._.minute())  # noqa: SLF001
+    rec.set_params(
+        **{"mutateat-1": replacement_mutateat, "mutateat-1__inputs": ml.cols("arrival")}
+    )
+    assert original_mutateat.inputs == ml.cols("dep_time")
+    assert replacement_mutateat.inputs == ml.cols("arrival")
+    assert rec.steps[0] is replacement_mutateat
+
+
 @pytest.mark.parametrize(
     ("step", "url"),
     [
diff --git a/tests/test_handle_outliers.py b/tests/test_handle_outliers.py
index 3a84cf5..0192a57 100644
--- a/tests/test_handle_outliers.py
+++ b/tests/test_handle_outliers.py
@@ -1,66 +1,109 @@
 import ibis
+import numpy as np
+import pandas as pd
+import pandas.testing as tm
 import pytest
 
 import ibis_ml as ml
 
 
 @pytest.mark.parametrize(
-    ("deviation_factor", "method", "treatment"),
+    ("deviation_factor", "method", "treatment", "cols", "test_table", "expected"),
     [
-        (2, "z-score", "capping"),
-        (2, "IQR", "capping"),
-        (3.0, "z-score", "trimming"),
-        (3.0, "IQR", "trimming"),
+        (
+            2,
+            "z-score",
+            "capping",
+            "int_col",
+            {"int_col": [None, 0, -1, 1]},
+            {"int_col": [None, 0, 0, 0]},
+        ),
+        (
+            2,
+            "IQR",
+            "capping",
+            "int_col",
+            {"int_col": [None, 0, -1, 1]},
+            {"int_col": [None, 0, 0, 0]},
+        ),
+        (
+            3.0,
+            "z-score",
+            "trimming",
+            "int_col",
+            {"int_col": [None, 0, -1, 1]},
+            {"int_col": [None, 0]},
+        ),
+        (
+            3.0,
+            "IQR",
+            "trimming",
+            "int_col",
+            {"int_col": [None, 0, -1, 1]},
+            {"int_col": [None, 0]},
+        ),
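+        # floating-point columns: NaNs, like NULLs, are never treated as outliers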
+        (
+            2,
+            "z-score",
+            "capping",
+            "floating_col",
+            {"floating_col": [None, 0, -1, 1, np.nan]},
+            {"floating_col": [None, 0.0, 0.0, 0.0, np.nan]},
+        ),
+        (
+            2,
+            "z-score",
+            "trimming",
+            "floating_col",
+            {"floating_col": [None, 0, -1, 1, np.nan]},
+            {"floating_col": [None, np.nan, 0.0]},
+        ),
+        (
+            2,
+            "z-score",
+            "trimming",
+            ["floating_col", "int_col"],
+            {
+                "floating_col": [None, 0, -1, 1, np.nan],
+                "int_col": [None, 0, 0, None, None],
+            },
+            {"floating_col": [None, np.nan, 0.0], "int_col": [None, None, 0]},
+        ),
+        (
+            2,
+            "z-score",
+            "capping",
+            ["floating_col", "int_col"],
+            {
+                "floating_col": [None, 0, -1, 1, np.nan],
+                "int_col": [None, 0, 0, None, None],
+            },
+            {
+                "floating_col": [None, 0, 0, 0, np.nan],
+                "int_col": [None, 0, 0, None, None],
+            },
+        ),
     ],
 )
-def test_handle_univariate_outliers(deviation_factor, method, treatment):
-    cols = {"col1": 0, "col2": 1}
+def test_handle_univariate_outliers(
+    deviation_factor, method, treatment, cols, test_table, expected
+):
     train_table = ibis.memtable(
         {
             # use the same value in every row so the statistics are trivial to compute
-            "col1": [cols["col1"]] * 10,  # mean = 0, std = 0
-            "col2": [cols["col2"]] * 10,  # Q1 = 1, Q3 = 1
+            "int_col": [0] * 10,  # mean = 0, std = 0, Q1 = 0, Q3 = 0
+            "floating_col": [0.0] * 10,  # mean = 0, std = 0, Q1 = 0, Q3 = 0
         }
     )
 
-    test_table = ibis.memtable(
-        {
-            "col1": [
-                None,  # keep
-                cols["col1"],  # keep
-                cols["col1"] - 1,  # outlier
-                cols["col1"] + 1,  # outlier
-                cols["col1"] + 1,  # outlier
-            ],
-            "col2": [
-                cols["col2"],  # keep
-                cols["col2"],  # keep
-                cols["col2"] - 1,  # outlier
-                cols["col2"] + 1,  # outlier
-                None,  # keep
-            ],
-        }
-    )
+    test_table = ibis.memtable(test_table)
     step = ml.HandleUnivariateOutliers(
-        ml.numeric(),
-        method=method,
-        deviation_factor=deviation_factor,
-        treatment=treatment,
+        cols, method=method, deviation_factor=deviation_factor, treatment=treatment
     )
     step.fit_table(train_table, ml.core.Metadata())
     assert step.is_fitted()
-    stats = step.stats_
-    res = step.transform_table(test_table)
 
-    if treatment == "trimming":
-        assert res.count().execute() == 2
-    elif treatment == "capping":
-        assert res.count().execute() == 5
+    result = step.transform_table(test_table)
+    expected = pd.DataFrame(expected)
 
-    for col_name, val in cols.items():
-        # check the boundary
-        assert stats[col_name]["lower_bound"] == val
-        assert stats[col_name]["upper_bound"] == val
-        # make sure there is no value beyond the boundary
-        assert res[col_name].max().execute() <= val
-        assert res[col_name].min().execute() >= val
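+    # check_dtype=False: backends may return different numeric dtypes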
+    tm.assert_frame_equal(result.execute(), expected, check_dtype=False)
diff --git a/tests/test_optional_dependencies.py b/tests/test_optional_dependencies.py
new file mode 100644
index 0000000..f0d4508
--- /dev/null
+++ b/tests/test_optional_dependencies.py
@@ -0,0 +1,15 @@
+import sys
+from importlib import import_module, reload
+from unittest.mock import patch
+
+import pytest
+
+
+# https://stackoverflow.com/a/65163627
+@pytest.mark.parametrize("optional_dependency", ["numpy", "pandas", "pyarrow"])
+def test_without_dependency(optional_dependency):
+    with patch.dict(sys.modules, {optional_dependency: None}):
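+        # a None entry in sys.modules makes "import <module>" raise ImportError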
+        if "ibis_ml" in sys.modules:
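+            # re-execute ibis_ml's top-level imports under the patched sys.modules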
+            reload(sys.modules["ibis_ml"])
+        else:
+            import_module("ibis_ml")
diff --git a/tests/test_temporal.py b/tests/test_temporal.py
index 141ab74..adf1504 100644
--- a/tests/test_temporal.py
+++ b/tests/test_temporal.py
@@ -36,9 +36,9 @@ def test_expand_time():
     assert res.equals(sol)
 
 
-def test_expand_datetime():
+def test_expand_timestamp():
     t = ibis.table({"y": "timestamp", "z": "int"})
-    step = ml.ExpandDateTime(
+    step = ml.ExpandTimestamp(
         ml.timestamp(),
         components=[
             "dow",