Merge branch 'ibis-project:main' into main

jitingxu1 · Sep 17, 2024 · cddef7c
2 parents 1739bbd + 6dce35e
commit cddef7c
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 280 deletions.
diff --git a/docs/tutorial/pytorch.qmd b/docs/tutorial/pytorch.qmd
@@ -102,7 +102,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -122,44 +122,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),

diff --git a/docs/tutorial/scikit-learn.qmd b/docs/tutorial/scikit-learn.qmd
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),

diff --git a/docs/tutorial/xgboost.qmd b/docs/tutorial/xgboost.qmd
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),