Skip to content

Commit

Permalink
feat: Add MinMaxScaler. (#16)
Browse files Browse the repository at this point in the history
* feat: Add MinMaxScaler.

* style: fix docstring format.
  • Loading branch information
toohsk authored Jan 12, 2024
1 parent 1055c37 commit 083496d
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 2 deletions.
3 changes: 2 additions & 1 deletion ibisml/steps/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ibisml.steps.common import Cast, Drop, MutateAt, Mutate
from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
from ibisml.steps.standardize import ScaleStandard
from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
from ibisml.steps.encode import OneHotEncode, CategoricalEncode
from ibisml.steps.temporal import ExpandDate, ExpandTime

Expand All @@ -15,6 +15,7 @@
"ImputeMedian",
"ImputeMode",
"ScaleStandard",
"ScaleMinMax",
"OneHotEncode",
"CategoricalEncode",
"ExpandDate",
Expand Down
50 changes: 50 additions & 0 deletions ibisml/steps/standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,56 @@
import ibis.expr.types as ir


class ScaleMinMax(Step):
    """A step that rescales selected numeric columns by their min and max.

    During ``fit`` the maximum and minimum of every selected column are
    computed from the training table and handed to the matching
    ``ml.transforms.ScaleMinMax`` transform.

    Parameters
    ----------
    inputs
        A selection of columns to normalize. All columns must be numeric.

    Examples
    --------
    >>> import ibisml as ml

    Normalize all numeric columns.

    >>> step = ml.ScaleMinMax(ml.numeric())

    Normalize a specific set of columns.

    >>> step = ml.ScaleMinMax(["x", "y"])
    """

    def __init__(self, inputs: SelectionType):
        self.inputs = selector(inputs)

    def _repr(self) -> Iterable[tuple[str, Any]]:
        yield ("", self.inputs)

    def fit(self, table: ir.Table, metadata: Metadata) -> Transform:
        """Compute per-column ``(max, min)`` statistics from ``table``.

        Raises
        ------
        ValueError
            If any selected column is not an ``ir.NumericColumn``.
        """
        selected = self.inputs.select_columns(table, metadata)

        # Nothing selected: skip the aggregation query entirely.
        if not selected:
            return ml.transforms.ScaleMinMax({})

        exprs = []
        for col_name in selected:
            column = table[col_name]
            if not isinstance(column, ir.NumericColumn):
                raise ValueError(
                    f"Cannot be normalized {col_name!r} - this column is not numeric"
                )

            exprs.extend(
                (
                    column.max().name(f"{col_name}_max"),
                    column.min().name(f"{col_name}_min"),
                )
            )

        # All statistics are gathered in a single aggregate; the result has
        # exactly one row, keyed by the "<column>_max" / "<column>_min" names.
        row = table.aggregate(exprs).execute().to_dict("records")[0]
        stats = {
            col_name: (row[f"{col_name}_max"], row[f"{col_name}_min"])
            for col_name in selected
        }
        return ml.transforms.ScaleMinMax(stats)


class ScaleStandard(Step):
"""A step for normalizing select numeric columns to have a standard
deviation of one and a mean of zero.
Expand Down
3 changes: 2 additions & 1 deletion ibisml/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ibisml.transforms.common import Cast, Drop, MutateAt, Mutate
from ibisml.transforms.impute import FillNA
from ibisml.transforms.standardize import ScaleStandard
from ibisml.transforms.standardize import ScaleMinMax, ScaleStandard
from ibisml.transforms.encode import OneHotEncode, CategoricalEncode
from ibisml.transforms.temporal import ExpandDate, ExpandTime

Expand All @@ -10,6 +10,7 @@
"MutateAt",
"Mutate",
"FillNA",
"ScaleMinMax",
"ScaleStandard",
"OneHotEncode",
"CategoricalEncode",
Expand Down
17 changes: 17 additions & 0 deletions ibisml/transforms/standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,23 @@
import ibis.expr.types as ir


class ScaleMinMax(Transform):
    """Rescale columns with precomputed per-column max/min statistics.

    Each column ``c`` in ``stats`` is replaced by
    ``(c - min) / (max - min)``.

    Parameters
    ----------
    stats
        Mapping of column name to a ``(max, min)`` tuple. Note the order:
        the maximum comes first, then the minimum.
    """

    def __init__(self, stats: dict[str, tuple[float, float]]):
        self.stats = stats

    @property
    def input_columns(self) -> list[str]:
        """Names of the columns this transform rescales."""
        return list(self.stats)

    def transform(self, table: ir.Table) -> ir.Table:
        """Return ``table`` with every column in ``stats`` rescaled.

        A column whose fitted max equals its min (a constant column) has a
        zero span; it is divided by 1 instead, so its fitted value maps to 0
        rather than triggering a division by zero.
        """
        scaled = []
        for name, (col_max, col_min) in self.stats.items():
            span = col_max - col_min
            # Guard against a zero denominator for constant columns; the
            # outcome of dividing by zero differs between backends.
            if span == 0:
                span = 1
            scaled.append(
                ((table[name] - col_min) / span).name(name)  # type: ignore
            )
        return table.mutate(scaled)


class ScaleStandard(Transform):
def __init__(self, stats: dict[str, tuple[float, float]]):
self.stats = stats
Expand Down
2 changes: 2 additions & 0 deletions tests/test_standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
@pytest.mark.parametrize(
"step, sol",
[
(ml.ScaleMinMax(ml.numeric()), "ScaleMinMax(numeric())"),
(ml.ScaleStandard(ml.numeric()), "ScaleStandard(numeric())"),
],
)
Expand All @@ -16,6 +17,7 @@ def test_step_repr(step, sol):
@pytest.mark.parametrize(
"transform, sol",
[
(ml.transforms.ScaleMinMax({"x": (0.5, 0.2)}), "ScaleMinMax<x>"),
(ml.transforms.ScaleStandard({"x": (0.5, 0.2)}), "ScaleStandard<x>"),
],
)
Expand Down

0 comments on commit 083496d

Please sign in to comment.