Skip to content

Commit

Permalink
Merge pull request #233 from BCG-Gamma/dev/2.0.1
Browse files Browse the repository at this point in the history
BUILD: release sklearndf 2.0.1
  • Loading branch information
j-ittner authored Sep 7, 2022
2 parents e769c73 + 8efe56f commit fe9670c
Show file tree
Hide file tree
Showing 19 changed files with 363 additions and 198 deletions.
13 changes: 13 additions & 0 deletions RELEASE_NOTES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ Release Notes
adds data frame support for clusterers along with additional API enhancements and
improvements, and is now subject to static type checking with |mypy|.

2.0.1
~~~~~

- API: upon declaration of new wrapper classes, automatically validate that their
associated native estimators are compatible with the wrapper class
- API: new public constants ``DROP`` and ``PASSTHROUGH`` in
:class:`.ColumnTransformerDF`
- FIX: base :class:`.LGBMClassifierDF` and :class:`.XGBClassifierDF` on the
correct wrapper class :class:`.ClassifierWrapperDF`
- FIX: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF`
- FIX: various minor tweaks and stability improvements


2.0.0
~~~~~

Expand Down
6 changes: 3 additions & 3 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ schedules:
displayName: Nightly full build
branches:
include:
- 1.2.x
- 2.0.x

resources:
repositories:
Expand Down Expand Up @@ -539,8 +539,8 @@ stages:
displayName: 'Publish to PyPi'
condition: eq(variables['source_is_release_branch'], 'True')
env:
FLIT_PASSWORD: $(pypi_pw)
FLIT_USERNAME: $(pypi_user)
FLIT_PASSWORD: $(pypi_sklearndf_uploads)
FLIT_USERNAME: __token__
- task: GitHubRelease@1
condition: >
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dist-name = "sklearndf"
license = "Apache Software License v2.0"

requires = [
"gamma-pytools ~=2.0,>=2.0.1",
"gamma-pytools ~=2.0,!=2.0.0",
"numpy >=1.21,<2a", # cannot use ~= due to conda bug
"packaging >=20",
"pandas >=1,<2a", # cannot use ~= due to conda bug
Expand Down Expand Up @@ -88,7 +88,7 @@ typing_inspect = "~=0.4.0"
[build.matrix.max]
# direct requirements of sklearndf
boruta = "~=0.3"
gamma-pytools = "~=2.0,>=2.0.1"
gamma-pytools = "~=2.0,!=2.0.0"
lightgbm = "~=3.3"
numpy = ">=1.23,<2a" # cannot use ~= due to conda bug
packaging = ">=20"
Expand Down
2 changes: 1 addition & 1 deletion src/sklearndf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
from ._sklearn_version import *
from ._sklearndf import *

__version__ = "2.0.0"
__version__ = "2.0.1"
33 changes: 15 additions & 18 deletions src/sklearndf/classification/extra/_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
"""
import logging

from sklearn.base import ClassifierMixin

from pytools.api import AllTracker

from ...wrapper import MissingEstimator, RegressorWrapperDF
from ...wrapper import ClassifierWrapperDF, MissingEstimator

log = logging.getLogger(__name__)

Expand All @@ -18,6 +20,7 @@

class LGBMClassifier( # type: ignore
MissingEstimator,
ClassifierMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand All @@ -29,6 +32,7 @@ class LGBMClassifier( # type: ignore

class XGBClassifier( # type: ignore
MissingEstimator,
ClassifierMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand All @@ -44,27 +48,20 @@ class XGBClassifier( # type: ignore
# Class definitions
#

if LGBMClassifier:

class LGBMClassifierDF(
RegressorWrapperDF[LGBMClassifier],
native=LGBMClassifier,
):
"""Stub for DF wrapper of class ``LGBMClassifierDF``"""

else:
__all__.remove("LGBMClassifierDF")
class LGBMClassifierDF(
    ClassifierWrapperDF[LGBMClassifier],
    native=LGBMClassifier,
):
    """Stub for DF wrapper of class ``LGBMClassifier``"""

if XGBClassifier:

class XGBClassifierDF(
RegressorWrapperDF[XGBClassifier],
native=XGBClassifier,
):
"""Stub for DF wrapper of class ``XGBClassifierDF``"""
class XGBClassifierDF(
    ClassifierWrapperDF[XGBClassifier],
    native=XGBClassifier,
):
    """Stub for DF wrapper of class ``XGBClassifier``"""

else:
__all__.remove("XGBClassifierDF")

#
# validate that __all__
Expand Down
7 changes: 2 additions & 5 deletions src/sklearndf/clustering/_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
"DBSCANDF",
"FeatureAgglomerationDF",
"KMeansDF",
"MiniBatchKMeansDF",
"MeanShiftDF",
"MiniBatchKMeansDF",
"OPTICSDF",
"SpectralClusteringDF",
]
Expand Down Expand Up @@ -77,10 +77,7 @@ class KMeansDF(KMeansBaseWrapperDF[KMeans], native=KMeans):
"""Stub for DF wrapper of class ``KMeans``"""


class MiniBatchKMeansDF(
KMeansBaseWrapperDF[MiniBatchKMeans],
native=MiniBatchKMeans,
):
class MiniBatchKMeansDF(
    KMeansBaseWrapperDF[MiniBatchKMeans],
    native=MiniBatchKMeans,
):
    """Stub for DF wrapper of class ``MiniBatchKMeans``"""


Expand Down
12 changes: 7 additions & 5 deletions src/sklearndf/pipeline/wrapper/_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class PipelineWrapperDF(
DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.Pipeline`.
"""

__native_base_class__ = Pipeline

#: Placeholder that can be used in place of an estimator to designate a pipeline
#: step that preserves the original ingoing data.
PASSTHROUGH = "passthrough"
Expand All @@ -66,8 +68,8 @@ def _validate_delegate_estimator(self) -> None:
or isinstance(transformer, TransformerDF)
):
raise ValueError(
f'expected step "{name}" to contain a '
f"{TransformerDF.__name__}, but found an instance of "
f"expected step {name!r} to be a {TransformerDF.__name__}, "
f"or {PipelineWrapperDF.PASSTHROUGH}, but found an instance of "
f"{type(transformer).__name__}"
)

Expand All @@ -78,9 +80,9 @@ def _validate_delegate_estimator(self) -> None:
or isinstance(final_estimator, EstimatorDF)
):
raise ValueError(
f'expected final step "{final_step[0]}" to contain a '
f"{EstimatorDF.__name__}, but found an instance of "
f"{type(final_estimator).__name__}"
f"expected final step {final_step[0]!r} to be an "
f"{EstimatorDF.__name__} or {PipelineWrapperDF.PASSTHROUGH}, "
f"but found an instance of {type(final_estimator).__name__}"
)

@property
Expand Down
13 changes: 4 additions & 9 deletions src/sklearndf/regression/extra/_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,13 @@
Core implementation of :mod:`sklearndf.regression.extra`
"""
import logging
import warnings

from sklearn.base import RegressorMixin

from pytools.api import AllTracker

from ...wrapper import MissingEstimator, RegressorWrapperDF

# since we install LGBM via conda, the warning about the Clang compiler is irrelevant
warnings.filterwarnings("ignore", message=r"Starting from version 2\.2\.1")
# cross-validation will invariably generate sliced subsets, so the following warning
# is not helpful
warnings.filterwarnings(
"ignore", message=r"Usage of np\.ndarray subset \(sliced data\) is not recommended"
)

log = logging.getLogger(__name__)

__all__ = ["LGBMRegressorDF", "XGBRegressorDF"]
Expand All @@ -28,6 +21,7 @@

class LGBMRegressor( # type: ignore
MissingEstimator,
RegressorMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand All @@ -40,6 +34,7 @@ class LGBMRegressor( # type: ignore

class XGBRegressor( # type: ignore
MissingEstimator,
RegressorMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand Down
3 changes: 3 additions & 0 deletions src/sklearndf/transformation/extra/_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import logging

from sklearn.base import TransformerMixin

from pytools.api import AllTracker

from ...wrapper import MissingEstimator
Expand All @@ -21,6 +23,7 @@

class BorutaPy( # type: ignore
MissingEstimator,
TransformerMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand Down
78 changes: 42 additions & 36 deletions src/sklearndf/transformation/wrapper/_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Core implementation of :mod:`sklearndf.transformation.wrapper`
"""

import itertools
import logging
from abc import ABCMeta, abstractmethod
from typing import (
Expand Down Expand Up @@ -64,7 +64,7 @@
# onwards.
# Once we drop support for sklearn 0.21, _BaseImputer can be used instead.
# The following TypeVar helps to annotate availability of "add_indicator" and
# "missing_values" attributes on an imputer instance for ImputerWrapperDF below
# "missing_values" attributes on an imputer instance for ImputerWrapperDF below.

# noinspection PyProtectedMember
from sklearn.impute._iterative import IterativeImputer
Expand Down Expand Up @@ -268,10 +268,15 @@ class ColumnTransformerWrapperDF(
:class:`.TransformerDF`.
"""

__DROP = "drop"
__PASSTHROUGH = "passthrough"
#: Special transformer argument for use with parameters ``transformers`` and
#: ``remainder``.
DROP = "drop"

#: Special transformer argument for use with parameters ``transformers`` and
#: ``remainder``.
PASSTHROUGH = "passthrough"

__SPECIAL_TRANSFORMERS = (__DROP, __PASSTHROUGH)
__SPECIAL_TRANSFORMERS = (DROP, PASSTHROUGH)

def _validate_delegate_estimator(self) -> None:
column_transformer: ColumnTransformer = self.native_estimator
Expand Down Expand Up @@ -326,7 +331,7 @@ def _features_original(
input_column_names: npt.NDArray[Any]
output_column_names: npt.NDArray[Any]

if df_transformer == ColumnTransformerWrapperDF.__PASSTHROUGH:
if df_transformer == ColumnTransformerWrapperDF.PASSTHROUGH:
# we may get positional indices for columns selected by the
# 'passthrough' transformer, and in that case we need to look up the
# associated column names
Expand Down Expand Up @@ -368,7 +373,7 @@ def _features_original(
)
if (
len(columns) > 0
and df_transformer != ColumnTransformerWrapperDF.__DROP
and df_transformer != ColumnTransformerWrapperDF.DROP
)
]
)
Expand Down Expand Up @@ -499,41 +504,42 @@ def _validate_delegate_estimator(self) -> None:

def _get_features_original(self) -> pd.Series:
# Return the series mapping output column names to original column names.
#
# Remove 1st category column if argument drop == 'first'
# Remove 1st category column only of binary features if arg drop == 'if_binary'

native_estimator: OneHotEncoder = self.native_estimator
feature_names_in: pd.Index = self.feature_names_in_

feature_names_out = _get_native_feature_names_out(
feature_names_out: pd.Index = _get_native_feature_names_out(
feature_names_in_=feature_names_in, native_estimator=native_estimator
)

if self.drop == "first":
feature_names_in_mapped = [
column_original
for column_original, category in zip(
feature_names_in, native_estimator.categories_
)
for _ in range(len(category) - 1)
]
elif self.drop == "if_binary":
feature_names_in_mapped = [
column_original
for column_original, category in zip(
feature_names_in, native_estimator.categories_
)
for _ in (range(1) if len(category) == 2 else category)
]
else:
feature_names_in_mapped = [
column_original
for column_original, category in zip(
feature_names_in, native_estimator.categories_
)
for _ in category
]
def _adjust_n_features_in(n: npt.NDArray[np.int_]) -> None:
drop = self.drop
if drop is None:
return
elif isinstance(drop, str):
if drop == "first":
# drop one category for all features
n -= 1
return
elif drop == "if_binary":
# drop one category only for binary features
n[n == 2] = 1
return
elif isinstance(drop, (Sequence, np.ndarray)):
# drop is an array-like
n -= 1
return

raise ValueError(f"unexpected value for arg drop: {drop!r}")

n_features_in: npt.NDArray[np.int_] = np.array(
[len(categories) for categories in native_estimator.categories_],
dtype=np.int_,
)
_adjust_n_features_in(n_features_in)

feature_names_in_mapped = itertools.chain(
*([feature] * n for feature, n in zip(feature_names_in, n_features_in))
)

return pd.Series(index=feature_names_out, data=feature_names_in_mapped)

Expand Down
Loading

0 comments on commit fe9670c

Please sign in to comment.