Skip to content

Commit

Permalink
Merge pull request #233 from BCG-Gamma/dev/2.0.1
Browse files Browse the repository at this point in the history
BUILD: release sklearndf 2.0.1
  • Loading branch information
j-ittner authored Sep 7, 2022
2 parents e769c73 + 8efe56f commit fe9670c
Show file tree
Hide file tree
Showing 19 changed files with 363 additions and 198 deletions.
13 changes: 13 additions & 0 deletions RELEASE_NOTES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ Release Notes
adds data frame support for clusterers along with additional API enhancements and
improvements, and is now subject to static type checking with |mypy|.

2.0.1
~~~~~

- API: upon declaration of new wrapper classes, automatically validate that their
associated native estimators are compatible with the wrapper class
- API: new public constants ``DROP`` and ``PASSTHROUGH`` in
:class:`.ColumnTransformerDF`
- FIX: base :class:`.LGBMClassifierDF` and :class:`.XGBClassifierDF` on the
correct wrapper class :class:`.ClassifierWrapperDF`
- FIX: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF`
- FIX: various minor tweaks and stability improvements


2.0.0
~~~~~

Expand Down
6 changes: 3 additions & 3 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ schedules:
displayName: Nightly full build
branches:
include:
- 1.2.x
- 2.0.x

resources:
repositories:
Expand Down Expand Up @@ -539,8 +539,8 @@ stages:
displayName: 'Publish to PyPi'
condition: eq(variables['source_is_release_branch'], 'True')
env:
FLIT_PASSWORD: $(pypi_pw)
FLIT_USERNAME: $(pypi_user)
FLIT_PASSWORD: $(pypi_sklearndf_uploads)
FLIT_USERNAME: __token__
- task: GitHubRelease@1
condition: >
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dist-name = "sklearndf"
license = "Apache Software License v2.0"

requires = [
"gamma-pytools ~=2.0,>=2.0.1",
"gamma-pytools ~=2.0,!=2.0.0",
"numpy >=1.21,<2a", # cannot use ~= due to conda bug
"packaging >=20",
"pandas >=1,<2a", # cannot use ~= due to conda bug
Expand Down Expand Up @@ -88,7 +88,7 @@ typing_inspect = "~=0.4.0"
[build.matrix.max]
# direct requirements of sklearndf
boruta = "~=0.3"
gamma-pytools = "~=2.0,>=2.0.1"
gamma-pytools = "~=2.0,!=2.0.0"
lightgbm = "~=3.3"
numpy = ">=1.23,<2a" # cannot use ~= due to conda bug
packaging = ">=20"
Expand Down
2 changes: 1 addition & 1 deletion src/sklearndf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
from ._sklearn_version import *
from ._sklearndf import *

__version__ = "2.0.0"
__version__ = "2.0.1"
33 changes: 15 additions & 18 deletions src/sklearndf/classification/extra/_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
"""
import logging

from sklearn.base import ClassifierMixin

from pytools.api import AllTracker

from ...wrapper import MissingEstimator, RegressorWrapperDF
from ...wrapper import ClassifierWrapperDF, MissingEstimator

log = logging.getLogger(__name__)

Expand All @@ -18,6 +20,7 @@

class LGBMClassifier( # type: ignore
MissingEstimator,
ClassifierMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand All @@ -29,6 +32,7 @@ class LGBMClassifier( # type: ignore

class XGBClassifier( # type: ignore
MissingEstimator,
ClassifierMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand All @@ -44,27 +48,20 @@ class XGBClassifier( # type: ignore
# Class definitions
#

if LGBMClassifier:

class LGBMClassifierDF(
RegressorWrapperDF[LGBMClassifier],
native=LGBMClassifier,
):
"""Stub for DF wrapper of class ``LGBMClassifierDF``"""

else:
__all__.remove("LGBMClassifierDF")
class LGBMClassifierDF(
    ClassifierWrapperDF[LGBMClassifier],
    native=LGBMClassifier,
):
    """Stub for DF wrapper of class ``LGBMClassifier``"""

if XGBClassifier:

class XGBClassifierDF(
RegressorWrapperDF[XGBClassifier],
native=XGBClassifier,
):
"""Stub for DF wrapper of class ``XGBClassifierDF``"""
class XGBClassifierDF(
    ClassifierWrapperDF[XGBClassifier],
    native=XGBClassifier,
):
    """Stub for DF wrapper of class ``XGBClassifier``"""

else:
__all__.remove("XGBClassifierDF")

#
# validate that __all__
Expand Down
7 changes: 2 additions & 5 deletions src/sklearndf/clustering/_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
"DBSCANDF",
"FeatureAgglomerationDF",
"KMeansDF",
"MiniBatchKMeansDF",
"MeanShiftDF",
"MiniBatchKMeansDF",
"OPTICSDF",
"SpectralClusteringDF",
]
Expand Down Expand Up @@ -77,10 +77,7 @@ class KMeansDF(KMeansBaseWrapperDF[KMeans], native=KMeans):
"""Stub for DF wrapper of class ``KMeans``"""


class MiniBatchKMeansDF(
KMeansBaseWrapperDF[MiniBatchKMeans],
native=MiniBatchKMeans,
):
class MiniBatchKMeansDF(
    KMeansBaseWrapperDF[MiniBatchKMeans],
    native=MiniBatchKMeans,
):
    """Stub for DF wrapper of class ``MiniBatchKMeans``"""


Expand Down
12 changes: 7 additions & 5 deletions src/sklearndf/pipeline/wrapper/_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class PipelineWrapperDF(
DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.Pipeline`.
"""

__native_base_class__ = Pipeline

#: Placeholder that can be used in place of an estimator to designate a pipeline
#: step that preserves the original ingoing data.
PASSTHROUGH = "passthrough"
Expand All @@ -66,8 +68,8 @@ def _validate_delegate_estimator(self) -> None:
or isinstance(transformer, TransformerDF)
):
raise ValueError(
f'expected step "{name}" to contain a '
f"{TransformerDF.__name__}, but found an instance of "
f"expected step {name!r} to be a {TransformerDF.__name__}, "
f"or {PipelineWrapperDF.PASSTHROUGH}, but found an instance of "
f"{type(transformer).__name__}"
)

Expand All @@ -78,9 +80,9 @@ def _validate_delegate_estimator(self) -> None:
or isinstance(final_estimator, EstimatorDF)
):
raise ValueError(
f'expected final step "{final_step[0]}" to contain a '
f"{EstimatorDF.__name__}, but found an instance of "
f"{type(final_estimator).__name__}"
f"expected final step {final_step[0]!r} to be an "
f"{EstimatorDF.__name__} or {PipelineWrapperDF.PASSTHROUGH}, "
f"but found an instance of {type(final_estimator).__name__}"
)

@property
Expand Down
13 changes: 4 additions & 9 deletions src/sklearndf/regression/extra/_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,13 @@
Core implementation of :mod:`sklearndf.regression.extra`
"""
import logging
import warnings

from sklearn.base import RegressorMixin

from pytools.api import AllTracker

from ...wrapper import MissingEstimator, RegressorWrapperDF

# since we install LGBM via conda, the warning about the Clang compiler is irrelevant
warnings.filterwarnings("ignore", message=r"Starting from version 2\.2\.1")
# cross-validation will invariably generate sliced subsets, so the following warning
# is not helpful
warnings.filterwarnings(
"ignore", message=r"Usage of np\.ndarray subset \(sliced data\) is not recommended"
)

log = logging.getLogger(__name__)

__all__ = ["LGBMRegressorDF", "XGBRegressorDF"]
Expand All @@ -28,6 +21,7 @@

class LGBMRegressor( # type: ignore
MissingEstimator,
RegressorMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand All @@ -40,6 +34,7 @@ class LGBMRegressor( # type: ignore

class XGBRegressor( # type: ignore
MissingEstimator,
RegressorMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand Down
3 changes: 3 additions & 0 deletions src/sklearndf/transformation/extra/_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import logging

from sklearn.base import TransformerMixin

from pytools.api import AllTracker

from ...wrapper import MissingEstimator
Expand All @@ -21,6 +23,7 @@

class BorutaPy( # type: ignore
MissingEstimator,
TransformerMixin, # type: ignore
):
"""Mock-up for missing estimator."""

Expand Down
78 changes: 42 additions & 36 deletions src/sklearndf/transformation/wrapper/_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Core implementation of :mod:`sklearndf.transformation.wrapper`
"""

import itertools
import logging
from abc import ABCMeta, abstractmethod
from typing import (
Expand Down Expand Up @@ -64,7 +64,7 @@
# onwards.
# Once we drop support for sklearn 0.21, _BaseImputer can be used instead.
# The following TypeVar helps to annotate availability of "add_indicator" and
# "missing_values" attributes on an imputer instance for ImputerWrapperDF below
# "missing_values" attributes on an imputer instance for ImputerWrapperDF below.

# noinspection PyProtectedMember
from sklearn.impute._iterative import IterativeImputer
Expand Down Expand Up @@ -268,10 +268,15 @@ class ColumnTransformerWrapperDF(
:class:`.TransformerDF`.
"""

__DROP = "drop"
__PASSTHROUGH = "passthrough"
#: Special transformer argument for use with parameters ``transformers`` and
#: ``remainder``.
DROP = "drop"

#: Special transformer argument for use with parameters ``transformers`` and
#: ``remainder``.
PASSTHROUGH = "passthrough"

__SPECIAL_TRANSFORMERS = (__DROP, __PASSTHROUGH)
__SPECIAL_TRANSFORMERS = (DROP, PASSTHROUGH)

def _validate_delegate_estimator(self) -> None:
column_transformer: ColumnTransformer = self.native_estimator
Expand Down Expand Up @@ -326,7 +331,7 @@ def _features_original(
input_column_names: npt.NDArray[Any]
output_column_names: npt.NDArray[Any]

if df_transformer == ColumnTransformerWrapperDF.__PASSTHROUGH:
if df_transformer == ColumnTransformerWrapperDF.PASSTHROUGH:
# we may get positional indices for columns selected by the
# 'passthrough' transformer, and in that case we need to look up the
# associated column names
Expand Down Expand Up @@ -368,7 +373,7 @@ def _features_original(
)
if (
len(columns) > 0
and df_transformer != ColumnTransformerWrapperDF.__DROP
and df_transformer != ColumnTransformerWrapperDF.DROP
)
]
)
Expand Down Expand Up @@ -499,41 +504,42 @@ def _validate_delegate_estimator(self) -> None:

def _get_features_original(self) -> pd.Series:
# Return the series mapping output column names to original column names.
#
# Remove 1st category column if argument drop == 'first'
# Remove 1st category column only of binary features if arg drop == 'if_binary'

native_estimator: OneHotEncoder = self.native_estimator
feature_names_in: pd.Index = self.feature_names_in_

feature_names_out = _get_native_feature_names_out(
feature_names_out: pd.Index = _get_native_feature_names_out(
feature_names_in_=feature_names_in, native_estimator=native_estimator
)

if self.drop == "first":
feature_names_in_mapped = [
column_original
for column_original, category in zip(
feature_names_in, native_estimator.categories_
)
for _ in range(len(category) - 1)
]
elif self.drop == "if_binary":
feature_names_in_mapped = [
column_original
for column_original, category in zip(
feature_names_in, native_estimator.categories_
)
for _ in (range(1) if len(category) == 2 else category)
]
else:
feature_names_in_mapped = [
column_original
for column_original, category in zip(
feature_names_in, native_estimator.categories_
)
for _ in category
]
def _adjust_n_features_in(n: npt.NDArray[np.int_]) -> None:
drop = self.drop
if drop is None:
return
elif isinstance(drop, str):
if drop == "first":
# drop one category for all features
n -= 1
return
elif drop == "if_binary":
# drop one category only for binary features
n[n == 2] = 1
return
elif isinstance(drop, (Sequence, np.ndarray)):
# drop is an array-like
n -= 1
return

raise ValueError(f"unexpected value for arg drop: {drop!r}")

n_features_in: npt.NDArray[np.int_] = np.array(
[len(categories) for categories in native_estimator.categories_],
dtype=np.int_,
)
_adjust_n_features_in(n_features_in)

feature_names_in_mapped = itertools.chain(
*([feature] * n for feature, n in zip(feature_names_in, n_features_in))
)

return pd.Series(index=feature_names_out, data=feature_names_in_mapped)

Expand Down
Loading

0 comments on commit fe9670c

Please sign in to comment.