Skip to content

Commit

Permalink
Merge pull request #257 from BCG-Gamma/dev/2.2.1
Browse files Browse the repository at this point in the history
BUILD: Release sklearndf 2.2.1
  • Loading branch information
j-ittner authored May 26, 2023
2 parents 020183a + 61e8ca7 commit 8335d8c
Show file tree
Hide file tree
Showing 9 changed files with 173 additions and 55 deletions.
9 changes: 9 additions & 0 deletions RELEASE_NOTES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,21 @@ Release Notes
.. |nbsp| unicode:: 0xA0
:trim:


*sklearndf* 2.2
---------------

*sklearndf* 2.2 adds support for
`scikit-learn 1.2 <https://scikit-learn.org/1.2>`_, and enhances the EstimatorDF
API.


2.2.1
~~~~~

- VIZ: use *scikit-learn*'s native HTML representation of estimators, if available


2.2.0
~~~~~

Expand Down Expand Up @@ -48,6 +56,7 @@ support of sparse output (see below).
*sklearndf* 2.1 adds support for
`scikit-learn 1.1 <https://scikit-learn.org/1.1>`_.


2.1.1
~~~~~

Expand Down
34 changes: 29 additions & 5 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ schedules:
displayName: Nightly full build
branches:
include:
- 2.1.x
- 2.2.x

resources:
repositories:
Expand Down Expand Up @@ -243,7 +243,15 @@ stages:
- script: dir $(Build.SourcesDirectory)

- script: |
conda install -y -c anaconda conda-build~=3.21 conda-verify~=3.4 toml~=0.10 flit~=3.6 packaging~=20.9
# install micromamba
curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba
export MAMBA_ROOT_PREFIX=~/micromamba
eval "$(./bin/micromamba shell hook -s posix)"
# create and activate a build environment, then install the tools we need
micromamba create -n build
micromamba activate build
micromamba install -y -c conda-forge boa~=0.14 toml~=0.10 flit~=3.6 packaging~=20.9
displayName: 'Install conda-build, flit, toml'
condition: eq(variables['BUILD_SYSTEM'], 'conda')
Expand All @@ -261,7 +269,11 @@ stages:
targetType: 'inline'
script: |
set -eux
if [ "$BUILD_SYSTEM" = "conda" ] ; then eval "$(conda shell.bash hook)" ; fi
if [ "$BUILD_SYSTEM" = "conda" ] ; then
export MAMBA_ROOT_PREFIX=~/micromamba
eval "$(./bin/micromamba shell hook -s posix)"
micromamba activate build
fi
export RUN_PACKAGE_VERSION_TEST=$(project_name)
cd $(Build.SourcesDirectory)/$(project_root)
Expand Down Expand Up @@ -330,7 +342,15 @@ stages:
- script: dir $(Build.SourcesDirectory)

- script: |
conda install -y -c anaconda conda-build~=3.21 conda-verify~=3.4 toml~=0.10 flit~=3.6 packaging~=20.9
# install micromamba
curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba
export MAMBA_ROOT_PREFIX=~/micromamba
eval "$(./bin/micromamba shell hook -s posix)"
# create and activate a build environment, then install the tools we need
micromamba create -n build
micromamba activate build
micromamba install -y -c conda-forge boa~=0.14 toml~=0.10 flit~=3.6 packaging~=20.9
displayName: 'Install conda-build, flit, toml'
condition: eq(variables['BUILD_SYSTEM'], 'conda')
Expand All @@ -348,7 +368,11 @@ stages:
targetType: 'inline'
script: |
set -eux
if [ "$BUILD_SYSTEM" = "conda" ] ; then eval "$(conda shell.bash hook)" ; fi
if [ "$BUILD_SYSTEM" = "conda" ] ; then
export MAMBA_ROOT_PREFIX=~/micromamba
eval "$(./bin/micromamba shell hook -s posix)"
micromamba activate build
fi
export RUN_PACKAGE_VERSION_TEST=$(project_name)
cd $(Build.SourcesDirectory)/$(project_root)
Expand Down
57 changes: 23 additions & 34 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,42 +4,31 @@ channels:
- bcg_gamma
dependencies:
# run
- gamma-pytools ~= 2.1
- joblib ~= 1.2
- lightgbm ~= 3.3
- matplotlib ~= 3.6
- numpy ~= 1.24
- pandas ~=1.5|~=2.0
- python ~= 3.9
- scikit-learn ~= 1.2.2
- scipy ~= 1.10.0
- xgboost ~= 1.7
- pip ~= 23.0.1
- boruta_py ~= 0.3
- gamma-pytools ~= 2.1
- joblib ~= 1.2
- lightgbm ~= 3.3
- matplotlib ~= 3.7
- numpy ~= 1.24
- pandas ~= 2.0
- pip ~= 23.0
- python ~= 3.9
- scikit-learn ~= 1.2.0
- scipy ~= 1.10
- xgboost ~= 1.7
- pip:
- arfs ~= 1.0.5
- shap ~= 0.41.0
# build/test
- conda-build ~= 3.23.3
- conda-verify ~= 3.1.1
- docutils ~= 0.17.1
- flit ~= 3.8.0
- jinja2 ~= 2.11.3
- markupsafe ~= 2.0.1 # markupsafe 2.1 breaks support for jinja2
- m2r ~= 0.3.1
- pluggy ~= 0.13.1
- pre-commit ~= 2.21.0
- pytest ~= 7.2.2
- arfs ~= 1.1
# test
- pytest ~= 7.2.1
- pytest-cov ~= 2.12.1
- pyyaml ~= 5.4.1
- toml ~= 0.10.2
- tox ~= 3.27.1
- yaml ~= 0.2.5
# sphinx
- nbsphinx ~= 0.8.12
- sphinx ~= 4.5.0
- nbsphinx ~= 0.8.9
- sphinx ~= 4.5.0
- sphinx-autodoc-typehints ~= 1.19.2
- pydata-sphinx-theme ~= 0.8.1
- pydata-sphinx-theme ~= 0.8.1
# notebooks
- jupyterlab ~= 3.5.3
- openpyxl ~= 3.0.10
- seaborn ~= 0.12.2
- ipywidgets ~= 8.0
- jupyterlab ~= 3.5
- openpyxl ~= 3.0
- seaborn ~= 0.12
- tableone ~= 0.7
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ typing_inspect = "~=0.4.0"

[build.matrix.max]
# direct requirements of sklearndf
arfs = "~=1.0.5"
arfs = "~=1.1"
gamma-pytools = "~=2.1"
lightgbm = "~=3.3"
numpy = ">=1.23" # cannot use ~= due to conda bug
Expand Down
2 changes: 1 addition & 1 deletion src/sklearndf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
from ._sklearn_version import *
from ._sklearndf import *

__version__ = "2.2.0"
__version__ = "2.2.1"
14 changes: 13 additions & 1 deletion src/sklearndf/_sklearndf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import inspect
import logging
from abc import ABCMeta, abstractmethod
from typing import Any, Dict, List, Mapping, Optional, TypeVar, Union, cast
from typing import Any, Callable, Dict, List, Mapping, Optional, TypeVar, Union, cast

import numpy.typing as npt
import pandas as pd
Expand Down Expand Up @@ -210,6 +210,18 @@ def _get_n_outputs(self) -> int:
# get the number of outputs this estimator has been fitted to
return len(self._get_outputs() or [])

def _repr_html_(self) -> str:
    """
    Render this estimator as HTML (hook used by Jupyter notebooks).

    Prefers the native scikit-learn HTML diagram representation of the
    underlying estimator when one is available, falling back to the
    expression-based representation otherwise.
    """
    try:
        # class HasExpressionRepr defines a _repr_html_ method; we want to
        # skip that one and call the _repr_html_ method of the superclass
        # (i.e., look up the attribute past HasExpressionRepr in the MRO,
        # which reaches scikit-learn's native implementation if present)
        sklearn_repr_html: Callable[[], str] = super(
            HasExpressionRepr, self
        )._repr_html_
    except AttributeError:
        # no native scikit-learn _repr_html_ further along the MRO;
        # fall back to the next _repr_html_ after the current class
        # (presumably HasExpressionRepr's expression-based rendering —
        # NOTE(review): confirm against the full class hierarchy)
        return super()._repr_html_()
    else:
        # native scikit-learn representation found: use it
        return sklearn_repr_html()

def to_expression(self) -> Expression:
"""[see superclass]"""

Expand Down
12 changes: 12 additions & 0 deletions src/sklearndf/transformation/extra/_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import logging

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from pytools.api import AllTracker
Expand All @@ -19,6 +20,17 @@
# import boruta classes only if installed
from boruta import BorutaPy

# Apply a hack to address boruta's incompatibility with numpy >= 1.24:
# boruta uses the aliases np.bool, np.int, and np.float, which were
# deprecated in numpy 1.20 and removed in numpy 1.24.
#
# We check whether these attributes are still defined in numpy, and if not,
# we define them as aliases to the corresponding numpy scalar types with a
# trailing underscore (np.bool_, np.int_, np.float_).

for __attr in ["bool", "int", "float"]:
    if not hasattr(np, __attr):
        setattr(np, __attr, getattr(np, f"{__attr}_"))
# remove the loop variable so it does not linger in the module namespace
del __attr

except ImportError:

class BorutaPy( # type: ignore
Expand Down
49 changes: 46 additions & 3 deletions test/test/sklearndf/test_base.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# inspired by:
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tests/test_base.py

import re
from typing import Any

import numpy as np
import pytest
import scipy.sparse as sp
import sklearn
from numpy.testing import assert_array_equal
from sklearn import clone
from sklearn.base import BaseEstimator, is_classifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.utils import estimator_html_repr

from pytools.expression import freeze, make_expression
from pytools.expression.atomic import Id

from sklearndf.classification import SVCDF, DecisionTreeClassifierDF
from sklearndf.clustering.wrapper import KMeansBaseWrapperDF
from sklearndf.pipeline import PipelineDF
from sklearndf.transformation import OneHotEncoderDF
from sklearndf.regression import RandomForestRegressorDF
from sklearndf.transformation import OneHotEncoderDF, SimpleImputerDF
from sklearndf.transformation.wrapper import ImputerWrapperDF
from sklearndf.wrapper import (
ClassifierWrapperDF,
Expand Down Expand Up @@ -75,7 +79,7 @@ def test_clone() -> None:
assert encoder.get_params() == new_encoder.get_params()

encoder = OneHotEncoderDF(handle_unknown="ignore", sparse=False)
new_encoder = clone(encoder)
new_encoder = sklearn.clone(encoder)

assert encoder is not new_encoder

Expand Down Expand Up @@ -161,6 +165,45 @@ def test_str() -> None:
str(my_estimator)


def test_html_repr() -> None:
    """
    Verify that the HTML representation of a sklearndf pipeline matches
    scikit-learn's native diagram representation (modulo generated ids).
    """

    # store the original display config so it can be restored afterwards
    display_original = sklearn.get_config()["display"]

    # set the display config to use diagrams
    sklearn.set_config(display="diagram")

    try:
        # a nested pipeline exercises recursive diagram rendering
        pipeline_df = PipelineDF(
            [
                (
                    "preprocess",
                    PipelineDF(
                        [
                            ("impute", SimpleImputerDF()),
                        ]
                    ),
                ),
                ("rf", RandomForestRegressorDF(n_estimators=120)),
            ]
        )

        def _replace_ids(_html: str) -> str:
            # scikit-learn generates new ids on subsequent calls to
            # estimator_html_repr, so we replace them with a placeholder
            return re.sub(
                r'(?<=id-)\d+|(?:(?<=sk-)|(?<=id=")|(?<=for="))\w+(?:-\w+)*',
                "#",
                _html,
            )

        # the DF wrapper's _repr_html_ must delegate to scikit-learn's
        # native representation of the same estimator
        assert _replace_ids(pipeline_df._repr_html_()) == _replace_ids(
            estimator_html_repr(pipeline_df)
        )

    finally:
        # reset the display config to its original value
        # (removed a redundant trailing `pass` statement here)
        sklearn.set_config(display=display_original)


def test_get_params() -> None:
test = DummyEstimator2DF(DummyEstimator3DF(), DummyEstimator3DF())

Expand Down
Loading

0 comments on commit 8335d8c

Please sign in to comment.