Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error - cannot reindex on an axis with duplicate labels #115

Open
hanhandata opened this issue Aug 26, 2023 · 3 comments
Open

Error - cannot reindex on an axis with duplicate labels #115

hanhandata opened this issue Aug 26, 2023 · 3 comments

Comments

@hanhandata
Copy link

Hi,

I have successfully used the function on 100k rows of data, but I get an error when I try to fit on 2.5 million rows of data.
The error is: cannot reindex on an axis with duplicate labels

Do you know a solution for this error? Or is there a limit on the number of rows?

```

ValueError Traceback (most recent call last)
Cell In[43], line 1
----> 1 bucketing_process.fit(X_train, y_train)

File ~/anaconda3/lib/python3.10/site-packages/skorecard/pipeline/bucketing_process.py:281, in BucketingProcess.fit(self, X, y)
273 self.bucket_tables_[column] = build_bucket_table(
274 X_prebucketed_,
275 y,
276 column=column,
277 bucket_mapping=self.pipeline_.features_bucket_mapping_.get(column),
278 )
280 # Calculate the summary
--> 281 self._generate_summary(X, y)
283 return self

File ~/anaconda3/lib/python3.10/site-packages/skorecard/reporting/report.py:229, in SummaryMethod._generate_summary(self, X, y)
227 # Calculate information value
228 if y is not None:
--> 229 iv_scores = iv(self.transform(X), y)
230 else:
231 iv_scores = {}

File ~/anaconda3/lib/python3.10/site-packages/skorecard/reporting/report.py:378, in iv(X, y, epsilon, digits)
338 def iv(X: pd.DataFrame, y: pd.Series, epsilon: float = 0.0001, digits: Optional[int] = None) -> Dict:
339 r"""
340 Calculate the Information Value (IV) of the features in X.
341
(...)
376 IVs (dict): Keys are feature names, values are the IV values
377 """ # noqa
--> 378 return {col: _IV_score(y, X[col], epsilon=epsilon, digits=digits) for col in X.columns}

File ~/anaconda3/lib/python3.10/site-packages/skorecard/reporting/report.py:378, in <dictcomp>(.0)
338 def iv(X: pd.DataFrame, y: pd.Series, epsilon: float = 0.0001, digits: Optional[int] = None) -> Dict:
339 r"""
340 Calculate the Information Value (IV) of the features in X.
341
(...)
376 IVs (dict): Keys are feature names, values are the IV values
377 """ # noqa
--> 378 return {col: _IV_score(y, X[col], epsilon=epsilon, digits=digits) for col in X.columns}

File ~/anaconda3/lib/python3.10/site-packages/skorecard/metrics/metrics.py:76, in _IV_score(y_test, y_pred, epsilon, digits)
62 def _IV_score(y_test, y_pred, epsilon=0.0001, digits=None):
63 """Using the unique values in y_pred, calculates the information value for the specific np.array.
64
65 Args:
(...)
74
75 """
---> 76 df = woe_1d(y_pred, y_test, epsilon=epsilon)
78 iv = ((df["non_target"] - df["target"]) * df["woe"]).sum()
80 if digits:

File ~/anaconda3/lib/python3.10/site-packages/skorecard/metrics/metrics.py:31, in woe_1d(X, y, epsilon)
28 # Ensure classes in y start at zero
29 y = y - min(y)
---> 31 df = pd.concat([X, y], axis=1, ignore_index=True)
32 df.columns = ["feat", "target"]
34 total_pos = df["target"].sum()

File ~/anaconda3/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/reshape/concat.py:381, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
159 """
160 Concatenate pandas objects along a particular axis.
161
(...)
366 1 3 4
367 """
368 op = _Concatenator(
369 objs,
370 axis=axis,
(...)
378 sort=sort,
379 )
--> 381 return op.get_result()

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/reshape/concat.py:592, in _Concatenator.get_result(self)
589 cons = sample._constructor_expanddim
591 index, columns = self.new_axes
--> 592 df = cons(data, index=index, copy=self.copy)
593 df.columns = columns
594 return df.__finalize__(self, method="concat")

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/frame.py:664, in DataFrame.__init__(self, data, index, columns, dtype, copy)
658 mgr = self._init_mgr(
659 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
660 )
662 elif isinstance(data, dict):
663 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 664 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
665 elif isinstance(data, ma.MaskedArray):
666 import numpy.ma.mrecords as mrecords

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/construction.py:493, in dict_to_mgr(data, index, columns, dtype, typ, copy)
489 else:
490 # dtype check to exclude e.g. range objects, scalars
491 arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]
--> 493 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/construction.py:123, in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate)
120 index = ensure_index(index)
122 # don't force copy because getting jammed in an ndarray anyway
--> 123 arrays = _homogenize(arrays, index, dtype)
124 # _homogenize ensures
125 # - all(len(x) == len(index) for x in arrays)
126 # - all(x.ndim == 1 for x in arrays)
(...)
129
130 else:
131 index = ensure_index(index)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/construction.py:599, in _homogenize(data, index, dtype)
595 val = val.astype(dtype, copy=False)
596 if val.index is not index:
597 # Forces alignment. No need to copy data since we
598 # are putting it into an ndarray later
--> 599 val = val.reindex(index, copy=False)
601 val = val._values
602 else:

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/series.py:5094, in Series.reindex(self, *args, **kwargs)
5090 raise TypeError(
5091 "'index' passed as both positional and keyword argument"
5092 )
5093 kwargs.update({"index": index})
-> 5094 return super().reindex(**kwargs)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py:5289, in NDFrame.reindex(self, *args, **kwargs)
5286 return self._reindex_multi(axes, copy, fill_value)
5288 # perform the reindex on the axes
-> 5289 return self._reindex_axes(
5290 axes, level, limit, tolerance, method, fill_value, copy
5291 ).__finalize__(self, method="reindex")

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py:5309, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
5304 new_index, indexer = ax.reindex(
5305 labels, level=level, limit=limit, tolerance=tolerance, method=method
5306 )
5308 axis = self._get_axis_number(a)
-> 5309 obj = obj._reindex_with_indexers(
5310 {axis: [new_index, indexer]},
5311 fill_value=fill_value,
5312 copy=copy,
5313 allow_dups=False,
5314 )
5315 # If we've made a copy once, no need to make another one
5316 copy = False

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py:5355, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
5352 indexer = ensure_platform_int(indexer)
5354 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
-> 5355 new_data = new_data.reindex_indexer(
5356 index,
5357 indexer,
5358 axis=baxis,
5359 fill_value=fill_value,
5360 allow_dups=allow_dups,
5361 copy=copy,
5362 )
5363 # If we've made a copy once, no need to make another one
5364 copy = False

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:737, in BaseBlockManager.reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, only_slice, use_na_proxy)
735 # some axes don't allow reindexing with dups
736 if not allow_dups:
--> 737 self.axes[axis]._validate_can_reindex(indexer)
739 if axis >= self.ndim:
740 raise IndexError("Requested axis not found in manager")

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/indexes/base.py:4316, in Index._validate_can_reindex(self, indexer)
4314 # trying to reindex on an axis with duplicates
4315 if not self._index_as_unique and len(indexer):
-> 4316 raise ValueError("cannot reindex on an axis with duplicate labels")

ValueError: cannot reindex on an axis with duplicate labels

@hanhandata
Copy link
Author

image

@timvink
Copy link
Collaborator

timvink commented Aug 26, 2023

It seems the index of your dataset is not unique. Try pandas df.reset_index(drop=True) (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html)

You might also be using a pandas multi-index, which I recommend removing before training

@hanhandata
Copy link
Author

Hi @timvink

It works for me, thanks a lot.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants