Error - cannot reindex on an axis with duplicate labels #115

hanhandata · 2023-08-26T19:08:23Z

Hi,

I have successfully used the function on 100k rows of data, but I get an error when I try to fit on 2.5 million rows.
The error is: cannot reindex on an axis with duplicate labels

Do you know a solution for this error? Or is there a limit on the number of rows?

**```

ValueError Traceback (most recent call last)
Cell In[43], line 1
----> 1 bucketing_process.fit(X_train, y_train)

File ~/anaconda3/lib/python3.10/site-packages/skorecard/pipeline/bucketing_process.py:281, in BucketingProcess.fit(self, X, y)
273 self.bucket_tables_[column] = build_bucket_table(
274 X_prebucketed_,
275 y,
276 column=column,
277 bucket_mapping=self.pipeline_.features_bucket_mapping_.get(column),
278 )
280 # Calculate the summary
--> 281 self._generate_summary(X, y)
283 return self

File ~/anaconda3/lib/python3.10/site-packages/skorecard/reporting/report.py:229, in SummaryMethod._generate_summary(self, X, y)
227 # Calculate information value
228 if y is not None:
--> 229 iv_scores = iv(self.transform(X), y)
230 else:
231 iv_scores = {}

File ~/anaconda3/lib/python3.10/site-packages/skorecard/reporting/report.py:378, in iv(X, y, epsilon, digits)
338 def iv(X: pd.DataFrame, y: pd.Series, epsilon: float = 0.0001, digits: Optional[int] = None) -> Dict:
339 r"""
340 Calculate the Information Value (IV) of the features in X.
341
(...)
376 IVs (dict): Keys are feature names, values are the IV values
377 """ # noqa
--> 378 return {col: _IV_score(y, X[col], epsilon=epsilon, digits=digits) for col in X.columns}

File ~/anaconda3/lib/python3.10/site-packages/skorecard/reporting/report.py:378, in <dictcomp>(.0)
338 def iv(X: pd.DataFrame, y: pd.Series, epsilon: float = 0.0001, digits: Optional[int] = None) -> Dict:
339 r"""
340 Calculate the Information Value (IV) of the features in X.
341
(...)
376 IVs (dict): Keys are feature names, values are the IV values
377 """ # noqa
--> 378 return {col: _IV_score(y, X[col], epsilon=epsilon, digits=digits) for col in X.columns}

File ~/anaconda3/lib/python3.10/site-packages/skorecard/metrics/metrics.py:76, in _IV_score(y_test, y_pred, epsilon, digits)
62 def _IV_score(y_test, y_pred, epsilon=0.0001, digits=None):
63 """Using the unique values in y_pred, calculates the information value for the specific np.array.
64
65 Args:
(...)
74
75 """
---> 76 df = woe_1d(y_pred, y_test, epsilon=epsilon)
78 iv = ((df["non_target"] - df["target"]) * df["woe"]).sum()
80 if digits:

File ~/anaconda3/lib/python3.10/site-packages/skorecard/metrics/metrics.py:31, in woe_1d(X, y, epsilon)
28 # Ensure classes in y start at zero
29 y = y - min(y)
---> 31 df = pd.concat([X, y], axis=1, ignore_index=True)
32 df.columns = ["feat", "target"]
34 total_pos = df["target"].sum()

File ~/anaconda3/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/reshape/concat.py:381, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
159 """
160 Concatenate pandas objects along a particular axis.
161
(...)
366 1 3 4
367 """
368 op = _Concatenator(
369 objs,
370 axis=axis,
(...)
378 sort=sort,
379 )
--> 381 return op.get_result()

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/reshape/concat.py:592, in _Concatenator.get_result(self)
589 cons = sample._constructor_expanddim
591 index, columns = self.new_axes
--> 592 df = cons(data, index=index, copy=self.copy)
593 df.columns = columns
594 return df.__finalize__(self, method="concat")

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/frame.py:664, in DataFrame.__init__(self, data, index, columns, dtype, copy)
658 mgr = self._init_mgr(
659 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
660 )
662 elif isinstance(data, dict):
663 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 664 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
665 elif isinstance(data, ma.MaskedArray):
666 import numpy.ma.mrecords as mrecords

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/construction.py:493, in dict_to_mgr(data, index, columns, dtype, typ, copy)
489 else:
490 # dtype check to exclude e.g. range objects, scalars
491 arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]
--> 493 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/construction.py:123, in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate)
120 index = ensure_index(index)
122 # don't force copy because getting jammed in an ndarray anyway
--> 123 arrays = _homogenize(arrays, index, dtype)
124 # _homogenize ensures
125 # - all(len(x) == len(index) for x in arrays)
126 # - all(x.ndim == 1 for x in arrays)
(...)
129
130 else:
131 index = ensure_index(index)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/construction.py:599, in _homogenize(data, index, dtype)
595 val = val.astype(dtype, copy=False)
596 if val.index is not index:
597 # Forces alignment. No need to copy data since we
598 # are putting it into an ndarray later
--> 599 val = val.reindex(index, copy=False)
601 val = val._values
602 else:

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/series.py:5094, in Series.reindex(self, *args, **kwargs)
5090 raise TypeError(
5091 "'index' passed as both positional and keyword argument"
5092 )
5093 kwargs.update({"index": index})
-> 5094 return super().reindex(**kwargs)

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py:5289, in NDFrame.reindex(self, *args, **kwargs)
5286 return self._reindex_multi(axes, copy, fill_value)
5288 # perform the reindex on the axes
-> 5289 return self._reindex_axes(
5290 axes, level, limit, tolerance, method, fill_value, copy
5291 ).__finalize__(self, method="reindex")

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py:5309, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
5304 new_index, indexer = ax.reindex(
5305 labels, level=level, limit=limit, tolerance=tolerance, method=method
5306 )
5308 axis = self._get_axis_number(a)
-> 5309 obj = obj._reindex_with_indexers(
5310 {axis: [new_index, indexer]},
5311 fill_value=fill_value,
5312 copy=copy,
5313 allow_dups=False,
5314 )
5315 # If we've made a copy once, no need to make another one
5316 copy = False

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py:5355, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
5352 indexer = ensure_platform_int(indexer)
5354 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
-> 5355 new_data = new_data.reindex_indexer(
5356 index,
5357 indexer,
5358 axis=baxis,
5359 fill_value=fill_value,
5360 allow_dups=allow_dups,
5361 copy=copy,
5362 )
5363 # If we've made a copy once, no need to make another one
5364 copy = False

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:737, in BaseBlockManager.reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, only_slice, use_na_proxy)
735 # some axes don't allow reindexing with dups
736 if not allow_dups:
--> 737 self.axes[axis]._validate_can_reindex(indexer)
739 if axis >= self.ndim:
740 raise IndexError("Requested axis not found in manager")

File ~/anaconda3/lib/python3.10/site-packages/pandas/core/indexes/base.py:4316, in Index._validate_can_reindex(self, indexer)
4314 # trying to reindex on an axis with duplicates
4315 if not self._index_as_unique and len(indexer):
-> 4316 raise ValueError("cannot reindex on an axis with duplicate labels")

ValueError: cannot reindex on an axis with duplicate labels

The text was updated successfully, but these errors were encountered:

hanhandata · 2023-08-26T19:17:14Z

timvink · 2023-08-26T19:22:42Z

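It seems the index of your dataset is not unique. Try resetting it with pandas, e.g. `X_train = X_train.reset_index(drop=True)` (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html). Do the same for `y_train` so that X and y stay aligned, since the failing step concatenates them along the index.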

You might also be using a pandas multi-index, which I recommend removing before training.

hanhandata · 2023-08-28T15:02:28Z

Hi @timvink

It works for me, thanks a lot.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Error - cannot reindex on an axis with duplicate labels #115

Error - cannot reindex on an axis with duplicate labels #115

hanhandata commented Aug 26, 2023

hanhandata commented Aug 26, 2023

timvink commented Aug 26, 2023 •

edited

Loading

hanhandata commented Aug 28, 2023

Error - cannot reindex on an axis with duplicate labels #115

Error - cannot reindex on an axis with duplicate labels #115

Comments

hanhandata commented Aug 26, 2023

**```

hanhandata commented Aug 26, 2023

timvink commented Aug 26, 2023 • edited Loading

hanhandata commented Aug 28, 2023

timvink commented Aug 26, 2023 •

edited

Loading