Source code for liesel_gam.category_mapping
from __future__ import annotations
from collections.abc import Sequence
from typing import Any
import numpy as np
import pandas as pd
# Loose type alias: ``Array`` is simply ``Any``, so it places no constraint on
# the values it annotates.
Array = Any
class CategoryError(KeyError):
    """Base class for failed category lookups; a specialised ``KeyError``."""


class UnknownLabelError(CategoryError):
    """Signals that a category label has no integer code assigned to it."""


class UnknownCodeError(CategoryError):
    """Signals that an integer code has no category label assigned to it."""
[docs]
class CategoryMapping:
    """
    Wraps a category mapping of labels to integers.

    The mapping is stored in both directions (``labels_to_integers_map`` and
    ``integers_to_labels_map``) so that conversions can be done either way.
    """

    def __init__(self, labels_to_integers_map: dict[Any, int]) -> None:
        """
        Store the label -> integer mapping and derive its inverse.

        If several labels share one integer code, the inverse mapping keeps
        only the label that comes last in the dictionary's iteration order.
        """
        # Legacy placeholder values, retained so that existing attribute
        # access keeps working. The lookup methods of this class raise for
        # unknown entries instead of returning these values.
        self._code_for_unknown_label = -1
        self._label_for_unknown_code = None

        self.labels_to_integers_map = labels_to_integers_map
        self.integers_to_labels_map = {
            code: label for label, code in self.labels_to_integers_map.items()
        }

    @classmethod
    def from_series(cls, series: pd.Series | pd.Categorical) -> CategoryMapping:
        """
        Build a mapping from the categories found in ``series``.

        When series is a pd.Categorical (or a pd.Series with categorical
        dtype), the category sorting is kept.
        When series is any other pd.Series (e.g. of dtype str or object),
        the categories are sorted with ``np.sort`` (alphabetically for
        strings).

        Labels are numbered consecutively, starting at 0, in that order.

        Raises
        ------
        TypeError
            If ``series`` is neither a pd.Series nor a pd.Categorical.
        """
        # The type is checked *before* touching ``series.dtype``, so that
        # objects without a ``dtype`` attribute (lists, tuples, ...) produce
        # the documented TypeError instead of an AttributeError.
        if isinstance(series, pd.Categorical):
            unique_labels = np.asarray(series.categories)
        elif isinstance(series, pd.Series):
            if isinstance(series.dtype, pd.CategoricalDtype):
                unique_labels = np.asarray(series.cat.categories)
            else:
                cat = pd.Categorical(series)
                unique_labels = np.sort(np.asarray(cat.categories))
        else:
            raise TypeError(
                f"series must be a pd.Series or pd.Categorical, got {type(series)}."
            )

        mapping = {val: i for i, val in enumerate(unique_labels)}
        return cls(mapping)

    def to_integers(
        self, labels_or_integers: np.typing.ArrayLike | Sequence[int] | Sequence[str]
    ) -> np.typing.NDArray[np.int_]:
        """
        Return integer codes for an input holding either codes or labels.

        Input with an integer dtype is validated against the known codes and
        returned as an integer array. Any other input is treated as labels
        and converted with :meth:`labels_to_integers`.

        Raises
        ------
        ValueError
            If an integer input contains codes that are not in the mapping.
        UnknownLabelError
            If a label input contains labels that are not in the mapping.
        """
        arr = np.asarray(labels_or_integers)

        # Case 1: Not an integer array, so treat the values as labels.
        if not np.issubdtype(arr.dtype, np.integer):
            return self.labels_to_integers(arr)

        # Case 2: Already an integer array; only validation is needed.
        valid_integers = np.array(list(self.integers_to_labels_map))
        is_known = np.isin(arr, valid_integers)
        if not is_known.all():
            invalid = arr[~is_known]
            raise ValueError(
                f"Unknown integer codes: {invalid.tolist()} "
                f"(valid integers: {valid_integers.tolist()})"
            )
        return arr.astype(int, copy=False)

    def to_labels(
        self, labels_or_integers: np.typing.ArrayLike | Sequence[int] | Sequence[str]
    ) -> np.typing.NDArray[Any]:
        """
        Return labels for an input holding either codes or labels.

        Input with an integer dtype is converted with
        :meth:`integers_to_labels`. Any other input is treated as labels,
        validated against the known labels and returned as an array.

        Raises
        ------
        ValueError
            If a label input contains labels that are not in the mapping.
        UnknownCodeError
            If an integer input contains codes that are not in the mapping.
        """
        arr = np.asarray(labels_or_integers)

        # Case 1: It is an integer array.
        if np.issubdtype(arr.dtype, np.integer):
            return self.integers_to_labels(arr)

        # Case 2: Otherwise treat as labels; only validation is needed.
        valid_labels = np.array(list(self.labels_to_integers_map))
        is_known = np.isin(arr, valid_labels)
        if not is_known.all():
            invalid = arr[~is_known]
            raise ValueError(
                f"Unknown labels: {invalid.tolist()} "
                f"(valid labels: {valid_labels.tolist()})"
            )
        return arr

    def labels_to_integers(
        self, labels: np.typing.ArrayLike | Sequence[str]
    ) -> np.typing.NDArray[np.int_]:
        """
        A function of labels -> integers.

        The result has the same shape as the input.

        Raises
        ------
        UnknownLabelError
            If a label is not part of the mapping.
        """
        labels = np.asarray(labels)
        lookup = self.labels_to_integers_map

        # A private sentinel marks "not found", so that every integer,
        # including -1, remains usable as a regular code in the mapping.
        missing = object()

        codes_flat = np.empty(labels.size, dtype=np.int_)
        for i, label in enumerate(labels.flat):
            code = lookup.get(label, missing)
            if code is missing:
                raise UnknownLabelError(f"Category label {label} is unknown.")
            codes_flat[i] = code

        # The ndarray method is used instead of np.reshape(..., shape=...),
        # because the ``shape`` keyword is not accepted by older NumPy.
        return codes_flat.reshape(labels.shape)

    def integers_to_labels(
        self, integers: np.typing.ArrayLike | Sequence[int]
    ) -> np.typing.NDArray[Any]:
        """
        A function of integers -> labels.

        The result has the same shape as the input.

        Raises
        ------
        UnknownCodeError
            If an integer code is not part of the mapping.
        """
        integers = np.asarray(integers)
        lookup = self.integers_to_labels_map

        # A private sentinel compared by identity marks "not found"; this
        # avoids relying on ``==`` between arbitrary label objects and None.
        missing = object()

        labels_flat_list = []
        for code in integers.flat:
            label = lookup.get(code, missing)
            if label is missing:
                raise UnknownCodeError(f"Category code {code} is unknown.")
            labels_flat_list.append(label)

        return np.asarray(labels_flat_list).reshape(integers.shape)
def series_is_categorical(series: pd.Series | pd.Categorical) -> bool:
    """
    Decide, liberally, whether ``series`` should be handled as categorical.

    The answer is ``True`` for:

    - dtype ``str``
    - dtype ``object``
    - a ``CategoricalDtype``

    Raises
    ------
    TypeError
        If the dtype compares equal to ``"string"`` and is also named
        ``"string"``.
    """
    # Same rule that formulaic applies to detect categorical columns; see
    # formulaic.materializers.pandas.PandasMaterializer._is_categorical
    dtype = series.dtype
    text_or_object = dtype in ("str", "object")
    declared_categorical = isinstance(dtype, pd.CategoricalDtype)

    # Both conditions must hold: the dtype has to compare equal to "string"
    # *and* carry exactly that name.
    # NOTE(review): presumably the name check spares dtypes that merely
    # compare equal to "string" under a different name -- confirm.
    if dtype == "string" and dtype.name == "string":
        raise TypeError(
            f"Pandas dtype {series.dtype} cannot be safely interpreted as "
            "categorical, please process to dtype str or object."
        )

    return declared_categorical or text_or_object