# Source code for dcbench.tasks.budgetclean.common
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
class Preprocessor(object):
    """Encode a mixed-type feature table and its labels as numeric arrays.

    Numeric columns are imputed (strategy chosen by ``num_strategy``) and
    min-max scaled. Non-numeric columns are imputed with the constant
    string ``"missing"`` and one-hot encoded; categories unseen during
    ``fit`` are ignored at transform time. Labels are integer-encoded with
    a ``LabelEncoder``.

    Attributes set by ``fit``:
        num_features: column labels of the numeric features.
        cat_features: column labels of the non-numeric features.
        cat_transformer: imputer + one-hot pipeline (only created when
            there is at least one non-numeric column).
    """

    def __init__(self, num_strategy="mean"):
        """Create the (unfitted) transformers.

        Args:
            num_strategy: ``strategy`` passed to the ``SimpleImputer`` used
                for numeric columns.
        """
        super(Preprocessor, self).__init__()
        self.num_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy=num_strategy)),
                ("scaler", MinMaxScaler()),
            ]
        )
        self.feature_enc = self._make_onehot_encoder()
        self.cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
        self.label_enc = LabelEncoder()

    @staticmethod
    def _make_onehot_encoder():
        """Build a dense-output OneHotEncoder across scikit-learn versions."""
        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # removed the old keyword; try the new name first and fall back for
        # older releases, where the unknown keyword raises TypeError.
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            return OneHotEncoder(sparse=False, handle_unknown="ignore")

    def fit(self, X_train, y_train, X_full=None):
        """Fit the feature transformers and the label encoder.

        Args:
            X_train: training features; must support ``select_dtypes`` and
                column indexing (e.g. a pandas DataFrame).
            y_train: training labels; must expose ``.values``.
            X_full: optional table used to fit the categorical transformer
                instead of ``X_train``, so that its categories are known to
                the one-hot encoder. Defaults to ``X_train``.

        Returns:
            self, to allow call chaining.
        """
        self.num_features = X_train.select_dtypes(include="number").columns
        self.cat_features = X_train.select_dtypes(exclude="number").columns

        # ``len(...)`` rather than truthiness: the truth value of a pandas
        # Index is ambiguous.
        if len(self.num_features) > 0:
            self.num_transformer.fit(X_train[self.num_features].values)

        if len(self.cat_features) > 0:
            if X_full is None:
                X_full = X_train
            self.cat_transformer = Pipeline(
                steps=[("imputer", self.cat_imputer), ("onehot", self.feature_enc)]
            )
            self.cat_transformer.fit(X_full[self.cat_features].values)

        self.label_enc.fit(y_train.values.ravel())
        return self

    def transform(self, X=None, y=None):
        """Transform features and/or labels with the fitted transformers.

        ``fit`` must have been called first.

        Args:
            X: optional features, indexable by the column labels seen in
                ``fit``. A one-dimensional selection is treated as a single
                row.
            y: optional labels exposing ``.values``.

        Returns:
            The transformed ``X`` if only ``X`` is given, the encoded ``y``
            if only ``y`` is given, the tuple ``(X, y)`` if both are given,
            and ``None`` if neither is given. Numeric columns come first in
            the transformed ``X``, followed by the one-hot columns. When
            ``fit`` saw no feature columns at all, the transformed ``X`` is
            an empty array with zero columns.
        """
        if X is not None:
            X_after = []

            if len(self.num_features) > 0:
                X_arr = X[self.num_features].values
                if len(X_arr.shape) == 1:
                    # Single sample: reshape to one row.
                    X_arr = X_arr.reshape(1, -1)
                X_after.append(self.num_transformer.transform(X_arr))

            if len(self.cat_features) > 0:
                # NOTE(review): the cast to object presumably keeps mixed
                # values compatible with the string fill value -- confirm.
                X_arr = X[self.cat_features].values.astype(object)
                if len(X_arr.shape) == 1:
                    X_arr = X_arr.reshape(1, -1)
                X_after.append(self.cat_transformer.transform(X_arr))

            if X_after:
                X = np.hstack(X_after)
            else:
                # np.hstack([]) raises ValueError; with no feature columns
                # return an empty matrix with one row per input sample.
                n_rows = len(X) if getattr(X, "ndim", 2) > 1 else 1
                X = np.empty((n_rows, 0))

        if y is not None:
            y = self.label_enc.transform(y.values.ravel())

        if X is None:
            return y
        if y is None:
            return X
        return X, y