Source code for dcbench.tasks.budgetclean.baselines

import random
import time

import numpy as np

from ...common.solver import solver
from .common import Preprocessor
from .cpclean.algorithm.select import entropy_expected
from .cpclean.algorithm.sort_count import sort_count_after_clean_multi
from .cpclean.clean import CPClean, Querier

# avoid circular dependency
from .problem import BudgetcleanProblem, BudgetcleanSolution


[docs]@solver( id="random_clean", summary="Always selects a random subset of the data to clean." ) def random_clean(problem: BudgetcleanProblem, seed: int = 1337) -> BudgetcleanSolution: size = len(problem["X_train_dirty"]) budget = int(problem.attributes["budget"] * size) random.seed(seed) selection = random.sample(range(size), budget) idx_selected = [idx in selection for idx in range(size)] return problem.solve(idx_selected=idx_selected)
[docs]@solver( id="cp_clean", summary=( "Perform the selection using the CPClean algorithm which aims to " " maximize expected information gain." ), ) def cp_clean( problem: BudgetcleanProblem, seed: int = 1337, n_jobs=8, kparam=3 ) -> BudgetcleanSolution: size = len(problem["X_train_dirty"]) budget = int(problem.attributes["budget"] * size) X_train_dirty = problem["X_train_dirty"] X_train_clean = problem["X_train_clean"] y_train = problem["y_train"] X_val = problem["X_val"] # Compute number of repairs. def length(x): if isinstance(x, list): return len(x) return 0 num_repairs = X_train_dirty.applymap(length).max().max() # Reconstruct separate repair data frames. X_train_repairs = {} for i in range(num_repairs): def getitem(x): if isinstance(x, list): return x[i] if len(x) > i else None return x X_train_repairs["repair%02d" % i] = X_train_dirty.applymap(getitem) # Replace lists with None values. def clearlists(x): if isinstance(x, list): return None return x X_train_dirty = X_train_dirty.applymap(clearlists) # Preprocess data. preprocessor = Preprocessor() preprocessor.fit(X_train_dirty, y_train) X_train_clean, y_train = preprocessor.transform(X_train_clean, y_train) X_val = preprocessor.transform(X_val) for name, X in X_train_repairs.items(): X_train_repairs[name] = preprocessor.transform(X=X) d_train_repairs = [] repair_methods = sorted(X_train_repairs.keys()) X_train_repairs_sorted = [X_train_repairs[m] for m in repair_methods] for X in X_train_repairs_sorted: d = np.sum((X - X_train_clean) ** 2, axis=1) d_train_repairs.append(d) d_train_repairs = np.array(d_train_repairs).T gt_indices = np.argmin(d_train_repairs, axis=1) X_train_gt = [] for i, gt_i in enumerate(gt_indices): X_train_gt.append(X_train_repairs_sorted[gt_i][i]) X_train_gt = np.array(X_train_gt) # Perform cleaning using CPClean. cleaner = CPClean(K=kparam, n_jobs=n_jobs, random_state=seed) X_train_repairs = np.array([X_train_repairs[k] for k in X_train_repairs]) space, S_val, gt_indices, MM = cleaner.make_space( X_train_repairs, X_val, gt=X_train_gt ) init_querier = Querier(kparam, S_val, y_train, n_jobs=n_jobs, random_state=seed) start = time.time() after_entropy_val = sort_count_after_clean_multi( S_val, y_train, kparam, n_jobs, MM=MM ) end = time.time() print("sort_count_after_clean_multi", end - start) start = time.time() _, before_entropy_val = init_querier.run_q2(MM=MM, return_entropy=True) end = time.time() print("run_q2", end - start) dirty_rows = [i for i, x in enumerate(S_val[0]) if len(x) > 1] info_gain = entropy_expected( after_entropy_val, dirty_rows, before_entropy_val, n_jobs=n_jobs ) selection = np.argpartition(info_gain, -budget)[-budget:] # Produce solution. idx_selected = [idx in selection for idx in range(len(X_train_dirty))] return problem.solve(idx_selected=idx_selected)