dcbench package

Subpackages

Submodules

dcbench.constants module

dcbench.version module

Module contents

The dcbench module is a collection of benchmarks that test various aspects of data preparation and handling in the context of AI workflows.

class Artifact(artifact_id, **kwargs)[source]

Bases: abc.ABC

Parameters

artifact_id (str) –

Return type

None

DEFAULT_EXT: str = ''
download(force=False)[source]
Parameters

force (bool) –

classmethod from_data(data, artifact_id=None)[source]
Parameters
  • data (Any) –

  • artifact_id (Optional[str]) –

static from_yaml(loader, node)[source]
Parameters

loader (yaml.loader.Loader) –

property is_downloaded: bool
property is_uploaded: bool
isdir: bool = False
abstract load()[source]
Return type

Any

property local_path: str
property remote_url: str
abstract save(data)[source]
Parameters

data (Any) –

Return type

None

static to_yaml(dumper, data)[source]
Parameters
upload(force=False, bucket=None)[source]
Parameters
  • force (bool) –

  • bucket (Optional[storage.Bucket]) –

class BudgetcleanProblem(id, artifacts, attributes=None)[source]

Bases: dcbench.common.problem.Problem

Parameters
  • id (str) –

  • artifacts (Mapping[str, Artifact]) –

  • attributes (Mapping[str, BASIC_TYPE]) –

artifact_specs: Mapping[str, dcbench.common.artifact.ArtifactSpec] = {'X_test': ArtifactSpec(description=('Features of the test dataset used to produce the final evaluation score of the model.',), artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'X_train_clean': ArtifactSpec(description='Features of the clean training dataset where each dirty value from the dirty dataset is replaced with the correct clean candidate.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'X_train_dirty': ArtifactSpec(description=('Features of the dirty training dataset which we need to clean. Each dirty cell contains an embedded list of clean candidate values.',), artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'X_val': ArtifactSpec(description='Feature of the validation dataset which can be used to guide the cleaning optimization process.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'y_test': ArtifactSpec(description='Labels of the test dataset.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'y_train': ArtifactSpec(description='Labels of the training dataset.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'y_val': ArtifactSpec(description='Labels of the validation dataset.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>)}
evaluate(solution)[source]
Parameters

solution (dcbench.tasks.budgetclean.problem.BudgetcleanSolution) –

Return type

dcbench.common.result.Result

classmethod from_id(scenario_id)[source]
Parameters

scenario_id (str) –

classmethod list()[source]
solve(idx_selected, **kwargs)[source]
Parameters
  • idx_selected (Any) –

  • kwargs (Any) –

Return type

dcbench.common.solution.Solution

task_id: str = 'budgetclean'
class CSVArtifact(artifact_id, **kwargs)[source]

Bases: dcbench.common.artifact.Artifact

Parameters

artifact_id (str) –

Return type

None

DEFAULT_EXT: str = 'csv'
load()[source]
Return type

pandas.core.frame.DataFrame

save(data)[source]
Parameters

data (pandas.core.frame.DataFrame) –

Return type

None

class DataPanelArtifact(artifact_id, **kwargs)[source]

Bases: dcbench.common.artifact.Artifact

Parameters

artifact_id (str) –

Return type

None

DEFAULT_EXT: str = 'mk'
isdir: bool = True
load()[source]
Return type

pandas.core.frame.DataFrame

save(data)[source]
Parameters

data (meerkat.datapanel.DataPanel) –

Return type

None

class MiniDataProblem(id, artifacts, attributes=None)[source]

Bases: dcbench.common.problem.Problem

Parameters
  • id (str) –

  • artifacts (Mapping[str, Artifact]) –

  • attributes (Mapping[str, BASIC_TYPE]) –

artifact_specs: Mapping[str, dcbench.common.artifact.ArtifactSpec] = {'test_data': ArtifactSpec(description='A DataPanel of test examples with columns ``id``, ``input``, and ``target``.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'train_data': ArtifactSpec(description='A DataPanel of train examples with columns ``id``, ``input``, and ``target``.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'val_data': ArtifactSpec(description='A DataPanel of validation examples with columns ``id``, ``input``, and ``target``.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>)}
evaluate(solution)[source]
Parameters

solution (dcbench.common.solution.Solution) –

task_id: str = 'minidata'
class ModelArtifact(artifact_id, **kwargs)[source]

Bases: dcbench.common.artifact.Artifact

Parameters

artifact_id (str) –

Return type

None

DEFAULT_EXT: str = 'pt'
load()[source]
Return type

dcbench.common.modeling.Model

save(data)[source]
Parameters

data (dcbench.common.modeling.Model) –

Return type

None

class Problem(id, artifacts, attributes=None)[source]

Bases: dcbench.common.artifact.ArtifactContainer

Parameters
  • id (str) –

  • artifacts (Mapping[str, Artifact]) –

  • attributes (Mapping[str, BASIC_TYPE]) –

container_type: str = 'problem'
abstract evaluate(solution)[source]
Parameters

solution (Solution) –

name: str
solution_class: type
summary: str
class SliceDiscoveryProblem(id, artifacts, attributes=None)[source]

Bases: dcbench.common.problem.Problem

Parameters
  • id (str) –

  • artifacts (Mapping[str, Artifact]) –

  • attributes (Mapping[str, BASIC_TYPE]) –

artifact_specs: Mapping[str, dcbench.common.artifact.ArtifactSpec] = {'activations': ArtifactSpec(description="A DataPanel of the model's activations with columns `id`,`act`", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'base_dataset': ArtifactSpec(description='A DataPanel representing the base dataset with columns `id` and `image`.', artifact_type=<class 'dcbench.common.artifact.VisionDatasetArtifact'>), 'clip': ArtifactSpec(description="A DataPanel of the image embeddings from OpenAI's CLIP model", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'model': ArtifactSpec(description='A trained PyTorch model to audit.', artifact_type=<class 'dcbench.common.artifact.ModelArtifact'>), 'test_predictions': ArtifactSpec(description="A DataPanel of the model's predictions with columns `id`,`target`, and `probs.`", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'test_slices': ArtifactSpec(description='A DataPanel of the ground truth slice labels with columns  `id`, `slices`.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'val_predictions': ArtifactSpec(description="A DataPanel of the model's predictions with columns `id`,`target`, and `probs.`", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>)}
evaluate(solution)[source]
Parameters

solution (dcbench.tasks.slice_discovery.problem.SliceDiscoverySolution) –

Return type

dict

solve(pred_slices_dp)[source]
Parameters

pred_slices_dp (meerkat.datapanel.DataPanel) –

Return type

dcbench.tasks.slice_discovery.problem.SliceDiscoverySolution

task_id: str = 'slice_discovery'
class Solution(id, artifacts, attributes=None)[source]

Bases: dcbench.common.artifact.ArtifactContainer

Parameters
  • id (str) –

  • artifacts (Mapping[str, Artifact]) –

  • attributes (Mapping[str, BASIC_TYPE]) –

container_type: str = 'solution'
class Task(task_id, name, summary, problem_class, solution_class, baselines=Empty DataFrame Columns: [] Index: [])[source]

Bases: dcbench.common.table.RowMixin

Task(task_id: str, name: str, summary: str, problem_class: type, solution_class: type, baselines: dcbench.common.table.Table = Empty DataFrame Columns: [] Index: [])

Parameters
  • task_id (str) –

  • name (str) –

  • summary (str) –

  • problem_class (type) –

  • solution_class (type) –

  • baselines (dcbench.common.table.Table) –

Return type

None

baselines: dcbench.common.table.Table = Empty DataFrame Columns: [] Index: []
download_problems(include_artifacts=False)[source]
Parameters

include_artifacts (bool) –

property local_problems_path
name: str
problem_class: type
property problems
property problems_path
property remote_problems_url
solution_class: type
summary: str
task_id: str
upload_problems(include_artifacts=False)[source]
Parameters

include_artifacts (bool) –

write_problems(containers)[source]
Parameters

containers (Sequence[dcbench.common.artifact.ArtifactContainer]) –

class VisionDatasetArtifact(artifact_id, **kwargs)[source]

Bases: dcbench.common.artifact.DataPanelArtifact

Parameters

artifact_id (str) –

Return type

None

COLUMN_SUBSETS = {'celeba': ['id', 'image', 'identity', 'split'], 'imagenet': ['id', 'image', 'name', 'synset']}
DEFAULT_EXT: str = 'mk'
download(force=False)[source]
Parameters

force (bool) –

classmethod from_name(name)[source]
Parameters

name (str) –

isdir: bool = True
class YAMLArtifact(artifact_id, **kwargs)[source]

Bases: dcbench.common.artifact.Artifact

Parameters

artifact_id (str) –

Return type

None

DEFAULT_EXT: str = 'yaml'
load()[source]
Return type

pandas.core.frame.DataFrame

save(data)[source]
Parameters

data (Any) –

Return type

None