dcbench package
Subpackages
Submodules
dcbench.constants module
dcbench.version module
Module contents
The dcbench module is a collection of benchmarks that test various aspects of data preparation and handling in the context of AI workflows.
- class Artifact(artifact_id, **kwargs)[source]
Bases:
abc.ABC
- Parameters
artifact_id (str) –
- Return type
None
- DEFAULT_EXT: str = ''
- classmethod from_data(data, artifact_id=None)[source]
- Parameters
data (Any) –
artifact_id (Optional[str]) –
- property is_downloaded: bool
- property is_uploaded: bool
- isdir: bool = False
- property local_path: str
- property remote_url: str
- static to_yaml(dumper, data)[source]
- Parameters
dumper (yaml.dumper.Dumper) –
data (dcbench.common.artifact.Artifact) –
- class BudgetcleanProblem(id, artifacts, attributes=None)[source]
Bases:
dcbench.common.problem.Problem
- Parameters
id (str) –
artifacts (Mapping[str, Artifact]) –
attributes (Mapping[str, BASIC_TYPE]) –
- artifact_specs: Mapping[str, dcbench.common.artifact.ArtifactSpec] = {'X_test': ArtifactSpec(description=('Features of the test dataset used to produce the final evaluation score of the model.',), artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'X_train_clean': ArtifactSpec(description='Features of the clean training dataset where each dirty value from the dirty dataset is replaced with the correct clean candidate.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'X_train_dirty': ArtifactSpec(description=('Features of the dirty training dataset which we need to clean. Each dirty cell contains an embedded list of clean candidate values.',), artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'X_val': ArtifactSpec(description='Features of the validation dataset which can be used to guide the cleaning optimization process.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'y_test': ArtifactSpec(description='Labels of the test dataset.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'y_train': ArtifactSpec(description='Labels of the training dataset.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>), 'y_val': ArtifactSpec(description='Labels of the validation dataset.', artifact_type=<class 'dcbench.common.artifact.CSVArtifact'>)}
- evaluate(solution)[source]
- Parameters
solution (dcbench.tasks.budgetclean.problem.BudgetcleanSolution) –
- Return type
- task_id: str = 'budgetclean'
- class CSVArtifact(artifact_id, **kwargs)[source]
Bases:
dcbench.common.artifact.Artifact
- Parameters
artifact_id (str) –
- Return type
None
- DEFAULT_EXT: str = 'csv'
- class DataPanelArtifact(artifact_id, **kwargs)[source]
Bases:
dcbench.common.artifact.Artifact
- Parameters
artifact_id (str) –
- Return type
None
- DEFAULT_EXT: str = 'mk'
- isdir: bool = True
- class MiniDataProblem(id, artifacts, attributes=None)[source]
Bases:
dcbench.common.problem.Problem
- Parameters
id (str) –
artifacts (Mapping[str, Artifact]) –
attributes (Mapping[str, BASIC_TYPE]) –
- artifact_specs: Mapping[str, dcbench.common.artifact.ArtifactSpec] = {'test_data': ArtifactSpec(description='A DataPanel of test examples with columns ``id``, ``input``, and ``target``.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'train_data': ArtifactSpec(description='A DataPanel of train examples with columns ``id``, ``input``, and ``target``.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'val_data': ArtifactSpec(description='A DataPanel of validation examples with columns ``id``, ``input``, and ``target``.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>)}
- evaluate(solution)[source]
- Parameters
solution (dcbench.common.solution.Solution) –
- task_id: str = 'minidata'
- class ModelArtifact(artifact_id, **kwargs)[source]
Bases:
dcbench.common.artifact.Artifact
- Parameters
artifact_id (str) –
- Return type
None
- DEFAULT_EXT: str = 'pt'
- class Problem(id, artifacts, attributes=None)[source]
Bases:
dcbench.common.artifact.ArtifactContainer
- Parameters
id (str) –
artifacts (Mapping[str, Artifact]) –
attributes (Mapping[str, BASIC_TYPE]) –
- container_type: str = 'problem'
- name: str
- solution_class: type
- summary: str
- class SliceDiscoveryProblem(id, artifacts, attributes=None)[source]
Bases:
dcbench.common.problem.Problem
- Parameters
id (str) –
artifacts (Mapping[str, Artifact]) –
attributes (Mapping[str, BASIC_TYPE]) –
- artifact_specs: Mapping[str, dcbench.common.artifact.ArtifactSpec] = {'activations': ArtifactSpec(description="A DataPanel of the model's activations with columns `id`,`act`", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'base_dataset': ArtifactSpec(description='A DataPanel representing the base dataset with columns `id` and `image`.', artifact_type=<class 'dcbench.common.artifact.VisionDatasetArtifact'>), 'clip': ArtifactSpec(description="A DataPanel of the image embeddings from OpenAI's CLIP model", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'model': ArtifactSpec(description='A trained PyTorch model to audit.', artifact_type=<class 'dcbench.common.artifact.ModelArtifact'>), 'test_predictions': ArtifactSpec(description="A DataPanel of the model's predictions with columns `id`,`target`, and `probs.`", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'test_slices': ArtifactSpec(description='A DataPanel of the ground truth slice labels with columns `id`, `slices`.', artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>), 'val_predictions': ArtifactSpec(description="A DataPanel of the model's predictions with columns `id`,`target`, and `probs.`", artifact_type=<class 'dcbench.common.artifact.DataPanelArtifact'>)}
- evaluate(solution)[source]
- Parameters
solution (dcbench.tasks.slice_discovery.problem.SliceDiscoverySolution) –
- Return type
dict
- solve(pred_slices_dp)[source]
- Parameters
pred_slices_dp (meerkat.datapanel.DataPanel) –
- Return type
dcbench.tasks.slice_discovery.problem.SliceDiscoverySolution
- task_id: str = 'slice_discovery'
- class Solution(id, artifacts, attributes=None)[source]
Bases:
dcbench.common.artifact.ArtifactContainer
- Parameters
id (str) –
artifacts (Mapping[str, Artifact]) –
attributes (Mapping[str, BASIC_TYPE]) –
- container_type: str = 'solution'
- class Task(task_id, name, summary, problem_class, solution_class, baselines=Empty DataFrame Columns: [] Index: [])[source]
Bases:
dcbench.common.table.RowMixin
Task(task_id: str, name: str, summary: str, problem_class: type, solution_class: type, baselines: dcbench.common.table.Table = Empty DataFrame Columns: [] Index: [])
- Parameters
task_id (str) –
name (str) –
summary (str) –
problem_class (type) –
solution_class (type) –
baselines (dcbench.common.table.Table) –
- Return type
None
- baselines: dcbench.common.table.Table = Empty DataFrame Columns: [] Index: []
- property local_problems_path
- name: str
- problem_class: type
- property problems
- property problems_path
- property remote_problems_url
- solution_class: type
- summary: str
- task_id: str
- write_problems(containers)[source]
- Parameters
containers (Sequence[dcbench.common.artifact.ArtifactContainer]) –
- class VisionDatasetArtifact(artifact_id, **kwargs)[source]
Bases:
dcbench.common.artifact.DataPanelArtifact
- Parameters
artifact_id (str) –
- Return type
None
- COLUMN_SUBSETS = {'celeba': ['id', 'image', 'identity', 'split'], 'imagenet': ['id', 'image', 'name', 'synset']}
- DEFAULT_EXT: str = 'mk'
- isdir: bool = True