Source code for dcbench.tasks.minidata

import os
import shutil
import tempfile
from typing import Any, Mapping, Sequence

import meerkat as mk
import pandas as pd

from dcbench.common import Problem, Solution, Task
from dcbench.common.artifact import DataPanelArtifact, YAMLArtifact
from dcbench.common.artifact_container import ArtifactSpec


class MiniDataSolution(Solution):
    """Solution to a ``minidata`` problem.

    Holds a single artifact, ``train_ids``: the ids of the training examples
    that were selected from the problem's ``train_data``.
    """

    # Artifacts every solution of this task carries.
    artifact_specs: Mapping[str, ArtifactSpec] = {
        "train_ids": ArtifactSpec(
            artifact_type=YAMLArtifact,
            description=(
                "A list of train example ids from the "
                " ``id`` column of ``train_data``."
            ),
        ),
    }

    # Identifier of the task this solution type belongs to.
    task_id: str = "minidata"

    @classmethod
    def from_ids(cls, train_ids: Sequence[str], problem_id: str):
        """Build a solution from a sequence of selected train example ids.

        Args:
            train_ids: Ids of the selected training examples.
            problem_id: Identifier of the problem being solved; stored in the
                solution's attributes under ``"problem_id"``.

        Returns:
            The solution created by ``cls.from_artifacts``.
        """
        # The constructed solution has to be handed back to the caller;
        # without the ``return`` this constructor always yielded ``None``.
        return cls.from_artifacts(
            {"train_ids": train_ids}, attributes={"problem_id": problem_id}
        )
class MiniDataProblem(Problem):
    """Problem definition for the ``minidata`` task.

    A problem carries three DataPanel artifacts (``train_data``, ``val_data``
    and ``test_data``). A solution selects a subset of ``train_data`` by id.
    """

    # Artifacts every problem of this task carries.
    artifact_specs: Mapping[str, ArtifactSpec] = {
        "train_data": ArtifactSpec(
            artifact_type=DataPanelArtifact,
            description="A DataPanel of train examples with columns ``id``, "
            "``input``, and ``target``.",
        ),
        "val_data": ArtifactSpec(
            artifact_type=DataPanelArtifact,
            description="A DataPanel of validation examples with columns ``id``, "
            "``input``, and ``target``.",
        ),
        "test_data": ArtifactSpec(
            artifact_type=DataPanelArtifact,
            description="A DataPanel of test examples with columns ``id``, "
            "``input``, and ``target``.",
        ),
    }

    # Identifier of the task this problem type belongs to.
    task_id: str = "minidata"

    def solve(self, idx_selected: Any, **kwargs: Any) -> Solution:
        """Turn a per-example selection into a :class:`MiniDataSolution`.

        Args:
            idx_selected: One flag per row of ``train_data``. Accepted forms
                are a ``mk.DataPanel`` or ``pd.DataFrame`` (only the first
                column is read and cast to ``bool``) or a plain ``list``.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A ``MiniDataSolution`` whose ``train_ids`` artifact lists the
            ``id`` of every selected training example. Its attributes contain
            ``problem_id`` followed by all of this problem's attributes.

        Raises:
            ValueError: If ``idx_selected`` has an unsupported type, or its
                length differs from the number of training examples.
        """
        # Normalise the supported input types to a flat sequence of flags.
        if isinstance(idx_selected, mk.DataPanel):
            selected = idx_selected[idx_selected.columns[0]].data.astype(bool)
        elif isinstance(idx_selected, pd.DataFrame):
            selected = idx_selected.iloc[:, 0].values.astype(bool)
        elif isinstance(idx_selected, list):
            selected = [bool(flag) for flag in idx_selected]
        else:
            raise ValueError(
                "The provided idx_selected object must be either a list or a DataFrame."
            )

        # Validate against ``train_data``: it is the training artifact this
        # problem declares in ``artifact_specs`` (the previously referenced
        # ``X_train_dirty`` is not declared by this task).
        train_dp = self["train_data"]
        if len(train_dp) != len(selected):
            raise ValueError(
                "The number of elements of the provided solution object must be the "
                "same as for the training dataset. "
                f"(expected: {len(train_dp):d}, found: {len(selected):d})"
            )

        # ``MiniDataSolution`` declares a single ``train_ids`` artifact, and
        # ``evaluate`` below reads exactly that key, so the mask is converted
        # into the ids of the rows it selects.
        # NOTE(review): assumes ``idx_selected`` is a boolean mask aligned with
        # the rows of ``train_data`` and that iterating the ``id`` column yields
        # values ``YAMLArtifact`` can serialise -- confirm.
        train_ids = [
            example_id
            for example_id, keep in zip(train_dp["id"], selected)
            if keep
        ]

        solution = MiniDataSolution.from_artifacts({"train_ids": train_ids})
        solution.attributes["problem_id"] = self.container_id
        # Problem attributes are copied last, so they win on a key collision.
        for key, value in self.attributes.items():
            solution.attributes[key] = value
        return solution

    def evaluate(self, solution: Solution):
        """Run unagi on the training subset chosen by ``solution``.

        The rows of ``train_data`` whose ``id`` appears in the solution's
        ``train_ids`` are written to a temporary directory, and unagi's
        ``main`` is invoked with ``RESNET_CONFIG`` pointed at that dataset.

        Args:
            solution: Solution providing the ``train_ids`` artifact.

        Returns:
            None. No score is computed or returned yet.
        """
        # Imported lazily, as in the original code, so the module can be
        # imported without these being resolved up front.
        import copy

        from unagi.unagi import main
        from unagi.utils.config_utils import build_config

        from .unagi_configs import RESNET_CONFIG

        train_dp = self["train_data"]
        train_ids = solution["train_ids"]
        train_dp = train_dp.lz[train_dp["id"].isin(train_ids)]

        # Deep copy: a shallow ``.copy()`` shares the nested ``"dataset"``
        # mapping, so the assignments below would leak into the module-level
        # ``RESNET_CONFIG`` and persist across evaluations.
        config = copy.deepcopy(RESNET_CONFIG)

        # The context manager removes the directory even if writing the
        # dataset or the unagi run raises.
        with tempfile.TemporaryDirectory() as dirpath:
            dp_path = os.path.join(dirpath, "dataset.mk")
            train_dp.write(dp_path)

            config["dataset"]["path_to_dp"] = dp_path
            config["dataset"]["index_name"] = "id"

            main(build_config(config))
# TODO: Plug unagi scoring into MiniDataProblem.evaluate, along the lines of:
#   model = fit(train_dp)
#   score = score(self["test_dp"], model)
#   return score


# Task descriptor bundling the minidata problem and solution classes with
# their human-readable name and summary.
task = Task(
    task_id="minidata",
    name="Minimal Data Selection",
    problem_class=MiniDataProblem,
    solution_class=MiniDataSolution,
    baselines=None,
    # flake8: noqa
    summary="Given a large training dataset, what is the smallest subset you can sample that still achieves some threshold of performance.",
)