# Source code for cl_gym.benchmarks.toy_2D_clf

import torch
from typing import Optional
from sklearn.datasets import make_blobs
from torch.utils.data import DataLoader, Dataset, Subset, ConcatDataset
from cl_gym.benchmarks import Benchmark

# Default number of samples generated per task; it is forwarded to `make_blobs` as
# the total sample count for the task (shared by both clusters), for train and test alike.
DEFAULT_TOY_DATASET_SIZE = 100
# Default standard deviation of each Gaussian cluster.
DEFAULT_CLUSTER_STD = 0.3
# Coordinate magnitude of the cluster centre nearer to the origin in every task.
SMALL_COORD = 0.5
# Coordinate magnitude of the cluster centre farther from the origin in every task.
LARGE_COORD = 2.0


class Toy2DCLFDataset(Dataset):
    """
    Synthetic 2D dataset for one task of the toy classification benchmark.

    Each task consists of two Gaussian blobs: one centred at coordinates of
    magnitude ``SMALL_COORD`` and one at magnitude ``LARGE_COORD``. The signs of
    the coordinates depend on the task id, so every task occupies its own
    diagonal direction of the plane. Samples are drawn once, at construction
    time, with ``sklearn.datasets.make_blobs``.

    Args:
        num_tasks: Total number of tasks in the benchmark; must be 2 or 4.
        task_id: 1-based id of the task this dataset belongs to.
        samples_per_task: Total number of samples to draw; ``make_blobs`` splits
            an integer sample count between the two clusters.
        cluster_std: Standard deviation of each cluster.
        random_state: Optional seed forwarded to ``make_blobs`` so that the
            generated points are reproducible. ``None`` (the default) keeps the
            previous, unseeded behaviour.

    Raises:
        ValueError: If ``num_tasks`` is neither 2 nor 4.
        KeyError: If ``task_id`` is not one of the task ids defined for ``num_tasks``.
    """

    def __init__(self, num_tasks: int, task_id: int, samples_per_task: int, cluster_std: float,
                 random_state: Optional[int] = None):
        # Placeholders only; both are replaced by tensors in __generate_data().
        self.data = []
        self.targets = []
        self.num_tasks = num_tasks
        self.task_id = task_id
        self.cluster_std = cluster_std
        self.samples_per_task = samples_per_task
        self.random_state = random_state
        self.__generate_data()

    def __get_centers(self):
        """Return a dict mapping each task id to its pair of (x, y) cluster centres."""
        if self.num_tasks == 2:
            task_centers = {1: ((SMALL_COORD, SMALL_COORD), (LARGE_COORD, LARGE_COORD)),
                            2: ((-SMALL_COORD, -SMALL_COORD), (-LARGE_COORD, -LARGE_COORD))}
        elif self.num_tasks == 4:
            task_centers = {1: ((SMALL_COORD, SMALL_COORD), (LARGE_COORD, LARGE_COORD)),
                            2: ((SMALL_COORD, -SMALL_COORD), (LARGE_COORD, -LARGE_COORD)),
                            3: ((-SMALL_COORD, -SMALL_COORD), (-LARGE_COORD, -LARGE_COORD)),
                            4: ((-SMALL_COORD, SMALL_COORD), (-LARGE_COORD, LARGE_COORD))}
        else:
            raise ValueError("2D Toy Classification dataset can have either 2 or 4 tasks")
        return task_centers

    def __generate_data(self):
        """Sample the task's points and store them as ``self.data`` / ``self.targets`` tensors."""
        # Resolve the centres first so that an invalid configuration fails before any sampling.
        centers = self.__get_centers()[self.task_id]
        inp, targ = make_blobs(n_samples=self.samples_per_task,
                               n_features=2,
                               centers=centers,
                               cluster_std=self.cluster_std,
                               random_state=self.random_state)
        # Features are converted to float32; the labels keep the integer dtype make_blobs returns.
        self.data = torch.from_numpy(inp).float()
        self.targets = torch.from_numpy(targ)

    def __getitem__(self, index):
        """Return ``(features, label, task_id)`` for the sample at ``index``; the label is a plain int."""
        return self.data[index], int(self.targets[index]), self.task_id

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)


class Toy2DClassification(Benchmark):
    """
    Toy benchmark: each task will be a binary classification with linearly separable classes in 2D space.
    Essentially, each task is a Gaussian cluster at some coordinates.

    Args:
        num_tasks: Number of tasks; the underlying ``Toy2DCLFDataset`` only supports 2 or 4.
        per_task_examples: Forwarded unchanged to ``Benchmark.__init__``.
        per_task_joint_examples: Forwarded unchanged to ``Benchmark.__init__``.
        per_task_memory_examples: Forwarded unchanged to ``Benchmark.__init__``; also the number
            of memory indices expected per task in ``precompute_memory_indices``.
        per_task_subset_examples: Forwarded unchanged to ``Benchmark.__init__``.
        cluster_size: Number of samples generated for each task's train set and test set.
        cluster_std: Standard deviation of the Gaussian clusters.
    """
    def __init__(self,
                 num_tasks: int,
                 per_task_examples: Optional[int] = None,
                 per_task_joint_examples: Optional[int] = 0,
                 per_task_memory_examples: Optional[int] = 0,
                 per_task_subset_examples: Optional[int] = 0,
                 cluster_size: Optional[int] = DEFAULT_TOY_DATASET_SIZE,
                 cluster_std: Optional[float] = DEFAULT_CLUSTER_STD):
        super().__init__(num_tasks, per_task_examples, per_task_joint_examples, per_task_memory_examples,
                         per_task_subset_examples)
        self.cluster_size = cluster_size
        self.cluster_std = cluster_std
        # The datasets must exist before prepare_datasets() runs (it is defined on the base class;
        # presumably it builds the subsets/memories from self.trains and self.tests -- confirm there).
        self.load_datasets()
        self.prepare_datasets()

    def load_datasets(self):
        """
        Generate an independently sampled train set and test set for every task.

        Task ids are 1-based; the datasets are stored in ``self.trains`` and ``self.tests``
        under the task id.
        """
        for task in range(1, self.num_tasks + 1):
            self.trains[task] = Toy2DCLFDataset(self.num_tasks, task, self.cluster_size, self.cluster_std)
            self.tests[task] = Toy2DCLFDataset(self.num_tasks, task, self.cluster_size, self.cluster_std)

    def precompute_memory_indices(self):
        """
        Select, for every task, the train and test sample indices that make up the memory.

        The indices come from ``sample_uniform_class_indices`` (defined outside this class;
        the arguments ``0, 1`` presumably denote the class-label range of the binary task --
        confirm against the base class) and are stored in ``self.memory_indices_train`` and
        ``self.memory_indices_test`` under the task id.
        """
        for task in range(1, self.num_tasks + 1):
            indices_train = self.sample_uniform_class_indices(self.trains[task], 0, 1, self.per_task_memory_examples)
            indices_test = self.sample_uniform_class_indices(self.tests[task], 0, 1, self.per_task_memory_examples)
            # Sanity check: the sampler must return exactly the requested number of indices.
            assert len(indices_train) == len(indices_test) == self.per_task_memory_examples
            # Store slices rather than the sampler's own return objects.
            self.memory_indices_train[task] = indices_train[:]
            self.memory_indices_test[task] = indices_test[:]
# CL-Gym 1.0.0-beta documentation
#
# Source code for cl_gym.benchmarks.toy_2D_clf