From dfe14dd973ec1109270826a6c19c13d0e13de7ff Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 6 Mar 2020 17:46:22 -0500
Subject: [PATCH] Delete BatchGenerator class

---
 mhcflurry/batch_generator.py | 336 -----------------------------------
 test/test_batch_generator.py | 120 -------------
 2 files changed, 456 deletions(-)
 delete mode 100644 mhcflurry/batch_generator.py
 delete mode 100644 test/test_batch_generator.py

diff --git a/mhcflurry/batch_generator.py b/mhcflurry/batch_generator.py
deleted file mode 100644
index 9a6d3789..00000000
--- a/mhcflurry/batch_generator.py
+++ /dev/null
@@ -1,336 +0,0 @@
-import collections
-
-import numpy
-import pandas
-
-
-from .hyperparameters import HyperparameterDefaults
-
-
-class BatchPlan(object):
-    def __init__(self, equivalence_classes, batch_compositions, equivalence_class_labels=None):
-        """
-
-        Parameters
-        ----------
-        equivalence_classes
-        batch_compositions
-        equivalence_class_labels : list of string, optional
-            Used only for summary().
-        """
-        # batch_compositions is (num batches_generator, batch size)
-
-        self.equivalence_classes = equivalence_classes # indices into points
-        self.batch_compositions = batch_compositions # indices into equivalence_classes
-        indices_into_equivalence_classes = []
-        next_index = collections.defaultdict(int)
-        for batch_composition in batch_compositions:
-            indices = []
-            for equivalence_class in batch_composition:
-                indices.append(next_index[equivalence_class])
-                next_index[equivalence_class] += 1
-            indices_into_equivalence_classes.append(
-                numpy.array(indices, dtype=int))
-        self.indices_into_equivalence_classes = indices_into_equivalence_classes
-        self.equivalence_class_labels = (
-            numpy.array(equivalence_class_labels)
-            if equivalence_class_labels is not None else None)
-
-    def batch_indices_generator(self, epochs=1):
-        batch_nums = numpy.arange(len(self.batch_compositions))
-        for epoch in range(epochs):
-            # Shuffle equivalence classes
-            for arr in self.equivalence_classes:
-                numpy.random.shuffle(arr)
-            numpy.random.shuffle(batch_nums)
-            for batch_num in batch_nums:
-                class_indices = self.batch_compositions[batch_num]
-                indices_into_classes = self.indices_into_equivalence_classes[
-                    batch_num
-                ]
-                batch_indices = [
-                    self.equivalence_classes[i][j]
-                    for (i, j) in zip(class_indices, indices_into_classes)
-                ]
-                yield batch_indices
-
-    def batches_generator(self, x_dict, y_list, epochs=1):
-        for indices in self.batch_indices_generator(epochs=epochs):
-            batch_x_dict = {}
-            for (item, value) in x_dict.items():
-                assert not numpy.isnan(value[indices]).any(), (item, value)
-                batch_x_dict[item] = value[indices]
-            batch_y_list = []
-            for value in y_list:
-                assert not numpy.isnan(value[indices]).any(), (
-                    len(batch_y_list), value)
-                batch_y_list.append(value[indices])
-            yield (batch_x_dict, batch_y_list)
-
-    def summary(self, indent=0):
-        lines = []
-        equivalence_class_labels = self.equivalence_class_labels
-        if equivalence_class_labels is None:
-            equivalence_class_labels = numpy.array([
-                "class-%d" % i for i in range(len(self.equivalence_classes))
-            ])
-        i = 0
-        while i < len(self.batch_compositions):
-            composition = self.batch_compositions[i]
-            label_counts = pandas.Series(
-                equivalence_class_labels[composition]).value_counts()
-            lines.append(
-                ("Batch %5d: " % i) + ", ".join(
-                    "{key}[{value}]".format(key=key, value=value)
-                    for (key, value) in label_counts.iteritems()))
-            if i == 5 and len(self.batch_compositions) > i + 3:
-                lines.append("...")
-                i = len(self.batch_compositions) - i + 1
-            i += 1
-
-        indent_spaces = "    " * indent
-        return "\n".join([indent_spaces + str(line) for line in lines])
-
-    @property
-    def num_batches(self):
-        return len(self.batch_compositions)
-
-    @property
-    def batch_size(self):
-        return max(len(b) for b in self.batch_compositions)
-
-
-class BatchGenerator(object):
-    implementations = {}
-    hyperparameter_defaults = HyperparameterDefaults(
-        batch_generator="simple",
-        batch_generator_validation_split=0.1,
-        batch_generator_batch_size=128)
-
-    @staticmethod
-    def register_implementation(name, klass):
-        BatchGenerator.implementations[name] = klass
-        BatchGenerator.hyperparameter_defaults = (
-            BatchGenerator.hyperparameter_defaults.extend(
-                klass.hyperparameter_defaults))
-
-    @staticmethod
-    def create(hyperparameters):
-        name = hyperparameters['batch_generator']
-        return BatchGenerator.implementations[name](hyperparameters)
-
-    def __init__(self, hyperparameters):
-        self.hyperparameters = BatchGenerator.hyperparameter_defaults.with_defaults(
-            hyperparameters)
-        self.train_batch_plan = None
-        self.test_batch_plan = None
-
-    def plan(self, *args, **kwargs):
-        raise NotImplementedError()
-
-    def summary(self):
-        return (
-            "Train:\n" + self.train_batch_plan.summary(indent=1) +
-            "\n***\nTest: " + self.test_batch_plan.summary(indent=1))
-
-    def get_train_and_test_generators(self, x_dict, y_list, epochs=1):
-        train_generator = self.train_batch_plan.batches_generator(
-            x_dict, y_list, epochs=epochs)
-        test_generator = self.test_batch_plan.batches_generator(
-            x_dict, y_list, epochs=epochs)
-        return (train_generator, test_generator)
-
-    @property
-    def num_train_batches(self):
-        return self.train_batch_plan.num_batches
-
-    @property
-    def num_test_batches(self):
-        return self.test_batch_plan.num_batches
-
-
-class SimpleBatchGenerator(BatchGenerator):
-    hyperparameter_defaults = HyperparameterDefaults()
-
-    def __init__(self, hyperparameters):
-        BatchGenerator.__init__(self, hyperparameters)
-
-    def plan(self, num, validation_weights=None, **kwargs):
-        if validation_weights is not None:
-            validation_weights = numpy.array(
-                validation_weights, copy=True, dtype=float)
-            numpy.testing.assert_equal(len(validation_weights), num)
-            validation_weights /= validation_weights.sum()
-
-        validation_items = numpy.random.choice(
-            num,
-            int((self.hyperparameters['batch_generator_validation_split']) * num),
-            replace=False,
-            p=validation_weights)
-        validation_items_set = set(validation_items)
-        numpy.testing.assert_equal(
-            len(validation_items), len(validation_items_set))
-        training_items = numpy.array([
-            x for x in range(num) if x not in validation_items_set
-        ], dtype=int)
-        numpy.testing.assert_equal(
-            len(validation_items) + len(training_items), num)
-
-        def simple_compositions(
-                num,
-                num_per_batch=self.hyperparameters['batch_generator_batch_size']):
-            full_batch = numpy.zeros(num_per_batch, dtype=int)
-            result = [full_batch] * int(numpy.floor(num / num_per_batch))
-            if num % num_per_batch != 0:
-                result.append(numpy.zeros(num % num_per_batch, dtype=int))
-            numpy.testing.assert_equal(sum(len(x) for x in result), num)
-            return result
-
-        self.train_batch_plan = BatchPlan(
-            equivalence_classes=[training_items],
-            batch_compositions=simple_compositions(len(training_items)))
-        self.test_batch_plan = BatchPlan(
-            equivalence_classes=[validation_items],
-            batch_compositions=simple_compositions(len(validation_items)))
-
-
-BatchGenerator.register_implementation("simple", SimpleBatchGenerator)
-
-class MultiallelicMassSpecBatchGenerator(BatchGenerator):
-    hyperparameter_defaults = HyperparameterDefaults(
-        batch_generator_affinity_fraction=0.5)
-    """
-    Hyperperameters for batch generation for the presentation predictor.
-    """
-
-    def __init__(self, hyperparameters):
-        BatchGenerator.__init__(self, hyperparameters)
-        self.equivalence_classes = None
-        self.batch_indices = None
-
-    @staticmethod
-    def plan_from_dataframe(df, hyperparameters):
-        affinity_fraction = hyperparameters["batch_generator_affinity_fraction"]
-        batch_size = hyperparameters["batch_generator_batch_size"]
-        df["first_allele"] = df.alleles.str.get(0)
-        df["equivalence_key"] = numpy.where(
-            df.is_affinity,
-            df.first_allele,
-            df.experiment_name,
-        ) + " " + df.is_binder.map({True: "binder", False: "nonbinder"})
-        (df["equivalence_class"], equivalence_class_labels) = (
-            df.equivalence_key.factorize())
-        df["idx"] = df.index
-        df = df.sample(frac=1.0)
-
-        affinities_per_batch = int(affinity_fraction * batch_size)
-
-        remaining_affinities_df = df.loc[df.is_affinity].copy()
-
-        # First do mixed affinity / multiallelic ms batches_generator.
-        batch_compositions = []
-        for (experiment, experiment_df) in df.loc[~df.is_affinity].groupby(
-                "experiment_name"):
-            (experiment_alleles,) = experiment_df.alleles.unique()
-            remaining_affinities_df["matches_allele"] = (
-                remaining_affinities_df.first_allele.isin(experiment_alleles))
-            # Whenever possible we try to use affinities with the same
-            # alleles as the mass spec experiment
-            remaining_affinities_df = remaining_affinities_df.sort_values(
-                "matches_allele", ascending=False)
-            while len(experiment_df) > 0:
-                affinities_for_this_batch = min(
-                    affinities_per_batch, len(remaining_affinities_df))
-                mass_spec_for_this_batch = (
-                    batch_size - affinities_for_this_batch)
-                if len(experiment_df) < mass_spec_for_this_batch:
-                    mass_spec_for_this_batch = len(experiment_df)
-                    affinities_for_this_batch = (
-                            batch_size - mass_spec_for_this_batch)
-
-                batch_composition = []
-
-                # take mass spec
-                to_use = experiment_df.iloc[:mass_spec_for_this_batch]
-                experiment_df = experiment_df.iloc[mass_spec_for_this_batch:]
-                batch_composition.extend(to_use.equivalence_class.values)
-
-                # take affinities
-                to_use = remaining_affinities_df.iloc[
-                    :affinities_for_this_batch
-                ]
-                remaining_affinities_df = remaining_affinities_df.iloc[
-                    affinities_for_this_batch:
-                ]
-                batch_composition.extend(to_use.equivalence_class.values)
-                batch_compositions.append(batch_composition)
-
-        # Affinities-only batches
-        while len(remaining_affinities_df) > 0:
-            to_use = remaining_affinities_df.iloc[:batch_size]
-            remaining_affinities_df = remaining_affinities_df.iloc[batch_size:]
-            batch_compositions.append(to_use.equivalence_class.values)
-
-        class_to_indices = df.groupby("equivalence_class").idx.unique()
-        equivalence_classes = [
-            class_to_indices[i]
-            for i in range(len(class_to_indices))
-        ]
-        return BatchPlan(
-            equivalence_classes=equivalence_classes,
-            batch_compositions=batch_compositions,
-            equivalence_class_labels=equivalence_class_labels)
-
-    def plan(
-            self,
-            affinities_mask,
-            experiment_names,
-            alleles_matrix,
-            is_binder,
-            validation_weights=None,
-            num=None):
-        affinities_mask = numpy.array(affinities_mask, copy=False, dtype=bool)
-        experiment_names = numpy.array(experiment_names, copy=False)
-        alleles_matrix = numpy.array(alleles_matrix, copy=False)
-        is_binder = numpy.array(is_binder, copy=False, dtype=bool)
-        n = len(experiment_names)
-        if num is not None:
-            numpy.testing.assert_equal(num, n)
-
-        numpy.testing.assert_equal(len(affinities_mask), n)
-        numpy.testing.assert_equal(len(alleles_matrix), n)
-        numpy.testing.assert_equal(len(is_binder), n)
-
-        if validation_weights is not None:
-            validation_weights = numpy.array(
-                validation_weights, copy=True, dtype=float)
-            numpy.testing.assert_equal(len(validation_weights), n)
-            validation_weights /= validation_weights.sum()
-
-        validation_items = numpy.random.choice(
-            n,
-            int((self.hyperparameters['batch_generator_validation_split']) * n),
-            replace=False,
-            p=validation_weights)
-        validation_mask = numpy.zeros(n, dtype=bool)
-        validation_mask[validation_items] = True
-
-        df = pandas.DataFrame({
-            "is_affinity": affinities_mask,
-            "experiment_name": experiment_names,
-            "is_binder": is_binder,
-            "is_validation": validation_mask,
-            "alleles": [tuple(row[row != None]) for row in alleles_matrix],
-        })
-        df.loc[df.is_affinity, "experiment_name"] = None
-
-        train_df = df.loc[~df.is_validation].copy()
-        test_df = df.loc[df.is_validation].copy()
-
-        self.train_batch_plan = self.plan_from_dataframe(
-            train_df, self.hyperparameters)
-        self.test_batch_plan = self.plan_from_dataframe(
-            test_df, self.hyperparameters)
-
-BatchGenerator.register_implementation(
-    "multiallelic_mass_spec", MultiallelicMassSpecBatchGenerator)
diff --git a/test/test_batch_generator.py b/test/test_batch_generator.py
deleted file mode 100644
index 02779542..00000000
--- a/test/test_batch_generator.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import logging
-logging.getLogger('matplotlib').disabled = True
-logging.getLogger('tensorflow').disabled = True
-
-import os
-import collections
-import time
-import cProfile
-import pstats
-
-import pandas
-import numpy
-
-from mhcflurry.downloads import get_path
-from mhcflurry.batch_generator import (
-    MultiallelicMassSpecBatchGenerator)
-from mhcflurry.regression_target import to_ic50
-from mhcflurry import Class1AffinityPredictor
-
-from numpy.testing import assert_equal
-from nose.tools import assert_greater, assert_less
-
-
-def data_path(name):
-    '''
-    Return the absolute path to a file in the test/data directory.
-    The name specified should be relative to test/data.
-    '''
-    return os.path.join(os.path.dirname(__file__), "data", name)
-
-
-def test_basic_repeat():
-    for _ in range(100):
-        test_basic()
-
-
-def test_basic():
-    batch_size = 7
-    validation_split = 0.2
-    planner = MultiallelicMassSpecBatchGenerator(
-        hyperparameters=dict(
-            batch_generator_validation_split=validation_split,
-            batch_generator_batch_size=batch_size,
-            batch_generator_affinity_fraction=0.5))
-
-    exp1_alleles = ["HLA-A*03:01", "HLA-B*07:02", "HLA-C*02:01"]
-    exp2_alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*02:01"]
-
-    df = pandas.DataFrame(dict(
-        affinities_mask=([True] * 14) + ([False] * 6),
-        experiment_names=([None] * 14) + (["exp1"] * 2) + (["exp2"] * 4),
-        alleles_matrix=[["HLA-C*07:01", None, None]] * 10 + [
-            ["HLA-A*02:01", None, None],
-            ["HLA-A*02:01", None, None],
-            ["HLA-A*03:01", None, None],
-            ["HLA-A*03:01", None, None],
-            exp1_alleles,
-            exp1_alleles,
-            exp2_alleles,
-            exp2_alleles,
-            exp2_alleles,
-            exp2_alleles,
-        ],
-        is_binder=[False, True] * 5 + [
-            True, True, False, False, True, False, True, False, True, False,
-        ]))
-    df = pandas.concat([df, df], ignore_index=True)
-    df = pandas.concat([df, df], ignore_index=True)
-
-    planner.plan(**df.to_dict("list"))
-
-    assert_equal(
-        planner.num_train_batches,
-        numpy.ceil(len(df) * (1 - validation_split) / batch_size))
-    assert_equal(
-        planner.num_test_batches,
-        numpy.ceil(len(df) * validation_split / batch_size))
-
-    (train_iter, test_iter) = planner.get_train_and_test_generators(
-        x_dict={
-            "idx": numpy.arange(len(df)),
-        },
-        y_list=[])
-
-    for (kind, it) in [("train", train_iter), ("test", test_iter)]:
-        for (i, (x_item, y_item)) in enumerate(it):
-            idx = x_item["idx"]
-            df.loc[idx, "kind"] = kind
-            df.loc[idx, "idx"] = idx
-            df.loc[idx, "batch"] = i
-    df["idx"] = df.idx.astype(int)
-    df["batch"] = df.batch.astype(int)
-
-    assert_equal(df.kind.value_counts()["test"], len(df) * validation_split)
-    assert_equal(df.kind.value_counts()["train"], len(df) * (1 - validation_split))
-
-    experiment_allele_colocations = collections.defaultdict(int)
-    for ((kind, batch_num), batch_df) in df.groupby(["kind", "batch"]):
-        if not batch_df.affinities_mask.all():
-            # Test each batch has at most one multiallelic ms experiment.
-            names = batch_df.loc[
-                ~batch_df.affinities_mask
-            ].experiment_names.unique()
-            assert_equal(len(names), 1)
-            (experiment,) = names
-            if batch_df.affinities_mask.any():
-                # Test experiments are matched to the correct affinity alleles.
-                affinity_alleles = batch_df.loc[
-                    batch_df.affinities_mask
-                ].alleles_matrix.str.get(0).values
-                for allele in affinity_alleles:
-                    experiment_allele_colocations[(experiment, allele)] += 1
-
-    assert_greater(
-        experiment_allele_colocations[('exp1', 'HLA-A*03:01')],
-        experiment_allele_colocations[('exp1', 'HLA-A*02:01')])
-    assert_less(
-        experiment_allele_colocations[('exp2', 'HLA-A*03:01')],
-        experiment_allele_colocations[('exp2', 'HLA-A*02:01')])
-
-- 
GitLab