From dfe14dd973ec1109270826a6c19c13d0e13de7ff Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 6 Mar 2020 17:46:22 -0500 Subject: [PATCH] Delete BatchGenerator class --- mhcflurry/batch_generator.py | 336 ----------------------------------- test/test_batch_generator.py | 120 ------------- 2 files changed, 456 deletions(-) delete mode 100644 mhcflurry/batch_generator.py delete mode 100644 test/test_batch_generator.py diff --git a/mhcflurry/batch_generator.py b/mhcflurry/batch_generator.py deleted file mode 100644 index 9a6d3789..00000000 --- a/mhcflurry/batch_generator.py +++ /dev/null @@ -1,336 +0,0 @@ -import collections - -import numpy -import pandas - - -from .hyperparameters import HyperparameterDefaults - - -class BatchPlan(object): - def __init__(self, equivalence_classes, batch_compositions, equivalence_class_labels=None): - """ - - Parameters - ---------- - equivalence_classes - batch_compositions - equivalence_class_labels : list of string, optional - Used only for summary(). - """ - # batch_compositions is (num batches_generator, batch size) - - self.equivalence_classes = equivalence_classes # indices into points - self.batch_compositions = batch_compositions # indices into equivalence_classes - indices_into_equivalence_classes = [] - next_index = collections.defaultdict(int) - for batch_composition in batch_compositions: - indices = [] - for equivalence_class in batch_composition: - indices.append(next_index[equivalence_class]) - next_index[equivalence_class] += 1 - indices_into_equivalence_classes.append( - numpy.array(indices, dtype=int)) - self.indices_into_equivalence_classes = indices_into_equivalence_classes - self.equivalence_class_labels = ( - numpy.array(equivalence_class_labels) - if equivalence_class_labels is not None else None) - - def batch_indices_generator(self, epochs=1): - batch_nums = numpy.arange(len(self.batch_compositions)) - for epoch in range(epochs): - # Shuffle equivalence classes - for arr in self.equivalence_classes: - numpy.random.shuffle(arr) - numpy.random.shuffle(batch_nums) - for batch_num in batch_nums: - class_indices = self.batch_compositions[batch_num] - indices_into_classes = self.indices_into_equivalence_classes[ - batch_num - ] - batch_indices = [ - self.equivalence_classes[i][j] - for (i, j) in zip(class_indices, indices_into_classes) - ] - yield batch_indices - - def batches_generator(self, x_dict, y_list, epochs=1): - for indices in self.batch_indices_generator(epochs=epochs): - batch_x_dict = {} - for (item, value) in x_dict.items(): - assert not numpy.isnan(value[indices]).any(), (item, value) - batch_x_dict[item] = value[indices] - batch_y_list = [] - for value in y_list: - assert not numpy.isnan(value[indices]).any(), ( - len(batch_y_list), value) - batch_y_list.append(value[indices]) - yield (batch_x_dict, batch_y_list) - - def summary(self, indent=0): - lines = [] - equivalence_class_labels = self.equivalence_class_labels - if equivalence_class_labels is None: - equivalence_class_labels = numpy.array([ - "class-%d" % i for i in range(len(self.equivalence_classes)) - ]) - i = 0 - while i < len(self.batch_compositions): - composition = self.batch_compositions[i] - label_counts = pandas.Series( - equivalence_class_labels[composition]).value_counts() - lines.append( - ("Batch %5d: " % i) + ", ".join( - "{key}[{value}]".format(key=key, value=value) - for (key, value) in label_counts.iteritems())) - if i == 5 and len(self.batch_compositions) > i + 3: - lines.append("...") - i = len(self.batch_compositions) - i + 1 - i += 1 - - indent_spaces = " " * indent - return "\n".join([indent_spaces + str(line) for line in lines]) - - @property - def num_batches(self): - return len(self.batch_compositions) - - @property - def batch_size(self): - return max(len(b) for b in self.batch_compositions) - - -class BatchGenerator(object): - implementations = {} - hyperparameter_defaults = HyperparameterDefaults( - batch_generator="simple", - batch_generator_validation_split=0.1, - batch_generator_batch_size=128) - - @staticmethod - def register_implementation(name, klass): - BatchGenerator.implementations[name] = klass - BatchGenerator.hyperparameter_defaults = ( - BatchGenerator.hyperparameter_defaults.extend( - klass.hyperparameter_defaults)) - - @staticmethod - def create(hyperparameters): - name = hyperparameters['batch_generator'] - return BatchGenerator.implementations[name](hyperparameters) - - def __init__(self, hyperparameters): - self.hyperparameters = BatchGenerator.hyperparameter_defaults.with_defaults( - hyperparameters) - self.train_batch_plan = None - self.test_batch_plan = None - - def plan(self, *args, **kwargs): - raise NotImplementedError() - - def summary(self): - return ( - "Train:\n" + self.train_batch_plan.summary(indent=1) + - "\n***\nTest: " + self.test_batch_plan.summary(indent=1)) - - def get_train_and_test_generators(self, x_dict, y_list, epochs=1): - train_generator = self.train_batch_plan.batches_generator( - x_dict, y_list, epochs=epochs) - test_generator = self.test_batch_plan.batches_generator( - x_dict, y_list, epochs=epochs) - return (train_generator, test_generator) - - @property - def num_train_batches(self): - return self.train_batch_plan.num_batches - - @property - def num_test_batches(self): - return self.test_batch_plan.num_batches - - -class SimpleBatchGenerator(BatchGenerator): - hyperparameter_defaults = HyperparameterDefaults() - - def __init__(self, hyperparameters): - BatchGenerator.__init__(self, hyperparameters) - - def plan(self, num, validation_weights=None, **kwargs): - if validation_weights is not None: - validation_weights = numpy.array( - validation_weights, copy=True, dtype=float) - numpy.testing.assert_equal(len(validation_weights), num) - validation_weights /= validation_weights.sum() - - validation_items = numpy.random.choice( - num, - int((self.hyperparameters['batch_generator_validation_split']) * num), - replace=False, - p=validation_weights) - validation_items_set = set(validation_items) - numpy.testing.assert_equal( - len(validation_items), len(validation_items_set)) - training_items = numpy.array([ - x for x in range(num) if x not in validation_items_set - ], dtype=int) - numpy.testing.assert_equal( - len(validation_items) + len(training_items), num) - - def simple_compositions( - num, - num_per_batch=self.hyperparameters['batch_generator_batch_size']): - full_batch = numpy.zeros(num_per_batch, dtype=int) - result = [full_batch] * int(numpy.floor(num / num_per_batch)) - if num % num_per_batch != 0: - result.append(numpy.zeros(num % num_per_batch, dtype=int)) - numpy.testing.assert_equal(sum(len(x) for x in result), num) - return result - - self.train_batch_plan = BatchPlan( - equivalence_classes=[training_items], - batch_compositions=simple_compositions(len(training_items))) - self.test_batch_plan = BatchPlan( - equivalence_classes=[validation_items], - batch_compositions=simple_compositions(len(validation_items))) - - -BatchGenerator.register_implementation("simple", SimpleBatchGenerator) - -class MultiallelicMassSpecBatchGenerator(BatchGenerator): - hyperparameter_defaults = HyperparameterDefaults( - batch_generator_affinity_fraction=0.5) - """ - Hyperperameters for batch generation for the presentation predictor. - """ - - def __init__(self, hyperparameters): - BatchGenerator.__init__(self, hyperparameters) - self.equivalence_classes = None - self.batch_indices = None - - @staticmethod - def plan_from_dataframe(df, hyperparameters): - affinity_fraction = hyperparameters["batch_generator_affinity_fraction"] - batch_size = hyperparameters["batch_generator_batch_size"] - df["first_allele"] = df.alleles.str.get(0) - df["equivalence_key"] = numpy.where( - df.is_affinity, - df.first_allele, - df.experiment_name, - ) + " " + df.is_binder.map({True: "binder", False: "nonbinder"}) - (df["equivalence_class"], equivalence_class_labels) = ( - df.equivalence_key.factorize()) - df["idx"] = df.index - df = df.sample(frac=1.0) - - affinities_per_batch = int(affinity_fraction * batch_size) - - remaining_affinities_df = df.loc[df.is_affinity].copy() - - # First do mixed affinity / multiallelic ms batches_generator. - batch_compositions = [] - for (experiment, experiment_df) in df.loc[~df.is_affinity].groupby( - "experiment_name"): - (experiment_alleles,) = experiment_df.alleles.unique() - remaining_affinities_df["matches_allele"] = ( - remaining_affinities_df.first_allele.isin(experiment_alleles)) - # Whenever possible we try to use affinities with the same - # alleles as the mass spec experiment - remaining_affinities_df = remaining_affinities_df.sort_values( - "matches_allele", ascending=False) - while len(experiment_df) > 0: - affinities_for_this_batch = min( - affinities_per_batch, len(remaining_affinities_df)) - mass_spec_for_this_batch = ( - batch_size - affinities_for_this_batch) - if len(experiment_df) < mass_spec_for_this_batch: - mass_spec_for_this_batch = len(experiment_df) - affinities_for_this_batch = ( - batch_size - mass_spec_for_this_batch) - - batch_composition = [] - - # take mass spec - to_use = experiment_df.iloc[:mass_spec_for_this_batch] - experiment_df = experiment_df.iloc[mass_spec_for_this_batch:] - batch_composition.extend(to_use.equivalence_class.values) - - # take affinities - to_use = remaining_affinities_df.iloc[ - :affinities_for_this_batch - ] - remaining_affinities_df = remaining_affinities_df.iloc[ - affinities_for_this_batch: - ] - batch_composition.extend(to_use.equivalence_class.values) - batch_compositions.append(batch_composition) - - # Affinities-only batches - while len(remaining_affinities_df) > 0: - to_use = remaining_affinities_df.iloc[:batch_size] - remaining_affinities_df = remaining_affinities_df.iloc[batch_size:] - batch_compositions.append(to_use.equivalence_class.values) - - class_to_indices = df.groupby("equivalence_class").idx.unique() - equivalence_classes = [ - class_to_indices[i] - for i in range(len(class_to_indices)) - ] - return BatchPlan( - equivalence_classes=equivalence_classes, - batch_compositions=batch_compositions, - equivalence_class_labels=equivalence_class_labels) - - def plan( - self, - affinities_mask, - experiment_names, - alleles_matrix, - is_binder, - validation_weights=None, - num=None): - affinities_mask = numpy.array(affinities_mask, copy=False, dtype=bool) - experiment_names = numpy.array(experiment_names, copy=False) - alleles_matrix = numpy.array(alleles_matrix, copy=False) - is_binder = numpy.array(is_binder, copy=False, dtype=bool) - n = len(experiment_names) - if num is not None: - numpy.testing.assert_equal(num, n) - - numpy.testing.assert_equal(len(affinities_mask), n) - numpy.testing.assert_equal(len(alleles_matrix), n) - numpy.testing.assert_equal(len(is_binder), n) - - if validation_weights is not None: - validation_weights = numpy.array( - validation_weights, copy=True, dtype=float) - numpy.testing.assert_equal(len(validation_weights), n) - validation_weights /= validation_weights.sum() - - validation_items = numpy.random.choice( - n, - int((self.hyperparameters['batch_generator_validation_split']) * n), - replace=False, - p=validation_weights) - validation_mask = numpy.zeros(n, dtype=bool) - validation_mask[validation_items] = True - - df = pandas.DataFrame({ - "is_affinity": affinities_mask, - "experiment_name": experiment_names, - "is_binder": is_binder, - "is_validation": validation_mask, - "alleles": [tuple(row[row != None]) for row in alleles_matrix], - }) - df.loc[df.is_affinity, "experiment_name"] = None - - train_df = df.loc[~df.is_validation].copy() - test_df = df.loc[df.is_validation].copy() - - self.train_batch_plan = self.plan_from_dataframe( - train_df, self.hyperparameters) - self.test_batch_plan = self.plan_from_dataframe( - test_df, self.hyperparameters) - -BatchGenerator.register_implementation( - "multiallelic_mass_spec", MultiallelicMassSpecBatchGenerator) diff --git a/test/test_batch_generator.py b/test/test_batch_generator.py deleted file mode 100644 index 02779542..00000000 --- a/test/test_batch_generator.py +++ /dev/null @@ -1,120 +0,0 @@ -import logging -logging.getLogger('matplotlib').disabled = True -logging.getLogger('tensorflow').disabled = True - -import os -import collections -import time -import cProfile -import pstats - -import pandas -import numpy - -from mhcflurry.downloads import get_path -from mhcflurry.batch_generator import ( - MultiallelicMassSpecBatchGenerator) -from mhcflurry.regression_target import to_ic50 -from mhcflurry import Class1AffinityPredictor - -from numpy.testing import assert_equal -from nose.tools import assert_greater, assert_less - - -def data_path(name): - ''' - Return the absolute path to a file in the test/data directory. - The name specified should be relative to test/data. - ''' - return os.path.join(os.path.dirname(__file__), "data", name) - - -def test_basic_repeat(): - for _ in range(100): - test_basic() - - -def test_basic(): - batch_size = 7 - validation_split = 0.2 - planner = MultiallelicMassSpecBatchGenerator( - hyperparameters=dict( - batch_generator_validation_split=validation_split, - batch_generator_batch_size=batch_size, - batch_generator_affinity_fraction=0.5)) - - exp1_alleles = ["HLA-A*03:01", "HLA-B*07:02", "HLA-C*02:01"] - exp2_alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*02:01"] - - df = pandas.DataFrame(dict( - affinities_mask=([True] * 14) + ([False] * 6), - experiment_names=([None] * 14) + (["exp1"] * 2) + (["exp2"] * 4), - alleles_matrix=[["HLA-C*07:01", None, None]] * 10 + [ - ["HLA-A*02:01", None, None], - ["HLA-A*02:01", None, None], - ["HLA-A*03:01", None, None], - ["HLA-A*03:01", None, None], - exp1_alleles, - exp1_alleles, - exp2_alleles, - exp2_alleles, - exp2_alleles, - exp2_alleles, - ], - is_binder=[False, True] * 5 + [ - True, True, False, False, True, False, True, False, True, False, - ])) - df = pandas.concat([df, df], ignore_index=True) - df = pandas.concat([df, df], ignore_index=True) - - planner.plan(**df.to_dict("list")) - - assert_equal( - planner.num_train_batches, - numpy.ceil(len(df) * (1 - validation_split) / batch_size)) - assert_equal( - planner.num_test_batches, - numpy.ceil(len(df) * validation_split / batch_size)) - - (train_iter, test_iter) = planner.get_train_and_test_generators( - x_dict={ - "idx": numpy.arange(len(df)), - }, - y_list=[]) - - for (kind, it) in [("train", train_iter), ("test", test_iter)]: - for (i, (x_item, y_item)) in enumerate(it): - idx = x_item["idx"] - df.loc[idx, "kind"] = kind - df.loc[idx, "idx"] = idx - df.loc[idx, "batch"] = i - df["idx"] = df.idx.astype(int) - df["batch"] = df.batch.astype(int) - - assert_equal(df.kind.value_counts()["test"], len(df) * validation_split) - assert_equal(df.kind.value_counts()["train"], len(df) * (1 - validation_split)) - - experiment_allele_colocations = collections.defaultdict(int) - for ((kind, batch_num), batch_df) in df.groupby(["kind", "batch"]): - if not batch_df.affinities_mask.all(): - # Test each batch has at most one multiallelic ms experiment. - names = batch_df.loc[ - ~batch_df.affinities_mask - ].experiment_names.unique() - assert_equal(len(names), 1) - (experiment,) = names - if batch_df.affinities_mask.any(): - # Test experiments are matched to the correct affinity alleles. - affinity_alleles = batch_df.loc[ - batch_df.affinities_mask - ].alleles_matrix.str.get(0).values - for allele in affinity_alleles: - experiment_allele_colocations[(experiment, allele)] += 1 - - assert_greater( - experiment_allele_colocations[('exp1', 'HLA-A*03:01')], - experiment_allele_colocations[('exp1', 'HLA-A*02:01')]) - assert_less( - experiment_allele_colocations[('exp2', 'HLA-A*03:01')], - experiment_allele_colocations[('exp2', 'HLA-A*02:01')]) - -- GitLab