From 36d65c3997ce1104c7967233ce7ed24e822d7d82 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Tue, 4 Feb 2020 15:34:38 -0500 Subject: [PATCH] fix --- mhcflurry/class1_affinity_predictor.py | 4 +- test/test_batch_generator.py | 100 ------------------ test/test_class1_processing_neural_network.py | 4 - 3 files changed, 3 insertions(+), 105 deletions(-) diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index 2b08a6ec..b6f2d903 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -419,7 +419,9 @@ class Class1AffinityPredictor(object): series = transform.to_series() if percent_ranks_df is None: percent_ranks_df = pandas.DataFrame(index=series.index) - assert_equal(series.index.values, percent_ranks_df.index.values) + numpy.testing.assert_array_almost_equal( + series.index.values, + percent_ranks_df.index.values) percent_ranks_df[allele] = series percent_ranks_path = join(models_dir, "percent_ranks.csv") percent_ranks_df.to_csv( diff --git a/test/test_batch_generator.py b/test/test_batch_generator.py index 3cc6c11c..02779542 100644 --- a/test/test_batch_generator.py +++ b/test/test_batch_generator.py @@ -11,7 +11,6 @@ import pstats import pandas import numpy -from mhcflurry.allele_encoding import MultipleAlleleEncoding from mhcflurry.downloads import get_path from mhcflurry.batch_generator import ( MultiallelicMassSpecBatchGenerator) @@ -119,102 +118,3 @@ def test_basic(): experiment_allele_colocations[('exp2', 'HLA-A*03:01')], experiment_allele_colocations[('exp2', 'HLA-A*02:01')]) - -def test_large(sample_rate=1.0): - multi_train_df = pandas.read_csv( - data_path("multiallelic_ms.benchmark1.csv.bz2")) - multi_train_df["label"] = multi_train_df.hit - multi_train_df["is_affinity"] = False - - sample_table = multi_train_df.loc[ - multi_train_df.label == True - ].drop_duplicates("sample_id").set_index("sample_id").loc[ - multi_train_df.sample_id.unique() - ] - grouped = multi_train_df.groupby("sample_id").nunique() - for col in sample_table.columns: - if (grouped[col] > 1).any(): - del sample_table[col] - sample_table["alleles"] = sample_table.hla.str.split() - - pan_train_df = pandas.read_csv( - get_path( - "models_class1_pan", "models.combined/train_data.csv.bz2")) - pan_sub_train_df = pan_train_df - pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"] - del pan_sub_train_df["measurement_value"] - pan_sub_train_df["is_affinity"] = True - - pan_sub_train_df = pan_sub_train_df.sample(frac=sample_rate) - multi_train_df = multi_train_df.sample(frac=sample_rate) - - pan_predictor = Class1AffinityPredictor.load( - get_path("models_class1_pan", "models.combined"), - optimization_level=0, - max_models=1) - - allele_encoding = MultipleAlleleEncoding( - experiment_names=multi_train_df.sample_id.values, - experiment_to_allele_list=sample_table.alleles.to_dict(), - max_alleles_per_experiment=sample_table.alleles.str.len().max(), - allele_to_sequence=pan_predictor.allele_to_sequence, - ) - allele_encoding.append_alleles(pan_sub_train_df.allele.values) - allele_encoding = allele_encoding.compact() - - combined_train_df = pandas.concat( - [multi_train_df, pan_sub_train_df], ignore_index=True, sort=True) - - print("Total size", combined_train_df) - - planner = MultiallelicMassSpecBatchGenerator( - hyperparameters=dict( - batch_generator_validation_split=0.2, - batch_generator_batch_size=128, - batch_generator_affinity_fraction=0.5)) - - s = time.time() - profiler = cProfile.Profile() - profiler.enable() - planner.plan( - affinities_mask=combined_train_df.is_affinity.values, - experiment_names=combined_train_df.sample_id.values, - alleles_matrix=allele_encoding.alleles, - is_binder=numpy.where( - combined_train_df.is_affinity.values, - combined_train_df.label.values, - to_ic50(combined_train_df.label.values)) < 1000.0) - profiler.disable() - stats = pstats.Stats(profiler) - stats.sort_stats("cumtime").reverse_order().print_stats() - print(planner.summary()) - print("Planning took [sec]: ", time.time() - s) - - (train_iter, test_iter) = planner.get_train_and_test_generators( - x_dict={ - "idx": numpy.arange(len(combined_train_df)), - }, - y_list=[]) - - train_batch_sizes = [] - indices_total = numpy.zeros(len(combined_train_df)) - for (kind, it) in [("train", train_iter), ("test", test_iter)]: - for (i, (x_item, y_item)) in enumerate(it): - idx = x_item["idx"] - indices_total[idx] += 1 - batch_df = combined_train_df.iloc[idx] - if not batch_df.is_affinity.all(): - # Test each batch has at most one multiallelic ms experiment. - assert_equal( - batch_df.loc[~batch_df.is_affinity].sample_id.nunique(), 1) - if kind == "train": - train_batch_sizes.append(len(batch_df)) - - # At most one short batch. - assert_less(sum(b != 128 for b in train_batch_sizes), 2) - assert_greater( - sum(b == 128 for b in train_batch_sizes), len(train_batch_sizes) - 2) - - # Each point used exactly once. - assert_equal( - indices_total, numpy.ones(len(combined_train_df))) diff --git a/test/test_class1_processing_neural_network.py b/test/test_class1_processing_neural_network.py index a0e100c4..0a1368bd 100644 --- a/test/test_class1_processing_neural_network.py +++ b/test/test_class1_processing_neural_network.py @@ -131,10 +131,6 @@ def test_neural_network_input(): results['peptide_length'], df.peptide.str.len().values) -def test_big(): - train_basic_network(num=100000) - - def test_small(): train_basic_network(num=10000) -- GitLab