Commit 36d65c39 authored by Tim O'Donnell

fix

parent 44f47fef
@@ -419,7 +419,9 @@ class Class1AffinityPredictor(object):
             series = transform.to_series()
             if percent_ranks_df is None:
                 percent_ranks_df = pandas.DataFrame(index=series.index)
-            assert_equal(series.index.values, percent_ranks_df.index.values)
+            numpy.testing.assert_array_almost_equal(
+                series.index.values,
+                percent_ranks_df.index.values)
             percent_ranks_df[allele] = series
         percent_ranks_path = join(models_dir, "percent_ranks.csv")
         percent_ranks_df.to_csv(
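The hunk above loosens an exact index comparison to an approximate one. A hypothetical sketch of why that matters for a floating-point index (the grid below is made up; numpy.testing.assert_array_almost_equal compares to 6 decimal places by default):

    import numpy

    a = numpy.linspace(0, 4, 1000)       # e.g. a log-spaced percent-rank grid
    b = numpy.log10(10.0 ** a)           # same values after a float round-trip
    print((a == b).all())                # can be False: last-ulp differences
    numpy.testing.assert_array_almost_equal(a, b)  # passes: equal to 6 decimals
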
@@ -11,7 +11,6 @@ import pstats

 import pandas
 import numpy
-from mhcflurry.allele_encoding import MultipleAlleleEncoding
 from mhcflurry.downloads import get_path
 from mhcflurry.batch_generator import (
     MultiallelicMassSpecBatchGenerator)
@@ -119,102 +118,3 @@ def test_basic():
         experiment_allele_colocations[('exp2', 'HLA-A*03:01')],
         experiment_allele_colocations[('exp2', 'HLA-A*02:01')])

-
-def test_large(sample_rate=1.0):
-    multi_train_df = pandas.read_csv(
-        data_path("multiallelic_ms.benchmark1.csv.bz2"))
-    multi_train_df["label"] = multi_train_df.hit
-    multi_train_df["is_affinity"] = False
-
-    sample_table = multi_train_df.loc[
-        multi_train_df.label == True
-    ].drop_duplicates("sample_id").set_index("sample_id").loc[
-        multi_train_df.sample_id.unique()
-    ]
-    grouped = multi_train_df.groupby("sample_id").nunique()
-    for col in sample_table.columns:
-        if (grouped[col] > 1).any():
-            del sample_table[col]
-    sample_table["alleles"] = sample_table.hla.str.split()
-
-    pan_train_df = pandas.read_csv(
-        get_path(
-            "models_class1_pan", "models.combined/train_data.csv.bz2"))
-
-    pan_sub_train_df = pan_train_df
-    pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
-    del pan_sub_train_df["measurement_value"]
-    pan_sub_train_df["is_affinity"] = True
-
-    pan_sub_train_df = pan_sub_train_df.sample(frac=sample_rate)
-    multi_train_df = multi_train_df.sample(frac=sample_rate)
-
-    pan_predictor = Class1AffinityPredictor.load(
-        get_path("models_class1_pan", "models.combined"),
-        optimization_level=0,
-        max_models=1)
-
-    allele_encoding = MultipleAlleleEncoding(
-        experiment_names=multi_train_df.sample_id.values,
-        experiment_to_allele_list=sample_table.alleles.to_dict(),
-        max_alleles_per_experiment=sample_table.alleles.str.len().max(),
-        allele_to_sequence=pan_predictor.allele_to_sequence,
-    )
-    allele_encoding.append_alleles(pan_sub_train_df.allele.values)
-    allele_encoding = allele_encoding.compact()
-
-    combined_train_df = pandas.concat(
-        [multi_train_df, pan_sub_train_df], ignore_index=True, sort=True)
-
-    print("Total size", combined_train_df)
-
-    planner = MultiallelicMassSpecBatchGenerator(
-        hyperparameters=dict(
-            batch_generator_validation_split=0.2,
-            batch_generator_batch_size=128,
-            batch_generator_affinity_fraction=0.5))
-
-    s = time.time()
-    profiler = cProfile.Profile()
-    profiler.enable()
-
-    planner.plan(
-        affinities_mask=combined_train_df.is_affinity.values,
-        experiment_names=combined_train_df.sample_id.values,
-        alleles_matrix=allele_encoding.alleles,
-        is_binder=numpy.where(
-            combined_train_df.is_affinity.values,
-            combined_train_df.label.values,
-            to_ic50(combined_train_df.label.values)) < 1000.0)
-
-    profiler.disable()
-    stats = pstats.Stats(profiler)
-    stats.sort_stats("cumtime").reverse_order().print_stats()
-
-    print(planner.summary())
-    print("Planning took [sec]: ", time.time() - s)
-
-    (train_iter, test_iter) = planner.get_train_and_test_generators(
-        x_dict={
-            "idx": numpy.arange(len(combined_train_df)),
-        },
-        y_list=[])
-
-    train_batch_sizes = []
-    indices_total = numpy.zeros(len(combined_train_df))
-    for (kind, it) in [("train", train_iter), ("test", test_iter)]:
-        for (i, (x_item, y_item)) in enumerate(it):
-            idx = x_item["idx"]
-            indices_total[idx] += 1
-            batch_df = combined_train_df.iloc[idx]
-            if not batch_df.is_affinity.all():
-                # Test each batch has at most one multiallelic ms experiment.
-                assert_equal(
-                    batch_df.loc[~batch_df.is_affinity].sample_id.nunique(), 1)
-            if kind == "train":
-                train_batch_sizes.append(len(batch_df))
-
-    # At most one short batch.
-    assert_less(sum(b != 128 for b in train_batch_sizes), 2)
-    assert_greater(
-        sum(b == 128 for b in train_batch_sizes), len(train_batch_sizes) - 2)
-
-    # Each point used exactly once.
-    assert_equal(
-        indices_total, numpy.ones(len(combined_train_df)))
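The is_binder expression in the removed test_large mixes two label conventions: affinity rows already carry nM measurements, while mass-spec rows carry 0/1 hit labels that to_ic50 maps onto the nM scale, so a single 1000 nM cutoff defines binders for both. A minimal sketch of that conversion, assuming to_ic50 is mhcflurry.regression_target.to_ic50 with its default max_ic50 of 50000:

    import numpy
    from mhcflurry.regression_target import to_ic50

    # Mass-spec hit labels in [0, 1] map onto nM: 1.0 -> 1 nM, 0.0 -> 50000 nM.
    print(to_ic50(numpy.array([1.0, 0.0])))

    # Toy rows in the same shape as the removed test: two affinity rows (nM)
    # followed by two mass-spec rows (hit labels), then one shared threshold.
    is_affinity = numpy.array([True, True, False, False])
    label = numpy.array([25.0, 4500.0, 1.0, 0.0])
    is_binder = numpy.where(is_affinity, label, to_ic50(label)) < 1000.0
    print(is_binder)  # [ True False  True False]

Putting everything on one nM scale is what lets the batch generator balance binders and non-binders with a single mask across both data types.
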
@@ -131,10 +131,6 @@ def test_neural_network_input():
         results['peptide_length'], df.peptide.str.len().values)

-def test_big():
-    train_basic_network(num=100000)

-def test_small():
-    train_basic_network(num=10000)