# Newer
# Older
import logging
logging.getLogger('matplotlib').disabled = True
logging.getLogger('tensorflow').disabled = True
import os
import collections
import time
import cProfile
import pstats
import pandas
import numpy
MultiallelicMassSpecBatchGenerator)
from mhcflurry.regression_target import to_ic50
from mhcflurry import Class1AffinityPredictor
from numpy.testing import assert_equal
from nose.tools import assert_greater, assert_less
def data_path(name):
    '''
    Return the absolute path to a file in the test/data directory.

    The name specified should be relative to test/data, i.e. to the "data"
    directory located next to this module.

    Parameters
    ----------
    name : str
        File name (or sub-path) inside the data directory.

    Returns
    -------
    str
        Absolute path built from this module's directory, "data" and name.
        The path is not checked for existence.
    '''
    # __file__ is not guaranteed to be absolute (for example when the module
    # is executed as a script on older interpreters), so normalize it first
    # to honor the "absolute path" contract regardless of the working
    # directory at call time.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, "data", name)
def test_basic_repeat():
    '''
    Invoke test_basic 100 times in a row.

    NOTE(review): presumably this exists to surface intermittent failures
    of test_basic; confirm against the batch generator's use of randomness.
    '''
    remaining_runs = 100
    while remaining_runs > 0:
        test_basic()
        remaining_runs -= 1
# NOTE(review): the enclosing `def` line is not visible in this view, and
# batch_size / validation_split are used below without a visible
# definition -- confirm both against the full file.
#
# Build the batch generator, passing the validation split, the batch size
# and an affinity fraction of 0.5 as hyperparameters.
planner = MultiallelicMassSpecBatchGenerator(
hyperparameters=dict(
batch_generator_validation_split=validation_split,
batch_generator_batch_size=batch_size,
batch_generator_affinity_fraction=0.5))
# Allele triples of the two multiallelic experiments: exp1 contains
# HLA-A*03:01 and exp2 contains HLA-A*02:01; both share HLA-C*02:01.
exp1_alleles = ["HLA-A*03:01", "HLA-B*07:02", "HLA-C*02:01"]
exp2_alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*02:01"]
# 20-row fixture: the first 14 rows have affinities_mask True and no
# experiment name; the last 6 rows belong to exp1 (2 rows) and exp2 (4 rows).
df = pandas.DataFrame(dict(
affinities_mask=([True] * 14) + ([False] * 6),
experiment_names=([None] * 14) + (["exp1"] * 2) + (["exp2"] * 4),
# Rows with affinities_mask True carry one allele padded with None;
# the experiment rows carry the full allele triple of their experiment.
alleles_matrix=[["HLA-C*07:01", None, None]] * 10 + [
["HLA-A*02:01", None, None],
["HLA-A*02:01", None, None],
["HLA-A*03:01", None, None],
["HLA-A*03:01", None, None],
exp1_alleles,
exp1_alleles,
exp2_alleles,
exp2_alleles,
exp2_alleles,
exp2_alleles,
],
# NOTE(review): the ten booleans below and the closing bracket have no
# visible opening `name=[` line; a line appears to be missing here.
True, True, False, False, True, False, True, False, True, False,
]))
# Double the frame twice (four times the rows) with a fresh 0..n-1 index.
df = pandas.concat([df, df], ignore_index=True)
df = pandas.concat([df, df], ignore_index=True)
# Hand every column to plan() as a keyword argument holding a list.
planner.plan(**df.to_dict("list"))
# The number of batches per split should equal that split's share of the
# rows divided by the batch size, rounded up.
assert_equal(
planner.num_train_batches,
numpy.ceil(len(df) * (1 - validation_split) / batch_size))
assert_equal(
planner.num_test_batches,
numpy.ceil(len(df) * validation_split / batch_size))
# x_dict carries only the row positions, so every yielded batch can be
# traced back to rows of df; no targets are supplied.
(train_iter, test_iter) = planner.get_train_and_test_generators(
x_dict={
"idx": numpy.arange(len(df)),
},
y_list=[])
# Record, for every row yielded, which generator produced it and in which
# batch number.
for (kind, it) in [("train", train_iter), ("test", test_iter)]:
for (i, (x_item, y_item)) in enumerate(it):
idx = x_item["idx"]
df.loc[idx, "kind"] = kind
df.loc[idx, "idx"] = idx
df.loc[idx, "batch"] = i
# Cast the bookkeeping columns to int.
df["idx"] = df.idx.astype(int)
df["batch"] = df.batch.astype(int)
# The number of rows seen per split must match validation_split exactly.
assert_equal(df.kind.value_counts()["test"], len(df) * validation_split)
assert_equal(df.kind.value_counts()["train"], len(df) * (1 - validation_split))
# Counts, per (experiment, allele) pair, how many affinity rows of that
# allele ended up in a batch together with that experiment.
experiment_allele_colocations = collections.defaultdict(int)
for ((kind, batch_num), batch_df) in df.groupby(["kind", "batch"]):
if not batch_df.affinities_mask.all():
# Test each batch has at most one multiallelic ms experiment.
names = batch_df.loc[
~batch_df.affinities_mask
].experiment_names.unique()
assert_equal(len(names), 1)
# Tuple unpacking would also fail if names did not hold exactly one value.
(experiment,) = names
if batch_df.affinities_mask.any():
# Test experiments are matched to the correct affinity alleles.
affinity_alleles = batch_df.loc[
batch_df.affinities_mask
].alleles_matrix.str.get(0).values
for allele in affinity_alleles:
experiment_allele_colocations[(experiment, allele)] += 1
# exp1 contains HLA-A*03:01, so it is expected to share batches with
# HLA-A*03:01 affinity rows more often than with HLA-A*02:01 ones.
assert_greater(
experiment_allele_colocations[('exp1', 'HLA-A*03:01')],
experiment_allele_colocations[('exp1', 'HLA-A*02:01')])
# exp2 contains HLA-A*02:01, so the opposite ordering is expected.
assert_less(
experiment_allele_colocations[('exp2', 'HLA-A*03:01')],
experiment_allele_colocations[('exp2', 'HLA-A*02:01')])