Commit 1940e636 authored by Tim O'Donnell

fix batch generator tests

parent e6f26c20
@@ -117,82 +117,63 @@ class MultiallelicMassSpecBatchGenerator(object):
         affinity_fraction = hyperparameters["batch_generator_affinity_fraction"]
         batch_size = hyperparameters["batch_generator_batch_size"]
         df["first_allele"] = df.alleles.str.get(0)
         equivalence_columns = [
             "is_affinity",
             "is_binder",
             "experiment_name",
             "first_allele",
         ]
         df["equivalence_key"] = numpy.where(
             df.is_affinity,
             df.first_allele,
             df.experiment_name,
         ) + " " + df.is_binder.map({True: "binder", False: "nonbinder"})
         (df["equivalence_class"], equivalence_class_labels) = (
             df.equivalence_key.factorize())
-        df["unused"] = True
         df["idx"] = df.index
         df = df.sample(frac=1.0)
         affinities_per_batch = int(affinity_fraction * batch_size)
+        remaining_affinities_df = df.loc[df.is_affinity].copy()
         # First do mixed affinity / multiallelic ms batches_generator.
         batch_compositions = []
-        for experiment in df.loc[~df.is_affinity].experiment_name.unique():
-            if experiment is None:
-                continue
-            while True:
-                experiment_df = df.loc[
-                    df.unused & (df.experiment_name == experiment)]
-                if len(experiment_df) == 0:
-                    break
-                (experiment_alleles,) = experiment_df.alleles.unique()
-                affinities_df = df.loc[df.unused & df.is_affinity].copy()
-                affinities_df["matches_allele"] = (
-                    affinities_df.first_allele.isin(experiment_alleles))
-                # Whenever possible we try to use affinities with the same
-                # alleles as the mass spec experiment
-                affinities_df = affinities_df.sort_values(
-                    "matches_allele", ascending=False)
+        for (experiment, experiment_df) in df.loc[~df.is_affinity].groupby(
+                "experiment_name"):
+            (experiment_alleles,) = experiment_df.alleles.unique()
+            remaining_affinities_df["matches_allele"] = (
+                remaining_affinities_df.first_allele.isin(experiment_alleles))
+            # Whenever possible we try to use affinities with the same
+            # alleles as the mass spec experiment
+            remaining_affinities_df = remaining_affinities_df.sort_values(
+                "matches_allele", ascending=False)
+            while len(experiment_df) > 0:
                 affinities_for_this_batch = min(
-                    affinities_per_batch, len(affinities_df))
+                    affinities_per_batch, len(remaining_affinities_df))
                 mass_spec_for_this_batch = (
                     batch_size - affinities_for_this_batch)
                 if len(experiment_df) < mass_spec_for_this_batch:
                     mass_spec_for_this_batch = len(experiment_df)
                     affinities_for_this_batch = (
                         batch_size - mass_spec_for_this_batch)
-                    if affinities_for_this_batch < len(affinities_df):
-                        # For mass spec, we only do whole batches_generator, since it's
-                        # unclear how our pairwise loss would interact with
-                        # a smaller batch.
-                        break
-                to_use_list = []
-                # sample mass spec
-                to_use = experiment_df.head(mass_spec_for_this_batch)
-                to_use_list.append(to_use.index.values)
-                # sample affinities
-                to_use = affinities_df.head(affinities_for_this_batch)
-                to_use_list.append(to_use.index.values)
-                to_use_indices = numpy.concatenate(to_use_list)
-                df.loc[to_use_indices, "unused"] = False
-                batch_compositions.append(
-                    df.loc[to_use_indices].equivalence_class.values)
+                batch_composition = []
+                # take mass spec
+                to_use = experiment_df.iloc[:mass_spec_for_this_batch]
+                experiment_df = experiment_df.iloc[mass_spec_for_this_batch:]
+                batch_composition.extend(to_use.equivalence_class.values)
+                # take affinities
+                to_use = remaining_affinities_df.iloc[
+                    :affinities_for_this_batch
+                ]
+                remaining_affinities_df = remaining_affinities_df.iloc[
+                    affinities_for_this_batch:
+                ]
+                batch_composition.extend(to_use.equivalence_class.values)
+                batch_compositions.append(batch_composition)
         # Affinities-only batches
-        affinities_df = df.loc[df.unused & df.is_affinity]
-        while len(affinities_df) > 0:
-            to_use = affinities_df.head(batch_size)
-            df.loc[to_use.index, "unused"] = False
+        while len(remaining_affinities_df) > 0:
+            to_use = remaining_affinities_df.iloc[:batch_size]
+            remaining_affinities_df = remaining_affinities_df.iloc[batch_size:]
             batch_compositions.append(to_use.equivalence_class.values)
-            affinities_df = df.loc[df.unused & df.is_affinity]
         class_to_indices = df.groupby("equivalence_class").idx.unique()
         equivalence_classes = [
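To make the rewritten loop concrete, here is a minimal standalone sketch (toy sizes, not part of the diff) of how one mass spec experiment is carved into fixed-size batches, each topped up from the shared affinity pool:

import pandas

batch_size = 10
affinities_per_batch = int(0.5 * batch_size)  # batch_generator_affinity_fraction = 0.5

# Toy stand-ins: 12 mass spec rows for one experiment, 8 affinity rows.
experiment_df = pandas.DataFrame({"equivalence_class": [0] * 12})
remaining_affinities_df = pandas.DataFrame({"equivalence_class": [1] * 8})

batch_compositions = []
while len(experiment_df) > 0:
    affinities_for_this_batch = min(
        affinities_per_batch, len(remaining_affinities_df))
    mass_spec_for_this_batch = batch_size - affinities_for_this_batch
    if len(experiment_df) < mass_spec_for_this_batch:
        # Final, smaller chunk of mass spec: fill the rest with affinities.
        mass_spec_for_this_batch = len(experiment_df)
        affinities_for_this_batch = batch_size - mass_spec_for_this_batch
    batch_composition = []
    batch_composition.extend(
        experiment_df.equivalence_class.values[:mass_spec_for_this_batch])
    experiment_df = experiment_df.iloc[mass_spec_for_this_batch:]
    batch_composition.extend(
        remaining_affinities_df.equivalence_class.values[
            :affinities_for_this_batch])
    remaining_affinities_df = remaining_affinities_df.iloc[
        affinities_for_this_batch:]
    batch_compositions.append(batch_composition)

print([len(b) for b in batch_compositions])  # [10, 10]: 5+5, then 7+3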
@@ -228,7 +209,8 @@ class MultiallelicMassSpecBatchGenerator(object):
         validation_items = numpy.random.choice(
             n if potential_validation_mask is None
             else numpy.where(potential_validation_mask)[0],
-            int(self.hyperparameters['batch_generator_validation_split'] * n))
+            int(self.hyperparameters['batch_generator_validation_split'] * n),
+            replace=False)
         validation_mask = numpy.zeros(n, dtype=bool)
         validation_mask[validation_items] = True
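For context on the replace=False fix: numpy.random.choice samples with replacement by default, so duplicate draws could make the boolean validation mask smaller than the requested split. A standalone demonstration:

import numpy

numpy.random.seed(0)
n = 10
split = 5

with_replacement = numpy.random.choice(n, split)  # default: duplicates possible
without_replacement = numpy.random.choice(n, split, replace=False)  # all unique

validation_mask = numpy.zeros(n, dtype=bool)
validation_mask[without_replacement] = True
assert len(set(without_replacement)) == split
assert validation_mask.sum() == split  # guaranteed only without replacement
print(len(set(with_replacement)))  # may be < 5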
@@ -19,6 +19,7 @@ from mhcflurry.regression_target import to_ic50
 from mhcflurry import Class1AffinityPredictor
 from numpy.testing import assert_equal
+from nose.tools import assert_greater, assert_less
 
 def data_path(name):
@@ -29,20 +30,27 @@ def data_path(name):
     return os.path.join(os.path.dirname(__file__), "data", name)
 
 
+def test_basic_repeat():
+    for _ in range(100):
+        test_basic()
+
+
 def test_basic():
+    batch_size = 7
+    validation_split = 0.2
     planner = MultiallelicMassSpecBatchGenerator(
         hyperparameters=dict(
-            batch_generator_validation_split=0.2,
-            batch_generator_batch_size=10,
+            batch_generator_validation_split=validation_split,
+            batch_generator_batch_size=batch_size,
             batch_generator_affinity_fraction=0.5))
 
     exp1_alleles = ["HLA-A*03:01", "HLA-B*07:02", "HLA-C*02:01"]
     exp2_alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*02:01"]
 
     df = pandas.DataFrame(dict(
-        affinities_mask=([True] * 4) + ([False] * 6),
-        experiment_names=([None] * 4) + (["exp1"] * 2) + (["exp2"] * 4),
-        alleles_matrix=[
+        affinities_mask=([True] * 14) + ([False] * 6),
+        experiment_names=([None] * 14) + (["exp1"] * 2) + (["exp2"] * 4),
+        alleles_matrix=[["HLA-C*07:01", None, None]] * 10 + [
             ["HLA-A*02:01", None, None],
             ["HLA-A*02:01", None, None],
             ["HLA-A*03:01", None, None],
@@ -54,11 +62,20 @@ def test_basic():
             exp2_alleles,
             exp2_alleles,
         ],
-        is_binder=[
+        is_binder=[False, True] * 5 + [
             True, True, False, False, True, False, True, False, True, False,
         ]))
+    df = pandas.concat([df, df], ignore_index=True)
+    df = pandas.concat([df, df], ignore_index=True)
+
     planner.plan(**df.to_dict("list"))
     print(planner.summary())
+
+    assert_equal(
+        planner.num_train_batches,
+        numpy.ceil(len(df) * (1 - validation_split) / batch_size))
+    assert_equal(
+        planner.num_test_batches,
+        numpy.ceil(len(df) * validation_split / batch_size))
+
     (train_iter, test_iter) = planner.get_train_and_test_generators(
         x_dict={
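A quick standalone check of the two batch-count assertions using this test's own numbers (the 20-row frame is concatenated with itself twice, so len(df) == 80):

import numpy

n, batch_size, validation_split = 80, 7, 0.2
assert numpy.ceil(n * (1 - validation_split) / batch_size) == 10  # train batches
assert numpy.ceil(n * validation_split / batch_size) == 3         # test batches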
@@ -74,20 +91,36 @@ def test_basic():
         df.loc[idx, "batch"] = i
     df["idx"] = df.idx.astype(int)
     df["batch"] = df.batch.astype(int)
     print(df)
+
+    assert_equal(df.kind.value_counts()["test"], len(df) * validation_split)
+    assert_equal(df.kind.value_counts()["train"], len(df) * (1 - validation_split))
+
+    experiment_allele_colocations = collections.defaultdict(int)
     for ((kind, batch_num), batch_df) in df.groupby(["kind", "batch"]):
         if not batch_df.affinities_mask.all():
             # Test each batch has at most one multiallelic ms experiment.
-            assert_equal(
-                batch_df.loc[
-                    ~batch_df.affinities_mask
-                ].experiment_names.nunique(), 1)
-    #import ipdb;ipdb.set_trace()
-
-
-def test_large(sample_rate=0.01):
+            names = batch_df.loc[
+                ~batch_df.affinities_mask
+            ].experiment_names.unique()
+            assert_equal(len(names), 1)
+            (experiment,) = names
+            if batch_df.affinities_mask.any():
+                # Test experiments are matched to the correct affinity alleles.
+                affinity_alleles = batch_df.loc[
+                    batch_df.affinities_mask
+                ].alleles_matrix.str.get(0).values
+                for allele in affinity_alleles:
+                    experiment_allele_colocations[(experiment, allele)] += 1
+
+    assert_greater(
+        experiment_allele_colocations[('exp1', 'HLA-A*03:01')],
+        experiment_allele_colocations[('exp1', 'HLA-A*02:01')])
+    assert_less(
+        experiment_allele_colocations[('exp2', 'HLA-A*03:01')],
+        experiment_allele_colocations[('exp2', 'HLA-A*02:01')])
+
+
+def test_large(sample_rate=1.0):
     multi_train_df = pandas.read_csv(
         data_path("multiallelic_ms.benchmark1.csv.bz2"))
     multi_train_df["label"] = multi_train_df.hit
@@ -151,6 +184,7 @@ def test_large(sample_rate=0.01):
             combined_train_df.is_affinity.values,
             combined_train_df.label.values,
             to_ic50(combined_train_df.label.values)) < 1000.0)
+    profiler.disable()
     stats = pstats.Stats(profiler)
     stats.sort_stats("cumtime").reverse_order().print_stats()
     print(planner.summary())
@@ -162,20 +196,25 @@ def test_large(sample_rate=0.01):
         },
         y_list=[])
+    train_batch_sizes = []
+    indices_total = numpy.zeros(len(combined_train_df))
     for (kind, it) in [("train", train_iter), ("test", test_iter)]:
         for (i, (x_item, y_item)) in enumerate(it):
             idx = x_item["idx"]
-            combined_train_df.loc[idx, "kind"] = kind
-            combined_train_df.loc[idx, "idx"] = idx
-            combined_train_df.loc[idx, "batch"] = i
-    import ipdb ; ipdb.set_trace()
-    combined_train_df["idx"] = combined_train_df.idx.astype(int)
-    combined_train_df["batch"] = combined_train_df.batch.astype(int)
-    for ((kind, batch_num), batch_df) in combined_train_df.groupby(["kind", "batch"]):
-        if not batch_df.is_affinity.all():
-            # Test each batch has at most one multiallelic ms experiment.
-            assert_equal(
-                batch_df.loc[
-                    ~batch_df.is_affinity
-                ].sample_id.nunique(), 1)
\ No newline at end of file
+            indices_total[idx] += 1
+            batch_df = combined_train_df.iloc[idx]
+            if not batch_df.is_affinity.all():
+                # Test each batch has at most one multiallelic ms experiment.
+                assert_equal(
+                    batch_df.loc[~batch_df.is_affinity].sample_id.nunique(), 1)
+            if kind == "train":
+                train_batch_sizes.append(len(batch_df))
+
+    # At most one short batch.
+    assert_less(sum(b != 128 for b in train_batch_sizes), 2)
+    assert_greater(
+        sum(b == 128 for b in train_batch_sizes), len(train_batch_sizes) - 2)
+
+    # Each point used exactly once.
+    assert_equal(
+        indices_total, numpy.ones(len(combined_train_df)))
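The indices_total bookkeeping above reduces to a simple invariant, shown standalone here: pooling the idx arrays from every generated batch must hit each row exactly once.

import numpy

n = 6
batches = [numpy.array([0, 3]), numpy.array([1, 4, 5]), numpy.array([2])]
indices_total = numpy.zeros(n)
for idx in batches:
    indices_total[idx] += 1
assert (indices_total == numpy.ones(n)).all()  # each point used exactly once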
@@ -753,7 +753,7 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
     return (predictor, predictions, metrics, motifs)
 
 
-def test_refinemeent_large(sample_rate=0.1):
+def test_batch_generator(sample_rate=0.1):
     multi_train_df = pandas.read_csv(
         data_path("multiallelic_ms.benchmark1.csv.bz2"))
     multi_train_df["label"] = multi_train_df.hit