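# Disable matplotlib and tensorflow loggers so they do not clutter test output.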
import logging
logging.getLogger('matplotlib').disabled = True
logging.getLogger('tensorflow').disabled = True
import os
import collections
import time
import cProfile
import pstats
import pandas
import numpy
from mhcflurry.allele_encoding import MultipleAlleleEncoding
from mhcflurry.downloads import get_path
from mhcflurry.batch_generator import (
    MultiallelicMassSpecBatchGenerator)
from mhcflurry.regression_target import to_ic50
from mhcflurry import Class1AffinityPredictor
from numpy.testing import assert_equal


def data_path(name):
    '''
    Return the absolute path to a file in the test/data directory.
    The name specified should be relative to test/data.
    '''
    return os.path.join(os.path.dirname(__file__), "data", name)
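

# Small end-to-end check of MultiallelicMassSpecBatchGenerator: plan batches
# over a toy dataset mixing single-allele affinity rows with two multiallelic
# mass spec experiments, then verify that every batch draws its MS rows from
# at most one experiment.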
def test_basic():
    planner = MultiallelicMassSpecBatchGenerator(
        hyperparameters=dict(
            batch_generator_validation_split=0.2,
            batch_generator_batch_size=10,
            batch_generator_affinity_fraction=0.5))

    exp1_alleles = ["HLA-A*03:01", "HLA-B*07:02", "HLA-C*02:01"]
    exp2_alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*02:01"]

    df = pandas.DataFrame(dict(
        affinities_mask=([True] * 4) + ([False] * 6),
        experiment_names=([None] * 4) + (["exp1"] * 2) + (["exp2"] * 4),
        alleles_matrix=[
            ["HLA-A*02:01", None, None],
            ["HLA-A*02:01", None, None],
            ["HLA-A*03:01", None, None],
            ["HLA-A*03:01", None, None],
            exp1_alleles,
            exp1_alleles,
            exp2_alleles,
            exp2_alleles,
            exp2_alleles,
            exp2_alleles,
        ],
        is_binder=[
            True, True, False, False, True, False, True, False, True, False,
        ]))

    planner.plan(**df.to_dict("list"))
    print(planner.summary())
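
    # Iterate both generators and record, for every row, which split
    # ("train" or "test") and which batch index it was assigned to.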
    (train_iter, test_iter) = planner.get_train_and_test_generators(
        x_dict={
            "idx": numpy.arange(len(df)),
        },
        y_list=[])
    for (kind, it) in [("train", train_iter), ("test", test_iter)]:
        for (i, (x_item, y_item)) in enumerate(it):
            idx = x_item["idx"]
            df.loc[idx, "kind"] = kind
            df.loc[idx, "idx"] = idx
            df.loc[idx, "batch"] = i
    df["idx"] = df.idx.astype(int)
    df["batch"] = df.batch.astype(int)
    print(df)
    for ((kind, batch_num), batch_df) in df.groupby(["kind", "batch"]):
        if not batch_df.affinities_mask.all():
            # Test each batch has at most one multiallelic ms experiment.
            assert_equal(
                batch_df.loc[
                    ~batch_df.affinities_mask
                ].experiment_names.nunique(), 1)
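

# Larger run against real data: subsample the multiallelic MS benchmark and
# the pan-allele affinity training set, profile the planning step, and apply
# the same one-MS-experiment-per-batch check as test_basic.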
def test_large(sample_rate=0.01):
    multi_train_df = pandas.read_csv(
        data_path("multiallelic_ms.benchmark1.csv.bz2"))
    multi_train_df["label"] = multi_train_df.hit
    multi_train_df["is_affinity"] = False
    sample_table = multi_train_df.loc[
        multi_train_df.label == True
    ].drop_duplicates("sample_id").set_index("sample_id").loc[
        multi_train_df.sample_id.unique()
    ]
    grouped = multi_train_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["alleles"] = sample_table.hla.str.split()
    pan_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
    pan_sub_train_df = pan_train_df
    pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
    del pan_sub_train_df["measurement_value"]
    pan_sub_train_df["is_affinity"] = True

    pan_sub_train_df = pan_sub_train_df.sample(frac=sample_rate)
    multi_train_df = multi_train_df.sample(frac=sample_rate)
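
    # The predictor is not used for prediction here; it is loaded only so its
    # allele_to_sequence mapping can be reused for the allele encoding.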
    pan_predictor = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.with_mass_spec"),
        optimization_level=0,
        max_models=1)

    allele_encoding = MultipleAlleleEncoding(
        experiment_names=multi_train_df.sample_id.values,
        experiment_to_allele_list=sample_table.alleles.to_dict(),
        max_alleles_per_experiment=sample_table.alleles.str.len().max(),
        allele_to_sequence=pan_predictor.allele_to_sequence,
    )
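
    # Append the single alleles from the affinity rows so the encoding covers
    # both datasets, then compact it.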
    allele_encoding.append_alleles(pan_sub_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    combined_train_df = pandas.concat(
        [multi_train_df, pan_sub_train_df], ignore_index=True, sort=True)
print("Total size", combined_train_df)
    planner = MultiallelicMassSpecBatchGenerator(
        hyperparameters=dict(
            batch_generator_validation_split=0.2,
            batch_generator_affinity_fraction=0.5))

    s = time.time()
    profiler = cProfile.Profile()
    profiler.enable()
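    # A row counts as a binder when its affinity is below 1000 nM; the 0/1 MS
    # hit labels are first mapped through to_ic50 so the same threshold
    # applies to both kinds of rows.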
    planner.plan(
        affinities_mask=combined_train_df.is_affinity.values,
        experiment_names=combined_train_df.sample_id.values,
        alleles_matrix=allele_encoding.alleles,
        is_binder=numpy.where(
            combined_train_df.is_affinity.values,
            combined_train_df.label.values,
            to_ic50(combined_train_df.label.values)) < 1000.0)
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumtime").reverse_order().print_stats()
    print(planner.summary())
    print("Planning took [sec]: ", time.time() - s)
    (train_iter, test_iter) = planner.get_train_and_test_generators(
        x_dict={
            "idx": numpy.arange(len(combined_train_df)),
        },
        y_list=[])
    for (kind, it) in [("train", train_iter), ("test", test_iter)]:
        for (i, (x_item, y_item)) in enumerate(it):
            idx = x_item["idx"]
            combined_train_df.loc[idx, "kind"] = kind
            combined_train_df.loc[idx, "idx"] = idx
            combined_train_df.loc[idx, "batch"] = i
    combined_train_df["idx"] = combined_train_df.idx.astype(int)
    combined_train_df["batch"] = combined_train_df.batch.astype(int)
    for ((kind, batch_num), batch_df) in combined_train_df.groupby(["kind", "batch"]):
        if not batch_df.is_affinity.all():
            # Test each batch has at most one multiallelic ms experiment.
            assert_equal(
                batch_df.loc[
                    ~batch_df.is_affinity
                ].sample_id.nunique(), 1)