Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
Join benchmark with precomputed predictions.
"""
import sys
import argparse
import os
import numpy
import collections
import pandas
import tqdm
import mhcflurry
from mhcflurry.downloads import get_path
# Command-line interface. The module docstring doubles as the usage text.
parser = argparse.ArgumentParser(usage=__doc__)

# Positional: the benchmark input.
parser.add_argument("benchmark")

# Positional: one or more predictor names, restricted to the supported set.
parser.add_argument(
    "predictors",
    nargs="+",
    choices=("netmhcpan4.ba", "netmhcpan4.el", "mixmhcpred"),
)

# Required option: destination of the joined table.
parser.add_argument("--out", metavar="CSV", required=True, help="File to write")
def load_results(dirname, result_df=None, columns=None):
    """
    Load precomputed prediction arrays from a results directory.

    The directory must contain ``peptides.csv`` (with a ``peptide`` column)
    and ``alleles.csv`` (a manifest with ``col`` and ``path`` columns). Every
    manifest row names a file, relative to `dirname`, that is opened with
    ``numpy.load``; its ``arr_0`` entry is written into the column named by
    ``col``, aligned with the peptides of ``peptides.csv``.

    Parameters
    ----------
    dirname : str
        Directory holding ``peptides.csv``, ``alleles.csv`` and the arrays.
    result_df : pandas.DataFrame, optional
        Frame to fill in place. Only manifest columns already present in it
        are loaded, and only for peptides found in its index. When omitted,
        a new float32 frame indexed by all peptides is created.
    columns : sequence, optional
        Columns of the newly created frame (unused when `result_df` is
        given). Defaults to every ``col`` listed in the manifest.

    Returns
    -------
    pandas.DataFrame
        `result_df`, or the newly created frame, with loaded values filled in.
    """
    all_peptides = pandas.read_csv(
        os.path.join(dirname, "peptides.csv")).peptide
    manifest = pandas.read_csv(os.path.join(dirname, "alleles.csv"))

    print(
        "Loading results. Existing data has",
        len(all_peptides),
        "peptides and",
        len(manifest),
        "columns")

    if columns is None:
        columns = manifest.col.values

    # By default every stored peptide is assigned; a caller-supplied frame
    # narrows that down to the peptides it already indexes.
    row_mask = None
    peptides_to_assign = all_peptides
    if result_df is None:
        result_df = pandas.DataFrame(
            index=all_peptides, columns=columns, dtype="float32")
        result_df[:] = numpy.nan
    else:
        row_mask = all_peptides.isin(result_df.index).values
        peptides_to_assign = all_peptides[row_mask]

    # Drop manifest entries that have no matching column in the result frame.
    wanted = manifest.col.isin(result_df.columns)
    manifest = manifest.loc[wanted]

    print(
        "Will load", len(all_peptides), "peptides and", len(manifest), "cols")

    for _, entry in tqdm.tqdm(manifest.iterrows(), total=len(manifest)):
        array_path = os.path.join(dirname, entry.path)
        with open(array_path, "rb") as fd:
            # Materialize the array while the file handle is still open.
            values = numpy.load(fd)["arr_0"]
        if row_mask is not None:
            values = values[row_mask]
        result_df.loc[peptides_to_assign, entry.col] = values

    return result_df
def run():
    """
    Join a benchmark table with precomputed predictions and write a CSV.

    Parses the command line, reads the benchmark table, loads the
    precomputed results of every requested predictor and, for each benchmark
    row, stores

    * ``<predictor>``: the maximum prediction over the alleles listed in the
      row's ``hla`` field (NaN if any of those alleles has no prediction), and
    * ``<predictor> allele``: the allele that produced that maximum.

    The augmented table is written to the path given by ``--out``.
    """
    args = parser.parse_args(sys.argv[1:])

    # The benchmark must provide `hla` (whitespace-separated allele names),
    # `peptide` and `sample_id` columns; all three are used below.
    df = pandas.read_csv(args.benchmark)
    df["alleles"] = df.hla.str.split()

    peptides = df.peptide.unique()

    # Every distinct allele mentioned anywhere in the benchmark. Rows with a
    # missing `hla` are ignored here, as they are by the groupby below.
    alleles = set()
    for hla_text in df.hla.dropna().unique():
        alleles.update(hla_text.split())
    alleles = sorted(alleles)

    # (predictor name, path inside the download, column suffix, negate?)
    precomputed_sources = [
        ("netmhcpan4.ba", "predictions/all.netmhcpan4.ba", "affinity", True),
        ("netmhcpan4.el", "predictions/all.netmhcpan4.el", "score", False),
        ("mixmhcpred", "predictions/all.mixmhcpred", "score", False),
    ]

    predictions_dfs = {}
    for (name, subpath, suffix, negate) in precomputed_sources:
        if name not in args.predictors:
            continue
        loaded = load_results(
            get_path("data_mass_spec_benchmark", subpath),
            result_df=pandas.DataFrame(
                index=peptides,
                columns=["%s %s" % (a, suffix) for a in alleles]))
        # Stored columns are named "<allele> <suffix>"; keep only the allele.
        loaded = loaded.rename(
            columns=lambda s: s.replace(suffix, "").strip())
        if negate:
            # Negated so that the row-wise max below selects the smallest
            # affinity value.
            loaded *= -1
        predictions_dfs[name] = loaded

    # Samples for which at least one allele had no usable predictions. This
    # set is only reported; the rows themselves stay in the output with NaN.
    skip_experiments = set()

    for hla_text, sub_df in tqdm.tqdm(
            df.groupby("hla"), total=df.hla.nunique()):
        hla = hla_text.split()
        sample_ids = sub_df.sample_id.unique()
        group_peptides = sub_df.peptide.values

        for (name, precomputed_df) in predictions_dfs.items():
            # Make sure the output column exists before filling this group.
            df.loc[sub_df.index, name] = numpy.nan
            prediction_df = pandas.DataFrame(index=sub_df.peptide)

            for allele in hla:
                unavailable = (
                    allele not in precomputed_df.columns or
                    precomputed_df[allele].isnull().all())
                if unavailable:
                    print(sample_ids, hla)
                    skip_experiments.update(sample_ids)
                    prediction_df[allele] = numpy.nan
                    continue
                # Look the values up by peptide (the precomputed frame is
                # indexed by peptide, not by benchmark row label). That frame
                # is created without data and may therefore be object-dtype,
                # so cast to float for the reductions below. `.values`
                # assigns positionally, which is safe because the lookup
                # preserves the order of `group_peptides`.
                prediction_df[allele] = precomputed_df.loc[
                    group_peptides, allele
                ].astype(float).values

            # skipna=False: a single missing allele makes the row NaN.
            df.loc[sub_df.index, name] = prediction_df.max(
                axis=1, skipna=False).values
            df.loc[sub_df.index, name + " allele"] = prediction_df.idxmax(
                axis=1, skipna=False).values

    print("Skip experiments", skip_experiments)
    print("results")
    print(df)

    df.to_csv(args.out)
    print("Wrote", args.out)
# Run the join only when executed as a script, not when imported.
if __name__ == "__main__":
    run()