Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Split training data into CV folds.
"""
import argparse
import sys
from os.path import abspath
import pandas
from sklearn.model_selection import StratifiedKFold
# Command-line interface for the fold splitter. The module docstring is
# passed as the usage text shown by --help and on argument errors.
parser = argparse.ArgumentParser(usage=__doc__)

# Required positional argument: the CSV file to read.
parser.add_argument("input", metavar="INPUT.csv", help="Input CSV")

# Number of folds to generate (no help text is attached to this option).
parser.add_argument("--folds", type=int, default=5, metavar="N")

# Optional row filters. All of them default to None when not given.
parser.add_argument(
    "--allele",
    help="Include only the specified allele(s)",
    nargs="+",
)
parser.add_argument(
    "--min-measurements-per-allele",
    metavar="N",
    type=int,
    help="Use only alleles with >=N measurements.",
)
parser.add_argument(
    "--subsample",
    metavar="N",
    type=int,
    help="Subsample to first N rows",
)

# Seed for the splitter; None when not given.
parser.add_argument(
    "--random-state",
    type=int,
    metavar="N",
    help="Specify an int for deterministic splitting",
)

# Output filename templates; "{}" is filled in with the fold index.
parser.add_argument(
    "--output-pattern-train",
    help="Pattern to use to generate output filename. Default: %(default)s",
    default="./train.fold_{}.csv",
)
parser.add_argument(
    "--output-pattern-test",
    help="Pattern to use to generate output filename. Default: %(default)s",
    default="./test.fold_{}.csv",
)
def run(argv):
    """
    Split a measurements CSV into stratified cross-validation folds.

    The input CSV must provide the columns ``peptide``, ``allele`` and
    ``measurement_value``. Rows are filtered to peptides of length 8-15 and
    optionally to a set of alleles, duplicate (allele, peptide) pairs are
    collapsed to their median measurement, and the result is split with
    ``StratifiedKFold``. One train CSV and one test CSV are written per fold
    using the ``--output-pattern-train`` / ``--output-pattern-test``
    templates, formatted with the zero-based fold index.

    Parameters
    ----------
    argv : list of str
        Command-line arguments, excluding the program name.
    """
    args = parser.parse_args(argv)

    df = pandas.read_csv(args.input)
    print("Loaded data with shape: %s" % str(df.shape))

    # Compute lengths once; rows with a missing peptide compare False on
    # both bounds and are dropped.
    peptide_lengths = df.peptide.str.len()
    df = df.loc[(peptide_lengths >= 8) & (peptide_lengths <= 15)]
    print("Subselected to 8-15mers: %s" % (str(df.shape)))

    if args.allele:
        # An explicit allele list takes precedence over the count threshold.
        alleles = args.allele
    else:
        allele_counts = df.allele.value_counts()
        if args.min_measurements_per_allele is not None:
            # ">=" matches the documented "alleles with >=N measurements".
            allele_counts = allele_counts[
                allele_counts >= args.min_measurements_per_allele
            ]
        # With no threshold given, every allele present in the data is kept
        # (comparing counts against None would raise on Python 3).
        alleles = list(allele_counts.index)

    df = df.loc[df.allele.isin(alleles)]
    print("Potentially subselected by allele to: %s" % str(df.shape))

    print("Data has %d alleles: %s" % (
        df.allele.nunique(), " ".join(df.allele.unique())))

    df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index()
    print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape))

    if args.subsample:
        # Takes the leading rows of the grouped frame; this is not a random
        # sample.
        df = df.head(args.subsample)
        print("Subsampled to: %s" % str(df.shape))

    kf = StratifiedKFold(
        n_splits=args.folds,
        shuffle=True,
        random_state=args.random_state)

    # Stratify by both allele and binder vs. nonbinder. The "key" column is
    # kept on the frame, so it also appears in the written CSVs.
    df["key"] = [
        "%s_%s" % (allele, "binder" if value < 500 else "nonbinder")
        for (allele, value) in zip(df.allele, df.measurement_value)
    ]

    for i, (train, test) in enumerate(kf.split(df, df.key)):
        train_filename = args.output_pattern_train.format(i)
        test_filename = args.output_pattern_test.format(i)

        df.iloc[train].to_csv(train_filename, index=False)
        print("Wrote: %s" % abspath(train_filename))

        df.iloc[test].to_csv(test_filename, index=False)
        print("Wrote: %s" % abspath(test_filename))
# Script entry point: hand the command-line arguments (without the program
# name) to run().
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    run(cli_arguments)