Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from __future__ import absolute_import
from nose.tools import eq_
import mhcflurry
import fancyimpute
from mhcflurry.downloads import get_path
from mhcflurry.class1_allele_specific import (
cross_validation_folds,
train_across_models_and_folds)
from mhcflurry.class1_allele_specific.train import (
HYPERPARAMETER_DEFAULTS)
def test_imputation():
    """Check the folds built by cross_validation_folds with a MICE imputer.

    Folds are requested for two of the three alleles kept in the training
    data (n_folds=3). The test asserts that six folds come back, that they
    cover exactly the two requested alleles, and that the train,
    imputed-train and test subsets of every fold contain only that fold's
    allele.
    """
    imputer = fancyimpute.MICE(
        n_imputations=2,
        n_burn_in=1,
        n_nearest_columns=25)

    # Load the binding data file and keep three alleles; only two of them
    # are requested from cross_validation_folds below.
    data_path = get_path("data_kim2014", "bdata.2009.mhci.public.1.txt")
    full_dataset = mhcflurry.dataset.Dataset.from_csv(data_path)
    train_data = full_dataset.get_alleles(
        ["HLA-A0201", "HLA-A0202", "HLA-A0301"])

    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"],
        n_jobs=2,
        verbose=5)

    alleles_seen = {single_fold.allele for single_fold in folds}
    eq_(alleles_seen, {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    # Every subset of a fold must be restricted to the fold's own allele.
    for fold in folds:
        for subset_name in ("train", "imputed_train", "test"):
            eq_(getattr(fold, subset_name).unique_alleles(), {fold.allele})
def test_cross_validation_no_imputation():
    """Build folds without an imputer, train a small model grid, check AUC.

    Asserts that six folds covering exactly the two requested alleles are
    produced, that each fold's train and test subsets contain only that
    fold's allele, and that the mean ``test_auc`` of the training results
    exceeds 0.6.
    """
    # Load the binding data file and keep three alleles; only two of them
    # are requested from cross_validation_folds below.
    data_path = get_path("data_kim2014", "bdata.2009.mhci.public.1.txt")
    full_dataset = mhcflurry.dataset.Dataset.from_csv(data_path)
    train_data = full_dataset.get_alleles(
        ["HLA-A0201", "HLA-A0202", "HLA-A0301"])

    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=None,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"])

    alleles_seen = {single_fold.allele for single_fold in folds}
    eq_(alleles_seen, {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    # Every subset of a fold must be restricted to the fold's own allele.
    for fold in folds:
        for subset_name in ("train", "test"):
            eq_(getattr(fold, subset_name).unique_alleles(), {fold.allele})

    # Small grid: only the activation varies, the other settings are fixed.
    models = HYPERPARAMETER_DEFAULTS.models_grid(
        activation=["tanh", "relu"],
        layer_sizes=[[4]],
        embedding_output_dim=[8],
        n_training_epochs=[3])
    print(models)

    df = train_across_models_and_folds(folds, models, n_jobs=2, verbose=50)
    print(df)

    mean_test_auc = df.test_auc.mean()
    assert mean_test_auc > 0.6
def test_cross_validation_with_imputation():
    """Build folds with a MICE imputer, train a small model grid, check AUC.

    Asserts that six folds covering exactly the two requested alleles are
    produced, that each fold's train, imputed-train and test subsets contain
    only that fold's allele, and that the mean ``test_auc`` of the training
    results exceeds 0.6.
    """
    imputer = fancyimpute.MICE(
        n_imputations=2,
        n_burn_in=1,
        n_nearest_columns=25)

    # Load the binding data file and keep three alleles; only two of them
    # are requested from cross_validation_folds below.
    data_path = get_path("data_kim2014", "bdata.2009.mhci.public.1.txt")
    full_dataset = mhcflurry.dataset.Dataset.from_csv(data_path)
    train_data = full_dataset.get_alleles(
        ["HLA-A0201", "HLA-A0202", "HLA-A0301"])

    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"],
        n_jobs=3,
        verbose=5)

    alleles_seen = {single_fold.allele for single_fold in folds}
    eq_(alleles_seen, {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    # Every subset of a fold must be restricted to the fold's own allele.
    for fold in folds:
        for subset_name in ("train", "imputed_train", "test"):
            eq_(getattr(fold, subset_name).unique_alleles(), {fold.allele})

    # Small grid: only the activation varies, the other settings are fixed.
    models = HYPERPARAMETER_DEFAULTS.models_grid(
        activation=["tanh", "relu"],
        layer_sizes=[[4]],
        embedding_output_dim=[8],
        n_training_epochs=[3])
    print(models)

    df = train_across_models_and_folds(folds, models, n_jobs=3, verbose=5)
    print(df)

    mean_test_auc = df.test_auc.mean()
    assert mean_test_auc > 0.6