Skip to content
Snippets Groups Projects
Commit 956efdba authored by Alex Rubinsteyn
Browse files

adding test to make sure imputation improves predictive accuracy on a small...

adding test to make sure imputation improves predictive accuracy on a small allele, added allow_unknown_amino_acids to from_hyperparameters
parent e6bef3d1
No related merge requests found
......@@ -39,14 +39,14 @@ script:
# download training data
- script/download-kim-2013-dataset.sh
- script/download-iedb.sh
# only install data for A*01:01 and A*02:01 for testing
- script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01
# only install data for A*01:01, A*02:01, HLA-A*02:05 for testing
- script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05
- script/create-combined-class1-dataset.py
# only installing A0101 and A0201 for testing purposes and with very limited
# only installing A0101, A0201, A0205 for testing purposes and with very limited
# training
- >
mhcflurry-train-class1-allele-specific-models.py
--alleles HLA-A0101 HLA-A0201
--alleles HLA-A0101 HLA-A0201 HLA-A0205
--embedding-size 10
--hidden-layer-size 10
--training-epochs 100
......
......@@ -91,7 +91,8 @@ class Class1BindingPredictor(PredictorBase):
name=None,
max_ic50=MAX_IC50,
peptide_length=9,
embedding_input_dim=20,
n_amino_acids=20,
allow_unknown_amino_acids=True,
embedding_output_dim=20,
layer_sizes=[50],
activation="tanh",
......@@ -104,6 +105,7 @@ class Class1BindingPredictor(PredictorBase):
"""
Create untrained predictor with the given hyperparameters.
"""
embedding_input_dim = n_amino_acids + int(allow_unknown_amino_acids)
model = make_embedding_network(
peptide_length=peptide_length,
embedding_input_dim=embedding_input_dim,
......@@ -119,6 +121,7 @@ class Class1BindingPredictor(PredictorBase):
name=name,
max_ic50=max_ic50,
model=model,
allow_unknown_amino_acids=allow_unknown_amino_acids,
**kwargs)
def _combine_training_data(
......
......@@ -159,18 +159,12 @@ if __name__ == "__main__":
name=allele_name,
peptide_length=9,
max_ic50=args.max_ic50,
# 21 instead of 20 amino acids since we're also allowing
# an explicit unknown "X"
embedding_input_dim=21,
embedding_output_dim=args.embedding_size,
layer_sizes=(args.hidden_layer_size,),
activation=args.activation,
init=args.initialization,
dropout_probability=args.dropout,
learning_rate=args.learning_rate,
# this argument isn't a model hyperparameter but gets passed on to
# the initializer method
allow_unknown_amino_acids=True)
learning_rate=args.learning_rate)
json_filename = allele_name + ".json"
json_path = join(args.output_dir, json_filename)
......
from mhcflurry.imputation import (
create_imputed_datasets,
)
from mhcflurry.data import create_allele_data_from_peptide_to_ic50_dict
from mhcflurry.data import (
create_allele_data_from_peptide_to_ic50_dict,
load_allele_datasets
)
from mhcflurry.paths import CLASS1_DATA_CSV_PATH
from mhcflurry import Class1BindingPredictor
from fancyimpute import MICE
from nose.tools import eq_
......@@ -28,3 +33,20 @@ def test_create_imputed_datasets_two_alleles():
print(allele_name)
print(allele_data)
eq_(set(allele_data.peptides), expected_peptides)
def test_performance_improves_for_A0205_with_pretraining():
    """
    Fit two freshly created Class1BindingPredictor models on the "A0205"
    entry of the datasets loaded from CLASS1_DATA_CSV_PATH.

    Going by the test name, the intent is to check that imputation-based
    pretraining improves predictive accuracy after a small number of
    training iterations.

    NOTE(review): both predictors are currently fit on the same
    non-imputed data, no epoch count is passed to fit(), and nothing is
    asserted -- confirm whether the imputation step and the accuracy
    comparison are still to be added.
    """
    datasets_by_allele = load_allele_datasets(CLASS1_DATA_CSV_PATH)
    a0205_dataset = datasets_by_allele["A0205"]

    # baseline predictor, trained directly on the loaded A0205 data
    baseline_predictor = Class1BindingPredictor.from_hyperparameters(
        name="A0205-no-impute")
    baseline_predictor.fit(X=a0205_dataset.X_index, Y=a0205_dataset.Y)

    # predictor named "impute"; it receives the same inputs as the baseline
    imputed_predictor = Class1BindingPredictor.from_hyperparameters(
        name="A0205-impute")
    imputed_predictor.fit(X=a0205_dataset.X_index, Y=a0205_dataset.Y)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment