Commit f73e73b2 authored by Tim O'Donnell

fix peptide encoding

parent efb02b5d
......@@ -35,6 +35,7 @@ time python curate.py \
bzip2 curated_training_data.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
......@@ -26,6 +26,7 @@ rm mhc_ligand_full.zip
bzip2 mhc_ligand_full.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
......@@ -28,6 +28,7 @@ wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.pub
wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
......@@ -34,6 +34,7 @@ time mhcflurry-class1-train-allele-specific-models \
--min-measurements-per-allele 100
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
......@@ -11,10 +11,47 @@ import pandas
import mhcnames
from ..encodable_sequences import EncodableSequences
from ..downloads import get_path
from .class1_neural_network import Class1NeuralNetwork
class LazyLoadingClass1NeuralNetwork(object):
@classmethod
def wrap(cls, instance):
if isinstance(instance, cls):
return instance
elif isinstance(instance, Class1NeuralNetwork):
return cls(model=instance)
raise TypeError("Unsupported type: %s" % instance)
@classmethod
def wrap_list(cls, lst):
return [
cls.wrap(instance)
for instance in lst
]
def __init__(self, model=None, config=None, weights_filename=None):
if model is None:
assert config is not None
assert weights_filename is not None
else:
assert config is None
assert weights_filename is None
self.model = model
self.config = config
self.weights_filename = weights_filename
@property
def instance(self):
if self.model is None:
self.model = Class1NeuralNetwork.from_config(self.config)
self.model.restore_weights(self.weights_filename)
return self.model
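
The LazyLoadingClass1NeuralNetwork wrapper above defers constructing the underlying Class1NeuralNetwork until its .instance property is first read, so loading a manifest of many models no longer restores every weights file up front. A minimal, self-contained sketch of the same pattern (illustrative names only, not mhcflurry's API):

class ExpensiveModel(object):
    """Stands in for a Class1NeuralNetwork whose weights live on disk."""
    load_count = 0

    def __init__(self, weights_filename):
        # Pretend this reads a large weights file.
        ExpensiveModel.load_count += 1
        self.weights_filename = weights_filename

    def predict(self, peptides):
        return [0.0 for _ in peptides]


class LazyModel(object):
    """Hold the recipe for a model; build it only on first access."""
    def __init__(self, weights_filename):
        self._model = None
        self._weights_filename = weights_filename

    @property
    def instance(self):
        if self._model is None:
            self._model = ExpensiveModel(self._weights_filename)
        return self._model


models = [LazyModel("weights_%d.npz" % i) for i in range(100)]
assert ExpensiveModel.load_count == 0            # nothing loaded yet
models[3].instance.predict(["SIINFEKL"])
assert ExpensiveModel.load_count == 1            # only the model we touched
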
class Class1AffinityPredictor(object):
def __init__(
self,
......@@ -31,9 +68,11 @@ class Class1AffinityPredictor(object):
if class1_pan_allele_models:
assert allele_to_pseudosequence, "Pseudosequences required"
self.allele_to_allele_specific_models = (
allele_to_allele_specific_models)
self.class1_pan_allele_models = class1_pan_allele_models
self.allele_to_allele_specific_models = dict(
(k, LazyLoadingClass1NeuralNetwork.wrap_list(v))
for (k, v) in allele_to_allele_specific_models.items())
self.class1_pan_allele_models = (
LazyLoadingClass1NeuralNetwork.wrap_list(class1_pan_allele_models))
self.allele_to_pseudosequence = allele_to_pseudosequence
if manifest_df is None:
......@@ -61,7 +100,7 @@ class Class1AffinityPredictor(object):
for (_, row) in sub_manifest_df.iterrows():
weights_path = self.weights_path(models_dir, row.model_name)
row.model.save_weights(weights_path)
row.model.instance.save_weights(weights_path)
print("Wrote: %s" % weights_path)
write_manifest_df = self.manifest_df[[
......@@ -75,7 +114,7 @@ class Class1AffinityPredictor(object):
def model_name(allele, num):
random_string = hashlib.sha1(
str(time.time()).encode()).hexdigest()[:16]
return "%s-%d-%s" % (allele, num, random_string)
return "%s-%d-%s" % (allele.upper(), num, random_string)
@staticmethod
def weights_path(models_dir, model_name):
......@@ -84,9 +123,11 @@ class Class1AffinityPredictor(object):
"weights_%s.%s" % (
model_name, Class1NeuralNetwork.weights_filename_extension))
@staticmethod
def load(models_dir, max_models=None):
def load(models_dir=None, max_models=None):
if models_dir is None:
models_dir = get_path("models_class1", "models")
manifest_path = join(models_dir, "manifest.csv")
manifest_df = pandas.read_csv(manifest_path, nrows=max_models)
......@@ -94,13 +135,11 @@ class Class1AffinityPredictor(object):
class1_pan_allele_models = []
all_models = []
for (_, row) in manifest_df.iterrows():
model = Class1NeuralNetwork.from_config(
json.loads(row.config_json))
weights_path = Class1AffinityPredictor.weights_path(
models_dir, row.model_name)
print("Loading model weights: %s" % weights_path)
model.restore_weights(weights_path)
model = LazyLoadingClass1NeuralNetwork(
config=json.loads(row.config_json),
weights_filename=Class1AffinityPredictor.weights_path(
models_dir, row.model_name)
)
if row.allele == "pan-class1":
class1_pan_allele_models.append(model)
else:
......@@ -157,17 +196,18 @@ class Class1AffinityPredictor(object):
models_list = []
for (i, model) in enumerate(models):
lazy_model = LazyLoadingClass1NeuralNetwork.wrap(model)
model_name = self.model_name(allele, i)
models_list.append(model) # models is a generator
row = pandas.Series(collections.OrderedDict([
("model_name", model_name),
("allele", allele),
("config_json", json.dumps(model.get_config())),
("model", model),
("model", lazy_model),
])).to_frame().T
self.manifest_df = pandas.concat(
[self.manifest_df, row], ignore_index=True)
self.allele_to_allele_specific_models[allele].append(model)
self.allele_to_allele_specific_models[allele].append(lazy_model)
if models_dir_for_save:
self.save(
models_dir_for_save, model_names_to_write=[model_name])
......@@ -191,16 +231,18 @@ class Class1AffinityPredictor(object):
architecture_hyperparameters=architecture_hyperparameters,
peptides=peptides,
affinities=affinities,
allele_pseudosequences=allele_pseudosequences)
allele_pseudosequences=allele_pseudosequences,
verbose=verbose)
for (i, model) in enumerate(models):
lazy_model = LazyLoadingClass1NeuralNetwork.wrap(model)
model_name = self.model_name("pan-class1", i)
self.class1_pan_allele_models.append(model)
self.class1_pan_allele_models.append(lazy_model)
row = pandas.Series(collections.OrderedDict([
("model_name", model_name),
("allele", "pan-class1"),
("config_json", json.dumps(model.get_config())),
("model", model),
("model", lazy_model),
])).to_frame().T
self.manifest_df = pandas.concat(
[self.manifest_df, row], ignore_index=True)
......@@ -265,7 +307,7 @@ class Class1AffinityPredictor(object):
encodable_peptides = EncodableSequences.create(
df.peptide.values)
for (i, model) in enumerate(self.class1_pan_allele_models):
df["model_pan_%d" % i] = model.predict(
df["model_pan_%d" % i] = model.instance.predict(
encodable_peptides,
allele_pseudosequences=allele_pseudosequences)
......@@ -276,8 +318,9 @@ class Class1AffinityPredictor(object):
df.ix[mask].peptide.values)
models = self.allele_to_allele_specific_models.get(allele, [])
for (i, model) in enumerate(models):
df.loc[mask, "model_single_%d" % i] = model.predict(
allele_peptides)
df.loc[
mask, "model_single_%d" % i
] = model.instance.predict(allele_peptides)
# Geometric mean
df_predictions = df[
......
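
A hypothetical usage sketch of the new default in load(): when models_dir is omitted it falls back to get_path("models_class1", "models"), i.e. the downloaded models (this assumes the models_class1 download is installed locally; weights are then restored lazily, one model at a time, as predictions are made).

from mhcflurry import Class1AffinityPredictor

# models_dir now defaults to the downloaded "models_class1" models.
predictor = Class1AffinityPredictor.load()

# Each underlying network's weights are restored on first use.
(ic50,) = predictor.predict(allele="HLA-A*01:01", peptides=["ESDPIVAQY"])
print(ic50)
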
......@@ -175,7 +175,7 @@ class Class1NeuralNetwork(object):
self.hyperparameters['random_negative_constant'])
num_random_negative = pandas.Series(num_random_negative)
logging.info("Random negative counts per length:\n%s" % (
str(num_random_negative)))
str(num_random_negative.to_dict())))
aa_distribution = None
if self.hyperparameters['random_negative_match_distribution']:
......@@ -185,7 +185,7 @@ class Class1NeuralNetwork(object):
'random_negative_distribution_smoothing'])
logging.info(
"Using amino acid distribution for random negative:\n%s" % (
str(aa_distribution)))
str(aa_distribution.to_dict())))
y_values = from_ic50(affinities)
assert numpy.isnan(y_values).sum() == 0, numpy.isnan(y_values).sum()
......@@ -224,6 +224,8 @@ class Class1NeuralNetwork(object):
sample_weights_with_random_negatives = numpy.concatenate([
numpy.ones(int(num_random_negative.sum())),
sample_weights])
else:
sample_weights_with_random_negatives = None
val_losses = []
min_val_loss_iteration = None
......@@ -239,12 +241,10 @@ class Class1NeuralNetwork(object):
count,
length=length,
distribution=aa_distribution))
random_negative_peptides_encodable = (
EncodableSequences.create(
random_negative_peptides_list))
random_negative_peptides_encoding = (
self.peptides_to_network_input(
random_negative_peptides_encodable))
random_negative_peptides_list))
x_dict_with_random_negatives = {
"peptide": numpy.concatenate([
random_negative_peptides_encoding,
......@@ -263,9 +263,8 @@ class Class1NeuralNetwork(object):
shuffle=True,
verbose=verbose,
epochs=1,
validation_split=self.hyperparameters[
'validation_split'],
sample_weight=sample_weights)
validation_split=self.hyperparameters['validation_split'],
sample_weight=sample_weights_with_random_negatives)
for (key, value) in fit_history.history.items():
self.loss_history[key].extend(value)
......
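
The added else branch matters because the fit call above now passes sample_weights_with_random_negatives instead of the original sample_weights. A minimal sketch (counts assumed for illustration) of the bookkeeping: the weight vector must line up element-for-element with the targets after the random-negative peptides are prepended.

import numpy

num_real = 1000              # measured affinities
num_random_negative = 250    # synthetic random-negative peptides

sample_weights = numpy.random.uniform(0.5, 1.0, num_real)

# Random negatives get weight 1.0 and come first, matching the ordering
# used when their encodings are concatenated onto the network inputs.
sample_weights_with_random_negatives = numpy.concatenate([
    numpy.ones(num_random_negative),
    sample_weights])

assert len(sample_weights_with_random_negatives) == num_random_negative + num_real
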
......@@ -65,8 +65,17 @@ def one_hot_encoding(index_encoded, alphabet_size):
result = numpy.zeros(
(num_sequences, sequence_length, alphabet_size),
dtype='int32')
for position in range(sequence_length):
result[:, position, index_encoded[:, position]] = 1
# Transform the index encoded array into an array of indices into the
# flattened result, which we will set to 1.
flattened_indices = (
index_encoded +
(
sequence_length * alphabet_size * numpy.arange(num_sequences)
).reshape((-1, 1)) +
numpy.tile(numpy.arange(sequence_length),
(num_sequences, 1)) * alphabet_size)
result.put(flattened_indices, 1)
return result
......
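
The replaced loop is the peptide-encoding bug this commit fixes: result[:, position, index_encoded[:, position]] = 1 fancy-indexes the last axis with an array of length num_sequences, so every sequence's letter at that position gets set in every row, cross-contaminating the encodings whenever the sequences differ. The new code instead computes the flat index seq*(sequence_length*alphabet_size) + position*alphabet_size + letter for each element and sets exactly those entries with a single put(). A small self-contained check (a sketch; one_hot_reference is an illustrative helper, not part of mhcflurry):

import numpy

def one_hot_reference(index_encoded, alphabet_size):
    # Obvious per-element loop, used only to compare the two approaches.
    num_sequences, sequence_length = index_encoded.shape
    result = numpy.zeros(
        (num_sequences, sequence_length, alphabet_size), dtype='int32')
    for i in range(num_sequences):
        for j in range(sequence_length):
            result[i, j, index_encoded[i, j]] = 1
    return result

index_encoded = numpy.array([
    [0, 0, 0, 0],   # "AAAA"
    [0, 1, 2, 0],   # "ABCA"
])
num_sequences, sequence_length = index_encoded.shape
alphabet_size = 3
reference = one_hot_reference(index_encoded, alphabet_size)

# Old approach: per-position fancy indexing sets every sequence's letter
# in every row, so "ABCA" pollutes the encoding of "AAAA".
buggy = numpy.zeros(
    (num_sequences, sequence_length, alphabet_size), dtype='int32')
for position in range(sequence_length):
    buggy[:, position, index_encoded[:, position]] = 1
assert not (buggy == reference).all()

# New approach: flat index = seq*(L*A) + position*A + letter, set via put().
flattened_indices = (
    index_encoded
    + (sequence_length * alphabet_size * numpy.arange(num_sequences)).reshape((-1, 1))
    + numpy.tile(numpy.arange(sequence_length), (num_sequences, 1)) * alphabet_size)
fixed = numpy.zeros(
    (num_sequences, sequence_length, alphabet_size), dtype='int32')
fixed.put(flattened_indices, 1)
assert (fixed == reference).all()
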
from mhcflurry import encodable_sequences
from nose.tools import eq_
from numpy.testing import assert_equal
letter_to_index_dict = {
'A': 0,
'B': 1,
'C': 2,
}
def test_index_and_one_hot_encoding():
index_encoding = encodable_sequences.index_encoding(
["AAAA", "ABCA"], letter_to_index_dict)
assert_equal(
index_encoding,
[
[0, 0, 0, 0],
[0, 1, 2, 0],
])
one_hot = encodable_sequences.one_hot_encoding(index_encoding, 3)
eq_(one_hot.shape, (2, 4, 3))
assert_equal(
one_hot[0],
[
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
])
assert_equal(
one_hot[1],
[
[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 0],
])
......@@ -12,27 +12,35 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import cProfile
import mhcflurry
import pandas
import mhcflurry.class1_affinity_prediction
import mhcflurry.class1_allele_specific_ensemble
from mhcflurry.downloads import get_path
from mhcflurry import Class1AffinityPredictor
predictors = [
mhcflurry.class1_affinity_prediction.get_downloaded_predictor(),
mhcflurry.class1_allele_specific_ensemble.get_downloaded_predictor(),
mhcflurry.class1_affinity_prediction.Class1AffinityPredictor.load(),
]
def predict_and_check(allele, peptide, expected_range=(0, 500)):
def predict_and_check(
allele,
peptide,
predictors=predictors,
expected_range=(0, 500)):
for predictor in predictors:
(prediction,) = predictor.predict_for_allele(allele, [peptide])
assert prediction >= expected_range[0], (predictor, prediction)
assert prediction <= expected_range[1], (predictor, prediction)
def debug():
print("\n%s" % (
predictor.predict_to_dataframe(
peptides=[peptide],
allele=allele,
include_individual_model_predictions=True)))
(prediction,) = predictor.predict(allele=allele, peptides=[peptide])
assert prediction >= expected_range[0], (predictor, prediction, debug())
assert prediction <= expected_range[1], (predictor, prediction, debug())
def test_A1_Titin_epitope():
def test_A1_Titin_epitope_downloaded_models():
# Test the A1 Titin epitope ESDPIVAQY from
# Identification of a Titin-Derived HLA-A1-Presented Peptide
# as a Cross-Reactive Target for Engineered MAGE A3-Directed
......@@ -40,7 +48,7 @@ def test_A1_Titin_epitope():
predict_and_check("HLA-A*01:01", "ESDPIVAQY")
def test_A1_MAGE_epitope():
def test_A1_MAGE_epitope_downloaded_models():
# Test the A1 MAGE epitope EVDPIGHLY from
# Identification of a Titin-Derived HLA-A1-Presented Peptide
# as a Cross-Reactive Target for Engineered MAGE A3-Directed
......@@ -48,13 +56,66 @@ def test_A1_MAGE_epitope():
predict_and_check("HLA-A*01:01", "EVDPIGHLY")
def test_A2_HIV_epitope():
def test_A1_trained_models():
allele = "HLA-A*01:01"
df = pandas.read_csv(
get_path(
"data_curated", "curated_training_data.csv.bz2"))
df = df.ix[
(df.allele == allele) &
(df.peptide.str.len() >= 8) &
(df.peptide.str.len() <= 15)
]
hyperparameters = {
"max_epochs": 500,
"patience": 10,
"early_stopping": True,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": False,
"kmer_size": 15,
"batch_normalization": False,
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.001,
"dropout_probability": 0.0
}
predictor = Class1AffinityPredictor()
predictor.fit_allele_specific_predictors(
n_models=2,
architecture_hyperparameters=hyperparameters,
allele=allele,
peptides=df.peptide.values,
affinities=df.measurement_value.values,
)
predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictors=[predictor])
def test_A2_HIV_epitope_downloaded_models():
# Test the A2 HIV epitope SLYNTVATL from
# The HIV-1 HLA-A2-SLYNTVATL Is a Help-Independent CTL Epitope
predict_and_check("HLA-A*02:01", "SLYNTVATL")
if __name__ == "__main__":
    test_A1_Titin_epitope_downloaded_models()
    test_A1_MAGE_epitope_downloaded_models()
    test_A1_trained_models()
    test_A2_HIV_epitope_downloaded_models()
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mhcflurry.peptide_encoding import (
all_kmers,
extend_peptide,
shorten_peptide,
fixed_length_from_many_peptides,
)
from nose.tools import eq_
def test_all_kmers():
kmers = all_kmers(2, alphabet=["A", "B"])
assert len(kmers) == 4, kmers
eq_(set(kmers), {"AA", "AB", "BA", "BB"})
def test_all_kmers_string_alphabet():
kmers = all_kmers(2, alphabet="AB")
assert len(kmers) == 4, kmers
eq_(set(kmers), {"AA", "AB", "BA", "BB"})
def test_extend_peptide_all_positions():
# insert 0 or 1 at every position
results = extend_peptide(
"111",
desired_length=4,
start_offset=0,
end_offset=0,
insert_amino_acid_letters="01")
expected = [
"0111",
"1111",
"1011",
"1111",
"1101",
"1111",
"1110",
"1111",
]
eq_(results, expected)
def test_shorten_peptide_all_positions():
# delete one character at every position
results = shorten_peptide(
"012",
desired_length=2,
start_offset=0,
end_offset=0,
insert_amino_acid_letters="012")
expected = [
"12",
"02",
"01"
]
eq_(results, expected)
def test_shorten_peptide_all_positions_except_first():
# delete one character at every position except the first
results = shorten_peptide(
"012",
desired_length=2,
start_offset=1,
end_offset=0,
insert_amino_acid_letters="012")
expected = [
"02",
"01",
]
eq_(results, expected)
def test_shorten_peptide_all_positions_except_last():
# delete one character at every position except the last
results = shorten_peptide(
"012",
desired_length=2,
start_offset=0,
end_offset=1,
insert_amino_acid_letters="012")
expected = [
"12",
"02",
]
eq_(results, expected)
def test_fixed_length_from_many_peptides():
kmers, original_indices, counts = fixed_length_from_many_peptides(
peptides=["ABC", "A"],
desired_length=2,
start_offset_extend=0,
end_offset_extend=0,
start_offset_shorten=0,
end_offset_shorten=0,
insert_amino_acid_letters="ABC")
print(kmers)
print(original_indices)
print(counts)
eq_(len(kmers), len(original_indices))
eq_(len(kmers), len(counts))
eq_(kmers, ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"])
eq_(original_indices, [0] * 3 + [1] * 6)
eq_(counts, [3, 3, 3, 6, 6, 6, 6, 6, 6])
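
A worked reading of the expected values above, inferred from the test itself: "ABC" is shortened to length 2 by deleting each of its three positions, "A" is extended by inserting each letter of "ABC" at each of its two insertion points, and counts records how many kmers each original peptide produced.

# Illustration only, restating the expectations of the test above.
kmers_from_ABC = ["BC", "AC", "AB"]                   # one deletion per position
kmers_from_A = ["AA", "BA", "CA", "AA", "AB", "AC"]   # 2 insertion points x 3 letters
counts = ([len(kmers_from_ABC)] * len(kmers_from_ABC)
          + [len(kmers_from_A)] * len(kmers_from_A))
assert counts == [3, 3, 3, 6, 6, 6, 6, 6, 6]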