Skip to content
Snippets Groups Projects
Commit 7d17bedc authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

Merge pull request #19 from hammerlab/move-imputation-from-experiments

moving helpers for performing dataset imputations from experiments 
parents 00b17305 ac95b1f1
No related branches found
No related tags found
No related merge requests found
[![Build Status](https://travis-ci.org/hammerlab/mhcflurry.svg?branch=master)](https://travis-ci.org/hammerlab/mhcflurry) [![Coverage Status](https://coveralls.io/repos/github/hammerlab/mhcflurry/badge.svg?branch=fix-training-script)](https://coveralls.io/github/hammerlab/mhcflurry?branch=fix-training-script)
[![Build Status](https://travis-ci.org/hammerlab/mhcflurry.svg?branch=master)](https://travis-ci.org/hammerlab/mhcflurry) [![Coverage Status](https://coveralls.io/repos/github/hammerlab/mhcflurry/badge.svg?branch=master)](https://coveralls.io/github/hammerlab/mhcflurry?branch=master)
# mhcflurry
Peptide-MHC binding affinity prediction
......
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,7 +17,7 @@ from __future__ import (
division,
absolute_import,
)
import numpy as np
def parse_int_list(s):
return [int(part.strip() for part in s.split(","))]
......@@ -65,3 +65,37 @@ def split_allele_names(s):
for part
in s.split(",")
]
def ic50_to_regression_target(ic50, max_ic50):
"""
Transform IC50 inhibitory binding concentrations to affinity values between
[0,1] where 0 means a value greater or equal to max_ic50 and 1 means very
strong binder.
Parameters
----------
ic50 : numpy.ndarray
max_ic50 : float
"""
log_ic50 = np.log(ic50) / np.log(max_ic50)
regression_target = 1.0 - log_ic50
# clamp to values between 0, 1
regression_target = np.maximum(regression_target, 0.0)
regression_target = np.minimum(regression_target, 1.0)
return regression_target
def regression_target_to_ic50(y, max_ic50):
"""
Transform values between [0,1] to IC50 inhibitory binding concentrations
between [1.0, infinity]
Parameters
----------
y : numpy.ndarray of float
max_ic50 : float
Returns numpy.ndarray
"""
return max_ic50 ** (1.0 - y)
......@@ -22,7 +22,7 @@ from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
from .common import normalize_allele_name
from .common import normalize_allele_name, ic50_to_regression_target
from .amino_acid import common_amino_acids
from .peptide_encoding import (
indices_to_hotshot_encoding,
......@@ -104,8 +104,7 @@ def load_dataframe(
only_human : bool
Only load entries from human MHC alleles
Returns DataFrame augmented with extra columns:
- "log_ic50" : log(ic50) / log(max_ic50)
Returns DataFrame augmented with extra column:
- "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1]
"""
if sep is None:
......@@ -138,13 +137,7 @@ def load_dataframe(
df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
ic50 = np.array(df[ic50_column_name])
log_ic50 = np.log(ic50) / np.log(max_ic50)
df["log_ic50"] = log_ic50
regression_output = 1.0 - log_ic50
# clamp to values between 0, 1
regression_output = np.maximum(regression_output, 0.0)
regression_output = np.minimum(regression_output, 1.0)
df["regression_output"] = regression_output
df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50)
return df, peptide_column_name
......@@ -278,6 +271,57 @@ def encode_peptide_to_affinity_dict(
n_samples, len(Y))
return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
def create_allele_data_from_peptide_to_ic50_dict(
peptide_to_ic50_dict,
max_ic50=MAX_IC50,
kmer_length=9,
flatten_binary_encoding=True):
"""
Parameters
----------
peptide_to_ic50_dict : dict
Dictionary mapping peptides of different lengths to IC50 binding
affinity values.
max_ic50 : float
Maximum IC50 value used as the cutoff for affinity of 0.0 when
transforming from IC50 to regression targets.
kmer_length : int
What length substrings will be fed to a fixed-length predictor?
flatten_binary_encoding : bool
Should hotshot encodings of amino acid inputs be flattened into a 1D
vector or have two dimensions (where the first represents position)?
Return an AlleleData object.
"""
Y_dict = {
peptide: ic50_to_regression_target(ic50, max_ic50)
for (peptide, ic50)
in peptide_to_ic50_dict.items()
}
(kmer_peptides, original_peptides, counts, X_index, X_binary, Y_kmer) = \
encode_peptide_to_affinity_dict(
Y_dict,
peptide_length=kmer_length,
flatten_binary_encoding=flatten_binary_encoding)
ic50_array = np.array([peptide_to_ic50_dict[p] for p in original_peptides])
assert len(kmer_peptides) == len(ic50_array), \
"Mismatch between # of peptides %d and # IC50 outputs %d" % (
len(kmer_peptides), len(ic50_array))
return AlleleData(
X_index=X_index,
X_binary=X_binary,
Y=Y_kmer,
ic50=ic50_array,
peptides=kmer_peptides,
original_peptides=original_peptides,
original_lengths=[len(peptide) for peptide in original_peptides],
substring_counts=counts,
weights=1.0 / counts)
def load_allele_datasets(
filename,
......@@ -371,35 +415,9 @@ def load_allele_datasets(
for (peptide, ic50)
in zip(raw_peptides, group[ic50_column_name])
}
Y_dict = {
peptide: y
for (peptide, y)
in zip(raw_peptides, group["regression_output"])
}
(kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \
encode_peptide_to_affinity_dict(
Y_dict,
peptide_length=peptide_length,
flatten_binary_encoding=flatten_binary_encoding)
ic50 = np.array([ic50_dict[p] for p in original_peptides])
assert len(kmer_peptides) == len(ic50), \
"Mismatch between # of peptides %d and # IC50 outputs %d" % (
len(kmer_peptides), len(ic50))
allele_groups[allele] = AlleleData(
X_index=X_index,
X_binary=X_binary,
Y=Y,
ic50=ic50,
peptides=kmer_peptides,
original_peptides=original_peptides,
original_lengths=[len(peptide) for peptide in original_peptides],
substring_counts=counts,
weights=1.0 / counts)
allele_groups[allele] = create_allele_data_from_peptide_to_ic50_dict(
ic50_dict,
max_ic50=max_ic50)
return allele_groups
......
......@@ -27,28 +27,6 @@ import theano
theano.config.exception_verbosity = 'high'
def compile_forward_predictor(model, theano_mode=None):
"""
In cases where we want to get predictions from a model that hasn't
been compiled (to avoid overhead of compiling training code),
use this helper to only compile the subset of Theano needed for
forward-propagation/predictions.
"""
model.X_test = model.get_input(train=False)
model.y_test = model.get_output(train=False)
if type(model.X_test) == list:
predict_ins = model.X_test
else:
predict_ins = [model.X_test]
model._predict = theano.function(
predict_ins,
model.y_test,
allow_input_downcast=True,
mode=theano_mode)
def make_network(
input_size,
embedding_input_dim=None,
......@@ -61,8 +39,7 @@ def make_network(
dropout_probability=0.0,
model=None,
optimizer=None,
learning_rate=0.001,
compile_for_training=True):
learning_rate=0.001):
if model is None:
model = Sequential()
......@@ -112,16 +89,13 @@ def make_network(
output_dim=1,
init=init))
model.add(Activation(output_activation))
if compile_for_training:
model.compile(loss=loss, optimizer=optimizer)
else:
compile_forward_predictor(model)
model.compile(loss=loss, optimizer=optimizer)
return model
def make_hotshot_network(
peptide_length=9,
layer_sizes=[500],
layer_sizes=[100],
activation="relu",
init="lecun_uniform",
loss="mse",
......@@ -146,7 +120,7 @@ def make_embedding_network(
peptide_length=9,
embedding_input_dim=20,
embedding_output_dim=20,
layer_sizes=[500],
layer_sizes=[100],
activation="relu",
init="lecun_uniform",
loss="mse",
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import (
print_function,
division,
absolute_import,
)
from collections import defaultdict
import numpy as np
from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary
from .data import (
create_allele_data_from_peptide_to_ic50_dict,
)
def prune_dense_matrix_and_labels(
X,
peptide_list,
allele_list,
min_observations_per_peptide=1,
min_observations_per_allele=1):
"""
Filter the dense matrix of pMHC binding affinities according to
the given minimum number of row/column observations.
Parameters
----------
X : numpy.ndarray
Incomplete dense matrix of pMHC affinity with n_peptides rows and
n_alleles columns.
peptide_list : list of str
Expected to have n_peptides entries
allele_list : list of str
Expected to have n_alleles entries
min_observations_per_peptide : int
Drop peptide rows with fewer than this number of observed values.
min_observations_per_allele : int
Drop allele columns with fewer than this number of observed values.
"""
observed_mask = np.isfinite(X)
n_observed_per_peptide = observed_mask.sum(axis=1)
too_few_peptide_observations = (
n_observed_per_peptide < min_observations_per_peptide)
if too_few_peptide_observations.any():
drop_peptide_indices = np.where(too_few_peptide_observations)[0]
keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
print("Dropping %d peptides with <%d observations" % (
len(drop_peptide_indices),
min_observations_per_peptide))
X = X[keep_peptide_indices]
observed_mask = observed_mask[keep_peptide_indices]
peptide_list = [peptide_list[i] for i in keep_peptide_indices]
n_observed_per_allele = observed_mask.sum(axis=0)
too_few_allele_observations = (
n_observed_per_allele < min_observations_per_peptide)
if too_few_peptide_observations.any():
drop_allele_indices = np.where(too_few_allele_observations)[0]
keep_allele_indices = np.where(~too_few_allele_observations)[0]
print("Dropping %d alleles with <%d observations: %s" % (
len(drop_allele_indices),
min_observations_per_allele,
[allele_list[i] for i in drop_allele_indices]))
X = X[:, keep_allele_indices]
observed_mask = observed_mask[:, keep_allele_indices]
allele_list = [allele_list[i] for i in keep_allele_indices]
return X, peptide_list, allele_list
def create_incomplete_dense_pMHC_matrix(
allele_data_dict,
min_observations_per_peptide=1,
min_observations_per_allele=1):
"""
Given a dictionary mapping each allele name onto an AlleleData object,
returns a tuple with a dense matrix of affinities, a list of peptide labels
for each row and a list of allele labels for each column.
Parameters
----------
allele_data_dict : dict
Dictionary mapping allele names to AlleleData objects
imputer : object
Expected to have a method imputer.complete(X) which takes an array
with missing entries marked by NaN and returns a completed array.
min_observations_per_peptide : int
Drop peptide rows with fewer than this number of observed values.
min_observations_per_allele : int
Drop allele columns with fewer than this number of observed values.
"""
peptide_to_allele_to_affinity_dict = defaultdict(dict)
for (allele_name, allele_data) in allele_data_dict.items():
for peptide, affinity in zip(
allele_data.original_peptides,
allele_data.Y):
if allele_name not in peptide_to_allele_to_affinity_dict[peptide]:
peptide_to_allele_to_affinity_dict[peptide][allele_name] = affinity
n_binding_values = sum(
len(allele_dict)
for allele_dict in
allele_data_dict.values()
)
print("Collected %d binding values for %d alleles" % (
n_binding_values,
len(peptide_to_allele_to_affinity_dict)))
X, peptide_list, allele_list = \
dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity_dict)
return prune_dense_matrix_and_labels(
X,
peptide_list,
allele_list,
min_observations_per_peptide=min_observations_per_peptide,
min_observations_per_allele=min_observations_per_allele)
def dense_pMHC_matrix_to_nested_dict(X, peptide_list, allele_list):
"""
Converts a dense matrix of (n_peptides, n_alleles) floats to a nested
dictionary from allele -> peptide -> affinity.
"""
allele_to_peptide_to_ic50_dict = defaultdict(dict)
for row_index, peptide in enumerate(peptide_list):
for column_index, allele_name in enumerate(allele_list):
affinity = X[row_index, column_index]
if np.isfinite(affinity):
allele_to_peptide_to_ic50_dict[allele_name][peptide] = affinity
return allele_to_peptide_to_ic50_dict
def create_imputed_datasets(
allele_data_dict,
imputer,
min_observations_per_peptide=1,
min_observations_per_allele=1):
"""
Parameters
----------
allele_data_dict : dict
Dictionary mapping allele names to AlleleData objects
imputer : object
Expected to have a method imputer.complete(X) which takes an array
with missing entries marked by NaN and returns a completed array.
min_observations_per_peptide : int
Drop peptide rows with fewer than this number of observed values.
min_observations_per_allele : int
Drop allele columns with fewer than this number of observed values.
Returns dictionary mapping allele names to AlleleData objects containing
imputed affinity values.
"""
X_incomplete, peptide_list, allele_list = create_incomplete_dense_pMHC_matrix(
allele_data_dict=allele_data_dict,
min_observations_per_peptide=min_observations_per_peptide,
min_observations_per_allele=min_observations_per_allele)
if np.isnan(X_incomplete).sum() == 0:
# if all entries in the matrix are already filled in then don't
# try using an imputation algorithm since it might raise an
# exception.
X_complete = X_incomplete
else:
X_complete = imputer.complete(X_incomplete)
allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict(
X=X_complete,
peptide_list=peptide_list,
allele_list=allele_list)
return {
allele_name: create_allele_data_from_peptide_to_ic50_dict(allele_dict)
for (allele_name, allele_dict)
in allele_to_peptide_to_affinity_dict.items()
}
__version__ = "0.0.2"
__version__ = "0.0.3"
......@@ -17,11 +17,19 @@ from collections import OrderedDict
import pandas as pd
from .class1_binding_predictor import Class1BindingPredictor
from .common import normalize_allele_name
def predict(alleles, peptides):
allele_results = OrderedDict([])
result_dict = OrderedDict([
("allele", []),
("peptide", []),
("ic50", []),
])
for allele in alleles:
allele = normalize_allele_name(allele)
model = Class1BindingPredictor.from_allele_name(allele)
result_dictionary = model.predict_peptides(peptides)
allele_results.append(result_dictionary)
return pd.concat(allele_results)
for i, ic50 in enumerate(model.predict_peptides_ic50(peptides)):
result_dict["allele"].append(allele)
result_dict["peptide"].append(peptides[i])
result_dict["ic50"].append(ic50)
return pd.DataFrame(result_dict)
......@@ -22,6 +22,7 @@ from .amino_acid import (
amino_acids_with_unknown,
common_amino_acids
)
from .common import regression_target_to_ic50
class PredictorBase(object):
......@@ -40,13 +41,6 @@ class PredictorBase(object):
self.allow_unknown_amino_acids = allow_unknown_amino_acids
self.verbose = verbose
def log_to_ic50(self, log_value):
"""
Convert neural network output to IC50 values between 0.0 and
self.max_ic50 (typically 5000, 20000 or 50000)
"""
return self.max_ic50 ** (1.0 - log_value)
def encode_9mer_peptides(self, peptides):
if self.allow_unknown_amino_acids:
return amino_acids_with_unknown.index_encoding(peptides, 9)
......@@ -90,14 +84,15 @@ class PredictorBase(object):
return self.predict(X)
def predict_9mer_peptides_ic50(self, peptides):
return self.log_to_ic50(self.predict_9mer_peptides(peptides))
scores = self.predict_9mer_peptides(peptides)
return regression_target_to_ic50(scores, max_ic50=self.max_ic50)
def predict_peptides_ic50(self, peptides):
"""
Predict IC50 affinities for peptides of any length
"""
return self.log_to_ic50(
self.predict_peptides(peptides))
scores = self.predict_peptides(peptides)
return regression_target_to_ic50(scores, max_ic50=self.max_ic50)
def predict(self, X):
raise ValueError("Method 'predict' not yet implemented for %s!" % (
......
......@@ -7,4 +7,4 @@ h5py
cherrypy
bottle
fancyimpute
scikit-learn
scikit-learn
\ No newline at end of file
from nose.tools import eq_
from mhcflurry.data import (
create_allele_data_from_peptide_to_ic50_dict,
AlleleData
)
def test_create_allele_data_from_peptide_to_ic50_dict():
peptide_to_ic50_dict = {
("A" * 10): 1.2,
("C" * 9): 1000,
}
allele_data = create_allele_data_from_peptide_to_ic50_dict(
peptide_to_ic50_dict,
max_ic50=50000.0)
assert isinstance(allele_data, AlleleData)
expected_peptides = set([
"A" * 9,
"C" * 9,
])
peptides = set(allele_data.peptides)
eq_(expected_peptides, peptides)
from mhcflurry.common import (
ic50_to_regression_target,
regression_target_to_ic50,
)
from nose.tools import eq_
def test_regression_target_to_ic50():
eq_(regression_target_to_ic50(0, max_ic50=500.0), 500)
eq_(regression_target_to_ic50(1, max_ic50=500.0), 1.0)
def test_ic50_to_regression_target():
eq_(ic50_to_regression_target(5000, max_ic50=5000.0), 0)
eq_(ic50_to_regression_target(0, max_ic50=5000.0), 1.0)
from mhcflurry.imputation import (
create_imputed_datasets,
)
from mhcflurry.data import create_allele_data_from_peptide_to_ic50_dict
from fancyimpute import MICE
from nose.tools import eq_
def test_create_imputed_datasets_empty():
result = create_imputed_datasets({}, imputer=MICE(n_imputations=25))
eq_(result, {})
def test_create_imputed_datasets_two_alleles():
allele_data_dict = {
"HLA-A*02:01": create_allele_data_from_peptide_to_ic50_dict({
"A" * 9: 20.0,
"C" * 9: 40000.0,
}),
"HLA-A*02:05": create_allele_data_from_peptide_to_ic50_dict({
"S" * 9: 500.0,
"A" * 9: 25.0,
}),
}
result = create_imputed_datasets(allele_data_dict, imputer=MICE(n_imputations=25))
eq_(set(result.keys()), {"HLA-A*02:01", "HLA-A*02:05"})
expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
for allele_name, allele_data in result.items():
print(allele_name)
print(allele_data)
eq_(set(allele_data.peptides), expected_peptides)
from mhcflurry.feedforward import (
make_embedding_network,
make_hotshot_network,
)
import numpy as np
def test_make_embedding_network():
nn = make_embedding_network(
peptide_length=3,
layer_sizes=[3],
activation="tanh",
embedding_input_dim=3,
embedding_output_dim=20,
learning_rate=0.1)
X_negative = np.array([
[0] * 3,
[1] * 3,
[1, 0, 0],
[1, 1, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 1],
])
X_positive = np.array([
[0, 2, 0],
[1, 2, 0],
[1, 2, 1],
[0, 2, 1],
[2, 2, 0],
[2, 2, 1],
[2, 2, 2],
])
X_index = np.vstack([X_negative, X_positive])
Y = np.array([0.0] * len(X_negative) + [1.0] * len(X_positive))
nn.fit(X_index, Y, nb_epoch=10)
Y_pred = nn.predict(X_index)
print(Y)
print(Y_pred)
for (Y_i, Y_pred_i) in zip(Y, Y_pred):
assert abs(Y_i - Y_pred_i) <= 0.25, (Y_i, Y_pred_i)
def test_make_hotshot_network():
nn = make_hotshot_network(
peptide_length=3,
activation="relu",
layer_sizes=[4],
n_amino_acids=2,
learning_rate=0.1)
X_binary = np.array([
[True, False, True, False, True, False],
[True, False, True, False, False, True],
[True, False, False, True, True, False],
[True, False, False, True, False, True],
[False, True, True, False, True, False],
[False, True, True, False, False, True],
], dtype=bool)
Y = np.array([0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
nn.fit(X_binary, Y, nb_epoch=10)
Y_pred = nn.predict(X_binary)
print(Y)
print(Y_pred)
for (Y_i, Y_pred_i) in zip(Y, Y_pred):
assert abs(Y_i - Y_pred_i) <= 0.25, (Y_i, Y_pred_i)
from mhcflurry import predict
def test_predict_A1_Titin_epitope():
result_df = predict(alleles=["HLA-A*01:01"], peptides=["ESDPIVAQY"])
assert len(result_df) == 1
row = result_df.ix[0]
ic50 = row["ic50"]
assert ic50 <= 500
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment