Skip to content
Snippets Groups Projects
Commit f110ea6e authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

moved ScoreSet out of experiments

parent deb4e546
No related branches found
No related tags found
No related merge requests found
...@@ -59,7 +59,7 @@ parser.add_argument( ...@@ -59,7 +59,7 @@ parser.add_argument(
parser.add_argument( parser.add_argument(
"--output-file", "--output-file",
default="imputation-accuracy-comparison.csv") default="matrix-completion-accuracy-results.csv")
parser.add_argument( parser.add_argument(
"--normalize-columns", "--normalize-columns",
......
...@@ -36,7 +36,7 @@ from mhcflurry import Class1BindingPredictor ...@@ -36,7 +36,7 @@ from mhcflurry import Class1BindingPredictor
from sklearn.cross_validation import StratifiedKFold from sklearn.cross_validation import StratifiedKFold
from dataset_paths import PETERS2009_CSV_PATH from dataset_paths import PETERS2009_CSV_PATH
from score_set import ScoreSet
from matrix_completion_helpers import load_data, evaluate_predictions from matrix_completion_helpers import load_data, evaluate_predictions
from arg_parsing import parse_int_list, parse_float_list, parse_string_list from arg_parsing import parse_int_list, parse_float_list, parse_string_list
...@@ -206,9 +206,11 @@ if __name__ == "__main__": ...@@ -206,9 +206,11 @@ if __name__ == "__main__":
df = pd.DataFrame(pMHC_affinity_matrix, columns=allele_list, index=peptide_list) df = pd.DataFrame(pMHC_affinity_matrix, columns=allele_list, index=peptide_list)
df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
scores = ScoreSet( if args.output_file:
index=[ output_file = open(args.output_file, "w")
fields = [
"allele", "allele",
"cv_fold",
"peptide_count", "peptide_count",
"sample_count", "sample_count",
"dropout_probability", "dropout_probability",
...@@ -216,8 +218,15 @@ if __name__ == "__main__": ...@@ -216,8 +218,15 @@ if __name__ == "__main__":
"hidden_layer_size1", "hidden_layer_size1",
"hidden_layer_size2", "hidden_layer_size2",
"activation" "activation"
]) "mae",
"tau",
"auc",
"f1"
]
header_line = ",".join(fields)
output_file.write(header_line + "\n")
else:
output_file = None
if args.unknown_amino_acids: if args.unknown_amino_acids:
index_encoding = amino_acids_with_unknown.index_encoding index_encoding = amino_acids_with_unknown.index_encoding
else: else:
...@@ -256,7 +265,7 @@ if __name__ == "__main__": ...@@ -256,7 +265,7 @@ if __name__ == "__main__":
for hidden_layer_size1 in args.first_hidden_layer_sizes: for hidden_layer_size1 in args.first_hidden_layer_sizes:
for hidden_layer_size2 in args.second_hidden_layer_sizes: for hidden_layer_size2 in args.second_hidden_layer_sizes:
for activation in args.activation_functions: for activation in args.activation_functions:
key = "%f,%d,%d,%d,%s" % ( key = "%0.2f,%d,%d,%d,%s" % (
dropout, dropout,
embedding_dim_size, embedding_dim_size,
hidden_layer_size1, hidden_layer_size1,
...@@ -353,25 +362,13 @@ if __name__ == "__main__": ...@@ -353,25 +362,13 @@ if __name__ == "__main__":
train_values_fold = [observed_values[i] for i in train_indices] train_values_fold = [observed_values[i] for i in train_indices]
train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)} train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)}
"""
if args.verbose:
print("Training peptides for CV fold %d/%d:" % (
fold_idx + 1,
args.n_folds))
for p in train_peptides_fold:
aff = train_dict_fold[p]
print("-- %s: %f (%f nM)" % (
p,
aff,
args.max_ic50 ** (1 - aff)))
"""
test_peptides = [observed_peptides[i] for i in test_indices] test_peptides = [observed_peptides[i] for i in test_indices]
test_values = [observed_values[i] for i in test_indices] test_values = [observed_values[i] for i in test_indices]
test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} test_dict = {k: v for (k, v) in zip(test_peptides, test_values)}
if imputer is None: if imputer is None:
X_pretrain = None X_pretrain = np.array([], dtype=int).reshape((0, 9))
Y_pretrain = None Y_pretrain = np.array([], dtype=float)
pretrain_sample_weights = None pretrain_sample_weights = np.array([], dtype=float)
else: else:
# drop the test peptides from the full matrix and then # drop the test peptides from the full matrix and then
# run completion on it to get synthesized affinities # run completion on it to get synthesized affinities
...@@ -485,10 +482,26 @@ if __name__ == "__main__": ...@@ -485,10 +482,26 @@ if __name__ == "__main__":
y_true=test_values, y_true=test_values,
y_pred=y_pred, y_pred=y_pred,
max_ic50=args.max_ic50) max_ic50=args.max_ic50)
scores.add_many(
("%s,%d,%d," % (allele, n_train_unique, n_train)) + key, cv_fold_field_values = [
mae=mae, allele,
tau=tau, str(fold_idx),
f1_score=f1_score, str(n_train_unique),
auc=auc) str(n_train),
scores.to_csv(args.output_file) ]
accuracy_field_values = [
"%0.4f" % mae,
"%0.4f" % tau,
"%0.4f" % auc,
"%0.4f" % f1_score
]
output_line = (
",".join(cv_fold_field_values) +
"," + key +
"," + ",".join(accuracy_field_values) +
"\n"
)
print("CV fold result: %s" % output_line)
if output_file:
output_file.write(output_line + "\n")
output_file.flush()
...@@ -35,27 +35,25 @@ from .paths import CLASS1_MODEL_DIRECTORY ...@@ -35,27 +35,25 @@ from .paths import CLASS1_MODEL_DIRECTORY
from .feedforward import make_embedding_network from .feedforward import make_embedding_network
from .predictor_base import PredictorBase from .predictor_base import PredictorBase
_allele_predictor_cache = {}
from .class1_allele_specific_hyperparameters import MAX_IC50 from .class1_allele_specific_hyperparameters import MAX_IC50
_allele_predictor_cache = {}
class Class1BindingPredictor(PredictorBase): class Class1BindingPredictor(PredictorBase):
def __init__( def __init__(
self, self,
model, model,
allele=None, name=None,
max_ic50=MAX_IC50, max_ic50=MAX_IC50,
allow_unknown_amino_acids=False, allow_unknown_amino_acids=False,
verbose=False): verbose=False):
PredictorBase.__init__( PredictorBase.__init__(
self, self,
name=allele, name=name,
max_ic50=max_ic50, max_ic50=max_ic50,
allow_unknown_amino_acids=allow_unknown_amino_acids, allow_unknown_amino_acids=allow_unknown_amino_acids,
verbose=verbose) verbose=verbose)
self.allele = allele self.name = name
self.model = model self.model = model
@classmethod @classmethod
...@@ -411,21 +409,15 @@ class Class1BindingPredictor(PredictorBase): ...@@ -411,21 +409,15 @@ class Class1BindingPredictor(PredictorBase):
return list(sorted(alleles)) return list(sorted(alleles))
def __repr__(self): def __repr__(self):
return "Class1BindingPredictor(allele=%s, model=%s, max_ic50=%f)" % ( return "Class1BindingPredictor(name=%s, model=%s, max_ic50=%f)" % (
self.allele, self.name,
self.model, self.model,
self.max_ic50) self.max_ic50)
def __str__(self): def __str__(self):
return repr(self) return repr(self)
def predict_9mer_peptides(self, peptides): def predict_encoded(self, X):
"""
Predict binding affinity for 9mer peptides
"""
if any(len(peptide) != 9 for peptide in peptides):
raise ValueError("Can only predict 9mer peptides")
X = self.encode_peptides(peptides)
max_expected_index = 20 if self.allow_unknown_amino_acids else 19 max_expected_index = 20 if self.allow_unknown_amino_acids else 19
assert X.max() <= max_expected_index, \ assert X.max() <= max_expected_index, \
"Got index %d in peptide encoding" % (X.max(),) "Got index %d in peptide encoding" % (X.max(),)
......
...@@ -12,5 +12,16 @@ ...@@ -12,5 +12,16 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
class Ensemble(object): class Ensemble(object):
def __init__(self, models, name=None):
\ No newline at end of file self.name = name
self.models = models
@classmethod
def from_directory(cls, directory_path):
files = os.listdir(directory_path)
...@@ -240,7 +240,6 @@ def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): ...@@ -240,7 +240,6 @@ def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
(n_samples, peptide_length) = X.shape (n_samples, peptide_length) = X.shape
if not n_indices: if not n_indices:
n_indices = X.max() - first_index_value + 1 n_indices = X.max() - first_index_value + 1
X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool)
for i, row in enumerate(X): for i, row in enumerate(X):
for j, xij in enumerate(row): for j, xij in enumerate(row):
...@@ -285,7 +284,7 @@ def fixed_length_index_encoding( ...@@ -285,7 +284,7 @@ def fixed_length_index_encoding(
insert_letters = ["X"] insert_letters = ["X"]
index_encoding = amino_acids_with_unknown.index_encoding index_encoding = amino_acids_with_unknown.index_encoding
else: else:
insert_letters = common_amino_acids.letters() insert_letters = common_amino_acid_letters
index_encoding = common_amino_acids.index_encoding index_encoding = common_amino_acids.index_encoding
fixed_length, original, counts = fixed_length_from_many_peptides( fixed_length, original, counts = fixed_length_from_many_peptides(
......
...@@ -12,13 +12,12 @@ ...@@ -12,13 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np from collections import defaultdict
from itertools import groupby import numpy as np
from .peptide_encoding import fixed_length_from_many_peptides from .peptide_encoding import fixed_length_index_encoding
from .amino_acid import ( from .amino_acid import (
common_amino_acid_letters,
amino_acids_with_unknown, amino_acids_with_unknown,
common_amino_acids common_amino_acids
) )
...@@ -53,6 +52,42 @@ class PredictorBase(object): ...@@ -53,6 +52,42 @@ class PredictorBase(object):
else: else:
return common_amino_acids.index_encoding(peptides, 9) return common_amino_acids.index_encoding(peptides, 9)
def encode_peptides(self, peptides):
"""
Parameters
----------
peptides : str list
Peptide strings of any length
Encode peptides of any length into fixed length vectors.
Returns 2d array of encoded peptides and 1d array indicating the
original peptide index for each row.
"""
indices = []
encoded_matrices = []
for i, peptide in enumerate(peptides):
matrix, _, _ = fixed_length_index_encoding(
peptides=[peptide],
desired_length=9,
allow_unknown_amino_acids=self.allow_unknown_amino_acids)
encoded_matrices.append(matrix)
indices.extend([i] * len(matrix))
combined_matrix = np.concatenate(encoded_matrices)
index_array = np.array(indices)
expected_shape = (len(index_array), 9)
assert combined_matrix.shape == expected_shape, \
"Expected shape %s but got %s" % (expected_shape, combined_matrix.shape)
return combined_matrix, index_array
def predict_9mer_peptides(self, peptides):
"""
Predict binding affinity for 9mer peptides
"""
if any(len(peptide) != 9 for peptide in peptides):
raise ValueError("Can only predict 9mer peptides")
X, _ = self.encode_peptides(peptides)
return self.predict_encoded(X)
def predict_9mer_peptides_ic50(self, peptides): def predict_9mer_peptides_ic50(self, peptides):
return self.log_to_ic50(self.predict_9mer_peptides(peptides)) return self.log_to_ic50(self.predict_9mer_peptides(peptides))
...@@ -63,6 +98,10 @@ class PredictorBase(object): ...@@ -63,6 +98,10 @@ class PredictorBase(object):
return self.log_to_ic50( return self.log_to_ic50(
self.predict_peptides(peptides)) self.predict_peptides(peptides))
def predict_encoded(self, X):
raise ValueError("Not yet implemented for %s!" % (
self.__class__.__name__,))
def predict_peptides( def predict_peptides(
self, self,
peptides, peptides,
...@@ -75,33 +114,18 @@ class PredictorBase(object): ...@@ -75,33 +114,18 @@ class PredictorBase(object):
amino acid characters. The prediction for a single peptide will be amino acid characters. The prediction for a single peptide will be
the average of expanded 9mers. the average of expanded 9mers.
""" """
results_dict = {} input_matrix, original_peptide_indices = self.encode_peptides(peptides)
for length, group_peptides in groupby(peptides, lambda x: len(x)): # non-9mer peptides get multiple predictions, which are then combined
group_peptides = list(group_peptides) # with the combine_fn argument
expanded_peptides, _, _ = fixed_length_from_many_peptides( multiple_predictions_dict = defaultdict(list)
peptides=group_peptides, fixed_length_predictions = self.predict_encoded(input_matrix)
desired_length=9, for i, yi in enumerate(fixed_length_predictions):
insert_amino_acid_letters=( original_peptide_index = original_peptide_indices[i]
["X"] if self.allow_unknown_amino_acids original_peptide = peptides[original_peptide_index]
else common_amino_acid_letters)) multiple_predictions_dict[original_peptide].append(yi)
n_group = len(group_peptides) combined_predictions_dict = {
n_expanded = len(expanded_peptides) p: combine_fn(ys)
expansion_factor = int(n_expanded / n_group) for (p, ys) in multiple_predictions_dict.items()
raw_y = self.predict_9mer_peptides(expanded_peptides) }
if expansion_factor == 1: return np.array([combined_predictions_dict[p] for p in peptides])
log_ic50s = raw_y
else:
# if peptides were a different length than the predictor's
# expected input length, then let's take the median prediction
# of each expanded peptide set
log_ic50s = np.zeros(n_group)
# take the median of each group of log(IC50) values
for i in range(n_group):
start = i * expansion_factor
end = (i + 1) * expansion_factor
log_ic50s[i] = combine_fn(raw_y[start:end])
assert len(group_peptides) == len(log_ic50s)
for peptide, log_ic50 in zip(group_peptides, log_ic50s):
results_dict[peptide] = log_ic50
return np.array([results_dict[p] for p in peptides])
File moved
...@@ -29,33 +29,39 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME) ...@@ -29,33 +29,39 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--ic50-fraction-tolerance", parser.add_argument(
"--ic50-fraction-tolerance",
default=0.01, default=0.01,
type=float, type=float,
help=( help=(
"How much can the IEDB and NetMHCpan IC50 differ and still be" "How much can the IEDB and NetMHCpan IC50 differ and still be"
" considered compatible (as a fraction of the NetMHCpan value)")) " considered compatible (as a fraction of the NetMHCpan value)"))
parser.add_argument("--min-assay-overlap-size", parser.add_argument(
"--min-assay-overlap-size",
type=int, type=int,
default=1, default=1,
help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data") help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data")
parser.add_argument("--min-assay-fraction-same", parser.add_argument(
"--min-assay-fraction-same",
type=float, type=float,
help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data", help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data",
default=0.9) default=0.9)
parser.add_argument("--iedb-pickle-path", parser.add_argument(
"--iedb-pickle-path",
default=IEDB_PICKLE_PATH, default=IEDB_PICKLE_PATH,
help="Path to .pickle file containing dictionary of IEDB assay datasets") help="Path to .pickle file containing dictionary of IEDB assay datasets")
parser.add_argument("--netmhcpan-csv-path", parser.add_argument(
"--netmhcpan-csv-path",
default=PETERS_CSV_PATH, default=PETERS_CSV_PATH,
help="Path to CSV with NetMHCpan dataset from 2013 Peters paper") help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
parser.add_argument("--output-csv-path", parser.add_argument(
"--output-csv-path",
default=CLASS1_DATA_CSV_PATH, default=CLASS1_DATA_CSV_PATH,
help="Path to CSV of combined assay results") help="Path to CSV of combined assay results")
......
...@@ -82,7 +82,13 @@ def test_encode_peptides_9mer(): ...@@ -82,7 +82,13 @@ def test_encode_peptides_9mer():
predictor = Class1BindingPredictor( predictor = Class1BindingPredictor(
model=Dummy9merIndexEncodingModel(), model=Dummy9merIndexEncodingModel(),
allow_unknown_amino_acids=True) allow_unknown_amino_acids=True)
X = predictor.encode_peptides(["AAASSSYYY"]) X = predictor.encode_9mer_peptides(["AAASSSYYY"])
assert X.shape[0] == 1, X.shape
assert X.shape[1] == 9, X.shape
X, indices = predictor.encode_peptides(["AAASSSYYY"])
assert len(indices) == 1
assert indices[0] == 0
assert X.shape[0] == 1, X.shape assert X.shape[0] == 1, X.shape
assert X.shape[1] == 9, X.shape assert X.shape[1] == 9, X.shape
...@@ -91,7 +97,9 @@ def test_encode_peptides_8mer(): ...@@ -91,7 +97,9 @@ def test_encode_peptides_8mer():
predictor = Class1BindingPredictor( predictor = Class1BindingPredictor(
model=Dummy9merIndexEncodingModel(), model=Dummy9merIndexEncodingModel(),
allow_unknown_amino_acids=True) allow_unknown_amino_acids=True)
X = predictor.encode_peptides(["AAASSSYY"]) X, indices = predictor.encode_peptides(["AAASSSYY"])
assert len(indices) == 9
assert (indices == 0).all()
assert X.shape[0] == 9, (X.shape, X) assert X.shape[0] == 9, (X.shape, X)
assert X.shape[1] == 9, (X.shape, X) assert X.shape[1] == 9, (X.shape, X)
...@@ -100,6 +108,8 @@ def test_encode_peptides_10mer(): ...@@ -100,6 +108,8 @@ def test_encode_peptides_10mer():
predictor = Class1BindingPredictor( predictor = Class1BindingPredictor(
model=Dummy9merIndexEncodingModel(), model=Dummy9merIndexEncodingModel(),
allow_unknown_amino_acids=True) allow_unknown_amino_acids=True)
X = predictor.encode_peptides(["AAASSSYYFF"]) X, indices = predictor.encode_peptides(["AAASSSYYFF"])
assert len(indices) == 10
assert (indices == 0).all()
assert X.shape[0] == 10, (X.shape, X) assert X.shape[0] == 10, (X.shape, X)
assert X.shape[1] == 9, (X.shape, X) assert X.shape[1] == 9, (X.shape, X)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment