From f110ea6ee1b9c934b90d0df293d54ef52f1e6973 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Wed, 17 Feb 2016 13:06:59 -0500 Subject: [PATCH] moved ScoreSet out of experiments --- experiments/matrix-completion-accuracy.py | 2 +- ...matrix-completion-hyperparameter-search.py | 69 ++++++++------ mhcflurry/class1_binding_predictor.py | 22 ++--- mhcflurry/ensemble.py | 13 ++- mhcflurry/peptide_encoding.py | 3 +- mhcflurry/predictor_base.py | 92 ++++++++++++------- {experiments => mhcflurry}/score_set.py | 0 scripts/create-combined-class1-dataset.py | 18 ++-- test/test_class1_binding_predictor.py | 16 +++- 9 files changed, 145 insertions(+), 90 deletions(-) rename {experiments => mhcflurry}/score_set.py (100%) diff --git a/experiments/matrix-completion-accuracy.py b/experiments/matrix-completion-accuracy.py index b3310ae9..0fef8576 100644 --- a/experiments/matrix-completion-accuracy.py +++ b/experiments/matrix-completion-accuracy.py @@ -59,7 +59,7 @@ parser.add_argument( parser.add_argument( "--output-file", - default="imputation-accuracy-comparison.csv") + default="matrix-completion-accuracy-results.csv") parser.add_argument( "--normalize-columns", diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py index b324465b..8c5a7611 100755 --- a/experiments/matrix-completion-hyperparameter-search.py +++ b/experiments/matrix-completion-hyperparameter-search.py @@ -36,7 +36,7 @@ from mhcflurry import Class1BindingPredictor from sklearn.cross_validation import StratifiedKFold from dataset_paths import PETERS2009_CSV_PATH -from score_set import ScoreSet + from matrix_completion_helpers import load_data, evaluate_predictions from arg_parsing import parse_int_list, parse_float_list, parse_string_list @@ -206,9 +206,11 @@ if __name__ == "__main__": df = pd.DataFrame(pMHC_affinity_matrix, columns=allele_list, index=peptide_list) df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") - scores = ScoreSet( - index=[ + if args.output_file: + output_file = open(args.output_file, "w") + fields = [ "allele", + "cv_fold", "peptide_count", "sample_count", "dropout_probability", @@ -216,8 +218,15 @@ if __name__ == "__main__": "hidden_layer_size1", "hidden_layer_size2", "activation" - ]) - + "mae", + "tau", + "auc", + "f1" + ] + header_line = ",".join(fields) + output_file.write(header_line + "\n") + else: + output_file = None if args.unknown_amino_acids: index_encoding = amino_acids_with_unknown.index_encoding else: @@ -256,7 +265,7 @@ if __name__ == "__main__": for hidden_layer_size1 in args.first_hidden_layer_sizes: for hidden_layer_size2 in args.second_hidden_layer_sizes: for activation in args.activation_functions: - key = "%f,%d,%d,%d,%s" % ( + key = "%0.2f,%d,%d,%d,%s" % ( dropout, embedding_dim_size, hidden_layer_size1, @@ -353,25 +362,13 @@ if __name__ == "__main__": train_values_fold = [observed_values[i] for i in train_indices] train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)} - """ - if args.verbose: - print("Training peptides for CV fold %d/%d:" % ( - fold_idx + 1, - args.n_folds)) - for p in train_peptides_fold: - aff = train_dict_fold[p] - print("-- %s: %f (%f nM)" % ( - p, - aff, - args.max_ic50 ** (1 - aff))) - """ test_peptides = [observed_peptides[i] for i in test_indices] test_values = [observed_values[i] for i in test_indices] test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} if imputer is None: - X_pretrain = None - Y_pretrain = None - pretrain_sample_weights = None + X_pretrain = np.array([], dtype=int).reshape((0, 9)) + Y_pretrain = np.array([], dtype=float) + pretrain_sample_weights = np.array([], dtype=float) else: # drop the test peptides from the full matrix and then # run completion on it to get synthesized affinities @@ -485,10 +482,26 @@ if __name__ == "__main__": y_true=test_values, y_pred=y_pred, max_ic50=args.max_ic50) - scores.add_many( - ("%s,%d,%d," % (allele, n_train_unique, n_train)) + key, - mae=mae, - tau=tau, - f1_score=f1_score, - auc=auc) - scores.to_csv(args.output_file) + + cv_fold_field_values = [ + allele, + str(fold_idx), + str(n_train_unique), + str(n_train), + ] + accuracy_field_values = [ + "%0.4f" % mae, + "%0.4f" % tau, + "%0.4f" % auc, + "%0.4f" % f1_score + ] + output_line = ( + ",".join(cv_fold_field_values) + + "," + key + + "," + ",".join(accuracy_field_values) + + "\n" + ) + print("CV fold result: %s" % output_line) + if output_file: + output_file.write(output_line + "\n") + output_file.flush() diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py index 122e8198..7ad212f9 100644 --- a/mhcflurry/class1_binding_predictor.py +++ b/mhcflurry/class1_binding_predictor.py @@ -35,27 +35,25 @@ from .paths import CLASS1_MODEL_DIRECTORY from .feedforward import make_embedding_network from .predictor_base import PredictorBase -_allele_predictor_cache = {} - - from .class1_allele_specific_hyperparameters import MAX_IC50 +_allele_predictor_cache = {} class Class1BindingPredictor(PredictorBase): def __init__( self, model, - allele=None, + name=None, max_ic50=MAX_IC50, allow_unknown_amino_acids=False, verbose=False): PredictorBase.__init__( self, - name=allele, + name=name, max_ic50=max_ic50, allow_unknown_amino_acids=allow_unknown_amino_acids, verbose=verbose) - self.allele = allele + self.name = name self.model = model @classmethod @@ -411,21 +409,15 @@ class Class1BindingPredictor(PredictorBase): return list(sorted(alleles)) def __repr__(self): - return "Class1BindingPredictor(allele=%s, model=%s, max_ic50=%f)" % ( - self.allele, + return "Class1BindingPredictor(name=%s, model=%s, max_ic50=%f)" % ( + self.name, self.model, self.max_ic50) def __str__(self): return repr(self) - def predict_9mer_peptides(self, peptides): - """ - Predict binding affinity for 9mer peptides - """ - if any(len(peptide) != 9 for peptide in peptides): - raise ValueError("Can only predict 9mer peptides") - X = self.encode_peptides(peptides) + def predict_encoded(self, X): max_expected_index = 20 if self.allow_unknown_amino_acids else 19 assert X.max() <= max_expected_index, \ "Got index %d in peptide encoding" % (X.max(),) diff --git a/mhcflurry/ensemble.py b/mhcflurry/ensemble.py index 30ec2558..a0373607 100644 --- a/mhcflurry/ensemble.py +++ b/mhcflurry/ensemble.py @@ -12,5 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + class Ensemble(object): - \ No newline at end of file + def __init__(self, models, name=None): + self.name = name + self.models = models + + @classmethod + def from_directory(cls, directory_path): + files = os.listdir(directory_path) + + + diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py index c4698fc9..d07a41b8 100644 --- a/mhcflurry/peptide_encoding.py +++ b/mhcflurry/peptide_encoding.py @@ -240,7 +240,6 @@ def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): (n_samples, peptide_length) = X.shape if not n_indices: n_indices = X.max() - first_index_value + 1 - X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) for i, row in enumerate(X): for j, xij in enumerate(row): @@ -285,7 +284,7 @@ def fixed_length_index_encoding( insert_letters = ["X"] index_encoding = amino_acids_with_unknown.index_encoding else: - insert_letters = common_amino_acids.letters() + insert_letters = common_amino_acid_letters index_encoding = common_amino_acids.index_encoding fixed_length, original, counts = fixed_length_from_many_peptides( diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py index f8972bb9..9939f553 100644 --- a/mhcflurry/predictor_base.py +++ b/mhcflurry/predictor_base.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np +from collections import defaultdict -from itertools import groupby +import numpy as np -from .peptide_encoding import fixed_length_from_many_peptides +from .peptide_encoding import fixed_length_index_encoding from .amino_acid import ( - common_amino_acid_letters, amino_acids_with_unknown, common_amino_acids ) @@ -53,6 +52,42 @@ class PredictorBase(object): else: return common_amino_acids.index_encoding(peptides, 9) + def encode_peptides(self, peptides): + """ + Parameters + ---------- + peptides : str list + Peptide strings of any length + + Encode peptides of any length into fixed length vectors. + Returns 2d array of encoded peptides and 1d array indicating the + original peptide index for each row. + """ + indices = [] + encoded_matrices = [] + for i, peptide in enumerate(peptides): + matrix, _, _ = fixed_length_index_encoding( + peptides=[peptide], + desired_length=9, + allow_unknown_amino_acids=self.allow_unknown_amino_acids) + encoded_matrices.append(matrix) + indices.extend([i] * len(matrix)) + combined_matrix = np.concatenate(encoded_matrices) + index_array = np.array(indices) + expected_shape = (len(index_array), 9) + assert combined_matrix.shape == expected_shape, \ + "Expected shape %s but got %s" % (expected_shape, combined_matrix.shape) + return combined_matrix, index_array + + def predict_9mer_peptides(self, peptides): + """ + Predict binding affinity for 9mer peptides + """ + if any(len(peptide) != 9 for peptide in peptides): + raise ValueError("Can only predict 9mer peptides") + X, _ = self.encode_peptides(peptides) + return self.predict_encoded(X) + def predict_9mer_peptides_ic50(self, peptides): return self.log_to_ic50(self.predict_9mer_peptides(peptides)) @@ -63,6 +98,10 @@ class PredictorBase(object): return self.log_to_ic50( self.predict_peptides(peptides)) + def predict_encoded(self, X): + raise ValueError("Not yet implemented for %s!" % ( + self.__class__.__name__,)) + def predict_peptides( self, peptides, @@ -75,33 +114,18 @@ class PredictorBase(object): amino acid characters. The prediction for a single peptide will be the average of expanded 9mers. """ - results_dict = {} - for length, group_peptides in groupby(peptides, lambda x: len(x)): - group_peptides = list(group_peptides) - expanded_peptides, _, _ = fixed_length_from_many_peptides( - peptides=group_peptides, - desired_length=9, - insert_amino_acid_letters=( - ["X"] if self.allow_unknown_amino_acids - else common_amino_acid_letters)) - - n_group = len(group_peptides) - n_expanded = len(expanded_peptides) - expansion_factor = int(n_expanded / n_group) - raw_y = self.predict_9mer_peptides(expanded_peptides) - if expansion_factor == 1: - log_ic50s = raw_y - else: - # if peptides were a different length than the predictor's - # expected input length, then let's take the median prediction - # of each expanded peptide set - log_ic50s = np.zeros(n_group) - # take the median of each group of log(IC50) values - for i in range(n_group): - start = i * expansion_factor - end = (i + 1) * expansion_factor - log_ic50s[i] = combine_fn(raw_y[start:end]) - assert len(group_peptides) == len(log_ic50s) - for peptide, log_ic50 in zip(group_peptides, log_ic50s): - results_dict[peptide] = log_ic50 - return np.array([results_dict[p] for p in peptides]) + input_matrix, original_peptide_indices = self.encode_peptides(peptides) + # non-9mer peptides get multiple predictions, which are then combined + # with the combine_fn argument + multiple_predictions_dict = defaultdict(list) + fixed_length_predictions = self.predict_encoded(input_matrix) + for i, yi in enumerate(fixed_length_predictions): + original_peptide_index = original_peptide_indices[i] + original_peptide = peptides[original_peptide_index] + multiple_predictions_dict[original_peptide].append(yi) + + combined_predictions_dict = { + p: combine_fn(ys) + for (p, ys) in multiple_predictions_dict.items() + } + return np.array([combined_predictions_dict[p] for p in peptides]) diff --git a/experiments/score_set.py b/mhcflurry/score_set.py similarity index 100% rename from experiments/score_set.py rename to mhcflurry/score_set.py diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py index 094b6f5b..7d7e5fc2 100755 --- a/scripts/create-combined-class1-dataset.py +++ b/scripts/create-combined-class1-dataset.py @@ -29,33 +29,39 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME) parser = argparse.ArgumentParser() -parser.add_argument("--ic50-fraction-tolerance", +parser.add_argument( + "--ic50-fraction-tolerance", default=0.01, type=float, help=( "How much can the IEDB and NetMHCpan IC50 differ and still be" " considered compatible (as a fraction of the NetMHCpan value)")) -parser.add_argument("--min-assay-overlap-size", +parser.add_argument( + "--min-assay-overlap-size", type=int, default=1, help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data") -parser.add_argument("--min-assay-fraction-same", +parser.add_argument( + "--min-assay-fraction-same", type=float, help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data", default=0.9) -parser.add_argument("--iedb-pickle-path", +parser.add_argument( + "--iedb-pickle-path", default=IEDB_PICKLE_PATH, help="Path to .pickle file containing dictionary of IEDB assay datasets") -parser.add_argument("--netmhcpan-csv-path", +parser.add_argument( + "--netmhcpan-csv-path", default=PETERS_CSV_PATH, help="Path to CSV with NetMHCpan dataset from 2013 Peters paper") -parser.add_argument("--output-csv-path", +parser.add_argument( + "--output-csv-path", default=CLASS1_DATA_CSV_PATH, help="Path to CSV of combined assay results") diff --git a/test/test_class1_binding_predictor.py b/test/test_class1_binding_predictor.py index 89f9f9d5..c824ce1a 100644 --- a/test/test_class1_binding_predictor.py +++ b/test/test_class1_binding_predictor.py @@ -82,7 +82,13 @@ def test_encode_peptides_9mer(): predictor = Class1BindingPredictor( model=Dummy9merIndexEncodingModel(), allow_unknown_amino_acids=True) - X = predictor.encode_peptides(["AAASSSYYY"]) + X = predictor.encode_9mer_peptides(["AAASSSYYY"]) + assert X.shape[0] == 1, X.shape + assert X.shape[1] == 9, X.shape + + X, indices = predictor.encode_peptides(["AAASSSYYY"]) + assert len(indices) == 1 + assert indices[0] == 0 assert X.shape[0] == 1, X.shape assert X.shape[1] == 9, X.shape @@ -91,7 +97,9 @@ def test_encode_peptides_8mer(): predictor = Class1BindingPredictor( model=Dummy9merIndexEncodingModel(), allow_unknown_amino_acids=True) - X = predictor.encode_peptides(["AAASSSYY"]) + X, indices = predictor.encode_peptides(["AAASSSYY"]) + assert len(indices) == 9 + assert (indices == 0).all() assert X.shape[0] == 9, (X.shape, X) assert X.shape[1] == 9, (X.shape, X) @@ -100,6 +108,8 @@ def test_encode_peptides_10mer(): predictor = Class1BindingPredictor( model=Dummy9merIndexEncodingModel(), allow_unknown_amino_acids=True) - X = predictor.encode_peptides(["AAASSSYYFF"]) + X, indices = predictor.encode_peptides(["AAASSSYYFF"]) + assert len(indices) == 10 + assert (indices == 0).all() assert X.shape[0] == 10, (X.shape, X) assert X.shape[1] == 9, (X.shape, X) -- GitLab