From f110ea6ee1b9c934b90d0df293d54ef52f1e6973 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Wed, 17 Feb 2016 13:06:59 -0500
Subject: [PATCH] moved ScoreSet out of experiments

Besides moving score_set.py from experiments/ into the mhcflurry package,
this patch:

* drops the ScoreSet dependency from
  matrix-completion-hyperparameter-search.py, which now writes its
  per-fold results (with a new cv_fold column) directly to --output-file
* reworks peptide encoding in PredictorBase: encode_peptides now returns
  the encoded matrix together with an index array mapping each expanded
  9mer row back to its source peptide, predict_9mer_peptides moves up
  from Class1BindingPredictor, and subclasses supply predict_encoded
* renames Class1BindingPredictor's "allele" constructor argument to "name"
* reformats the argparse calls in create-combined-class1-dataset.py and
  starts sketching out mhcflurry/ensemble.py
---
 experiments/matrix-completion-accuracy.py     |  2 +-
 ...matrix-completion-hyperparameter-search.py | 69 ++++++++------
 mhcflurry/class1_binding_predictor.py         | 22 ++---
 mhcflurry/ensemble.py                         | 13 ++-
 mhcflurry/peptide_encoding.py                 |  3 +-
 mhcflurry/predictor_base.py                   | 92 ++++++++++++-------
 {experiments => mhcflurry}/score_set.py       |  0
 scripts/create-combined-class1-dataset.py     | 18 ++--
 test/test_class1_binding_predictor.py         | 16 +++-
 9 files changed, 145 insertions(+), 90 deletions(-)
 rename {experiments => mhcflurry}/score_set.py (100%)
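
Reviewer note (commentary only; git apply ignores text before the first
diff header): the most visible API change is that encode_peptides now
returns a pair -- the encoded 9mer matrix plus an index array mapping each
expanded row back to its source peptide -- and predict_peptides regroups
the per-row predictions by that index before combining them. The snippet
below is a minimal standalone sketch of that regrouping step; the peptides,
the random predictions, and the use of numpy.mean as the combiner are
illustrative stand-ins, not part of the patch.

    from collections import defaultdict

    import numpy as np

    # One 9mer (a single encoded row) and one 10mer, which the fixed-length
    # encoding expands into ten 9mers (one per deleted position).
    peptides = ["AAASSSYYY", "AAASSSYYFF"]
    original_peptide_indices = np.array([0] + [1] * 10)

    # Stand-in for predict_encoded(X): one prediction per encoded 9mer row.
    fixed_length_predictions = np.random.rand(len(original_peptide_indices))

    # Regroup row-level predictions by source peptide, then combine each group.
    multiple_predictions = defaultdict(list)
    for row_idx, y in enumerate(fixed_length_predictions):
        source_peptide = peptides[original_peptide_indices[row_idx]]
        multiple_predictions[source_peptide].append(y)

    combined = {p: np.mean(ys) for (p, ys) in multiple_predictions.items()}
    print([combined[p] for p in peptides])  # one combined value per input

In the actual predict_peptides the combiner is whatever combine_fn the
caller passes (its default lies outside the hunks shown here).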

diff --git a/experiments/matrix-completion-accuracy.py b/experiments/matrix-completion-accuracy.py
index b3310ae9..0fef8576 100644
--- a/experiments/matrix-completion-accuracy.py
+++ b/experiments/matrix-completion-accuracy.py
@@ -59,7 +59,7 @@ parser.add_argument(
 
 parser.add_argument(
     "--output-file",
-    default="imputation-accuracy-comparison.csv")
+    default="matrix-completion-accuracy-results.csv")
 
 parser.add_argument(
     "--normalize-columns",
diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py
index b324465b..8c5a7611 100755
--- a/experiments/matrix-completion-hyperparameter-search.py
+++ b/experiments/matrix-completion-hyperparameter-search.py
@@ -36,7 +36,7 @@ from mhcflurry import Class1BindingPredictor
 from sklearn.cross_validation import StratifiedKFold
 
 from dataset_paths import PETERS2009_CSV_PATH
-from score_set import ScoreSet
+
 from matrix_completion_helpers import load_data, evaluate_predictions
 
 from arg_parsing import parse_int_list, parse_float_list, parse_string_list
@@ -206,9 +206,11 @@ if __name__ == "__main__":
         df = pd.DataFrame(pMHC_affinity_matrix, columns=allele_list, index=peptide_list)
         df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
 
-    scores = ScoreSet(
-        index=[
+    if args.output_file:
+        output_file = open(args.output_file, "w")
+        fields = [
             "allele",
+            "cv_fold",
             "peptide_count",
             "sample_count",
             "dropout_probability",
@@ -216,8 +218,15 @@ if __name__ == "__main__":
             "hidden_layer_size1",
             "hidden_layer_size2",
             "activation"
-        ])
-
+            "mae",
+            "tau",
+            "auc",
+            "f1"
+        ]
+        header_line = ",".join(fields)
+        output_file.write(header_line + "\n")
+    else:
+        output_file = None
     if args.unknown_amino_acids:
         index_encoding = amino_acids_with_unknown.index_encoding
     else:
@@ -256,7 +265,7 @@ if __name__ == "__main__":
                 for hidden_layer_size1 in args.first_hidden_layer_sizes:
                     for hidden_layer_size2 in args.second_hidden_layer_sizes:
                         for activation in args.activation_functions:
-                            key = "%f,%d,%d,%d,%s" % (
+                            key = "%0.2f,%d,%d,%d,%s" % (
                                 dropout,
                                 embedding_dim_size,
                                 hidden_layer_size1,
@@ -353,25 +362,13 @@ if __name__ == "__main__":
             train_values_fold = [observed_values[i] for i in train_indices]
             train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)}
 
-            """
-            if args.verbose:
-                print("Training peptides for CV fold %d/%d:" % (
-                    fold_idx + 1,
-                    args.n_folds))
-                for p in train_peptides_fold:
-                    aff = train_dict_fold[p]
-                    print("-- %s: %f (%f nM)" % (
-                        p,
-                        aff,
-                        args.max_ic50 ** (1 - aff)))
-            """
             test_peptides = [observed_peptides[i] for i in test_indices]
             test_values = [observed_values[i] for i in test_indices]
             test_dict = {k: v for (k, v) in zip(test_peptides, test_values)}
             if imputer is None:
-                X_pretrain = None
-                Y_pretrain = None
-                pretrain_sample_weights = None
+                X_pretrain = np.array([], dtype=int).reshape((0, 9))
+                Y_pretrain = np.array([], dtype=float)
+                pretrain_sample_weights = np.array([], dtype=float)
             else:
                 # drop the test peptides from the full matrix and then
                 # run completion on it to get synthesized affinities
@@ -485,10 +482,26 @@ if __name__ == "__main__":
                     y_true=test_values,
                     y_pred=y_pred,
                     max_ic50=args.max_ic50)
-                scores.add_many(
-                    ("%s,%d,%d," % (allele, n_train_unique, n_train)) + key,
-                    mae=mae,
-                    tau=tau,
-                    f1_score=f1_score,
-                    auc=auc)
-        scores.to_csv(args.output_file)
+
+                cv_fold_field_values = [
+                    allele,
+                    str(fold_idx),
+                    str(n_train_unique),
+                    str(n_train),
+                ]
+                accuracy_field_values = [
+                    "%0.4f" % mae,
+                    "%0.4f" % tau,
+                    "%0.4f" % auc,
+                    "%0.4f" % f1_score
+                ]
+                output_line = (
+                    ",".join(cv_fold_field_values) +
+                    "," + key +
+                    "," + ",".join(accuracy_field_values) +
+                    "\n"
+                )
+                print("CV fold result: %s" % output_line.strip())
+                if output_file:
+                    output_file.write(output_line)
+                    output_file.flush()
diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py
index 122e8198..7ad212f9 100644
--- a/mhcflurry/class1_binding_predictor.py
+++ b/mhcflurry/class1_binding_predictor.py
@@ -35,27 +35,25 @@ from .paths import CLASS1_MODEL_DIRECTORY
 from .feedforward import make_embedding_network
 from .predictor_base import PredictorBase
 
-_allele_predictor_cache = {}
-
-
 from .class1_allele_specific_hyperparameters import MAX_IC50
 
+_allele_predictor_cache = {}
 
 class Class1BindingPredictor(PredictorBase):
     def __init__(
             self,
             model,
-            allele=None,
+            name=None,
             max_ic50=MAX_IC50,
             allow_unknown_amino_acids=False,
             verbose=False):
         PredictorBase.__init__(
             self,
-            name=allele,
+            name=name,
             max_ic50=max_ic50,
             allow_unknown_amino_acids=allow_unknown_amino_acids,
             verbose=verbose)
-        self.allele = allele
+        self.name = name
         self.model = model
 
     @classmethod
@@ -411,21 +409,15 @@ class Class1BindingPredictor(PredictorBase):
         return list(sorted(alleles))
 
     def __repr__(self):
-        return "Class1BindingPredictor(allele=%s, model=%s, max_ic50=%f)" % (
-            self.allele,
+        return "Class1BindingPredictor(name=%s, model=%s, max_ic50=%f)" % (
+            self.name,
             self.model,
             self.max_ic50)
 
     def __str__(self):
         return repr(self)
 
-    def predict_9mer_peptides(self, peptides):
-        """
-        Predict binding affinity for 9mer peptides
-        """
-        if any(len(peptide) != 9 for peptide in peptides):
-            raise ValueError("Can only predict 9mer peptides")
-        X = self.encode_peptides(peptides)
+    def predict_encoded(self, X):
         max_expected_index = 20 if self.allow_unknown_amino_acids else 19
         assert X.max() <= max_expected_index, \
             "Got index %d in peptide encoding" % (X.max(),)
diff --git a/mhcflurry/ensemble.py b/mhcflurry/ensemble.py
index 30ec2558..a0373607 100644
--- a/mhcflurry/ensemble.py
+++ b/mhcflurry/ensemble.py
@@ -12,5 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+
 class Ensemble(object):
-    
\ No newline at end of file
+    def __init__(self, models, name=None):
+        self.name = name
+        self.models = models
+
+    @classmethod
+    def from_directory(cls, directory_path):
+        # stub: eventually this should load the saved models in directory_path
+        files = os.listdir(directory_path)
+
+
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
index c4698fc9..d07a41b8 100644
--- a/mhcflurry/peptide_encoding.py
+++ b/mhcflurry/peptide_encoding.py
@@ -240,7 +240,6 @@ def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
     (n_samples, peptide_length) = X.shape
     if not n_indices:
         n_indices = X.max() - first_index_value + 1
-
     X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool)
     for i, row in enumerate(X):
         for j, xij in enumerate(row):
@@ -285,7 +284,7 @@ def fixed_length_index_encoding(
         insert_letters = ["X"]
         index_encoding = amino_acids_with_unknown.index_encoding
     else:
-        insert_letters = common_amino_acids.letters()
+        insert_letters = common_amino_acid_letters
         index_encoding = common_amino_acids.index_encoding
 
     fixed_length, original, counts = fixed_length_from_many_peptides(
diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py
index f8972bb9..9939f553 100644
--- a/mhcflurry/predictor_base.py
+++ b/mhcflurry/predictor_base.py
@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
+from collections import defaultdict
 
-from itertools import groupby
+import numpy as np
 
-from .peptide_encoding import fixed_length_from_many_peptides
+from .peptide_encoding import fixed_length_index_encoding
 from .amino_acid import (
-    common_amino_acid_letters,
     amino_acids_with_unknown,
     common_amino_acids
 )
@@ -53,6 +52,42 @@ class PredictorBase(object):
         else:
             return common_amino_acids.index_encoding(peptides, 9)
 
+    def encode_peptides(self, peptides):
+        """
+        Parameters
+        ----------
+        peptides : str list
+            Peptide strings of any length
+
+        Encode peptides of any length into fixed length vectors.
+        Returns 2d array of encoded peptides and 1d array indicating the
+        original peptide index for each row.
+        """
+        indices = []
+        encoded_matrices = []
+        for i, peptide in enumerate(peptides):
+            matrix, _, _ = fixed_length_index_encoding(
+                peptides=[peptide],
+                desired_length=9,
+                allow_unknown_amino_acids=self.allow_unknown_amino_acids)
+            encoded_matrices.append(matrix)
+            indices.extend([i] * len(matrix))
+        combined_matrix = np.concatenate(encoded_matrices)
+        index_array = np.array(indices)
+        expected_shape = (len(index_array), 9)
+        assert combined_matrix.shape == expected_shape, \
+            "Expected shape %s but got %s" % (expected_shape, combined_matrix.shape)
+        return combined_matrix, index_array
+
+    def predict_9mer_peptides(self, peptides):
+        """
+        Predict binding affinity for 9mer peptides
+        """
+        if any(len(peptide) != 9 for peptide in peptides):
+            raise ValueError("Can only predict 9mer peptides")
+        X, _ = self.encode_peptides(peptides)
+        return self.predict_encoded(X)
+
     def predict_9mer_peptides_ic50(self, peptides):
         return self.log_to_ic50(self.predict_9mer_peptides(peptides))
 
@@ -63,6 +98,10 @@ class PredictorBase(object):
         return self.log_to_ic50(
             self.predict_peptides(peptides))
 
+    def predict_encoded(self, X):
+        raise NotImplementedError("predict_encoded is not implemented for %s" % (
+            self.__class__.__name__,))
+
     def predict_peptides(
             self,
             peptides,
@@ -75,33 +114,18 @@ class PredictorBase(object):
         amino acid characters. The prediction for a single peptide will be
         the average of expanded 9mers.
         """
-        results_dict = {}
-        for length, group_peptides in groupby(peptides, lambda x: len(x)):
-            group_peptides = list(group_peptides)
-            expanded_peptides, _, _ = fixed_length_from_many_peptides(
-                peptides=group_peptides,
-                desired_length=9,
-                insert_amino_acid_letters=(
-                    ["X"] if self.allow_unknown_amino_acids
-                    else common_amino_acid_letters))
-
-            n_group = len(group_peptides)
-            n_expanded = len(expanded_peptides)
-            expansion_factor = int(n_expanded / n_group)
-            raw_y = self.predict_9mer_peptides(expanded_peptides)
-            if expansion_factor == 1:
-                log_ic50s = raw_y
-            else:
-                # if peptides were a different length than the predictor's
-                # expected input length, then let's take the median prediction
-                # of each expanded peptide set
-                log_ic50s = np.zeros(n_group)
-                # take the median of each group of log(IC50) values
-                for i in range(n_group):
-                    start = i * expansion_factor
-                    end = (i + 1) * expansion_factor
-                    log_ic50s[i] = combine_fn(raw_y[start:end])
-            assert len(group_peptides) == len(log_ic50s)
-            for peptide, log_ic50 in zip(group_peptides, log_ic50s):
-                results_dict[peptide] = log_ic50
-        return np.array([results_dict[p] for p in peptides])
+        input_matrix, original_peptide_indices = self.encode_peptides(peptides)
+        # non-9mer peptides get multiple predictions, which are then combined
+        # with the combine_fn argument
+        multiple_predictions_dict = defaultdict(list)
+        fixed_length_predictions = self.predict_encoded(input_matrix)
+        for i, yi in enumerate(fixed_length_predictions):
+            original_peptide_index = original_peptide_indices[i]
+            original_peptide = peptides[original_peptide_index]
+            multiple_predictions_dict[original_peptide].append(yi)
+
+        combined_predictions_dict = {
+            p: combine_fn(ys)
+            for (p, ys) in multiple_predictions_dict.items()
+        }
+        return np.array([combined_predictions_dict[p] for p in peptides])
diff --git a/experiments/score_set.py b/mhcflurry/score_set.py
similarity index 100%
rename from experiments/score_set.py
rename to mhcflurry/score_set.py
diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py
index 094b6f5b..7d7e5fc2 100755
--- a/scripts/create-combined-class1-dataset.py
+++ b/scripts/create-combined-class1-dataset.py
@@ -29,33 +29,39 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument("--ic50-fraction-tolerance",
+parser.add_argument(
+    "--ic50-fraction-tolerance",
     default=0.01,
     type=float,
     help=(
         "How much can the IEDB and NetMHCpan IC50 differ and still be"
         " considered compatible (as a fraction of the NetMHCpan value)"))
 
-parser.add_argument("--min-assay-overlap-size",
+parser.add_argument(
+    "--min-assay-overlap-size",
     type=int,
     default=1,
     help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data")
 
 
-parser.add_argument("--min-assay-fraction-same",
+parser.add_argument(
+    "--min-assay-fraction-same",
     type=float,
     help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data",
     default=0.9)
 
-parser.add_argument("--iedb-pickle-path",
+parser.add_argument(
+    "--iedb-pickle-path",
     default=IEDB_PICKLE_PATH,
     help="Path to .pickle file containing dictionary of IEDB assay datasets")
 
-parser.add_argument("--netmhcpan-csv-path",
+parser.add_argument(
+    "--netmhcpan-csv-path",
     default=PETERS_CSV_PATH,
     help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
 
-parser.add_argument("--output-csv-path",
+parser.add_argument(
+    "--output-csv-path",
     default=CLASS1_DATA_CSV_PATH,
     help="Path to CSV of combined assay results")
 
diff --git a/test/test_class1_binding_predictor.py b/test/test_class1_binding_predictor.py
index 89f9f9d5..c824ce1a 100644
--- a/test/test_class1_binding_predictor.py
+++ b/test/test_class1_binding_predictor.py
@@ -82,7 +82,13 @@ def test_encode_peptides_9mer():
     predictor = Class1BindingPredictor(
         model=Dummy9merIndexEncodingModel(),
         allow_unknown_amino_acids=True)
-    X = predictor.encode_peptides(["AAASSSYYY"])
+    X = predictor.encode_9mer_peptides(["AAASSSYYY"])
+    assert X.shape[0] == 1, X.shape
+    assert X.shape[1] == 9, X.shape
+
+    X, indices = predictor.encode_peptides(["AAASSSYYY"])
+    assert len(indices) == 1
+    assert indices[0] == 0
     assert X.shape[0] == 1, X.shape
     assert X.shape[1] == 9, X.shape
 
@@ -91,7 +97,9 @@ def test_encode_peptides_8mer():
     predictor = Class1BindingPredictor(
         model=Dummy9merIndexEncodingModel(),
         allow_unknown_amino_acids=True)
-    X = predictor.encode_peptides(["AAASSSYY"])
+    X, indices = predictor.encode_peptides(["AAASSSYY"])
+    assert len(indices) == 9
+    assert (indices == 0).all()
     assert X.shape[0] == 9, (X.shape, X)
     assert X.shape[1] == 9, (X.shape, X)
 
@@ -100,6 +108,8 @@ def test_encode_peptides_10mer():
     predictor = Class1BindingPredictor(
         model=Dummy9merIndexEncodingModel(),
         allow_unknown_amino_acids=True)
-    X = predictor.encode_peptides(["AAASSSYYFF"])
+    X, indices = predictor.encode_peptides(["AAASSSYYFF"])
+    assert len(indices) == 10
+    assert (indices == 0).all()
     assert X.shape[0] == 10, (X.shape, X)
     assert X.shape[1] == 9, (X.shape, X)
-- 
GitLab