From ab85ec989c1974436bd07f3079eff2d90e071623 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 4 Jul 2019 12:15:42 -0400
Subject: [PATCH] Record per-fit training info (learning rate, history, timing) and simplify pan-allele model selection

---
 mhcflurry/class1_neural_network.py            |  29 +++-
 mhcflurry/select_pan_allele_models_command.py | 148 +-----------------
 mhcflurry/train_pan_allele_models_command.py  |  29 +++-
 test/test_train_pan_allele_models_command.py  |  10 +-
 4 files changed, 58 insertions(+), 158 deletions(-)

diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index d2d36761..9d9f00bd 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -458,6 +458,9 @@ class Class1NeuralNetwork(object):
 
         """
         import keras
+        from keras import backend as K
+
+        fit_info = collections.defaultdict(list)
 
         loss = get_loss(self.hyperparameters['loss'])
 
@@ -478,6 +481,13 @@ class Class1NeuralNetwork(object):
         network._make_predict_function()
         self.set_allele_representations(allele_representations)
 
+        if self.hyperparameters['learning_rate'] is not None:
+            K.set_value(
+                self.network().optimizer.lr,
+                self.hyperparameters['learning_rate'])
+        fit_info["learning_rate"] = float(
+            K.get_value(self.network().optimizer.lr))
+
         validation_x_dict = {
             'peptide': self.peptides_to_network_input(
                 validation_peptide_encoding),
@@ -513,7 +523,8 @@ class Class1NeuralNetwork(object):
                 yielded_values_box[0] += len(affinities)
 
         start = time.time()
-        result = network.fit_generator(
+
+        fit_history = network.fit_generator(
             wrapped_generator(),
             steps_per_epoch=steps_per_epoch,
             epochs=epochs,
@@ -526,10 +537,12 @@ class Class1NeuralNetwork(object):
                 patience=patience,
                 verbose=verbose)]
         )
-        if verbose > 0:
-            print("fit_generator completed in %0.2f sec (%d total points)" % (
-                time.time() - start, yielded_values_box[0]))
-        return result
+        for (key, value) in fit_history.history.items():
+            fit_info[key].extend(value)
+
+        fit_info["time"] = time.time() - start
+        fit_info["num_points"] = yielded_values_box[0]
+        self.fit_info.append(dict(fit_info))
 
 
     def fit(
@@ -585,8 +598,10 @@ class Class1NeuralNetwork(object):
             How often (in seconds) to print progress update. Set to None to
             disable.
         """
+        from keras import backend as K
         encodable_peptides = EncodableSequences.create(peptides)
         peptide_encoding = self.peptides_to_network_input(encodable_peptides)
+        fit_info = collections.defaultdict(list)
 
         length_counts = (
             pandas.Series(encodable_peptides.sequences)
@@ -687,10 +702,11 @@ class Class1NeuralNetwork(object):
             loss=loss.loss, optimizer=self.hyperparameters['optimizer'])
 
         if self.hyperparameters['learning_rate'] is not None:
-            from keras import backend as K
             K.set_value(
                 self.network().optimizer.lr,
                 self.hyperparameters['learning_rate'])
+        fit_info["learning_rate"] = float(
+            K.get_value(self.network().optimizer.lr))
 
         if loss.supports_inequalities:
             # Do not sample negative affinities: just use an inequality.
@@ -762,7 +778,6 @@ class Class1NeuralNetwork(object):
         min_val_loss_iteration = None
         min_val_loss = None
 
-        fit_info = collections.defaultdict(list)
         start = time.time()
         last_progress_print = None
         x_dict_with_random_negatives = {}
diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py
index e44cdcdf..510016bf 100644
--- a/mhcflurry/select_pan_allele_models_command.py
+++ b/mhcflurry/select_pan_allele_models_command.py
@@ -43,13 +43,6 @@ parser.add_argument(
     help=(
         "Model selection data CSV. Expected columns: "
         "allele, peptide, measurement_value"))
-parser.add_argument(
-    "--exclude-data",
-    metavar="FILE.csv",
-    required=False,
-    help=(
-        "Data to EXCLUDE from model selection. Useful to specify the original "
-        "training data used"))
 parser.add_argument(
     "--models-dir",
     metavar="DIR",
@@ -60,24 +53,6 @@ parser.add_argument(
     metavar="DIR",
     required=True,
     help="Directory to write selected models")
-parser.add_argument(
-    "--out-unselected-predictions",
-    metavar="FILE.csv",
-    help="Write predictions for validation data using unselected predictor to "
-    "FILE.csv")
-parser.add_argument(
-    "--unselected-accuracy-scorer",
-    metavar="SCORER",
-    default="combined:mass-spec,mse")
-parser.add_argument(
-    "--unselected-accuracy-scorer-num-samples",
-    type=int,
-    default=1000)
-parser.add_argument(
-    "--unselected-accuracy-percentile-threshold",
-    type=float,
-    metavar="X",
-    default=95)
 parser.add_argument(
     "--min-models",
     type=int,
@@ -122,15 +97,14 @@ def run(argv=sys.argv[1:]):
     print("Loaded: %s" % input_predictor)
 
     alleles = input_predictor.supported_alleles
+    (min_peptide_length, max_peptide_length) = (
+        input_predictor.supported_peptide_lengths)
 
     metadata_dfs = {}
     df = pandas.read_csv(args.data)
     print("Loaded data: %s" % (str(df.shape)))
 
-    (min_peptide_length, max_peptide_length) = (
-        input_predictor.supported_peptide_lengths)
-
-    df = df.ix[
+    df = df.loc[
         (df.peptide.str.len() >= min_peptide_length) &
         (df.peptide.str.len() <= max_peptide_length)
     ]
@@ -141,26 +115,10 @@ def run(argv=sys.argv[1:]):
 
     # Allele names in data are assumed to be already normalized.
     df = df.loc[df.allele.isin(alleles)].dropna()
-    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
-
-    if args.exclude_data:
-        exclude_df = pandas.read_csv(args.exclude_data)
-        metadata_dfs["model_selection_exclude"] = exclude_df
-        print("Loaded exclude data: %s" % (str(df.shape)))
-
-        df["_key"] = df.allele + "__" + df.peptide
-        exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide
-        df["_excluded"] = df._key.isin(exclude_df._key.unique())
-        print("Excluding measurements per allele (counts): ")
-        print(df.groupby("allele")._excluded.sum())
+    print("Subselected to supported alleles: %s" % str(df.shape))
 
-        print("Excluding measurements per allele (fractions): ")
-        print(df.groupby("allele")._excluded.mean())
 
-        df = df.loc[~df._excluded]
-        del df["_excluded"]
-        del df["_key"]
-        print("Reduced data to: %s" % (str(df.shape)))
+    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
 
     metadata_dfs["model_selection_data"] = df
 
@@ -168,101 +126,9 @@ def run(argv=sys.argv[1:]):
         args.mass_spec_regex)
 
 
-    if args.out_unselected_predictions:
-        df["unselected_prediction"] = input_predictor.predict(
-            alleles=df.allele.values,
-            peptides=df.peptide.values)
-        df.to_csv(args.out_unselected_predictions)
-        print("Wrote: %s" % args.out_unselected_predictions)
-
-    selectors = {}
-    selector_to_model_selection_kwargs = {}
-
-    def make_selector(
-            scoring,
-            combined_min_contribution_percent=args.combined_min_contribution_percent):
-        if scoring in selectors:
-            return (
-                selectors[scoring], selector_to_model_selection_kwargs[scoring])
-
-        start = time.time()
-        if scoring.startswith("combined:"):
-            model_selection_kwargs = {
-                'min_models': args.combined_min_models,
-                'max_models': args.combined_max_models,
-            }
-            component_selectors = []
-            for component_selector in scoring.split(":", 1)[1].split(","):
-                component_selectors.append(
-                    make_selector(
-                        component_selector)[0])
-            selector = CombinedModelSelector(
-                component_selectors,
-                min_contribution_percent=combined_min_contribution_percent)
-        elif scoring == "mse":
-            model_selection_kwargs = {
-                'min_models': args.mse_min_models,
-                'max_models': args.mse_max_models,
-            }
-            min_measurements = args.mse_min_measurements
-            selector = MSEModelSelector(
-                df=df.loc[~df.mass_spec],
-                predictor=input_predictor,
-                min_measurements=min_measurements)
-        elif scoring == "mass-spec":
-            mass_spec_df = df.loc[df.mass_spec]
-            model_selection_kwargs = {
-                'min_models': args.mass_spec_min_models,
-                'max_models': args.mass_spec_max_models,
-            }
-            min_measurements = args.mass_spec_min_measurements
-            selector = MassSpecModelSelector(
-                df=mass_spec_df,
-                predictor=input_predictor,
-                min_measurements=min_measurements)
-        elif scoring == "consensus":
-            model_selection_kwargs = {
-                'min_models': args.consensus_min_models,
-                'max_models': args.consensus_max_models,
-            }
-            selector = ConsensusModelSelector(
-                predictor=input_predictor,
-                num_peptides_per_length=args.consensus_num_peptides_per_length)
-        else:
-            raise ValueError("Unsupported scoring method: %s" % scoring)
-        print("Instantiated model selector %s in %0.2f sec." % (
-            scoring, time.time() - start))
-        return (selector, model_selection_kwargs)
-
-    for scoring in args.scoring:
-        (selector, model_selection_kwargs) = make_selector(scoring)
-        selectors[scoring] = selector
-        selector_to_model_selection_kwargs[scoring] = model_selection_kwargs
-
-    unselected_accuracy_scorer = None
-    if args.unselected_accuracy_scorer:
-        # Force running all selectors by setting combined_min_contribution_percent=0.
-        unselected_accuracy_scorer = make_selector(
-            args.unselected_accuracy_scorer,
-            combined_min_contribution_percent=0.0)[0]
-        print("Using unselected accuracy scorer: %s" % unselected_accuracy_scorer)
-    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer
 
-    print("Selectors for alleles:")
-    allele_to_selector = {}
-    allele_to_model_selection_kwargs = {}
-    for allele in alleles:
-        selector = None
-        for possible_selector in args.scoring:
-            if selectors[possible_selector].usable_for_allele(allele=allele):
-                selector = selectors[possible_selector]
-                print("%20s %s" % (allele, selector.plan_summary(allele)))
-                break
-        if selector is None:
-            raise ValueError("No selectors usable for allele: %s" % allele)
-        allele_to_selector[allele] = selector
-        allele_to_model_selection_kwargs[allele] = (
-            selector_to_model_selection_kwargs[possible_selector])
+
+
 
     GLOBAL_DATA["args"] = args
     GLOBAL_DATA["input_predictor"] = input_predictor
diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py
index bafb6f72..1abf6712 100644
--- a/mhcflurry/train_pan_allele_models_command.py
+++ b/mhcflurry/train_pan_allele_models_command.py
@@ -9,6 +9,7 @@ import time
 import traceback
 import random
 import pprint
+import hashlib
 from functools import partial
 
 import numpy
@@ -130,6 +131,7 @@ add_worker_pool_args(parser)
 
 def assign_folds(df, num_folds, held_out_fraction, held_out_max):
     result_df = pandas.DataFrame(index=df.index)
+
     for fold in range(num_folds):
         result_df["fold_%d" % fold] = True
         for (allele, sub_df) in df.groupby("allele"):
@@ -172,6 +174,9 @@ def assign_folds(df, num_folds, held_out_fraction, held_out_max):
     print("Test points per fold")
     print((~result_df).sum())
 
+    result_df["allele"] = df["allele"]
+    result_df["peptide"] = df["peptide"]
+
     return result_df
 
 
@@ -422,8 +427,6 @@ def train_model(
         progress_print_interval,
         predictor,
         save_to):
-    import keras.backend as K
-    import keras
 
     df = GLOBAL_DATA["train_data"]
     folds_df = GLOBAL_DATA["folds_df"]
@@ -484,10 +487,10 @@ def train_model(
             epochs=pretrain_max_epochs,
             verbose=verbose,
         )
-        if model.hyperparameters['learning_rate']:
-            model.hyperparameters['learning_rate'] /= 10
-        else:
-            model.hyperparameters['learning_rate'] = 0.0001
+
+        # Use a smaller learning rate for training on real data
+        learning_rate = model.fit_info[-1]["learning_rate"]
+        model.hyperparameters['learning_rate'] = learning_rate / 10
 
     model.fit(
         peptides=train_peptides,
@@ -500,6 +503,20 @@ def train_model(
         progress_print_interval=progress_print_interval,
         verbose=verbose)
 
+    # Save model-specific training info
+    train_peptide_hash = hashlib.sha1()
+    for peptide in train_data.peptide.values:
+        train_peptide_hash.update(peptide.encode())
+    model.fit_info[-1]["training_info"] = {
+        "fold_num": fold_num,
+        "num_folds": num_folds,
+        "replicate_num": replicate_num,
+        "num_replicates": num_replicates,
+        "architecture_num": architecture_num,
+        "num_architectures": num_architectures,
+        "train_peptide_hash": train_peptide_hash.hexdigest(),
+    }
+
     numpy.testing.assert_equal(
         predictor.manifest_df.shape[0], len(predictor.class1_pan_allele_models))
     predictor.add_pan_allele_model(model, models_dir_for_save=save_to)
diff --git a/test/test_train_pan_allele_models_command.py b/test/test_train_pan_allele_models_command.py
index f4142a1d..f98599ff 100644
--- a/test/test_train_pan_allele_models_command.py
+++ b/test/test_train_pan_allele_models_command.py
@@ -101,7 +101,7 @@ HYPERPARAMETERS_LIST = [
 ][1:]
 
 
-def run_and_check(n_jobs=0):
+def run_and_check(n_jobs=0, delete=True):
     models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
     hyperparameters_filename = os.path.join(
         models_dir, "hyperparameters.yaml")
@@ -140,8 +140,9 @@ def run_and_check(n_jobs=0):
             alleles=["HLA-A*02:01"])
     print(df)
 
-    print("Deleting: %s" % models_dir)
-    shutil.rmtree(models_dir)
+    if delete:
+        print("Deleting: %s" % models_dir)
+        shutil.rmtree(models_dir)
 
 
 if os.environ.get("KERAS_BACKEND") != "theano":
@@ -153,5 +154,6 @@ if os.environ.get("KERAS_BACKEND") != "theano":
 def test_run_serial():
     run_and_check(n_jobs=0)
 
+
 if __name__ == "__main__":
-    test_run_serial()
\ No newline at end of file
+    run_and_check(n_jobs=0, delete=False)
-- 
GitLab