From 2143c288acbe2e278dbf08d8b92e5e6e54a598da Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 2 Sep 2019 14:48:56 -0400
Subject: [PATCH] Compact allele encoding when predicting with pan-allele models

---
 mhcflurry/allele_encoding.py           |   9 +++
 mhcflurry/class1_affinity_predictor.py |  13 +++
 mhcflurry/class1_neural_network.py     |  11 +++
 test/test_speed.py                     | 106 ++++++++++++++++++++++---
 4 files changed, 126 insertions(+), 13 deletions(-)

diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py
index bfd61ec5..c799297a 100644
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -53,11 +53,20 @@ class AlleleEncoding(object):
                     a for a in alleles if a not in self.allele_to_index))
             self.indices = alleles.map(self.allele_to_index)
             assert not self.indices.isnull().any()
+            self.alleles = alleles
         else:
             self.indices = None
+            self.alleles = None
 
         self.encoding_cache = {}
 
+    def compact(self):
+        return AlleleEncoding(
+            alleles=self.alleles,
+            allele_to_sequence=dict(
+                (allele, self.allele_to_sequence[allele])
+                for allele in self.alleles.unique()))
+
     def allele_representations(self, encoding_name):
         if self.borrow_from is not None:
             return self.borrow_from.allele_representations(encoding_name)
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index b3a54bfe..f85f4cfe 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -1020,6 +1020,15 @@ class Class1AffinityPredictor(object):
                 allele_encoding = AlleleEncoding(
                     df.normalized_allele,
                     borrow_from=master_allele_encoding)
+
+                # The following line is a performance optimization that may be
+                # revisited. It causes the neural network to be set to include
+                # only the alleles actually being predicted for. This makes
+                # the network much smaller. However, subsequent calls to
+                # predict will need to reset these weights, so there is a
+                # tradeoff.
+                allele_encoding = allele_encoding.compact()
+
                 for (i, model) in enumerate(self.class1_pan_allele_models):
                     predictions_array[:, i] = (
                         model.predict(
@@ -1030,6 +1039,10 @@ class Class1AffinityPredictor(object):
                 masked_allele_encoding = AlleleEncoding(
                     df.loc[mask].normalized_allele,
                     borrow_from=master_allele_encoding)
+
+                # See above performance note.
+                masked_allele_encoding = masked_allele_encoding.compact()
+
                 masked_peptides = peptides.sequences[mask]
                 for (i, model) in enumerate(self.class1_pan_allele_models):
                     predictions_array[mask, i] = model.predict(
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 44f34d4f..30e3cc62 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -1218,6 +1218,17 @@ class Class1NeuralNetwork(object):
         # the allele sequences) are allowed.
         assert existing_weights.shape[1:] == reshaped.shape[1:]
 
+        if existing_weights.shape[0] > reshaped.shape[0]:
+            # Pad with NaN rows to match the existing weights' shape, so we can
+            # avoid the network surgery below, which is expensive.
+            reshaped = numpy.append(
+                reshaped,
+                numpy.ones([
+                    existing_weights.shape[0] - reshaped.shape[0],
+                    reshaped.shape[1]
+                ]) * numpy.nan,
+                axis=0)
+
         if existing_weights.shape != reshaped.shape:
             # Network surgery required. Make a new network with this layer's
             # dimensions changed. Kind of a hack.
diff --git a/test/test_speed.py b/test/test_speed.py
index 79d7ab64..e9c342a6 100644
--- a/test/test_speed.py
+++ b/test/test_speed.py
@@ -1,21 +1,37 @@
+"""
+Profile prediction speed
+
+"""
 import numpy
 numpy.random.seed(0)
 import time
 import cProfile
 import pstats
 import collections
+import argparse
+import sys
 
 import pandas
 
 from mhcflurry import Class1AffinityPredictor
 from mhcflurry.encodable_sequences import EncodableSequences
 from mhcflurry.common import random_peptides
+from mhcflurry.downloads import get_path
+
+ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load(
+    get_path("models_class1", "models"))
+
+PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
+    get_path("models_class1_pan", "models.with_mass_spec"))
 
-DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()
+DEFAULT_NUM_PREDICTIONS = 10000
 
-NUM = 10000
 
-def test_speed(profile=False):
+def test_speed_allele_specific(
+        profile=False,
+        predictor=ALLELE_SPECIFIC_PREDICTOR,
+        num=DEFAULT_NUM_PREDICTIONS):
+
     starts = collections.OrderedDict()
     timings = collections.OrderedDict()
     profilers = collections.OrderedDict()
@@ -31,15 +47,14 @@ def test_speed(profile=False):
         if profile:
             profilers[name].disable()
 
-
     start("first")
-    DOWNLOADED_PREDICTOR.predict(["SIINFEKL"], allele="HLA-A*02:01")
+    predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
     end("first")
 
-    peptides = random_peptides(NUM)
-    start("pred_%d" % NUM)
-    DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01")
-    end("pred_%d" % NUM)
+    peptides = random_peptides(num)
+    start("pred_%d" % num)
+    predictor.predict(peptides, allele="HLA-A*02:01")
+    end("pred_%d" % num)
 
     NUM2 = 10000
     peptides = EncodableSequences.create(random_peptides(NUM2, length=13))
@@ -48,13 +63,13 @@ def test_speed(profile=False):
     end("encode_blosum_%d" % NUM2)
 
     start("pred_already_encoded_%d" % NUM2)
-    DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01")
+    predictor.predict(peptides, allele="HLA-A*02:01")
     end("pred_already_encoded_%d" % NUM2)
 
     NUM_REPEATS = 100
     start("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS))
     for _ in range(NUM_REPEATS):
-        DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01")
+        predictor.predict(peptides, allele="HLA-A*02:01")
     end("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS))
 
     print("SPEED BENCHMARK")
@@ -64,12 +79,77 @@ def test_speed(profile=False):
         (key, pstats.Stats(value)) for (key, value) in profilers.items())
 
 
+def test_speed_pan_allele(
+        profile=False,
+        predictor=PAN_ALLELE_PREDICTOR,
+        num=DEFAULT_NUM_PREDICTIONS):
+
+    starts = collections.OrderedDict()
+    timings = collections.OrderedDict()
+    profilers = collections.OrderedDict()
+
+    def start(name):
+        starts[name] = time.time()
+        if profile:
+            profilers[name] = cProfile.Profile()
+            profilers[name].enable()
+
+    def end(name):
+        timings[name] = time.time() - starts[name]
+        if profile:
+            profilers[name].disable()
+
+    start("first")
+    predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
+    end("first")
+
+    peptides = random_peptides(num)
+    start("pred_%d" % num)
+    predictor.predict(peptides, allele="HLA-A*02:01")
+    end("pred_%d" % num)
+
+    print("SPEED BENCHMARK")
+    print("Results:\n%s" % str(pandas.Series(timings)))
+
+    return dict(
+        (key, pstats.Stats(value)) for (key, value) in profilers.items())
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+parser.add_argument(
+    "--predictor",
+    nargs="+",
+    choices=["allele-specific", "pan-allele"],
+    default=["allele-specific", "pan-allele"],
+    help="Which predictors to run")
+
+parser.add_argument(
+    "--num-predictions",
+    type=int,
+    default=DEFAULT_NUM_PREDICTIONS,
+    help="Number of predictions to run")
+
 if __name__ == '__main__':
     # If run directly from python, do profiling and leave the user in a shell
     # to explore results.
 
-    result = test_speed(profile=True)
-    result["pred_%d" % NUM].sort_stats("cumtime").reverse_order().print_stats()
+    args = parser.parse_args(sys.argv[1:])
+
+    if "allele-specific" in args.predictor:
+        print("Running allele-specific test")
+        result = test_speed_allele_specific(
+            profile=True, num=args.num_predictions)
+        result[
+            "pred_%d" % args.num_predictions
+        ].sort_stats("cumtime").reverse_order().print_stats()
+
+    if "pan-allele" in args.predictor:
+        print("Running pan-allele test")
+        result = test_speed_pan_allele(
+            profile=True, num=args.num_predictions)
+        result[
+            "pred_%d" % args.num_predictions
+        ].sort_stats("cumtime").reverse_order().print_stats()
 
     # Leave in ipython
     locals().update(result)
-- 
GitLab