fix

2143c288 · Tim O'Donnell · a7158f6e · 2143c288 · 2143c288 · 2143c288
Commit 2143c288 authored 5 years ago by Tim O'Donnell
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -53,11 +53,20 @@ class AlleleEncoding(object):
                    a for a in alleles if a not in self.allele_to_index))
            self.indices = alleles.map(self.allele_to_index)
            assert not self.indices.isnull().any()
+            self.alleles = alleles
        else:
            self.indices = None
+            self.alleles = None
        self.encoding_cache = {}
+    def compact(self):
+        return AlleleEncoding(
+            alleles=self.alleles,
+            allele_to_sequence=dict(
+                (allele, self.allele_to_sequence[allele])
+                for allele in self.alleles.unique()))
    def allele_representations(self, encoding_name):
        if self.borrow_from is not None:
            return self.borrow_from.allele_representations(encoding_name)

--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -1020,6 +1020,15 @@ class Class1AffinityPredictor(object):
                allele_encoding = AlleleEncoding(
                    df.normalized_allele,
                    borrow_from=master_allele_encoding)
+                # The following line is a performance optimization that may be
+                # revisited. It causes the neural network to be set up to
+                # include only the alleles actually being predicted for. This
+                # makes the network much smaller. However, subsequent calls to
+                # predict will need to reset these weights, so there is a
+                # tradeoff.
+                allele_encoding = allele_encoding.compact()
                for (i, model) in enumerate(self.class1_pan_allele_models):
                    predictions_array[:, i] = (
                        model.predict(
@@ -1030,6 +1039,10 @@ class Class1AffinityPredictor(object):
                masked_allele_encoding = AlleleEncoding(
                    df.loc[mask].normalized_allele,
                    borrow_from=master_allele_encoding)
+                # See above performance note.
+                masked_allele_encoding = masked_allele_encoding.compact()
                masked_peptides = peptides.sequences[mask]
                for (i, model) in enumerate(self.class1_pan_allele_models):
                    predictions_array[mask, i] = model.predict(

--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -1218,6 +1218,17 @@ class Class1NeuralNetwork(object):
        # the allele sequences) are allowed.
        assert existing_weights.shape[1:] == reshaped.shape[1:]
+        if existing_weights.shape[0] > reshaped.shape[0]:
+            # Extend with NaNs so we can avoid having to reshape the weights
+            # matrix, which is expensive.
+            reshaped = numpy.append(
+                reshaped,
+                numpy.ones([
+                    existing_weights.shape[0] - reshaped.shape[0],
+                    reshaped.shape[1]
+                ]) * numpy.nan,
+                axis=0)
        if existing_weights.shape != reshaped.shape:
            # Network surgery required. Make a new network with this layer's
            # dimensions changed. Kind of a hack.

--- a/test/test_speed.py
+++ b/test/test_speed.py
+"""
+Profile prediction speed
+"""
 import numpy
 numpy.random.seed(0)
 import time
 import cProfile
 import pstats
 import collections
+import argparse
+import sys
 import pandas
 from mhcflurry import Class1AffinityPredictor
 from mhcflurry.encodable_sequences import EncodableSequences
 from mhcflurry.common import random_peptides
+from mhcflurry.downloads import get_path
+ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load(
+    get_path("models_class1", "models"))
+PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
+    get_path("models_class1_pan", "models.with_mass_spec"))
-DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()
+DEFAULT_NUM_PREDICTIONS = 10000
-NUM = 10000
-def test_speed(profile=False):
+def test_speed_allele_specific(
+        profile=False,
+        predictor=ALLELE_SPECIFIC_PREDICTOR,
+        num=DEFAULT_NUM_PREDICTIONS):
    starts = collections.OrderedDict()
    timings = collections.OrderedDict()
    profilers = collections.OrderedDict()
@@ -31,15 +47,14 @@ def test_speed(profile=False):
        if profile:
            profilers[name].disable()
    start("first")
-    DOWNLOADED_PREDICTOR.predict(["SIINFEKL"], allele="HLA-A*02:01")
+    predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
    end("first")
-    peptides = random_peptides(NUM)
+    peptides = random_peptides(num)
-    start("pred_%d" % NUM)
+    start("pred_%d" % num)
-    DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01")
+    predictor.predict(peptides, allele="HLA-A*02:01")
-    end("pred_%d" % NUM)
+    end("pred_%d" % num)
    NUM2 = 10000
    peptides = EncodableSequences.create(random_peptides(NUM2, length=13))
@@ -48,13 +63,13 @@ def test_speed(profile=False):
    end("encode_blosum_%d" % NUM2)
    start("pred_already_encoded_%d" % NUM2)
-    DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01")
+    predictor.predict(peptides, allele="HLA-A*02:01")
    end("pred_already_encoded_%d" % NUM2)
    NUM_REPEATS = 100
    start("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS))
    for _ in range(NUM_REPEATS):
-        DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01")
+        predictor.predict(peptides, allele="HLA-A*02:01")
    end("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS))
    print("SPEED BENCHMARK")
@@ -64,12 +79,77 @@ def test_speed(profile=False):
        (key, pstats.Stats(value)) for (key, value) in profilers.items())
+def test_speed_pan_allele(
+        profile=False,
+        predictor=PAN_ALLELE_PREDICTOR,
+        num=DEFAULT_NUM_PREDICTIONS):
+    starts = collections.OrderedDict()
+    timings = collections.OrderedDict()
+    profilers = collections.OrderedDict()
+    def start(name):
+        starts[name] = time.time()
+        if profile:
+            profilers[name] = cProfile.Profile()
+            profilers[name].enable()
+    def end(name):
+        timings[name] = time.time() - starts[name]
+        if profile:
+            profilers[name].disable()
+    start("first")
+    predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
+    end("first")
+    peptides = random_peptides(num)
+    start("pred_%d" % num)
+    predictor.predict(peptides, allele="HLA-A*02:01")
+    end("pred_%d" % num)
+    print("SPEED BENCHMARK")
+    print("Results:\n%s" % str(pandas.Series(timings)))
+    return dict(
+        (key, pstats.Stats(value)) for (key, value) in profilers.items())
+parser = argparse.ArgumentParser(usage=__doc__)
+parser.add_argument(
+    "--predictor",
+    nargs="+",
+    choices=["allele-specific", "pan-allele"],
+    default=["allele-specific", "pan-allele"],
+    help="Which predictors to run")
+parser.add_argument(
+    "--num-predictions",
+    type=int,
+    default=DEFAULT_NUM_PREDICTIONS,
+    help="Number of predictions to run")
 if __name__ == '__main__':
    # If run directly from python, do profiling and leave the user in a shell
    # to explore results.
-    result = test_speed(profile=True)
+    args = parser.parse_args(sys.argv[1:])
-    result["pred_%d" % NUM].sort_stats("cumtime").reverse_order().print_stats()
+    if "allele-specific" in args.predictor:
+        print("Running allele-specific test")
+        result = test_speed_allele_specific(
+            profile=True, num=args.num_predictions)
+        result[
+            "pred_%d" % args.num_predictions
+        ].sort_stats("cumtime").reverse_order().print_stats()
+    if "pan-allele" in args.predictor:
+        print("Running pan-allele test")
+        result = test_speed_pan_allele(
+            profile=True, num=args.num_predictions)
+        result[
+            "pred_%d" % args.num_predictions
+        ].sort_stats("cumtime").reverse_order().print_stats()
    # Expose the profiling results as top-level names for interactive exploration (e.g. in ipython).
    locals().update(result)