From 2143c288acbe2e278dbf08d8b92e5e6e54a598da Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Mon, 2 Sep 2019 14:48:56 -0400 Subject: [PATCH] fix --- mhcflurry/allele_encoding.py | 9 +++ mhcflurry/class1_affinity_predictor.py | 13 +++ mhcflurry/class1_neural_network.py | 11 +++ test/test_speed.py | 106 ++++++++++++++++++++++--- 4 files changed, 126 insertions(+), 13 deletions(-) diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py index bfd61ec5..c799297a 100644 --- a/mhcflurry/allele_encoding.py +++ b/mhcflurry/allele_encoding.py @@ -53,11 +53,20 @@ class AlleleEncoding(object): a for a in alleles if a not in self.allele_to_index)) self.indices = alleles.map(self.allele_to_index) assert not self.indices.isnull().any() + self.alleles = alleles else: self.indices = None + self.alleles = None self.encoding_cache = {} + def compact(self): + return AlleleEncoding( + alleles=self.alleles, + allele_to_sequence=dict( + (allele, self.allele_to_sequence[allele]) + for allele in self.alleles.unique())) + def allele_representations(self, encoding_name): if self.borrow_from is not None: return self.borrow_from.allele_representations(encoding_name) diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index b3a54bfe..f85f4cfe 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -1020,6 +1020,15 @@ class Class1AffinityPredictor(object): allele_encoding = AlleleEncoding( df.normalized_allele, borrow_from=master_allele_encoding) + + # The following line is a performance optimization that may be + # revisited. It causes the neural network to be set to include + # only the alleles actually being predicted for. This makes + # the network much smaller. However, subsequent calls to + # predict will need to reset these weights, so there is a + # tradeoff. 
+ allele_encoding = allele_encoding.compact() + for (i, model) in enumerate(self.class1_pan_allele_models): predictions_array[:, i] = ( model.predict( @@ -1030,6 +1039,10 @@ class Class1AffinityPredictor(object): masked_allele_encoding = AlleleEncoding( df.loc[mask].normalized_allele, borrow_from=master_allele_encoding) + + # See above performance note. + masked_allele_encoding = masked_allele_encoding.compact() + masked_peptides = peptides.sequences[mask] for (i, model) in enumerate(self.class1_pan_allele_models): predictions_array[mask, i] = model.predict( diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index 44f34d4f..30e3cc62 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -1218,6 +1218,17 @@ class Class1NeuralNetwork(object): # the allele sequences) are allowed. assert existing_weights.shape[1:] == reshaped.shape[1:] + if existing_weights.shape[0] > reshaped.shape[0]: + # Extend with NaNs so we can avoid having to reshape the weights + # matrix, which is expensive. + reshaped = numpy.append( + reshaped, + numpy.ones([ + existing_weights.shape[0] - reshaped.shape[0], + reshaped.shape[1] + ]) * numpy.nan, + axis=0) + if existing_weights.shape != reshaped.shape: # Network surgery required. Make a new network with this layer's # dimensions changed. Kind of a hack. 
diff --git a/test/test_speed.py b/test/test_speed.py index 79d7ab64..e9c342a6 100644 --- a/test/test_speed.py +++ b/test/test_speed.py @@ -1,21 +1,37 @@ +""" +Profile prediction speed + +""" import numpy numpy.random.seed(0) import time import cProfile import pstats import collections +import argparse +import sys import pandas from mhcflurry import Class1AffinityPredictor from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry.common import random_peptides +from mhcflurry.downloads import get_path + +ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load( + get_path("models_class1", "models")) + +PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load( + get_path("models_class1_pan", "models.with_mass_spec")) -DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load() +DEFAULT_NUM_PREDICTIONS = 10000 -NUM = 10000 -def test_speed(profile=False): +def test_speed_allele_specific( + profile=False, + predictor=ALLELE_SPECIFIC_PREDICTOR, + num=DEFAULT_NUM_PREDICTIONS): + starts = collections.OrderedDict() timings = collections.OrderedDict() profilers = collections.OrderedDict() @@ -31,15 +47,14 @@ def test_speed(profile=False): if profile: profilers[name].disable() - start("first") - DOWNLOADED_PREDICTOR.predict(["SIINFEKL"], allele="HLA-A*02:01") + predictor.predict(["SIINFEKL"], allele="HLA-A*02:01") end("first") - peptides = random_peptides(NUM) - start("pred_%d" % NUM) - DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") - end("pred_%d" % NUM) + peptides = random_peptides(num) + start("pred_%d" % num) + predictor.predict(peptides, allele="HLA-A*02:01") + end("pred_%d" % num) NUM2 = 10000 peptides = EncodableSequences.create(random_peptides(NUM2, length=13)) @@ -48,13 +63,13 @@ def test_speed(profile=False): end("encode_blosum_%d" % NUM2) start("pred_already_encoded_%d" % NUM2) - DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") + predictor.predict(peptides, allele="HLA-A*02:01") end("pred_already_encoded_%d" % NUM2) NUM_REPEATS = 100 
start("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS)) for _ in range(NUM_REPEATS): - DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") + predictor.predict(peptides, allele="HLA-A*02:01") end("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS)) print("SPEED BENCHMARK") @@ -64,12 +79,77 @@ def test_speed(profile=False): (key, pstats.Stats(value)) for (key, value) in profilers.items()) +def test_speed_pan_allele( + profile=False, + predictor=PAN_ALLELE_PREDICTOR, + num=DEFAULT_NUM_PREDICTIONS): + + starts = collections.OrderedDict() + timings = collections.OrderedDict() + profilers = collections.OrderedDict() + + def start(name): + starts[name] = time.time() + if profile: + profilers[name] = cProfile.Profile() + profilers[name].enable() + + def end(name): + timings[name] = time.time() - starts[name] + if profile: + profilers[name].disable() + + start("first") + predictor.predict(["SIINFEKL"], allele="HLA-A*02:01") + end("first") + + peptides = random_peptides(num) + start("pred_%d" % num) + predictor.predict(peptides, allele="HLA-A*02:01") + end("pred_%d" % num) + + print("SPEED BENCHMARK") + print("Results:\n%s" % str(pandas.Series(timings))) + + return dict( + (key, pstats.Stats(value)) for (key, value) in profilers.items()) + + +parser = argparse.ArgumentParser(usage=__doc__) +parser.add_argument( + "--predictor", + nargs="+", + choices=["allele-specific", "pan-allele"], + default=["allele-specific", "pan-allele"], + help="Which predictors to run") + +parser.add_argument( + "--num-predictions", + type=int, + default=DEFAULT_NUM_PREDICTIONS, + help="Number of predictions to run") + if __name__ == '__main__': # If run directly from python, do profiling and leave the user in a shell # to explore results. 
- result = test_speed(profile=True) - result["pred_%d" % NUM].sort_stats("cumtime").reverse_order().print_stats() + args = parser.parse_args(sys.argv[1:]) + + if "allele-specific" in args.predictor: + print("Running allele-specific test") + result = test_speed_allele_specific( + profile=True, num=args.num_predictions) + result[ + "pred_%d" % args.num_predictions + ].sort_stats("cumtime").reverse_order().print_stats() + + if "pan-allele" in args.predictor: + print("Running pan-allele test") + result = test_speed_pan_allele( + profile=True, num=args.num_predictions) + result[ + "pred_%d" % args.num_predictions + ].sort_stats("cumtime").reverse_order().print_stats() # Leave in ipython locals().update(result) -- GitLab