Skip to content
Snippets Groups Projects
Commit 2143c288 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fix

parent a7158f6e
No related branches found
No related tags found
No related merge requests found
...@@ -53,11 +53,20 @@ class AlleleEncoding(object): ...@@ -53,11 +53,20 @@ class AlleleEncoding(object):
a for a in alleles if a not in self.allele_to_index)) a for a in alleles if a not in self.allele_to_index))
self.indices = alleles.map(self.allele_to_index) self.indices = alleles.map(self.allele_to_index)
assert not self.indices.isnull().any() assert not self.indices.isnull().any()
self.alleles = alleles
else: else:
self.indices = None self.indices = None
self.alleles = None
self.encoding_cache = {} self.encoding_cache = {}
def compact(self):
    """
    Return a new AlleleEncoding restricted to the alleles in use.

    The new encoding is built from the same ``alleles`` as this one, but
    its ``allele_to_sequence`` mapping holds entries only for the distinct
    values present in ``self.alleles``.

    Returns
    -------
    AlleleEncoding
    """
    sequences_in_use = {
        name: self.allele_to_sequence[name]
        for name in self.alleles.unique()
    }
    return AlleleEncoding(
        alleles=self.alleles,
        allele_to_sequence=sequences_in_use)
def allele_representations(self, encoding_name): def allele_representations(self, encoding_name):
if self.borrow_from is not None: if self.borrow_from is not None:
return self.borrow_from.allele_representations(encoding_name) return self.borrow_from.allele_representations(encoding_name)
......
...@@ -1020,6 +1020,15 @@ class Class1AffinityPredictor(object): ...@@ -1020,6 +1020,15 @@ class Class1AffinityPredictor(object):
allele_encoding = AlleleEncoding( allele_encoding = AlleleEncoding(
df.normalized_allele, df.normalized_allele,
borrow_from=master_allele_encoding) borrow_from=master_allele_encoding)
# The following line is a performance optimization that may be
# revisited. It causes the neural network to be set up to include
# only the alleles actually being predicted for. This makes
# the network much smaller. However, subsequent calls to
# predict will need to reset these weights, so there is a
# tradeoff.
allele_encoding = allele_encoding.compact()
for (i, model) in enumerate(self.class1_pan_allele_models): for (i, model) in enumerate(self.class1_pan_allele_models):
predictions_array[:, i] = ( predictions_array[:, i] = (
model.predict( model.predict(
...@@ -1030,6 +1039,10 @@ class Class1AffinityPredictor(object): ...@@ -1030,6 +1039,10 @@ class Class1AffinityPredictor(object):
masked_allele_encoding = AlleleEncoding( masked_allele_encoding = AlleleEncoding(
df.loc[mask].normalized_allele, df.loc[mask].normalized_allele,
borrow_from=master_allele_encoding) borrow_from=master_allele_encoding)
# See above performance note.
masked_allele_encoding = masked_allele_encoding.compact()
masked_peptides = peptides.sequences[mask] masked_peptides = peptides.sequences[mask]
for (i, model) in enumerate(self.class1_pan_allele_models): for (i, model) in enumerate(self.class1_pan_allele_models):
predictions_array[mask, i] = model.predict( predictions_array[mask, i] = model.predict(
......
...@@ -1218,6 +1218,17 @@ class Class1NeuralNetwork(object): ...@@ -1218,6 +1218,17 @@ class Class1NeuralNetwork(object):
# the allele sequences) are allowed. # the allele sequences) are allowed.
assert existing_weights.shape[1:] == reshaped.shape[1:] assert existing_weights.shape[1:] == reshaped.shape[1:]
if existing_weights.shape[0] > reshaped.shape[0]:
# Extend with NaNs so we can avoid having to reshape the weights
# matrix, which is expensive.
reshaped = numpy.append(
reshaped,
numpy.ones([
existing_weights.shape[0] - reshaped.shape[0],
reshaped.shape[1]
]) * numpy.nan,
axis=0)
if existing_weights.shape != reshaped.shape: if existing_weights.shape != reshaped.shape:
# Network surgery required. Make a new network with this layer's # Network surgery required. Make a new network with this layer's
# dimensions changed. Kind of a hack. # dimensions changed. Kind of a hack.
......
"""
Profile prediction speed
"""
import numpy import numpy
numpy.random.seed(0) numpy.random.seed(0)
import time import time
import cProfile import cProfile
import pstats import pstats
import collections import collections
import argparse
import sys
import pandas import pandas
from mhcflurry import Class1AffinityPredictor from mhcflurry import Class1AffinityPredictor
from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry.encodable_sequences import EncodableSequences
from mhcflurry.common import random_peptides from mhcflurry.common import random_peptides
from mhcflurry.downloads import get_path
ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load(
get_path("models_class1", "models"))
PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
get_path("models_class1_pan", "models.with_mass_spec"))
DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load() DEFAULT_NUM_PREDICTIONS = 10000
NUM = 10000
def test_speed(profile=False): def test_speed_allele_specific(
profile=False,
predictor=ALLELE_SPECIFIC_PREDICTOR,
num=DEFAULT_NUM_PREDICTIONS):
starts = collections.OrderedDict() starts = collections.OrderedDict()
timings = collections.OrderedDict() timings = collections.OrderedDict()
profilers = collections.OrderedDict() profilers = collections.OrderedDict()
...@@ -31,15 +47,14 @@ def test_speed(profile=False): ...@@ -31,15 +47,14 @@ def test_speed(profile=False):
if profile: if profile:
profilers[name].disable() profilers[name].disable()
start("first") start("first")
DOWNLOADED_PREDICTOR.predict(["SIINFEKL"], allele="HLA-A*02:01") predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
end("first") end("first")
peptides = random_peptides(NUM) peptides = random_peptides(num)
start("pred_%d" % NUM) start("pred_%d" % num)
DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") predictor.predict(peptides, allele="HLA-A*02:01")
end("pred_%d" % NUM) end("pred_%d" % num)
NUM2 = 10000 NUM2 = 10000
peptides = EncodableSequences.create(random_peptides(NUM2, length=13)) peptides = EncodableSequences.create(random_peptides(NUM2, length=13))
...@@ -48,13 +63,13 @@ def test_speed(profile=False): ...@@ -48,13 +63,13 @@ def test_speed(profile=False):
end("encode_blosum_%d" % NUM2) end("encode_blosum_%d" % NUM2)
start("pred_already_encoded_%d" % NUM2) start("pred_already_encoded_%d" % NUM2)
DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") predictor.predict(peptides, allele="HLA-A*02:01")
end("pred_already_encoded_%d" % NUM2) end("pred_already_encoded_%d" % NUM2)
NUM_REPEATS = 100 NUM_REPEATS = 100
start("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS)) start("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS))
for _ in range(NUM_REPEATS): for _ in range(NUM_REPEATS):
DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") predictor.predict(peptides, allele="HLA-A*02:01")
end("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS)) end("pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS))
print("SPEED BENCHMARK") print("SPEED BENCHMARK")
...@@ -64,12 +79,77 @@ def test_speed(profile=False): ...@@ -64,12 +79,77 @@ def test_speed(profile=False):
(key, pstats.Stats(value)) for (key, value) in profilers.items()) (key, pstats.Stats(value)) for (key, value) in profilers.items())
def test_speed_pan_allele(
        profile=False,
        predictor=PAN_ALLELE_PREDICTOR,
        num=DEFAULT_NUM_PREDICTIONS):
    """
    Time predictions made by `predictor` and print the timings.

    Two stages are timed: a single prediction for the peptide "SIINFEKL"
    (stage "first") and a batch of `num` random peptides (stage
    "pred_<num>"), both for allele HLA-A*02:01.

    Parameters
    ----------
    profile : bool
        If True, each stage is also run under its own cProfile.Profile.
    predictor : object with a ``predict(peptides, allele=...)`` method
    num : int
        Number of random peptides in the batch stage.

    Returns
    -------
    dict of stage name -> pstats.Stats
        Empty when `profile` is False.
    """
    began_at = collections.OrderedDict()
    elapsed = collections.OrderedDict()
    profiler_by_stage = collections.OrderedDict()

    def begin(stage):
        # Record the wall-clock start and, if requested, start profiling.
        began_at[stage] = time.time()
        if not profile:
            return
        profiler_by_stage[stage] = cProfile.Profile()
        profiler_by_stage[stage].enable()

    def finish(stage):
        # Record elapsed wall-clock time and stop the stage's profiler.
        elapsed[stage] = time.time() - began_at[stage]
        if not profile:
            return
        profiler_by_stage[stage].disable()

    # The very first call is timed separately from the batch call.
    begin("first")
    predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
    finish("first")

    # Peptide generation happens before the timer starts.
    peptides = random_peptides(num)
    batch_stage = "pred_%d" % num
    begin(batch_stage)
    predictor.predict(peptides, allele="HLA-A*02:01")
    finish(batch_stage)

    print("SPEED BENCHMARK")
    timing_table = str(pandas.Series(elapsed))
    print("Results:\n%s" % timing_table)

    return {
        stage: pstats.Stats(profiler)
        for (stage, profiler) in profiler_by_stage.items()
    }
# Command-line options used when this module is run as a script. The usage
# text shown by argparse is the module docstring.
parser = argparse.ArgumentParser(usage=__doc__)
# One or more predictor kinds may be listed; both are selected by default.
parser.add_argument(
"--predictor",
nargs="+",
choices=["allele-specific", "pan-allele"],
default=["allele-specific", "pan-allele"],
help="Which predictors to run")
# Size of the random-peptide batch passed to each selected benchmark.
parser.add_argument(
"--num-predictions",
type=int,
default=DEFAULT_NUM_PREDICTIONS,
help="Number of predictions to run")
if __name__ == '__main__': if __name__ == '__main__':
# If run directly from python, do profiling and leave the user in a shell # If run directly from python, do profiling and leave the user in a shell
# to explore results. # to explore results.
result = test_speed(profile=True) args = parser.parse_args(sys.argv[1:])
result["pred_%d" % NUM].sort_stats("cumtime").reverse_order().print_stats()
if "allele-specific" in args.predictor:
print("Running allele-specific test")
result = test_speed_allele_specific(
profile=True, num=args.num_predictions)
result[
"pred_%d" % args.num_predictions
].sort_stats("cumtime").reverse_order().print_stats()
if "pan-allele" in args.predictor:
print("Running pan-allele test")
result = test_speed_pan_allele(
profile=True, num=args.num_predictions)
result[
"pred_%d" % args.num_predictions
].sort_stats("cumtime").reverse_order().print_stats()
# Leave in ipython # Leave in ipython
locals().update(result) locals().update(result)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment