Skip to content
Snippets Groups Projects
Unverified Commit 804ff270 authored by Tim O'Donnell's avatar Tim O'Donnell Committed by GitHub
Browse files

Merge pull request #163 from openvax/1.6.1

1.6.1
parents 9d9861af c81bcbcd
No related branches found
Tags 1.6.1
No related merge requests found
...@@ -22,7 +22,7 @@ MHC ligands. ...@@ -22,7 +22,7 @@ MHC ligands.
If you find MHCflurry useful in your research please cite: If you find MHCflurry useful in your research please cite:
> T. O'Donnell, A. Rubinsteyn, U. Laserson. "Improved predictive models of peptide presentation on MHC I". *biorxiv*, 2020. https://www.biorxiv.org/content/10.1101/2020.03.28.013714v1 > T. O'Donnell, A. Rubinsteyn, U. Laserson. "A model of antigen processing improves prediction of MHC I-presented peptides". *biorxiv*, 2020. https://www.biorxiv.org/content/10.1101/2020.03.28.013714v2
> T. O’Donnell, A. Rubinsteyn, M. Bonsack, A. B. Riemer, U. Laserson, and J. Hammerbacher, "MHCflurry: Open-Source Class I MHC Binding Affinity Prediction," *Cell Systems*, 2018. https://www.cell.com/cell-systems/fulltext/S2405-4712(18)30232-1. > T. O’Donnell, A. Rubinsteyn, M. Bonsack, A. B. Riemer, U. Laserson, and J. Hammerbacher, "MHCflurry: Open-Source Class I MHC Binding Affinity Prediction," *Cell Systems*, 2018. https://www.cell.com/cell-systems/fulltext/S2405-4712(18)30232-1.
......
...@@ -43,6 +43,9 @@ COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown" ...@@ -43,6 +43,9 @@ COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown"
AMINO_ACID_INDEX = dict( AMINO_ACID_INDEX = dict(
(letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN)) (letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN))
for (letter, i) in list(AMINO_ACID_INDEX.items()):
AMINO_ACID_INDEX[letter.lower()] = i # Support lower-case as well.
AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys()) AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
BLOSUM62_MATRIX = pandas.read_csv(StringIO(""" BLOSUM62_MATRIX = pandas.read_csv(StringIO("""
......
...@@ -453,7 +453,19 @@ class Class1AffinityPredictor(object): ...@@ -453,7 +453,19 @@ class Class1AffinityPredictor(object):
`Class1AffinityPredictor` instance `Class1AffinityPredictor` instance
""" """
if models_dir is None: if models_dir is None:
models_dir = get_default_class1_models_dir() try:
models_dir = get_default_class1_models_dir()
except RuntimeError as e:
# Fall back to the affinity predictor included in presentation
# predictor if possible.
from mhcflurry.class1_presentation_predictor import (
Class1PresentationPredictor)
try:
presentation_predictor = Class1PresentationPredictor.load()
return presentation_predictor.affinity_predictor
except RuntimeError:
raise e
if optimization_level is None: if optimization_level is None:
optimization_level = OPTIMIZATION_LEVEL optimization_level = OPTIMIZATION_LEVEL
......
...@@ -234,7 +234,7 @@ class Class1PresentationPredictor(object): ...@@ -234,7 +234,7 @@ class Class1PresentationPredictor(object):
return result_df return result_df
def predict_processing( def predict_processing(
self, peptides, n_flanks=None, c_flanks=None, verbose=1): self, peptides, n_flanks=None, c_flanks=None, throw=True, verbose=1):
""" """
Predict antigen processing scores for individual peptides, optionally Predict antigen processing scores for individual peptides, optionally
including flanking sequences for better cleavage prediction. including flanking sequences for better cleavage prediction.
...@@ -244,6 +244,8 @@ class Class1PresentationPredictor(object): ...@@ -244,6 +244,8 @@ class Class1PresentationPredictor(object):
peptides : list of string peptides : list of string
n_flanks : list of string [same length as peptides] n_flanks : list of string [same length as peptides]
c_flanks : list of string [same length as peptides] c_flanks : list of string [same length as peptides]
throw : boolean
Whether to raise exception on unsupported peptides
verbose : int verbose : int
Returns Returns
...@@ -285,6 +287,7 @@ class Class1PresentationPredictor(object): ...@@ -285,6 +287,7 @@ class Class1PresentationPredictor(object):
peptides=peptide_chunk, peptides=peptide_chunk,
n_flanks=n_flank_chunk, n_flanks=n_flank_chunk,
c_flanks=c_flank_chunk, c_flanks=c_flank_chunk,
throw=throw,
batch_size=PREDICT_BATCH_SIZE) batch_size=PREDICT_BATCH_SIZE)
result_chunks.append(result_chunk) result_chunks.append(result_chunk)
return numpy.concatenate(result_chunks) return numpy.concatenate(result_chunks)
...@@ -441,7 +444,8 @@ class Class1PresentationPredictor(object): ...@@ -441,7 +444,8 @@ class Class1PresentationPredictor(object):
(up to 6) indicating the genotype. If you are predicting across (up to 6) indicating the genotype. If you are predicting across
multiple samples, pass a dict where the keys are (arbitrary) multiple samples, pass a dict where the keys are (arbitrary)
sample names and the values are the alleles to predict for that sample names and the values are the alleles to predict for that
sample. sample. Set to an empty list or dict to perform processing
prediction only.
sample_names : list of string [same length as peptides] sample_names : list of string [same length as peptides]
If you are passing a dict for 'alleles', you can use this If you are passing a dict for 'alleles', you can use this
argument to specify which peptides go with which samples. If it is argument to specify which peptides go with which samples. If it is
...@@ -503,17 +507,26 @@ class Class1PresentationPredictor(object): ...@@ -503,17 +507,26 @@ class Class1PresentationPredictor(object):
peptides=peptides, peptides=peptides,
n_flanks=n_flanks, n_flanks=n_flanks,
c_flanks=c_flanks, c_flanks=c_flanks,
throw=throw,
verbose=verbose) verbose=verbose)
df = self.predict_affinity( if alleles:
peptides=peptides, df = self.predict_affinity(
alleles=alleles, peptides=peptides,
sample_names=sample_names, # might be None alleles=alleles,
include_affinity_percentile=include_affinity_percentile, sample_names=sample_names, # might be None
verbose=verbose, include_affinity_percentile=include_affinity_percentile,
throw=throw) verbose=verbose,
throw=throw)
df["affinity_score"] = from_ic50(df.affinity)
else:
# Processing prediction only.
df = pandas.DataFrame({
"peptide_num": numpy.arange(len(peptides)),
"peptide": peptides,
})
df["affinity_score"] = from_ic50(df.affinity)
df["processing_score"] = df.peptide_num.map( df["processing_score"] = df.peptide_num.map(
pandas.Series(processing_scores)) pandas.Series(processing_scores))
if c_flanks is not None: if c_flanks is not None:
...@@ -523,12 +536,21 @@ class Class1PresentationPredictor(object): ...@@ -523,12 +536,21 @@ class Class1PresentationPredictor(object):
model_name = 'with_flanks' if n_flanks is not None else "without_flanks" model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
model = self.get_model(model_name) model = self.get_model(model_name)
if len(df) > 0: if "affinity_score" in df.columns:
df["presentation_score"] = model.predict_proba( if len(df) > 0:
df[self.model_inputs].values)[:,1] input_matrix = df[self.model_inputs]
else: null_mask = None
df["presentation_score"] = [] if not throw:
del df["affinity_score"] # Invalid peptides will be null.
null_mask = input_matrix.isnull().any(1)
input_matrix = input_matrix.fillna(0.0)
df["presentation_score"] = model.predict_proba(
input_matrix.values)[:,1]
if null_mask is not None:
df.loc[null_mask, "presentation_score"] = numpy.nan
else:
df["presentation_score"] = []
del df["affinity_score"]
return df return df
def predict_sequences( def predict_sequences(
...@@ -536,7 +558,7 @@ class Class1PresentationPredictor(object): ...@@ -536,7 +558,7 @@ class Class1PresentationPredictor(object):
sequences, sequences,
alleles, alleles,
result="best", result="best",
comparison_quantity="presentation_score", comparison_quantity=None,
filter_value=None, filter_value=None,
peptide_lengths=(8, 9, 10, 11), peptide_lengths=(8, 9, 10, 11),
use_flanks=True, use_flanks=True,
...@@ -593,7 +615,8 @@ class Class1PresentationPredictor(object): ...@@ -593,7 +615,8 @@ class Class1PresentationPredictor(object):
comparison_quantity : string comparison_quantity : string
One of "presentation_score", "processing_score", "affinity", or One of "presentation_score", "processing_score", "affinity", or
"affinity_percentile". Prediction to use to rank (if result is "affinity_percentile". Prediction to use to rank (if result is
"best") or filter (if result is "filtered") results. "best") or filter (if result is "filtered") results. Default is
"presentation_score".
filter_value : float filter_value : float
Threshold value to use, only relevant when result is "filtered". Threshold value to use, only relevant when result is "filtered".
If comparison_quantity is "affinity", then all results less than If comparison_quantity is "affinity", then all results less than
...@@ -618,8 +641,13 @@ class Class1PresentationPredictor(object): ...@@ -618,8 +641,13 @@ class Class1PresentationPredictor(object):
peptide, n_flank, c_flank, sequence_name, affinity, best_allele, peptide, n_flank, c_flank, sequence_name, affinity, best_allele,
processing_score, presentation_score processing_score, presentation_score
""" """
if len(alleles) == 0:
alleles = {}
if comparison_quantity is None: if comparison_quantity is None:
comparison_quantity = "presentation_score" comparison_quantity = (
"presentation_score"
if len(alleles) > 0 else "processing_score")
processing_predictor = self.processing_predictor_with_flanks processing_predictor = self.processing_predictor_with_flanks
if not use_flanks or processing_predictor is None: if not use_flanks or processing_predictor is None:
......
...@@ -294,6 +294,7 @@ class Class1ProcessingNeuralNetwork(object): ...@@ -294,6 +294,7 @@ class Class1ProcessingNeuralNetwork(object):
def predict_encoded( def predict_encoded(
self, self,
sequences, sequences,
throw=True,
batch_size=DEFAULT_PREDICT_BATCH_SIZE): batch_size=DEFAULT_PREDICT_BATCH_SIZE):
""" """
Predict antigen processing. Predict antigen processing.
...@@ -302,6 +303,8 @@ class Class1ProcessingNeuralNetwork(object): ...@@ -302,6 +303,8 @@ class Class1ProcessingNeuralNetwork(object):
---------- ----------
sequences : FlankingEncoding sequences : FlankingEncoding
Peptides and flanking sequences Peptides and flanking sequences
throw : boolean
Whether to throw exception on unsupported peptides
batch_size : int batch_size : int
Prediction keras batch size. Prediction keras batch size.
...@@ -309,13 +312,13 @@ class Class1ProcessingNeuralNetwork(object): ...@@ -309,13 +312,13 @@ class Class1ProcessingNeuralNetwork(object):
------- -------
numpy.array numpy.array
""" """
x_dict = self.network_input(sequences) x_dict = self.network_input(sequences, throw=throw)
raw_predictions = self.network().predict( raw_predictions = self.network().predict(
x_dict, batch_size=batch_size) x_dict, batch_size=batch_size)
predictions = numpy.squeeze(raw_predictions).astype("float64") predictions = numpy.squeeze(raw_predictions).astype("float64")
return predictions return predictions
def network_input(self, sequences): def network_input(self, sequences, throw=True):
""" """
Encode peptides to the fixed-length encoding expected by the neural Encode peptides to the fixed-length encoding expected by the neural
network (which depends on the architecture). network (which depends on the architecture).
...@@ -324,6 +327,8 @@ class Class1ProcessingNeuralNetwork(object): ...@@ -324,6 +327,8 @@ class Class1ProcessingNeuralNetwork(object):
---------- ----------
sequences : FlankingEncoding sequences : FlankingEncoding
Peptides and flanking sequences Peptides and flanking sequences
throw : boolean
Whether to throw exception on unsupported peptides
Returns Returns
------- -------
...@@ -334,7 +339,8 @@ class Class1ProcessingNeuralNetwork(object): ...@@ -334,7 +339,8 @@ class Class1ProcessingNeuralNetwork(object):
self.hyperparameters['peptide_max_length'], self.hyperparameters['peptide_max_length'],
n_flank_length=self.hyperparameters['n_flank_length'], n_flank_length=self.hyperparameters['n_flank_length'],
c_flank_length=self.hyperparameters['c_flank_length'], c_flank_length=self.hyperparameters['c_flank_length'],
allow_unsupported_amino_acids=True) allow_unsupported_amino_acids=True,
throw=throw)
result = { result = {
"sequence": encoded.array, "sequence": encoded.array,
......
...@@ -176,6 +176,7 @@ class Class1ProcessingPredictor(object): ...@@ -176,6 +176,7 @@ class Class1ProcessingPredictor(object):
peptides, peptides,
n_flanks=None, n_flanks=None,
c_flanks=None, c_flanks=None,
throw=True,
batch_size=DEFAULT_PREDICT_BATCH_SIZE): batch_size=DEFAULT_PREDICT_BATCH_SIZE):
""" """
Predict antigen processing. Predict antigen processing.
...@@ -188,6 +189,10 @@ class Class1ProcessingPredictor(object): ...@@ -188,6 +189,10 @@ class Class1ProcessingPredictor(object):
Upstream sequence before each peptide Upstream sequence before each peptide
c_flanks : list of string c_flanks : list of string
Downstream sequence after each peptide Downstream sequence after each peptide
throw : boolean
If True, a ValueError will be raised in the case of unsupported
peptides. If False, a warning will be logged and the predictions
for those peptides will be NaN.
batch_size : int batch_size : int
Prediction keras batch size. Prediction keras batch size.
...@@ -202,6 +207,7 @@ class Class1ProcessingPredictor(object): ...@@ -202,6 +207,7 @@ class Class1ProcessingPredictor(object):
peptides=peptides, peptides=peptides,
n_flanks=n_flanks, n_flanks=n_flanks,
c_flanks=c_flanks, c_flanks=c_flanks,
throw=throw,
batch_size=batch_size).score.values batch_size=batch_size).score.values
def predict_to_dataframe( def predict_to_dataframe(
...@@ -209,6 +215,7 @@ class Class1ProcessingPredictor(object): ...@@ -209,6 +215,7 @@ class Class1ProcessingPredictor(object):
peptides, peptides,
n_flanks=None, n_flanks=None,
c_flanks=None, c_flanks=None,
throw=True,
batch_size=DEFAULT_PREDICT_BATCH_SIZE): batch_size=DEFAULT_PREDICT_BATCH_SIZE):
""" """
Predict antigen processing. Predict antigen processing.
...@@ -231,10 +238,10 @@ class Class1ProcessingPredictor(object): ...@@ -231,10 +238,10 @@ class Class1ProcessingPredictor(object):
sequences = FlankingEncoding( sequences = FlankingEncoding(
peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks) peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
return self.predict_to_dataframe_encoded( return self.predict_to_dataframe_encoded(
sequences=sequences, batch_size=batch_size) sequences=sequences, throw=throw, batch_size=batch_size)
def predict_to_dataframe_encoded( def predict_to_dataframe_encoded(
self, sequences, batch_size=DEFAULT_PREDICT_BATCH_SIZE): self, sequences, throw=True, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
""" """
Predict antigen processing. Predict antigen processing.
...@@ -244,6 +251,7 @@ class Class1ProcessingPredictor(object): ...@@ -244,6 +251,7 @@ class Class1ProcessingPredictor(object):
---------- ----------
sequences : FlankingEncoding sequences : FlankingEncoding
batch_size : int batch_size : int
throw : boolean
Returns Returns
------- -------
...@@ -254,7 +262,7 @@ class Class1ProcessingPredictor(object): ...@@ -254,7 +262,7 @@ class Class1ProcessingPredictor(object):
for (i, network) in enumerate(self.models): for (i, network) in enumerate(self.models):
predictions = network.predict_encoded( predictions = network.predict_encoded(
sequences, batch_size=batch_size) sequences, throw=throw, batch_size=batch_size)
score_array.append(predictions) score_array.append(predictions)
score_array = numpy.array(score_array) score_array = numpy.array(score_array)
......
...@@ -7,6 +7,7 @@ from __future__ import ( ...@@ -7,6 +7,7 @@ from __future__ import (
from six import string_types from six import string_types
from collections import namedtuple from collections import namedtuple
import logging
from .encodable_sequences import EncodingError, EncodableSequences from .encodable_sequences import EncodingError, EncodableSequences
...@@ -63,7 +64,8 @@ class FlankingEncoding(object): ...@@ -63,7 +64,8 @@ class FlankingEncoding(object):
peptide_max_length, peptide_max_length,
n_flank_length, n_flank_length,
c_flank_length, c_flank_length,
allow_unsupported_amino_acids=True): allow_unsupported_amino_acids=True,
throw=True):
""" """
Encode variable-length sequences to a fixed-size matrix. Encode variable-length sequences to a fixed-size matrix.
...@@ -81,6 +83,8 @@ class FlankingEncoding(object): ...@@ -81,6 +83,8 @@ class FlankingEncoding(object):
allow_unsupported_amino_acids : bool allow_unsupported_amino_acids : bool
If True, non-canonical amino acids will be replaced with the X If True, non-canonical amino acids will be replaced with the X
character before encoding. character before encoding.
throw : bool
Whether to raise exception on unsupported peptides
Returns Returns
------- -------
...@@ -97,7 +101,8 @@ class FlankingEncoding(object): ...@@ -97,7 +101,8 @@ class FlankingEncoding(object):
peptide_max_length, peptide_max_length,
n_flank_length, n_flank_length,
c_flank_length, c_flank_length,
allow_unsupported_amino_acids) allow_unsupported_amino_acids,
throw)
if cache_key not in self.encoding_cache: if cache_key not in self.encoding_cache:
result = self.encode( result = self.encode(
vector_encoding_name=vector_encoding_name, vector_encoding_name=vector_encoding_name,
...@@ -105,7 +110,8 @@ class FlankingEncoding(object): ...@@ -105,7 +110,8 @@ class FlankingEncoding(object):
peptide_max_length=peptide_max_length, peptide_max_length=peptide_max_length,
n_flank_length=n_flank_length, n_flank_length=n_flank_length,
c_flank_length=c_flank_length, c_flank_length=c_flank_length,
allow_unsupported_amino_acids=allow_unsupported_amino_acids) allow_unsupported_amino_acids=allow_unsupported_amino_acids,
throw=throw)
self.encoding_cache[cache_key] = result self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key] return self.encoding_cache[cache_key]
...@@ -116,7 +122,8 @@ class FlankingEncoding(object): ...@@ -116,7 +122,8 @@ class FlankingEncoding(object):
peptide_max_length, peptide_max_length,
n_flank_length, n_flank_length,
c_flank_length, c_flank_length,
allow_unsupported_amino_acids=False): allow_unsupported_amino_acids=False,
throw=True):
""" """
Encode variable-length sequences to a fixed-size matrix. Encode variable-length sequences to a fixed-size matrix.
...@@ -130,6 +137,7 @@ class FlankingEncoding(object): ...@@ -130,6 +137,7 @@ class FlankingEncoding(object):
n_flank_length : int n_flank_length : int
c_flank_length : int c_flank_length : int
allow_unsupported_amino_acids : bool allow_unsupported_amino_acids : bool
throw : bool
Returns Returns
------- -------
...@@ -140,13 +148,21 @@ class FlankingEncoding(object): ...@@ -140,13 +148,21 @@ class FlankingEncoding(object):
(df.peptide.str.len() < 1) (df.peptide.str.len() < 1)
] ]
if len(error_df) > 0: if len(error_df) > 0:
raise EncodingError( message = (
"Sequence '%s' (length %d) unsupported. There are %d " "Sequence '%s' (length %d) unsupported. There are %d "
"total peptides with this length." % ( "total peptides with this length." % (
error_df.iloc[0].peptide, error_df.iloc[0].peptide,
len(error_df.iloc[0].peptide), len(error_df.iloc[0].peptide),
len(error_df)), len(error_df)))
supported_peptide_lengths=(1, peptide_max_length + 1)) if throw:
raise EncodingError(
message,
supported_peptide_lengths=(1, peptide_max_length + 1))
logging.warning(message)
# Replace invalid peptides with X's. The encoding will be set to
# NaNs for these peptides farther below.
df.loc[error_df.index, "peptide"] = "X" * peptide_max_length
if n_flank_length > 0: if n_flank_length > 0:
n_flanks = df.n_flank.str.pad( n_flanks = df.n_flank.str.pad(
...@@ -171,6 +187,11 @@ class FlankingEncoding(object): ...@@ -171,6 +187,11 @@ class FlankingEncoding(object):
max_length=n_flank_length + peptide_max_length + c_flank_length, max_length=n_flank_length + peptide_max_length + c_flank_length,
allow_unsupported_amino_acids=allow_unsupported_amino_acids) allow_unsupported_amino_acids=allow_unsupported_amino_acids)
array = array.astype("float32") # So NaNs can be used.
if len(error_df) > 0:
array[error_df.index] = numpy.nan
result = EncodingResult( result = EncodingResult(
array, peptide_lengths=peptides.str.len().values) array, peptide_lengths=peptides.str.len().values)
......
...@@ -43,9 +43,7 @@ from __future__ import ( ...@@ -43,9 +43,7 @@ from __future__ import (
import sys import sys
import argparse import argparse
import itertools
import logging import logging
import os
import pandas import pandas
...@@ -126,10 +124,11 @@ input_args.add_argument( ...@@ -126,10 +124,11 @@ input_args.add_argument(
results_args = parser.add_argument_group(title="Result options") results_args = parser.add_argument_group(title="Result options")
results_args.add_argument( results_args.add_argument(
"--peptide-lengths", "--peptide-lengths",
type=int, default="8-11",
nargs="+", metavar="L",
default=[8, 9, 10, 11], help="Peptide lengths to consider. Pass as START-END (e.g. 8-11) or a "
help="Peptide lengths to consider. Default: %(default)s.") "comma-separated list (8,9,10,11). When using START-END, the range is "
"INCLUSIVE on both ends. Default: %(default)s.")
comparison_quantities = [ comparison_quantities = [
"presentation_score", "presentation_score",
"processing_score", "processing_score",
...@@ -203,6 +202,23 @@ model_args.add_argument( ...@@ -203,6 +202,23 @@ model_args.add_argument(
help="Do not use flanking sequence information in predictions") help="Do not use flanking sequence information in predictions")
def parse_peptide_lengths(value):
    """
    Parse the value of the --peptide-lengths option into a list of ints.

    Parameters
    ----------
    value : string
        Either an inclusive range written as START-END (e.g. "8-11") or a
        comma-separated list of lengths (e.g. "8,9,10,11"). Whitespace
        around the individual numbers is ignored.

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If value can not be parsed, or if a START-END range selects no
        lengths (START greater than END).
    """
    try:
        if "-" in value:
            # Tuple unpacking raises ValueError unless there is exactly one
            # "-" separator, so inputs like "8-11-12" are rejected here.
            (start, end) = value.split("-")
            start = int(start.strip())
            end = int(end.strip())
            # The range is inclusive on both ends.
            peptide_lengths = list(range(start, end + 1))
        else:
            peptide_lengths = [
                int(length.strip())
                for length in value.split(",")
            ]
    except ValueError:
        # Format the message ourselves: passing value as a second argument
        # to ValueError would render the message as a tuple.
        raise ValueError(
            "Couldn't parse peptide lengths: %s" % (value,))
    if not peptide_lengths:
        # A reversed range (e.g. "11-8") would otherwise silently select no
        # peptide lengths at all.
        raise ValueError(
            "Peptide lengths range is empty (START > END): %s" % (value,))
    return peptide_lengths
def run(argv=sys.argv[1:]): def run(argv=sys.argv[1:]):
logging.getLogger('tensorflow').disabled = True logging.getLogger('tensorflow').disabled = True
...@@ -216,6 +232,8 @@ def run(argv=sys.argv[1:]): ...@@ -216,6 +232,8 @@ def run(argv=sys.argv[1:]):
if args.output_delimiter == "\\t": if args.output_delimiter == "\\t":
args.output_delimiter = "\t" args.output_delimiter = "\t"
peptide_lengths = parse_peptide_lengths(args.peptide_lengths)
result_args = { result_args = {
"all": args.results_all, "all": args.results_all,
"best": args.results_best, "best": args.results_best,
...@@ -309,16 +327,21 @@ def run(argv=sys.argv[1:]): ...@@ -309,16 +327,21 @@ def run(argv=sys.argv[1:]):
df = df.set_index(args.sequence_id_column) df = df.set_index(args.sequence_id_column)
genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+") if args.alleles:
genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i) genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+")
genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i)
alleles = genotypes.to_dict()
else:
print("No alleles specified. Will perform processing prediction only.")
alleles = {}
result_df = predictor.predict_sequences( result_df = predictor.predict_sequences(
sequences=df[args.sequence_column].to_dict(), sequences=df[args.sequence_column].to_dict(),
alleles=genotypes.to_dict(), alleles=alleles,
result=result, result=result,
comparison_quantity=result_comparison_quantity, comparison_quantity=result_comparison_quantity,
filter_value=result_filter_value, filter_value=result_filter_value,
peptide_lengths=args.peptide_lengths, peptide_lengths=peptide_lengths,
use_flanks=not args.no_flanking, use_flanks=not args.no_flanking,
include_affinity_percentile=not args.no_affinity_percentile, include_affinity_percentile=not args.no_affinity_percentile,
throw=not args.no_throw) throw=not args.no_throw)
......
__version__ = "1.6.0" __version__ = "1.6.1"
>QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2] prefix >QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2] prefix
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVfrssVLHSTQDLFLPFFSNVTWFHAIHV
SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPF SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPF
LGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI LGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI
>protein1 >protein1
......
...@@ -300,14 +300,19 @@ def test_downloaded_predictor(): ...@@ -300,14 +300,19 @@ def test_downloaded_predictor():
assert len(scan_results4) > 200, len(scan_results4) assert len(scan_results4) > 200, len(scan_results4)
assert_less(scan_results4.iloc[0].affinity, 100) assert_less(scan_results4.iloc[0].affinity, 100)
sequences = {
"seq1":
"MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
"seq2":
"QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
"seq3":
"AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
}
scan_results5 = PRESENTATION_PREDICTOR.predict_sequences( scan_results5 = PRESENTATION_PREDICTOR.predict_sequences(
result="all", result="all",
comparison_quantity="affinity", comparison_quantity="affinity",
sequences={ sequences=sequences,
"seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
"seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
"seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
},
alleles={ alleles={
"sample1": [ "sample1": [
"HLA-A*02:01", "HLA-A*02:01",
...@@ -328,3 +333,92 @@ def test_downloaded_predictor(): ...@@ -328,3 +333,92 @@ def test_downloaded_predictor():
}) })
print(scan_results5) print(scan_results5)
assert_equal(len(scan_results5), len(scan_results4) * 2) assert_equal(len(scan_results5), len(scan_results4) * 2)
# Test case-insensitive.
scan_results6 = PRESENTATION_PREDICTOR.predict_sequences(
result="all",
comparison_quantity="affinity",
sequences=dict((k, v.lower()) for (k, v) in sequences.items()),
alleles={
"sample1": [
"HLA-A*02:01",
"HLA-A*03:01",
"HLA-B*57:01",
"HLA-B*44:02",
"HLA-C*02:01",
"HLA-C*07:01",
],
"sample2": [
"HLA-A*01:01",
"HLA-A*02:06",
"HLA-B*07:02",
"HLA-B*44:02",
"HLA-C*03:01",
"HLA-C*07:02",
],
})
numpy.testing.assert_equal(
scan_results6.peptide.values,
scan_results5.peptide.str.lower().values,
)
numpy.testing.assert_almost_equal(
scan_results6.affinity.values, scan_results5.affinity.values)
numpy.testing.assert_almost_equal(
scan_results6.processing_score.values,
scan_results5.processing_score.values)
numpy.testing.assert_almost_equal(
scan_results6.presentation_score.values,
scan_results5.presentation_score.values)
scan_results7 = PRESENTATION_PREDICTOR.predict_sequences(
result="all",
comparison_quantity="affinity",
sequences={
"seq1": "LVEVEKgVLPQLE",
"seq2": "MRELNGGAYTRYVDNNFCGPdg",
},
alleles={
"sample1": [
"HLA-A*02:01",
"HLA-A*03:01",
"HLA-B*57:01",
"HLA-B*44:02",
"HLA-C*02:01",
"HLA-C*07:01",
]
})
print(scan_results7)
# Check that c-terminus peptide is included and with the same case as input.
assert "DNNFCGPdg" in scan_results7.peptide.values, scan_results7.peptide
def test_downloaded_predictor_invalid_peptides():
    """
    Check how the presentation predictor's predict() treats a peptide list
    whose middle entry is expected to be rejected.

    With default arguments the call must raise ValueError. With throw=False
    it must return presentation scores that are NaN for the middle peptide
    only.
    """
    global PRESENTATION_PREDICTOR

    # The same peptides and alleles are used for both calls.
    query = dict(
        peptides=[
            "SIINFEKL",
            "REALLYLNGPEPTIDESSSSS",
            "SIINFEKLQ",
        ],
        alleles=[
            "HLA-A*02:01",
            "HLA-A*03:01",
            "HLA-B*57:01",
            "HLA-B*44:02",
            "HLA-C*02:01",
            "HLA-C*07:01",
        ],
    )

    # Default behavior: the call is expected to raise.
    numpy.testing.assert_raises(
        ValueError, PRESENTATION_PREDICTOR.predict, **query)

    # throw=False: only the second peptide should get a NaN score.
    result_df = PRESENTATION_PREDICTOR.predict(throw=False, **query)
    scores = result_df.presentation_score.values
    numpy.testing.assert_equal(numpy.isnan(scores), [False, True, False])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment