Skip to content
Snippets Groups Projects
Commit 5e100c1f authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Sequence scanning in presentation predictor

parent d05e772e
No related merge requests found
......@@ -113,6 +113,7 @@ class Class1PresentationPredictor(object):
df = pandas.DataFrame({
"peptide": numpy.array(peptides, copy=False),
})
df["peptide_num"] = df.index
if experiment_names is None:
peptides = EncodableSequences.create(peptides)
all_alleles = set()
......@@ -141,9 +142,13 @@ class Class1PresentationPredictor(object):
new_df["affinity"] = predictions_df[
experiment_alleles
].min(1).values
new_df["best_allele"] = predictions_df[
experiment_alleles
].idxmin(1).values
if len(df) == 0:
new_df["best_allele"] = []
else:
new_df["best_allele"] = predictions_df[
experiment_alleles
].idxmin(1).values
dfs.append(new_df)
result_df = pandas.concat(dfs, ignore_index=True)
else:
......@@ -177,8 +182,8 @@ class Class1PresentationPredictor(object):
if include_affinity_percentile:
result_df["affinity_percentile"] = (
self.affinity_predictor.percentile_ranks(
df.affinity.values,
alleles=df.best_alleles.values,
result_df.affinity.values,
alleles=result_df.best_allele.values,
throw=False))
return result_df
......@@ -215,6 +220,9 @@ class Class1PresentationPredictor(object):
raise ValueError("No processing predictor with flanks")
predictor = self.processing_predictor_with_flanks
if len(peptides) == 0:
return numpy.array([], dtype=float)
num_chunks = int(numpy.ceil(float(len(peptides)) / PREDICT_CHUNK_SIZE))
peptide_chunks = numpy.array_split(peptides, num_chunks)
n_flank_chunks = numpy.array_split(n_flanks, num_chunks)
......@@ -384,6 +392,13 @@ class Class1PresentationPredictor(object):
Presentation scores for each peptide. Scores range from 0 to 1, with
higher values indicating more favorable presentation likelihood.
"""
if isinstance(alleles, dict):
if experiment_names is None:
raise ValueError(
"experiment_names must be supplied when alleles is a dict. "
"Alternatively, call predict_to_dataframe to predict over "
"all experiments")
return self.predict_to_dataframe(
peptides=peptides,
alleles=alleles,
......@@ -416,15 +431,17 @@ class Class1PresentationPredictor(object):
----------
peptides : list of string
Peptide sequences
alleles : list of string or string -> string dict
alleles : list of string or string -> list of string dict
If you are predicting for a single sample, pass a list of strings
(up to 6) indicating the genotype. If you are predicting across
multiple samples, pass a dict where the keys are (arbitrary)
experiment names and the values are the alleles to predict for that
sample.
experiment_names : list of string [same length as peptides]
If you are passing a dict for 'alleles', use this argument to
specify which peptides go with which experiments.
If you are passing a dict for 'alleles', you can use this argument to
specify which peptides go with which experiments. If it is None,
then predictions will be performed for each peptide across all
experiments.
n_flanks : list of string [same length as peptides]
Upstream sequences before the peptide. Sequences of any length can
be given and a suffix of the size supported by the model will be
......@@ -453,14 +470,12 @@ class Class1PresentationPredictor(object):
if isinstance(alleles, string_types):
raise TypeError("alleles must be a list or dict")
if isinstance(alleles, dict):
if experiment_names is None:
raise ValueError(
"experiment_names must be supplied when alleles is a dict")
else:
if not isinstance(alleles, dict):
# Make alleles into a dict.
if experiment_names is not None:
raise ValueError(
"alleles must be a dict when experiment_names is specified")
alleles = numpy.array(alleles, copy=False)
if len(alleles) > MAX_ALLELES_PER_SAMPLE:
raise ValueError(
......@@ -472,7 +487,6 @@ class Class1PresentationPredictor(object):
"is expected to be the same length as peptides."
% MAX_ALLELES_PER_SAMPLE)
experiment_names = ["experiment1"] * len(peptides)
alleles = {
"experiment1": alleles,
}
......@@ -488,22 +502,27 @@ class Class1PresentationPredictor(object):
df = self.predict_affinity(
peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
experiment_names=experiment_names, # might be None
include_affinity_percentile=include_affinity_percentile,
verbose=verbose,
throw=throw)
df["affinity_score"] = from_ic50(df.affinity)
df["processing_score"] = processing_scores
df["processing_score"] = df.peptide_num.map(
pandas.Series(processing_scores))
if c_flanks is not None:
df.insert(1, "c_flank", c_flanks)
df.insert(1, "c_flank", df.peptide_num.map(pandas.Series(c_flanks)))
if n_flanks is not None:
df.insert(1, "n_flank", n_flanks)
df.insert(1, "n_flank", df.peptide_num.map(pandas.Series(n_flanks)))
model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
model = self.get_model(model_name)
df["presentation_score"] = model.predict_proba(
df[self.model_inputs].values)[:,1]
if len(df) > 0:
df["presentation_score"] = model.predict_proba(
df[self.model_inputs].values)[:,1]
else:
df["presentation_score"] = []
del df["affinity_score"]
return df
......@@ -514,9 +533,9 @@ class Class1PresentationPredictor(object):
result="best",
comparison_quantity="presentation_score",
filter_value=None,
peptide_lengths=[8, 9, 10, 11],
peptide_lengths=(8, 9, 10, 11),
use_flanks=True,
include_affinity_percentile=False,
include_affinity_percentile=True,
verbose=1,
throw=True):
"""
......@@ -527,22 +546,21 @@ class Class1PresentationPredictor(object):
sequences : str, list of string, or string -> string dict
Protein sequences. If a dict is given, the keys are arbitrary (
e.g. protein names), and the values are the amino acid sequences.
alleles : str, list of string, list of list of string, or string -> string dict
alleles : list of string, list of list of string, or dict of string -> list of string
MHC I alleles. Can be: (1) a string (a single allele), (2) a list of
strings (a single genotype), (3) a list of list of strings
(multiple genotypes, where the total number of genotypes must equal
the number of sequences), or (4) a dict (in which case the keys must
match the sequences dict keys).
the number of sequences), or (4) a dict giving multiple genotypes,
which will each be run over the sequences.
result : string
One of:
- "best": return the strongest peptide for each sequence
- "all": return predictions for all peptides
- "filtered": return predictions where comparison_quantity is
stronger (i.e (<) for affinity, (>) for scores) than filter_value.
Specify 'best' to return the strongest peptide for each sequence,
'all' to return predictions for all peptides, or 'filtered' to
return predictions where the comparison_quantity is stronger
(i.e (<) for affinity, (>) for scores) than filter_value.
comparison_quantity : string
One of "presentation_score", "processing_score", or "affinity".
Quantity to use to rank (if result is "best") or filter (if result
is "filtered") results.
One of "presentation_score", "processing_score", "affinity", or
"affinity_percentile". Prediction to use to rank (if result is
"best") or filter (if result is "filtered") results.
filter_value : float
Threshold value to use, only relevant when result is "filtered".
If comparison_quantity is "affinity", then all results less than
......@@ -589,42 +607,62 @@ class Class1PresentationPredictor(object):
("sequence_%04d" % (i + 1), sequence)
for (i, sequence) in enumerate(sequences))
cross_product = True
if isinstance(alleles, string_types):
# Case (1) - alleles is a string
alleles = [alleles]
if not isinstance(alleles, dict):
if all([isinstance(item, string_types) for item in alleles]):
alleles = dict((name, alleles) for name in sequences.keys())
elif len(alleles) != len(sequences):
if isinstance(alleles, dict):
if any([isinstance(v, string_types) for v in alleles.values()]):
raise ValueError(
"alleles must be (1) a string (a single allele), (2) a list of "
"strings (a single genotype), (3) a list of list of strings ("
"(multiple genotypes, where the total number of genotypes "
"must equal the number of sequences), or (4) a dict (in which "
"case the keys must match the sequences dict keys). Here "
"it seemed like option (3) was being used, but the length "
"of alleles (%d) did not match the length of sequences (%d)."
% (len(alleles), len(sequences)))
"The values in the alleles dict must be lists, not strings")
else:
if all(isinstance(a, string_types) for a in alleles):
# Case (2) - a simple list of alleles
alleles = {
'genotype': alleles
}
else:
alleles = dict(zip(sequences.keys(), alleles))
# Case (3) - a list of lists
alleles = collections.OrderedDict(
("genotype_%04d" % (i + 1), genotype)
for (i, genotype) in enumerate(alleles))
cross_product = False
if len(alleles) != len(sequences):
raise ValueError(
"When passing a list of lists for the alleles argument "
"the length of the list (%d) must match the length of "
"the sequences being predicted (%d)" % (
len(alleles), len(sequences)))
missing = [key for key in sequences if key not in alleles]
if missing:
raise ValueError(
"Sequence names missing from alleles dict: ", missing)
if not isinstance(alleles, dict):
raise ValueError("Invalid type for alleles: ", type(alleles))
experiment_names = None if cross_product else []
genotype_names = list(alleles)
position_in_sequence = []
for (i, (name, sequence)) in enumerate(sequences.items()):
genotype_name = None if cross_product else genotype_names[i]
for (name, sequence) in sequences.items():
if not isinstance(sequence, string_types):
raise ValueError("Expected string, not %s (%s)" % (
sequence, type(sequence)))
for peptide_start in range(len(sequence) - min(peptide_lengths)):
for peptide_start in range(len(sequence) - min(peptide_lengths) + 1):
n_flank_start = max(0, peptide_start - n_flank_length)
for peptide_length in peptide_lengths:
peptide = sequence[
peptide_start: peptide_start + peptide_length
]
if len(peptide) != peptide_length:
continue
c_flank_end = (
peptide_start + peptide_length + c_flank_length)
sequence_names.append(name)
peptides.append(
sequence[peptide_start: peptide_start + peptide_length])
position_in_sequence.append(peptide_start)
if not cross_product:
experiment_names.append(genotype_name)
peptides.append(peptide)
if use_flanks:
n_flanks.append(
sequence[n_flank_start : peptide_start])
......@@ -636,13 +674,20 @@ class Class1PresentationPredictor(object):
alleles=alleles,
n_flanks=n_flanks,
c_flanks=c_flanks,
experiment_names=sequence_names,
experiment_names=experiment_names,
include_affinity_percentile=include_affinity_percentile,
verbose=verbose,
throw=throw)
result_df = result_df.rename(
columns={"experiment_name": "sequence_name"})
result_df.insert(
0,
"sequence_name",
result_df.peptide_num.map(pandas.Series(sequence_names)))
result_df.insert(
1,
"position_in_sequence",
result_df.peptide_num.map(pandas.Series(position_in_sequence)))
del result_df["peptide_num"]
comparison_is_score = comparison_quantity.endswith("score")
......@@ -652,7 +697,8 @@ class Class1PresentationPredictor(object):
if result == "best":
result_df = result_df.drop_duplicates(
"sequence_name", keep="first").sort_values("sequence_name")
["sequence_name", "experiment_name"], keep="first"
).sort_values("sequence_name")
elif result == "filtered":
if comparison_is_score:
result_df = result_df.loc[
......
"""
Adapted from pyensembl, github.com/openvax/pyensembl
Original implementation by Alex Rubinsteyn.
The worse sin in bioinformatics is to write your own FASTA parser.
We're doing this to avoid adding another dependency to MHCflurry, however.
"""
from __future__ import print_function, division, absolute_import
from gzip import GzipFile
import logging
from six import binary_type, PY3
import pandas
def read_fasta_to_dataframe(filename):
reader = FastaParser()
rows = reader.iterate_over_file(filename)
return pandas.DataFrame(
rows,
columns=["sequence_id", "sequence"])
class FastaParser(object):
"""
FastaParser object consumes lines of a FASTA file incrementally.
"""
def __init__(self):
self.current_id = None
self.current_lines = []
def iterate_over_file(self, fasta_path):
"""
Generator that yields identifiers paired with sequences.
"""
with self.open_file(fasta_path) as f:
for line in f:
line = line.rstrip()
if len(line) == 0:
continue
# have to slice into a bytes object or else get a single integer
first_char = line[0:1]
if first_char == b">":
previous_entry = self._current_entry()
self.current_id = self._parse_header_id(line)
if len(self.current_id) == 0:
logging.warning(
"Unable to parse ID from header line: %s", line)
self.current_lines = []
if previous_entry is not None:
yield previous_entry
elif first_char == b";":
# semicolon are comment characters
continue
else:
self.current_lines.append(line)
# the last sequence is still in the lines buffer after we're done with
# the file so make sure to yield it
id_and_seq = self._current_entry()
if id_and_seq is not None:
yield id_and_seq
def _current_entry(self):
# when we hit a new entry, if this isn't the first
# entry of the file then put the last one in the dictionary
if self.current_id:
if len(self.current_lines) == 0:
logging.warning("No sequence data for '%s'", self.current_id)
else:
sequence = b"".join(self.current_lines)
if PY3:
# only decoding into an ASCII str for Python 3 since
# the binary sequence type for Python 2 is already 'str'
# and the unicode representation is inefficient
# (using either 16 or 32 bits per character depends on build)
sequence = sequence.decode("ascii")
return self.current_id, sequence
@staticmethod
def open_file(fasta_path):
"""
Open either a text file or compressed gzip file as a stream of bytes.
"""
if fasta_path.endswith("gz") or fasta_path.endswith("gzip"):
return GzipFile(fasta_path, 'rb')
else:
return open(fasta_path, 'rb')
@staticmethod
def _parse_header_id(line):
"""
Pull the transcript or protein identifier from the header line
which starts with '>'
"""
if type(line) is not binary_type:
raise TypeError("Expected header line to be of type %s but got %s" % (
binary_type, type(line)))
if len(line) <= 1:
raise ValueError("No identifier on FASTA line")
# split line at first space to get the unique identifier for
# this sequence
space_index = line.find(b" ")
if space_index >= 0:
identifier = line[1:space_index]
else:
identifier = line[1:]
return identifier.decode("ascii")
\ No newline at end of file
'''
Scan protein sequences using the MHCflurry presentation predictor.
By default, subsequences with affinity percentile ranks less than 2.0 are
returned. You can also specify --results-all to return predictions for all
subsequences, or --results-best to return the top subsequence for each sequence.
Examples:
Scan a set of sequences in a FASTA file for binders to any alleles in a MHC I
genotype:
mhcflurry-predict-scan \
test/data/example.fasta \
--alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02
Instead of a FASTA, you can also pass a CSV that has "sequence_id" and "sequence"
columns.
You can also specify multiple MHC I genotypes to scan:
mhcflurry-predict-scan \
test/data/example.fasta \
--alleles \
HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 \
HLA-A*01:01,HLA-A*02:06,HLA-B*68:01,HLA-B*07:02,HLA-C*01:01,HLA-C*03:01
If `--out` is not specified, results are written to standard out.
You can also run on sequences specified on the commandline:
mhcflurry-predict-scan \
--sequences MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT \
--alleles HLA-A0201 H-2Kb
'''
from __future__ import (
print_function,
division,
absolute_import,
)
import sys
import argparse
import itertools
import logging
import os
import pandas
from .downloads import get_default_class1_presentation_models_dir
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_presentation_predictor import Class1PresentationPredictor
from .fasta import read_fasta_to_dataframe
from .version import __version__
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False)
helper_args = parser.add_argument_group(title="Help")
helper_args.add_argument(
"-h", "--help",
action="help",
help="Show this help message and exit"
)
helper_args.add_argument(
"--list-supported-alleles",
action="store_true",
default=False,
help="Prints the list of supported alleles and exits"
)
helper_args.add_argument(
"--list-supported-peptide-lengths",
action="store_true",
default=False,
help="Prints the list of supported peptide lengths and exits"
)
helper_args.add_argument(
"--version",
action="version",
version="mhcflurry %s" % __version__,
)
input_args = parser.add_argument_group(title="Input options")
input_args.add_argument(
"input",
metavar="INPUT.csv",
nargs="?",
help="Input CSV or FASTA")
input_args.add_argument(
"--input-format",
choices=("guess", "csv", "fasta"),
default="guess",
help="Format of input file. By default, it is guessed from the file "
"extension.")
input_args.add_argument(
"--alleles",
metavar="ALLELE",
nargs="+",
help="Alleles to predict")
input_args.add_argument(
"--sequences",
metavar="SEQ",
nargs="+",
help="Sequences to predict (exclusive with passing an input file)")
input_args.add_argument(
"--sequence-id-column",
metavar="NAME",
default="sequence_id",
help="Input CSV column name for sequence IDs. Default: '%(default)s'")
input_args.add_argument(
"--sequence-column",
metavar="NAME",
default="sequence",
help="Input CSV column name for sequences. Default: '%(default)s'")
input_args.add_argument(
"--no-throw",
action="store_true",
default=False,
help="Return NaNs for unsupported alleles or peptides instead of raising")
results_args = parser.add_argument_group(title="Result options")
results_args.add_argument(
"--peptide-lengths",
type=int,
nargs="+",
default=[8, 9, 10, 11],
help="Peptide lengths to consider. Default: %(default)s.")
comparison_quantities = [
"presentation_score",
"processing_score",
"affinity",
"affinity_percentile",
]
results_args.add_argument(
"--results-all",
action="store_true",
default=False,
help="")
results_args.add_argument(
"--results-best",
choices=comparison_quantities,
help="Take the top result for each sequence according to the specified "
"predicted quantity")
results_args.add_argument(
"--results-filtered",
choices=comparison_quantities,
help="Filter results by the specified quantity.")
results_args.add_argument(
"--threshold-presentation-score",
type=float,
default=0.7,
help="Threshold if filtering by presentation score. Default: %(default)s")
results_args.add_argument(
"--threshold-processing-score",
type=float,
default=0.5,
help="Threshold if filtering by processing score. Default: %(default)s")
results_args.add_argument(
"--threshold-affinity",
type=float,
default=500,
help="Threshold if filtering by affinity. Default: %(default)s")
results_args.add_argument(
"--threshold-affinity-percentile",
type=float,
default=2.0,
help="Threshold if filtering by affinity percentile. Default: %(default)s")
output_args = parser.add_argument_group(title="Output options")
output_args.add_argument(
"--out",
metavar="OUTPUT.csv",
help="Output CSV")
output_args.add_argument(
"--output-delimiter",
metavar="CHAR",
default=",",
help="Delimiter character for results. Default: '%(default)s'")
output_args.add_argument(
"--no-affinity-percentile",
default=False,
action="store_true",
help="Do not include affinity percentile rank")
model_args = parser.add_argument_group(title="Model options")
model_args.add_argument(
"--models",
metavar="DIR",
default=None,
help="Directory containing presentation models."
"Default: %s" % get_default_class1_presentation_models_dir(
test_exists=False))
model_args.add_argument(
"--no-flanking",
action="store_true",
default=False,
help="Do not use flanking sequence information in predictions")
def run(argv=sys.argv[1:]):
logging.getLogger('tensorflow').disabled = True
if not argv:
parser.print_help()
parser.exit(1)
args = parser.parse_args(argv)
# It's hard to pass a tab in a shell, so we correct a common error:
if args.output_delimiter == "\\t":
args.output_delimiter = "\t"
result_args = {
"all": args.results_all,
"best": args.results_best,
"filtered": args.results_filtered,
}
if all([not bool(arg) for arg in result_args.values()]):
result_args["filtered"] = "affinity_percentile"
if sum([bool(arg) for arg in result_args.values()]) > 1:
parser.error(
"Specify at most one of --results-all, --results-best, "
"--results-filtered")
(result,) = [key for (key, value) in result_args.items() if value]
result_comparison_quantity = result_args[result]
result_filter_value = None if result != "filtered" else {
"presentation_score": args.threshold_presentation_score,
"processing_score": args.threshold_processing_score,
"affinity": args.threshold_affinity,
"affinity_percentile": args.threshold_affinity_percentile,
}[result_comparison_quantity]
models_dir = args.models
if models_dir is None:
# The reason we set the default here instead of in the argument parser
# is that we want to test_exists at this point, so the user gets a
# message instructing them to download the models if needed.
models_dir = get_default_class1_presentation_models_dir(test_exists=True)
predictor = Class1PresentationPredictor.load(models_dir)
if args.list_supported_alleles:
print("\n".join(predictor.supported_alleles))
return
if args.list_supported_peptide_lengths:
min_len, max_len = predictor.supported_peptide_lengths
print("\n".join([str(l) for l in range(min_len, max_len+1)]))
return
if args.input:
if args.sequences:
parser.error(
"If an input file is specified, do not specify --sequences")
input_format = args.input_format
if input_format == "guess":
extension = args.input.lower().split(".")[-1]
if extension in ["gz", "bzip2"]:
extension = args.input.lower().split(".")[-2]
if extension == "csv":
input_format = "csv"
elif extension in ["fasta", "fa"]:
input_format = "fasta"
else:
parser.error(
"Couldn't guess input format from file extension: %s\n"
"Pass the --input-format argument to specify if it is a "
"CSV or fasta file" % args.input)
print("Guessed input file format:", input_format)
if input_format == "csv":
df = pandas.read_csv(args.input)
print("Read input CSV with %d rows, columns are: %s" % (
len(df), ", ".join(df.columns)))
for col in [args.sequence_column,]:
if col not in df.columns:
raise ValueError(
"No such column '%s' in CSV. Columns are: %s" % (
col, ", ".join(["'%s'" % c for c in df.columns])))
elif input_format == "fasta":
df = read_fasta_to_dataframe(args.input)
print("Read input fasta with %d sequences" % len(df))
print(df)
else:
raise ValueError("Unsupported input format", input_format)
else:
if not args.sequences:
parser.error(
"Specify either an input file or the --sequences argument")
df = pandas.DataFrame({
args.sequence_column: args.sequences,
})
if args.sequence_id_column not in df:
df[args.sequence_id_column] = "sequence_" + df.index.astype(str)
df = df.set_index(args.sequence_id_column)
genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+")
genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i)
result_df = predictor.predict_sequences(
sequences=df[args.sequence_column].to_dict(),
alleles=genotypes.to_dict(),
result=result,
comparison_quantity=result_comparison_quantity,
filter_value=result_filter_value,
peptide_lengths=args.peptide_lengths,
use_flanks=not args.no_flanking,
include_affinity_percentile=not args.no_affinity_percentile,
throw=not args.no_throw)
if args.out:
result_df.to_csv(args.out, index=False, sep=args.output_delimiter)
print("Wrote: %s" % args.out)
else:
result_df.to_csv(sys.stdout, index=False, sep=args.output_delimiter)
......@@ -73,6 +73,7 @@ if __name__ == '__main__':
'console_scripts': [
'mhcflurry-downloads = mhcflurry.downloads_command:run',
'mhcflurry-predict = mhcflurry.predict_command:run',
'mhcflurry-predict-scan = mhcflurry.predict_scan_command:run',
'mhcflurry-class1-train-allele-specific-models = '
'mhcflurry.train_allele_specific_models_command:run',
'mhcflurry-class1-train-pan-allele-models = '
......
>QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2]
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV
SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPF
LGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI
NLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYN
ENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASV
YAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIAD
YNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYF
PLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFL
PFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLT
PTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLG
AENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGI
AVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDC
LGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIG
VTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDI
LSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLM
SFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNT
FVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVA
KNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDD
SEPVLKGVKLHYT
>protein1
MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQV
EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHH
>protein2
VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGA
ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC
......@@ -132,6 +132,87 @@ def test_basic():
numpy.testing.assert_array_almost_equal(
test_df["prediction2"], other_test_df["prediction2"], decimal=6)
def test_downloaded_predictor_small():
global PRESENTATION_PREDICTOR
# Test sequence scanning
scan_results = PRESENTATION_PREDICTOR.predict_sequences(
sequences=[
"MESLVPGFN",
"QPYVFIKRS",
"AGGHSYGAD",
],
alleles={
"HLA-A*02:01": ["HLA-A*02:01"],
"HLA-C*02:01": ["HLA-C*02:01"],
},
peptide_lengths=[9],
result="best")
print(scan_results)
assert_equal(len(scan_results), 6)
scan_results = PRESENTATION_PREDICTOR.predict_sequences(
sequences=[
"MESLVPGFN",
"QPYVFIKRS",
"AGGHSYGAD",
],
alleles={
"HLA-A*02:01": ["HLA-A*02:01"],
"HLA-C*02:01": ["HLA-C*02:01"],
},
peptide_lengths=[8, 9],
result="best")
print(scan_results)
assert_equal(len(scan_results), 6)
scan_results = PRESENTATION_PREDICTOR.predict_sequences(
sequences=[
"MESLVPGFN",
"QPYVFIKRS",
"AGGHSYGAD",
],
alleles={
"HLA-A*02:01": ["HLA-A*02:01"],
"HLA-C*02:01": ["HLA-C*02:01"],
},
peptide_lengths=[9],
result="all")
print(scan_results)
assert_equal(len(scan_results), 6)
scan_results = PRESENTATION_PREDICTOR.predict_sequences(
sequences=[
"MESLVPGFN",
"QPYVFIKRS",
"AGGHSYGAD",
],
alleles={
"HLA-A*02:01": ["HLA-A*02:01"],
"HLA-C*02:01": ["HLA-C*02:01"],
},
peptide_lengths=[8, 9],
result="all")
print(scan_results)
assert_equal(len(scan_results), 18)
scan_results = PRESENTATION_PREDICTOR.predict_sequences(
sequences=[
"MESLVPGFN",
"QPYVFIKRS",
"AGGHSYGAD",
],
alleles={
"HLA-A*02:01": ["HLA-A*02:01"],
"HLA-C*02:01": ["HLA-C*02:01"],
},
peptide_lengths=[10],
result="all")
print(scan_results)
assert_equal(len(scan_results), 0)
def test_downloaded_predictor():
global PRESENTATION_PREDICTOR
......@@ -220,3 +301,32 @@ def test_downloaded_predictor():
assert len(scan_results4) > 200, len(scan_results4)
assert_less(scan_results4.iloc[0].affinity, 100)
scan_results5 = PRESENTATION_PREDICTOR.predict_sequences(
result="all",
comparison_quantity="affinity",
sequences={
"seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
"seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
"seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
},
alleles={
"sample1": [
"HLA-A*02:01",
"HLA-A*03:01",
"HLA-B*57:01",
"HLA-B*44:02",
"HLA-C*02:01",
"HLA-C*07:01",
],
"sample2": [
"HLA-A*01:01",
"HLA-A*02:06",
"HLA-B*07:02",
"HLA-B*44:02",
"HLA-C*03:01",
"HLA-C*07:02",
],
})
print(scan_results5)
assert_equal(len(scan_results5), len(scan_results4) * 2)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment