Merge pull request #163 from openvax/1.6.1

1.6.1

Merge pull request #163 from openvax/1.6.1
1.6.1
804ff270 · Tim O'Donnell · GitHub · 9d9861af · c81bcbcd · 804ff270
Unverified Commit 804ff270 authored 4 years ago by Tim O'Donnell Committed by GitHub 4 years ago
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ MHC ligands.
 If you find MHCflurry useful in your research please cite:
-> T. O'Donnell, A. Rubinsteyn, U. Laserson. "Improved predictive models of peptide presentation on MHC I". *biorxiv*, 2020. https://www.biorxiv.org/content/10.1101/2020.03.28.013714v1
+> T. O'Donnell, A. Rubinsteyn, U. Laserson. "A model of antigen processing improves prediction of MHC I-presented peptides". *biorxiv*, 2020. https://www.biorxiv.org/content/10.1101/2020.03.28.013714v2
 > T. O’Donnell, A. Rubinsteyn, M. Bonsack, A. B. Riemer, U. Laserson, and J. Hammerbacher, "MHCflurry: Open-Source Class I MHC Binding Affinity Prediction," *Cell Systems*, 2018. https://www.cell.com/cell-systems/fulltext/S2405-4712(18)30232-1.

--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -43,6 +43,9 @@ COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown"
 AMINO_ACID_INDEX = dict(
    (letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN))
+for (letter, i) in list(AMINO_ACID_INDEX.items()):
+    AMINO_ACID_INDEX[letter.lower()] = i  # Support lower-case as well.
 AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
 BLOSUM62_MATRIX = pandas.read_csv(StringIO("""

--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -453,7 +453,19 @@ class Class1AffinityPredictor(object):
        `Class1AffinityPredictor` instance
        """
        if models_dir is None:
-            models_dir = get_default_class1_models_dir()
+            try:
+                models_dir = get_default_class1_models_dir()
+            except RuntimeError as e:
+                # Fall back to the affinity predictor included in presentation
+                # predictor if possible.
+                from mhcflurry.class1_presentation_predictor import (
+                    Class1PresentationPredictor)
+                try:
+                    presentation_predictor = Class1PresentationPredictor.load()
+                    return presentation_predictor.affinity_predictor
+                except RuntimeError:
+                    raise e
        if optimization_level is None:
            optimization_level = OPTIMIZATION_LEVEL

--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -234,7 +234,7 @@ class Class1PresentationPredictor(object):
        return result_df
    def predict_processing(
-            self, peptides, n_flanks=None, c_flanks=None, verbose=1):
+            self, peptides, n_flanks=None, c_flanks=None, throw=True, verbose=1):
        """
        Predict antigen processing scores for individual peptides, optionally
        including flanking sequences for better cleavage prediction.
@@ -244,6 +244,8 @@ class Class1PresentationPredictor(object):
        peptides : list of string
        n_flanks : list of string [same length as peptides]
        c_flanks : list of string [same length as peptides]
+        throw : boolean
+            Whether to raise exception on unsupported peptides
        verbose  : int
        Returns
@@ -285,6 +287,7 @@ class Class1PresentationPredictor(object):
                peptides=peptide_chunk,
                n_flanks=n_flank_chunk,
                c_flanks=c_flank_chunk,
+                throw=throw,
                batch_size=PREDICT_BATCH_SIZE)
            result_chunks.append(result_chunk)
        return numpy.concatenate(result_chunks)
@@ -441,7 +444,8 @@ class Class1PresentationPredictor(object):
            (up to 6) indicating the genotype. If you are predicting across
            multiple samples, pass a dict where the keys are (arbitrary)
            sample names and the values are the alleles to predict for that
-            sample.
+            sample. Set to an empty list or dict to perform processing
+            prediction only.
        sample_names : list of string [same length as peptides]
            If you are passing a dict for 'alleles', you can use this
            argument to specify which peptides go with which samples. If it is
@@ -503,17 +507,26 @@ class Class1PresentationPredictor(object):
            peptides=peptides,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
+            throw=throw,
            verbose=verbose)
-        df = self.predict_affinity(
+        if alleles:
-            peptides=peptides,
+            df = self.predict_affinity(
-            alleles=alleles,
+                peptides=peptides,
-            sample_names=sample_names,  # might be None
+                alleles=alleles,
-            include_affinity_percentile=include_affinity_percentile,
+                sample_names=sample_names,  # might be None
-            verbose=verbose,
+                include_affinity_percentile=include_affinity_percentile,
-            throw=throw)
+                verbose=verbose,
+                throw=throw)
+            df["affinity_score"] = from_ic50(df.affinity)
+        else:
+            # Processing prediction only.
+            df = pandas.DataFrame({
+                "peptide_num": numpy.arange(len(peptides)),
+                "peptide": peptides,
+            })
-        df["affinity_score"] = from_ic50(df.affinity)
        df["processing_score"] = df.peptide_num.map(
            pandas.Series(processing_scores))
        if c_flanks is not None:
@@ -523,12 +536,21 @@ class Class1PresentationPredictor(object):
        model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
        model = self.get_model(model_name)
-        if len(df) > 0:
+        if "affinity_score" in df.columns:
-            df["presentation_score"] = model.predict_proba(
+            if len(df) > 0:
-                df[self.model_inputs].values)[:,1]
+                input_matrix = df[self.model_inputs]
-        else:
+                null_mask = None
-            df["presentation_score"] = []
+                if not throw:
-        del df["affinity_score"]
+                    # Invalid peptides will be null.
+                    null_mask = input_matrix.isnull().any(1)
+                    input_matrix = input_matrix.fillna(0.0)
+                df["presentation_score"] = model.predict_proba(
+                    input_matrix.values)[:,1]
+                if null_mask is not None:
+                    df.loc[null_mask, "presentation_score"] = numpy.nan
+            else:
+                df["presentation_score"] = []
+            del df["affinity_score"]
        return df
    def predict_sequences(
@@ -536,7 +558,7 @@ class Class1PresentationPredictor(object):
            sequences,
            alleles,
            result="best",
-            comparison_quantity="presentation_score",
+            comparison_quantity=None,
            filter_value=None,
            peptide_lengths=(8, 9, 10, 11),
            use_flanks=True,
@@ -593,7 +615,8 @@ class Class1PresentationPredictor(object):
        comparison_quantity : string
            One of "presentation_score", "processing_score", "affinity", or
            "affinity_percentile". Prediction to use to rank (if result is
-            "best") or filter (if result is "filtered") results.
+            "best") or filter (if result is "filtered") results. Default is
+            "presentation_score".
        filter_value : float
            Threshold value to use, only relevant when result is "filtered".
            If comparison_quantity is "affinity", then all results less than
@@ -618,8 +641,13 @@ class Class1PresentationPredictor(object):
            peptide, n_flank, c_flank, sequence_name, affinity, best_allele,
            processing_score, presentation_score
        """
+        if len(alleles) == 0:
+            alleles = {}
        if comparison_quantity is None:
-            comparison_quantity = "presentation_score"
+            comparison_quantity = (
+                "presentation_score"
+                if len(alleles) > 0 else "processing_score")
        processing_predictor = self.processing_predictor_with_flanks
        if not use_flanks or processing_predictor is None:

--- a/mhcflurry/class1_processing_neural_network.py
+++ b/mhcflurry/class1_processing_neural_network.py
@@ -294,6 +294,7 @@ class Class1ProcessingNeuralNetwork(object):
    def predict_encoded(
            self,
            sequences,
+            throw=True,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        """
        Predict antigen processing.
@@ -302,6 +303,8 @@ class Class1ProcessingNeuralNetwork(object):
        ----------
        sequences : FlankingEncoding
            Peptides and flanking sequences
+        throw : boolean
+            Whether to throw exception on unsupported peptides
        batch_size : int
            Prediction keras batch size.
@@ -309,13 +312,13 @@ class Class1ProcessingNeuralNetwork(object):
        -------
        numpy.array
        """
-        x_dict = self.network_input(sequences)
+        x_dict = self.network_input(sequences, throw=throw)
        raw_predictions = self.network().predict(
            x_dict, batch_size=batch_size)
        predictions = numpy.squeeze(raw_predictions).astype("float64")
        return predictions
-    def network_input(self, sequences):
+    def network_input(self, sequences, throw=True):
        """
        Encode peptides to the fixed-length encoding expected by the neural
        network (which depends on the architecture).
@@ -324,6 +327,8 @@ class Class1ProcessingNeuralNetwork(object):
        ----------
        sequences : FlankingEncoding
            Peptides and flanking sequences
+        throw : boolean
+            Whether to throw exception on unsupported peptides
        Returns
        -------
@@ -334,7 +339,8 @@ class Class1ProcessingNeuralNetwork(object):
            self.hyperparameters['peptide_max_length'],
            n_flank_length=self.hyperparameters['n_flank_length'],
            c_flank_length=self.hyperparameters['c_flank_length'],
-            allow_unsupported_amino_acids=True)
+            allow_unsupported_amino_acids=True,
+            throw=throw)
        result = {
            "sequence": encoded.array,

--- a/mhcflurry/class1_processing_predictor.py
+++ b/mhcflurry/class1_processing_predictor.py
@@ -176,6 +176,7 @@ class Class1ProcessingPredictor(object):
            peptides,
            n_flanks=None,
            c_flanks=None,
+            throw=True,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        """
        Predict antigen processing.
@@ -188,6 +189,10 @@ class Class1ProcessingPredictor(object):
            Upstream sequence before each peptide
        c_flanks : list of string
            Downstream sequence after each peptide
+        throw : boolean
+            If True, a ValueError will be raised in the case of unsupported
+            peptides. If False, a warning will be logged and the predictions
+            for those peptides will be NaN.
        batch_size : int
            Prediction keras batch size.
@@ -202,6 +207,7 @@ class Class1ProcessingPredictor(object):
            peptides=peptides,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
+            throw=throw,
            batch_size=batch_size).score.values
    def predict_to_dataframe(
@@ -209,6 +215,7 @@ class Class1ProcessingPredictor(object):
            peptides,
            n_flanks=None,
            c_flanks=None,
+            throw=True,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        """
        Predict antigen processing.
@@ -231,10 +238,10 @@ class Class1ProcessingPredictor(object):
        sequences = FlankingEncoding(
            peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
        return self.predict_to_dataframe_encoded(
-            sequences=sequences, batch_size=batch_size)
+            sequences=sequences, throw=throw, batch_size=batch_size)
    def predict_to_dataframe_encoded(
-            self, sequences, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
+            self, sequences, throw=True, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        """
        Predict antigen processing.
@@ -244,6 +251,7 @@ class Class1ProcessingPredictor(object):
        ----------
        sequences : FlankingEncoding
        batch_size : int
+        throw : boolean
        Returns
        -------
@@ -254,7 +262,7 @@ class Class1ProcessingPredictor(object):
        for (i, network) in enumerate(self.models):
            predictions = network.predict_encoded(
-                sequences, batch_size=batch_size)
+                sequences, throw=throw, batch_size=batch_size)
            score_array.append(predictions)
        score_array = numpy.array(score_array)

--- a/mhcflurry/flanking_encoding.py
+++ b/mhcflurry/flanking_encoding.py
@@ -7,6 +7,7 @@ from __future__ import (
 from six import string_types
 from collections import namedtuple
+import logging
 from .encodable_sequences import EncodingError, EncodableSequences
@@ -63,7 +64,8 @@ class FlankingEncoding(object):
            peptide_max_length,
            n_flank_length,
            c_flank_length,
-            allow_unsupported_amino_acids=True):
+            allow_unsupported_amino_acids=True,
+            throw=True):
        """
        Encode variable-length sequences to a fixed-size matrix.
@@ -81,6 +83,8 @@ class FlankingEncoding(object):
        allow_unsupported_amino_acids : bool
            If True, non-canonical amino acids will be replaced with the X
            character before encoding.
+        throw : bool
+            Whether to raise exception on unsupported peptides
        Returns
        -------
@@ -97,7 +101,8 @@ class FlankingEncoding(object):
            peptide_max_length,
            n_flank_length,
            c_flank_length,
-            allow_unsupported_amino_acids)
+            allow_unsupported_amino_acids,
+            throw)
        if cache_key not in self.encoding_cache:
            result = self.encode(
                vector_encoding_name=vector_encoding_name,
@@ -105,7 +110,8 @@ class FlankingEncoding(object):
                peptide_max_length=peptide_max_length,
                n_flank_length=n_flank_length,
                c_flank_length=c_flank_length,
-                allow_unsupported_amino_acids=allow_unsupported_amino_acids)
+                allow_unsupported_amino_acids=allow_unsupported_amino_acids,
+                throw=throw)
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]
@@ -116,7 +122,8 @@ class FlankingEncoding(object):
            peptide_max_length,
            n_flank_length,
            c_flank_length,
-            allow_unsupported_amino_acids=False):
+            allow_unsupported_amino_acids=False,
+            throw=True):
        """
        Encode variable-length sequences to a fixed-size matrix.
@@ -130,6 +137,7 @@ class FlankingEncoding(object):
        n_flank_length : int
        c_flank_length : int
        allow_unsupported_amino_acids : bool
+        throw : bool
        Returns
        -------
@@ -140,13 +148,21 @@ class FlankingEncoding(object):
            (df.peptide.str.len() < 1)
        ]
        if len(error_df) > 0:
-            raise EncodingError(
+            message = (
                "Sequence '%s' (length %d) unsupported. There are %d "
                "total peptides with this length." % (
                    error_df.iloc[0].peptide,
                    len(error_df.iloc[0].peptide),
-                    len(error_df)),
+                    len(error_df)))
-                supported_peptide_lengths=(1, peptide_max_length + 1))
+            if throw:
+                raise EncodingError(
+                    message,
+                    supported_peptide_lengths=(1, peptide_max_length + 1))
+            logging.warning(message)
+            # Replace invalid peptides with X's. The encoding will be set to
+            # NaNs for these peptides farther below.
+            df.loc[error_df.index, "peptide"] = "X" * peptide_max_length
        if n_flank_length > 0:
            n_flanks = df.n_flank.str.pad(
@@ -171,6 +187,11 @@ class FlankingEncoding(object):
            max_length=n_flank_length + peptide_max_length + c_flank_length,
            allow_unsupported_amino_acids=allow_unsupported_amino_acids)
+        array = array.astype("float32")  # So NaNs can be used.
+        if len(error_df) > 0:
+            array[error_df.index] = numpy.nan
        result = EncodingResult(
            array, peptide_lengths=peptides.str.len().values)

--- a/mhcflurry/predict_scan_command.py
+++ b/mhcflurry/predict_scan_command.py
@@ -43,9 +43,7 @@ from __future__ import (
 import sys
 import argparse
-import itertools
 import logging
-import os
 import pandas
@@ -126,10 +124,11 @@ input_args.add_argument(
 results_args = parser.add_argument_group(title="Result options")
 results_args.add_argument(
    "--peptide-lengths",
-    type=int,
+    default="8-11",
-    nargs="+",
+    metavar="L",
-    default=[8, 9, 10, 11],
+    help="Peptide lengths to consider. Pass as START-END (e.g. 8-11) or a "
-    help="Peptide lengths to consider. Default: %(default)s.")
+    "comma-separated list (8,9,10,11). When using START-END, the range is "
+    "INCLUSIVE on both ends. Default: %(default)s.")
 comparison_quantities = [
    "presentation_score",
    "processing_score",
@@ -203,6 +202,23 @@ model_args.add_argument(
    help="Do not use flanking sequence information in predictions")
+def parse_peptide_lengths(value):
+    """
+    Parse the value of the --peptide-lengths option into a list of ints.
+
+    Parameters
+    ----------
+    value : string
+        Either START-END (e.g. "8-11"; the range is inclusive on both ends)
+        or a comma-separated list (e.g. "8,9,10,11"). Whitespace around the
+        individual numbers is ignored.
+
+    Returns
+    -------
+    list of int
+
+    Raises
+    ------
+    ValueError
+        If the value matches neither format.
+    """
+    try:
+        if "-" in value:
+            # Any extra "-" (e.g. "8-11-12") makes the unpacking fail with a
+            # ValueError, which is reported below like other bad input.
+            (start, end) = value.split("-")
+            peptide_lengths = list(
+                range(int(start.strip()), int(end.strip()) + 1))
+        else:
+            peptide_lengths = [
+                int(length.strip())
+                for length in value.split(",")
+            ]
+    except ValueError:
+        # Build a single message string; passing the value as a second
+        # positional argument made the error print as a tuple.
+        raise ValueError(
+            "Couldn't parse peptide lengths: %r" % (value,))
+    return peptide_lengths
 def run(argv=sys.argv[1:]):
    logging.getLogger('tensorflow').disabled = True
@@ -216,6 +232,8 @@ def run(argv=sys.argv[1:]):
    if args.output_delimiter == "\\t":
        args.output_delimiter = "\t"
+    peptide_lengths = parse_peptide_lengths(args.peptide_lengths)
    result_args = {
        "all": args.results_all,
        "best": args.results_best,
@@ -309,16 +327,21 @@ def run(argv=sys.argv[1:]):
    df = df.set_index(args.sequence_id_column)
-    genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+")
+    if args.alleles:
-    genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i)
+        genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+")
+        genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i)
+        alleles = genotypes.to_dict()
+    else:
+        print("No alleles specified. Will perform processing prediction only.")
+        alleles = {}
    result_df = predictor.predict_sequences(
        sequences=df[args.sequence_column].to_dict(),
-        alleles=genotypes.to_dict(),
+        alleles=alleles,
        result=result,
        comparison_quantity=result_comparison_quantity,
        filter_value=result_filter_value,
-        peptide_lengths=args.peptide_lengths,
+        peptide_lengths=peptide_lengths,
        use_flanks=not args.no_flanking,
        include_affinity_percentile=not args.no_affinity_percentile,
        throw=not args.no_throw)

--- a/mhcflurry/version.py
+++ b/mhcflurry/version.py
-__version__ = "1.6.0"
+__version__ = "1.6.1"
--- a/test/data/example.fasta
+++ b/test/data/example.fasta
 >QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2] prefix
-MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV
+MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVfrssVLHSTQDLFLPFFSNVTWFHAIHV
 SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPF
 LGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI
 >protein1

--- a/test/test_class1_presentation_predictor.py
+++ b/test/test_class1_presentation_predictor.py
@@ -300,14 +300,19 @@ def test_downloaded_predictor():
    assert len(scan_results4) > 200, len(scan_results4)
    assert_less(scan_results4.iloc[0].affinity, 100)
+    sequences = {
+        "seq1":
+            "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
+        "seq2":
+            "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
+        "seq3":
+            "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
+    }
    scan_results5 = PRESENTATION_PREDICTOR.predict_sequences(
        result="all",
        comparison_quantity="affinity",
-        sequences={
+        sequences=sequences,
-            "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
-            "seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
-            "seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
-        },
        alleles={
            "sample1": [
                "HLA-A*02:01",
@@ -328,3 +333,92 @@ def test_downloaded_predictor():
        })
    print(scan_results5)
    assert_equal(len(scan_results5), len(scan_results4) * 2)
+    # Test case-insensitive.
+    scan_results6 = PRESENTATION_PREDICTOR.predict_sequences(
+        result="all",
+        comparison_quantity="affinity",
+        sequences=dict((k, v.lower()) for (k, v) in sequences.items()),
+        alleles={
+            "sample1": [
+                "HLA-A*02:01",
+                "HLA-A*03:01",
+                "HLA-B*57:01",
+                "HLA-B*44:02",
+                "HLA-C*02:01",
+                "HLA-C*07:01",
+            ],
+            "sample2": [
+                "HLA-A*01:01",
+                "HLA-A*02:06",
+                "HLA-B*07:02",
+                "HLA-B*44:02",
+                "HLA-C*03:01",
+                "HLA-C*07:02",
+            ],
+        })
+    numpy.testing.assert_equal(
+        scan_results6.peptide.values,
+        scan_results5.peptide.str.lower().values,
+    )
+    numpy.testing.assert_almost_equal(
+        scan_results6.affinity.values, scan_results5.affinity.values)
+    numpy.testing.assert_almost_equal(
+        scan_results6.processing_score.values,
+        scan_results5.processing_score.values)
+    numpy.testing.assert_almost_equal(
+        scan_results6.presentation_score.values,
+        scan_results5.presentation_score.values)
+    scan_results7 = PRESENTATION_PREDICTOR.predict_sequences(
+        result="all",
+        comparison_quantity="affinity",
+        sequences={
+            "seq1": "LVEVEKgVLPQLE",
+            "seq2": "MRELNGGAYTRYVDNNFCGPdg",
+        },
+        alleles={
+            "sample1": [
+                "HLA-A*02:01",
+                "HLA-A*03:01",
+                "HLA-B*57:01",
+                "HLA-B*44:02",
+                "HLA-C*02:01",
+                "HLA-C*07:01",
+            ]
+        })
+    print(scan_results7)
+    # Check that c-terminus peptide is included and with the same case as input.
+    assert "DNNFCGPdg" in scan_results7.peptide.values, scan_results7.peptide
+def test_downloaded_predictor_invalid_peptides():
+    """
+    Check how `predict` treats a peptide list containing an unsupported
+    peptide: it raises ValueError by default, and with throw=False it returns
+    a NaN presentation score for that peptide only.
+    """
+    global PRESENTATION_PREDICTOR
+    # The middle peptide (21 residues) is the one expected to be rejected,
+    # presumably because of its length -- TODO confirm against the lengths
+    # supported by the downloaded models.
+    peptides = [
+        "SIINFEKL",
+        "REALLYLNGPEPTIDESSSSS",
+        "SIINFEKLQ",
+    ]
+    alleles = [
+        "HLA-A*02:01",
+        "HLA-A*03:01",
+        "HLA-B*57:01",
+        "HLA-B*44:02",
+        "HLA-C*02:01",
+        "HLA-C*07:01",
+    ]
+    # Without throw=False, the call is expected to raise.
+    numpy.testing.assert_raises(
+        ValueError,
+        PRESENTATION_PREDICTOR.predict,
+        peptides=peptides,
+        alleles=alleles)
+    # With throw=False, only the middle peptide should get a NaN score; the
+    # comparison is positional, so results must be in input order.
+    results1 = PRESENTATION_PREDICTOR.predict(
+        peptides=peptides,
+        alleles=alleles,
+        throw=False).presentation_score.values
+    numpy.testing.assert_equal(numpy.isnan(results1), [False, True, False])