fixed propagation of allow_unknown_amino_acids through dataset loading

115bfdd5 · Alex Rubinsteyn · cc224e7a · 115bfdd5 · 115bfdd5 · 115bfdd5
Commit 115bfdd5 authored 8 years ago by Alex Rubinsteyn
--- a/mhcflurry/class1_binding_predictor.py
+++ b/mhcflurry/class1_binding_predictor.py
@@ -34,7 +34,7 @@ from .serialization_helpers import (
    load_keras_model_from_disk,
    save_keras_model_to_disk
 )
-
+from .peptide_encoding import check_valid_index_encoding_array
 from .class1_allele_specific_hyperparameters import MAX_IC50

 _allele_predictor_cache = {}
@@ -386,14 +386,7 @@ class Class1BindingPredictor(PredictorBase):
        Given an encoded array of amino acid indices, returns a vector
        of predicted log IC50 values.
        """
-        X = np.asarray(X)
-        if len(X.shape) != 2:
-            raise ValueError("Expected 2d input, got array with shape %s" % (
-                X.shape,))
-        max_expected_index = 20 if self.allow_unknown_amino_acids else 19
-        if X.max() > max_expected_index:
-            raise ValueError(
-                "Got index %d in peptide encoding, max expected %d" % (
-                    X.max(),
-                    max_expected_index))
+        X = check_valid_index_encoding_array(
+            X,
+            allow_unknown_amino_acids=self.allow_unknown_amino_acids)
        return self.model.predict(X, verbose=False).flatten()
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -26,7 +26,8 @@ from .common import normalize_allele_name
 from .amino_acid import common_amino_acids
 from .peptide_encoding import (
    indices_to_hotshot_encoding,
-    fixed_length_from_many_peptides
+    fixed_length_index_encoding,
+    check_valid_index_encoding_array,
 )
 from .class1_allele_specific_hyperparameters import MAX_IC50

@@ -196,10 +197,32 @@ def load_allele_dicts(
 def encode_peptide_to_affinity_dict(
        peptide_to_affinity_dict,
        peptide_length=9,
-        flatten_binary_encoding=True):
+        flatten_binary_encoding=True,
+        allow_unknown_amino_acids=True):
    """
-    Given a dictionary mapping from peptide sequences to affinity values,
-    returns tuple with the following fields:
+    Given a dictionary mapping from peptide sequences to affinity values, return
+    both index and binary encodings of fixed length peptides, and
+    a vector of their affinities.
+
+    Parameters
+    ----------
+    peptide_to_affinity_dict : dict
+        Keys are peptide strings (of multiple lengths), each mapping to a
+        continuous affinity value.
+
+    peptide_length : int
+        Fixed length that every peptide is shortened or extended to
+
+    flatten_binary_encoding : bool
+        If True, flatten each peptide's binary encoding into a 1d vector,
+        otherwise keep it 2d: peptide_length x (20 + allow_unknown_amino_acids)
+
+    allow_unknown_amino_acids : bool
+        When extending a short peptide to the desired length: if True, insert
+        the designated character "X" indicating an unknown amino acid; if
+        False, insert every possible amino acid.
+
+    Returns tuple with the following fields:
        - kmer_peptides: fixed length peptide strings
        - original_peptides: variable length peptide strings
        - counts: how many fixed length peptides were made from this original
@@ -208,24 +231,32 @@ def encode_peptide_to_affinity_dict(
        - Y: affinity values associated with original peptides
    """
    raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
-    kmer_peptides, original_peptides, counts = \
-        fixed_length_from_many_peptides(
+    X_index, kmer_peptides, original_peptides, counts = \
+        fixed_length_index_encoding(
            peptides=raw_peptides,
            desired_length=peptide_length,
            start_offset_shorten=0,
            end_offset_shorten=0,
            start_offset_extend=0,
-            end_offset_extend=0)
+            end_offset_extend=0,
+            allow_unknown_amino_acids=allow_unknown_amino_acids)
+
    n_samples = len(kmer_peptides)
+
    assert n_samples == len(original_peptides), \
        "Mismatch between # of samples (%d) and # of peptides (%d)" % (
            n_samples, len(original_peptides))
    assert n_samples == len(counts), \
        "Mismatch between # of samples (%d) and # of counts (%d)" % (
            n_samples, len(counts))
-
-    X_index = index_encoding(kmer_peptides, peptide_length)
-    X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
+    assert n_samples == len(X_index), \
+        "Mismatch between # of sample (%d) and index feature vectors (%d)" % (
+            n_samples, len(X_index))
+    X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
+    n_indices = 20 + allow_unknown_amino_acids
+    X_binary = indices_to_hotshot_encoding(
+        X_index,
+        n_indices=n_indices)

    assert X_binary.shape[0] == X_index.shape[0], \
        ("Mismatch between number of samples for index encoding (%d)"
@@ -235,7 +266,7 @@ def encode_peptide_to_affinity_dict(

    if flatten_binary_encoding:
        # collapse 3D input into 2D matrix
-        n_binary_features = peptide_length * 20
+        n_binary_features = peptide_length * n_indices
        X_binary = X_binary.reshape((n_samples, n_binary_features))

    # easier to work with counts when they're an array instead of list
@@ -260,7 +291,7 @@ def load_allele_datasets(
        peptide_column_name=None,
        peptide_length_column_name="peptide_length",
        ic50_column_name="meas",
-        only_human=True):
+        only_human=False):
    """
    Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides
    and log-transforms the IC50 measurement. Returns dictionary mapping allele

--- a/mhcflurry/peptide_encoding.py
+++ b/mhcflurry/peptide_encoding.py
@@ -273,12 +273,16 @@ def fixed_length_index_encoding(
    refers to the position *before* the start of a peptide and, similarly,
    `end_offset_extend` = 0 refers to the position *after* the peptide.

-    Returns feature matrix X, a list of original peptides for each feature
-    vector, and a list of integer counts indicating how many rows share a
-    particular original peptide. When two rows are expanded out of a single
-    original peptide, they will both have a count of 2. These counts can
-    be useful for down-weighting the importance of multiple feature vectors
-    which originate from the same sample.
+    Returns tuple with the following fields:
+        - index encoded feature matrix X
+        - list of fixed length peptides
+        - list of "original" peptides of varying lengths
+        - list of integer counts indicating how many rows came from
+          that original peptide.
+
+    When two rows are expanded out of a single original peptide, they will both
+    have a count of 2. These counts can be useful for down-weighting the
+    importance of multiple feature vectors which originate from the same sample.
    """
    if allow_unknown_amino_acids:
        insert_letters = ["X"]
@@ -295,5 +299,18 @@ def fixed_length_index_encoding(
        start_offset_extend=start_offset_extend,
        end_offset_extend=end_offset_extend,
        insert_amino_acid_letters=insert_letters)
-    X = index_encoding(fixed_length, desired_length)
-    return X, original, counts
+    X_index = index_encoding(fixed_length, desired_length)
+    return X_index, fixed_length, original, counts
+
+def check_valid_index_encoding_array(X, allow_unknown_amino_acids=True):
+        X = np.asarray(X)
+        if len(X.shape) != 2:
+            raise ValueError("Expected 2d input, got array with shape %s" % (
+                X.shape,))
+        max_expected_index = 20 if allow_unknown_amino_acids else 19
+        if X.max() > max_expected_index:
+            raise ValueError(
+                "Got index %d in peptide encoding, max expected %d" % (
+                    X.max(),
+                    max_expected_index))
+        return X
--- a/mhcflurry/predictor_base.py
+++ b/mhcflurry/predictor_base.py
@@ -67,7 +67,7 @@ class PredictorBase(object):
        indices = []
        encoded_matrices = []
        for i, peptide in enumerate(peptides):
-            matrix, _, _ = fixed_length_index_encoding(
+            matrix, _, _, _ = fixed_length_index_encoding(
                peptides=[peptide],
                desired_length=9,
                allow_unknown_amino_acids=self.allow_unknown_amino_acids)