Merge pull request #19 from hammerlab/move-imputation-from-experiments

moving helpers for performing dataset imputations from experiments

Merge pull request #19 from hammerlab/move-imputation-from-experiments
moving helpers for performing dataset imputations from experiments
7d17bedc · Alex Rubinsteyn · 00b17305 · ac95b1f1 · 7d17bedc · 7d17bedc
Commit 7d17bedc authored 8 years ago by Alex Rubinsteyn
--- a/README.md
+++ b/README.md
-[![Build Status](https://travis-ci.org/hammerlab/mhcflurry.svg?branch=master)](https://travis-ci.org/hammerlab/mhcflurry) [![Coverage Status](https://coveralls.io/repos/github/hammerlab/mhcflurry/badge.svg?branch=fix-training-script)](https://coveralls.io/github/hammerlab/mhcflurry?branch=fix-training-script)
+[![Build Status](https://travis-ci.org/hammerlab/mhcflurry.svg?branch=master)](https://travis-ci.org/hammerlab/mhcflurry) [![Coverage Status](https://coveralls.io/repos/github/hammerlab/mhcflurry/badge.svg?branch=master)](https://coveralls.io/github/hammerlab/mhcflurry?branch=master)

 # mhcflurry
 Peptide-MHC binding affinity prediction

--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@ from __future__ import (
    division,
    absolute_import,
 )
-
+import numpy as np

 def parse_int_list(s):
    return [int(part.strip() for part in s.split(","))]
@@ -65,3 +65,37 @@ def split_allele_names(s):
        for part
        in s.split(",")
    ]
+
+def ic50_to_regression_target(ic50, max_ic50):
+    """
+    Transform IC50 inhibitory binding concentrations to affinity values between
+    [0,1] where 0 means a value greater or equal to max_ic50 and 1 means very
+    strong binder.
+
+    Parameters
+    ----------
+    ic50 : numpy.ndarray
+
+    max_ic50 : float
+    """
+    log_ic50 = np.log(ic50) / np.log(max_ic50)
+    regression_target = 1.0 - log_ic50
+    # clamp to values between 0, 1
+    regression_target = np.maximum(regression_target, 0.0)
+    regression_target = np.minimum(regression_target, 1.0)
+    return regression_target
+
+def regression_target_to_ic50(y, max_ic50):
+    """
+    Transform values between [0,1] to IC50 inhibitory binding concentrations
+    between [1.0, infinity]
+
+    Parameters
+    ----------
+    y : numpy.ndarray of float
+
+    max_ic50 : float
+
+    Returns numpy.ndarray
+    """
+    return max_ic50 ** (1.0 - y)
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -22,7 +22,7 @@ from collections import namedtuple, defaultdict
 import pandas as pd
 import numpy as np

-from .common import normalize_allele_name
+from .common import normalize_allele_name, ic50_to_regression_target
 from .amino_acid import common_amino_acids
 from .peptide_encoding import (
    indices_to_hotshot_encoding,
@@ -104,8 +104,7 @@ def load_dataframe(
    only_human : bool
        Only load entries from human MHC alleles

-    Returns DataFrame augmented with extra columns:
-        - "log_ic50" : log(ic50) / log(max_ic50)
+    Returns DataFrame augmented with extra column:
        - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1]
    """
    if sep is None:
@@ -138,13 +137,7 @@ def load_dataframe(

    df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
    ic50 = np.array(df[ic50_column_name])
-    log_ic50 = np.log(ic50) / np.log(max_ic50)
-    df["log_ic50"] = log_ic50
-    regression_output = 1.0 - log_ic50
-    # clamp to values between 0, 1
-    regression_output = np.maximum(regression_output, 0.0)
-    regression_output = np.minimum(regression_output, 1.0)
-    df["regression_output"] = regression_output
+    df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50)
    return df, peptide_column_name


@@ -278,6 +271,57 @@ def encode_peptide_to_affinity_dict(
            n_samples, len(Y))
    return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)

+def create_allele_data_from_peptide_to_ic50_dict(
+        peptide_to_ic50_dict,
+        max_ic50=MAX_IC50,
+        kmer_length=9,
+        flatten_binary_encoding=True):
+    """
+    Parameters
+    ----------
+    peptide_to_ic50_dict : dict
+        Dictionary mapping peptides of different lengths to IC50 binding
+        affinity values.
+
+    max_ic50 : float
+        Maximum IC50 value used as the cutoff for affinity of 0.0 when
+        transforming from IC50 to regression targets.
+
+    kmer_length : int
+        What length substrings will be fed to a fixed-length predictor?
+
+    flatten_binary_encoding : bool
+        Should hotshot encodings of amino acid inputs be flattened into a 1D
+        vector or have two dimensions (where the first represents position)?
+
+    Return an AlleleData object.
+    """
+    Y_dict = {
+        peptide: ic50_to_regression_target(ic50, max_ic50)
+        for (peptide, ic50)
+        in peptide_to_ic50_dict.items()
+    }
+    (kmer_peptides, original_peptides, counts, X_index, X_binary, Y_kmer) = \
+        encode_peptide_to_affinity_dict(
+            Y_dict,
+            peptide_length=kmer_length,
+            flatten_binary_encoding=flatten_binary_encoding)
+
+    ic50_array = np.array([peptide_to_ic50_dict[p] for p in original_peptides])
+    assert len(kmer_peptides) == len(ic50_array), \
+        "Mismatch between # of peptides %d and # IC50 outputs %d" % (
+            len(kmer_peptides), len(ic50_array))
+
+    return AlleleData(
+        X_index=X_index,
+        X_binary=X_binary,
+        Y=Y_kmer,
+        ic50=ic50_array,
+        peptides=kmer_peptides,
+        original_peptides=original_peptides,
+        original_lengths=[len(peptide) for peptide in original_peptides],
+        substring_counts=counts,
+        weights=1.0 / counts)

 def load_allele_datasets(
        filename,
@@ -371,35 +415,9 @@ def load_allele_datasets(
            for (peptide, ic50)
            in zip(raw_peptides, group[ic50_column_name])
        }
-
-        Y_dict = {
-            peptide: y
-            for (peptide, y)
-            in zip(raw_peptides, group["regression_output"])
-        }
-
-        (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \
-            encode_peptide_to_affinity_dict(
-                Y_dict,
-                peptide_length=peptide_length,
-                flatten_binary_encoding=flatten_binary_encoding)
-
-        ic50 = np.array([ic50_dict[p] for p in original_peptides])
-
-        assert len(kmer_peptides) == len(ic50), \
-            "Mismatch between # of peptides %d and # IC50 outputs %d" % (
-                len(kmer_peptides), len(ic50))
-
-        allele_groups[allele] = AlleleData(
-            X_index=X_index,
-            X_binary=X_binary,
-            Y=Y,
-            ic50=ic50,
-            peptides=kmer_peptides,
-            original_peptides=original_peptides,
-            original_lengths=[len(peptide) for peptide in original_peptides],
-            substring_counts=counts,
-            weights=1.0 / counts)
+        allele_groups[allele] = create_allele_data_from_peptide_to_ic50_dict(
+            ic50_dict,
+            max_ic50=max_ic50)
    return allele_groups



--- a/mhcflurry/feedforward.py
+++ b/mhcflurry/feedforward.py
@@ -27,28 +27,6 @@ import theano
 theano.config.exception_verbosity = 'high'


-def compile_forward_predictor(model, theano_mode=None):
-    """
-    In cases where we want to get predictions from a model that hasn't
-    been compiled (to avoid overhead of compiling training code),
-    use this helper to only compile the subset of Theano needed for
-    forward-propagation/predictions.
-    """
-
-    model.X_test = model.get_input(train=False)
-    model.y_test = model.get_output(train=False)
-    if type(model.X_test) == list:
-        predict_ins = model.X_test
-    else:
-        predict_ins = [model.X_test]
-
-    model._predict = theano.function(
-        predict_ins,
-        model.y_test,
-        allow_input_downcast=True,
-        mode=theano_mode)
-
-
 def make_network(
        input_size,
        embedding_input_dim=None,
@@ -61,8 +39,7 @@ def make_network(
        dropout_probability=0.0,
        model=None,
        optimizer=None,
-        learning_rate=0.001,
-        compile_for_training=True):
+        learning_rate=0.001):

    if model is None:
        model = Sequential()
@@ -112,16 +89,13 @@ def make_network(
        output_dim=1,
        init=init))
    model.add(Activation(output_activation))
-    if compile_for_training:
-        model.compile(loss=loss, optimizer=optimizer)
-    else:
-        compile_forward_predictor(model)
+    model.compile(loss=loss, optimizer=optimizer)
    return model


 def make_hotshot_network(
        peptide_length=9,
-        layer_sizes=[500],
+        layer_sizes=[100],
        activation="relu",
        init="lecun_uniform",
        loss="mse",
@@ -146,7 +120,7 @@ def make_embedding_network(
        peptide_length=9,
        embedding_input_dim=20,
        embedding_output_dim=20,
-        layer_sizes=[500],
+        layer_sizes=[100],
        activation="relu",
        init="lecun_uniform",
        loss="mse",

--- a/mhcflurry/imputation.py
+++ b/mhcflurry/imputation.py
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
+from collections import defaultdict
+import numpy as np
+from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary
+
+from .data import (
+    create_allele_data_from_peptide_to_ic50_dict,
+)
+
+def prune_dense_matrix_and_labels(
+        X,
+        peptide_list,
+        allele_list,
+        min_observations_per_peptide=1,
+        min_observations_per_allele=1):
+    """
+    Filter the dense matrix of pMHC binding affinities according to
+    the given minimum number of row/column observations.
+
+    Parameters
+    ----------
+    X : numpy.ndarray
+        Incomplete dense matrix of pMHC affinity with n_peptides rows and
+        n_alleles columns.
+
+    peptide_list : list of str
+        Expected to have n_peptides entries
+
+    allele_list : list of str
+        Expected to have n_alleles entries
+
+    min_observations_per_peptide : int
+        Drop peptide rows with fewer than this number of observed values.
+
+    min_observations_per_allele : int
+        Drop allele columns with fewer than this number of observed values.
+    """
+    observed_mask = np.isfinite(X)
+    n_observed_per_peptide = observed_mask.sum(axis=1)
+    too_few_peptide_observations = (
+        n_observed_per_peptide < min_observations_per_peptide)
+    if too_few_peptide_observations.any():
+        drop_peptide_indices = np.where(too_few_peptide_observations)[0]
+        keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
+        print("Dropping %d peptides with <%d observations" % (
+            len(drop_peptide_indices),
+            min_observations_per_peptide))
+        X = X[keep_peptide_indices]
+        observed_mask = observed_mask[keep_peptide_indices]
+        peptide_list = [peptide_list[i] for i in keep_peptide_indices]
+
+    n_observed_per_allele = observed_mask.sum(axis=0)
+    too_few_allele_observations = (
+        n_observed_per_allele < min_observations_per_peptide)
+    if too_few_peptide_observations.any():
+        drop_allele_indices = np.where(too_few_allele_observations)[0]
+        keep_allele_indices = np.where(~too_few_allele_observations)[0]
+        print("Dropping %d alleles with <%d observations: %s" % (
+            len(drop_allele_indices),
+            min_observations_per_allele,
+            [allele_list[i] for i in drop_allele_indices]))
+        X = X[:, keep_allele_indices]
+        observed_mask = observed_mask[:, keep_allele_indices]
+        allele_list = [allele_list[i] for i in keep_allele_indices]
+    return X, peptide_list, allele_list
+
+
+def create_incomplete_dense_pMHC_matrix(
+        allele_data_dict,
+        min_observations_per_peptide=1,
+        min_observations_per_allele=1):
+    """
+    Given a dictionary mapping each allele name onto an AlleleData object,
+    returns a tuple with a dense matrix of affinities, a list of peptide labels
+    for each row and a list of allele labels for each column.
+
+    Parameters
+    ----------
+    allele_data_dict : dict
+        Dictionary mapping allele names to AlleleData objects
+
+    imputer : object
+        Expected to have a method imputer.complete(X) which takes an array
+        with missing entries marked by NaN and returns a completed array.
+
+    min_observations_per_peptide : int
+        Drop peptide rows with fewer than this number of observed values.
+
+    min_observations_per_allele : int
+        Drop allele columns with fewer than this number of observed values.
+    """
+    peptide_to_allele_to_affinity_dict = defaultdict(dict)
+    for (allele_name, allele_data) in allele_data_dict.items():
+        for peptide, affinity in zip(
+                allele_data.original_peptides,
+                allele_data.Y):
+            if allele_name not in peptide_to_allele_to_affinity_dict[peptide]:
+                peptide_to_allele_to_affinity_dict[peptide][allele_name] = affinity
+
+    n_binding_values = sum(
+        len(allele_dict)
+        for allele_dict in
+        allele_data_dict.values()
+    )
+    print("Collected %d binding values for %d alleles" % (
+        n_binding_values,
+        len(peptide_to_allele_to_affinity_dict)))
+
+    X, peptide_list, allele_list = \
+        dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity_dict)
+    return prune_dense_matrix_and_labels(
+        X,
+        peptide_list,
+        allele_list,
+        min_observations_per_peptide=min_observations_per_peptide,
+        min_observations_per_allele=min_observations_per_allele)
+
+def dense_pMHC_matrix_to_nested_dict(X, peptide_list, allele_list):
+    """
+    Converts a dense matrix of (n_peptides, n_alleles) floats to a nested
+    dictionary from allele -> peptide -> affinity.
+    """
+    allele_to_peptide_to_ic50_dict = defaultdict(dict)
+    for row_index, peptide in enumerate(peptide_list):
+        for column_index, allele_name in enumerate(allele_list):
+            affinity = X[row_index, column_index]
+            if np.isfinite(affinity):
+                allele_to_peptide_to_ic50_dict[allele_name][peptide] = affinity
+    return allele_to_peptide_to_ic50_dict
+
+def create_imputed_datasets(
+        allele_data_dict,
+        imputer,
+        min_observations_per_peptide=1,
+        min_observations_per_allele=1):
+    """
+    Parameters
+    ----------
+    allele_data_dict : dict
+        Dictionary mapping allele names to AlleleData objects
+
+    imputer : object
+        Expected to have a method imputer.complete(X) which takes an array
+        with missing entries marked by NaN and returns a completed array.
+
+    min_observations_per_peptide : int
+        Drop peptide rows with fewer than this number of observed values.
+
+    min_observations_per_allele : int
+        Drop allele columns with fewer than this number of observed values.
+
+    Returns dictionary mapping allele names to AlleleData objects containing
+    imputed affinity values.
+    """
+    X_incomplete, peptide_list, allele_list = create_incomplete_dense_pMHC_matrix(
+        allele_data_dict=allele_data_dict,
+        min_observations_per_peptide=min_observations_per_peptide,
+        min_observations_per_allele=min_observations_per_allele)
+    if np.isnan(X_incomplete).sum() == 0:
+        # if all entries in the matrix are already filled in then don't
+        # try using an imputation algorithm since it might raise an
+        # exception.
+        X_complete = X_incomplete
+    else:
+        X_complete = imputer.complete(X_incomplete)
+    allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict(
+        X=X_complete,
+        peptide_list=peptide_list,
+        allele_list=allele_list)
+    return {
+        allele_name: create_allele_data_from_peptide_to_ic50_dict(allele_dict)
+        for (allele_name, allele_dict)
+        in allele_to_peptide_to_affinity_dict.items()
+    }
--- a/mhcflurry/package_metadata.py
+++ b/mhcflurry/package_metadata.py

-__version__ = "0.0.2"
+__version__ = "0.0.3"
--- a/mhcflurry/predict.py
+++ b/mhcflurry/predict.py
@@ -17,11 +17,19 @@ from collections import OrderedDict
 import pandas as pd

 from .class1_binding_predictor import Class1BindingPredictor
+from .common import normalize_allele_name

 def predict(alleles, peptides):
-    allele_results = OrderedDict([])
+    result_dict = OrderedDict([
+        ("allele", []),
+        ("peptide", []),
+        ("ic50", []),
+    ])
    for allele in alleles:
+        allele = normalize_allele_name(allele)
        model = Class1BindingPredictor.from_allele_name(allele)
-        result_dictionary = model.predict_peptides(peptides)
-        allele_results.append(result_dictionary)
-    return pd.concat(allele_results)
+        for i, ic50 in enumerate(model.predict_peptides_ic50(peptides)):
+            result_dict["allele"].append(allele)
+            result_dict["peptide"].append(peptides[i])
+            result_dict["ic50"].append(ic50)
+    return pd.DataFrame(result_dict)
--- a/mhcflurry/predictor_base.py
+++ b/mhcflurry/predictor_base.py
@@ -22,6 +22,7 @@ from .amino_acid import (
    amino_acids_with_unknown,
    common_amino_acids
 )
+from .common import regression_target_to_ic50


 class PredictorBase(object):
@@ -40,13 +41,6 @@ class PredictorBase(object):
        self.allow_unknown_amino_acids = allow_unknown_amino_acids
        self.verbose = verbose

-    def log_to_ic50(self, log_value):
-        """
-        Convert neural network output to IC50 values between 0.0 and
-        self.max_ic50 (typically 5000, 20000 or 50000)
-        """
-        return self.max_ic50 ** (1.0 - log_value)
-
    def encode_9mer_peptides(self, peptides):
        if self.allow_unknown_amino_acids:
            return amino_acids_with_unknown.index_encoding(peptides, 9)
@@ -90,14 +84,15 @@ class PredictorBase(object):
        return self.predict(X)

    def predict_9mer_peptides_ic50(self, peptides):
-        return self.log_to_ic50(self.predict_9mer_peptides(peptides))
+        scores = self.predict_9mer_peptides(peptides)
+        return regression_target_to_ic50(scores, max_ic50=self.max_ic50)

    def predict_peptides_ic50(self, peptides):
        """
        Predict IC50 affinities for peptides of any length
        """
-        return self.log_to_ic50(
-            self.predict_peptides(peptides))
+        scores = self.predict_peptides(peptides)
+        return regression_target_to_ic50(scores, max_ic50=self.max_ic50)

    def predict(self, X):
        raise ValueError("Method 'predict' not yet implemented for %s!" % (

--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,4 @@ h5py
 cherrypy
 bottle
 fancyimpute
-scikit-learn
+scikit-learn
\ No newline at end of file
--- a/test/test_allele_data.py
+++ b/test/test_allele_data.py
+from nose.tools import eq_
+from mhcflurry.data import (
+    create_allele_data_from_peptide_to_ic50_dict,
+    AlleleData
+)
+
+def test_create_allele_data_from_peptide_to_ic50_dict():
+    peptide_to_ic50_dict = {
+        ("A" * 10): 1.2,
+        ("C" * 9): 1000,
+    }
+    allele_data = create_allele_data_from_peptide_to_ic50_dict(
+        peptide_to_ic50_dict,
+        max_ic50=50000.0)
+    assert isinstance(allele_data, AlleleData)
+    expected_peptides = set([
+        "A" * 9,
+        "C" * 9,
+    ])
+    peptides = set(allele_data.peptides)
+    eq_(expected_peptides, peptides)
--- a/test/test_common.py
+++ b/test/test_common.py
+from mhcflurry.common import (
+    ic50_to_regression_target,
+    regression_target_to_ic50,
+)
+from nose.tools import eq_
+
+def test_regression_target_to_ic50():
+    eq_(regression_target_to_ic50(0, max_ic50=500.0), 500)
+    eq_(regression_target_to_ic50(1, max_ic50=500.0), 1.0)
+
+def test_ic50_to_regression_target():
+    eq_(ic50_to_regression_target(5000, max_ic50=5000.0), 0)
+    eq_(ic50_to_regression_target(0, max_ic50=5000.0), 1.0)
--- a/test/test_imputation.py
+++ b/test/test_imputation.py
+from mhcflurry.imputation import (
+    create_imputed_datasets,
+)
+from mhcflurry.data import create_allele_data_from_peptide_to_ic50_dict
+
+from fancyimpute import MICE
+from nose.tools import eq_
+
+def test_create_imputed_datasets_empty():
+    result = create_imputed_datasets({}, imputer=MICE(n_imputations=25))
+    eq_(result, {})
+
+def test_create_imputed_datasets_two_alleles():
+    allele_data_dict = {
+        "HLA-A*02:01": create_allele_data_from_peptide_to_ic50_dict({
+            "A" * 9: 20.0,
+            "C" * 9: 40000.0,
+        }),
+        "HLA-A*02:05": create_allele_data_from_peptide_to_ic50_dict({
+            "S" * 9: 500.0,
+            "A" * 9: 25.0,
+        }),
+    }
+    result = create_imputed_datasets(allele_data_dict, imputer=MICE(n_imputations=25))
+    eq_(set(result.keys()), {"HLA-A*02:01", "HLA-A*02:05"})
+    expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
+    for allele_name, allele_data in result.items():
+        print(allele_name)
+        print(allele_data)
+        eq_(set(allele_data.peptides), expected_peptides)
--- a/test/test_neural_nets.py
+++ b/test/test_neural_nets.py
+from mhcflurry.feedforward import (
+    make_embedding_network,
+    make_hotshot_network,
+)
+import numpy as np
+
+def test_make_embedding_network():
+    nn = make_embedding_network(
+        peptide_length=3,
+        layer_sizes=[3],
+        activation="tanh",
+        embedding_input_dim=3,
+        embedding_output_dim=20,
+        learning_rate=0.1)
+
+    X_negative = np.array([
+        [0] * 3,
+        [1] * 3,
+        [1, 0, 0],
+        [1, 1, 0],
+        [0, 1, 0],
+        [0, 0, 1],
+        [1, 0, 1],
+    ])
+    X_positive = np.array([
+        [0, 2, 0],
+        [1, 2, 0],
+        [1, 2, 1],
+        [0, 2, 1],
+        [2, 2, 0],
+        [2, 2, 1],
+        [2, 2, 2],
+    ])
+    X_index = np.vstack([X_negative, X_positive])
+    Y = np.array([0.0] * len(X_negative) + [1.0] * len(X_positive))
+    nn.fit(X_index, Y, nb_epoch=10)
+    Y_pred = nn.predict(X_index)
+    print(Y)
+    print(Y_pred)
+    for (Y_i, Y_pred_i) in zip(Y, Y_pred):
+        assert abs(Y_i - Y_pred_i) <= 0.25, (Y_i, Y_pred_i)
+
+def test_make_hotshot_network():
+    nn = make_hotshot_network(
+        peptide_length=3,
+        activation="relu",
+        layer_sizes=[4],
+        n_amino_acids=2,
+        learning_rate=0.1)
+    X_binary = np.array([
+        [True, False, True, False, True, False],
+        [True, False, True, False, False, True],
+        [True, False, False, True, True, False],
+        [True, False, False, True, False, True],
+        [False, True, True, False, True, False],
+        [False, True, True, False, False, True],
+    ], dtype=bool)
+    Y = np.array([0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
+    nn.fit(X_binary, Y, nb_epoch=10)
+    Y_pred = nn.predict(X_binary)
+    print(Y)
+    print(Y_pred)
+    for (Y_i, Y_pred_i) in zip(Y, Y_pred):
+        assert abs(Y_i - Y_pred_i) <= 0.25, (Y_i, Y_pred_i)
--- a/test/test_predict.py
+++ b/test/test_predict.py
+from mhcflurry import predict
+
+def test_predict_A1_Titin_epitope():
+    result_df = predict(alleles=["HLA-A*01:01"], peptides=["ESDPIVAQY"])
+    assert len(result_df) == 1
+    row = result_df.ix[0]
+    ic50 = row["ic50"]
+    assert ic50 <= 500