moving helpers for performing dataset imputations from experiments and added a few tests

2ab2dbd9 · Alex Rubinsteyn · 00b17305 · 2ab2dbd9 · 2ab2dbd9 · 2ab2dbd9
Commit 2ab2dbd9 authored 8 years ago by Alex Rubinsteyn
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@ from __future__ import (
    division,
    absolute_import,
 )
-
+import numpy as np

 def parse_int_list(s):
    return [int(part.strip() for part in s.split(","))]
@@ -65,3 +65,37 @@ def split_allele_names(s):
        for part
        in s.split(",")
    ]
+
+def ic50_to_regression_target(ic50, max_ic50):
+    """
+    Transform IC50 inhibitory binding concentrations to affinity values between
+    [0,1] where 0 means a value greater or equal to max_ic50 and 1 means very
+    strong binder.
+
+    Parameters
+    ----------
+    ic50 : numpy.ndarray
+
+    max_ic50 : float
+    """
+    log_ic50 = np.log(ic50) / np.log(max_ic50)
+    regression_target = 1.0 - log_ic50
+    # clamp to values between 0, 1
+    regression_target = np.maximum(regression_target, 0.0)
+    regression_target = np.minimum(regression_target, 1.0)
+    return regression_target
+
+def regression_target_to_ic50(y, max_ic50):
+    """
+    Transform values between [0,1] to IC50 inhibitory binding concentrations
+    between [1.0, infinity]
+
+    Parameters
+    ----------
+    y : numpy.ndarray of float
+
+    max_ic50 : float
+
+    Returns numpy.ndarray
+    """
+    return max_ic50 ** (1.0 - y)
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -22,7 +22,7 @@ from collections import namedtuple, defaultdict
 import pandas as pd
 import numpy as np

-from .common import normalize_allele_name
+from .common import normalize_allele_name, ic50_to_regression_target
 from .amino_acid import common_amino_acids
 from .peptide_encoding import (
    indices_to_hotshot_encoding,
@@ -104,8 +104,7 @@ def load_dataframe(
    only_human : bool
        Only load entries from human MHC alleles

-    Returns DataFrame augmented with extra columns:
-        - "log_ic50" : log(ic50) / log(max_ic50)
+    Returns DataFrame augmented with extra column:
        - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1]
    """
    if sep is None:
@@ -138,13 +137,7 @@ def load_dataframe(

    df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
    ic50 = np.array(df[ic50_column_name])
-    log_ic50 = np.log(ic50) / np.log(max_ic50)
-    df["log_ic50"] = log_ic50
-    regression_output = 1.0 - log_ic50
-    # clamp to values between 0, 1
-    regression_output = np.maximum(regression_output, 0.0)
-    regression_output = np.minimum(regression_output, 1.0)
-    df["regression_output"] = regression_output
+    df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50)
    return df, peptide_column_name


@@ -278,6 +271,57 @@ def encode_peptide_to_affinity_dict(
            n_samples, len(Y))
    return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)

+def create_allele_data_from_peptide_to_ic50_dict(
+        peptide_to_ic50_dict,
+        max_ic50=MAX_IC50,
+        kmer_length=9,
+        flatten_binary_encoding=True):
+    """
+    Parameters
+    ----------
+    peptide_to_ic50_dict : dict
+        Dictionary mapping peptides of different lengths to IC50 binding
+        affinity values.
+
+    max_ic50 : float
+        Maximum IC50 value used as the cutoff for affinity of 0.0 when
+        transforming from IC50 to regression targets.
+
+    kmer_length : int
+        What length substrings will be fed to a fixed-length predictor?
+
+    flatten_binary_encoding : bool
+        Should hotshot encodings of amino acid inputs be flattened into a 1D
+        vector or have two dimensions (where the first represents position)?
+
+    Return an AlleleData object.
+    """
+    Y_dict = {
+        peptide: ic50_to_regression_target(ic50, max_ic50)
+        for (peptide, ic50)
+        in peptide_to_ic50_dict.items()
+    }
+    (kmer_peptides, original_peptides, counts, X_index, X_binary, Y_kmer) = \
+        encode_peptide_to_affinity_dict(
+            Y_dict,
+            peptide_length=kmer_length,
+            flatten_binary_encoding=flatten_binary_encoding)
+
+    ic50_array = np.array([peptide_to_ic50_dict[p] for p in original_peptides])
+    assert len(kmer_peptides) == len(ic50_array), \
+        "Mismatch between # of peptides %d and # IC50 outputs %d" % (
+            len(kmer_peptides), len(ic50_array))
+
+    return AlleleData(
+        X_index=X_index,
+        X_binary=X_binary,
+        Y=Y_kmer,
+        ic50=ic50_array,
+        peptides=kmer_peptides,
+        original_peptides=original_peptides,
+        original_lengths=[len(peptide) for peptide in original_peptides],
+        substring_counts=counts,
+        weights=1.0 / counts)

 def load_allele_datasets(
        filename,
@@ -371,35 +415,10 @@ def load_allele_datasets(
            for (peptide, ic50)
            in zip(raw_peptides, group[ic50_column_name])
        }
-
-        Y_dict = {
-            peptide: y
-            for (peptide, y)
-            in zip(raw_peptides, group["regression_output"])
-        }
-
-        (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \
-            encode_peptide_to_affinity_dict(
-                Y_dict,
-                peptide_length=peptide_length,
-                flatten_binary_encoding=flatten_binary_encoding)
-
-        ic50 = np.array([ic50_dict[p] for p in original_peptides])
-
-        assert len(kmer_peptides) == len(ic50), \
-            "Mismatch between # of peptides %d and # IC50 outputs %d" % (
-                len(kmer_peptides), len(ic50))
-
-        allele_groups[allele] = AlleleData(
-            X_index=X_index,
-            X_binary=X_binary,
-            Y=Y,
-            ic50=ic50,
-            peptides=kmer_peptides,
-            original_peptides=original_peptides,
-            original_lengths=[len(peptide) for peptide in original_peptides],
-            substring_counts=counts,
-            weights=1.0 / counts)
+        allele_date = create_allele_data_from_peptide_to_ic50_dict(
+            ic50_dict,
+            max_ic50=max_ic50)
+        allele_groups[allele] = allele_data
    return allele_groups



--- a/mhcflurry/package_metadata.py
+++ b/mhcflurry/package_metadata.py

-__version__ = "0.0.2"
+__version__ = "0.0.3"
--- a/test/test_allele_data.py
+++ b/test/test_allele_data.py
+from nose.tools import eq_
+from mhcflurry.data import (
+    create_allele_data_from_peptide_to_ic50_dict,
+    AlleleData
+)
+
+def test_create_allele_data_from_peptide_to_ic50_dict():
+    peptide_to_ic50_dict = {
+        ("A" * 10): 1.2,
+        ("C" * 9): 1000,
+    }
+    allele_data = create_allele_data_from_peptide_to_ic50_dict(peptide_to_ic50_dict, max_ic50=50000.0)
+    assert isinstance(allele_data, AlleleData)
+    expected_peptides = set([
+        "A" * 9,
+        "C" * 9,
+    ])
+    peptides = set(allele_data.peptides)
+    eq_(expected_peptides, peptides)
\ No newline at end of file
--- a/test/test_common.py
+++ b/test/test_common.py
+from mhcflurry.common import (
+    ic50_to_regression_target,
+    regression_target_to_ic50,
+)
+from nose.tools import eq_
+
+def test_regression_target_to_ic50():
+    eq_(regression_target_to_ic50(0, max_ic50=500.0), 500)
+    eq_(regression_target_to_ic50(1, max_ic50=500.0), 1.0)
+
+def test_ic50_to_regression_target():
+    eq_(ic50_to_regression_target(5000, max_ic50=5000.0), 0)
+    eq_(ic50_to_regression_target(0, max_ic50=5000.0), 1.0)