From 2ab2dbd978df5974e173713fb74501c5112801a1 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Thu, 28 Apr 2016 18:55:18 -0400 Subject: [PATCH] moving helpers for performing dataset imputations from experiments and added a few tests --- mhcflurry/common.py | 38 +++++++++++++- mhcflurry/data.py | 97 +++++++++++++++++++++-------------- mhcflurry/package_metadata.py | 2 +- test/test_allele_data.py | 19 +++++++ test/test_common.py | 13 +++++ 5 files changed, 127 insertions(+), 42 deletions(-) create mode 100644 test/test_allele_data.py create mode 100644 test/test_common.py diff --git a/mhcflurry/common.py b/mhcflurry/common.py index 6ab2f9fd..a3318c2f 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine +# Copyright (c) 2016. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ from __future__ import ( division, absolute_import, ) - +import numpy as np def parse_int_list(s): return [int(part.strip() for part in s.split(","))] @@ -65,3 +65,37 @@ def split_allele_names(s): for part in s.split(",") ] + +def ic50_to_regression_target(ic50, max_ic50): + """ + Transform IC50 inhibitory binding concentrations to affinity values between + [0,1] where 0 means a value greater or equal to max_ic50 and 1 means very + strong binder. + + Parameters + ---------- + ic50 : numpy.ndarray + + max_ic50 : float + """ + log_ic50 = np.log(ic50) / np.log(max_ic50) + regression_target = 1.0 - log_ic50 + # clamp to values between 0, 1 + regression_target = np.maximum(regression_target, 0.0) + regression_target = np.minimum(regression_target, 1.0) + return regression_target + +def regression_target_to_ic50(y, max_ic50): + """ + Transform values between [0,1] to IC50 inhibitory binding concentrations + between [1.0, infinity] + + Parameters + ---------- + y : numpy.ndarray of float + + max_ic50 : float + + Returns numpy.ndarray + """ + return max_ic50 ** (1.0 - y) diff --git a/mhcflurry/data.py b/mhcflurry/data.py index d4e9b83e..5defe489 100644 --- a/mhcflurry/data.py +++ b/mhcflurry/data.py @@ -22,7 +22,7 @@ from collections import namedtuple, defaultdict import pandas as pd import numpy as np -from .common import normalize_allele_name +from .common import normalize_allele_name, ic50_to_regression_target from .amino_acid import common_amino_acids from .peptide_encoding import ( indices_to_hotshot_encoding, @@ -104,8 +104,7 @@ def load_dataframe( only_human : bool Only load entries from human MHC alleles - Returns DataFrame augmented with extra columns: - - "log_ic50" : log(ic50) / log(max_ic50) + Returns DataFrame augmented with extra column: - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1] """ if sep is None: @@ -138,13 +137,7 @@ def load_dataframe( df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) ic50 = np.array(df[ic50_column_name]) - log_ic50 = np.log(ic50) / np.log(max_ic50) - df["log_ic50"] = log_ic50 - regression_output = 1.0 - log_ic50 - # clamp to values between 0, 1 - regression_output = np.maximum(regression_output, 0.0) - regression_output = np.minimum(regression_output, 1.0) - df["regression_output"] = regression_output + df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50) return df, peptide_column_name @@ -278,6 +271,57 @@ def encode_peptide_to_affinity_dict( n_samples, len(Y)) return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) +def create_allele_data_from_peptide_to_ic50_dict( + peptide_to_ic50_dict, + max_ic50=MAX_IC50, + kmer_length=9, + flatten_binary_encoding=True): + """ + Parameters + ---------- + peptide_to_ic50_dict : dict + Dictionary mapping peptides of different lengths to IC50 binding + affinity values. + + max_ic50 : float + Maximum IC50 value used as the cutoff for affinity of 0.0 when + transforming from IC50 to regression targets. + + kmer_length : int + What length substrings will be fed to a fixed-length predictor? + + flatten_binary_encoding : bool + Should hotshot encodings of amino acid inputs be flattened into a 1D + vector or have two dimensions (where the first represents position)? + + Return an AlleleData object. + """ + Y_dict = { + peptide: ic50_to_regression_target(ic50, max_ic50) + for (peptide, ic50) + in peptide_to_ic50_dict.items() + } + (kmer_peptides, original_peptides, counts, X_index, X_binary, Y_kmer) = \ + encode_peptide_to_affinity_dict( + Y_dict, + peptide_length=kmer_length, + flatten_binary_encoding=flatten_binary_encoding) + + ic50_array = np.array([peptide_to_ic50_dict[p] for p in original_peptides]) + assert len(kmer_peptides) == len(ic50_array), \ + "Mismatch between # of peptides %d and # IC50 outputs %d" % ( + len(kmer_peptides), len(ic50_array)) + + return AlleleData( + X_index=X_index, + X_binary=X_binary, + Y=Y_kmer, + ic50=ic50_array, + peptides=kmer_peptides, + original_peptides=original_peptides, + original_lengths=[len(peptide) for peptide in original_peptides], + substring_counts=counts, + weights=1.0 / counts) def load_allele_datasets( filename, @@ -371,35 +415,10 @@ def load_allele_datasets( for (peptide, ic50) in zip(raw_peptides, group[ic50_column_name]) } - - Y_dict = { - peptide: y - for (peptide, y) - in zip(raw_peptides, group["regression_output"]) - } - - (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \ - encode_peptide_to_affinity_dict( - Y_dict, - peptide_length=peptide_length, - flatten_binary_encoding=flatten_binary_encoding) - - ic50 = np.array([ic50_dict[p] for p in original_peptides]) - - assert len(kmer_peptides) == len(ic50), \ - "Mismatch between # of peptides %d and # IC50 outputs %d" % ( - len(kmer_peptides), len(ic50)) - - allele_groups[allele] = AlleleData( - X_index=X_index, - X_binary=X_binary, - Y=Y, - ic50=ic50, - peptides=kmer_peptides, - original_peptides=original_peptides, - original_lengths=[len(peptide) for peptide in original_peptides], - substring_counts=counts, - weights=1.0 / counts) + allele_date = create_allele_data_from_peptide_to_ic50_dict( + ic50_dict, + max_ic50=max_ic50) + allele_groups[allele] = allele_data return allele_groups diff --git a/mhcflurry/package_metadata.py b/mhcflurry/package_metadata.py index e62b38a7..373d4d3d 100644 --- a/mhcflurry/package_metadata.py +++ b/mhcflurry/package_metadata.py @@ -1,2 +1,2 @@ -__version__ = "0.0.2" +__version__ = "0.0.3" diff --git a/test/test_allele_data.py b/test/test_allele_data.py new file mode 100644 index 00000000..8048cfe2 --- /dev/null +++ b/test/test_allele_data.py @@ -0,0 +1,19 @@ +from nose.tools import eq_ +from mhcflurry.data import ( + create_allele_data_from_peptide_to_ic50_dict, + AlleleData +) + +def test_create_allele_data_from_peptide_to_ic50_dict(): + peptide_to_ic50_dict = { + ("A" * 10): 1.2, + ("C" * 9): 1000, + } + allele_data = create_allele_data_from_peptide_to_ic50_dict(peptide_to_ic50_dict, max_ic50=50000.0) + assert isinstance(allele_data, AlleleData) + expected_peptides = set([ + "A" * 9, + "C" * 9, + ]) + peptides = set(allele_data.peptides) + eq_(expected_peptides, peptides) \ No newline at end of file diff --git a/test/test_common.py b/test/test_common.py new file mode 100644 index 00000000..fb5cd8d9 --- /dev/null +++ b/test/test_common.py @@ -0,0 +1,13 @@ +from mhcflurry.common import ( + ic50_to_regression_target, + regression_target_to_ic50, +) +from nose.tools import eq_ + +def test_regression_target_to_ic50(): + eq_(regression_target_to_ic50(0, max_ic50=500.0), 500) + eq_(regression_target_to_ic50(1, max_ic50=500.0), 1.0) + +def test_ic50_to_regression_target(): + eq_(ic50_to_regression_target(5000, max_ic50=5000.0), 0) + eq_(ic50_to_regression_target(0, max_ic50=5000.0), 1.0) -- GitLab