Skip to content
Snippets Groups Projects
Commit 2ab2dbd9 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

Move helpers for performing dataset imputations out of experiments and add a few tests

parent 00b17305
No related branches found
No related tags found
No related merge requests found
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,7 +17,7 @@ from __future__ import (
division,
absolute_import,
)
import numpy as np
def parse_int_list(s):
    """
    Parse a comma-separated string of integers into a list of ints.

    Parameters
    ----------
    s : str
        Comma-separated integers, e.g. "8, 9,10". Whitespace around each
        entry is ignored.

    Returns list of int

    Raises ValueError if any entry is not a valid integer literal.
    """
    # int() has to be applied to each entry individually; wrapping the whole
    # generator expression in int() raises TypeError on every call.
    return [int(part.strip()) for part in s.split(",")]
......@@ -65,3 +65,37 @@ def split_allele_names(s):
for part
in s.split(",")
]
def ic50_to_regression_target(ic50, max_ic50):
    """
    Map IC50 inhibitory binding concentrations onto a [0, 1] affinity scale.

    The result is 1.0 - log(ic50) / log(max_ic50), limited to [0, 1]: an IC50
    equal to max_ic50 maps to 0 and an IC50 of 1.0 maps to 1 (very strong
    binder). Values falling outside that range are clamped.

    Parameters
    ----------
    ic50 : numpy.ndarray

    max_ic50 : float
    """
    scaled_log = np.log(ic50) / np.log(max_ic50)
    # the inner maximum applies the floor of 0, the outer minimum the
    # ceiling of 1
    return np.minimum(np.maximum(1.0 - scaled_log, 0.0), 1.0)
def regression_target_to_ic50(y, max_ic50):
    """
    Transform regression targets back into IC50 inhibitory binding
    concentrations by computing max_ic50 ** (1.0 - y).

    For targets in [0, 1] the result lies in [1.0, max_ic50]: a target of 1
    gives an IC50 of 1.0 and a target of 0 gives max_ic50. Targets outside
    [0, 1] are not clamped.

    Parameters
    ----------
    y : float, numpy.ndarray of float, or list/tuple of float

    max_ic50 : float

    Returns a float for scalar input, otherwise an array of IC50 values.
    """
    # Plain Python sequences do not support `1.0 - y`; convert only those so
    # scalars and array types keep the behavior they already had.
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)
    return max_ic50 ** (1.0 - y)
......@@ -22,7 +22,7 @@ from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
from .common import normalize_allele_name
from .common import normalize_allele_name, ic50_to_regression_target
from .amino_acid import common_amino_acids
from .peptide_encoding import (
indices_to_hotshot_encoding,
......@@ -104,8 +104,7 @@ def load_dataframe(
only_human : bool
Only load entries from human MHC alleles
Returns DataFrame augmented with extra columns:
- "log_ic50" : log(ic50) / log(max_ic50)
Returns DataFrame augmented with extra column:
- "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1]
"""
if sep is None:
......@@ -138,13 +137,7 @@ def load_dataframe(
df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
ic50 = np.array(df[ic50_column_name])
log_ic50 = np.log(ic50) / np.log(max_ic50)
df["log_ic50"] = log_ic50
regression_output = 1.0 - log_ic50
# clamp to values between 0, 1
regression_output = np.maximum(regression_output, 0.0)
regression_output = np.minimum(regression_output, 1.0)
df["regression_output"] = regression_output
df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50)
return df, peptide_column_name
......@@ -278,6 +271,57 @@ def encode_peptide_to_affinity_dict(
n_samples, len(Y))
return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
def create_allele_data_from_peptide_to_ic50_dict(
        peptide_to_ic50_dict,
        max_ic50=MAX_IC50,
        kmer_length=9,
        flatten_binary_encoding=True):
    """
    Build an AlleleData object from a mapping of peptides to IC50 values.

    Each IC50 is converted to a regression target, the peptides are encoded
    by encode_peptide_to_affinity_dict, and the IC50 of every entry in the
    returned original_peptides is looked up again so it can be stored
    alongside the encoded data.

    Parameters
    ----------
    peptide_to_ic50_dict : dict
        Dictionary mapping peptides of different lengths to IC50 binding
        affinity values.

    max_ic50 : float
        Maximum IC50 value used as the cutoff for affinity of 0.0 when
        transforming from IC50 to regression targets.

    kmer_length : int
        Length of the substrings fed to a fixed-length predictor.

    flatten_binary_encoding : bool
        Should hotshot encodings of amino acid inputs be flattened into a 1D
        vector or have two dimensions (where the first represents position)?

    Return an AlleleData object.
    """
    peptide_to_target = {}
    for peptide, ic50 in peptide_to_ic50_dict.items():
        peptide_to_target[peptide] = ic50_to_regression_target(ic50, max_ic50)

    encoded = encode_peptide_to_affinity_dict(
        peptide_to_target,
        peptide_length=kmer_length,
        flatten_binary_encoding=flatten_binary_encoding)
    (kmer_peptides, original_peptides, counts, X_index, X_binary, Y_kmer) = \
        encoded

    # one IC50 per entry of original_peptides, taken from the input dict
    ic50_array = np.array([
        peptide_to_ic50_dict[source_peptide]
        for source_peptide in original_peptides
    ])
    n_kmers = len(kmer_peptides)
    n_ic50 = len(ic50_array)
    assert n_kmers == n_ic50, \
        "Mismatch between # of peptides %d and # IC50 outputs %d" % (
            n_kmers, n_ic50)

    source_lengths = [
        len(source_peptide) for source_peptide in original_peptides
    ]
    # presumably down-weights k-mers whose source peptide produced several
    # substrings -- confirm against encode_peptide_to_affinity_dict
    sample_weights = 1.0 / counts
    return AlleleData(
        X_index=X_index,
        X_binary=X_binary,
        Y=Y_kmer,
        ic50=ic50_array,
        peptides=kmer_peptides,
        original_peptides=original_peptides,
        original_lengths=source_lengths,
        substring_counts=counts,
        weights=sample_weights)
def load_allele_datasets(
filename,
......@@ -371,35 +415,10 @@ def load_allele_datasets(
for (peptide, ic50)
in zip(raw_peptides, group[ic50_column_name])
}
Y_dict = {
peptide: y
for (peptide, y)
in zip(raw_peptides, group["regression_output"])
}
(kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \
encode_peptide_to_affinity_dict(
Y_dict,
peptide_length=peptide_length,
flatten_binary_encoding=flatten_binary_encoding)
ic50 = np.array([ic50_dict[p] for p in original_peptides])
assert len(kmer_peptides) == len(ic50), \
"Mismatch between # of peptides %d and # IC50 outputs %d" % (
len(kmer_peptides), len(ic50))
allele_groups[allele] = AlleleData(
X_index=X_index,
X_binary=X_binary,
Y=Y,
ic50=ic50,
peptides=kmer_peptides,
original_peptides=original_peptides,
original_lengths=[len(peptide) for peptide in original_peptides],
substring_counts=counts,
weights=1.0 / counts)
# Bind the result to `allele_data`: it was previously assigned to a
# misspelled `allele_date`, so the lookup on the last line raised NameError.
# Also forward the k-mer length and encoding flag, which the inlined code
# this call replaces passed on to encode_peptide_to_affinity_dict.
allele_data = create_allele_data_from_peptide_to_ic50_dict(
    ic50_dict,
    max_ic50=max_ic50,
    kmer_length=peptide_length,
    flatten_binary_encoding=flatten_binary_encoding)
allele_groups[allele] = allele_data
return allele_groups
......
__version__ = "0.0.2"
__version__ = "0.0.3"
from nose.tools import eq_
from mhcflurry.data import (
create_allele_data_from_peptide_to_ic50_dict,
AlleleData
)
def test_create_allele_data_from_peptide_to_ic50_dict():
    """
    A 10-mer of "A" and a 9-mer of "C" should yield an AlleleData whose
    distinct peptides are exactly the 9-mers "AAAAAAAAA" and "CCCCCCCCC".
    """
    ten_mer = "A" * 10
    nine_mer = "C" * 9
    allele_data = create_allele_data_from_peptide_to_ic50_dict(
        {ten_mer: 1.2, nine_mer: 1000},
        max_ic50=50000.0)
    assert isinstance(allele_data, AlleleData)
    # compare as sets: only the distinct 9-mers matter here
    expected_peptides = {"A" * 9, nine_mer}
    eq_(expected_peptides, set(allele_data.peptides))
\ No newline at end of file
from mhcflurry.common import (
ic50_to_regression_target,
regression_target_to_ic50,
)
from nose.tools import eq_
def test_regression_target_to_ic50():
    """Targets of 0 and 1 map back to max_ic50 and 1.0 respectively."""
    expected_ic50_by_target = [(0, 500), (1, 1.0)]
    for target, expected_ic50 in expected_ic50_by_target:
        eq_(regression_target_to_ic50(target, max_ic50=500.0), expected_ic50)
def test_ic50_to_regression_target():
    """An IC50 equal to max_ic50 maps to 0; an IC50 of 0 is clamped to 1."""
    at_cutoff = ic50_to_regression_target(5000, max_ic50=5000.0)
    eq_(at_cutoff, 0)
    # log(0) is -inf, so the unclamped target is +inf and the clamp yields 1.0
    at_zero = ic50_to_regression_target(0, max_ic50=5000.0)
    eq_(at_zero, 1.0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment