Commit 9e3c4c8b authored by Alex Rubinsteyn's avatar Alex Rubinsteyn

moving encoding to dataset

parent 59e3b559
@@ -17,6 +17,7 @@ from __future__ import (
division,
absolute_import,
)
from math import exp, log
def parse_int_list(s):
return [int(part.strip()) for part in s.split(",")]
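# Illustrative usage sketch (the input string is hypothetical, not from this commit):
#     parse_int_list("8, 9, 10")   # -> [8, 9, 10]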
@@ -64,3 +65,17 @@ def split_allele_names(s):
for part
in s.split(",")
]
def geometric_mean(xs, weights=None):
"""
Geometric mean of a collection of affinity measurements, with optional
sample weights.
"""
if len(xs) == 1:
return xs[0]
elif weights is None:
return exp(sum(log(xi) for xi in xs) / len(xs))
sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
denom = sum(weights)
return exp(sum_weighted_log / denom)
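# Illustrative usage sketch; the affinity values and weights below are made up.
#     geometric_mean([100.0, 10000.0])                      # exp(mean of logs) ~ 1000.0
#     geometric_mean([100.0, 10000.0], weights=[2.0, 1.0])  # weighted version  ~ 464.2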
@@ -22,14 +22,13 @@ from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
from .common import normalize_allele_name
from .amino_acid import common_amino_acids
from .peptide_encoding import (
indices_to_hotshot_encoding,
fixed_length_index_encoding,
check_valid_index_encoding_array,
)
from .regression_target import MAX_IC50, ic50_to_regression_target
index_encoding = common_amino_acids.index_encoding
@@ -50,225 +49,8 @@ AlleleData = namedtuple(
])
def _infer_csv_separator(filename):
"""
Determine if file is separated by comma, tab, or whitespace.
Default to whitespace if the others are not detected.
Returns (sep, delim_whitespace)
"""
for candidate in [",", "\t"]:
with open(filename, "r") as f:
for line in f:
if line.startswith("#"):
continue
if candidate in line:
return candidate, False
return None, True
def load_dataframe(
filename,
max_ic50=MAX_IC50,
sep=None,
species_column_name="species",
allele_column_name="mhc",
peptide_column_name=None,
filter_peptide_length=None,
ic50_column_name="meas",
only_human=False):
"""
Load a dataframe of peptide-MHC affinity measurements
filename : str
TSV filename with columns:
- 'species'
- 'mhc'
- 'peptide_length'
- 'sequence'
- 'meas'
max_ic50 : float
Treat IC50 scores above this value as all equally bad
(transform them to 0.0 in the regression output)
sep : str, optional
Separator in CSV file, default is to let Pandas infer
peptide_column_name : str, optional
Default behavior is to try {"sequence", "peptide", "peptide_sequence"}
filter_peptide_length : int, optional
Which length peptides to use (default=load all lengths)
only_human : bool
Only load entries from human MHC alleles
Returns a tuple of (DataFrame, peptide column name); the DataFrame is
augmented with an extra column:
- "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1]
"""
if sep is None:
sep, delim_whitespace = _infer_csv_separator(filename)
else:
delim_whitespace = False
df = pd.read_csv(
filename,
sep=sep,
delim_whitespace=delim_whitespace,
engine="c")
# hack: work around variability in column naming by trying several
# candidate names for the peptide column
if peptide_column_name is None:
columns = set(df.keys())
for candidate in ["sequence", "peptide", "peptide_sequence"]:
if candidate in columns:
peptide_column_name = candidate
break
if peptide_column_name is None:
raise ValueError(
"Couldn't find peptide column name, candidates: %s" % (
columns))
if only_human:
human_mask = df[species_column_name] == "human"
df = df[human_mask]
if filter_peptide_length:
length_mask = df[peptide_column_name].str.len() == filter_peptide_length
df = df[length_mask]
df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
ic50 = np.array(df[ic50_column_name])
df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50)
return df, peptide_column_name
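# Illustrative usage sketch; "bdata.tsv" is a hypothetical file with the columns
# listed in the docstring. Assuming the default max_ic50 of 50000, an IC50 of 500
# maps to 1.0 - log(500)/log(50000) ~ 0.43 in the "regression_output" column.
#     df, peptide_col = load_dataframe("bdata.tsv", only_human=True)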
def load_allele_dicts(
filename,
max_ic50=MAX_IC50,
regression_output=False,
sep=None,
species_column_name="species",
allele_column_name="mhc",
peptide_column_name=None,
ic50_column_name="meas",
only_human=False,
min_allele_size=1):
"""
Parse a CSV of binding data into a dictionary of dictionaries.
The outer key is an allele name, the inner key is a peptide sequence,
and the inner value is either a raw IC50 or a log-transformed value in
[0, 1], depending on `regression_output`.
"""
binding_df, peptide_column_name = load_dataframe(
filename=filename,
max_ic50=max_ic50,
sep=sep,
species_column_name=species_column_name,
allele_column_name=allele_column_name,
peptide_column_name=peptide_column_name,
ic50_column_name=ic50_column_name,
only_human=only_human)
# map peptides to either the raw IC50 or rescaled log IC50 depending
# on the `regression_output` parameter
output_column_name = (
"regression_output"
if regression_output
else ic50_column_name
)
return {
allele_name: {
peptide: output
for (peptide, output)
in zip(group[peptide_column_name], group[output_column_name])
}
for (allele_name, group)
in binding_df.groupby(allele_column_name)
if len(group) >= min_allele_size
}
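# Illustrative usage sketch; the file name, allele, and peptide below are hypothetical.
#     allele_to_peptide_to_ic50 = load_allele_dicts("bdata.tsv", min_allele_size=25)
#     allele_to_peptide_to_ic50["HLA-A0201"]["SIINFEKLM"]  # raw IC50, since regression_output=False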
def encode_peptide_to_affinity_dict(
peptide_to_affinity_dict,
peptide_length=9,
flatten_binary_encoding=True,
allow_unknown_amino_acids=True):
"""
Given a dictionary mapping from peptide sequences to affinity values, return
both index and binary encodings of fixed length peptides, and
a vector of their affinities.
Parameters
----------
peptide_to_affinity_dict : dict
Keys are peptide strings (of multiple lengths), each mapping to a
continuous affinity value.
peptide_length : int
Length of vector encoding
flatten_binary_encoding : bool
Should the binary encoding of a peptide be two-dimensional (9x20)
or a flattened 1d vector
allow_unknown_amino_acids : bool
When extending a short vector to the desired peptide length, should
we insert every possible amino acid or a designated character "X"
indicating an unknown amino acid.
Returns tuple with the following fields:
- kmer_peptides: fixed length peptide strings
- original_peptides: variable length peptide strings
- counts: how many fixed length peptides were made from this original
- X_index: index encoding of fixed length peptides
- X_binary: binary encoding of fixed length peptides
- Y: affinity values associated with original peptides
"""
raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
X_index, kmer_peptides, original_peptides, counts = \
fixed_length_index_encoding(
peptides=raw_peptides,
desired_length=peptide_length,
start_offset_shorten=0,
end_offset_shorten=0,
start_offset_extend=0,
end_offset_extend=0,
allow_unknown_amino_acids=allow_unknown_amino_acids)
n_samples = len(kmer_peptides)
assert n_samples == len(original_peptides), \
"Mismatch between # of samples (%d) and # of peptides (%d)" % (
n_samples, len(original_peptides))
assert n_samples == len(counts), \
"Mismatch between # of samples (%d) and # of counts (%d)" % (
n_samples, len(counts))
assert n_samples == len(X_index), \
"Mismatch between # of sample (%d) and index feature vectors (%d)" % (
n_samples, len(X_index))
X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
n_indices = 20 + allow_unknown_amino_acids
X_binary = indices_to_hotshot_encoding(
X_index,
n_indices=n_indices)
assert X_binary.shape[0] == X_index.shape[0], \
("Mismatch between number of samples for index encoding (%d)"
" vs. binary encoding (%d)") % (
X_binary.shape[0],
X_index.shape[0])
if flatten_binary_encoding:
# collapse 3D input into 2D matrix
n_binary_features = peptide_length * n_indices
X_binary = X_binary.reshape((n_samples, n_binary_features))
# easier to work with counts when they're an array instead of list
counts = np.array(counts)
Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
assert n_samples == len(Y), \
"Mismatch between # peptides %d and # regression outputs %d" % (
n_samples, len(Y))
return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
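# Illustrative usage sketch (peptides and affinities are made up). With the
# defaults (peptide_length=9, allow_unknown_amino_acids=True,
# flatten_binary_encoding=True), n_indices is 21 and X_binary has shape
# (n_samples, 9 * 21) == (n_samples, 189).
#     kmers, originals, counts, X_index, X_binary, Y = \
#         encode_peptide_to_affinity_dict({"SIINFEKLM": 120.0, "SIINFEKLMQ": 25.0})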
def create_allele_data_from_peptide_to_ic50_dict(
peptide_to_ic50_dict,
@@ -337,8 +119,7 @@ def load_allele_datasets(
peptide_column_name=None,
peptide_length_column_name="peptide_length",
ic50_column_name="meas",
only_human=False,
shuffle=True):
only_human=False):
"""
Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides
and log-transforms the IC50 measurement. Returns dictionary mapping allele
...
@@ -23,7 +23,11 @@ from six import string_types
import pandas as pd
import numpy as np
from .dataset_helpers import prepare_pMHC_affinity_arrays, geometric_mean
from .common import geometric_mean
from .dataset_helpers import (
prepare_pMHC_affinity_arrays,
load_dataframe
)
class Dataset(object):
@@ -250,6 +254,27 @@ class Dataset(object):
return cls.from_allele_to_peptide_to_affinity_dictionary(
{allele_name: peptide_to_affinity_dict})
@classmethod
def from_csv(
cls,
filename,
sep=None,
allele_column_name=None,
peptide_column_name=None,
affinity_column_name=None):
df, allele_column_name, peptide_column_name, affinity_column_name = \
load_dataframe(
filename=filename,
sep=sep,
allele_column_name=allele_column_name,
peptide_column_name=peptide_column_name,
affinity_column_name=affinity_column_name)
df = df.rename(columns={
allele_column_name: "allele",
peptide_column_name: "peptide",
affinity_column_name: "affinity"})
return cls(df)
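# Illustrative usage sketch; "binding_data.csv" is a hypothetical file whose
# allele, peptide, and affinity columns are auto-detected by load_dataframe.
#     dataset = Dataset.from_csv("binding_data.csv")
#     dataset.get_allele("HLA-A0201")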
def get_allele(self, allele_name):
"""
Get Dataset for a single allele
...
@@ -20,20 +20,14 @@ from __future__ import (
from typechecks import require_instance
import numpy as np
from math import exp, log
import pandas as pd
def geometric_mean(xs, weights=None):
"""
Geometric mean of a collection of affinity measurements, with optional
sample weights.
"""
if len(xs) == 1:
return xs[0]
elif weights is None:
return exp(sum(log(xi) for xi in xs) / len(xs))
sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
denom = sum(weights)
return exp(sum_weighted_log / denom)
from .common import normalize_allele_name
from .peptide_encoding import (
indices_to_hotshot_encoding,
fixed_length_index_encoding,
check_valid_index_encoding_array,
)
def check_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights):
"""
@@ -82,3 +76,196 @@ def prepare_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights=N
affinities=affinities,
sample_weights=sample_weights)
return alleles, peptides, affinities, sample_weights
def infer_csv_separator(filename):
"""
Determine if file is separated by comma, tab, or whitespace.
Default to whitespace if the others are not detected.
Returns (sep, delim_whitespace)
"""
for candidate in [",", "\t"]:
with open(filename, "r") as f:
for line in f:
if line.startswith("#"):
continue
if candidate in line:
return candidate, False
return None, True
def load_dataframe(
filename,
sep=None,
allele_column_name=None,
peptide_column_name=None,
affinity_column_name=None,
filter_peptide_length=None,
normalize_allele_names=True):
"""
Load a dataframe of peptide-MHC affinity measurements
filename : str
TSV filename with columns:
- 'species'
- 'mhc'
- 'peptide_length'
- 'sequence'
- 'meas'
sep : str, optional
Separator in CSV file, default is to let Pandas infer
allele_column_name : str, optional
Default behavior is to try {"mhc", "allele", "hla"}
peptide_column_name : str, optional
Default behavior is to try {"sequence", "peptide", "peptide_sequence"}
affinity_column_name : str, optional
Default behavior is to try {"meas", "ic50", "affinity", "aff"}
filter_peptide_length : int, optional
Which length peptides to use (default=load all lengths)
normalize_allele_names : bool
Normalize MHC names or leave them alone
Returns a tuple of:
- DataFrame
- allele column name
- peptide column name
- affinity column name
"""
if sep is None:
sep, delim_whitespace = infer_csv_separator(filename)
else:
delim_whitespace = False
df = pd.read_csv(
filename,
sep=sep,
delim_whitespace=delim_whitespace,
engine="c")
columns = set(df.keys())
if allele_column_name is None:
for candidate in ["mhc", "allele", "hla"]:
if candidate in columns:
allele_column_name = candidate
break
if allele_column_name is None:
raise ValueError(
"Couldn't find alleles, available columns: %s" % (
columns,))
if peptide_column_name is None:
for candidate in ["sequence", "peptide", "peptide_sequence"]:
if candidate in columns:
peptide_column_name = candidate
break
if peptide_column_name is None:
raise ValueError(
"Couldn't find peptides, available columns: %s" % (
columns,))
if affinity_column_name is None:
for candidate in ["meas", "ic50", "affinity"]:
if candidate in columns:
affinity_column_name = candidate
break
if affinity_column_name is None:
raise ValueError(
"Couldn't find affinity values, available columns: %s" % (
columns,))
if filter_peptide_length:
length_mask = df[peptide_column_name].str.len() == filter_peptide_length
df = df[length_mask]
df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
return df, allele_column_name, peptide_column_name, affinity_column_name
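# Illustrative usage sketch; "bdata.tsv" is a hypothetical file. Column names
# are inferred from the candidate lists above when not given explicitly, and
# allele names are normalized via normalize_allele_name.
#     df, allele_col, peptide_col, affinity_col = load_dataframe("bdata.tsv")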
def encode_peptide_to_affinity_dict(
peptide_to_affinity_dict,
peptide_length=9,
flatten_binary_encoding=True,
allow_unknown_amino_acids=True):
"""
Given a dictionary mapping from peptide sequences to affinity values, return
both index and binary encodings of fixed length peptides, and
a vector of their affinities.
Parameters
----------
peptide_to_affinity_dict : dict
Keys are peptide strings (of multiple lengths), each mapping to a
continuous affinity value.
peptide_length : int
Length of vector encoding
flatten_binary_encoding : bool
Should the binary encoding of a peptide be two-dimensional (9x20)
or a flattened 1d vector
allow_unknown_amino_acids : bool
When extending a short vector to the desired peptide length, should
we insert every possible amino acid or a designated character "X"
indicating an unknown amino acid.
Returns tuple with the following fields:
- kmer_peptides: fixed length peptide strings
- original_peptides: variable length peptide strings
- counts: how many fixed length peptides were made from this original
- X_index: index encoding of fixed length peptides
- X_binary: binary encoding of fixed length peptides
- Y: affinity values associated with original peptides
"""
raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
X_index, kmer_peptides, original_peptides, counts = \
fixed_length_index_encoding(
peptides=raw_peptides,
desired_length=peptide_length,
start_offset_shorten=0,
end_offset_shorten=0,
start_offset_extend=0,
end_offset_extend=0,
allow_unknown_amino_acids=allow_unknown_amino_acids)
n_samples = len(kmer_peptides)
assert n_samples == len(original_peptides), \
"Mismatch between # of samples (%d) and # of peptides (%d)" % (
n_samples, len(original_peptides))
assert n_samples == len(counts), \
"Mismatch between # of samples (%d) and # of counts (%d)" % (
n_samples, len(counts))
assert n_samples == len(X_index), \
"Mismatch between # of sample (%d) and index feature vectors (%d)" % (
n_samples, len(X_index))
X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
n_indices = 20 + allow_unknown_amino_acids
X_binary = indices_to_hotshot_encoding(
X_index,
n_indices=n_indices)
assert X_binary.shape[0] == X_index.shape[0], \
("Mismatch between number of samples for index encoding (%d)"
" vs. binary encoding (%d)") % (
X_binary.shape[0],
X_index.shape[0])
if flatten_binary_encoding:
# collapse 3D input into 2D matrix
n_binary_features = peptide_length * n_indices
X_binary = X_binary.reshape((n_samples, n_binary_features))
# easier to work with counts when they're an array instead of list
counts = np.array(counts)
Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
assert n_samples == len(Y), \
"Mismatch between # peptides %d and # regression outputs %d" % (
n_samples, len(Y))
return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)