From 9e3c4c8b3522a3f4a3143bac3d7f39b001622404 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Fri, 6 May 2016 17:00:01 -0400 Subject: [PATCH] moving encoding to dataset --- mhcflurry/common.py | 15 +++ mhcflurry/data.py | 223 +---------------------------------- mhcflurry/dataset.py | 27 ++++- mhcflurry/dataset_helpers.py | 213 +++++++++++++++++++++++++++++++-- 4 files changed, 243 insertions(+), 235 deletions(-) diff --git a/mhcflurry/common.py b/mhcflurry/common.py index 5fd34782..0849f238 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -17,6 +17,7 @@ from __future__ import ( division, absolute_import, ) +from math import exp, log def parse_int_list(s): return [int(part.strip() for part in s.split(","))] @@ -64,3 +65,17 @@ def split_allele_names(s): for part in s.split(",") ] + + +def geometric_mean(xs, weights=None): + """ + Geometric mean of a collection of affinity measurements, with optional + sample weights. + """ + if len(xs) == 1: + return xs[0] + elif weights is None: + return exp(sum(log(xi) for xi in xs)) + sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights)) + denom = sum(weights) + return exp(sum_weighted_log / denom) diff --git a/mhcflurry/data.py b/mhcflurry/data.py index 136449af..4792ce04 100644 --- a/mhcflurry/data.py +++ b/mhcflurry/data.py @@ -22,14 +22,13 @@ from collections import namedtuple, defaultdict import pandas as pd import numpy as np -from .common import normalize_allele_name from .amino_acid import common_amino_acids from .peptide_encoding import ( indices_to_hotshot_encoding, fixed_length_index_encoding, check_valid_index_encoding_array, ) -from .regression_target import MAX_IC50, ic50_to_regression_target + index_encoding = common_amino_acids.index_encoding @@ -50,225 +49,8 @@ AlleleData = namedtuple( ]) -def _infer_csv_separator(filename): - """ - Determine if file is separated by comma, tab, or whitespace. - Default to whitespace if the others are not detected. - - Returns (sep, delim_whitespace) - """ - for candidate in [",", "\t"]: - with open(filename, "r") as f: - for line in f: - if line.startswith("#"): - continue - if candidate in line: - return candidate, False - return None, True - - -def load_dataframe( - filename, - max_ic50=MAX_IC50, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - filter_peptide_length=None, - ic50_column_name="meas", - only_human=False): - """ - Load a dataframe of peptide-MHC affinity measurements - - filename : str - TSV filename with columns: - - 'species' - - 'mhc' - - 'peptide_length' - - 'sequence' - - 'meas' - - max_ic50 : float - Treat IC50 scores above this value as all equally bad - (transform them to 0.0 in the regression output) - - sep : str, optional - Separator in CSV file, default is to let Pandas infer - - peptide_column_name : str, optional - Default behavior is to try {"sequence", "peptide", "peptide_sequence"} - - filter_peptide_length : int, optional - Which length peptides to use (default=load all lengths) - - only_human : bool - Only load entries from human MHC alleles - - Returns DataFrame augmented with extra column: - - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1] - """ - if sep is None: - sep, delim_whitespace = _infer_csv_separator(filename) - else: - delim_whitespace = False - df = pd.read_csv( - filename, - sep=sep, - delim_whitespace=delim_whitespace, - engine="c") - # hack: get around variability of column naming by checking if - # the peptide_column_name is actually present and if not try "peptide" - if peptide_column_name is None: - columns = set(df.keys()) - for candidate in ["sequence", "peptide", "peptide_sequence"]: - if candidate in columns: - peptide_column_name = candidate - break - if peptide_column_name is None: - raise ValueError( - "Couldn't find peptide column name, candidates: %s" % ( - columns)) - if only_human: - human_mask = df[species_column_name] == "human" - df = df[human_mask] - - if filter_peptide_length: - length_mask = df[peptide_column_name].str.len() == filter_peptide_length - df = df[length_mask] - - df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) - ic50 = np.array(df[ic50_column_name]) - df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50) - return df, peptide_column_name - - -def load_allele_dicts( - filename, - max_ic50=MAX_IC50, - regression_output=False, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - ic50_column_name="meas", - only_human=False, - min_allele_size=1): - """ - Parsing CSV of binding data into dictionary of dictionaries. - The outer key is an allele name, the inner key is a peptide sequence, - and the inner value is an IC50 or log-transformed value between [0,1] - """ - binding_df, peptide_column_name = load_dataframe( - filename=filename, - max_ic50=max_ic50, - sep=sep, - species_column_name=species_column_name, - allele_column_name=allele_column_name, - peptide_column_name=peptide_column_name, - ic50_column_name=ic50_column_name, - only_human=only_human) - # map peptides to either the raw IC50 or rescaled log IC50 depending - # on the `regression_output` parameter - output_column_name = ( - "regression_output" - if regression_output - else ic50_column_name - ) - return { - allele_name: { - peptide: output - for (peptide, output) - in zip(group[peptide_column_name], group[output_column_name]) - } - for (allele_name, group) - in binding_df.groupby(allele_column_name) - if len(group) >= min_allele_size - } - -def encode_peptide_to_affinity_dict( - peptide_to_affinity_dict, - peptide_length=9, - flatten_binary_encoding=True, - allow_unknown_amino_acids=True): - """ - Given a dictionary mapping from peptide sequences to affinity values, return - both index and binary encodings of fixed length peptides, and - a vector of their affinities. - Parameters - ---------- - peptide_to_affinity_dict : dict - Keys are peptide strings (of multiple lengths), each mapping to a - continuous affinity value. - - peptide_length : int - Length of vector encoding - - flatten_binary_encoding : bool - Should the binary encoding of a peptide be two-dimensional (9x20) - or a flattened 1d vector - - allow_unknown_amino_acids : bool - When extending a short vector to the desired peptide length, should - we insert every possible amino acid or a designated character "X" - indicating an unknown amino acid. - - Returns tuple with the following fields: - - kmer_peptides: fixed length peptide strings - - original_peptides: variable length peptide strings - - counts: how many fixed length peptides were made from this original - - X_index: index encoding of fixed length peptides - - X_binary: binary encoding of fixed length peptides - - Y: affinity values associated with original peptides - """ - raw_peptides = list(sorted(peptide_to_affinity_dict.keys())) - X_index, kmer_peptides, original_peptides, counts = \ - fixed_length_index_encoding( - peptides=raw_peptides, - desired_length=peptide_length, - start_offset_shorten=0, - end_offset_shorten=0, - start_offset_extend=0, - end_offset_extend=0, - allow_unknown_amino_acids=allow_unknown_amino_acids) - - n_samples = len(kmer_peptides) - - assert n_samples == len(original_peptides), \ - "Mismatch between # of samples (%d) and # of peptides (%d)" % ( - n_samples, len(original_peptides)) - assert n_samples == len(counts), \ - "Mismatch between # of samples (%d) and # of counts (%d)" % ( - n_samples, len(counts)) - assert n_samples == len(X_index), \ - "Mismatch between # of sample (%d) and index feature vectors (%d)" % ( - n_samples, len(X_index)) - X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids) - n_indices = 20 + allow_unknown_amino_acids - X_binary = indices_to_hotshot_encoding( - X_index, - n_indices=n_indices) - - assert X_binary.shape[0] == X_index.shape[0], \ - ("Mismatch between number of samples for index encoding (%d)" - " vs. binary encoding (%d)") % ( - X_binary.shape[0], - X_index.shape[0]) - - if flatten_binary_encoding: - # collapse 3D input into 2D matrix - n_binary_features = peptide_length * n_indices - X_binary = X_binary.reshape((n_samples, n_binary_features)) - - # easier to work with counts when they're an array instead of list - counts = np.array(counts) - - Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides]) - assert n_samples == len(Y), \ - "Mismatch between # peptides %d and # regression outputs %d" % ( - n_samples, len(Y)) - return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) def create_allele_data_from_peptide_to_ic50_dict( peptide_to_ic50_dict, @@ -337,8 +119,7 @@ def load_allele_datasets( peptide_column_name=None, peptide_length_column_name="peptide_length", ic50_column_name="meas", - only_human=False, - shuffle=True): + only_human=False): """ Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides and log-transforms the IC50 measurement. Returns dictionary mapping allele diff --git a/mhcflurry/dataset.py b/mhcflurry/dataset.py index b8f08bfb..9201053f 100644 --- a/mhcflurry/dataset.py +++ b/mhcflurry/dataset.py @@ -23,7 +23,11 @@ from six import string_types import pandas as pd import numpy as np -from .dataset_helpers import prepare_pMHC_affinity_arrays, geometric_mean +from .common import geometric_mean +from .dataset_helpers import ( + prepare_pMHC_affinity_arrays, + load_dataframe +) class Dataset(object): @@ -250,6 +254,27 @@ class Dataset(object): return cls.from_allele_to_peptide_to_affinity_dictionary( {allele_name: peptide_to_affinity_dict}) + @classmethod + def from_csv( + cls, + filename, + sep=None, + allele_column_name=None, + peptide_column_name=None, + affinity_column_name=None): + df, allele_column_name, peptide_column_name, affinity_column_name = \ + load_dataframe( + filename=filename, + sep=sep, + allele_column_name=allele_column_name, + peptide_column_name=peptide_column_name, + affinity_column_name=affinity_column_name) + df = df.rename(columns={ + allele_column_name: "allele", + peptide_column_name: "peptide", + affinity_column_name: "affinity"}) + return cls(df) + def get_allele(self, allele_name): """ Get Dataset for a single allele diff --git a/mhcflurry/dataset_helpers.py b/mhcflurry/dataset_helpers.py index a16a61c0..379b62d5 100644 --- a/mhcflurry/dataset_helpers.py +++ b/mhcflurry/dataset_helpers.py @@ -20,20 +20,14 @@ from __future__ import ( from typechecks import require_instance import numpy as np -from math import exp, log +import pandas as pd -def geometric_mean(xs, weights=None): - """ - Geometric mean of a collection of affinity measurements, with optional - sample weights. - """ - if len(xs) == 1: - return xs[0] - elif weights is None: - return exp(sum(log(xi) for xi in xs)) - sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights)) - denom = sum(weights) - return exp(sum_weighted_log / denom) +from .common import normalize_allele_name +from .peptide_encoding import ( + indices_to_hotshot_encoding, + fixed_length_index_encoding, + check_valid_index_encoding_array, +) def check_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights): """ @@ -82,3 +76,196 @@ def prepare_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights=N affinities=affinities, sample_weights=sample_weights) return alleles, peptides, affinities, sample_weights + + +def infer_csv_separator(filename): + """ + Determine if file is separated by comma, tab, or whitespace. + Default to whitespace if the others are not detected. + + Returns (sep, delim_whitespace) + """ + for candidate in [",", "\t"]: + with open(filename, "r") as f: + for line in f: + if line.startswith("#"): + continue + if candidate in line: + return candidate, False + return None, True + +def load_dataframe( + filename, + sep=None, + allele_column_name=None, + peptide_column_name=None, + affinity_column_name=None, + filter_peptide_length=None, + normalize_allele_names=True): + """ + Load a dataframe of peptide-MHC affinity measurements + + filename : str + TSV filename with columns: + - 'species' + - 'mhc' + - 'peptide_length' + - 'sequence' + - 'meas' + + sep : str, optional + Separator in CSV file, default is to let Pandas infer + + allele_column_name : str, optional + Default behavior is to try {"mhc", "allele", "hla"} + + peptide_column_name : str, optional + Default behavior is to try {"sequence", "peptide", "peptide_sequence"} + + affinity_column_name : str, optional + Default behavior is to try {"meas", "ic50", "affinity", "aff"} + + filter_peptide_length : int, optional + Which length peptides to use (default=load all lengths) + + normalize_allele_names : bool + Normalize MHC names or leave them alone + + Returns: + - DataFrame + - peptide column name + - allele column name + - affinity column name + """ + if sep is None: + sep, delim_whitespace = infer_csv_separator(filename) + else: + delim_whitespace = False + + df = pd.read_csv( + filename, + sep=sep, + delim_whitespace=delim_whitespace, + engine="c") + + columns = set(df.keys()) + + if allele_column_name is None: + for candidate in ["mhc", "allele", "hla"]: + if candidate in columns: + allele_column_name = candidate + break + if allele_column_name is None: + raise ValueError( + "Couldn't find alleles, available columns: %s" % ( + columns,)) + + if peptide_column_name is None: + for candidate in ["sequence", "peptide", "peptide_sequence"]: + if candidate in columns: + peptide_column_name = candidate + break + if peptide_column_name is None: + raise ValueError( + "Couldn't find peptides, available columns: %s" % ( + columns,)) + + if affinity_column_name is None: + for candidate in ["meas", "ic50", "affinity"]: + if candidate in columns: + affinity_column_name = candidate + break + if affinity_column_name is None: + raise ValueError( + "Couldn't find affinity values, available columns: %s" % ( + columns,)) + if filter_peptide_length: + length_mask = df[peptide_column_name].str.len() == filter_peptide_length + df = df[length_mask] + df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) + return df, allele_column_name, peptide_column_name, affinity_column_name + + +def encode_peptide_to_affinity_dict( + peptide_to_affinity_dict, + peptide_length=9, + flatten_binary_encoding=True, + allow_unknown_amino_acids=True): + """ + Given a dictionary mapping from peptide sequences to affinity values, return + both index and binary encodings of fixed length peptides, and + a vector of their affinities. + + Parameters + ---------- + peptide_to_affinity_dict : dict + Keys are peptide strings (of multiple lengths), each mapping to a + continuous affinity value. + + peptide_length : int + Length of vector encoding + + flatten_binary_encoding : bool + Should the binary encoding of a peptide be two-dimensional (9x20) + or a flattened 1d vector + + allow_unknown_amino_acids : bool + When extending a short vector to the desired peptide length, should + we insert every possible amino acid or a designated character "X" + indicating an unknown amino acid. + + Returns tuple with the following fields: + - kmer_peptides: fixed length peptide strings + - original_peptides: variable length peptide strings + - counts: how many fixed length peptides were made from this original + - X_index: index encoding of fixed length peptides + - X_binary: binary encoding of fixed length peptides + - Y: affinity values associated with original peptides + """ + raw_peptides = list(sorted(peptide_to_affinity_dict.keys())) + X_index, kmer_peptides, original_peptides, counts = \ + fixed_length_index_encoding( + peptides=raw_peptides, + desired_length=peptide_length, + start_offset_shorten=0, + end_offset_shorten=0, + start_offset_extend=0, + end_offset_extend=0, + allow_unknown_amino_acids=allow_unknown_amino_acids) + + n_samples = len(kmer_peptides) + + assert n_samples == len(original_peptides), \ + "Mismatch between # of samples (%d) and # of peptides (%d)" % ( + n_samples, len(original_peptides)) + assert n_samples == len(counts), \ + "Mismatch between # of samples (%d) and # of counts (%d)" % ( + n_samples, len(counts)) + assert n_samples == len(X_index), \ + "Mismatch between # of sample (%d) and index feature vectors (%d)" % ( + n_samples, len(X_index)) + X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids) + n_indices = 20 + allow_unknown_amino_acids + X_binary = indices_to_hotshot_encoding( + X_index, + n_indices=n_indices) + + assert X_binary.shape[0] == X_index.shape[0], \ + ("Mismatch between number of samples for index encoding (%d)" + " vs. binary encoding (%d)") % ( + X_binary.shape[0], + X_index.shape[0]) + + if flatten_binary_encoding: + # collapse 3D input into 2D matrix + n_binary_features = peptide_length * n_indices + X_binary = X_binary.reshape((n_samples, n_binary_features)) + + # easier to work with counts when they're an array instead of list + counts = np.array(counts) + + Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides]) + assert n_samples == len(Y), \ + "Mismatch between # peptides %d and # regression outputs %d" % ( + n_samples, len(Y)) + return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) -- GitLab