From ec233b8caca3c053d6b8372861b1348c90d9cc0d Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Fri, 6 May 2016 19:10:56 -0400 Subject: [PATCH] changing imputation logic to use Dataset --- ...llele_specific_kmer_ic50_predictor_base.py | 13 +- mhcflurry/data.py | 231 ------------------ mhcflurry/dataset.py | 42 ++++ mhcflurry/imputation.py | 50 +--- mhcflurry/peptide_encoding.py | 42 ++-- test/test_class1_binding_predictor_A0205.py | 1 - test/test_load_dataframe.py | 73 ++---- test/test_peptide_encoding.py | 8 +- 8 files changed, 105 insertions(+), 355 deletions(-) delete mode 100644 mhcflurry/data.py diff --git a/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py b/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py index ee244da1..e8c6ff00 100644 --- a/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py +++ b/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py @@ -158,8 +158,17 @@ class Class1AlleleSpecificKmerIC50PredictorBase(IC50PredictorBase): Extra arguments are passed on to the fit_encoded_kmer_arrays() method. """ - X, Y, sample_weights = dataset.encode() - X_pretrain, Y_pretrain, sample_weights_pretrain = pretraining_dataset.encode() + X, Y, sample_weights, _ = dataset.kmer_index_encoding( + kmer_size=self.kmer_size, + allow_unknown_amino_acids=self.allow_unknown_amino_acids) + + if pretraining_dataset is None: + X_pretrain = Y_pretrain = sample_weights_pretrain = None + else: + X_pretrain, Y_pretrain, sample_weights_pretrain = \ + pretraining_dataset.kmer_index_encoding( + kmer_size=self.kmer_size, + allow_unknown_amino_acids=self.allow_unknown_amino_acids) return self.fit_arrays( X=X, Y=Y, diff --git a/mhcflurry/data.py b/mhcflurry/data.py deleted file mode 100644 index 4792ce04..00000000 --- a/mhcflurry/data.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import ( - print_function, - division, - absolute_import, -) -from collections import namedtuple, defaultdict - -import pandas as pd -import numpy as np - -from .amino_acid import common_amino_acids -from .peptide_encoding import ( - indices_to_hotshot_encoding, - fixed_length_index_encoding, - check_valid_index_encoding_array, -) - - -index_encoding = common_amino_acids.index_encoding - -AlleleData = namedtuple( - "AlleleData", - [ - "X_index", # index-based featue encoding of fixed length peptides - "X_binary", # binary encoding of fixed length peptides - "Y", # regression encoding of IC50 (log scaled between 0..1) - "peptides", # list of fixed length peptide string - "ic50", # IC50 value associated with each entry - "original_peptides", # original peptides may be of different lengths - "original_lengths", # len(original_peptide) - "substring_counts", # how many substrings were extracted from - # each original peptide string - "weights", # 1.0 / count - "max_ic50", # maximum IC50 value used for encoding Y - ]) - - - - - -def create_allele_data_from_peptide_to_ic50_dict( - peptide_to_ic50_dict, - max_ic50=MAX_IC50, - kmer_length=9, - flatten_binary_encoding=True): - """ - Parameters - ---------- - peptide_to_ic50_dict : dict - Dictionary mapping peptides of different lengths to IC50 binding - affinity values. - - max_ic50 : float - Maximum IC50 value used as the cutoff for affinity of 0.0 when - transforming from IC50 to regression targets. - - kmer_length : int - What length substrings will be fed to a fixed-length predictor? - - flatten_binary_encoding : bool - Should hotshot encodings of amino acid inputs be flattened into a 1D - vector or have two dimensions (where the first represents position)? - - Return an AlleleData object. - """ - Y_dict = { - peptide: ic50_to_regression_target(ic50, max_ic50) - for (peptide, ic50) - in peptide_to_ic50_dict.items() - } - (kmer_peptides, original_peptides, counts, X_index, X_binary, Y_kmer) = \ - encode_peptide_to_affinity_dict( - Y_dict, - peptide_length=kmer_length, - flatten_binary_encoding=flatten_binary_encoding) - - ic50_array = np.array([peptide_to_ic50_dict[p] for p in original_peptides]) - assert len(kmer_peptides) == len(ic50_array), \ - "Mismatch between # of peptides %d and # IC50 outputs %d" % ( - len(kmer_peptides), len(ic50_array)) - - return AlleleData( - X_index=X_index, - X_binary=X_binary, - Y=Y_kmer, - ic50=ic50_array, - peptides=np.array(kmer_peptides), - original_peptides=np.array(original_peptides), - original_lengths=np.array( - [len(peptide) for peptide in original_peptides]), - substring_counts=counts, - weights=1.0 / counts, - max_ic50=max_ic50) - - -def load_allele_datasets( - filename, - peptide_length=9, - use_multiple_peptide_lengths=True, - max_ic50=MAX_IC50, - flatten_binary_encoding=True, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - peptide_length_column_name="peptide_length", - ic50_column_name="meas", - only_human=False): - """ - Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides - and log-transforms the IC50 measurement. Returns dictionary mapping allele - names to AlleleData objects (containing fields X, Y, ic50) - - Parameters - ---------- - filename : str - TSV filename with columns: - - 'species' - - 'mhc' - - 'peptide_length' - - 'sequence' - - 'meas' - - peptide_length : int - Which length peptides to use (default=9) - - use_multiple_peptide_lengths : bool - If a peptide is shorter than `peptide_length`, expand it into many - peptides of the appropriate length by inserting all combinations of - amino acids. Similarly, if a peptide is longer than `peptide_length`, - shorten it by deleting stretches of contiguous amino acids at all - peptide positions. - - max_ic50 : float - Treat IC50 scores above this value as all equally bad - (transform them to 0.0 in the rescaled output) - - flatten_binary_encoding : bool - If False, returns a (n_samples, peptide_length, 20) matrix, otherwise - returns the 2D flattened version of the same data. - - sep : str, optional - Separator in CSV file, default is to let Pandas infer - - peptide_column_name : str, optional - Default behavior is to try {"sequence", "peptide", "peptide_sequence"} - - only_human : bool - Only load entries from human MHC alleles - """ - df, peptide_column_name = load_dataframe( - filename=filename, - max_ic50=max_ic50, - sep=sep, - species_column_name=species_column_name, - allele_column_name=allele_column_name, - peptide_column_name=peptide_column_name, - filter_peptide_length=None if use_multiple_peptide_lengths else peptide_length, - ic50_column_name=ic50_column_name, - only_human=only_human) - - allele_groups = {} - for allele, group in df.groupby(allele_column_name): - assert allele not in allele_groups, \ - "Duplicate datasets for %s" % allele - - raw_peptides = group[peptide_column_name] - - # filter lengths in case user wants to drop peptides that are longer - # or shorter than the desired fixed length - if not use_multiple_peptide_lengths: - drop_mask = raw_peptides.str.len() != peptide_length - group = group[~drop_mask] - raw_peptides = raw_peptides[~drop_mask] - - # convert from a Pandas column to a list, since that's easier to - # interact with later - raw_peptides = list(raw_peptides) - - # create dictionaries of outputs from which we can look up values - # after peptides have been expanded - ic50_dict = { - peptide: ic50 - for (peptide, ic50) - in zip(raw_peptides, group[ic50_column_name]) - } - allele_groups[allele] = create_allele_data_from_peptide_to_ic50_dict( - ic50_dict, - max_ic50=max_ic50) - return allele_groups - - -def collapse_multiple_peptide_entries(allele_datasets): - """ - If an (allele, peptide) pair occurs multiple times then reduce it - to a single entry by taking the weighted average of affinity values. - - Returns a dictionary of alleles, each of which maps to a dictionary of - peptides that map to affinity values. - """ - allele_to_peptide_to_affinity = {} - for (allele, dataset) in allele_datasets.items(): - multiple_affinities = defaultdict(list) - for (peptide, normalized_affinity, weight) in zip( - dataset.peptides, dataset.Y, dataset.weights): - multiple_affinities[peptide].append((normalized_affinity, weight)) - weighted_averages = {} - for peptide, affinity_weight_tuples in multiple_affinities.items(): - denom = 0.0 - sum_weighted_affinities = 0.0 - for affinity, weight in affinity_weight_tuples: - sum_weighted_affinities += affinity * weight - denom += weight - if denom > 0: - weighted_averages[peptide] = sum_weighted_affinities / denom - allele_to_peptide_to_affinity[allele] = weighted_averages - return allele_to_peptide_to_affinity diff --git a/mhcflurry/dataset.py b/mhcflurry/dataset.py index 9201053f..cfc8b611 100644 --- a/mhcflurry/dataset.py +++ b/mhcflurry/dataset.py @@ -28,6 +28,7 @@ from .dataset_helpers import ( prepare_pMHC_affinity_arrays, load_dataframe ) +from .peptide_encoding import fixed_length_index_encoding class Dataset(object): @@ -342,6 +343,47 @@ class Dataset(object): df = pd.DataFrame(new_data_dict) return self.__class__(df) + def kmer_index_encoding( + self, + kmer_size=9, + allow_unknown_amino_acids=True): + """ + Encode peptides in this dataset using a fixed-length vector + representation. + + Parameters + ---------- + kmer_size : int + Length of encoding for each peptide + + allow_unknown_amino_acids : bool + If True, then extend shorter amino acids using "X" character, + otherwise fill in all possible combinations of real amino acids. + + Returns: + - 2d array of encoded kmers + - 1d array of affinity value corresponding to the source + peptide for each kmer + - sample_weights (1 / kmer count per peptide) + - indices of original peptides from which kmers were extracted + """ + X_index, _, original_peptide_indices, counts = \ + fixed_length_index_encoding( + peptides=self.peptides, + desired_length=kmer_size, + start_offset_shorten=0, + end_offset_shorten=0, + start_offset_extend=0, + end_offset_extend=0, + allow_unknown_amino_acids=allow_unknown_amino_acids) + original_peptide_indices = np.asarray(original_peptide_indices) + counts = np.asarray(counts) + assert len(original_peptide_indices) == len(self.affinities) + assert len(counts) == len(self.affinities) + kmer_affinities = self.affinities[original_peptide_indices] + sample_weights = 1.0 / counts + return X_index, kmer_affinities, sample_weights, original_peptide_indices + def imputed_missing_values( self, imputation_method, diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py index 018356f3..2f809e2f 100644 --- a/mhcflurry/imputation.py +++ b/mhcflurry/imputation.py @@ -28,10 +28,7 @@ from fancyimpute.soft_impute import SoftImpute from fancyimpute.mice import MICE from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary -from .data import ( - create_allele_data_from_peptide_to_ic50_dict, -) -from .regression_target import regression_target_to_ic50 +from .dataset import Dataset def _check_dense_pMHC_array(X, peptide_list, allele_list): if len(peptide_list) != len(set(peptide_list)): @@ -164,15 +161,14 @@ def dense_pMHC_matrix_to_nested_dict(X, peptide_list, allele_list): return allele_to_peptide_to_ic50_dict def create_imputed_datasets( - allele_data_dict, + dataset, imputer, - min_observations_per_peptide=1, - min_observations_per_allele=1): + min_observations_per_peptide=2, + min_observations_per_allele=2): """ Parameters ---------- - allele_data_dict : dict - Dictionary mapping allele names to AlleleData objects + allele_data_dict : Dataset imputer : object Expected to have a method imputer.complete(X) which takes an array @@ -187,51 +183,27 @@ def create_imputed_datasets( Returns dictionary mapping allele names to AlleleData objects containing imputed affinity values. """ - if len(allele_data_dict) == 0: - return {} - - # pick the max IC50 associated with a random AlleleData object and then - # make sure that they all agree - allele_data_objects = list(allele_data_dict.values()) - max_ic50 = allele_data_objects[0].max_ic50 - if not all(allele_data.max_ic50 == max_ic50 for allele_data in allele_data_objects): - raise ValueError("AlleleData objects must all use the same max IC50 (%f)" % ( - max_ic50,)) X_incomplete, peptide_list, allele_list = create_incomplete_dense_pMHC_matrix( allele_data_dict=allele_data_dict, min_observations_per_peptide=min_observations_per_peptide, min_observations_per_allele=min_observations_per_allele) + X_incomplete_log = np.log(X_incomplete) + if np.isnan(X_incomplete).sum() == 0: # if all entries in the matrix are already filled in then don't # try using an imputation algorithm since it might raise an # exception. logging.warn("No missing values, using original data instead of imputation") - X_complete = X_incomplete + X_complete_log = X_incomplete_log else: - X_complete = imputer.complete(X_incomplete) - + X_complete_log = imputer.complete(X_incomplete_log) + X_complete = np.exp(X_complete_log) allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict( X=X_complete, peptide_list=peptide_list, allele_list=allele_list) - - # map all the normalized regression targets to IC50 values - allele_to_peptide_to_ic50_dict = { - allele: { - peptide: regression_target_to_ic50(y, max_ic50=max_ic50) - for (peptide, y) - in allele_dict.items() - } - for (allele, allele_dict) - in allele_to_peptide_to_affinity_dict.items() - } - - return { - allele_name: create_allele_data_from_peptide_to_ic50_dict(allele_dict) - for (allele_name, allele_dict) - in allele_to_peptide_to_ic50_dict.items() - } + return Dataset.from_nested_dictionary(allele_to_peptide_to_affinity_dict) def imputer_from_name(imputation_method_name, **kwargs): """ diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py index cad5f53c..49698814 100644 --- a/mhcflurry/peptide_encoding.py +++ b/mhcflurry/peptide_encoding.py @@ -128,7 +128,6 @@ def shorten_peptide( for i in range(start_offset, end_range) ] - def fixed_length_from_many_peptides( peptides, desired_length, @@ -141,7 +140,6 @@ def fixed_length_from_many_peptides( Create a set of fixed-length k-mer peptides from a collection of varying length peptides. - Shorter peptides are filled in using all possible amino acids at any insertion site between start_offset_shorten and -end_offset_shorten where start_offset_extend=0 represents insertions before the string @@ -167,10 +165,10 @@ def fixed_length_from_many_peptides( Returns three lists: - a list of fixed length peptides (all of length `desired_length`) - - a list of the original peptides from which subsequences were - contracted or lengthened - - a list of integers indicating the number of fixed length peptides - generated from each variable length peptide. + - a list of indices of the original peptides from which subsequences + were contracted or lengthened + - a list of counts for each fixed length peptide indicating the + number extracted from its corresponding shorter/longer peptide Example: kmers, original, counts = fixed_length_from_many_peptides( @@ -187,7 +185,7 @@ def fixed_length_from_many_peptides( Parameters ---------- - peptide : list of str + peptides : list of str desired_length : int @@ -202,9 +200,9 @@ def fixed_length_from_many_peptides( insert_amino_acid_letters : str | list of characters """ all_fixed_length_peptides = [] - original_peptide_sequences = [] + indices = [] counts = [] - for peptide in peptides: + for i, peptide in enumerate(peptides): n = len(peptide) if n == desired_length: fixed_length_peptides = [peptide] @@ -228,12 +226,11 @@ def fixed_length_from_many_peptides( start_offset=start_offset_shorten, end_offset=end_offset_shorten, insert_amino_acid_letters=insert_amino_acid_letters) - n_fixed_length = len(fixed_length_peptides) all_fixed_length_peptides.extend(fixed_length_peptides) - original_peptide_sequences.extend([peptide] * n_fixed_length) + indices.extend([i] * n_fixed_length) counts.extend([n_fixed_length] * n_fixed_length) - return all_fixed_length_peptides, original_peptide_sequences, counts + return all_fixed_length_peptides, indices, counts def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): @@ -296,16 +293,17 @@ def fixed_length_index_encoding( insert_letters = common_amino_acid_letters index_encoding = common_amino_acids.index_encoding - fixed_length, original, counts = fixed_length_from_many_peptides( - peptides=peptides, - desired_length=desired_length, - start_offset_shorten=start_offset_shorten, - end_offset_shorten=end_offset_shorten, - start_offset_extend=start_offset_extend, - end_offset_extend=end_offset_extend, - insert_amino_acid_letters=insert_letters) - X_index = index_encoding(fixed_length, desired_length) - return (X_index, fixed_length, original, counts) + fixed_length, original_peptide_indices, counts = \ + fixed_length_from_many_peptides( + peptides=peptides, + desired_length=desired_length, + start_offset_shorten=start_offset_shorten, + end_offset_shorten=end_offset_shorten, + start_offset_extend=start_offset_extend, + end_offset_extend=end_offset_extend, + insert_amino_acid_letters=insert_letters) + X = index_encoding(fixed_length, desired_length) + return (X, fixed_length, original_peptide_indices, counts) def check_valid_index_encoding_array(X, allow_unknown_amino_acids=True): X = np.asarray(X) diff --git a/test/test_class1_binding_predictor_A0205.py b/test/test_class1_binding_predictor_A0205.py index 0383c40c..819bbd1c 100644 --- a/test/test_class1_binding_predictor_A0205.py +++ b/test/test_class1_binding_predictor_A0205.py @@ -7,7 +7,6 @@ import numpy as np def class1_binding_predictor_A0205_training_accuracy(): - dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH) dataset_a0205 = dataset.get_allele("HLA-A0205") diff --git a/test/test_load_dataframe.py b/test/test_load_dataframe.py index 64972e9f..ab19a94c 100644 --- a/test/test_load_dataframe.py +++ b/test/test_load_dataframe.py @@ -18,24 +18,20 @@ Testing dataframe loading from CSV: load_dataframe Input: filename, - peptide_length=None, - max_ic50=MAX_IC50, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - peptide_length_column_name="peptide_length", - ic50_column_name="meas", - only_human=True Outputs: - Tuple with dataframe and name of peptide column + Tuple with following elements: + - dataframe + - name of allele column + - name of peptide column + - name of affinity column """ -from mhcflurry.data import load_dataframe +from mhcflurry.dataset_helpers import load_dataframe import numpy as np from tempfile import NamedTemporaryFile from pandas import DataFrame +from nose.tools import eq_ def make_dummy_dataframe(): dummy_ic50_values = np.array([ @@ -62,48 +58,13 @@ def test_load_dataframe(): binding_data_path = f.name df_expected.to_csv(binding_data_path, index=False) - for max_ic50 in [500.0, 50000.0]: - df, peptide_column_name = load_dataframe( - filename=binding_data_path, - max_ic50=max_ic50, - only_human=False) - assert peptide_column_name == "sequence" - assert len(df) == n_expected, \ - "Expected %d entries in DataFrame but got %d in %s" % ( - n_expected, - len(df), - df) - assert "regression_output" in df, df.columns - expected_values = np.minimum( - 0, - np.log(df["meas"]) / np.log(max_ic50) - ) - np.allclose(df["regression_output"], expected_values) - -def test_load_dataframe_human(): - df_expected = make_dummy_dataframe() - human_mask = df_expected["species"] == "human" - df_expected = df_expected[human_mask] - n_expected = len(df_expected) - - with NamedTemporaryFile(mode="w") as f: - binding_data_path = f.name - df_expected.to_csv(binding_data_path, index=False) - - for max_ic50 in [500.0, 50000.0]: - df, peptide_column_name = load_dataframe( - filename=binding_data_path, - max_ic50=max_ic50, - only_human=True) - assert peptide_column_name == "sequence" - assert len(df) == n_expected, \ - "Expected %d entries in DataFrame but got %d in %s" % ( - n_expected, - len(df), - df) - assert "regression_output" in df, df.columns - expected_values = np.minimum( - 0, - np.log(df["meas"]) / np.log(max_ic50) - ) - np.allclose(df["regression_output"], expected_values) + df, allele_column_name, peptide_column_name, affinity_column_name = \ + load_dataframe(filename=binding_data_path) + eq_(allele_column_name, "mhc") + eq_(peptide_column_name, "sequence") + eq_(affinity_column_name, "meas") + assert len(df) == n_expected, \ + "Expected %d entries in DataFrame but got %d in %s" % ( + n_expected, + len(df), + df) diff --git a/test/test_peptide_encoding.py b/test/test_peptide_encoding.py index aa6ff804..e1e31e51 100644 --- a/test/test_peptide_encoding.py +++ b/test/test_peptide_encoding.py @@ -105,7 +105,7 @@ def test_shorten_peptide_all_positions_except_last(): def test_fixed_length_from_many_peptides(): - kmers, original, counts = fixed_length_from_many_peptides( + kmers, original_indices, counts = fixed_length_from_many_peptides( peptides=["ABC", "A"], desired_length=2, start_offset_extend=0, @@ -114,10 +114,10 @@ def test_fixed_length_from_many_peptides(): end_offset_shorten=0, insert_amino_acid_letters="ABC") print(kmers) - print(original) + print(original_indices) print(counts) - eq_(len(kmers), len(original)) + eq_(len(kmers), len(original_indices)) eq_(len(kmers), len(counts)) eq_(kmers, ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]) - eq_(original, ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]) + eq_(original_indices, [0] * 3 + [1] * 6) eq_(counts, [3, 3, 3, 6, 6, 6, 6, 6, 6]) -- GitLab