From 294f70eee670e8df56792b584585f398ef8ef1fd Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Mon, 9 May 2016 18:19:59 -0400 Subject: [PATCH] moved main impute method onto Dataset --- mhcflurry/dataset.py | 51 +++++++++-- mhcflurry/imputation.py | 195 ---------------------------------------- test/test_imputation.py | 36 ++++---- 3 files changed, 63 insertions(+), 219 deletions(-) delete mode 100644 mhcflurry/imputation.py diff --git a/mhcflurry/dataset.py b/mhcflurry/dataset.py index b6a42d28..4aa0a823 100644 --- a/mhcflurry/dataset.py +++ b/mhcflurry/dataset.py @@ -18,8 +18,10 @@ from __future__ import ( absolute_import, ) from collections import defaultdict, OrderedDict -from six import string_types +import logging + +from six import string_types import pandas as pd import numpy as np @@ -29,6 +31,12 @@ from .dataset_helpers import ( load_dataframe ) from .peptide_encoding import fixed_length_index_encoding +from .imputation_helpers import ( + check_dense_pMHC_array, + prune_dense_matrix_and_labels, + dense_pMHC_matrix_to_nested_dict, +) + class Dataset(object): @@ -248,7 +256,6 @@ class Dataset(object): "Wrong length for column '%s', expected %d but got %d" % ( column_name, column)) df[column_name] = np.asarray(column) - print(df) return cls(df) @classmethod @@ -468,19 +475,18 @@ class Dataset(object): shape = (n_peptides, n_alleles) X = np.ones(shape, dtype=float) for (allele, allele_dict) in allele_to_peptide_to_affinity_dict.items(): - print(allele, allele_dict) column_index = allele_order[allele] for (peptide, affinity) in allele_dict.items(): - print(peptide, affinity) row_index = peptide_order[peptide] X[row_index, column_index] = affinity return X, peptides_list, alleles_list - def imputed_missing_values( + def impute_missing_values( self, imputation_method, log_transform=True, - synthetic_sample_weight=1.0): + min_observations_per_peptide=2, + min_observations_per_allele=3): """ Synthesize new measurements for missing pMHC pairs using the given imputation_method. @@ -496,10 +502,37 @@ class Dataset(object): Transform affinities with to log10 values before imputation (and then transform back afterward). - synthetic_sample_weight : float - Default weight to give newly synthesized samples. + min_observations_per_peptide : int + Drop peptide rows with fewer than this number of observed values. + + min_observations_per_allele : int + Drop allele columns with fewer than this number of observed values. Returns Dataset with original pMHC affinities and additional synthetic samples. """ - pass + X_incomplete, peptide_list, allele_list = self.to_dense_pMHC_affinity_matrix() + + check_dense_pMHC_array(X_incomplete, peptide_list, allele_list) + + # drop alleles and peptides with small amounts of data + X_incomplete, peptide_list, allele_list = prune_dense_matrix_and_labels( + X_incomplete, peptide_list, allele_list, + min_observations_per_peptide=min_observations_per_peptide, + min_observations_per_allele=min_observations_per_allele) + X_incomplete_log = np.log(X_incomplete) + + if np.isnan(X_incomplete).sum() == 0: + # if all entries in the matrix are already filled in then don't + # try using an imputation algorithm since it might raise an + # exception. + logging.warn("No missing values, using original data instead of imputation") + X_complete_log = X_incomplete_log + else: + X_complete_log = imputation_method.complete(X_incomplete_log) + X_complete = np.exp(X_complete_log) + allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict( + X=X_complete, + peptide_list=peptide_list, + allele_list=allele_list) + return self.from_nested_dictionary(allele_to_peptide_to_affinity_dict) diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py deleted file mode 100644 index c9bf9c6e..00000000 --- a/mhcflurry/imputation.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import ( - print_function, - division, - absolute_import, -) -from collections import defaultdict -import logging - -import numpy as np -from fancyimpute.knn import KNN -from fancyimpute.iterative_svd import IterativeSVD -from fancyimpute.simple_fill import SimpleFill -from fancyimpute.soft_impute import SoftImpute -from fancyimpute.mice import MICE - -from .dataset import Dataset - -def _check_dense_pMHC_array(X, peptide_list, allele_list): - if len(peptide_list) != len(set(peptide_list)): - raise ValueError("Duplicate peptides detected in peptide list") - if len(allele_list) != len(set(allele_list)): - raise ValueError("Duplicate alleles detected in allele list") - n_rows, n_cols = X.shape - if n_rows != len(peptide_list): - raise ValueError( - "Expected dense array with shape %s to have %d rows" % ( - X.shape, len(peptide_list))) - if n_cols != len(allele_list): - raise ValueError( - "Expected dense array with shape %s to have %d columns" % ( - X.shape, len(allele_list))) - -def prune_dense_matrix_and_labels( - X, - peptide_list, - allele_list, - min_observations_per_peptide=1, - min_observations_per_allele=1): - """ - Filter the dense matrix of pMHC binding affinities according to - the given minimum number of row/column observations. - - Parameters - ---------- - X : numpy.ndarray - Incomplete dense matrix of pMHC affinity with n_peptides rows and - n_alleles columns. - - peptide_list : list of str - Expected to have n_peptides entries - - allele_list : list of str - Expected to have n_alleles entries - - min_observations_per_peptide : int - Drop peptide rows with fewer than this number of observed values. - - min_observations_per_allele : int - Drop allele columns with fewer than this number of observed values. - """ - observed_mask = np.isfinite(X) - n_observed_per_peptide = observed_mask.sum(axis=1) - too_few_peptide_observations = ( - n_observed_per_peptide < min_observations_per_peptide) - if too_few_peptide_observations.any(): - drop_peptide_indices = np.where(too_few_peptide_observations)[0] - keep_peptide_indices = np.where(~too_few_peptide_observations)[0] - print("Dropping %d peptides with <%d observations" % ( - len(drop_peptide_indices), - min_observations_per_peptide)) - X = X[keep_peptide_indices] - observed_mask = observed_mask[keep_peptide_indices] - peptide_list = [peptide_list[i] for i in keep_peptide_indices] - - n_observed_per_allele = observed_mask.sum(axis=0) - too_few_allele_observations = ( - n_observed_per_allele < min_observations_per_peptide) - if too_few_peptide_observations.any(): - drop_allele_indices = np.where(too_few_allele_observations)[0] - keep_allele_indices = np.where(~too_few_allele_observations)[0] - print("Dropping %d alleles with <%d observations: %s" % ( - len(drop_allele_indices), - min_observations_per_allele, - [allele_list[i] for i in drop_allele_indices])) - X = X[:, keep_allele_indices] - observed_mask = observed_mask[:, keep_allele_indices] - allele_list = [allele_list[i] for i in keep_allele_indices] - _check_dense_pMHC_array(X, peptide_list, allele_list) - return X, peptide_list, allele_list - -def dense_pMHC_matrix_to_nested_dict(X, peptide_list, allele_list): - """ - Converts a dense matrix of (n_peptides, n_alleles) floats to a nested - dictionary from allele -> peptide -> affinity. - """ - allele_to_peptide_to_ic50_dict = defaultdict(dict) - for row_index, peptide in enumerate(peptide_list): - for column_index, allele_name in enumerate(allele_list): - affinity = X[row_index, column_index] - if np.isfinite(affinity): - allele_to_peptide_to_ic50_dict[allele_name][peptide] = affinity - return allele_to_peptide_to_ic50_dict - -def create_imputed_datasets( - dataset, - imputer, - min_observations_per_peptide=2, - min_observations_per_allele=2): - """ - Parameters - ---------- - allele_data_dict : Dataset - - imputer : object - Expected to have a method imputer.complete(X) which takes an array - with missing entries marked by NaN and returns a completed array. - - min_observations_per_peptide : int - Drop peptide rows with fewer than this number of observed values. - - min_observations_per_allele : int - Drop allele columns with fewer than this number of observed values. - - Returns dictionary mapping allele names to AlleleData objects containing - imputed affinity values. - """ - assert isinstance(dataset, Dataset), "Expected Dataset, got %s" % (type(dataset),) - X_incomplete, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix() - - _check_dense_pMHC_array(X_incomplete, peptide_list, allele_list) - - # drop alleles and peptides with small amounts of data - X_incomplete, peptide_list, allele_list = prune_dense_matrix_and_labels( - X_incomplete, peptide_list, allele_list, - min_observations_per_peptide=min_observations_per_peptide, - min_observations_per_allele=min_observations_per_allele) - X_incomplete_log = np.log(X_incomplete) - - if np.isnan(X_incomplete).sum() == 0: - # if all entries in the matrix are already filled in then don't - # try using an imputation algorithm since it might raise an - # exception. - logging.warn("No missing values, using original data instead of imputation") - X_complete_log = X_incomplete_log - else: - X_complete_log = imputer.complete(X_incomplete_log) - X_complete = np.exp(X_complete_log) - allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict( - X=X_complete, - peptide_list=peptide_list, - allele_list=allele_list) - return Dataset.from_nested_dictionary(allele_to_peptide_to_affinity_dict) - -def imputer_from_name(imputation_method_name, **kwargs): - """ - Helper function for constructing an imputation object from a name given - typically from a commandline argument. - """ - imputation_method_name = imputation_method_name.strip().lower() - if imputation_method_name == "mice": - kwargs["n_burn_in"] = kwargs.get("n_burn_in", 5) - kwargs["n_imputations"] = kwargs.get("n_imputations", 25) - kwargs["min_value"] = kwargs.get("min_value", 0) - kwargs["max_value"] = kwargs.get("max_value", 1) - return MICE(**kwargs) - elif imputation_method_name == "knn": - kwargs["k"] = kwargs.get("k", 3) - kwargs["orientation"] = kwargs.get("orientation", "columns") - kwargs["print_interval"] = kwargs.get("print_interval", 10) - return KNN(**kwargs) - elif imputation_method_name == "svd": - kwargs["rank"] = kwargs.get("rank", 10) - return IterativeSVD(**kwargs) - elif imputation_method_name == "svt" or imputation_method_name == "softimpute": - return SoftImpute(**kwargs) - elif imputation_method_name == "mean": - return SimpleFill("mean", **kwargs) - elif imputation_method_name == "none": - return None - else: - raise ValueError("Invalid imputation method: %s" % imputation_method_name) diff --git a/test/test_imputation.py b/test/test_imputation.py index 308fd5f8..60d82405 100644 --- a/test/test_imputation.py +++ b/test/test_imputation.py @@ -1,8 +1,5 @@ -from mhcflurry.imputation import ( - create_imputed_datasets, - imputer_from_name, -) +from mhcflurry.imputation_helpers import imputer_from_name from mhcflurry.dataset import Dataset from mhcflurry.paths import CLASS1_DATA_CSV_PATH from mhcflurry import Class1BindingPredictor @@ -13,9 +10,7 @@ from nose.tools import eq_ def test_create_imputed_datasets_empty(): empty_dataset = Dataset.create_empty() - result = create_imputed_datasets( - empty_dataset, - imputer=MICE(n_imputations=25)) + result = empty_dataset.impute_missing_values(MICE(n_imputations=25)) eq_(result, empty_dataset) def test_create_imputed_datasets_two_alleles(): @@ -29,7 +24,7 @@ def test_create_imputed_datasets_two_alleles(): "A" * 9: 25.0, }, }) - imputed_dataset = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25)) + imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25)) eq_(imputed_dataset.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"}) expected_peptides = {"A" * 9, "C" * 9, "S" * 9} for allele_name, allele_data in imputed_dataset.groupby_allele(): @@ -39,16 +34,24 @@ def test_performance_improves_for_A0205_with_pretraining(): # test to make sure that imputation improves predictive accuracy after a # small number of training iterations (5 epochs) dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH) + print("Full dataset: %d pMHC entries" % len(dataset)) + + limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"] - print("Available alleles: %s" % (set(dataset.unique_alleles()),)) # restrict to just five alleles - dataset = dataset.get_alleles([ - "HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]) + dataset = dataset.get_alleles(limited_alleles) + print("After filtering to %s, # entries: %d" % (limited_alleles, len(dataset))) + a0205_data_without_imputation = dataset.get_allele("HLA-A0205") + + print("Dataset with only A0205, # entries: %d" % len(a0205_data_without_imputation)) + predictor_without_imputation = \ Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute") + X_index, Y_true, sample_weights, _ = \ a0205_data_without_imputation.kmer_index_encoding() + predictor_without_imputation.fit_kmer_encoded_arrays( X=X_index, Y=Y_true, @@ -56,13 +59,16 @@ def test_performance_improves_for_A0205_with_pretraining(): n_training_epochs=10) Y_pred_without_imputation = \ - predictor_without_imputation.predict_scores_for_kmer_array(X_index) + predictor_without_imputation.predict_scores_for_kmer_encoded_array(X_index) diff_squared = (Y_true - Y_pred_without_imputation) ** 2 + mse_without_imputation = (diff_squared * sample_weights) / sample_weights.sum() - imputed_datset = dataset.impute(MICE(n_imputations=25)) + imputed_datset = dataset.impute_missing_values(MICE(n_imputations=25)) + print("After imputation, dataset for %s has %d entries" % ( + limited_alleles, len(imputed_datset))) a0205_data_with_imputation = imputed_datset.get_allele("HLA-A0205") - + print("Limited to just A0205, # entries: %d" % (len(a0205_data_with_imputation))) X_index_imputed, Y_imputed, sample_weights_imputed, _ = \ a0205_data_with_imputation.kmer_index_encoding() @@ -79,7 +85,7 @@ def test_performance_improves_for_A0205_with_pretraining(): n_training_epochs=10) Y_pred_with_imputation = \ - predictor_with_imputation.predict_scores_for_kmer_array(X_index) + predictor_with_imputation.predict_scores_for_kmer_encoded_array(X_index) diff_squared = (Y_true - Y_pred_with_imputation) ** 2 mse_with_imputation = (diff_squared * sample_weights_imputed) / sample_weights_imputed.sum() -- GitLab