From e6bef3d102066a0acf4a9d7974ac28dd8352e249 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Fri, 29 Apr 2016 12:29:35 -0400 Subject: [PATCH] added sanity checking to imputation helpers --- mhcflurry/amino_acid.py | 7 +++++- .../class1_allele_specific_hyperparameters.py | 2 +- mhcflurry/data.py | 2 +- mhcflurry/feedforward.py | 2 +- mhcflurry/imputation.py | 23 ++++++++++++++++++- mhcflurry/paths.py | 7 +++++- mhcflurry/peptide_encoding.py | 5 ++++ mhcflurry/predictor_base.py | 5 ++++ 8 files changed, 47 insertions(+), 6 deletions(-) diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index f4022561..9c06fada 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine +# Copyright (c) 2016. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import ( + print_function, + division, + absolute_import, +) import numpy as np diff --git a/mhcflurry/class1_allele_specific_hyperparameters.py b/mhcflurry/class1_allele_specific_hyperparameters.py index 33e15cdb..cfdf846b 100644 --- a/mhcflurry/class1_allele_specific_hyperparameters.py +++ b/mhcflurry/class1_allele_specific_hyperparameters.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine +# Copyright (c) 2016. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/mhcflurry/data.py b/mhcflurry/data.py index 1ea71212..59c2be51 100644 --- a/mhcflurry/data.py +++ b/mhcflurry/data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine +# Copyright (c) 2016. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py index e401b4b8..e526d6df 100644 --- a/mhcflurry/feedforward.py +++ b/mhcflurry/feedforward.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine +# Copyright (c) 2016. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py index 1bcb08dc..cddeb5e3 100644 --- a/mhcflurry/imputation.py +++ b/mhcflurry/imputation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine +# Copyright (c) 2016. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ from __future__ import ( absolute_import, ) from collections import defaultdict +import logging + import numpy as np from fancyimpute.dictionary_helpers import ( dense_matrix_from_nested_dictionary @@ -34,6 +36,21 @@ from .data import ( create_allele_data_from_peptide_to_ic50_dict, ) +def _check_dense_pMHC_array(X, peptide_list, allele_list): + if len(peptide_list) != len(set(peptide_list)): + raise ValueError("Duplicate peptides detected in peptide list") + if len(allele_list) != len(set(allele_list)): + raise ValueError("Duplicate alleles detected in allele list") + n_rows, n_cols = X.shape + if n_rows != len(peptide_list): + raise ValueError( + "Expected dense array with shape %s to have %d rows" % ( + X.shape, len(peptide_list))) + if n_cols != len(allele_list): + raise ValueError( + "Expected dense array with shape %s to have %d columns" % ( + X.shape, len(allele_list))) + def prune_dense_matrix_and_labels( X, peptide_list, @@ -89,6 +106,7 @@ def prune_dense_matrix_and_labels( X = X[:, keep_allele_indices] observed_mask = observed_mask[:, keep_allele_indices] allele_list = [allele_list[i] for i in keep_allele_indices] + _check_dense_pMHC_array(X, peptide_list, allele_list) return X, peptide_list, allele_list @@ -135,6 +153,8 @@ def create_incomplete_dense_pMHC_matrix( X, peptide_list, allele_list = \ dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity_dict) + _check_dense_pMHC_array(X, peptide_list, allele_list) + return prune_dense_matrix_and_labels( X, peptide_list, @@ -187,6 +207,7 @@ def create_imputed_datasets( # if all entries in the matrix are already filled in then don't # try using an imputation algorithm since it might raise an # exception. + logging.warn("No missing values, using original data instead of imputation") X_complete = X_incomplete else: X_complete = imputer.complete(X_incomplete) diff --git a/mhcflurry/paths.py b/mhcflurry/paths.py index ec9e8377..1ef41eb9 100644 --- a/mhcflurry/paths.py +++ b/mhcflurry/paths.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine +# Copyright (c) 2016. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import ( + print_function, + division, + absolute_import, +) from os.path import join from appdirs import user_data_dir diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py index 45045680..cad5f53c 100644 --- a/mhcflurry/peptide_encoding.py +++ b/mhcflurry/peptide_encoding.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import ( + print_function, + division, + absolute_import, +) import itertools import logging diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py index dba570da..a335158c 100644 --- a/mhcflurry/predictor_base.py +++ b/mhcflurry/predictor_base.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import ( + print_function, + division, + absolute_import, +) from collections import defaultdict import numpy as np -- GitLab