From e6bef3d102066a0acf4a9d7974ac28dd8352e249 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 29 Apr 2016 12:29:35 -0400
Subject: [PATCH] added sanity checking to imputation helpers

---
 mhcflurry/amino_acid.py                       |  7 +++++-
 .../class1_allele_specific_hyperparameters.py |  2 +-
 mhcflurry/data.py                             |  2 +-
 mhcflurry/feedforward.py                      |  2 +-
 mhcflurry/imputation.py                       | 23 ++++++++++++++++++-
 mhcflurry/paths.py                            |  7 +++++-
 mhcflurry/peptide_encoding.py                 |  5 ++++
 mhcflurry/predictor_base.py                   |  5 ++++
 8 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py
index f4022561..9c06fada 100644
--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 import numpy as np
 
 
diff --git a/mhcflurry/class1_allele_specific_hyperparameters.py b/mhcflurry/class1_allele_specific_hyperparameters.py
index 33e15cdb..cfdf846b 100644
--- a/mhcflurry/class1_allele_specific_hyperparameters.py
+++ b/mhcflurry/class1_allele_specific_hyperparameters.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index 1ea71212..59c2be51 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py
index e401b4b8..e526d6df 100644
--- a/mhcflurry/feedforward.py
+++ b/mhcflurry/feedforward.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py
index 1bcb08dc..cddeb5e3 100644
--- a/mhcflurry/imputation.py
+++ b/mhcflurry/imputation.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@ from __future__ import (
     absolute_import,
 )
 from collections import defaultdict
+import logging
+
 import numpy as np
 from fancyimpute.dictionary_helpers import (
     dense_matrix_from_nested_dictionary
@@ -34,6 +36,21 @@ from .data import (
     create_allele_data_from_peptide_to_ic50_dict,
 )
 
+def _check_dense_pMHC_array(X, peptide_list, allele_list):
+    """Raise ValueError unless X is peptides x alleles with unique labels."""
+    if len(set(peptide_list)) != len(peptide_list):
+        raise ValueError("Duplicate peptides detected in peptide list")
+    if len(set(allele_list)) != len(allele_list):
+        raise ValueError("Duplicate alleles detected in allele list")
+    shape = np.shape(X)
+    if len(shape) != 2:  # explicit check beats an opaque unpacking error
+        raise ValueError("Expected 2D dense array, got shape %s" % (shape,))
+    expected = (("rows", len(peptide_list)), ("columns", len(allele_list)))
+    for actual, (axis, wanted) in zip(shape, expected):
+        if actual != wanted:
+            raise ValueError("Expected dense array with shape %s to have "
+                             "%d %s" % (shape, wanted, axis))
+
 def prune_dense_matrix_and_labels(
         X,
         peptide_list,
@@ -89,6 +106,7 @@ def prune_dense_matrix_and_labels(
         X = X[:, keep_allele_indices]
         observed_mask = observed_mask[:, keep_allele_indices]
         allele_list = [allele_list[i] for i in keep_allele_indices]
+    _check_dense_pMHC_array(X, peptide_list, allele_list)
     return X, peptide_list, allele_list
 
 
@@ -135,6 +153,8 @@ def create_incomplete_dense_pMHC_matrix(
 
     X, peptide_list, allele_list = \
         dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity_dict)
+    _check_dense_pMHC_array(X, peptide_list, allele_list)
+
     return prune_dense_matrix_and_labels(
         X,
         peptide_list,
@@ -187,6 +207,7 @@ def create_imputed_datasets(
         # if all entries in the matrix are already filled in then don't
         # try using an imputation algorithm since it might raise an
         # exception.
+        logging.warn("No missing values, using original data instead of imputation")
         X_complete = X_incomplete
     else:
         X_complete = imputer.complete(X_incomplete)
diff --git a/mhcflurry/paths.py b/mhcflurry/paths.py
index ec9e8377..1ef41eb9 100644
--- a/mhcflurry/paths.py
+++ b/mhcflurry/paths.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
+# Copyright (c) 2016. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 from os.path import join
 from appdirs import user_data_dir
 
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
index 45045680..cad5f53c 100644
--- a/mhcflurry/peptide_encoding.py
+++ b/mhcflurry/peptide_encoding.py
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 import itertools
 import logging
 
diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py
index dba570da..a335158c 100644
--- a/mhcflurry/predictor_base.py
+++ b/mhcflurry/predictor_base.py
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 from collections import defaultdict
 
 import numpy as np
-- 
GitLab