From 294f70eee670e8df56792b584585f398ef8ef1fd Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 9 May 2016 18:19:59 -0400
Subject: [PATCH] moved main impute method onto Dataset

---
 mhcflurry/dataset.py    |  51 +++++++++--
 mhcflurry/imputation.py | 195 ----------------------------------------
 test/test_imputation.py |  36 ++++----
 3 files changed, 63 insertions(+), 219 deletions(-)
 delete mode 100644 mhcflurry/imputation.py

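Callers that previously went through mhcflurry.imputation.create_imputed_datasets
now call the method directly on a Dataset, as the updated tests below do. A
minimal sketch of the change, assuming the fancyimpute MICE solver used in the
tests and a hypothetical input CSV path:

    from fancyimpute.mice import MICE
    from mhcflurry.dataset import Dataset

    dataset = Dataset.from_csv("bdata.csv")  # hypothetical path

    # Before this patch:
    #     from mhcflurry.imputation import create_imputed_datasets
    #     imputed = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
    # After this patch (thresholds shown are the new defaults):
    imputed = dataset.impute_missing_values(
        MICE(n_imputations=25),
        min_observations_per_peptide=2,
        min_observations_per_allele=3)
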
diff --git a/mhcflurry/dataset.py b/mhcflurry/dataset.py
index b6a42d28..4aa0a823 100644
--- a/mhcflurry/dataset.py
+++ b/mhcflurry/dataset.py
@@ -18,8 +18,10 @@ from __future__ import (
     absolute_import,
 )
 from collections import defaultdict, OrderedDict
-from six import string_types
+import logging
+
 
+from six import string_types
 import pandas as pd
 import numpy as np
 
@@ -29,6 +31,12 @@ from .dataset_helpers import (
     load_dataframe
 )
 from .peptide_encoding import fixed_length_index_encoding
+from .imputation_helpers import (
+    check_dense_pMHC_array,
+    prune_dense_matrix_and_labels,
+    dense_pMHC_matrix_to_nested_dict,
+)
+
 
 class Dataset(object):
 
@@ -248,7 +256,6 @@ class Dataset(object):
                     "Wrong length for column '%s', expected %d but got %d" % (
                         column_name, column))
             df[column_name] = np.asarray(column)
-        print(df)
         return cls(df)
 
     @classmethod
@@ -468,19 +475,18 @@ class Dataset(object):
         shape = (n_peptides, n_alleles)
         X = np.ones(shape, dtype=float) * np.nan  # unmeasured pairs stay NaN
         for (allele, allele_dict) in allele_to_peptide_to_affinity_dict.items():
-            print(allele, allele_dict)
             column_index = allele_order[allele]
             for (peptide, affinity) in allele_dict.items():
-                print(peptide, affinity)
                 row_index = peptide_order[peptide]
                 X[row_index, column_index] = affinity
         return X, peptides_list, alleles_list
 
-    def imputed_missing_values(
+    def impute_missing_values(
             self,
             imputation_method,
             log_transform=True,
-            synthetic_sample_weight=1.0):
+            min_observations_per_peptide=2,
+            min_observations_per_allele=3):
         """
         Synthesize new measurements for missing pMHC pairs using the given
         imputation_method.
@@ -496,10 +502,37 @@ class Dataset(object):
             Transform affinities to log values before imputation
             (and then transform back afterward).
 
-        synthetic_sample_weight : float
-            Default weight to give newly synthesized samples.
+        min_observations_per_peptide : int
+            Drop peptide rows with fewer than this number of observed values.
+
+        min_observations_per_allele : int
+            Drop allele columns with fewer than this number of observed values.
 
         Returns Dataset with original pMHC affinities and additional
         synthetic samples.
         """
-        pass
+        X_incomplete, peptide_list, allele_list = self.to_dense_pMHC_affinity_matrix()
+
+        check_dense_pMHC_array(X_incomplete, peptide_list, allele_list)
+
+        # drop alleles and peptides with small amounts of data
+        X_incomplete, peptide_list, allele_list = prune_dense_matrix_and_labels(
+            X_incomplete, peptide_list, allele_list,
+            min_observations_per_peptide=min_observations_per_peptide,
+            min_observations_per_allele=min_observations_per_allele)
+        X_incomplete_log = np.log(X_incomplete) if log_transform else X_incomplete
+
+        if np.isnan(X_incomplete).sum() == 0:
+            # If every entry in the matrix is already filled in, skip the
+            # imputation algorithm entirely since it might raise an exception.
+            logging.warning(
+                "No missing values, using original data instead of imputation")
+            X_complete_log = X_incomplete_log
+        else:
+            X_complete_log = imputation_method.complete(X_incomplete_log)
+        X_complete = np.exp(X_complete_log) if log_transform else X_complete_log
+        allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict(
+            X=X_complete,
+            peptide_list=peptide_list,
+            allele_list=allele_list)
+        return self.from_nested_dictionary(allele_to_peptide_to_affinity_dict)
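
Under the hood the new method round-trips through the dense-matrix helpers:
to_dense_pMHC_affinity_matrix builds a peptides-by-alleles array with NaN for
unmeasured pairs, prune_dense_matrix_and_labels drops sparse rows and columns,
the imputer's complete() fills the NaNs (in log space when log_transform is
True), and dense_pMHC_matrix_to_nested_dict feeds from_nested_dictionary to
build the result. A small sketch with toy affinities taken from the tests; any
object exposing a fancyimpute-style complete(X) method should work:

    from fancyimpute.mice import MICE
    from mhcflurry.dataset import Dataset

    dataset = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {"AAAAAAAAA": 20.0, "CCCCCCCCC": 40000.0},
        "HLA-A*02:05": {"SSSSSSSSS": 500.0, "AAAAAAAAA": 25.0},
    })
    # Thresholds of 1 keep every row and column of this tiny matrix; real data
    # should use larger values so the imputer has enough observations to learn from.
    imputed = dataset.impute_missing_values(
        MICE(n_imputations=25),
        min_observations_per_peptide=1,
        min_observations_per_allele=1)
    print(imputed.unique_alleles())  # {"HLA-A*02:01", "HLA-A*02:05"}
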
diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py
deleted file mode 100644
index c9bf9c6e..00000000
--- a/mhcflurry/imputation.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import (
-    print_function,
-    division,
-    absolute_import,
-)
-from collections import defaultdict
-import logging
-
-import numpy as np
-from fancyimpute.knn import KNN
-from fancyimpute.iterative_svd import IterativeSVD
-from fancyimpute.simple_fill import SimpleFill
-from fancyimpute.soft_impute import SoftImpute
-from fancyimpute.mice import MICE
-
-from .dataset import Dataset
-
-def _check_dense_pMHC_array(X, peptide_list, allele_list):
-    if len(peptide_list) != len(set(peptide_list)):
-        raise ValueError("Duplicate peptides detected in peptide list")
-    if len(allele_list) != len(set(allele_list)):
-        raise ValueError("Duplicate alleles detected in allele list")
-    n_rows, n_cols = X.shape
-    if n_rows != len(peptide_list):
-        raise ValueError(
-            "Expected dense array with shape %s to have %d rows" % (
-                X.shape, len(peptide_list)))
-    if n_cols != len(allele_list):
-        raise ValueError(
-            "Expected dense array with shape %s to have %d columns" % (
-                X.shape, len(allele_list)))
-
-def prune_dense_matrix_and_labels(
-        X,
-        peptide_list,
-        allele_list,
-        min_observations_per_peptide=1,
-        min_observations_per_allele=1):
-    """
-    Filter the dense matrix of pMHC binding affinities according to
-    the given minimum number of row/column observations.
-
-    Parameters
-    ----------
-    X : numpy.ndarray
-        Incomplete dense matrix of pMHC affinity with n_peptides rows and
-        n_alleles columns.
-
-    peptide_list : list of str
-        Expected to have n_peptides entries
-
-    allele_list : list of str
-        Expected to have n_alleles entries
-
-    min_observations_per_peptide : int
-        Drop peptide rows with fewer than this number of observed values.
-
-    min_observations_per_allele : int
-        Drop allele columns with fewer than this number of observed values.
-    """
-    observed_mask = np.isfinite(X)
-    n_observed_per_peptide = observed_mask.sum(axis=1)
-    too_few_peptide_observations = (
-        n_observed_per_peptide < min_observations_per_peptide)
-    if too_few_peptide_observations.any():
-        drop_peptide_indices = np.where(too_few_peptide_observations)[0]
-        keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
-        print("Dropping %d peptides with <%d observations" % (
-            len(drop_peptide_indices),
-            min_observations_per_peptide))
-        X = X[keep_peptide_indices]
-        observed_mask = observed_mask[keep_peptide_indices]
-        peptide_list = [peptide_list[i] for i in keep_peptide_indices]
-
-    n_observed_per_allele = observed_mask.sum(axis=0)
-    too_few_allele_observations = (
-        n_observed_per_allele < min_observations_per_peptide)
-    if too_few_peptide_observations.any():
-        drop_allele_indices = np.where(too_few_allele_observations)[0]
-        keep_allele_indices = np.where(~too_few_allele_observations)[0]
-        print("Dropping %d alleles with <%d observations: %s" % (
-            len(drop_allele_indices),
-            min_observations_per_allele,
-            [allele_list[i] for i in drop_allele_indices]))
-        X = X[:, keep_allele_indices]
-        observed_mask = observed_mask[:, keep_allele_indices]
-        allele_list = [allele_list[i] for i in keep_allele_indices]
-    _check_dense_pMHC_array(X, peptide_list, allele_list)
-    return X, peptide_list, allele_list
-
-def dense_pMHC_matrix_to_nested_dict(X, peptide_list, allele_list):
-    """
-    Converts a dense matrix of (n_peptides, n_alleles) floats to a nested
-    dictionary from allele -> peptide -> affinity.
-    """
-    allele_to_peptide_to_ic50_dict = defaultdict(dict)
-    for row_index, peptide in enumerate(peptide_list):
-        for column_index, allele_name in enumerate(allele_list):
-            affinity = X[row_index, column_index]
-            if np.isfinite(affinity):
-                allele_to_peptide_to_ic50_dict[allele_name][peptide] = affinity
-    return allele_to_peptide_to_ic50_dict
-
-def create_imputed_datasets(
-        dataset,
-        imputer,
-        min_observations_per_peptide=2,
-        min_observations_per_allele=2):
-    """
-    Parameters
-    ----------
-    allele_data_dict : Dataset
-
-    imputer : object
-        Expected to have a method imputer.complete(X) which takes an array
-        with missing entries marked by NaN and returns a completed array.
-
-    min_observations_per_peptide : int
-        Drop peptide rows with fewer than this number of observed values.
-
-    min_observations_per_allele : int
-        Drop allele columns with fewer than this number of observed values.
-
-    Returns dictionary mapping allele names to AlleleData objects containing
-    imputed affinity values.
-    """
-    assert isinstance(dataset, Dataset), "Expected Dataset, got %s" % (type(dataset),)
-    X_incomplete, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix()
-
-    _check_dense_pMHC_array(X_incomplete, peptide_list, allele_list)
-
-    # drop alleles and peptides with small amounts of data
-    X_incomplete, peptide_list, allele_list = prune_dense_matrix_and_labels(
-        X_incomplete, peptide_list, allele_list,
-        min_observations_per_peptide=min_observations_per_peptide,
-        min_observations_per_allele=min_observations_per_allele)
-    X_incomplete_log = np.log(X_incomplete)
-
-    if np.isnan(X_incomplete).sum() == 0:
-        # if all entries in the matrix are already filled in then don't
-        # try using an imputation algorithm since it might raise an
-        # exception.
-        logging.warn("No missing values, using original data instead of imputation")
-        X_complete_log = X_incomplete_log
-    else:
-        X_complete_log = imputer.complete(X_incomplete_log)
-    X_complete = np.exp(X_complete_log)
-    allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict(
-        X=X_complete,
-        peptide_list=peptide_list,
-        allele_list=allele_list)
-    return Dataset.from_nested_dictionary(allele_to_peptide_to_affinity_dict)
-
-def imputer_from_name(imputation_method_name, **kwargs):
-    """
-    Helper function for constructing an imputation object from a name given
-    typically from a commandline argument.
-    """
-    imputation_method_name = imputation_method_name.strip().lower()
-    if imputation_method_name == "mice":
-        kwargs["n_burn_in"] = kwargs.get("n_burn_in", 5)
-        kwargs["n_imputations"] = kwargs.get("n_imputations", 25)
-        kwargs["min_value"] = kwargs.get("min_value", 0)
-        kwargs["max_value"] = kwargs.get("max_value", 1)
-        return MICE(**kwargs)
-    elif imputation_method_name == "knn":
-        kwargs["k"] = kwargs.get("k", 3)
-        kwargs["orientation"] = kwargs.get("orientation", "columns")
-        kwargs["print_interval"] = kwargs.get("print_interval", 10)
-        return KNN(**kwargs)
-    elif imputation_method_name == "svd":
-        kwargs["rank"] = kwargs.get("rank", 10)
-        return IterativeSVD(**kwargs)
-    elif imputation_method_name == "svt" or imputation_method_name == "softimpute":
-        return SoftImpute(**kwargs)
-    elif imputation_method_name == "mean":
-        return SimpleFill("mean", **kwargs)
-    elif imputation_method_name == "none":
-        return None
-    else:
-        raise ValueError("Invalid imputation method: %s" % imputation_method_name)
diff --git a/test/test_imputation.py b/test/test_imputation.py
index 308fd5f8..60d82405 100644
--- a/test/test_imputation.py
+++ b/test/test_imputation.py
@@ -1,8 +1,5 @@
 
-from mhcflurry.imputation import (
-    create_imputed_datasets,
-    imputer_from_name,
-)
+from mhcflurry.imputation_helpers import imputer_from_name
 from mhcflurry.dataset import Dataset
 from mhcflurry.paths import CLASS1_DATA_CSV_PATH
 from mhcflurry import Class1BindingPredictor
@@ -13,9 +10,7 @@ from nose.tools import eq_
 
 def test_create_imputed_datasets_empty():
     empty_dataset = Dataset.create_empty()
-    result = create_imputed_datasets(
-        empty_dataset,
-        imputer=MICE(n_imputations=25))
+    result = empty_dataset.impute_missing_values(MICE(n_imputations=25))
     eq_(result, empty_dataset)
 
 def test_create_imputed_datasets_two_alleles():
@@ -29,7 +24,7 @@ def test_create_imputed_datasets_two_alleles():
             "A" * 9: 25.0,
         },
     })
-    imputed_dataset = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
+    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
     eq_(imputed_dataset.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
     expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
     for allele_name, allele_data in imputed_dataset.groupby_allele():
@@ -39,16 +34,24 @@ def test_performance_improves_for_A0205_with_pretraining():
     # test to make sure that imputation improves predictive accuracy after a
     # small number of training iterations (5 epochs)
     dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
+    print("Full dataset: %d pMHC entries" % len(dataset))
+
+    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]
 
-    print("Available alleles: %s" % (set(dataset.unique_alleles()),))
     # restrict to just four alleles
-    dataset = dataset.get_alleles([
-        "HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"])
+    dataset = dataset.get_alleles(limited_alleles)
+    print("After filtering to %s, # entries: %d" % (limited_alleles, len(dataset)))
+
     a0205_data_without_imputation = dataset.get_allele("HLA-A0205")
+
+    print("Dataset with only A0205, # entries: %d" % len(a0205_data_without_imputation))
+
     predictor_without_imputation = \
         Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute")
+
     X_index, Y_true, sample_weights, _ = \
         a0205_data_without_imputation.kmer_index_encoding()
+
     predictor_without_imputation.fit_kmer_encoded_arrays(
         X=X_index,
         Y=Y_true,
@@ -56,13 +59,16 @@ def test_performance_improves_for_A0205_with_pretraining():
         n_training_epochs=10)
 
     Y_pred_without_imputation = \
-        predictor_without_imputation.predict_scores_for_kmer_array(X_index)
+        predictor_without_imputation.predict_scores_for_kmer_encoded_array(X_index)
     diff_squared = (Y_true - Y_pred_without_imputation) ** 2
+
     mse_without_imputation = (diff_squared * sample_weights).sum() / sample_weights.sum()
 
-    imputed_dataset = dataset.impute(MICE(n_imputations=25))
+    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
+    print("After imputation, dataset for %s has %d entries" % (
+        limited_alleles, len(imputed_dataset)))
     a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
-
+    print("Limited to just A0205, # entries: %d" % len(a0205_data_with_imputation))
     X_index_imputed, Y_imputed, sample_weights_imputed, _ = \
         a0205_data_with_imputation.kmer_index_encoding()
 
@@ -79,7 +85,7 @@ def test_performance_improves_for_A0205_with_pretraining():
         n_training_epochs=10)
 
     Y_pred_with_imputation = \
-        predictor_with_imputation.predict_scores_for_kmer_array(X_index)
+        predictor_with_imputation.predict_scores_for_kmer_encoded_array(X_index)
 
     diff_squared = (Y_true - Y_pred_with_imputation) ** 2
     mse_with_imputation = (diff_squared * sample_weights).sum() / sample_weights.sum()
-- 
GitLab