debugging imputation with Dataset

bb35d247 · Alex Rubinsteyn · 14705f3f · bb35d247 · bb35d247 · bb35d247
Commit bb35d247 authored 8 years ago by Alex Rubinsteyn
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,14 +39,17 @@ script:
  # download training data
  - script/download-kim-2013-dataset.sh
  - script/download-iedb.sh
-  # only install data for A*01:01, A*02:01, HLA-A*02:05 for testing
-  - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05
+  # only install data for A*01:01, A*02:01, HLA-A*02:05, HLA-B*07:02, H-2-KD
+  # for testing
+  - >
+    script/create-iedb-class1-dataset.py
+    --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05 HLA-B*07:02 H-2-KD
  - script/create-combined-class1-dataset.py
-  # only installing A0101, A0201, A0205 for testing purposes and with very limited
-  # training
+  # only installing A0101, A0201, A0205, B0702, H-2-Kd for testing purposes and
+  # with very limited training
  - >
      mhcflurry-train-class1-allele-specific-models.py
-      --alleles HLA-A0101 HLA-A0201 HLA-A0205
+      --alleles HLA-A0101 HLA-A0201 HLA-A0205 HLA-B0702 H-2-KD
      --embedding-size 10
      --hidden-layer-size 10
      --training-epochs 100

--- a/mhcflurry/dataset.py
+++ b/mhcflurry/dataset.py
@@ -124,10 +124,11 @@ class Dataset(object):
        Two datasets are equal if they contain the same number of samples
        with the same properties and values.
        """
-        if len(self) != len(other):
+        if type(other) is not Dataset:
            return False
-
-        if len(self.columns) != len(other.columns):
+        elif len(self) != len(other):
+            return False
+        elif len(self.columns) != len(other.columns):
            return False

        for ci, cj in zip(self.columns, other.columns):
@@ -161,14 +162,18 @@ class Dataset(object):

    def groupby_allele(self):
        """
-        Returns a dictionary mapping each allele name to a Dataset containing
-        just that allele's data.
+        Yields a sequence of tuples of allele names with Datasets containing
+        entries just for that allele.
+        """
+        for (allele_name, group_df) in self.to_dataframe().groupby("allele"):
+            yield (allele_name, Dataset(group_df))
+
+    def groupby_allele_dictionary(self):
+        """
+        Returns dictionary mapping each allele name to a Dataset containing
+        only entries from that allele.
        """
-        return {
-            allele_name: Dataset(group_df)
-            for (allele_name, group_df)
-            in self.to_dataframe().groupby("allele")
-        }
+        return dict(self.groupby_allele())

    def to_nested_dictionary(self, combine_fn=geometric_mean):
        """
@@ -187,7 +192,9 @@ class Dataset(object):
            # dictionary mapping each peptide to a list of affinities
            peptide_to_affinity_dict = defaultdict(list)
            peptide_to_weight_dict = defaultdict(list)
-            for _, peptide, affinity, sample_weight in allele_dataset.itertuples():
+            for (allele, peptide), row in allele_dataset.iterrows():
+                affinity = row["affinity"]
+                sample_weight = row["sample_weight"]
                peptide_to_affinity_dict[peptide].append(affinity)
                peptide_to_weight_dict[peptide].append(sample_weight)
            allele_to_peptide_to_affinity_dict[allele] = {
@@ -196,6 +203,7 @@ class Dataset(object):
                    peptide_to_weight_dict[peptide])
                for peptide in peptide_to_affinity_dict.keys()
            }
+        return allele_to_peptide_to_affinity_dict

    @classmethod
    def from_sequences(
@@ -254,7 +262,8 @@ class Dataset(object):

    @classmethod
    def from_nested_dictionary(
-            cls, allele_to_peptide_to_affinity_dict):
+            cls,
+            allele_to_peptide_to_affinity_dict):
        """
        Given nested dictionaries mapping allele -> peptide -> affinity,
        construct a Dataset with uniform sample weights.
@@ -272,6 +281,13 @@ class Dataset(object):
            peptides=peptides,
            affinities=affinities)

+    @classmethod
+    def create_empty(cls):
+        """
+        Returns an empty Dataset containing no pMHC entries.
+        """
+        return cls.from_nested_dictionary({})
+
    @classmethod
    def from_single_allele_dictionary(
            cls,
@@ -281,8 +297,7 @@ class Dataset(object):
        Given a peptide->affinity dictionary for a single allele,
        create a Dataset.
        """
-        return cls.from_allele_to_peptide_to_affinity_dictionary(
-            {allele_name: peptide_to_affinity_dict})
+        return cls.from_nested_dictionary({allele_name: peptide_to_affinity_dict})

    @classmethod
    def from_csv(
@@ -424,11 +439,18 @@ class Dataset(object):
                allow_unknown_amino_acids=allow_unknown_amino_acids)
        original_peptide_indices = np.asarray(original_peptide_indices)
        counts = np.asarray(counts)
-        assert len(original_peptide_indices) == len(self.affinities)
-        assert len(counts) == len(self.affinities)
        kmer_affinities = self.affinities[original_peptide_indices]
-        sample_weights = 1.0 / counts
-        return X_index, kmer_affinities, sample_weights, original_peptide_indices
+        kmer_sample_weights = self.sample_weights[original_peptide_indices]
+
+        assert len(original_peptide_indices) == len(kmer_affinities)
+        assert len(counts) == len(kmer_affinities)
+        assert len(kmer_sample_weights) == len(kmer_affinities)
+
+        # combine the original sample weights of varying length peptides
+        # with a 1/n_kmers factor for the number of kmers pulled out of each
+        # original peptide
+        combined_sample_weights = kmer_sample_weights * (1.0 / counts)
+        return X_index, kmer_affinities, combined_sample_weights, original_peptide_indices

    def to_dense_pMHC_affinity_matrix(self):
        """
@@ -442,11 +464,14 @@ class Dataset(object):
        n_peptides = len(peptides_list)
        alleles_list = list(sorted(self.unique_alleles()))
        allele_order = {a: i for (i, a) in enumerate(alleles_list)}
-        n_alleles = alleles_list
-        X = np.ones((n_peptides, n_alleles))
+        n_alleles = len(alleles_list)
+        shape = (n_peptides, n_alleles)
+        X = np.ones(shape, dtype=float)
        for (allele, allele_dict) in allele_to_peptide_to_affinity_dict.items():
+            print(allele, allele_dict)
            column_index = allele_order[allele]
            for (peptide, affinity) in allele_dict.items():
+                print(peptide, affinity)
                row_index = peptide_order[peptide]
                X[row_index, column_index] = affinity
        return X, peptides_list, alleles_list

--- a/mhcflurry/imputation.py
+++ b/mhcflurry/imputation.py
@@ -138,6 +138,7 @@ def create_imputed_datasets(
    Returns dictionary mapping allele names to AlleleData objects containing
    imputed affinity values.
    """
+    assert isinstance(dataset, Dataset), "Expected Dataset, got %s" % (type(dataset),)
    X_incomplete, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix()

    _check_dense_pMHC_array(X_incomplete, peptide_list, allele_list)

--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -6,7 +6,7 @@ def test_create_allele_data_from_single_allele_dict():
        ("A" * 10): 1.2,
        ("C" * 9): 1000,
    }
-    dataset = Dataset.from_peptide_to_affinity_dictionary(
+    dataset = Dataset.from_single_allele_dictionary(
        allele_name="A0201",
        peptide_to_affinity_dict=peptide_to_ic50_dict)
    assert isinstance(dataset, Dataset)

--- a/test/test_imputation.py
+++ b/test/test_imputation.py
+
 from mhcflurry.imputation import (
    create_imputed_datasets,
    imputer_from_name,
@@ -8,14 +9,17 @@ from mhcflurry import Class1BindingPredictor

 from fancyimpute import MICE, KNN, SoftImpute, IterativeSVD
 from nose.tools import eq_
-import numpy as np
+

 def test_create_imputed_datasets_empty():
-    result = create_imputed_datasets({}, imputer=MICE(n_imputations=25))
-    eq_(result, {})
+    empty_dataset = Dataset.create_empty()
+    result = create_imputed_datasets(
+        empty_dataset,
+        imputer=MICE(n_imputations=25))
+    eq_(result, empty_dataset)

 def test_create_imputed_datasets_two_alleles():
-    allele_to_peptide_to_affinity_dict = {
+    dataset = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {
            "A" * 9: 20.0,
            "C" * 9: 40000.0,
@@ -24,13 +28,11 @@ def test_create_imputed_datasets_two_alleles():
            "S" * 9: 500.0,
            "A" * 9: 25.0,
        },
-    }
-    dataset = Dataset.from_nested_dictionary(allele_to_peptide_to_affinity_dict)
-    result = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
-    eq_(set(result.keys()), {"HLA-A*02:01", "HLA-A*02:05"})
+    })
+    imputed_dataset = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
+    eq_(imputed_dataset.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
    expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
-    for allele_name, allele_data in result.items():
-        print(allele_name, allele_data.peptides)
+    for allele_name, allele_data in imputed_dataset.groupby_allele():
        eq_(set(allele_data.peptides), expected_peptides)

 def test_performance_improves_for_A0205_with_pretraining():
@@ -39,23 +41,14 @@ def test_performance_improves_for_A0205_with_pretraining():
    dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)

    print("Available alleles: %s" % (set(dataset.unique_alleles()),))
-
-    # restrict to just three alleles
-    dataset = dataset.get_alleles(["HLA-A0205", "HLA-A0201", "HLA-A0101"])
-
+    # restrict to just five alleles
+    dataset = dataset.get_alleles([
+        "HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"])
    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")
    predictor_without_imputation = \
        Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute")
-
-    print("Without imputation, # samples = %d, # original peptides = %d" % (
-        len(a0205_data_without_imputation.peptides),
-        len(set(a0205_data_without_imputation.original_peptides))))
-
-    print(set(a0205_data_without_imputation.original_peptides))
-    X_index = a0205_data_without_imputation.X_index
-    Y_true = a0205_data_without_imputation.Y
-    sample_weights = a0205_data_without_imputation.weights
-
+    X_index, Y_true, sample_weights, _ = \
+        a0205_data_without_imputation.kmer_index_encoding()
    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        Y=Y_true,
@@ -64,20 +57,14 @@ def test_performance_improves_for_A0205_with_pretraining():

    Y_pred_without_imputation = \
        predictor_without_imputation.predict_scores_for_kmer_array(X_index)
-    print("Y_pred without imputation: %s" % (Y_pred_without_imputation,))
-    mse_without_imputation = np.mean((Y_true - Y_pred_without_imputation) ** 2)
-    print("MSE w/out imputation: %f" % mse_without_imputation)
+    diff_squared = (Y_true - Y_pred_without_imputation) ** 2
+    mse_without_imputation = (diff_squared * sample_weights) / sample_weights.sum()

-    imputed_data_dict = create_imputed_datasets(
-        allele_data_dict, MICE(n_imputations=25))
-    a0205_data_with_imputation = imputed_data_dict["HLA-A0205"]
-    print("Imputed data, # samples = %d, # original peptides = %d" % (
-        len(a0205_data_with_imputation.peptides),
-        len(set(a0205_data_with_imputation.original_peptides))))
+    imputed_datset = dataset.impute(MICE(n_imputations=25))
+    a0205_data_with_imputation = imputed_datset.get_allele("HLA-A0205")

-    X_index_imputed = a0205_data_with_imputation.X_index
-    Y_imputed = a0205_data_with_imputation.Y
-    sample_weights_imputed = a0205_data_with_imputation.weights
+    X_index_imputed, Y_imputed, sample_weights_imputed, _ = \
+        a0205_data_with_imputation.kmer_index_encoding()

    predictor_with_imputation = \
        Class1BindingPredictor.from_hyperparameters(name="A0205-impute")
@@ -93,7 +80,9 @@ def test_performance_improves_for_A0205_with_pretraining():

    Y_pred_with_imputation = \
        predictor_with_imputation.predict_scores_for_kmer_array(X_index)
-    mse_with_imputation = np.mean((Y_true - Y_pred_with_imputation) ** 2)
+
+    diff_squared = (Y_true - Y_pred_with_imputation) ** 2
+    mse_with_imputation = (diff_squared * sample_weights_imputed) / sample_weights_imputed.sum()
    print("MSE w/ imputation: %f" % (mse_with_imputation,))

    assert mse_with_imputation < mse_without_imputation, \