added test for Dataset

7b7075f0 · Alex Rubinsteyn · bc0ec2c3 · 7b7075f0 · 7b7075f0 · bc0ec2c3
Commit 7b7075f0 authored 8 years ago by Alex Rubinsteyn
--- a/mhcflurry/class1_binding_predictor.py
+++ b/mhcflurry/class1_binding_predictor.py
@@ -37,7 +37,8 @@ from .serialization_helpers import (
 from .peptide_encoding import check_valid_index_encoding_array
 from .feedforward_hyperparameters import LOSS, OPTIMIZER
 from .regression_target import MAX_IC50
-from .training_helpers import check_training_data_shapes
+from .dataset import Dataset
+from .training_helpers import combine_training_arrays

 _allele_predictor_cache = {}

@@ -133,70 +134,6 @@ class Class1BindingPredictor(PredictorBase):
            kmer_size=peptide_length,
            **kwargs)

-    def _combine_training_data(
-            self,
-            X,
-            Y,
-            sample_weights,
-            X_pretrain,
-            Y_pretrain,
-            sample_weights_pretrain,
-            verbose=False):
-        """
-        Make sure the shapes of given training and pre-training data
-        conform with each other. Then concatenate the pre-training and the
-        training data.
-
-        Returns (X_combined, Y_combined, weights_combined, n_pretrain_samples)
-        """
-        X = np.asarray(X)
-        Y = np.asarray(Y)
-
-        if sample_weights is None:
-            sample_weights = np.ones_like(Y)
-        else:
-            sample_weights = np.asarray(sample_weights)
-
-        n_samples, n_dims = check_training_data_shapes(X, Y, sample_weights)
-
-        if X_pretrain is None or Y_pretrain is None:
-            X_pretrain = np.empty((0, n_dims), dtype=X.dtype)
-            Y_pretrain = np.empty((0,), dtype=Y.dtype)
-        else:
-            X_pretrain = np.asarray(X_pretrain)
-            Y_pretrain = np.asarray(Y_pretrain)
-
-        if sample_weights_pretrain is None:
-            sample_weights_pretrain = np.ones_like(Y_pretrain)
-        else:
-            sample_weights_pretrain = np.asarray(sample_weights_pretrain)
-
-        n_pretrain_samples, n_pretrain_dims = check_training_data_shapes(
-            X_pretrain, Y_pretrain, sample_weights_pretrain)
-
-        if Y.min() < 0:
-            raise ValueError("Minimum value of Y can't be negative, got %f" % (
-                Y.min()))
-        if Y.max() > 1:
-            raise ValueError("Maximum value of Y can't be greater than 1, got %f" % (
-                Y.max()))
-
-        if len(Y_pretrain) > 0 and Y_pretrain.min() < 0:
-            raise ValueError("Minimum value of Y_pretrain can't be negative, got %f" % (
-                Y.min()))
-
-        if len(Y_pretrain) > 0 and Y_pretrain.max() > 1:
-            raise ValueError("Maximum value of Y_pretrain can't be greater than 1, got %f" % (
-                Y.max()))
-
-        X_combined = np.vstack([X_pretrain, X])
-        Y_combined = np.concatenate([Y_pretrain, Y])
-        combined_weights = np.concatenate([
-            sample_weights_pretrain,
-            sample_weights,
-        ])
-        return X_combined, Y_combined, combined_weights, n_pretrain_samples
-
    def _extend_with_negative_random_samples(
            self, X, Y, weights, n_random_negative_samples):
        """
@@ -240,7 +177,7 @@ class Class1BindingPredictor(PredictorBase):
        assert len(weights_with_negative) == len(weights) + n_random_negative_samples
        return X_with_negative, Y_with_negative, weights_with_negative

-    def fit(
+    def fit_kmer_encoded_arrays(
            self,
            X,
            Y,
@@ -294,10 +231,9 @@ class Class1BindingPredictor(PredictorBase):
        batch_size : int
        """
        X_combined, Y_combined, combined_weights, n_pretrain = \
-            self._combine_training_data(
+            combine_training_arrays(
                X, Y, sample_weights,
-                X_pretrain, Y_pretrain, sample_weights_pretrain,
-                verbose=verbose)
+                X_pretrain, Y_pretrain, sample_weights_pretrain)

        total_pretrain_sample_weight = combined_weights[:n_pretrain].sum()
        total_train_sample_weight = combined_weights[n_pretrain:].sum()
@@ -426,7 +362,7 @@ class Class1BindingPredictor(PredictorBase):
    def __str__(self):
        return repr(self)

-    def predict(self, X):
+    def predict_from_kmer_encoding(self, X):
        """
        Given an encoded array of amino acid indices, returns a vector
        of predicted log IC50 values.

--- a/mhcflurry/predictor_base.py
+++ b/mhcflurry/predictor_base.py
@@ -28,13 +28,16 @@ from .amino_acid import (
    common_amino_acids
 )
 from .regression_target import regression_target_to_ic50, MAX_IC50
+from .dataset import Dataset


 class PredictorBase(object):
    """
-    Base class for all mhcflurry predictors (including the Ensemble class)
+    Base class for all mhcflurry predictors which use a fixed-length
+    k-mer representation of peptides. Eventually this code will need to move
+    to something like FixedLengthPredictor to fit RNN-based sequence
+    predictors into the inheritance hierarchy.
    """
-
    def __init__(
            self,
            name,
@@ -103,21 +106,7 @@ class PredictorBase(object):
        scores = self.predict_kmer_peptides(peptides)
        return regression_target_to_ic50(scores, max_ic50=self.max_ic50)

-    def predict_peptides_ic50(self, peptides):
-        """
-        Predict IC50 affinities for peptides of any length
-        """
-        scores = self.predict_peptides(peptides)
-        return regression_target_to_ic50(scores, max_ic50=self.max_ic50)
-
-    def predict(self, X):
-        raise ValueError("Method 'predict' not yet implemented for %s!" % (
-            self.__class__.__name__,))
-
-    def predict_peptides(
-            self,
-            peptides,
-            combine_fn=np.mean):
+    def predict_scores(self, peptides, combine_fn=np.mean):
        """
        Given a list of peptides of any length, returns an array of predicted
        normalized affinity values. Unlike IC50, a higher value here
@@ -144,3 +133,59 @@ class PredictorBase(object):
            for (p, ys) in multiple_predictions_dict.items()
        }
        return np.array([combined_predictions_dict[p] for p in peptides])
+
+    def predict(self, peptides):
+        """
+        Predict IC50 affinities for peptides of any length.
+
+        Parameters
+        ----------
+        peptides : list of str
+            Peptide sequences; they do not need to share a length.
+
+        Returns the scores from predict_scores() converted with
+        regression_target_to_ic50, using this predictor's max_ic50.
+        """
+        # predict_scores() is the any-length scoring method defined on this
+        # class; there is no predict_peptides() here to call.
+        scores = self.predict_scores(peptides)
+        return regression_target_to_ic50(scores, max_ic50=self.max_ic50)
+
+    def fit_dictionary(self, peptide_to_ic50_dict, **kwargs):
+        """
+        Train this predictor from a peptide->IC50 mapping.
+
+        The mapping is wrapped in a Dataset labelled with this predictor's
+        name as the allele; no per-sample weights are supplied here.
+
+        Parameters
+        ----------
+        peptide_to_ic50_dict : dict
+            Keys are peptide sequences, values are their IC50 measurements.
+
+        **kwargs : dict
+            Forwarded unchanged to fit_dataset().
+
+        Returns whatever fit_dataset() returns.
+        """
+        training_data = Dataset.from_peptide_to_affinity_dictionary(
+            allele_name=self.name,
+            peptide_to_affinity_dict=peptide_to_ic50_dict)
+        return self.fit_dataset(training_data, **kwargs)
+
+    def fit_dataset(self, dataset, pretraining_dataset=None, *args, **kwargs):
+        """
+        Fit the model parameters on the given training data.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            Training data; its encode() method must return the triple
+            (X, Y, sample_weights).
+
+        pretraining_dataset : Dataset, optional
+            Data to pre-train on, encoded the same way. When None, the
+            pre-training arrays are passed on as None.
+
+        *args
+            Accepted for backward compatibility; not forwarded.
+
+        **kwargs : dict
+            Extra arguments are passed on to the fit_kmer_encoded_arrays()
+            method.
+
+        Returns whatever fit_kmer_encoded_arrays() returns.
+        """
+        X, Y, sample_weights = dataset.encode()
+        if pretraining_dataset is None:
+            # The default must not be dereferenced.
+            # NOTE(review): presumably the array-level fit treats None as
+            # "no pre-training data" -- confirm against
+            # combine_training_arrays.
+            X_pretrain = Y_pretrain = sample_weights_pretrain = None
+        else:
+            X_pretrain, Y_pretrain, sample_weights_pretrain = \
+                pretraining_dataset.encode()
+        # NOTE(review): this called self.fit_arrays, which is not defined in
+        # the visible code; fit_kmer_encoded_arrays is the array-level fit
+        # method taking these keyword names -- confirm no subclass relies on
+        # fit_arrays.
+        return self.fit_kmer_encoded_arrays(
+            X=X,
+            Y=Y,
+            sample_weights=sample_weights,
+            X_pretrain=X_pretrain,
+            Y_pretrain=Y_pretrain,
+            # Pre-training samples get their own weights, not the training
+            # set's weights.
+            sample_weights_pretrain=sample_weights_pretrain,
+            **kwargs)
+
+    def fit_sequences(self, peptides, affinities, sample_weights=None, **kwargs):
+        """
+        Fit the model on parallel sequences of peptides and affinities.
+
+        Every sample is labelled with this predictor's name as its allele,
+        the samples are wrapped in a Dataset, and fitting is delegated to
+        fit_dataset().
+
+        Parameters
+        ----------
+        peptides : sequence of str
+            Must support len().
+
+        affinities : sequence
+            One affinity value per peptide.
+
+        sample_weights : sequence, optional
+            Passed through to Dataset.from_sequences() unchanged.
+
+        **kwargs : dict
+            Forwarded unchanged to fit_dataset().
+        """
+        n_samples = len(peptides)
+        training_data = Dataset.from_sequences(
+            alleles=[self.name] * n_samples,
+            peptides=peptides,
+            affinities=affinities,
+            sample_weights=sample_weights)
+        return self.fit_dataset(training_data, **kwargs)
--- a/test/test_allele_data.py
+++ b/test/test_allele_data.py
-from nose.tools import eq_
-from mhcflurry.data import (
-    create_allele_data_from_peptide_to_ic50_dict,
-    AlleleData
-)
-
-def test_create_allele_data_from_peptide_to_ic50_dict():
-    peptide_to_ic50_dict = {
-        ("A" * 10): 1.2,
-        ("C" * 9): 1000,
-    }
-    allele_data = create_allele_data_from_peptide_to_ic50_dict(
-        peptide_to_ic50_dict,
-        max_ic50=50000.0)
-    assert isinstance(allele_data, AlleleData)
-    expected_peptides = set([
-        "A" * 9,
-        "C" * 9,
-    ])
-    peptides = set(allele_data.peptides)
-    eq_(expected_peptides, peptides)
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
+from nose.tools import eq_
+from mhcflurry.dataset import Dataset
+
+def test_create_allele_data_from_single_allele_dict():
+    """
+    A Dataset built from a single-allele peptide->IC50 dictionary should
+    contain exactly the dictionary's peptides, at their original lengths.
+    """
+    peptide_to_ic50_dict = {
+        ("A" * 10): 1.2,
+        ("C" * 9): 1000,
+    }
+    dataset = Dataset.from_peptide_to_affinity_dictionary(
+        allele_name="A0201",
+        peptide_to_affinity_dict=peptide_to_ic50_dict)
+    assert isinstance(dataset, Dataset)
+
+    eq_(len(peptide_to_ic50_dict), len(dataset))
+    expected_peptides = sorted([
+        "A" * 10,
+        "C" * 9,
+    ])
+    # Compare whole sorted lists rather than zipping element pairs: zip()
+    # stops at the shorter input, so a missing or extra peptide would
+    # otherwise go unnoticed.
+    eq_(expected_peptides, sorted(dataset.peptides))
+    eq_(expected_peptides, sorted(dataset.unique_peptides()))
+
+# Allow running this test module directly as a script, without a test runner.
+if __name__ == "__main__":
+    test_create_allele_data_from_single_allele_dict()