From d293a5ee7d5545602731265e3a4c3895ca644e4d Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 18 May 2017 22:20:01 -0400
Subject: [PATCH] add class1 models download

---
 .../class1_binding_predictor.py               | 21 ++++++----
 mhcflurry/downloads.yml                       | 12 ++++--
 test/test_class1_binding_predictor_A0205.py   | 41 +++++++++++--------
 3 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/mhcflurry/class1_affinity_prediction/class1_binding_predictor.py b/mhcflurry/class1_affinity_prediction/class1_binding_predictor.py
index 1b569929..04b87d06 100644
--- a/mhcflurry/class1_affinity_prediction/class1_binding_predictor.py
+++ b/mhcflurry/class1_affinity_prediction/class1_binding_predictor.py
@@ -150,7 +150,7 @@ class Class1BindingPredictor(object):
                 self.hyperparameters['random_negative_rate'] +
                 self.hyperparameters['random_negative_constant'])
         num_random_negative = pandas.Series(num_random_negative)
-        logging.info("Random negative counts per length: %s" % (
+        logging.info("Random negative counts per length:\n%s" % (
             str(num_random_negative)))
 
         aa_distribution = None
@@ -160,7 +160,7 @@ class Class1BindingPredictor(object):
                 smoothing=self.hyperparameters[
                     'random_negative_distribution_smoothing'])
             logging.info(
-                "Using amino acid distribution for random negative: %s" % (
+                "Using amino acid distribution for random negative:\n%s" % (
                 str(aa_distribution)))
 
         y_values = from_ic50(affinities)
@@ -224,17 +224,14 @@ class Class1BindingPredictor(object):
                 "peptide": numpy.concatenate([
                     random_negative_peptides_encoding,
                     peptide_encoding,
-                ])
+                ]) if len(random_negative_peptides_encoding) > 0
+                else peptide_encoding
             }
             if pseudosequence_length:
                 # TODO: add random pseudosequences for random negative peptides
                 raise NotImplemented(
                     "Allele pseudosequences unsupported with random negatives")
 
-            logging.info("Epoch %3d / %3d. Min val loss at epoch %s" % (
-                i,
-                self.hyperparameters['max_epochs'],
-                min_val_loss_iteration))
             fit_history = self.network.fit(
                 x_dict_with_random_negatives,
                 y_dict_with_random_negatives,
@@ -248,6 +245,13 @@ class Class1BindingPredictor(object):
             for (key, value) in fit_history.history.items():
                 self.fit_history[key].extend(value)
 
+            logging.info(
+                "Epoch %3d / %3d: loss=%g. Min val loss at epoch %s" % (
+                    i,
+                    self.hyperparameters['max_epochs'],
+                    self.fit_history['loss'][-1],
+                    min_val_loss_iteration))
+
             if self.hyperparameters['validation_split']:
                 val_loss = fit_history.history['val_loss'][-1]
                 val_losses.append(val_loss)
@@ -273,7 +277,8 @@ class Class1BindingPredictor(object):
             pseudosequences_input = self.pseudosequence_to_network_input(
                 allele_pseudosequences)
             x_dict['pseudosequence'] = pseudosequences_input
-        return numpy.array(self.network.predict(x_dict))
+        (predictions,) = numpy.array(self.network.predict(x_dict)).T
+        return to_ic50(predictions)
 
     @staticmethod
     def make_network(
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 76f420fb..703ca938 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -20,6 +20,14 @@ releases:
     1.0.0:
         compatibility-version: 2
         downloads:
+            - name: models_class1
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/models_class1.tar.bz2
+              default: true
+
+            - name: data_curated
+              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_curated.tar.bz2
+              default: true
+
             - name: data_kim2014
               url: http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/data_kim2014.tar.bz2
               default: false
@@ -28,10 +36,6 @@ releases:
               url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_iedb.tar.bz2
               default: false
 
-            - name: data_curated
-              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_curated.tar.bz2
-              default: true
-
     0.2.0:
         compatibility-version: 1
         downloads:
diff --git a/test/test_class1_binding_predictor_A0205.py b/test/test_class1_binding_predictor_A0205.py
index fb3df4a7..451d3ca7 100644
--- a/test/test_class1_binding_predictor_A0205.py
+++ b/test/test_class1_binding_predictor_A0205.py
@@ -1,7 +1,7 @@
-import numpy as np
-np.random.seed(0)
+import numpy
+import pandas
+numpy.random.seed(0)
 
-from mhcflurry.affinity_measurement_dataset import AffinityMeasurementDataset
 from mhcflurry import Class1BindingPredictor
 
 from nose.tools import eq_
@@ -11,27 +11,32 @@ from mhcflurry.downloads import get_path
 
 
 def test_class1_binding_predictor_A0205_training_accuracy():
-    dataset = AffinityMeasurementDataset.from_csv(get_path(
-        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
-    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")
-    dataset_a0205 = AffinityMeasurementDataset(
-        dataset_a0205_all_lengths._df.ix[
-            dataset_a0205_all_lengths._df.peptide.str.len() == 9])
+    df = pandas.read_csv(
+        get_path(
+            "data_curated", "curated_training_data.csv.bz2"))
+    df = df.ix[df.allele == "HLA-A*02:05"]
+    df = df.ix[
+        df.peptide.str.len() == 9
+    ]
+    df = df.ix[
+        df.measurement_type == "quantitative"
+    ]
+    df = df.ix[
+        df.measurement_source == "kim2014"
+    ]
 
     predictor = Class1BindingPredictor(
-        name="A0205",
-        embedding_output_dim=32,
         activation="tanh",
         layer_sizes=[64],
-        optimizer="adam",
+        max_epochs=1000,  # Memorize the dataset.
+        early_stopping=False,
         dropout_probability=0.0)
-    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)
-    peptides = dataset_a0205.peptides
-    ic50_pred = predictor.predict(peptides)
-    ic50_true = dataset_a0205.affinities
+    predictor.fit(df.peptide.values, df.measurement_value.values)
+    ic50_pred = predictor.predict(df.peptide.values)
+    ic50_true = df.measurement_value.values
     eq_(len(ic50_pred), len(ic50_true))
     testing.assert_allclose(
-        np.log(ic50_pred),
-        np.log(ic50_true),
+        numpy.log(ic50_pred),
+        numpy.log(ic50_true),
         rtol=0.2,
         atol=0.2)
-- 
GitLab