From e554cf2a010b4156a2d16084a0d4d71d43cd0bd5 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Wed, 4 May 2016 15:26:09 -0400
Subject: [PATCH] reorganize code for easier testing and sanity-check some
 strangeness with the learning rate

---
 mhcflurry/amino_acid.py                       |   3 +
 mhcflurry/class1_binding_predictor.py         | 110 +++++++++---------
 mhcflurry/common.py                           |  35 ------
 mhcflurry/data.py                             |   4 +-
 mhcflurry/ensemble.py                         |   2 +-
 mhcflurry/feedforward.py                      |  51 ++++----
 ...ters.py => feedforward_hyperparameters.py} |  21 +++-
 mhcflurry/imputation.py                       |   2 +-
 mhcflurry/predictor_base.py                   |  46 +++++---
 mhcflurry/regression_target.py                |  51 ++++++++
 mhcflurry/serialization_helpers.py            |  55 +++++++--
 mhcflurry/training_helpers.py                 |  43 +++++++
 ...rry-train-class1-allele-specific-models.py |  12 +-
 test/test_class1_binding_predictor.py         |   9 --
 test/test_imputation.py                       |  15 ++-
 test/test_neural_nets.py                      |   9 +-
 ...h_peptides.py => test_peptide_encoding.py} |   0
 ...st_common.py => test_regression_target.py} |   2 +-
 test/test_serialization.py                    |  30 +++++
 test/test_training_helpers.py                 |  22 ++++
 20 files changed, 347 insertions(+), 175 deletions(-)
 rename mhcflurry/{class1_allele_specific_hyperparameters.py => feedforward_hyperparameters.py} (80%)
 create mode 100644 mhcflurry/regression_target.py
 create mode 100644 mhcflurry/training_helpers.py
 rename test/{test_fixed_length_peptides.py => test_peptide_encoding.py} (100%)
 rename test/{test_common.py => test_regression_target.py} (90%)
 create mode 100644 test/test_serialization.py
 create mode 100644 test/test_training_helpers.py

diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py
index 9c06fada..3762a42e 100644
--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -53,6 +53,9 @@ class Alphabet(object):
     def __setitem__(self, k, v):
         self.add(k, v)
 
+    def __len__(self):
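+        """
+        Number of letters in this alphabet.
+        """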
+        return len(self.letters_to_names)
+
     def index_encoding_list(self, peptides):
         index_dict = self.index_dict()
         return [
diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py
index b96f1d97..86712389 100644
--- a/mhcflurry/class1_binding_predictor.py
+++ b/mhcflurry/class1_binding_predictor.py
@@ -28,21 +28,23 @@ import numpy as np
 
 from .common import normalize_allele_name
 from .paths import CLASS1_MODEL_DIRECTORY
-from .feedforward import make_embedding_network
+from .feedforward import make_embedding_network, compile_network
 from .predictor_base import PredictorBase
 from .serialization_helpers import (
     load_keras_model_from_disk,
     save_keras_model_to_disk
 )
 from .peptide_encoding import check_valid_index_encoding_array
-from .class1_allele_specific_hyperparameters import MAX_IC50
+from .feedforward_hyperparameters import LOSS, OPTIMIZER
+from .regression_target import MAX_IC50
+from .training_helpers import check_training_data_shapes
 
 _allele_predictor_cache = {}
 
 class Class1BindingPredictor(PredictorBase):
     """
     Allele-specific Class I MHC binding predictor which uses
-    fixed-length (9mer) index encoding for inputs and outputs
+    fixed-length (k-mer) index encoding for inputs and outputs
     a value between 0 and 1 (where 1 is the strongest binder).
     """
     def __init__(
@@ -51,13 +53,15 @@ class Class1BindingPredictor(PredictorBase):
             name=None,
             max_ic50=MAX_IC50,
             allow_unknown_amino_acids=True,
-            verbose=False):
+            verbose=False,
+            kmer_size=9):
         PredictorBase.__init__(
             self,
             name=name,
             max_ic50=max_ic50,
             allow_unknown_amino_acids=allow_unknown_amino_acids,
-            verbose=verbose)
+            verbose=verbose,
+            kmer_size=kmer_size)
         self.name = name
         self.model = model
 
@@ -66,6 +70,9 @@ class Class1BindingPredictor(PredictorBase):
             cls,
             model_json_path,
             weights_hdf_path=None,
+            name=None,
+            optimizer=OPTIMIZER,
+            loss=LOSS,
             **kwargs):
         """
         Load model from stored JSON representation of network and
@@ -74,7 +81,8 @@ class Class1BindingPredictor(PredictorBase):
         model = load_keras_model_from_disk(
             model_json_path,
             weights_hdf_path,
-            name=None)
+            name=name)
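+        # the serialized JSON stores only the architecture, so the
+        # loaded model must be recompiled before use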
+        compile_network(model, loss=loss, optimizer=optimizer)
         return cls(model=model, **kwargs)
 
     def to_disk(self, model_json_path, weights_hdf_path, overwrite=False):
@@ -97,10 +105,10 @@ class Class1BindingPredictor(PredictorBase):
             layer_sizes=[50],
             activation="tanh",
             init="lecun_uniform",
-            loss="mse",
             output_activation="sigmoid",
             dropout_probability=0,
-            learning_rate=0.001,
+            loss=LOSS,
+            optimizer=OPTIMIZER,
             **kwargs):
         """
         Create untrained predictor with the given hyperparameters.
@@ -114,14 +122,15 @@ class Class1BindingPredictor(PredictorBase):
             activation=activation,
             init=init,
             loss=loss,
+            optimizer=optimizer,
             output_activation=output_activation,
-            dropout_probability=dropout_probability,
-            learning_rate=learning_rate)
+            dropout_probability=dropout_probability)
         return cls(
             name=name,
             max_ic50=max_ic50,
             model=model,
             allow_unknown_amino_acids=allow_unknown_amino_acids,
+            kmer_size=peptide_length,
             **kwargs)
 
     def _combine_training_data(
@@ -143,32 +152,12 @@ class Class1BindingPredictor(PredictorBase):
         X = np.asarray(X)
         Y = np.asarray(Y)
 
-        if verbose:
-            print("X.shape = %s" % (X.shape,))
-
-        n_samples, n_dims = X.shape
-
-        if len(Y) != n_samples:
-            raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % (
-                n_samples, len(Y)))
-
-        if Y.min() < 0:
-            raise ValueError("Minimum value of Y can't be negative, got %f" % (
-                Y.min()))
-        if Y.max() > 1:
-            raise ValueError("Maximum value of Y can't be greater than 1, got %f" % (
-                Y.max()))
-
         if sample_weights is None:
             sample_weights = np.ones_like(Y)
         else:
             sample_weights = np.asarray(sample_weights)
 
-        if len(sample_weights) != n_samples:
-            raise ValueError(
-                "Length of sample_weights (%d) doesn't match number of samples (%d)" % (
-                    len(sample_weights),
-                    n_samples))
+        n_samples, n_dims = check_training_data_shapes(X, Y, sample_weights)
 
         if X_pretrain is None or Y_pretrain is None:
             X_pretrain = np.empty((0, n_dims), dtype=X.dtype)
@@ -177,35 +166,29 @@ class Class1BindingPredictor(PredictorBase):
             X_pretrain = np.asarray(X_pretrain)
             Y_pretrain = np.asarray(Y_pretrain)
 
-        if verbose:
-            print("X_pretrain.shape = %s" % (X_pretrain.shape,))
-        n_pretrain_samples, n_pretrain_dims = X_pretrain.shape
-        if n_pretrain_dims != n_dims:
-            raise ValueError(
-                "# of dims for pretraining data (%d) doesn't match X.shape[1] = %d" % (
-                    n_pretrain_dims, n_dims))
+        if sample_weights_pretrain is None:
+            sample_weights_pretrain = np.ones_like(Y_pretrain)
+        else:
+            sample_weights_pretrain = np.asarray(sample_weights_pretrain)
 
-        if len(Y_pretrain) != n_pretrain_samples:
-            raise ValueError(
-                "length of Y_pretrain (%d) != length of X_pretrain (%d)" % (
-                    len(Y_pretrain),
-                    len(X_pretrain)))
+        n_pretrain_samples, n_pretrain_dims = check_training_data_shapes(
+            X_pretrain, Y_pretrain, sample_weights_pretrain)
+
+        if Y.min() < 0:
+            raise ValueError("Minimum value of Y can't be negative, got %f" % (
+                Y.min()))
+        if Y.max() > 1:
+            raise ValueError("Maximum value of Y can't be greater than 1, got %f" % (
+                Y.max()))
 
         if len(Y_pretrain) > 0 and Y_pretrain.min() < 0:
             raise ValueError("Minimum value of Y_pretrain can't be negative, got %f" % (
                 Y.min()))
+
         if len(Y_pretrain) > 0 and Y_pretrain.max() > 1:
             raise ValueError("Maximum value of Y_pretrain can't be greater than 1, got %f" % (
                 Y.max()))
 
-        if sample_weights_pretrain is None:
-            sample_weights_pretrain = np.ones_like(Y_pretrain)
-        else:
-            sample_weights_pretrain = np.asarray(sample_weights_pretrain)
-        if verbose:
-            print("sample weights mean = %f, pretrain weights mean = %f" % (
-                sample_weights.mean(),
-                sample_weights_pretrain.mean()))
         X_combined = np.vstack([X_pretrain, X])
         Y_combined = np.concatenate([Y_pretrain, Y])
         combined_weights = np.concatenate([
@@ -220,16 +203,30 @@ class Class1BindingPredictor(PredictorBase):
         Extend training data with randomly generated negative samples.
         This only works since most random peptides will not bind to a
         particular allele.
+
+        Parameters
+        ----------
+        X : numpy.ndarray
+            2d array of integer amino acid encodings
+
+        Y : numpy.ndarray
+            1d array of regression targets
+
+        weights : numpy.ndarray
+            1d array of sample weights (must be same length as X and Y)
+
+        n_random_negative_samples : int
+            Number of random negative samples to create
+
+        Returns X, Y, weights (extended with random negative samples)
         """
         assert len(X) == len(Y) == len(weights)
         if n_random_negative_samples == 0:
             return X, Y, weights
-        min_value = X.min()
-        max_value = X.max()
         n_cols = X.shape[1]
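+        # sample uniformly over all valid amino acid indices; the `high`
+        # bound of np.random.randint is exclusive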
         X_random = np.random.randint(
-            low=min_value,
-            high=max_value + 1,
+            low=0,
+            high=self.max_amino_acid_encoding_value,
             size=(n_random_negative_samples, n_cols)).astype(X.dtype)
         Y_random = np.zeros(n_random_negative_samples, dtype=float)
         weights_random = np.ones(n_random_negative_samples, dtype=float)
@@ -257,7 +254,8 @@ class Class1BindingPredictor(PredictorBase):
             verbose=False,
             batch_size=128):
         """
-        Train predictive model from index encoding of fixed length 9mer peptides.
+        Train predictive model from index encoding of fixed length k-mer
+        peptides.
 
         Parameters
         ----------
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index a3318c2f..5fd34782 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -17,7 +17,6 @@ from __future__ import (
     division,
     absolute_import,
 )
-import numpy as np
 
 def parse_int_list(s):
-    return [int(part.strip() for part in s.split(","))]
+    return [int(part.strip()) for part in s.split(",")]
@@ -65,37 +64,3 @@ def split_allele_names(s):
         for part
         in s.split(",")
     ]
-
-def ic50_to_regression_target(ic50, max_ic50):
-    """
-    Transform IC50 inhibitory binding concentrations to affinity values between
-    [0,1] where 0 means a value greater or equal to max_ic50 and 1 means very
-    strong binder.
-
-    Parameters
-    ----------
-    ic50 : numpy.ndarray
-
-    max_ic50 : float
-    """
-    log_ic50 = np.log(ic50) / np.log(max_ic50)
-    regression_target = 1.0 - log_ic50
-    # clamp to values between 0, 1
-    regression_target = np.maximum(regression_target, 0.0)
-    regression_target = np.minimum(regression_target, 1.0)
-    return regression_target
-
-def regression_target_to_ic50(y, max_ic50):
-    """
-    Transform values between [0,1] to IC50 inhibitory binding concentrations
-    between [1.0, infinity]
-
-    Parameters
-    ----------
-    y : numpy.ndarray of float
-
-    max_ic50 : float
-
-    Returns numpy.ndarray
-    """
-    return max_ic50 ** (1.0 - y)
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index 1ce28b2f..136449af 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -22,14 +22,14 @@ from collections import namedtuple, defaultdict
 import pandas as pd
 import numpy as np
 
-from .common import normalize_allele_name, ic50_to_regression_target
+from .common import normalize_allele_name
 from .amino_acid import common_amino_acids
 from .peptide_encoding import (
     indices_to_hotshot_encoding,
     fixed_length_index_encoding,
     check_valid_index_encoding_array,
 )
-from .class1_allele_specific_hyperparameters import MAX_IC50
+from .regression_target import MAX_IC50, ic50_to_regression_target
 
 index_encoding = common_amino_acids.index_encoding
 
diff --git a/mhcflurry/ensemble.py b/mhcflurry/ensemble.py
index 10880ce3..cc304906 100644
--- a/mhcflurry/ensemble.py
+++ b/mhcflurry/ensemble.py
@@ -17,7 +17,7 @@ from os.path import splitext, join
 
 import numpy as np
 
-from .class1_allele_specific_hyperparameters import MAX_IC50
+from .regression_target import MAX_IC50
 from .predictor_base import PredictorBase
 
 class Ensemble(PredictorBase):
diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py
index 11de3636..1d16e53f 100644
--- a/mhcflurry/feedforward.py
+++ b/mhcflurry/feedforward.py
@@ -17,39 +17,43 @@ from __future__ import (
     division,
     absolute_import,
 )
+import logging
 
-import keras
 from keras.models import Sequential
 from keras.layers.core import Dense, Activation, Flatten, Dropout
 from keras.layers.embeddings import Embedding
-
 import theano
+
+from .feedforward_hyperparameters import OPTIMIZER, LOSS, ACTIVATION
+
 theano.config.exception_verbosity = 'high'
 
 
+def compile_network(model, optimizer=OPTIMIZER, loss=LOSS):
+    """
+    Compile a keras network and return it.
+    """
+    logging.info("Compiling %s with optimizer=%s, loss=%s" % (
+        model, optimizer, loss))
+    model.compile(loss=loss, optimizer=optimizer)
+    return model
+
 def make_network(
         input_size,
         embedding_input_dim=None,
         embedding_output_dim=None,
         layer_sizes=[100],
-        activation="relu",
+        activation=ACTIVATION,
         init="lecun_uniform",
-        loss="mse",
         output_activation="sigmoid",
         dropout_probability=0.0,
         model=None,
-        optimizer=None,
-        learning_rate=0.001):
+        optimizer=OPTIMIZER,
+        loss=LOSS):
 
     if model is None:
         model = Sequential()
 
-    if optimizer is None:
-        optimizer = keras.optimizers.RMSprop(
-            lr=learning_rate,
-            rho=0.9,
-            epsilon=1e-6)
-
     if embedding_input_dim:
         if not embedding_output_dim:
             raise ValueError(
@@ -93,20 +97,18 @@ def make_network(
         output_dim=1,
         init=init))
     model.add(Activation(output_activation))
-    model.compile(loss=loss, optimizer=optimizer)
-    return model
+    return compile_network(model, optimizer=optimizer, loss=loss)
 
 
 def make_hotshot_network(
         peptide_length=9,
         layer_sizes=[100],
-        activation="relu",
+        activation=ACTIVATION,
         init="lecun_uniform",
-        loss="mse",
         output_activation="sigmoid",
         dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001,
+        optimizer=OPTIMIZER,
+        loss=LOSS,
         n_amino_acids=20):
     return make_network(
         input_size=peptide_length * n_amino_acids,
@@ -116,8 +118,7 @@ def make_hotshot_network(
         loss=loss,
         output_activation=output_activation,
         dropout_probability=dropout_probability,
-        optimizer=optimizer,
-        learning_rate=learning_rate)
+        optimizer=optimizer)
 
 
 def make_embedding_network(
@@ -125,13 +126,12 @@ def make_embedding_network(
         embedding_input_dim=20,
         embedding_output_dim=20,
         layer_sizes=[100],
-        activation="relu",
+        activation=ACTIVATION,
         init="lecun_uniform",
-        loss="mse",
         output_activation="sigmoid",
         dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001):
+        loss=LOSS,
+        optimizer=OPTIMIZER):
     return make_network(
         input_size=peptide_length,
         embedding_input_dim=embedding_input_dim,
@@ -142,5 +142,4 @@ def make_embedding_network(
         loss=loss,
         output_activation=output_activation,
         dropout_probability=dropout_probability,
-        optimizer=optimizer,
-        learning_rate=learning_rate)
+        optimizer=optimizer)
diff --git a/mhcflurry/class1_allele_specific_hyperparameters.py b/mhcflurry/feedforward_hyperparameters.py
similarity index 80%
rename from mhcflurry/class1_allele_specific_hyperparameters.py
rename to mhcflurry/feedforward_hyperparameters.py
index cfdf846b..55ae0fcb 100644
--- a/mhcflurry/class1_allele_specific_hyperparameters.py
+++ b/mhcflurry/feedforward_hyperparameters.py
@@ -12,14 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .regression_target import MAX_IC50
+
 N_EPOCHS = 250
 ACTIVATION = "tanh"
 INITIALIZATION_METHOD = "lecun_uniform"
 EMBEDDING_DIM = 32
 HIDDEN_LAYER_SIZE = 100
 DROPOUT_PROBABILITY = 0.1
-MAX_IC50 = 50000.0
 LEARNING_RATE = 0.001
+OPTIMIZER = "rmsprop"
+LOSS = "mse"
 
 def add_hyperparameter_arguments_to_parser(parser):
     """
@@ -31,6 +34,9 @@ def add_hyperparameter_arguments_to_parser(parser):
         --hidden-layer-size
         --dropout
         --max-ic50
+        --random-negative-samples
+        --imputation-method
+        --learning-rate
     """
     parser.add_argument(
         "--training-epochs",
@@ -82,4 +88,17 @@ def add_hyperparameter_arguments_to_parser(parser):
         default=0.001,
         help="Learning rate for training neural network. Default: %(default)s")
 
+    parser.add_argument(
+        "--random-negative-samples",
+        type=int,
+        default=0,
+        help="Number of random negtive samples to generate each training epoch")
+
+    parser.add_argument(
+        "--imputation-method",
+        default="none",
+        choices=("mice", "knn", "softimpute", "svd", "mean", "none"),
+        type=lambda s: s.strip().lower(),
+        help="Use the given imputation method to generate data for pre-training models")
+
     return parser
diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py
index 115d66a2..018356f3 100644
--- a/mhcflurry/imputation.py
+++ b/mhcflurry/imputation.py
@@ -31,7 +31,7 @@ from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary
 from .data import (
     create_allele_data_from_peptide_to_ic50_dict,
 )
-from .common import regression_target_to_ic50
+from .regression_target import regression_target_to_ic50
 
 def _check_dense_pMHC_array(X, peptide_list, allele_list):
     if len(peptide_list) != len(set(peptide_list)):
diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py
index a335158c..fee86b7d 100644
--- a/mhcflurry/predictor_base.py
+++ b/mhcflurry/predictor_base.py
@@ -27,7 +27,7 @@ from .amino_acid import (
     amino_acids_with_unknown,
     common_amino_acids
 )
-from .common import regression_target_to_ic50
+from .regression_target import regression_target_to_ic50, MAX_IC50
 
 
 class PredictorBase(object):
@@ -38,19 +38,30 @@ class PredictorBase(object):
     def __init__(
             self,
             name,
-            max_ic50,
             allow_unknown_amino_acids,
-            verbose):
+            verbose,
+            max_ic50=MAX_IC50,
+            kmer_size=9):
         self.name = name
         self.max_ic50 = max_ic50
         self.allow_unknown_amino_acids = allow_unknown_amino_acids
         self.verbose = verbose
+        self.kmer_size = kmer_size
 
-    def encode_9mer_peptides(self, peptides):
+    @property
+    def amino_acids(self):
+        """
+        Amino acid alphabet used for encoding peptides; may include
+        "X" if allow_unknown_amino_acids is True.
+        """
         if self.allow_unknown_amino_acids:
-            return amino_acids_with_unknown.index_encoding(peptides, 9)
+            return amino_acids_with_unknown
         else:
-            return common_amino_acids.index_encoding(peptides, 9)
+            return common_amino_acids
+
+    @property
+    def max_amino_acid_encoding_value(self):
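+        """
+        Number of distinct amino acid indices, used as the exclusive
+        upper bound when generating random index-encoded peptides.
+        """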
+        return len(self.amino_acids)
 
     def encode_peptides(self, peptides):
         """
@@ -68,28 +79,28 @@ class PredictorBase(object):
         for i, peptide in enumerate(peptides):
             matrix, _, _, _ = fixed_length_index_encoding(
                 peptides=[peptide],
-                desired_length=9,
+                desired_length=self.kmer_size,
                 allow_unknown_amino_acids=self.allow_unknown_amino_acids)
             encoded_matrices.append(matrix)
             indices.extend([i] * len(matrix))
         combined_matrix = np.concatenate(encoded_matrices)
         index_array = np.array(indices)
-        expected_shape = (len(index_array), 9)
+        expected_shape = (len(index_array), self.kmer_size)
         assert combined_matrix.shape == expected_shape, \
             "Expected shape %s but got %s" % (expected_shape, combined_matrix.shape)
         return combined_matrix, index_array
 
-    def predict_9mer_peptides(self, peptides):
+    def _predict_kmer_peptides(self, peptides):
         """
-        Predict binding affinity for 9mer peptides
+        Predict binding affinity for k-mer peptides
         """
-        if any(len(peptide) != 9 for peptide in peptides):
+        if any(len(peptide) != self.kmer_size for peptide in peptides):
             raise ValueError("Can only predict 9mer peptides")
         X, _ = self.encode_peptides(peptides)
         return self.predict(X)
 
-    def predict_9mer_peptides_ic50(self, peptides):
-        scores = self.predict_9mer_peptides(peptides)
+    def _predict_kmer_peptides_ic50(self, peptides):
+        scores = self._predict_kmer_peptides(peptides)
         return regression_target_to_ic50(scores, max_ic50=self.max_ic50)
 
     def predict_peptides_ic50(self, peptides):
@@ -111,26 +122,25 @@ class PredictorBase(object):
         Given a list of peptides of any length, returns an array of predicted
         normalized affinity values. Unlike IC50, a higher value here
-        means a stronger affinity. Peptides of lengths other than 9 are
-        transformed into a set of 9mers either by deleting or inserting
+        means a stronger affinity. Peptides of lengths other than kmer_size
+        are transformed into a set of k-mers either by deleting or inserting
         amino acid characters. The prediction for a single peptide will be
-        the average of expanded 9mers.
+        the average of the predictions for its expanded k-mers.
         """
         if isinstance(peptides, string_types):
             raise TypeError("Input must be a list of peptides, not %s : %s" % (
                 peptides, type(peptides)))
 
         input_matrix, original_peptide_indices = self.encode_peptides(peptides)
-        # non-9mer peptides get multiple predictions, which are then combined
-        # with the combine_fn argument
+        # peptides of lengths other than self.kmer_size get multiple predictions,
+        # which are then combined with the combine_fn argument
         multiple_predictions_dict = defaultdict(list)
         fixed_length_predictions = self.predict(input_matrix)
         for i, yi in enumerate(fixed_length_predictions):
             original_peptide_index = original_peptide_indices[i]
             original_peptide = peptides[original_peptide_index]
             multiple_predictions_dict[original_peptide].append(yi)
-
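+        # single (unexpanded) predictions pass through unchanged; only
+        # expanded peptides need combine_fn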
         combined_predictions_dict = {
-            p: combine_fn(ys)
+            p: combine_fn(ys) if len(ys) > 1 else ys[0]
             for (p, ys) in multiple_predictions_dict.items()
         }
         return np.array([combined_predictions_dict[p] for p in peptides])
diff --git a/mhcflurry/regression_target.py b/mhcflurry/regression_target.py
new file mode 100644
index 00000000..66ca26cd
--- /dev/null
+++ b/mhcflurry/regression_target.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+MAX_IC50 = 50000.0
+
+def ic50_to_regression_target(ic50, max_ic50=MAX_IC50):
+    """
+    Transform IC50 inhibitory binding concentrations to affinity values between
+    [0,1] where 0 means a value greater or equal to max_ic50 and 1 means very
+    strong binder.
+
+    Parameters
+    ----------
+    ic50 : numpy.ndarray
+
+    max_ic50 : float
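+
+    Example (illustrative): the transform is 1 - log(ic50)/log(max_ic50),
+    clamped to [0, 1], so the endpoints are exact:
+
+    >>> list(ic50_to_regression_target(np.array([1.0, 50000.0]), 50000.0))
+    [1.0, 0.0]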
+    """
+    log_ic50 = np.log(ic50) / np.log(max_ic50)
+    regression_target = 1.0 - log_ic50
+    # clamp to values between 0, 1
+    regression_target = np.maximum(regression_target, 0.0)
+    regression_target = np.minimum(regression_target, 1.0)
+    return regression_target
+
+def regression_target_to_ic50(y, max_ic50=MAX_IC50):
+    """
+    Transform values between [0,1] to IC50 inhibitory binding concentrations
+    between [1.0, infinity]
+
+    Parameters
+    ----------
+    y : numpy.ndarray of float
+
+    max_ic50 : float
+
+    Returns numpy.ndarray
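+
+    Example (illustrative): inverts ic50_to_regression_target on
+    [1.0, max_ic50]:
+
+    >>> list(regression_target_to_ic50(np.array([1.0, 0.0]), 50000.0))
+    [1.0, 50000.0]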
+    """
+    return max_ic50 ** (1.0 - y)
diff --git a/mhcflurry/serialization_helpers.py b/mhcflurry/serialization_helpers.py
index 7089b65b..be20e92e 100644
--- a/mhcflurry/serialization_helpers.py
+++ b/mhcflurry/serialization_helpers.py
@@ -28,10 +28,25 @@ import json
 
 from keras.models import model_from_config
 
+from .feedforward import compile_network
+
 def load_keras_model_from_disk(
         model_json_path,
         weights_hdf_path,
         name=None):
+    """
+    Loads a model from two files on disk: a JSON configuration and HDF5 weights.
+
+    Parameters
+    ----------
+    model_json_path : str
+
+    weights_hdf_path : str
+
+    name : str, optional
+
+    Returns a Keras model.
+    """
 
     if not exists(model_json_path):
         raise ValueError("Model file %s (name = %s) not found" % (
@@ -42,13 +57,15 @@ def load_keras_model_from_disk(
 
     model = model_from_config(config_dict)
 
-    if weights_hdf_path:
+    if weights_hdf_path is not None:
         if not exists(weights_hdf_path):
             raise ValueError(
-                "Missing model weights file %s (name = %s)" % (
-                    weights_hdf_path, name))
-
+                "Missing model weights file %s (name = %s)" % (weights_hdf_path, name))
         model.load_weights(weights_hdf_path)
+
+    # In some cases a loaded model can't be used for prediction until
+    # it has been compiled first.
+    compile_network(model)
     return model
 
 
@@ -56,8 +73,23 @@ def save_keras_model_to_disk(
         model,
         model_json_path,
         weights_hdf_path,
-        overwrite=False,
-        name=None):
+        overwrite=False):
+    """
+    Write a Keras model configuration to disk along with its weights.
+
+    Parameters
+    ----------
+    model : keras model
+
+    model_json_path : str
+        Path to JSON file where model configuration will be saved.
+
+    weights_hdf_path : str
+        Path to HDF5 file where model weights will be saved.
+
+    overwrite : bool
+        Overwrite existing files?
+    """
     if exists(model_json_path) and overwrite:
         logging.info(
             "Removing existing model JSON file '%s'" % (
@@ -69,9 +101,10 @@ def save_keras_model_to_disk(
             "Model JSON file '%s' already exists" % (model_json_path,))
     else:
         logging.info(
-            "Saving model file %s (name=%s)" % (model_json_path, name))
+            "Saving model file %s" % (model_json_path,))
         with open(model_json_path, "w") as f:
-            f.write(model.to_json())
+            config = model.get_config()
+            json.dump(config, f)
 
     if exists(weights_hdf_path) and overwrite:
         logging.info(
@@ -81,10 +114,8 @@ def save_keras_model_to_disk(
 
     if exists(weights_hdf_path):
         logging.warn(
-            "Model weights HDF5 file '%s' already exists" % (
-                weights_hdf_path,))
+            "Model weights HDF5 file '%s' already exists" % (weights_hdf_path,))
     else:
         logging.info(
-            "Saving model weights HDF5 file %s (name=%s)" % (
-                weights_hdf_path, name))
+            "Saving model weights HDF5 file %s" % (weights_hdf_path,))
         model.save_weights(weights_hdf_path)
diff --git a/mhcflurry/training_helpers.py b/mhcflurry/training_helpers.py
new file mode 100644
index 00000000..65cea3c6
--- /dev/null
+++ b/mhcflurry/training_helpers.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+def check_training_data_shapes(X, Y, weights):
+    """
+    Check that the shapes of X, Y, and weights are mutually compatible.
+
+    Returns the numbers of rows and columns in X.
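+
+    Example (illustrative): for X of shape (40, 2) with Y and weights
+    both of length 40, this returns (40, 2); any length or dimension
+    mismatch raises ValueError.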
+    """
+    if len(X.shape) != 2:
+        raise ValueError("Expected X to be 2d, got shape: %s" % (X.shape,))
+
+    if len(Y.shape) != 1:
+        raise ValueError("Expected Y to be 1d, got shape: %s" % (Y.shape,))
+
+    if len(weights.shape) != 1:
+        raise ValueError("Expected weights to be 1d, got shape: %s" % (
+            weights.shape,))
+
+    n_samples, n_dims = X.shape
+
+    if len(Y) != n_samples:
+        raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % (
+            n_samples, len(Y)))
+
+    if len(weights) != n_samples:
+        raise ValueError(
+            "Length of sample_weights (%d) doesn't match number of samples (%d)" % (
+                len(weights),
+                n_samples))
+
+    return n_samples, n_dims
diff --git a/script/mhcflurry-train-class1-allele-specific-models.py b/script/mhcflurry-train-class1-allele-specific-models.py
index d2755ce2..a925d640 100755
--- a/script/mhcflurry-train-class1-allele-specific-models.py
+++ b/script/mhcflurry-train-class1-allele-specific-models.py
@@ -41,6 +41,8 @@ from os import makedirs, remove
 from os.path import exists, join
 import argparse
 
+from keras.optimizers import RMSprop
+
 from mhcflurry.common import normalize_allele_name
 from mhcflurry.data import load_allele_datasets
 from mhcflurry.class1_binding_predictor import Class1BindingPredictor
@@ -92,13 +94,6 @@ parser.add_argument(
     nargs="+",
     type=normalize_allele_name)
 
-parser.add_argument(
-    "--imputation-method",
-    default=None,
-    choices=("mice", "knn", "softimpute", "svd", "mean"),
-    type=lambda s: s.strip().lower(),
-    help="Use the given imputation method to generate data for pre-training models")
-
 # add options for neural network hyperparameters
 parser = add_hyperparameter_arguments_to_parser(parser)
 
@@ -183,7 +178,7 @@ if __name__ == "__main__":
             activation=args.activation,
             init=args.initialization,
             dropout_probability=args.dropout,
-            learning_rate=args.learning_rate)
+            optimizer=RMSprop(lr=args.learning_rate))
 
         json_filename = allele_name + ".json"
         json_path = join(args.output_dir, json_filename)
@@ -215,6 +210,7 @@ if __name__ == "__main__":
             Y_pretrain=Y_pretrain,
             sample_weights_pretrain=weights_pretrain,
             n_training_epochs=args.training_epochs,
+            n_random_negative_samples=args.random_negative_samples,
             verbose=True)
 
         model.to_disk(
diff --git a/test/test_class1_binding_predictor.py b/test/test_class1_binding_predictor.py
index 7039fb6b..c17d1eaa 100644
--- a/test/test_class1_binding_predictor.py
+++ b/test/test_class1_binding_predictor.py
@@ -30,11 +30,6 @@ def test_always_zero_9mer_inputs():
     assert len(y) == n_expected
     assert np.all(y == 0)
 
-    # call the predict method for 9mers directly
-    y = always_zero_predictor_with_unknown_AAs.predict_9mer_peptides(test_9mer_peptides)
-    assert len(y) == n_expected
-    assert np.all(y == 0)
-
     ic50 = always_zero_predictor_with_unknown_AAs.predict_peptides_ic50(test_9mer_peptides)
-    assert len(y) == n_expected
+    assert len(ic50) == n_expected
     assert np.all(ic50 == always_zero_predictor_with_unknown_AAs.max_ic50), ic50
@@ -74,10 +69,6 @@ def test_always_zero_10mer_inputs():
 
 
 def test_encode_peptides_9mer():
-    X = always_zero_predictor_with_unknown_AAs.encode_9mer_peptides(["AAASSSYYY"])
-    assert X.shape[0] == 1, X.shape
-    assert X.shape[1] == 9, X.shape
-
     X, indices = always_zero_predictor_with_unknown_AAs.encode_peptides(["AAASSSYYY"])
     assert len(indices) == 1
     assert indices[0] == 0
diff --git a/test/test_imputation.py b/test/test_imputation.py
index 70e13681..0f3489c5 100644
--- a/test/test_imputation.py
+++ b/test/test_imputation.py
@@ -1,5 +1,6 @@
 from mhcflurry.imputation import (
     create_imputed_datasets,
+    imputer_from_name,
 )
 from mhcflurry.data import (
     create_allele_data_from_peptide_to_ic50_dict,
@@ -8,7 +9,7 @@ from mhcflurry.data import (
 from mhcflurry.paths import CLASS1_DATA_CSV_PATH
 from mhcflurry import Class1BindingPredictor
 
-from fancyimpute import MICE
+from fancyimpute import MICE, KNN, SoftImpute, IterativeSVD
 from nose.tools import eq_
 import numpy as np
 
@@ -102,7 +103,19 @@ def test_performance_improves_for_A0205_with_pretraining():
         "Expected MSE with imputation (%f) to be less than (%f) without imputation" % (
             mse_with_imputation, mse_without_imputation)
 
+def test_imputer_from_name():
+    mice = imputer_from_name("mice")
+    assert isinstance(mice, MICE)
+    softimpute = imputer_from_name("softimpute")
+    assert isinstance(softimpute, SoftImpute)
+    svdimpute = imputer_from_name("svd")
+    assert isinstance(svdimpute, IterativeSVD)
+    knnimpute = imputer_from_name("knn")
+    assert isinstance(knnimpute, KNN)
+
+
 if __name__ == "__main__":
     test_create_imputed_datasets_empty()
     test_create_imputed_datasets_two_alleles()
     test_performance_improves_for_A0205_with_pretraining()
+    test_imputer_from_name()
diff --git a/test/test_neural_nets.py b/test/test_neural_nets.py
index 966efdd5..921d619d 100644
--- a/test/test_neural_nets.py
+++ b/test/test_neural_nets.py
@@ -2,6 +2,7 @@ from mhcflurry.feedforward import (
     make_embedding_network,
     make_hotshot_network,
 )
+from keras.optimizers import RMSprop
 import numpy as np
 
 def test_make_embedding_network():
@@ -11,7 +12,7 @@ def test_make_embedding_network():
         activation="tanh",
         embedding_input_dim=3,
         embedding_output_dim=20,
-        learning_rate=0.05)
+        optimizer=RMSprop(lr=0.05))
 
     X_negative = np.array([
         [0] * 3,
@@ -33,7 +34,7 @@ def test_make_embedding_network():
     ])
     X_index = np.vstack([X_negative, X_positive])
     Y = np.array([0.0] * len(X_negative) + [1.0] * len(X_positive))
-    nn.fit(X_index, Y, nb_epoch=20)
+    nn.fit(X_index, Y, nb_epoch=30)
     Y_pred = nn.predict(X_index)
     print(Y)
     print(Y_pred)
@@ -46,7 +47,7 @@ def test_make_hotshot_network():
         activation="relu",
         layer_sizes=[4],
         n_amino_acids=2,
-        learning_rate=0.05)
+        optimizer=RMSprop(lr=0.05))
     X_binary = np.array([
         [True, False, True, False, True, False],
         [True, False, True, False, False, True],
@@ -56,7 +57,7 @@ def test_make_hotshot_network():
         [False, True, True, False, False, True],
     ], dtype=bool)
     Y = np.array([0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
-    nn.fit(X_binary, Y, nb_epoch=20)
+    nn.fit(X_binary, Y, nb_epoch=30)
     Y_pred = nn.predict(X_binary)
     print(Y)
     print(Y_pred)
diff --git a/test/test_fixed_length_peptides.py b/test/test_peptide_encoding.py
similarity index 100%
rename from test/test_fixed_length_peptides.py
rename to test/test_peptide_encoding.py
diff --git a/test/test_common.py b/test/test_regression_target.py
similarity index 90%
rename from test/test_common.py
rename to test/test_regression_target.py
index fb5cd8d9..dfe67496 100644
--- a/test/test_common.py
+++ b/test/test_regression_target.py
@@ -1,4 +1,4 @@
-from mhcflurry.common import (
+from mhcflurry.regression_target import (
     ic50_to_regression_target,
     regression_target_to_ic50,
 )
diff --git a/test/test_serialization.py b/test/test_serialization.py
new file mode 100644
index 00000000..c9ee1ba2
--- /dev/null
+++ b/test/test_serialization.py
@@ -0,0 +1,30 @@
+from mhcflurry import Class1BindingPredictor
+from tempfile import NamedTemporaryFile
+import numpy as np
+from os import remove
+
+def test_predict_after_saving_model_to_disk():
+    # don't even bother fitting the model, just save its random weights
+    # and check we get the same predictions back afterward
+    model = Class1BindingPredictor.from_hyperparameters(name="rando")
+    peptides = ["A" * 9, "C" * 9]
+    original_predictions = model.predict_peptides_ic50(peptides)
+    json_filename = NamedTemporaryFile("w", delete=False).name
+    hdf_filename = NamedTemporaryFile("w+b", delete=False).name
+    print("JSON: %s" % json_filename)
+    print("HDF5: %s" % hdf_filename)
+
+    model.to_disk(json_filename, hdf_filename, overwrite=True)
+
+    deserialized_model = Class1BindingPredictor.from_disk(json_filename, hdf_filename)
+    assert deserialized_model.model is not None
+
+    deserialized_predictions = deserialized_model.predict_peptides_ic50(peptides)
+
+    assert np.allclose(original_predictions, deserialized_predictions), (
+        peptides, original_predictions, deserialized_predictions)
+    remove(json_filename)
+    remove(hdf_filename)
+
+if __name__ == "__main__":
+    test_predict_after_saving_model_to_disk()
diff --git a/test/test_training_helpers.py b/test/test_training_helpers.py
new file mode 100644
index 00000000..7e99e30b
--- /dev/null
+++ b/test/test_training_helpers.py
@@ -0,0 +1,22 @@
+from mhcflurry.training_helpers import check_training_data_shapes
+import numpy as np
+from nose.tools import eq_, assert_raises
+
+def test_check_training_data_shapes():
+    X = np.random.randn(40, 2)
+    Y_good = np.random.randn(40)
+    weights_good = np.random.randn(40)
+
+    Y_too_long = np.random.randn(41)
+    weights_empty = np.array([], dtype=float)
+
+    eq_(check_training_data_shapes(X, Y_good, weights_good), X.shape)
+    with assert_raises(ValueError):
+        check_training_data_shapes(X, Y_too_long, weights_good)
+
+    with assert_raises(ValueError):
+        check_training_data_shapes(X, Y_good, weights_empty)
+
+
+if __name__ == "__main__":
+    test_check_training_data_shapes()
-- 
GitLab