From e554cf2a010b4156a2d16084a0d4d71d43cd0bd5 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Wed, 4 May 2016 15:26:09 -0400
Subject: [PATCH] reorganizing code to test better and sanity check some
 strangeness with learning rate

---
 mhcflurry/amino_acid.py                       |   3 +
 mhcflurry/class1_binding_predictor.py         | 110 +++++++++---------
 mhcflurry/common.py                           |  35 ------
 mhcflurry/data.py                             |   4 +-
 mhcflurry/ensemble.py                         |   2 +-
 mhcflurry/feedforward.py                      |  51 ++++----
 ...ters.py => feedforward_hyperparameters.py} |  21 +++-
 mhcflurry/imputation.py                       |   2 +-
 mhcflurry/predictor_base.py                   |  46 +++++---
 mhcflurry/regression_target.py                |  51 ++++++++
 mhcflurry/serialization_helpers.py            |  55 +++++++--
 mhcflurry/training_helpers.py                 |  43 +++++++
 ...rry-train-class1-allele-specific-models.py |  12 +-
 test/test_class1_binding_predictor.py         |   9 --
 test/test_imputation.py                       |  15 ++-
 test/test_neural_nets.py                      |   9 +-
 ...h_peptides.py => test_peptide_encoding.py} |   0
 ...st_common.py => test_regression_target.py} |   2 +-
 test/test_serialization.py                    |  30 +++++
 test/test_training_helpers.py                 |  22 ++++
 20 files changed, 347 insertions(+), 175 deletions(-)
 rename mhcflurry/{class1_allele_specific_hyperparameters.py => feedforward_hyperparameters.py} (80%)
 create mode 100644 mhcflurry/regression_target.py
 create mode 100644 mhcflurry/training_helpers.py
 rename test/{test_fixed_length_peptides.py => test_peptide_encoding.py} (100%)
 rename test/{test_common.py => test_regression_target.py} (90%)
 create mode 100644 test/test_serialization.py
 create mode 100644 test/test_training_helpers.py
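For reviewers, a minimal usage sketch of the reorganized API (the calls below
mirror test/test_serialization.py; the peptide is an arbitrary 9mer example):

    from mhcflurry import Class1BindingPredictor

    model = Class1BindingPredictor.from_hyperparameters(name="example")
    ic50s = model.predict_peptides_ic50(["SIINFEKLM"])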
""" def __init__( @@ -51,13 +53,15 @@ class Class1BindingPredictor(PredictorBase): name=None, max_ic50=MAX_IC50, allow_unknown_amino_acids=True, - verbose=False): + verbose=False, + kmer_size=9): PredictorBase.__init__( self, name=name, max_ic50=max_ic50, allow_unknown_amino_acids=allow_unknown_amino_acids, - verbose=verbose) + verbose=verbose, + kmer_size=kmer_size) self.name = name self.model = model @@ -66,6 +70,9 @@ class Class1BindingPredictor(PredictorBase): cls, model_json_path, weights_hdf_path=None, + name=None, + optimizer=OPTIMIZER, + loss=LOSS, **kwargs): """ Load model from stored JSON representation of network and @@ -74,7 +81,8 @@ class Class1BindingPredictor(PredictorBase): model = load_keras_model_from_disk( model_json_path, weights_hdf_path, - name=None) + name=name) + compile_network(model, loss=loss, optimizer=optimizer) return cls(model=model, **kwargs) def to_disk(self, model_json_path, weights_hdf_path, overwrite=False): @@ -97,10 +105,10 @@ class Class1BindingPredictor(PredictorBase): layer_sizes=[50], activation="tanh", init="lecun_uniform", - loss="mse", output_activation="sigmoid", dropout_probability=0, - learning_rate=0.001, + loss=LOSS, + optimizer=OPTIMIZER, **kwargs): """ Create untrained predictor with the given hyperparameters. @@ -114,14 +122,15 @@ class Class1BindingPredictor(PredictorBase): activation=activation, init=init, loss=loss, + optimizer=optimizer, output_activation=output_activation, - dropout_probability=dropout_probability, - learning_rate=learning_rate) + dropout_probability=dropout_probability) return cls( name=name, max_ic50=max_ic50, model=model, allow_unknown_amino_acids=allow_unknown_amino_acids, + kmer_size=peptide_length, **kwargs) def _combine_training_data( @@ -143,32 +152,12 @@ class Class1BindingPredictor(PredictorBase): X = np.asarray(X) Y = np.asarray(Y) - if verbose: - print("X.shape = %s" % (X.shape,)) - - n_samples, n_dims = X.shape - - if len(Y) != n_samples: - raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % ( - n_samples, len(Y))) - - if Y.min() < 0: - raise ValueError("Minimum value of Y can't be negative, got %f" % ( - Y.min())) - if Y.max() > 1: - raise ValueError("Maximum value of Y can't be greater than 1, got %f" % ( - Y.max())) - if sample_weights is None: sample_weights = np.ones_like(Y) else: sample_weights = np.asarray(sample_weights) - if len(sample_weights) != n_samples: - raise ValueError( - "Length of sample_weights (%d) doesn't match number of samples (%d)" % ( - len(sample_weights), - n_samples)) + n_samples, n_dims = check_training_data_shapes(X, Y, sample_weights) if X_pretrain is None or Y_pretrain is None: X_pretrain = np.empty((0, n_dims), dtype=X.dtype) @@ -177,35 +166,29 @@ class Class1BindingPredictor(PredictorBase): X_pretrain = np.asarray(X_pretrain) Y_pretrain = np.asarray(Y_pretrain) - if verbose: - print("X_pretrain.shape = %s" % (X_pretrain.shape,)) - n_pretrain_samples, n_pretrain_dims = X_pretrain.shape - if n_pretrain_dims != n_dims: - raise ValueError( - "# of dims for pretraining data (%d) doesn't match X.shape[1] = %d" % ( - n_pretrain_dims, n_dims)) + if sample_weights_pretrain is None: + sample_weights_pretrain = np.ones_like(Y_pretrain) + else: + sample_weights_pretrain = np.asarray(sample_weights_pretrain) - if len(Y_pretrain) != n_pretrain_samples: - raise ValueError( - "length of Y_pretrain (%d) != length of X_pretrain (%d)" % ( - len(Y_pretrain), - len(X_pretrain))) + n_pretrain_samples, n_pretrain_dims = check_training_data_shapes( + X_pretrain, Y_pretrain, 
+        compile_network(model, loss=loss, optimizer=optimizer)
         return cls(model=model, **kwargs)
 
     def to_disk(self, model_json_path, weights_hdf_path, overwrite=False):
@@ -97,10 +105,10 @@ class Class1BindingPredictor(PredictorBase):
             layer_sizes=[50],
             activation="tanh",
             init="lecun_uniform",
-            loss="mse",
             output_activation="sigmoid",
             dropout_probability=0,
-            learning_rate=0.001,
+            loss=LOSS,
+            optimizer=OPTIMIZER,
             **kwargs):
         """
         Create untrained predictor with the given hyperparameters.
@@ -114,14 +122,15 @@ class Class1BindingPredictor(PredictorBase):
             activation=activation,
             init=init,
             loss=loss,
+            optimizer=optimizer,
             output_activation=output_activation,
-            dropout_probability=dropout_probability,
-            learning_rate=learning_rate)
+            dropout_probability=dropout_probability)
         return cls(
             name=name,
             max_ic50=max_ic50,
             model=model,
             allow_unknown_amino_acids=allow_unknown_amino_acids,
+            kmer_size=peptide_length,
             **kwargs)
 
     def _combine_training_data(
@@ -143,32 +152,12 @@ class Class1BindingPredictor(PredictorBase):
         X = np.asarray(X)
         Y = np.asarray(Y)
 
-        if verbose:
-            print("X.shape = %s" % (X.shape,))
-
-        n_samples, n_dims = X.shape
-
-        if len(Y) != n_samples:
-            raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % (
-                n_samples, len(Y)))
-
-        if Y.min() < 0:
-            raise ValueError("Minimum value of Y can't be negative, got %f" % (
-                Y.min()))
-        if Y.max() > 1:
-            raise ValueError("Maximum value of Y can't be greater than 1, got %f" % (
-                Y.max()))
-
         if sample_weights is None:
             sample_weights = np.ones_like(Y)
         else:
             sample_weights = np.asarray(sample_weights)
 
-        if len(sample_weights) != n_samples:
-            raise ValueError(
-                "Length of sample_weights (%d) doesn't match number of samples (%d)" % (
-                    len(sample_weights),
-                    n_samples))
+        n_samples, n_dims = check_training_data_shapes(X, Y, sample_weights)
 
         if X_pretrain is None or Y_pretrain is None:
             X_pretrain = np.empty((0, n_dims), dtype=X.dtype)
@@ -177,35 +166,29 @@ class Class1BindingPredictor(PredictorBase):
             X_pretrain = np.asarray(X_pretrain)
             Y_pretrain = np.asarray(Y_pretrain)
 
-        if verbose:
-            print("X_pretrain.shape = %s" % (X_pretrain.shape,))
-        n_pretrain_samples, n_pretrain_dims = X_pretrain.shape
-        if n_pretrain_dims != n_dims:
-            raise ValueError(
-                "# of dims for pretraining data (%d) doesn't match X.shape[1] = %d" % (
-                    n_pretrain_dims, n_dims))
+        if sample_weights_pretrain is None:
+            sample_weights_pretrain = np.ones_like(Y_pretrain)
+        else:
+            sample_weights_pretrain = np.asarray(sample_weights_pretrain)
 
-        if len(Y_pretrain) != n_pretrain_samples:
-            raise ValueError(
-                "length of Y_pretrain (%d) != length of X_pretrain (%d)" % (
-                    len(Y_pretrain),
-                    len(X_pretrain)))
+        n_pretrain_samples, n_pretrain_dims = check_training_data_shapes(
+            X_pretrain, Y_pretrain, sample_weights_pretrain)
+
+        if Y.min() < 0:
+            raise ValueError("Minimum value of Y can't be negative, got %f" % (
+                Y.min()))
+        if Y.max() > 1:
+            raise ValueError("Maximum value of Y can't be greater than 1, got %f" % (
+                Y.max()))
 
         if len(Y_pretrain) > 0 and Y_pretrain.min() < 0:
             raise ValueError("Minimum value of Y_pretrain can't be negative, got %f" % (
                 Y.min()))
+
         if len(Y_pretrain) > 0 and Y_pretrain.max() > 1:
             raise ValueError("Maximum value of Y_pretrain can't be greater than 1, got %f" % (
                 Y.max()))
 
-        if sample_weights_pretrain is None:
-            sample_weights_pretrain = np.ones_like(Y_pretrain)
-        else:
-            sample_weights_pretrain = np.asarray(sample_weights_pretrain)
-        if verbose:
-            print("sample weights mean = %f, pretrain weights mean = %f" % (
-                sample_weights.mean(),
-                sample_weights_pretrain.mean()))
         X_combined = np.vstack([X_pretrain, X])
         Y_combined = np.concatenate([Y_pretrain, Y])
         combined_weights = np.concatenate([
@@ -220,16 +203,30 @@ class Class1BindingPredictor(PredictorBase):
         Extend training data with randomly generated negative samples.
         This only works since most random peptides will not bind to a
         particular allele.
+
+        Parameters
+        ----------
+        X : numpy.ndarray
+            2d array of integer amino acid encodings
+
+        Y : numpy.ndarray
+            1d array of regression targets
+
+        weights : numpy.ndarray
+            1d array of sample weights (must be same length as X and Y)
+
+        n_random_negative_samples : int
+            Number of random negative samples to create
+
+        Returns X, Y, weights (extended with random negative samples)
         """
         assert len(X) == len(Y) == len(weights)
         if n_random_negative_samples == 0:
             return X, Y, weights
-        min_value = X.min()
-        max_value = X.max()
         n_cols = X.shape[1]
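+        # np.random.randint's upper bound is exclusive, so the sampled
+        # indices stay inside the amino acid alphabet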
         X_random = np.random.randint(
-            low=min_value,
-            high=max_value + 1,
+            low=0,
+            high=self.max_amino_acid_encoding_value,
             size=(n_random_negative_samples, n_cols)).astype(X.dtype)
         Y_random = np.zeros(n_random_negative_samples, dtype=float)
         weights_random = np.ones(n_random_negative_samples, dtype=float)
@@ -257,7 +254,8 @@ class Class1BindingPredictor(PredictorBase):
             verbose=False,
             batch_size=128):
         """
-        Train predictive model from index encoding of fixed length 9mer peptides.
+        Train predictive model from index encoding of fixed length k-mer
+        peptides.
 
         Parameters
         ----------
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index a3318c2f..5fd34782 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -17,7 +17,6 @@ from __future__ import (
     division,
     absolute_import,
 )
-import numpy as np
 
 def parse_int_list(s):
     return [int(part.strip()) for part in s.split(",")]
@@ -65,37 +64,3 @@ def split_allele_names(s):
         for part in s.split(",")
     ]
-
-def ic50_to_regression_target(ic50, max_ic50):
-    """
-    Transform IC50 inhibitory binding concentrations to affinity values between
-    [0,1] where 0 means a value greater or equal to max_ic50 and 1 means very
-    strong binder.
-
-    Parameters
-    ----------
-    ic50 : numpy.ndarray
-
-    max_ic50 : float
-    """
-    log_ic50 = np.log(ic50) / np.log(max_ic50)
-    regression_target = 1.0 - log_ic50
-    # clamp to values between 0, 1
-    regression_target = np.maximum(regression_target, 0.0)
-    regression_target = np.minimum(regression_target, 1.0)
-    return regression_target
-
-def regression_target_to_ic50(y, max_ic50):
-    """
-    Transform values between [0,1] to IC50 inhibitory binding concentrations
-    between [1.0, infinity]
-
-    Parameters
-    ----------
-    y : numpy.ndarray of float
-
-    max_ic50 : float
-
-    Returns numpy.ndarray
-    """
-    return max_ic50 ** (1.0 - y)
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index 1ce28b2f..136449af 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -22,14 +22,14 @@ from collections import namedtuple, defaultdict
 import pandas as pd
 import numpy as np
 
-from .common import normalize_allele_name, ic50_to_regression_target
+from .common import normalize_allele_name
 from .amino_acid import common_amino_acids
 from .peptide_encoding import (
     indices_to_hotshot_encoding,
     fixed_length_index_encoding,
     check_valid_index_encoding_array,
 )
-from .class1_allele_specific_hyperparameters import MAX_IC50
+from .regression_target import MAX_IC50, ic50_to_regression_target
 
 index_encoding = common_amino_acids.index_encoding
diff --git a/mhcflurry/ensemble.py b/mhcflurry/ensemble.py
index 10880ce3..cc304906 100644
--- a/mhcflurry/ensemble.py
+++ b/mhcflurry/ensemble.py
@@ -17,7 +17,7 @@ from os.path import splitext, join
 
 import numpy as np
 
-from .class1_allele_specific_hyperparameters import MAX_IC50
+from .regression_target import MAX_IC50
 from .predictor_base import PredictorBase
 
 class Ensemble(PredictorBase):
diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py
index 11de3636..1d16e53f 100644
--- a/mhcflurry/feedforward.py
+++ b/mhcflurry/feedforward.py
@@ -17,39 +17,43 @@ from __future__ import (
     division,
     absolute_import,
 )
+import logging
 
-import keras
 from keras.models import Sequential
 from keras.layers.core import Dense, Activation, Flatten, Dropout
 from keras.layers.embeddings import Embedding
-
 import theano
+
+from .feedforward_hyperparameters import OPTIMIZER, LOSS, ACTIVATION
+
 theano.config.exception_verbosity = 'high'
 
+def compile_network(model, optimizer=OPTIMIZER, loss=LOSS):
+    """
+    Compile a keras network and return it.
+ """ + logging.info("Compiling %s with optimizer=%s, loss=%s" % ( + model, optimizer, loss)) + model.compile(loss=loss, optimizer=optimizer) + return model + def make_network( input_size, embedding_input_dim=None, embedding_output_dim=None, layer_sizes=[100], - activation="relu", + activation=ACTIVATION, init="lecun_uniform", - loss="mse", output_activation="sigmoid", dropout_probability=0.0, model=None, - optimizer=None, - learning_rate=0.001): + optimizer=OPTIMIZER, + loss=LOSS): if model is None: model = Sequential() - if optimizer is None: - optimizer = keras.optimizers.RMSprop( - lr=learning_rate, - rho=0.9, - epsilon=1e-6) - if embedding_input_dim: if not embedding_output_dim: raise ValueError( @@ -93,20 +97,18 @@ def make_network( output_dim=1, init=init)) model.add(Activation(output_activation)) - model.compile(loss=loss, optimizer=optimizer) - return model + return compile_network(model, optimizer=optimizer, loss=loss) def make_hotshot_network( peptide_length=9, layer_sizes=[100], - activation="relu", + activation=ACTIVATION, init="lecun_uniform", - loss="mse", output_activation="sigmoid", dropout_probability=0.0, - optimizer=None, - learning_rate=0.001, + optimizer=OPTIMIZER, + loss=LOSS, n_amino_acids=20): return make_network( input_size=peptide_length * n_amino_acids, @@ -116,8 +118,7 @@ def make_hotshot_network( loss=loss, output_activation=output_activation, dropout_probability=dropout_probability, - optimizer=optimizer, - learning_rate=learning_rate) + optimizer=optimizer) def make_embedding_network( @@ -125,13 +126,12 @@ def make_embedding_network( embedding_input_dim=20, embedding_output_dim=20, layer_sizes=[100], - activation="relu", + activation=ACTIVATION, init="lecun_uniform", - loss="mse", output_activation="sigmoid", dropout_probability=0.0, - optimizer=None, - learning_rate=0.001): + loss=LOSS, + optimizer=OPTIMIZER): return make_network( input_size=peptide_length, embedding_input_dim=embedding_input_dim, @@ -142,5 +142,4 @@ def make_embedding_network( loss=loss, output_activation=output_activation, dropout_probability=dropout_probability, - optimizer=optimizer, - learning_rate=learning_rate) + optimizer=optimizer) diff --git a/mhcflurry/class1_allele_specific_hyperparameters.py b/mhcflurry/feedforward_hyperparameters.py similarity index 80% rename from mhcflurry/class1_allele_specific_hyperparameters.py rename to mhcflurry/feedforward_hyperparameters.py index cfdf846b..55ae0fcb 100644 --- a/mhcflurry/class1_allele_specific_hyperparameters.py +++ b/mhcflurry/feedforward_hyperparameters.py @@ -12,14 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .regression_target import MAX_IC50 + N_EPOCHS = 250 ACTIVATION = "tanh" INITIALIZATION_METHOD = "lecun_uniform" EMBEDDING_DIM = 32 HIDDEN_LAYER_SIZE = 100 DROPOUT_PROBABILITY = 0.1 -MAX_IC50 = 50000.0 LEARNING_RATE = 0.001 +OPTIMIZER = "rmsprop" +LOSS = "mse" def add_hyperparameter_arguments_to_parser(parser): """ @@ -31,6 +34,9 @@ def add_hyperparameter_arguments_to_parser(parser): --hidden-layer-size --dropout --max-ic50 + --random-negative-samples + --imputation-method + --learning-rate """ parser.add_argument( "--training-epochs", @@ -82,4 +88,17 @@ def add_hyperparameter_arguments_to_parser(parser): default=0.001, help="Learning rate for training neural network. 
Default: %(default)s") + parser.add_argument( + "--random-negative-samples", + type=int, + default=0, + help="Number of random negtive samples to generate each training epoch") + + parser.add_argument( + "--imputation-method", + default="none", + choices=("mice", "knn", "softimpute", "svd", "mean", "none"), + type=lambda s: s.strip().lower(), + help="Use the given imputation method to generate data for pre-training models") + return parser diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py index 115d66a2..018356f3 100644 --- a/mhcflurry/imputation.py +++ b/mhcflurry/imputation.py @@ -31,7 +31,7 @@ from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary from .data import ( create_allele_data_from_peptide_to_ic50_dict, ) -from .common import regression_target_to_ic50 +from .regression_target import regression_target_to_ic50 def _check_dense_pMHC_array(X, peptide_list, allele_list): if len(peptide_list) != len(set(peptide_list)): diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py index a335158c..fee86b7d 100644 --- a/mhcflurry/predictor_base.py +++ b/mhcflurry/predictor_base.py @@ -27,7 +27,7 @@ from .amino_acid import ( amino_acids_with_unknown, common_amino_acids ) -from .common import regression_target_to_ic50 +from .regression_target import regression_target_to_ic50, MAX_IC50 class PredictorBase(object): @@ -38,19 +38,30 @@ class PredictorBase(object): def __init__( self, name, - max_ic50, allow_unknown_amino_acids, - verbose): + verbose, + max_ic50=MAX_IC50, + kmer_size=9): self.name = name self.max_ic50 = max_ic50 self.allow_unknown_amino_acids = allow_unknown_amino_acids self.verbose = verbose + self.kmer_size = kmer_size - def encode_9mer_peptides(self, peptides): + @property + def amino_acids(self): + """ + Amino acid alphabet used for encoding peptides, may include + "X" if allow_unknown_amino_acids is True. 
+ """ if self.allow_unknown_amino_acids: - return amino_acids_with_unknown.index_encoding(peptides, 9) + return amino_acids_with_unknown else: - return common_amino_acids.index_encoding(peptides, 9) + return common_amino_acids + + @property + def max_amino_acid_encoding_value(self): + return len(self.amino_acids) def encode_peptides(self, peptides): """ @@ -68,28 +79,28 @@ class PredictorBase(object): for i, peptide in enumerate(peptides): matrix, _, _, _ = fixed_length_index_encoding( peptides=[peptide], - desired_length=9, + desired_length=self.kmer_size, allow_unknown_amino_acids=self.allow_unknown_amino_acids) encoded_matrices.append(matrix) indices.extend([i] * len(matrix)) combined_matrix = np.concatenate(encoded_matrices) index_array = np.array(indices) - expected_shape = (len(index_array), 9) + expected_shape = (len(index_array), self.kmer_size) assert combined_matrix.shape == expected_shape, \ "Expected shape %s but got %s" % (expected_shape, combined_matrix.shape) return combined_matrix, index_array - def predict_9mer_peptides(self, peptides): + def _predict_kmer_peptides(self, peptides): """ Predict binding affinity for 9mer peptides """ - if any(len(peptide) != 9 for peptide in peptides): + if any(len(peptide) != self.kmer_size for peptide in peptides): raise ValueError("Can only predict 9mer peptides") X, _ = self.encode_peptides(peptides) return self.predict(X) - def predict_9mer_peptides_ic50(self, peptides): - scores = self.predict_9mer_peptides(peptides) + def _predict_kmer_peptides_ic50(self, peptides): + scores = self.predict_kmer_peptides(peptides) return regression_target_to_ic50(scores, max_ic50=self.max_ic50) def predict_peptides_ic50(self, peptides): @@ -111,26 +122,25 @@ class PredictorBase(object): Given a list of peptides of any length, returns an array of predicted normalized affinity values. Unlike IC50, a higher value here means a stronger affinity. Peptides of lengths other than 9 are - transformed into a set of 9mers either by deleting or inserting + transformed into a set of k-mers either by deleting or inserting amino acid characters. The prediction for a single peptide will be - the average of expanded 9mers. + the average of expanded k-mers. """ if isinstance(peptides, string_types): raise TypeError("Input must be a list of peptides, not %s : %s" % ( peptides, type(peptides))) input_matrix, original_peptide_indices = self.encode_peptides(peptides) - # non-9mer peptides get multiple predictions, which are then combined - # with the combine_fn argument + # peptides of lengths other than self.kmer_size get multiple predictions, + # which are then combined with the combine_fn argument multiple_predictions_dict = defaultdict(list) fixed_length_predictions = self.predict(input_matrix) for i, yi in enumerate(fixed_length_predictions): original_peptide_index = original_peptide_indices[i] original_peptide = peptides[original_peptide_index] multiple_predictions_dict[original_peptide].append(yi) - combined_predictions_dict = { - p: combine_fn(ys) + p: combine_fn(ys) if len(ys) > 1 else ys[0] for (p, ys) in multiple_predictions_dict.items() } return np.array([combined_predictions_dict[p] for p in peptides]) diff --git a/mhcflurry/regression_target.py b/mhcflurry/regression_target.py new file mode 100644 index 00000000..66ca26cd --- /dev/null +++ b/mhcflurry/regression_target.py @@ -0,0 +1,51 @@ +# Copyright (c) 2016. 
+    """
+    log_ic50 = np.log(ic50) / np.log(max_ic50)
+    regression_target = 1.0 - log_ic50
+    # clamp to values between 0, 1
+    regression_target = np.maximum(regression_target, 0.0)
+    regression_target = np.minimum(regression_target, 1.0)
+    return regression_target
+
+def regression_target_to_ic50(y, max_ic50=MAX_IC50):
+    """
+    Transform values between [0,1] to IC50 inhibitory binding concentrations
+    between [1.0, infinity]
+
+    Parameters
+    ----------
+    y : numpy.ndarray of float
+
+    max_ic50 : float
+
+    Returns numpy.ndarray
+    """
+    return max_ic50 ** (1.0 - y)
diff --git a/mhcflurry/serialization_helpers.py b/mhcflurry/serialization_helpers.py
index 7089b65b..be20e92e 100644
--- a/mhcflurry/serialization_helpers.py
+++ b/mhcflurry/serialization_helpers.py
@@ -28,10 +28,25 @@ import json
 
 from keras.models import model_from_config
 
+from .feedforward import compile_network
+
 def load_keras_model_from_disk(
         model_json_path,
         weights_hdf_path,
         name=None):
+    """
+    Loads a model from two files on disk: a JSON configuration and HDF5 weights.
+
+    Parameters
+    ----------
+    model_json_path : str
+
+    weights_hdf_path : str
+
+    name : str, optional
+
+    Returns a Keras model.
+    """
 
     if not exists(model_json_path):
         raise ValueError("Model file %s (name = %s) not found" % (
@@ -42,13 +57,15 @@ def load_keras_model_from_disk(
 
     model = model_from_config(config_dict)
 
-    if weights_hdf_path:
+    if weights_hdf_path is not None:
         if not exists(weights_hdf_path):
             raise ValueError(
-                "Missing model weights file %s (name = %s)" % (
-                    weights_hdf_path, name))
-
+                "Missing model weights file %s (name = %s)" % (weights_hdf_path, name))
         model.load_weights(weights_hdf_path)
+
+    # In some cases I haven't been able to use a model after loading it
+    # without compiling it first.
+    compile_network(model)
     return model
 
@@ -56,8 +73,23 @@ def save_keras_model_to_disk(
         model,
         model_json_path,
         weights_hdf_path,
-        overwrite=False,
-        name=None):
+        overwrite=False):
+    """
+    Write a Keras model configuration to disk along with its weights.
+
+    Parameters
+    ----------
+    model : keras model
+
+    model_json_path : str
+        Path to JSON file where model configuration will be saved.
+
+    weights_hdf_path : str
+        Path to HDF5 file where model weights will be saved.
+
+    overwrite : bool
+        Overwrite existing files?
+ """ if exists(model_json_path) and overwrite: logging.info( "Removing existing model JSON file '%s'" % ( @@ -69,9 +101,10 @@ def save_keras_model_to_disk( "Model JSON file '%s' already exists" % (model_json_path,)) else: logging.info( - "Saving model file %s (name=%s)" % (model_json_path, name)) + "Saving model file %s" % (model_json_path,)) with open(model_json_path, "w") as f: - f.write(model.to_json()) + config = model.get_config() + json.dump(config, f) if exists(weights_hdf_path) and overwrite: logging.info( @@ -81,10 +114,8 @@ def save_keras_model_to_disk( if exists(weights_hdf_path): logging.warn( - "Model weights HDF5 file '%s' already exists" % ( - weights_hdf_path,)) + "Model weights HDF5 file '%s' already exists" % (weights_hdf_path,)) else: logging.info( - "Saving model weights HDF5 file %s (name=%s)" % ( - weights_hdf_path, name)) + "Saving model weights HDF5 file %s" % (weights_hdf_path,)) model.save_weights(weights_hdf_path) diff --git a/mhcflurry/training_helpers.py b/mhcflurry/training_helpers.py new file mode 100644 index 00000000..65cea3c6 --- /dev/null +++ b/mhcflurry/training_helpers.py @@ -0,0 +1,43 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def check_training_data_shapes(X, Y, weights): + """ + Check to make sure that the shapes of X, Y, and weights are all compatible. + + Returns the numbers of rows and columns in X. 
+ """ + if len(X.shape) != 2: + raise ValueError("Expected X to be 2d, got shape: %s" % (X.shape,)) + + if len(Y.shape) != 1: + raise ValueError("Expected Y to be 1d, got shape: %s" % (Y.shape,)) + + if len(weights.shape) != 1: + raise ValueError("Expected weights to be 1d, got shape: %s" % ( + weights.shape,)) + + n_samples, n_dims = X.shape + + if len(Y) != n_samples: + raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % ( + n_samples, len(Y))) + + if len(weights) != n_samples: + raise ValueError( + "Length of sample_weights (%d) doesn't match number of samples (%d)" % ( + len(weights), + n_samples)) + + return n_samples, n_dims diff --git a/script/mhcflurry-train-class1-allele-specific-models.py b/script/mhcflurry-train-class1-allele-specific-models.py index d2755ce2..a925d640 100755 --- a/script/mhcflurry-train-class1-allele-specific-models.py +++ b/script/mhcflurry-train-class1-allele-specific-models.py @@ -41,6 +41,8 @@ from os import makedirs, remove from os.path import exists, join import argparse +from keras.optimizers import RMSprop + from mhcflurry.common import normalize_allele_name from mhcflurry.data import load_allele_datasets from mhcflurry.class1_binding_predictor import Class1BindingPredictor @@ -92,13 +94,6 @@ parser.add_argument( nargs="+", type=normalize_allele_name) -parser.add_argument( - "--imputation-method", - default=None, - choices=("mice", "knn", "softimpute", "svd", "mean"), - type=lambda s: s.strip().lower(), - help="Use the given imputation method to generate data for pre-training models") - # add options for neural network hyperparameters parser = add_hyperparameter_arguments_to_parser(parser) @@ -183,7 +178,7 @@ if __name__ == "__main__": activation=args.activation, init=args.initialization, dropout_probability=args.dropout, - learning_rate=args.learning_rate) + optimizer=RMSprop(learning_rate=args.learning_rate)) json_filename = allele_name + ".json" json_path = join(args.output_dir, json_filename) @@ -215,6 +210,7 @@ if __name__ == "__main__": Y_pretrain=Y_pretrain, sample_weights_pretrain=weights_pretrain, n_training_epochs=args.training_epochs, + n_random_negative_samples=args.random_negative_samples, verbose=True) model.to_disk( diff --git a/test/test_class1_binding_predictor.py b/test/test_class1_binding_predictor.py index 7039fb6b..c17d1eaa 100644 --- a/test/test_class1_binding_predictor.py +++ b/test/test_class1_binding_predictor.py @@ -30,11 +30,6 @@ def test_always_zero_9mer_inputs(): assert len(y) == n_expected assert np.all(y == 0) - # call the predict method for 9mers directly - y = always_zero_predictor_with_unknown_AAs.predict_9mer_peptides(test_9mer_peptides) - assert len(y) == n_expected - assert np.all(y == 0) - ic50 = always_zero_predictor_with_unknown_AAs.predict_peptides_ic50(test_9mer_peptides) assert len(y) == n_expected assert np.all(ic50 == always_zero_predictor_with_unknown_AAs.max_ic50), ic50 @@ -74,10 +69,6 @@ def test_always_zero_10mer_inputs(): def test_encode_peptides_9mer(): - X = always_zero_predictor_with_unknown_AAs.encode_9mer_peptides(["AAASSSYYY"]) - assert X.shape[0] == 1, X.shape - assert X.shape[1] == 9, X.shape - X, indices = always_zero_predictor_with_unknown_AAs.encode_peptides(["AAASSSYYY"]) assert len(indices) == 1 assert indices[0] == 0 diff --git a/test/test_imputation.py b/test/test_imputation.py index 70e13681..0f3489c5 100644 --- a/test/test_imputation.py +++ b/test/test_imputation.py @@ -1,5 +1,6 @@ from mhcflurry.imputation import ( create_imputed_datasets, + imputer_from_name, ) from 
 from mhcflurry.data import (
     create_allele_data_from_peptide_to_ic50_dict,
@@ -8,7 +9,7 @@ from mhcflurry.data import (
 from mhcflurry.paths import CLASS1_DATA_CSV_PATH
 from mhcflurry import Class1BindingPredictor
 
-from fancyimpute import MICE
+from fancyimpute import MICE, KNN, SoftImpute, IterativeSVD
 from nose.tools import eq_
 import numpy as np
 
@@ -102,7 +103,19 @@ def test_performance_improves_for_A0205_with_pretraining():
         "Expected MSE with imputation (%f) to be less than (%f) without imputation" % (
             mse_with_imputation, mse_without_imputation)
 
+def test_imputer_from_name():
+    mice = imputer_from_name("mice")
+    assert isinstance(mice, MICE)
+    softimpute = imputer_from_name("softimpute")
+    assert isinstance(softimpute, SoftImpute)
+    svdimpute = imputer_from_name("svd")
+    assert isinstance(svdimpute, IterativeSVD)
+    knnimpute = imputer_from_name("knn")
+    assert isinstance(knnimpute, KNN)
+
+
 if __name__ == "__main__":
     test_create_imputed_datasets_empty()
     test_create_imputed_datasets_two_alleles()
     test_performance_improves_for_A0205_with_pretraining()
+    test_imputer_from_name()
diff --git a/test/test_neural_nets.py b/test/test_neural_nets.py
index 966efdd5..921d619d 100644
--- a/test/test_neural_nets.py
+++ b/test/test_neural_nets.py
@@ -2,6 +2,7 @@ from mhcflurry.feedforward import (
     make_embedding_network,
     make_hotshot_network,
 )
+from keras.optimizers import RMSprop
 import numpy as np
 
 def test_make_embedding_network():
@@ -11,7 +12,7 @@ def test_make_embedding_network():
         activation="tanh",
         embedding_input_dim=3,
         embedding_output_dim=20,
-        learning_rate=0.05)
+        optimizer=RMSprop(lr=0.05))
 
     X_negative = np.array([
         [0] * 3,
@@ -33,7 +34,7 @@ def test_make_embedding_network():
     ])
     X_index = np.vstack([X_negative, X_positive])
     Y = np.array([0.0] * len(X_negative) + [1.0] * len(X_positive))
-    nn.fit(X_index, Y, nb_epoch=20)
+    nn.fit(X_index, Y, nb_epoch=30)
     Y_pred = nn.predict(X_index)
     print(Y)
     print(Y_pred)
@@ -46,7 +47,7 @@ def test_make_hotshot_network():
         activation="relu",
         layer_sizes=[4],
         n_amino_acids=2,
-        learning_rate=0.05)
+        optimizer=RMSprop(lr=0.05))
     X_binary = np.array([
         [True, False, True, False, True, False],
         [True, False, True, False, False, True],
@@ -56,7 +57,7 @@ def test_make_hotshot_network():
         [False, True, True, False, False, True],
     ], dtype=bool)
     Y = np.array([0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
-    nn.fit(X_binary, Y, nb_epoch=20)
+    nn.fit(X_binary, Y, nb_epoch=30)
     Y_pred = nn.predict(X_binary)
     print(Y)
     print(Y_pred)
diff --git a/test/test_fixed_length_peptides.py b/test/test_peptide_encoding.py
similarity index 100%
rename from test/test_fixed_length_peptides.py
rename to test/test_peptide_encoding.py
diff --git a/test/test_common.py b/test/test_regression_target.py
similarity index 90%
rename from test/test_common.py
rename to test/test_regression_target.py
index fb5cd8d9..dfe67496 100644
--- a/test/test_common.py
+++ b/test/test_regression_target.py
@@ -1,4 +1,4 @@
-from mhcflurry.common import (
+from mhcflurry.regression_target import (
     ic50_to_regression_target,
     regression_target_to_ic50,
 )
diff --git a/test/test_serialization.py b/test/test_serialization.py
new file mode 100644
index 00000000..c9ee1ba2
--- /dev/null
+++ b/test/test_serialization.py
@@ -0,0 +1,30 @@
+from mhcflurry import Class1BindingPredictor
+from tempfile import NamedTemporaryFile
+import numpy as np
+from os import remove
+
+def test_predict_after_saving_model_to_disk():
+    # don't even bother fitting the model, just save its random weights
+    # and check we get the same predictions back afterward
+    model = Class1BindingPredictor.from_hyperparameters(name="rando")
+    peptides = ["A" * 9, "C" * 9]
+    original_predictions = model.predict_peptides_ic50(peptides)
+    json_filename = NamedTemporaryFile("w", delete=False).name
+    hdf_filename = NamedTemporaryFile("w+b", delete=False).name
+    print("JSON: %s" % json_filename)
+    print("HDF5: %s" % hdf_filename)
+
+    model.to_disk(json_filename, hdf_filename, overwrite=True)
+
+    deserialized_model = Class1BindingPredictor.from_disk(json_filename, hdf_filename)
+    assert deserialized_model.model is not None
+
+    deserialized_predictions = deserialized_model.predict_peptides_ic50(peptides)
+
+    assert np.allclose(original_predictions, deserialized_predictions), (
+        peptides, original_predictions, deserialized_predictions)
+    remove(json_filename)
+    remove(hdf_filename)
+
+if __name__ == "__main__":
+    test_predict_after_saving_model_to_disk()
diff --git a/test/test_training_helpers.py b/test/test_training_helpers.py
new file mode 100644
index 00000000..7e99e30b
--- /dev/null
+++ b/test/test_training_helpers.py
@@ -0,0 +1,22 @@
+from mhcflurry.training_helpers import check_training_data_shapes
+import numpy as np
+from nose.tools import eq_, assert_raises
+
+def test_check_training_data_shapes():
+    X = np.random.randn(40, 2)
+    Y_good = np.random.randn(40)
+    weights_good = np.random.randn(40)
+
+    Y_too_long = np.random.randn(41)
+    weights_empty = np.array([], dtype=float)
+
+    eq_(check_training_data_shapes(X, Y_good, weights_good), X.shape)
+    with assert_raises(ValueError):
+        check_training_data_shapes(X, Y_too_long, weights_good)
+
+    with assert_raises(ValueError):
+        check_training_data_shapes(X, Y_good, weights_empty)
+
+
+if __name__ == "__main__":
+    test_check_training_data_shapes()
-- 
GitLab