From f7039fb4fc685f8d71bf9c0f4b4f8cb543cab35f Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 19 May 2016 15:17:32 -0400
Subject: [PATCH] using mhcflurry.args in dataset-size-sensitivity

---
 mhcflurry/args.py                            | 22 ++++-
 mhcflurry/feedforward_hyperparameters.py     |  7 +-
 script/mhcflurry-dataset-size-sensitivity.py | 93 ++++++++++----------
 3 files changed, 73 insertions(+), 49 deletions(-)

diff --git a/mhcflurry/args.py b/mhcflurry/args.py
index 28fefa3a..a8bf1e5f 100644
--- a/mhcflurry/args.py
+++ b/mhcflurry/args.py
@@ -20,6 +20,7 @@ from .feedforward_hyperparameters import (
     INITIALIZATION_METHOD,
     ACTIVATION,
     DROPOUT_PROBABILITY,
+    BATCH_SIZE
 )
 from .class1_binding_predictor import Class1BindingPredictor
 from .imputation_helpers import imputer_from_name
@@ -92,11 +93,19 @@ def add_hyperparameter_arguments_to_parser(parser):
         default=DROPOUT_PROBABILITY,
         help="Dropout probability after neural network layers. "
         "Default: %(default)s")
+
     parser.add_argument(
         "--kmer-size",
         type=int,
         default=9,
         help="Size of input vector for neural network")
+
+    parser.add_argument(
+        "--max-ic50",
+        type=float,
+        default=MAX_IC50,
+        help="Largest IC50 value representable as output of neural network")
+
     return parser
@@ -105,22 +114,32 @@ def add_training_arguments_to_parser(parser):
     """
     --training-epochs
     --random-negative-samples
     --learning-rate
+    --batch-size
     """
     parser.add_argument(
         "--random-negative-samples",
         type=int,
         default=0,
         help="Number of random negative samples to generate each training epoch")
+
     parser.add_argument(
         "--learning-rate",
         type=float,
         default=0.001,
         help="Learning rate for training neural network. Default: %(default)s")
+
     parser.add_argument(
         "--training-epochs",
         type=int,
         default=N_EPOCHS,
         help="Number of training epochs. "
         "Default: %(default)s")
+
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=BATCH_SIZE,
+        help="Number of samples in SGD mini-batch")
+
     return parser
 
 def add_arguments_to_parser(parser):
@@ -153,12 +172,13 @@ def predictor_from_args(args, allele_name):
     """
     Given parsed arguments returns a Class1BindingPredictor
     """
+    layer_sizes = (args.hidden_layer_size,) if args.hidden_layer_size > 0 else ()
     return Class1BindingPredictor.from_hyperparameters(
         name=allele_name,
         peptide_length=args.kmer_size,
         max_ic50=args.max_ic50,
         embedding_output_dim=args.embedding_size,
-        layer_sizes=(args.hidden_layer_size,),
+        layer_sizes=layer_sizes,
         activation=args.activation,
         init=args.initialization,
         dropout_probability=args.dropout,
diff --git a/mhcflurry/feedforward_hyperparameters.py b/mhcflurry/feedforward_hyperparameters.py
index 6339acbe..cc5ac576 100644
--- a/mhcflurry/feedforward_hyperparameters.py
+++ b/mhcflurry/feedforward_hyperparameters.py
@@ -26,8 +26,9 @@ EMBEDDING_DIM = 32
 HIDDEN_LAYER_SIZE = 100
 DROPOUT_PROBABILITY = 0.1
 LEARNING_RATE = 0.001
-OPTIMIZER = "adam"
+OPTIMIZER = "rmsprop"
 LOSS = "mse"
+BATCH_SIZE = 32
 
 Params = namedtuple("Params", [
     "activation",
@@ -39,6 +40,7 @@ Params = namedtuple("Params", [
     "loss",
     "optimizer",
     "n_training_epochs",
+    "batch_size",
 ])
 
 default_hyperparameters = Params(
@@ -50,7 +52,8 @@ default_hyperparameters = Params(
     hidden_layer_size=HIDDEN_LAYER_SIZE,
     loss=LOSS,
     optimizer=OPTIMIZER,
-    n_training_epochs=N_EPOCHS)
+    n_training_epochs=N_EPOCHS,
+    batch_size=BATCH_SIZE)
 
 def all_combinations_of_hyperparameters(**kwargs):
     # ensure that all parameters are members of the Params object
diff --git a/script/mhcflurry-dataset-size-sensitivity.py b/script/mhcflurry-dataset-size-sensitivity.py
index 25af080a..001b6bcf 100755
--- a/script/mhcflurry-dataset-size-sensitivity.py
+++ b/script/mhcflurry-dataset-size-sensitivity.py
@@ -26,8 +26,13 @@ import sklearn.metrics
 import seaborn
 
 from mhcflurry.dataset import Dataset
-from mhcflurry.class1_binding_predictor import Class1BindingPredictor
-from mhcflurry.args import add_imputation_argument_to_parser, imputer_from_args
+from mhcflurry.args import (
+    add_imputation_argument_to_parser,
+    add_hyperparameter_arguments_to_parser,
+    add_training_arguments_to_parser,
+    imputer_from_args,
+    predictor_from_args,
+)
 
 parser = ArgumentParser()
@@ -39,45 +44,42 @@ parser.add_argument(
     "--allele",
     default="A0201")
 
-parser.add_argument(
-    "--max-ic50",
-    type=float,
-    default=50000.0)
 
 parser.add_argument(
-    "--hidden-layer-size",
+    "--repeat",
     type=int,
-    default=10,
-    help="Hidden layer size for neural network, if 0 use linear regression")
+    default=1,
+    help="How many times to train model for same dataset size")
 
 parser.add_argument(
-    "--embedding-dim",
+    "--number-dataset-sizes",
     type=int,
-    default=50,
-    help="Number of dimensions for vector embedding of amino acids")
+    default=10)
 
 parser.add_argument(
-    "--activation",
-    default="tanh")
-
-parser.add_argument(
-    "--training-epochs",
+    "--min-training-samples",
     type=int,
-    default=100)
+    default=20)
+
 parser.add_argument(
-    "--minibatch-size",
+    "--max-training-samples",
     type=int,
-    default=128)
+    default=2000)
 
+"""
 parser.add_argument(
-    "--repeat",
-    type=int,
-    default=10,
-    help="How many times to train model for same dataset size")
+    "--remove-similar-peptides-from-test-data",
+    action="store_true",
+    default=False,
+    help=(
+        "Use a 4 letter reduced amino acid alphabet to identify and "
+        "remove correlated peptides from the test data."))
+"""
 
 add_imputation_argument_to_parser(parser)
-
+add_hyperparameter_arguments_to_parser(parser)
+add_training_arguments_to_parser(parser)
 
 def subsample_performance(
     dataset,
@@ -86,8 +88,8 @@
     imputer=None,
     min_training_samples=20,
     max_training_samples=3000,
-    n_subsample_sizes=5,
-    n_repeats_per_size=3,
+    n_subsample_sizes=10,
+    n_repeats_per_size=1,
     n_training_epochs=200,
     n_random_negative_samples=100,
     batch_size=32):
@@ -105,10 +107,10 @@
     log_min_samples = np.log(min_training_samples)
     log_max_samples = np.log(max_training_samples)
-    log_sample_sizes = np.linspace(log_min_samples, log_max_samples)
-    sample_sizes = np.exp(log_sample_sizes).astype(int)
+    log_sample_sizes = np.linspace(log_min_samples, log_max_samples, num=n_subsample_sizes)
+    sample_sizes = np.exp(log_sample_sizes).astype(int) + 1
 
-    for n_train in sample_sizes:
+    for i, n_train in enumerate(sample_sizes):
         for _ in range(n_repeats_per_size):
             if imputer is None:
                 dataset_train, dataset_test = dataset.random_split(n_train)
@@ -120,7 +122,9 @@
                 n_training_samples=n_train,
                 imputation_method=imputer,
                 min_observations_per_peptide=2)
-            print("=== Training model for %s with sample_size = %d/%d" % (
+            print("=== #%d/%d: Training model for %s with sample_size = %d/%d" % (
+                i + 1,
+                len(sample_sizes),
                 allele,
                 n_train,
                 n_total))
@@ -157,10 +161,7 @@ if __name__ == "__main__":
     imputer = imputer_from_args(args)
 
     def make_model():
-        return Class1BindingPredictor.from_hyperparameters(
-            layer_sizes=[args.hidden_layer_size] if args.hidden_layer_size > 0 else [],
-            activation=args.activation,
-            embedding_output_dim=args.embedding_dim)
+        return predictor_from_args(allele_name=args.allele, args=args)
 
     xs, aucs, f1s = subsample_performance(
         dataset=dataset,
@@ -169,7 +170,11 @@
         model_fn=make_model,
         n_repeats_per_size=args.repeat,
         n_training_epochs=args.training_epochs,
-        batch_size=args.minibatch_size)
+        batch_size=args.batch_size,
+        min_training_samples=args.min_training_samples,
+        max_training_samples=args.max_training_samples,
+        n_subsample_sizes=args.number_dataset_sizes,
+        n_random_negative_samples=args.random_negative_samples)
 
     for (name, values) in [("AUC", aucs), ("F1", f1s)]:
         figure = seaborn.plt.figure(figsize=(10, 8))
@@ -184,14 +189,10 @@
             scatter_kws=dict(alpha=0.5, s=50))
         seaborn.plt.xlabel("# samples (subset of %s)" % args.allele)
         seaborn.plt.ylabel(name)
-        if args.hidden_layer_size:
-            filename = "%s-%s-vs-nsamples-hidden-%s-activation-%s.png" % (
-                args.allele,
-                name,
-                args.hidden_layer_size,
-                args.activation)
-        else:
-            filename = "%s-%s-vs-nsamples-linear.png" % (
-                args.allele,
-                name)
+        filename = "%s-%s-vs-nsamples-hidden-%s-activation-%s-impute-%s.png" % (
+            args.allele,
+            name,
+            args.hidden_layer_size,
+            args.activation,
+            args.imputation_method)
         figure.savefig(filename)
-- 
GitLab
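
As a quick sanity check of how the refactored pieces compose, here is a minimal
sketch of using the helpers this patch touches. It is illustrative only: it
relies just on flag names and defaults visible in the diff, and the flag values
passed to parse_args are arbitrary.

    from argparse import ArgumentParser

    from mhcflurry.args import (
        add_imputation_argument_to_parser,
        add_hyperparameter_arguments_to_parser,
        add_training_arguments_to_parser,
        imputer_from_args,
        predictor_from_args,
    )

    # Build a parser the same way the rewritten script now does.
    parser = ArgumentParser()
    add_imputation_argument_to_parser(parser)
    add_hyperparameter_arguments_to_parser(parser)
    add_training_arguments_to_parser(parser)

    # --batch-size and --max-ic50 are the arguments this patch introduces;
    # the remaining hyperparameters fall back to defaults from mhcflurry.args.
    args = parser.parse_args(["--batch-size", "64", "--max-ic50", "50000"])

    # May be None when no imputation method was requested, matching the
    # `if imputer is None` branch in subsample_performance.
    imputer = imputer_from_args(args)

    # "A0201" is the script's default allele.
    model = predictor_from_args(args, allele_name="A0201")

One consequence of routing the script through predictor_from_args is that
--hidden-layer-size 0 now yields an empty layer_sizes tuple, preserving the
old script's linear-regression fallback.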