From f7039fb4fc685f8d71bf9c0f4b4f8cb543cab35f Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 19 May 2016 15:17:32 -0400
Subject: [PATCH] Use mhcflurry.args argument helpers in mhcflurry-dataset-size-sensitivity

---
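The script previously defined its own copies of several hyperparameter flags; this change routes them through the shared helpers in mhcflurry.args instead. A minimal sketch of how those helpers fit together after the patch (the allele value is just the script's default, and the variable names are illustrative):

    from argparse import ArgumentParser

    from mhcflurry.args import (
        add_imputation_argument_to_parser,
        add_hyperparameter_arguments_to_parser,
        add_training_arguments_to_parser,
        imputer_from_args,
        predictor_from_args,
    )

    parser = ArgumentParser()
    add_imputation_argument_to_parser(parser)
    # adds --kmer-size, --max-ic50 and the other network hyperparameters
    add_hyperparameter_arguments_to_parser(parser)
    # adds --training-epochs, --learning-rate, --batch-size, --random-negative-samples
    add_training_arguments_to_parser(parser)
    args = parser.parse_args()

    imputer = imputer_from_args(args)
    predictor = predictor_from_args(args, allele_name="A0201")
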
 mhcflurry/args.py                            | 22 ++++-
 mhcflurry/feedforward_hyperparameters.py     |  7 +-
 script/mhcflurry-dataset-size-sensitivity.py | 93 ++++++++++----------
 3 files changed, 73 insertions(+), 49 deletions(-)
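Besides the flag plumbing, subsample_performance gets a behavioral fix: np.linspace was called without num, so it always produced numpy's default of 50 subsample sizes and the n_subsample_sizes argument was ignored. The updated code passes num= explicitly and adds one to each size, presumably to keep integer truncation from dropping below the requested minimum. A standalone illustration using the script's new defaults (20 to 2000 samples, 10 sizes):

    import numpy as np

    min_training_samples = 20     # --min-training-samples default
    max_training_samples = 2000   # --max-training-samples default
    n_subsample_sizes = 10        # --number-dataset-sizes default

    log_sizes = np.linspace(np.log(min_training_samples),
                            np.log(max_training_samples),
                            num=n_subsample_sizes)
    sample_sizes = np.exp(log_sizes).astype(int) + 1
    # ten roughly log-spaced integer sizes from about 21 up to about 2000
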

diff --git a/mhcflurry/args.py b/mhcflurry/args.py
index 28fefa3a..a8bf1e5f 100644
--- a/mhcflurry/args.py
+++ b/mhcflurry/args.py
@@ -20,6 +20,7 @@ from .feedforward_hyperparameters import (
     INITIALIZATION_METHOD,
     ACTIVATION,
     DROPOUT_PROBABILITY,
+    BATCH_SIZE,
 )
 from .class1_binding_predictor import Class1BindingPredictor
 from .imputation_helpers import imputer_from_name
@@ -92,11 +93,19 @@ def add_hyperparameter_arguments_to_parser(parser):
         default=DROPOUT_PROBABILITY,
         help="Dropout probability after neural network layers. "
         "Default: %(default)s")
+
     parser.add_argument(
         "--kmer-size",
         type=int,
         default=9,
         help="Size of input vector for neural network")
+
+    parser.add_argument(
+        "--max-ic50",
+        type=float,
+        default=MAX_IC50,
+        help="Largest IC50 value representable as output of neural network")
+
     return parser
 
 def add_training_arguments_to_parser(parser):
@@ -105,22 +114,32 @@ def add_training_arguments_to_parser(parser):
         --training-epochs
         --random-negative-samples
         --learning-rate
+        --batch-size
     """
     parser.add_argument(
         "--random-negative-samples",
         type=int,
         default=0,
         help="Number of random negtive samples to generate each training epoch")
+
     parser.add_argument(
         "--learning-rate",
         type=float,
         default=0.001,
         help="Learning rate for training neural network. Default: %(default)s")
+
     parser.add_argument(
         "--training-epochs",
         type=int,
         default=N_EPOCHS,
         help="Number of training epochs. Default: %(default)s")
+
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=BATCH_SIZE,
+        help="Number of samples in SGD mini-batch")
+
     return parser
 
 def add_arguments_to_parser(parser):
@@ -153,12 +172,13 @@ def predictor_from_args(args, allele_name):
     """
     Given parsed arguments returns a Class1BindingPredictor
     """
+    layer_sizes = (args.hidden_layer_size,) if args.hidden_layer_size > 0 else ()
     return Class1BindingPredictor.from_hyperparameters(
         name=allele_name,
         peptide_length=args.kmer_size,
         max_ic50=args.max_ic50,
         embedding_output_dim=args.embedding_size,
-        layer_sizes=(args.hidden_layer_size,),
+        layer_sizes=layer_sizes,
         activation=args.activation,
         init=args.initialization,
         dropout_probability=args.dropout,
diff --git a/mhcflurry/feedforward_hyperparameters.py b/mhcflurry/feedforward_hyperparameters.py
index 6339acbe..cc5ac576 100644
--- a/mhcflurry/feedforward_hyperparameters.py
+++ b/mhcflurry/feedforward_hyperparameters.py
@@ -26,8 +26,9 @@ EMBEDDING_DIM = 32
 HIDDEN_LAYER_SIZE = 100
 DROPOUT_PROBABILITY = 0.1
 LEARNING_RATE = 0.001
-OPTIMIZER = "adam"
+OPTIMIZER = "rmsprop"
 LOSS = "mse"
+BATCH_SIZE = 32
 
 Params = namedtuple("Params", [
     "activation",
@@ -39,6 +40,7 @@ Params = namedtuple("Params", [
     "loss",
     "optimizer",
     "n_training_epochs",
+    "batch_size",
 ])
 
 default_hyperparameters = Params(
@@ -50,7 +52,8 @@ default_hyperparameters = Params(
     hidden_layer_size=HIDDEN_LAYER_SIZE,
     loss=LOSS,
     optimizer=OPTIMIZER,
-    n_training_epochs=N_EPOCHS)
+    n_training_epochs=N_EPOCHS,
+    batch_size=BATCH_SIZE)
 
 def all_combinations_of_hyperparameters(**kwargs):
 # ensure that all parameters are members of the Params object
diff --git a/script/mhcflurry-dataset-size-sensitivity.py b/script/mhcflurry-dataset-size-sensitivity.py
index 25af080a..001b6bcf 100755
--- a/script/mhcflurry-dataset-size-sensitivity.py
+++ b/script/mhcflurry-dataset-size-sensitivity.py
@@ -26,8 +26,13 @@ import sklearn.metrics
 import seaborn
 
 from mhcflurry.dataset import Dataset
-from mhcflurry.class1_binding_predictor import Class1BindingPredictor
-from mhcflurry.args import add_imputation_argument_to_parser, imputer_from_args
+from mhcflurry.args import (
+    add_imputation_argument_to_parser,
+    add_hyperparameter_arguments_to_parser,
+    add_training_arguments_to_parser,
+    imputer_from_args,
+    predictor_from_args,
+)
 
 parser = ArgumentParser()
 
@@ -39,45 +44,42 @@ parser.add_argument(
     "--allele",
     default="A0201")
 
-parser.add_argument(
-    "--max-ic50",
-    type=float,
-    default=50000.0)
 
 parser.add_argument(
-    "--hidden-layer-size",
+    "--repeat",
     type=int,
-    default=10,
-    help="Hidden layer size for neural network, if 0 use linear regression")
+    default=1,
+    help="How many times to train model for same dataset size")
 
 parser.add_argument(
-    "--embedding-dim",
+    "--number-dataset-sizes",
     type=int,
-    default=50,
-    help="Number of dimensions for vector embedding of amino acids")
+    default=10)
 
 parser.add_argument(
-    "--activation",
-    default="tanh")
-
-parser.add_argument(
-    "--training-epochs",
+    "--min-training-samples",
     type=int,
-    default=100)
+    default=20)
+
 
 parser.add_argument(
-    "--minibatch-size",
+    "--max-training-samples",
     type=int,
-    default=128)
+    default=2000)
 
+"""
 parser.add_argument(
-    "--repeat",
-    type=int,
-    default=10,
-    help="How many times to train model for same dataset size")
+    "--remove-similar-peptides-from-test-data",
+    action="store_true",
+    default=False,
+    help=(
+        "Use a 4 letter reduced amino acid alphabet to identify and "
+        "remove correlated peptides from the test data."))
+"""
 
 add_imputation_argument_to_parser(parser)
-
+add_hyperparameter_arguments_to_parser(parser)
+add_training_arguments_to_parser(parser)
 
 def subsample_performance(
         dataset,
@@ -86,8 +88,8 @@ def subsample_performance(
         imputer=None,
         min_training_samples=20,
         max_training_samples=3000,
-        n_subsample_sizes=5,
-        n_repeats_per_size=3,
+        n_subsample_sizes=10,
+        n_repeats_per_size=1,
         n_training_epochs=200,
         n_random_negative_samples=100,
         batch_size=32):
@@ -105,10 +107,10 @@ def subsample_performance(
     log_min_samples = np.log(min_training_samples)
     log_max_samples = np.log(max_training_samples)
 
-    log_sample_sizes = np.linspace(log_min_samples, log_max_samples)
-    sample_sizes = np.exp(log_sample_sizes).astype(int)
+    log_sample_sizes = np.linspace(log_min_samples, log_max_samples, num=n_subsample_sizes)
+    sample_sizes = np.exp(log_sample_sizes).astype(int) + 1
 
-    for n_train in sample_sizes:
+    for i, n_train in enumerate(sample_sizes):
         for _ in range(n_repeats_per_size):
             if imputer is None:
                 dataset_train, dataset_test = dataset.random_split(n_train)
@@ -120,7 +122,9 @@ def subsample_performance(
                         n_training_samples=n_train,
                         imputation_method=imputer,
                         min_observations_per_peptide=2)
-            print("=== Training model for %s with sample_size = %d/%d" % (
+            print("=== #%d/%d: Training model for %s with sample_size = %d/%d" % (
+                i + 1,
+                len(sample_sizes),
                 allele,
                 n_train,
                 n_total))
@@ -157,10 +161,7 @@ if __name__ == "__main__":
     imputer = imputer_from_args(args)
 
     def make_model():
-        return Class1BindingPredictor.from_hyperparameters(
-            layer_sizes=[args.hidden_layer_size] if args.hidden_layer_size > 0 else [],
-            activation=args.activation,
-            embedding_output_dim=args.embedding_dim)
+        return predictor_from_args(allele_name=args.allele, args=args)
 
     xs, aucs, f1s = subsample_performance(
         dataset=dataset,
@@ -169,7 +170,11 @@ if __name__ == "__main__":
         model_fn=make_model,
         n_repeats_per_size=args.repeat,
         n_training_epochs=args.training_epochs,
-        batch_size=args.minibatch_size)
+        batch_size=args.batch_size,
+        min_training_samples=args.min_training_samples,
+        max_training_samples=args.max_training_samples,
+        n_subsample_sizes=args.number_dataset_sizes,
+        n_random_negative_samples=args.random_negative_samples)
 
     for (name, values) in [("AUC", aucs), ("F1", f1s)]:
         figure = seaborn.plt.figure(figsize=(10, 8))
@@ -184,14 +189,10 @@ if __name__ == "__main__":
             scatter_kws=dict(alpha=0.5, s=50))
         seaborn.plt.xlabel("# samples (subset of %s)" % args.allele)
         seaborn.plt.ylabel(name)
-        if args.hidden_layer_size:
-            filename = "%s-%s-vs-nsamples-hidden-%s-activation-%s.png" % (
-                args.allele,
-                name,
-                args.hidden_layer_size,
-                args.activation)
-        else:
-            filename = "%s-%s-vs-nsamples-linear.png" % (
-                args.allele,
-                name)
+        filename = "%s-%s-vs-nsamples-hidden-%s-activation-%s-impute-%s.png" % (
+            args.allele,
+            name,
+            args.hidden_layer_size,
+            args.activation,
+            args.imputation_method)
         figure.savefig(filename)
-- 
GitLab