Commit f7039fb4 authored by Alex Rubinsteyn

using mhcflurry.args in dataset-size-sensitivity

parent 5e820f66
@@ -20,6 +20,7 @@ from .feedforward_hyperparameters import (
INITIALIZATION_METHOD,
ACTIVATION,
DROPOUT_PROBABILITY,
BATCH_SIZE
)
from .class1_binding_predictor import Class1BindingPredictor
from .imputation_helpers import imputer_from_name
@@ -92,11 +93,19 @@ def add_hyperparameter_arguments_to_parser(parser):
default=DROPOUT_PROBABILITY,
help="Dropout probability after neural network layers. "
"Default: %(default)s")
parser.add_argument(
"--kmer-size",
type=int,
default=9,
help="Size of input vector for neural network")
parser.add_argument(
"--max-ic50",
type=float,
default=MAX_IC50,
help="Largest IC50 value representable as output of neural network")
return parser
def add_training_arguments_to_parser(parser):
@@ -105,22 +114,32 @@ def add_training_arguments_to_parser(parser):
--training-epochs
--random-negative-samples
--learning-rate
--batch-size
"""
parser.add_argument(
"--random-negative-samples",
type=int,
default=0,
help="Number of random negtive samples to generate each training epoch")
parser.add_argument(
"--learning-rate",
type=float,
default=0.001,
help="Learning rate for training neural network. Default: %(default)s")
parser.add_argument(
"--training-epochs",
type=int,
default=N_EPOCHS,
help="Number of training epochs. Default: %(default)s")
parser.add_argument(
"--batch-size",
type=int,
default=BATCH_SIZE,
help="Number of samples in SGD mini-batch")
return parser
def add_arguments_to_parser(parser):
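A minimal usage sketch (editor's addition, not part of this commit) of how the two helpers above compose onto one ArgumentParser; the flag names and defaults are taken from the diff itself:

from argparse import ArgumentParser
from mhcflurry.args import (
    add_hyperparameter_arguments_to_parser,
    add_training_arguments_to_parser,
)

parser = ArgumentParser()
add_hyperparameter_arguments_to_parser(parser)  # adds --kmer-size, --max-ic50, ...
add_training_arguments_to_parser(parser)        # adds --batch-size, --training-epochs, ...

args = parser.parse_args(["--kmer-size", "9", "--batch-size", "64"])
print(args.kmer_size, args.batch_size)  # 9 64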
@@ -153,12 +172,13 @@ def predictor_from_args(args, allele_name):
"""
Given parsed arguments returns a Class1BindingPredictor
"""
layer_sizes = (args.hidden_layer_size,) if args.hidden_layer_size > 0 else ()
return Class1BindingPredictor.from_hyperparameters(
name=allele_name,
peptide_length=args.kmer_size,
max_ic50=args.max_ic50,
embedding_output_dim=args.embedding_size,
layer_sizes=(args.hidden_layer_size,),
layer_sizes=layer_sizes,
activation=args.activation,
init=args.initialization,
dropout_probability=args.dropout,
......
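The layer_sizes fallback added above means a hidden layer size of 0 drops the hidden layer entirely, so the predictor degenerates to a linear model, matching the "if 0 use linear regression" behavior of the old script flag. A sketch of just that logic:

# hidden_layer_size == 0 yields an empty tuple, i.e. no hidden layer
def layer_sizes_for(hidden_layer_size):
    return (hidden_layer_size,) if hidden_layer_size > 0 else ()

assert layer_sizes_for(100) == (100,)
assert layer_sizes_for(0) == ()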
@@ -26,8 +26,9 @@ EMBEDDING_DIM = 32
HIDDEN_LAYER_SIZE = 100
DROPOUT_PROBABILITY = 0.1
LEARNING_RATE = 0.001
OPTIMIZER = "adam"
OPTIMIZER = "rmsprop"
LOSS = "mse"
BATCH_SIZE = 32
Params = namedtuple("Params", [
"activation",
@@ -39,6 +40,7 @@ Params = namedtuple("Params", [
"loss",
"optimizer",
"n_training_epochs",
"batch_size",
])
default_hyperparameters = Params(
@@ -50,7 +52,8 @@ default_hyperparameters = Params(
hidden_layer_size=HIDDEN_LAYER_SIZE,
loss=LOSS,
optimizer=OPTIMIZER,
n_training_epochs=N_EPOCHS)
n_training_epochs=N_EPOCHS,
batch_size=BATCH_SIZE)
def all_combinations_of_hyperparameters(**kwargs):
# ensure that all parameters are members of the Params object
......
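The body of all_combinations_of_hyperparameters is truncated above. A hypothetical sketch of the idea its name and comment describe (an assumption, not the project's actual implementation), expanding keyword lists into a grid of Params via namedtuple._replace:

from itertools import product

def hyperparameter_grid(**kwargs):
    # every keyword must name a field of Params; each value is a list of candidates
    keys = list(kwargs)
    for values in product(*(kwargs[k] for k in keys)):
        yield default_hyperparameters._replace(**dict(zip(keys, values)))

# e.g. hyperparameter_grid(batch_size=[16, 32], optimizer=["adam", "rmsprop"])
# yields the four combinations of those two settings.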
@@ -26,8 +26,13 @@ import sklearn.metrics
import seaborn
from mhcflurry.dataset import Dataset
from mhcflurry.class1_binding_predictor import Class1BindingPredictor
from mhcflurry.args import add_imputation_argument_to_parser, imputer_from_args
from mhcflurry.args import (
add_imputation_argument_to_parser,
add_hyperparameter_arguments_to_parser,
add_training_arguments_to_parser,
imputer_from_args,
predictor_from_args,
)
parser = ArgumentParser()
@@ -39,45 +44,42 @@ parser.add_argument(
"--allele",
default="A0201")
parser.add_argument(
"--max-ic50",
type=float,
default=50000.0)
parser.add_argument(
"--hidden-layer-size",
"--repeat",
type=int,
default=10,
help="Hidden layer size for neural network, if 0 use linear regression")
default=1,
help="How many times to train model for same dataset size")
parser.add_argument(
"--embedding-dim",
"--number-dataset-sizes",
type=int,
default=50,
help="Number of dimensions for vector embedding of amino acids")
default=10)
parser.add_argument(
"--activation",
default="tanh")
parser.add_argument(
"--training-epochs",
"--min-training-samples",
type=int,
default=100)
default=20)
parser.add_argument(
"--minibatch-size",
"--max-training-samples",
type=int,
default=128)
default=2000)
"""
parser.add_argument(
"--repeat",
type=int,
default=10,
help="How many times to train model for same dataset size")
"--remove-similar-peptides-from-test-data",
action="store_true",
default=False,
help=(
"Use a 4 letter reduced amino acid alphabet to identify and "
"remove correlated peptides from the test data."))
"""
add_imputation_argument_to_parser(parser)
add_hyperparameter_arguments_to_parser(parser)
add_training_arguments_to_parser(parser)
def subsample_performance(
dataset,
@@ -86,8 +88,8 @@ def subsample_performance(
imputer=None,
min_training_samples=20,
max_training_samples=3000,
n_subsample_sizes=5,
n_repeats_per_size=3,
n_subsample_sizes=10,
n_repeats_per_size=1,
n_training_epochs=200,
n_random_negative_samples=100,
batch_size=32):
@@ -105,10 +107,10 @@
log_min_samples = np.log(min_training_samples)
log_max_samples = np.log(max_training_samples)
log_sample_sizes = np.linspace(log_min_samples, log_max_samples)
sample_sizes = np.exp(log_sample_sizes).astype(int)
log_sample_sizes = np.linspace(log_min_samples, log_max_samples, num=n_subsample_sizes)
sample_sizes = np.exp(log_sample_sizes).astype(int) + 1
for n_train in sample_sizes:
for i, n_train in enumerate(sample_sizes):
for _ in range(n_repeats_per_size):
if imputer is None:
dataset_train, dataset_test = dataset.random_split(n_train)
@@ -120,7 +122,9 @@
n_training_samples=n_train,
imputation_method=imputer,
min_observations_per_peptide=2)
print("=== Training model for %s with sample_size = %d/%d" % (
print("=== #%d/%d: Training model for %s with sample_size = %d/%d" % (
i + 1,
len(sample_sizes),
allele,
n_train,
n_total))
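The rewritten sampling logic above spaces the subsample sizes evenly in log space, so they grow geometrically rather than linearly. It also fixes a bug: without num=, np.linspace defaults to 50 points and n_subsample_sizes was silently ignored. The "+ 1" nudges each size up after astype(int) floors it. A small illustration (editor's addition; printed values approximate):

import numpy as np

log_sizes = np.linspace(np.log(20), np.log(2000), num=5)
sizes = np.exp(log_sizes).astype(int) + 1
print(sizes)  # approximately [21, 64, 201, 633, 2001]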
@@ -157,10 +161,7 @@ if __name__ == "__main__":
imputer = imputer_from_args(args)
def make_model():
return Class1BindingPredictor.from_hyperparameters(
layer_sizes=[args.hidden_layer_size] if args.hidden_layer_size > 0 else [],
activation=args.activation,
embedding_output_dim=args.embedding_dim)
return predictor_from_args(allele_name=args.allele, args=args)
xs, aucs, f1s = subsample_performance(
dataset=dataset,
@@ -169,7 +170,11 @@
model_fn=make_model,
n_repeats_per_size=args.repeat,
n_training_epochs=args.training_epochs,
batch_size=args.minibatch_size)
batch_size=args.batch_size,
min_training_samples=args.min_training_samples,
max_training_samples=args.max_training_samples,
n_subsample_sizes=args.number_dataset_sizes,
n_random_negative_samples=args.random_negative_samples)
for (name, values) in [("AUC", aucs), ("F1", f1s)]:
figure = seaborn.plt.figure(figsize=(10, 8))
@@ -184,14 +189,10 @@
scatter_kws=dict(alpha=0.5, s=50))
seaborn.plt.xlabel("# samples (subset of %s)" % args.allele)
seaborn.plt.ylabel(name)
if args.hidden_layer_size:
filename = "%s-%s-vs-nsamples-hidden-%s-activation-%s.png" % (
args.allele,
name,
args.hidden_layer_size,
args.activation)
else:
filename = "%s-%s-vs-nsamples-linear.png" % (
args.allele,
name)
filename = "%s-%s-vs-nsamples-hidden-%s-activation-%s-impute-%s.png" % (
args.allele,
name,
args.hidden_layer_size,
args.activation,
args.imputation_method)
figure.savefig(filename)
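With hypothetical values (allele A0201, metric AUC, hidden layer size 100, tanh activation, an imputation method named "mice"), the new pattern above produces a name like:

filename = "%s-%s-vs-nsamples-hidden-%s-activation-%s-impute-%s.png" % (
    "A0201", "AUC", 100, "tanh", "mice")
print(filename)  # A0201-AUC-vs-nsamples-hidden-100-activation-tanh-impute-mice.png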