From d1aa3d76b1184f387c49e2c20f23950e72d73b93 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Tue, 10 May 2016 19:01:51 -0400
Subject: [PATCH] working on liberating the titration script from the
 experiments directory

---
 experiments/dataset-size-sensitivity.py       | 84 +++++++++++--------
 ...llele_specific_kmer_ic50_predictor_base.py |  7 +-
 mhcflurry/dataset.py                          |  4 +-
 mhcflurry/ic50_predictor_base.py              |  3 +-
 ...rry-train-class1-allele-specific-models.py | 14 ++--
 5 files changed, 63 insertions(+), 49 deletions(-)

diff --git a/experiments/dataset-size-sensitivity.py b/experiments/dataset-size-sensitivity.py
index d6e67a93..4635a13b 100644
--- a/experiments/dataset-size-sensitivity.py
+++ b/experiments/dataset-size-sensitivity.py
@@ -21,12 +21,13 @@ Plot AUC and F1 score of predictors as a function of dataset size
 from argparse import ArgumentParser
 
 import numpy as np
-import mhcflurry
 import sklearn
 import sklearn.metrics
-from sklearn.linear_model import LinearRegression
 import seaborn
 
+from mhcflurry.dataset import Dataset
+from mhcflurry.class1_binding_predictor import Class1BindingPredictor
+
 from dataset_paths import PETERS2009_CSV_PATH
 
 parser = ArgumentParser()
@@ -42,7 +43,7 @@ parser.add_argument(
 parser.add_argument(
     "--max-ic50",
     type=float,
-    default=20000.0)
+    default=50000.0)
 
 parser.add_argument(
     "--hidden-layer-size",
@@ -50,6 +51,12 @@ parser.add_argument(
     default=10,
     help="Hidden layer size for neural network, if 0 use linear regression")
 
+parser.add_argument(
+    "--embedding-dim",
+    type=int,
+    default=50,
+    help="Number of dimensions for vector embedding of amino acids")
+
 parser.add_argument(
     "--activation",
     default="tanh")
@@ -71,35 +78,41 @@ parser.add_argument(
     help="How many times to train model for same dataset size")
 
 
-def binary_encode(X, n_indices=20):
-    n_cols = X.shape[1]
-    X_encode = np.zeros((len(X), n_indices * n_cols), dtype=float)
-    for i in range(len(X)):
-        for col_idx in range(n_cols):
-            X_encode[i, col_idx * n_indices + X[i, col_idx]] = True
-    return X_encode
-
-
 def subsample_performance(
-        X,
-        Y,
-        max_ic50,
+        dataset,
+        allele,
+        pretraining=False,
         model_fn=None,
-        fractions=np.arange(0.01, 1, 0.03),
+        n_training_samples=[
+            25,
+            50,
+            100,
+            200,
+            300,
+            400,
+            500,
+            600,
+            800,
+            1000,
+            1200,
+            1400,
+            1600,
+            1800,
+            2000],
         niters=10,
-        fraction_test=0.2,
-        nb_epoch=50,
+        n_training_epochs=200,
         batch_size=32):
-    n = len(Y)
+    n_total = len(dataset)
+    xs = []
     aucs = []
     f1s = []
+
     for iternum in range(niters):
-        if model_fn is None:
-            model = LinearRegression()
-        else:
-            model = model_fn()
-        initial_weights = model.get_weights()
+
+        model = model_fn()
+        initial_weights = model.get_weights()
+
         mask = np.random.rand(n) > fraction_test
         X_train = X[mask]
         X_test = X[~mask]
@@ -139,22 +152,21 @@
 if __name__ == "__main__":
     args = parser.parse_args()
-    print(args)
-    datasets, _ = mhcflurry.data_helpers.load_data(
-        args.training_csv,
-        binary_encoding=True,
-        flatten_binary_encoding=True,
-        max_ic50=args.max_ic50)
-    dataset = datasets[args.allele]
+    dataset = Dataset.load_csv(args.training_csv)
     X = dataset.X
     Y = dataset.Y
     print("Total # of samples for %s: %d" % (args.allele, len(Y)))
-    if args.hidden_layer_size > 0:
-        model_fn = lambda: mhcflurry.feedforward.make_hotshot_network(
-            layer_sizes=[args.hidden_layer_size],
-            activation=args.activation)
+    if args.hidden_layer_size == 0:
+        hidden_layer_sizes = []
     else:
-        model_fn = None
+        hidden_layer_sizes = [args.hidden_layer_size]
+
+    def make_model():
+        return Class1BindingPredictor.from_hyperparameters(
+            layer_sizes=hidden_layer_sizes,
+            activation=args.activation,
+            embedding_dim=args.embedding_dim)
+
     xs, aucs, f1s = subsample_performance(
         X=X,
         Y=Y,
diff --git a/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py b/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py
index 3a242d8b..5f1886c6 100644
--- a/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py
+++ b/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py
@@ -43,10 +43,12 @@ class Class1AlleleSpecificKmerIC50PredictorBase(IC50PredictorBase):
             verbose,
             max_ic50=MAX_IC50,
             kmer_size=9):
-        self.name = name
-        self.max_ic50 = max_ic50
+        IC50PredictorBase.__init__(
+            self,
+            name=name,
+            max_ic50=max_ic50,
+            verbose=verbose)
         self.allow_unknown_amino_acids = allow_unknown_amino_acids
-        self.verbose = verbose
         self.kmer_size = kmer_size
 
     def __repr__(self):
diff --git a/mhcflurry/dataset.py b/mhcflurry/dataset.py
index d58180d6..ea0dc87d 100644
--- a/mhcflurry/dataset.py
+++ b/mhcflurry/dataset.py
@@ -254,7 +254,9 @@ class Dataset(object):
         if len(column) != len(alleles):
             raise ValueError(
                 "Wrong length for column '%s', expected %d but got %d" % (
-                    column_name, column))
+                    column_name,
+                    len(alleles),
+                    len(column)))
         df[column_name] = np.asarray(column)
         return cls(df)
 
diff --git a/mhcflurry/ic50_predictor_base.py b/mhcflurry/ic50_predictor_base.py
index fe93cfc2..6b32d8ae 100644
--- a/mhcflurry/ic50_predictor_base.py
+++ b/mhcflurry/ic50_predictor_base.py
@@ -32,7 +32,7 @@ class IC50PredictorBase(object):
     def __init__(
             self,
             name,
-            verbose,
+            verbose=False,
             max_ic50=MAX_IC50):
         self.name = name
         self.max_ic50 = max_ic50
@@ -42,7 +42,6 @@
         return "%s(name=%s, max_ic50=%f)" % (
             self.__class__.__name__,
             self.name,
-            self.model,
             self.max_ic50)
 
     def __str__(self):
diff --git a/script/mhcflurry-train-class1-allele-specific-models.py b/script/mhcflurry-train-class1-allele-specific-models.py
index b9225505..81f60782 100755
--- a/script/mhcflurry-train-class1-allele-specific-models.py
+++ b/script/mhcflurry-train-class1-allele-specific-models.py
@@ -167,6 +167,13 @@ if __name__ == "__main__":
             print("-- too few data points, skipping")
             continue
 
+        model.fit_dataset(
+            allele_dataset,
+            pretraining_dataset=imputed_dataset_allele,
+            n_training_epochs=args.training_epochs,
+            n_random_negative_samples=args.random_negative_samples,
+            verbose=True)
+
         if exists(json_path):
             print("-- removing old model description %s" % json_path)
             remove(json_path)
@@ -175,13 +182,6 @@ if __name__ == "__main__":
             print("-- removing old weights file %s" % hdf_path)
             remove(hdf_path)
 
-        model.fit_dataset(
-            allele_dataset,
-            pretraining_dataset=imputed_dataset_allele,
-            n_training_epochs=args.training_epochs,
-            n_random_negative_samples=args.random_negative_samples,
-            verbose=True)
-
         model.to_disk(
             model_json_path=json_path,
             weights_hdf_path=hdf_path,
-- 
GitLab