Skip to content
Snippets Groups Projects
Commit d1aa3d76 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

working on liberating the titration script from the experiments directory

parent fa353a34
No related branches found
No related tags found
No related merge requests found
......@@ -21,12 +21,13 @@ Plot AUC and F1 score of predictors as a function of dataset size
from argparse import ArgumentParser
import numpy as np
import mhcflurry
import sklearn
import sklearn.metrics
from sklearn.linear_model import LinearRegression
import seaborn
from mhcflurry.dataset import Dataset
from mhcflurry.class1_binding_predictor import Class1BindingPredictor
from dataset_paths import PETERS2009_CSV_PATH
parser = ArgumentParser()
......@@ -42,7 +43,7 @@ parser.add_argument(
parser.add_argument(
"--max-ic50",
type=float,
default=20000.0)
default=50000.0)
parser.add_argument(
"--hidden-layer-size",
......@@ -50,6 +51,12 @@ parser.add_argument(
default=10,
help="Hidden layer size for neural network, if 0 use linear regression")
parser.add_argument(
"--embedding-dim",
type=int,
default=50,
help="Number of dimensions for vector embedding of amino acids")
parser.add_argument(
"--activation",
default="tanh")
......@@ -71,35 +78,41 @@ parser.add_argument(
help="How many times to train model for same dataset size")
def binary_encode(X, n_indices=20):
    """One-hot encode a matrix of categorical integer indices.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_cols)
        Integer category indices, each expected in [0, n_indices)
        (amino acid indices for peptide positions).
    n_indices : int
        Size of the category alphabet (20 amino acids by default).

    Returns
    -------
    ndarray of shape (n_samples, n_indices * n_cols), dtype float
        Each input column expands into an n_indices-wide one-hot span,
        with a 1.0 at offset col * n_indices + X[row, col].
    """
    n_samples, n_cols = X.shape
    X_encode = np.zeros((n_samples, n_indices * n_cols), dtype=float)
    # Vectorized one-hot assignment: fancy indexing sets, for every row i
    # and column j, position j * n_indices + X[i, j] — replacing the
    # original pair of Python loops with a single C-level operation.
    row_idx = np.arange(n_samples)[:, None]
    col_idx = np.arange(n_cols)[None, :] * n_indices + X
    X_encode[row_idx, col_idx] = 1.0
    return X_encode
def subsample_performance(
X,
Y,
max_ic50,
dataset,
allele,
pretraining=False,
model_fn=None,
fractions=np.arange(0.01, 1, 0.03),
n_training_samples=[
25,
50,
100,
200,
300,
400,
500,
600,
800,
1000,
1200,
1400,
1600,
1800,
2000],
niters=10,
fraction_test=0.2,
nb_epoch=50,
n_training_epochs=200,
batch_size=32):
n = len(Y)
n_total = len(dataset)
xs = []
aucs = []
f1s = []
for iternum in range(niters):
if model_fn is None:
model = LinearRegression()
else:
model = model_fn()
initial_weights = model.get_weights()
model = model_fn()
initial_weights = model.get_weights()
mask = np.random.rand(n) > fraction_test
X_train = X[mask]
X_test = X[~mask]
......@@ -139,22 +152,21 @@ def subsample_performance(
if __name__ == "__main__":
args = parser.parse_args()
print(args)
datasets, _ = mhcflurry.data_helpers.load_data(
args.training_csv,
binary_encoding=True,
flatten_binary_encoding=True,
max_ic50=args.max_ic50)
dataset = datasets[args.allele]
dataset = Dataset.load_csv(args.training_csv)
X = dataset.X
Y = dataset.Y
print("Total # of samples for %s: %d" % (args.allele, len(Y)))
if args.hidden_layer_size > 0:
model_fn = lambda: mhcflurry.feedforward.make_hotshot_network(
layer_sizes=[args.hidden_layer_size],
activation=args.activation)
if args.hidden_layer_size == 0:
hidden_layer_sizes = []
else:
model_fn = None
hidden_layer_sizes = [args.hidden_layer_size]
def make_model():
    """Build a fresh Class1BindingPredictor from the CLI hyperparameters.

    Uses the module-level `hidden_layer_sizes` (empty list when
    --hidden-layer-size is 0, i.e. no hidden layer) rather than always
    wrapping args.hidden_layer_size in a list — otherwise the size-0
    "linear regression" case computed above would be silently ignored
    and a [0]-sized hidden layer passed instead.
    """
    return Class1BindingPredictor.from_hyperparameters(
        layer_sizes=hidden_layer_sizes,
        activation=args.activation,
        embedding_dim=args.embedding_dim)
xs, aucs, f1s = subsample_performance(
X=X,
Y=Y,
......
......@@ -43,10 +43,11 @@ class Class1AlleleSpecificKmerIC50PredictorBase(IC50PredictorBase):
verbose,
max_ic50=MAX_IC50,
kmer_size=9):
self.name = name
self.max_ic50 = max_ic50
IC50PredictorBase.__init__(
name=name,
max_ic50=max_ic50,
verbose=verbose)
self.allow_unknown_amino_acids = allow_unknown_amino_acids
self.verbose = verbose
self.kmer_size = kmer_size
def __repr__(self):
......
......@@ -254,7 +254,9 @@ class Dataset(object):
if len(column) != len(alleles):
raise ValueError(
"Wrong length for column '%s', expected %d but got %d" % (
column_name, column))
column_name,
len(alleles),
len(column)))
df[column_name] = np.asarray(column)
return cls(df)
......
......@@ -32,7 +32,7 @@ class IC50PredictorBase(object):
def __init__(
self,
name,
verbose,
verbose=False,
max_ic50=MAX_IC50):
self.name = name
self.max_ic50 = max_ic50
......@@ -42,7 +42,6 @@ class IC50PredictorBase(object):
return "%s(name=%s, max_ic50=%f)" % (
self.__class__.__name__,
self.name,
self.model,
self.max_ic50)
def __str__(self):
......
......@@ -167,6 +167,13 @@ if __name__ == "__main__":
print("-- too few data points, skipping")
continue
model.fit_dataset(
allele_dataset,
pretraining_dataset=imputed_dataset_allele,
n_training_epochs=args.training_epochs,
n_random_negative_samples=args.random_negative_samples,
verbose=True)
if exists(json_path):
print("-- removing old model description %s" % json_path)
remove(json_path)
......@@ -175,13 +182,6 @@ if __name__ == "__main__":
print("-- removing old weights file %s" % hdf_path)
remove(hdf_path)
model.fit_dataset(
allele_dataset,
pretraining_dataset=imputed_dataset_allele,
n_training_epochs=args.training_epochs,
n_random_negative_samples=args.random_negative_samples,
verbose=True)
model.to_disk(
model_json_path=json_path,
weights_hdf_path=hdf_path,
......
0% — Loading, or try again.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment