Commit e3e6f5eb authored by Alex Rubinsteyn

pulling together more bits of isolated training scripts into helpers file

parent 70af16e2
@@ -29,6 +29,7 @@ from mhcflurry.data import (
 from arg_parsing import parse_int_list, parse_float_list
 from dataset_paths import PETERS2009_CSV_PATH
 from common import load_csv_binding_data_as_dict
+from training_helpers import create_and_evaluate_model_with_synthetic_data

 parser = argparse.ArgumentParser()
@@ -159,6 +160,25 @@ def data_augmentation(
     return aucs, f1s, n_originals

+def rescale_ic50(ic50, max_ic50):
+    log_ic50 = np.log(ic50) / np.log(max_ic50)
+    return max(0.0, min(1.0, 1.0 - log_ic50))
+
+
+def load_synthetic_data(csv_path, max_ic50):
+    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
+        csv_path)
+    return {
+        allele: {
+            peptide: rescale_ic50(ic50, max_ic50=max_ic50)
+            for (peptide, ic50)
+            in allele_dict.items()
+        }
+        for (allele, allele_dict)
+        in synthetic_allele_to_peptide_to_ic50_dict.items()
+    }
+
 if __name__ == "__main__":
     args = parser.parse_args()
     print(args)
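For reference, the rescale_ic50 helper added above maps a raw IC50 (nM) onto a regression target in [0, 1] as 1 - log(ic50)/log(max_ic50), clipped at both ends, so a smaller IC50 (stronger binding) gives a larger target. (Note the original hunk read np.log(args.max_ic50) inside the helper; it is corrected to the max_ic50 parameter above, since args is only defined under __main__.) A minimal standalone check, assuming the 50000 nM max_ic50 default that appears later in this commit:

import numpy as np

# Mirrors the helper added in the hunk above.
def rescale_ic50(ic50, max_ic50):
    log_ic50 = np.log(ic50) / np.log(max_ic50)
    return max(0.0, min(1.0, 1.0 - log_ic50))

print(rescale_ic50(1.0, 50000.0))      # 1.0: strongest binder maps to the top of the scale
print(rescale_ic50(500.0, 50000.0))    # ~0.426: the conventional 500 nM binding cutoff
print(rescale_ic50(50000.0, 50000.0))  # 0.0: anything at or above max_ic50 clips to zero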
@@ -169,70 +189,39 @@ if __name__ == "__main__":
         max_ic50=args.max_ic50,
         only_human=False)
     print("Loading synthetic data from %s" % args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
-        args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_y_dict = {
-        allele: {
-            peptide: max(
-                0.0,
-                min(
-                    1.0,
-                    1.0 - np.log(ic50) / np.log(args.max_ic50)))
-            for (peptide, ic50)
-            in allele_dict.items()
-        }
-        for (allele, allele_dict)
-        in synthetic_allele_to_peptide_to_ic50_dict.items()
-    }
+    synthetic_affinities = load_synthetic_data(
+        csv_path=args.synthetic_data_csv,
+        max_ic50=args.max_ic50)
     combined_allele_set = set(allele_datasets.keys()).union(
-        synthetic_allele_to_peptide_to_y_dict.keys())
+        synthetic_affinities.keys())
     combined_allele_list = list(sorted(combined_allele_set))
     for allele in combined_allele_list:
-        actual_dataset = allele_datasets[allele]
-        X_actual = actual_dataset.X_index
-        weights_actual = actual_dataset.weights
-        Y_actual = actual_dataset.Y
-        synthetic_dict = synthetic_allele_to_peptide_to_y_dict[allele]
-        _, _, C_synth, X_synth, _, Y_synth = encode_peptide_to_affinity_dict(
-            synthetic_dict)
-        n_actual_samples, n_actual_dims = X_actual.shape
-        n_synth_samples, n_synth_dims = X_synth.shape
-        assert n_actual_dims == n_synth_dims, \
-            "Mismatch between # of actual dims %d and synthetic dims %d" % (
-                n_actual_dims, n_synth_dims)
-        print("-- Using %d actual samples and %d synthetic samples for %s" % (
-            n_actual_samples, n_synth_samples, allele))
-        X = np.vstack([X_actual, X_synth])
-        print("-- X.shape = %s, dtype = %s" % (X.shape, X.dtype))
-        n_samples = n_actual_samples + n_synth_samples
-        assert X.shape[0] == n_samples, \
-            "Expected %d samples but got data array with shape %s" % (
-                n_actual_samples + n_synth_samples, X.shape)
-        Y = np.concatenate([Y_actual, Y_synth])
-        print("-- Y.shape = %s, dtype = %s" % (Y.shape, Y.dtype))
-        assert Y.min() >= 0, \
-            "Y should not contain negative numbers! Y.min() = %f" % (Y.min(),)
-        assert Y.max() <= 1, \
-            "Y should have max value 1.0, got Y.max() = %f" % (Y.max(),)
-        weights_synth = 1.0 / C_synth
-        weights = np.concatenate([weights_actual, weights_synth])
-        assert len(weights) == n_samples
-        print("-- weights.shape = %s, dtype = %s" % (
-            weights.shape, weights.dtype))
+        synthetic_allele_dict = synthetic_affinities[allele]
+        (_, _, Counts_synth, X_synth, _, Y_synth) = \
+            encode_peptide_to_affinity_dict(synthetic_allele_dict)
+        synthetic_sample_weights = 1.0 / Counts_synth
         scores = {}
         for dropout in args.dropouts:
             for embedding_dim_size in args.embedding_dim_sizes:
                 for hidden_layer_size in args.hidden_layer_sizes:
                     params = (
-                        ("dropout", dropout),
+                        ("dropout_probability", dropout),
                         ("embedding_dim_size", embedding_dim_size),
                         ("hidden_layer_size", hidden_layer_size),
                     )
-                    tau, auc, f1 = evaluate_model(**dict(params))
+                    tau, auc, f1 = create_and_evaluate_model_with_synthetic_data(
+                        X_original=allele_datasets[allele].X_index,
+                        Y_original=allele_datasets[allele].Y,
+                        X_synth=X_synth,
+                        Y_synth=Y_synth,
+                        original_sample_weights=allele_datasets[allele].weights,
+                        synthetic_sample_weights=synthetic_sample_weights,
+                        n_training_epochs=150,
+                        max_ic50=args.max_ic50,
+                        **dict(params))
                     scores[params] = (tau, auc, f1)
                     print("%s => tau=%f, AUC=%f, F1=%f" % (
                         params,
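A side note on the sweep above: params is built as a tuple of (name, value) pairs rather than a dict so it can serve as a hashable key into scores. The three nested loops could equivalently be flattened with itertools.product; a sketch with made-up example values (itertools.product is not used in this commit):

from itertools import product

# Example values; the script reads these lists from command-line args.
dropouts = [0.0, 0.25]
embedding_dim_sizes = [16, 32]
hidden_layer_sizes = [50, 100]

scores = {}
for dropout, embedding_dim, hidden_size in product(
        dropouts, embedding_dim_sizes, hidden_layer_sizes):
    # A tuple of pairs is hashable, so it can key the scores dict directly.
    params = (
        ("dropout_probability", dropout),
        ("embedding_dim_size", embedding_dim),
        ("hidden_layer_size", hidden_size),
    )
    scores[params] = None  # stand-in for (tau, auc, f1)
print(len(scores))  # 8 combinations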
...
@@ -128,29 +128,51 @@ def train_model_and_return_scores(
     return (accuracy, auc, f1_score)

-def create_and_evaluate_model_with_synthetic_data(
-        X,
-        Y,
-        weights_synth,
-        weights_actual,
+def train_model_with_synthetic_data(
+        model,
         n_training_epochs,
-        dropout,
-        embedding_dim_size,
-        hidden_layer_size):
-    model = mhcflurry.feedforward.make_embedding_network(
-        peptide_length=9,
-        embedding_input_dim=20,
-        embedding_output_dim=4,
-        layer_sizes=[4],
-        activation="tanh",
-        init="lecun_uniform",
-        loss="mse",
-        output_activation="sigmoid",
-        dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001)
-    total_synth_weights = weights_synth.sum()
-    total_actual_weights = weights_actual.sum()
+        max_ic50,
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights,
+        synthetic_sample_weights):
+    total_synth_weights = synthetic_sample_weights.sum()
+    total_original_weights = original_sample_weights.sum()
+    print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % (
+        np.mean(Y_original),
+        np.mean(Y_synth),
+        np.mean(original_sample_weights),
+        np.mean(synthetic_sample_weights)))
+    n_actual_samples, n_actual_dims = X_original.shape
+    n_synth_samples, n_synth_dims = X_synth.shape
+    assert n_actual_dims == n_synth_dims, \
+        "Mismatch between # of actual dims %d and synthetic dims %d" % (
+            n_actual_dims, n_synth_dims)
+    X_combined = np.vstack([X_original, X_synth])
+    n_combined_samples = n_actual_samples + n_synth_samples
+    assert X_combined.shape[0] == n_combined_samples, \
+        "Expected %d samples but got data array with shape %s" % (
+            n_actual_samples + n_synth_samples, X_combined.shape)
+    Y_combined = np.concatenate([Y_original, Y_synth])
+    assert Y_combined.min() >= 0, \
+        "Y should not contain negative numbers! Y.min() = %f" % (
+            Y_combined.min(),)
+    assert Y_combined.max() <= 1, \
+        "Y should have max value 1.0, got Y.max() = %f" % (
+            Y_combined.max(),)
+    combined_weights = np.concatenate([
+        original_sample_weights,
+        synthetic_sample_weights
+    ])
+    assert len(combined_weights) == n_combined_samples
     for epoch in range(n_training_epochs):
         # weights for synthetic points can be shrunk as:
         #   ~ 1 / (1+epoch)**2
@@ -160,20 +182,69 @@ def create_and_evaluate_model_with_synthetic_data(
         # if the contribution of synthetic samples is less than a
         # thousandth of the actual data, then stop using it
         synth_contribution = total_synth_weights * decay_factor
-        if synth_contribution < total_actual_weights / 1000:
+        if synth_contribution < total_original_weights / 1000:
             print("Epoch %d, using only actual data" % (epoch + 1,))
             model.fit(
-                X_actual,
-                Y_actual,
-                sample_weight=weights_actual,
-                nb_epoch=1)
+                X_original,
+                Y_original,
+                sample_weight=original_sample_weights,
+                nb_epoch=1,
+                verbose=0)
         else:
             print("Epoch %d, synth decay factor = %f" % (
                 epoch + 1, decay_factor))
-            weights[n_actual_samples:] = weights_synth * decay_factor
-            model.fit(X, Y, sample_weight=weights, nb_epoch=1)
-        Y_pred = model.predict(X_actual)
-        print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean())
+            combined_weights[n_actual_samples:] = (
+                synthetic_sample_weights * decay_factor)
+            model.fit(
+                X_combined,
+                Y_combined,
+                sample_weight=combined_weights,
+                nb_epoch=1,
+                verbose=0)
+        Y_pred = model.predict(X_original)
+        training_mse = ((Y_original - Y_pred) ** 2).mean()
+        print(
+            "-- Epoch %d/%d Training MSE %0.4f" % (
+                epoch + 1,
+                n_training_epochs,
+                training_mse))
+
+
+def create_and_evaluate_model_with_synthetic_data(
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights=None,
+        synthetic_sample_weights=None,
+        n_training_epochs=150,
+        embedding_dim_size=16,
+        hidden_layer_size=50,
+        dropout_probability=0.0,
+        max_ic50=50000.0):
+    if original_sample_weights is None:
+        original_sample_weights = np.ones(len(X_original), dtype=float)
+    if synthetic_sample_weights is None:
+        synthetic_sample_weights = np.ones(len(X_synth), dtype=float)
+    model = mhcflurry.feedforward.make_embedding_network(
+        peptide_length=9,
+        embedding_input_dim=20,
+        embedding_output_dim=embedding_dim_size,
+        layer_sizes=[hidden_layer_size],
+        activation="tanh",
+        init="lecun_uniform",
+        loss="mse",
+        output_activation="sigmoid",
+        dropout_probability=dropout_probability,
+        optimizer=None,
+        learning_rate=0.001)
+    train_model_with_synthetic_data(
+        model=model,
+        n_training_epochs=n_training_epochs,
+        max_ic50=max_ic50,
+        X_original=X_original,
+        Y_original=Y_original,
+        X_synth=X_synth,
+        Y_synth=Y_synth,
+        original_sample_weights=original_sample_weights,
+        synthetic_sample_weights=synthetic_sample_weights)
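With the pieces collected into the helpers file, a caller can go from encoded arrays to a trained model in one call. A hypothetical invocation with toy data (shapes and values are illustrative only; the evaluation step that produces the tau/AUC/F1 scores unpacked by the calling script is not visible in the hunks shown here):

import numpy as np
from training_helpers import create_and_evaluate_model_with_synthetic_data

# Toy stand-ins for index-encoded 9-mer peptides (20 amino acid indices)
# and rescaled [0, 1] affinity targets.
X_original = np.random.randint(0, 20, size=(200, 9))
Y_original = np.random.uniform(0.0, 1.0, size=200)
X_synth = np.random.randint(0, 20, size=(1000, 9))
Y_synth = np.random.uniform(0.0, 1.0, size=1000)

# Sample weights default to all-ones when omitted.
create_and_evaluate_model_with_synthetic_data(
    X_original=X_original,
    Y_original=Y_original,
    X_synth=X_synth,
    Y_synth=Y_synth,
    n_training_epochs=10,
    embedding_dim_size=16,
    hidden_layer_size=50,
    dropout_probability=0.25,
    max_ic50=50000.0)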