Skip to content
Snippets Groups Projects
Commit 5aa591b0 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

split up CV code for checking best smoothing coefs for first type of data synthesis

parent 2142e440
No related branches found
No related tags found
No related merge requests found
...@@ -25,6 +25,7 @@ averaged across all given alleles. ...@@ -25,6 +25,7 @@ averaged across all given alleles.
""" """
import argparse import argparse
from collections import defaultdict
from mhcflurry.data import load_allele_dicts from mhcflurry.data import load_allele_dicts
from scipy import stats from scipy import stats
...@@ -69,43 +70,32 @@ parser.add_argument( ...@@ -69,43 +70,32 @@ parser.add_argument(
help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent") help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
def evaluate_synthetic_data( def generate_cross_validation_datasets(
allele_to_peptide_to_affinity, allele_to_peptide_to_affinity,
smoothing_coef,
exponent,
max_ic50,
n_folds=4): n_folds=4):
""" for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
Use cross-validation over entries of each allele to determine the predictive
accuracy of data synthesis on known affinities.
"""
taus = []
f1_scores = []
aucs = []
peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
allele_to_peptide_to_affinity)
for allele, dataset in allele_to_peptide_to_affinity.items():
peptides = list(dataset.keys()) peptides = list(dataset.keys())
affinities = list(dataset.values()) affinities = list(dataset.values())
all_indices = np.arange(len(peptides))
np.random.shuffle(all_indices)
n_samples = len(peptides) n_samples = len(peptides)
cv_aucs = [] print("Generating similarities for folds of %s data (n=%d)" % (
cv_taus = [] allele,
cv_f1_scores = [] n_samples))
if len(peptides) < n_folds: if n_samples < n_folds:
print("Too few samples (%d) for %d-fold cross-validation" % (
n_samples,
n_folds))
continue continue
kfold_iterator = enumerate( kfold_iterator = enumerate(
KFold(n_samples, n_folds=n_folds, shuffle=True)) KFold(n_samples, n_folds=n_folds, shuffle=True))
for fold_number, (train_indices, test_indices) in kfold_iterator: for fold_number, (train_indices, test_indices) in kfold_iterator:
print("-- Fold %d for %s" % (fold_number + 1, allele))
train_peptides = [peptides[i] for i in train_indices] train_peptides = [peptides[i] for i in train_indices]
train_affinities = [affinities[i] for i in train_indices] train_affinities = [affinities[i] for i in train_indices]
test_peptide_set = set([peptides[i] for i in test_indices]) test_peptide_set = set([peptides[i] for i in test_indices])
print("-- Test peptides = %s" % (list(test_peptide_set),))
# copy the affinity data for all alleles other than this one # copy the affinity data for all alleles other than this one
fold_affinity_dict = { fold_affinity_dict = {
allele_key: affinity_dict allele_key: affinity_dict
...@@ -125,97 +115,126 @@ def evaluate_synthetic_data( ...@@ -125,97 +115,126 @@ def evaluate_synthetic_data(
allele_to_peptide_to_affinity=fold_affinity_dict, allele_to_peptide_to_affinity=fold_affinity_dict,
min_weight=0.1) min_weight=0.1)
this_allele_similarities = allele_similarities[allele] this_allele_similarities = allele_similarities[allele]
# create a peptide -> list[(allele, affinity, weight)] dictionary
# restricted only to the peptides for which we want to test accuracy
test_reverse_lookup = {
peptide: triplets
for (peptide, triplets) in peptide_to_affinities.items()
if peptide in test_peptide_set
}
synthetic_values = synthesize_affinities_for_single_allele(
similarities=this_allele_similarities,
peptide_to_affinities=test_reverse_lookup,
smoothing=smoothing_coef,
exponent=exponent,
exclude_alleles=[allele])
synthetic_peptide_set = set(synthetic_values.keys())
# set of peptides for which we have both true and synthetic
# affinity values
combined_peptide_set = synthetic_peptide_set.intersection(
test_peptide_set)
combined_peptide_list = list(sorted(combined_peptide_set)) yield (
allele,
dataset,
fold_number,
this_allele_similarities,
test_peptide_set
)
if allele.startswith("HLA"):
break
# print("-- allele = %s, n_actual = %d, n_synthetic = %d, intersection = %d" % (
# allele,
# len(dataset),
# len(synthetic_peptide_set),
# len(combined_peptide_list)))
if len(combined_peptide_list) < 2: def evaluate_synthetic_data(
continue allele_to_peptide_to_affinity,
smoothing_coef,
exponent,
max_ic50,
n_folds=4):
"""
Use cross-validation over entries of each allele to determine the predictive
accuracy of data synthesis on known affinities.
"""
synthetic_affinity_list = [ peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
synthetic_values[peptide] allele_to_peptide_to_affinity)
for peptide in combined_peptide_list
]
true_affinity_list = [ taus = defaultdict(list)
dataset[peptide] f1_scores = defaultdict(list)
for peptide in combined_peptide_list aucs = defaultdict(list)
]
assert len(true_affinity_list) == len(synthetic_affinity_list) for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \
if all(x == true_affinity_list[0] for x in true_affinity_list): generate_cross_validation_datasets(
continue allele_to_peptide_to_affinity,
n_folds=n_folds):
tau, _ = stats.kendalltau( # create a peptide -> list[(allele, affinity, weight)] dictionary
synthetic_affinity_list, # restricted only to the peptides for which we want to test accuracy
true_affinity_list) test_reverse_lookup = {
assert not np.isnan(tau) peptide: triplets
print("==> %s (CV fold %d/%d) tau = %f" % ( for (peptide, triplets) in peptide_to_affinities.items()
allele, if peptide in fold_test_peptides
fold_number + 1, }
n_folds,
tau)) synthetic_values = synthesize_affinities_for_single_allele(
cv_taus.append(tau) similarities=fold_sims,
peptide_to_affinities=test_reverse_lookup,
true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list)) smoothing=smoothing_coef,
predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list)) exponent=exponent,
exclude_alleles=[allele])
true_binding_label = true_ic50s <= 500
if true_binding_label.all() or not true_binding_label.any(): synthetic_peptide_set = set(synthetic_values.keys())
# can't compute AUC or F1 without both negative and positive cases # set of peptides for which we have both true and synthetic
continue # affinity values
auc = sklearn.metrics.roc_auc_score( combined_peptide_set = synthetic_peptide_set.intersection(
true_binding_label, fold_test_peptides)
synthetic_affinity_list)
print("==> %s (CV fold %d/%d) AUC = %f" % ( combined_peptide_list = list(sorted(combined_peptide_set))
allele,
fold_number + 1, if len(combined_peptide_list) < 2:
n_folds, print("Too few peptides in combined set %s for fold %d/%d of %s" % (
auc)) combined_peptide_list,
cv_aucs.append(auc) fold_num + 1,
predicted_binding_label = predicted_ic50s <= 500
if predicted_binding_label.all() or not predicted_binding_label.any():
# can't compute F1 without both positive and negative predictions
continue
f1_score = sklearn.metrics.f1_score(
true_binding_label,
predicted_binding_label)
print("==> %s (CV fold %d/%d) F1 = %f" % (
allele,
fold_number + 1,
n_folds, n_folds,
f1_score)) allele))
cv_f1_scores.append(f1_score) continue
if len(cv_taus) > 0:
taus.append(np.mean(cv_taus)) synthetic_affinity_list = [
if len(cv_aucs) > 0: synthetic_values[peptide]
aucs.append(np.mean(cv_aucs)) for peptide in combined_peptide_list
if len(f1_scores) > 0: ]
f1_scores.append(np.mean(f1_scores))
true_affinity_list = [
allele_to_peptide_to_affinity[allele][peptide]
for peptide in combined_peptide_list
]
assert len(true_affinity_list) == len(synthetic_affinity_list)
if all(x == true_affinity_list[0] for x in true_affinity_list):
continue
tau, _ = stats.kendalltau(
synthetic_affinity_list,
true_affinity_list)
assert not np.isnan(tau)
print("==> %s (CV fold %d/%d) tau = %f" % (
allele,
fold_num + 1,
n_folds,
tau))
taus[allele].append(tau)
true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
true_binding_label = true_ic50s <= 500
if true_binding_label.all() or not true_binding_label.any():
# can't compute AUC or F1 without both negative and positive cases
continue
auc = sklearn.metrics.roc_auc_score(
true_binding_label,
synthetic_affinity_list)
print("==> %s (CV fold %d/%d) AUC = %f" % (
allele,
fold_num + 1,
n_folds,
auc))
aucs[allele].append(auc)
predicted_binding_label = predicted_ic50s <= 500
if predicted_binding_label.all() or not predicted_binding_label.any():
# can't compute F1 without both positive and negative predictions
continue
f1_score = sklearn.metrics.f1_score(
true_binding_label,
predicted_binding_label)
print("==> %s (CV fold %d/%d) F1 = %f" % (
allele,
fold_num + 1,
n_folds,
f1_score))
f1_scores[allele].append(f1_score)
return taus, aucs, f1_scores return taus, aucs, f1_scores
...@@ -245,9 +264,15 @@ if __name__ == "__main__": ...@@ -245,9 +264,15 @@ if __name__ == "__main__":
smoothing_coef=smoothing_coef, smoothing_coef=smoothing_coef,
exponent=exponent, exponent=exponent,
max_ic50=args.max_ic50) max_ic50=args.max_ic50)
median_tau = np.median(taus) allele_keys = list(sorted(taus.keys()))
median_f1 = np.median(f1_scores) tau_values = [taus[allele] for allele in allele_keys]
median_auc = np.median(aucs) auc_values = [aucs.get(allele, 0.5) for allele in allele_keys]
f1_score_values = [
f1_scores.get(allele, 0.0) for allele in allele_keys
]
median_tau = np.median(tau_values)
median_f1 = np.median(f1_score_values)
median_auc = np.median(auc_values)
print( print(
"Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % ( "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
exponent, exponent,
......
...@@ -143,6 +143,8 @@ parser.add_argument( ...@@ -143,6 +143,8 @@ parser.add_argument(
help="Comma separated list of optimization methods") help="Comma separated list of optimization methods")
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
configs = generate_all_model_configs( configs = generate_all_model_configs(
......
...@@ -159,53 +159,6 @@ def data_augmentation( ...@@ -159,53 +159,6 @@ def data_augmentation(
return aucs, f1s, n_originals return aucs, f1s, n_originals
def evaluate_model(
X,
Y,
weights_synth,
weights_actual,
dropout,
embedding_dim_size,
hidden_layer_size):
model = mhcflurry.feedforward.make_embedding_network(
peptide_length=9,
embedding_input_dim=20,
embedding_output_dim=4,
layer_sizes=[4],
activation="tanh",
init="lecun_uniform",
loss="mse",
output_activation="sigmoid",
dropout_probability=0.0,
optimizer=None,
learning_rate=0.001)
total_synth_weights = weights_synth.sum()
total_actual_weights = weights_actual.sum()
for epoch in range(args.training_epochs):
# weights for synthetic points can be shrunk as:
# ~ 1 / (1+epoch)**2
# or
# 2.0 ** -epoch
decay_factor = 2.0 ** -epoch
# if the contribution of synthetic samples is less than a
# thousandth of the actual data, then stop using it
synth_contribution = total_synth_weights * decay_factor
if synth_contribution < total_actual_weights / 1000:
print("Epoch %d, using only actual data" % (epoch + 1,))
model.fit(
X_actual,
Y_actual,
sample_weight=weights_actual,
nb_epoch=1)
else:
print("Epoch %d, synth decay factor = %f" % (
epoch + 1, decay_factor))
weights[n_actual_samples:] = weights_synth * decay_factor
model.fit(X, Y, sample_weight=weights, nb_epoch=1)
Y_pred = model.predict(X_actual)
print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean())
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
print(args) print(args)
......
...@@ -17,6 +17,7 @@ from collections import OrderedDict ...@@ -17,6 +17,7 @@ from collections import OrderedDict
import numpy as np import numpy as np
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
import mhcflurry
from mhcflurry.common import normalize_allele_name from mhcflurry.common import normalize_allele_name
from mhcflurry.data_helpers import indices_to_hotshot_encoding from mhcflurry.data_helpers import indices_to_hotshot_encoding
......
import pandas as pd # Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
from collections import OrderedDict
import pandas as pd
from .paths import CLASS1_MODEL_DIRECTORY from .paths import CLASS1_MODEL_DIRECTORY
from .mhc1_binding_predictor import Mhc1BindingPredictor from .mhc1_binding_predictor import Mhc1BindingPredictor
def predict(alleles, peptides): def predict(alleles, peptides):
allele_dataframes = [] allele_dataframes = OrderedDict([])
for allele in alleles: for allele in alleles:
model = Mhc1BindingPredictor(allele=allele) model = Mhc1BindingPredictor(allele=allele)
df = model.predict_peptides(peptides) result_dictionary = model.predict_peptides(peptides)
allele_dataframes.append(df) allele_dataframes.append(df)
return pd.concat(allele_dataframes) return pd.concat(allele_dataframes)
def supported_alleles():
alleles = []
for filename in os.listdir(CLASS1_MODEL_DIRECTORY):
allele = filename.replace(".hdf", "")
if len(allele) < 5:
# skipping serotype names like A2 or B7
continue
allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:])
alleles.append(allele)
return alleles
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment