From 6d95f5018ebc7e985013e3b8aa4f900fb6d3105b Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 30 Nov 2015 14:16:06 -0600
Subject: [PATCH] lots of fixes to multi-length peptide code and evaluation of
 synthetic data

---
 ....py => best-synthetic-data-hyperparams.py} | 119 ++++++++++++------
 experiments/synthetic_data.py                 |   2 +
 mhcflurry/data.py                             |  61 +++++++--
 mhcflurry/fixed_length_peptides.py            |   6 +
 4 files changed, 139 insertions(+), 49 deletions(-)
 rename experiments/{best-synthetic-data-smoothing-coef.py => best-synthetic-data-hyperparams.py} (59%)

diff --git a/experiments/best-synthetic-data-smoothing-coef.py b/experiments/best-synthetic-data-hyperparams.py
similarity index 59%
rename from experiments/best-synthetic-data-smoothing-coef.py
rename to experiments/best-synthetic-data-hyperparams.py
index 9d235305..cd19b22a 100644
--- a/experiments/best-synthetic-data-smoothing-coef.py
+++ b/experiments/best-synthetic-data-hyperparams.py
@@ -29,6 +29,7 @@ import argparse
 import mhcflurry
 from scipy import stats
 import numpy as np
+import sklearn.metrics

 from common import curry_dictionary
 from dataset_paths import PETERS2009_CSV_PATH
@@ -64,74 +65,79 @@ parser.add_argument(

 parser.add_argument(
     "--smoothing-coefs",
-    default=[10.0 ** -power for power in np.arange(0, 5.0, 0.25)],
+    default=[0.1, 0.025, 0.05, 0.01, 0.0025, 0.005, 0.001, 0.0005, 0.0001],
     type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Smoothing value used for peptides with low weight across alleles")

 parser.add_argument(
-    "--similarity-exponent",
-    default=2.0,
-    type=float,
+    "--similarity-exponents",
+    default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")


-def evaluate_smoothing_coef(
+def evaluate_synthetic_data(
         true_data,
         curried_allele_similarities,
-        smoothing_coef):
+        smoothing_coef,
+        exponent,
+        max_ic50):
     taus = []
+    f1_scores = []
+    aucs = []
     peptide_to_affinities = create_reverse_peptide_affinity_lookup_dict(
         true_data)
     for allele, dataset in true_data.items():
-        allele_similarities = curried_allele_similarities[allele]
-        true_data_peptide_set = set(dataset.peptides)
-        true_data_peptide_list = list(dataset.peptides)
+        this_allele_similarities = curried_allele_similarities[allele]
+        this_allele_peptides = set(dataset.peptides)
         # create a peptide -> (allele, affinity, weight) dictionary restricted
         # only to the peptides for which we have data for this allele
         restricted_reverse_lookup = {
             peptide: triplet
             for (peptide, triplet) in peptide_to_affinities.items()
-            if peptide in true_data_peptide_set
+            if peptide in this_allele_peptides
         }
         synthetic_values = synthesize_affinities_for_single_allele(
-            similarities=allele_similarities,
+            similarities=this_allele_similarities,
             peptide_to_affinities=restricted_reverse_lookup,
             smoothing=smoothing_coef,
-            exponent=2.0,
+            exponent=exponent,
             exclude_alleles=[allele])
+
         synthetic_peptide_set = set(synthetic_values.keys())
-        # ordered list of peptides for which we have both true and synthetic
-        # affinity values
-        combined_peptide_list = [
+        # set of peptides for which we have both true and synthetic
+        # affinity values and for which the "true" values were not derived
+        # from elongating or shortening the sequence of another sample
+        combined_peptide_set = {
             peptide
-            for peptide in true_data_peptide_list
-            if peptide in synthetic_peptide_set
-        ]
-
+            for (peptide, original_peptide)
+            in zip(dataset.peptides, dataset.original_peptides)
+            if peptide in synthetic_peptide_set and peptide == original_peptide
+        }
+        combined_peptide_list = list(sorted(combined_peptide_set))
         if len(combined_peptide_list) < 2:
-            print(
-                "-- Skipping evaluation of %s due to insufficient data" % (
-                    allele,))
             continue
         synthetic_affinity_list = [
             synthetic_values[peptide]
             for peptide in combined_peptide_list
         ]
+
+        assert len(dataset.peptides) == len(dataset.Y), \
+            "Mismatch between # of peptides %d and # of outputs %d" % (
+                len(dataset.peptides), len(dataset.Y))
         true_affinity_dict = {
             peptide: yi
             for (peptide, yi)
             in zip(dataset.peptides, dataset.Y)
         }
+
         true_affinity_list = [
             true_affinity_dict[peptide]
             for peptide in combined_peptide_list
         ]
         assert len(true_affinity_list) == len(synthetic_affinity_list)
         if all(x == true_affinity_list[0] for x in true_affinity_list):
-            print(
-                "-- can't compute Kendall's tau for %s, all affinities same" % (
-                    allele,))
             continue

         tau, _ = stats.kendalltau(
@@ -139,7 +145,27 @@ def evaluate_smoothing_coef(
             true_affinity_list)
         assert not np.isnan(tau)
         taus.append(tau)
-    return np.median(taus)
+
+        true_ic50s = max_ic50 ** np.array(true_affinity_list)
+        predicted_ic50s = max_ic50 ** np.array(synthetic_affinity_list)
+        true_binding_label = true_ic50s <= 500
+        if true_binding_label.all() or not true_binding_label.any():
+            # can't compute AUC or F1 without both negative and positive cases
+            continue
+        auc = sklearn.metrics.roc_auc_score(
+            true_binding_label,
+            synthetic_affinity_list)
+        aucs.append(auc)
+
+        predicted_binding_label = predicted_ic50s <= 500
+        if predicted_binding_label.all() or not predicted_binding_label.any():
+            # can't compute F1 without both positive and negative predictions
+            continue
+        f1_score = sklearn.metrics.f1_score(
+            true_binding_label,
+            predicted_binding_label)
+        f1_scores.append(f1_score)
+    return taus, aucs, f1_scores


 def create_curried_similarity_matrix(allele_to_peptide_to_affinity, min_weight=2.0):
@@ -177,21 +203,40 @@ if __name__ == "__main__":
         for (allele, dataset) in allele_datasets.items()
     }

-    curried_sims_dict = create_curried_similarity_matrix(allele_to_peptide_to_affinity)
+    curried_sims_dict = create_curried_similarity_matrix(
+        allele_to_peptide_to_affinity)
     print("Generated similarities between %d alleles" % len(curried_sims_dict))

     results = {}
     for smoothing_coef in args.smoothing_coefs:
-        median_tau = evaluate_smoothing_coef(
-            true_data=allele_datasets,
-            curried_allele_similarities=curried_sims_dict,
-            smoothing_coef=smoothing_coef)
-        print("Coef=%f, median Kendall tau=%f" % (
-            smoothing_coef,
-            median_tau))
-        results[smoothing_coef] = median_tau
+        for exponent in args.similarity_exponents:
+            taus, aucs, f1_scores = evaluate_synthetic_data(
+                true_data=allele_datasets,
+                curried_allele_similarities=curried_sims_dict,
+                smoothing_coef=smoothing_coef,
+                exponent=exponent,
+                max_ic50=args.max_ic50)
+            median_tau = np.median(taus)
+            median_f1 = np.median(f1_scores)
+            median_auc = np.median(aucs)
+            print(
+                "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
+                    exponent,
+                    smoothing_coef,
+                    median_tau,
+                    median_auc,
+                    median_f1))
+            scores = (median_tau, median_auc, median_f1)
+            results[(exponent, smoothing_coef)] = scores

     print("===")
-    (best_coef, best_tau) = max(results.items(), key=lambda x: x[1])
-    print("Best coef = %f (tau = %f)" % (best_coef, best_tau))
+    ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max(
+        results.items(),
+        key=lambda x: x[1][0] * x[1][1] * x[1][2])
+    print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
+        best_exponent,
+        best_coef,
+        median_tau,
+        median_auc,
+        median_f1))
diff --git a/experiments/synthetic_data.py b/experiments/synthetic_data.py
index 54d020fc..dd2b5335 100644
--- a/experiments/synthetic_data.py
+++ b/experiments/synthetic_data.py
@@ -93,6 +93,8 @@ def synthesize_affinities_for_single_allele(
         total = 0.0
         denom = 0.0
         for (allele, y, sample_weight) in affinities:
+            if allele in exclude_alleles:
+                continue
             sim = similarities.get(allele, 0)
             if sim == 0:
                 continue
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index 66ccbe3c..998fadd0 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -24,9 +24,12 @@ import numpy as np

 from .common import normalize_allele_name
 from .peptide_encoding import (
-    fixed_length_index_encoding,
+    index_encoding,
     indices_to_hotshot_encoding,
 )
+from .fixed_length_peptides import (
+    fixed_length_from_many_peptides
+)

 AlleleData = namedtuple(
     "AlleleData",
@@ -266,21 +269,44 @@ def load_allele_datasets(
         # convert from a Pandas column to a list, since that's easier to
         # interact with later
         raw_peptides = list(raw_peptides)
-        # convert numberical values from a Pandas column to arrays
-        ic50 = np.array(group[ic50_column_name])
-        Y = np.array(group["regression_output"])
+        # create dictionaries of outputs from which we can look up values
+        # after peptides have been expanded
+        ic50_dict = {
+            peptide: ic50
+            for (peptide, ic50)
+            in zip(raw_peptides, group[ic50_column_name])
+        }
+        Y_dict = {
+            peptide: y
+            for (peptide, y)
+            in zip(raw_peptides, group["regression_output"])
+        }

-        X_index, original_peptides, counts = fixed_length_index_encoding(
-            peptides=raw_peptides,
-            desired_length=peptide_length)
+        fixed_length_peptides, original_peptides, subsequence_counts = \
+            fixed_length_from_many_peptides(
+                peptides=raw_peptides,
+                desired_length=peptide_length,
+                start_offset_shorten=0,
+                end_offset_shorten=0,
+                start_offset_extend=0,
+                end_offset_extend=0)
+        n_samples = len(fixed_length_peptides)
+        assert n_samples == len(original_peptides), \
+            "Mismatch between # of samples (%d) and # of peptides (%d)" % (
+                n_samples, len(original_peptides))
+        assert n_samples == len(subsequence_counts), \
+            "Mismatch between # of samples (%d) and # of counts (%d)" % (
+                n_samples, len(subsequence_counts))
+
+        X_index = index_encoding(fixed_length_peptides, peptide_length)
         X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
+
         assert X_binary.shape[0] == X_index.shape[0], \
             ("Mismatch between number of samples for index encoding (%d)"
              " vs. binary encoding (%d)") % (
                 X_binary.shape[0],
                 X_index.shape[0])
-        n_samples = X_binary.shape[0]

         if flatten_binary_encoding:
             # collapse 3D input into 2D matrix
@@ -288,16 +314,27 @@ def load_allele_datasets(
             X_binary = X_binary.reshape((n_samples, n_binary_features))

         # easier to work with counts when they're an array instead of list
-        counts = np.array(counts)
+        subsequence_counts = np.array(subsequence_counts)
+
+        Y = np.array([Y_dict[p] for p in original_peptides])
+        ic50 = np.array([ic50_dict[p] for p in original_peptides])
+
+        assert n_samples == len(Y), \
+            "Mismatch between # peptides %d and # regression outputs %d" % (
+                n_samples, len(Y))
+
+        assert n_samples == len(ic50), \
+            "Mismatch between # of peptides %d and # IC50 outputs %d" % (
+                n_samples, len(ic50))

         allele_groups[allele] = AlleleData(
             X_index=X_index,
             X_binary=X_binary,
             Y=Y,
             ic50=ic50,
-            peptides=raw_peptides,
+            peptides=fixed_length_peptides,
             original_peptides=original_peptides,
             original_lengths=[len(peptide) for peptide in original_peptides],
-            substring_counts=counts,
-            weights=1.0 / counts)
+            substring_counts=subsequence_counts,
+            weights=1.0 / subsequence_counts)
     return allele_groups
diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py
index edaa4548..1ab37bdf 100644
--- a/mhcflurry/fixed_length_peptides.py
+++ b/mhcflurry/fixed_length_peptides.py
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 import itertools
 import logging

@@ -217,6 +222,7 @@ def fixed_length_from_many_peptides(
             start_offset=start_offset_shorten,
             end_offset=end_offset_shorten,
             alphabet=alphabet)
+        n_fixed_length = len(fixed_length_peptides)
         all_fixed_length_peptides.extend(fixed_length_peptides)
         original_peptide_sequences.extend([peptide] * n_fixed_length)
-- 
GitLab