Commit 6d95f501 authored by Alex Rubinsteyn

lots of fixes to multi-length peptide code and evaluation of synthetic data

parent 07cba541
@@ -29,6 +29,7 @@ import argparse
 import mhcflurry
 from scipy import stats
 import numpy as np
+import sklearn.metrics

 from common import curry_dictionary
 from dataset_paths import PETERS2009_CSV_PATH
@@ -64,74 +65,79 @@ parser.add_argument(
 parser.add_argument(
     "--smoothing-coefs",
-    default=[10.0 ** -power for power in np.arange(0, 5.0, 0.25)],
+    default=[0.1, 0.025, 0.05, 0.01, 0.0025, 0.005, 0.001, 0.0005, 0.0001],
     type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Smoothing value used for peptides with low weight across alleles")

 parser.add_argument(
-    "--similarity-exponent",
-    default=2.0,
-    type=float,
+    "--similarity-exponents",
+    default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")


-def evaluate_smoothing_coef(
+def evaluate_synthetic_data(
         true_data,
         curried_allele_similarities,
-        smoothing_coef):
+        smoothing_coef,
+        exponent,
+        max_ic50):
     taus = []
+    f1_scores = []
+    aucs = []
     peptide_to_affinities = create_reverse_peptide_affinity_lookup_dict(
         true_data)
     for allele, dataset in true_data.items():
-        allele_similarities = curried_allele_similarities[allele]
-        true_data_peptide_set = set(dataset.peptides)
-        true_data_peptide_list = list(dataset.peptides)
+        this_allele_similarities = curried_allele_similarities[allele]
+        this_allele_peptides = set(dataset.peptides)
         # create a peptide -> (allele, affinity, weight) dictionary restricted
         # only to the peptides for which we have data for this allele
         restricted_reverse_lookup = {
             peptide: triplet
             for (peptide, triplet) in peptide_to_affinities.items()
-            if peptide in true_data_peptide_set
+            if peptide in this_allele_peptides
         }
         synthetic_values = synthesize_affinities_for_single_allele(
-            similarities=allele_similarities,
+            similarities=this_allele_similarities,
             peptide_to_affinities=restricted_reverse_lookup,
             smoothing=smoothing_coef,
-            exponent=2.0,
+            exponent=exponent,
             exclude_alleles=[allele])
         synthetic_peptide_set = set(synthetic_values.keys())
-        # ordered list of peptides for which we have both true and synthetic
-        # affinity values
-        combined_peptide_list = [
+        # set of peptides for which we have both true and synthetic
+        # affinity values and for which the "true" values were not derived
+        # from elongating or shortening the sequence of another sample
+        combined_peptide_set = {
             peptide
-            for peptide in true_data_peptide_list
-            if peptide in synthetic_peptide_set
-        ]
+            for (peptide, original_peptide)
+            in zip(dataset.peptides, dataset.original_peptides)
+            if peptide in synthetic_peptide_set and peptide == original_peptide
+        }
+        combined_peptide_list = list(sorted(combined_peptide_set))
         if len(combined_peptide_list) < 2:
+            print(
+                "-- Skipping evaluation of %s due to insufficient data" % (
+                    allele,))
             continue
         synthetic_affinity_list = [
             synthetic_values[peptide]
             for peptide in combined_peptide_list
         ]
+        assert len(dataset.peptides) == len(dataset.Y), \
+            "Mismatch between # of peptides %d and # of outputs %d" % (
+                len(dataset.peptides), len(dataset.Y))
         true_affinity_dict = {
             peptide: yi
             for (peptide, yi)
             in zip(dataset.peptides, dataset.Y)
         }
         true_affinity_list = [
             true_affinity_dict[peptide]
             for peptide in combined_peptide_list
         ]
         assert len(true_affinity_list) == len(synthetic_affinity_list)
         if all(x == true_affinity_list[0] for x in true_affinity_list):
+            print(
+                "-- can't compute Kendall's tau for %s, all affinities same" % (
+                    allele,))
             continue

         tau, _ = stats.kendalltau(
@@ -139,7 +145,27 @@ def evaluate_smoothing_coef(
             true_affinity_list)
         assert not np.isnan(tau)
         taus.append(tau)
-    return np.median(taus)
+
+        true_ic50s = max_ic50 ** np.array(true_affinity_list)
+        predicted_ic50s = max_ic50 ** np.array(synthetic_affinity_list)
+        true_binding_label = true_ic50s <= 500
+        if true_binding_label.all() or not true_binding_label.any():
+            # can't compute AUC or F1 without both negative and positive cases
+            continue
+        auc = sklearn.metrics.roc_auc_score(
+            true_binding_label,
+            synthetic_affinity_list)
+        aucs.append(auc)
+        predicted_binding_label = predicted_ic50s <= 500
+        if predicted_binding_label.all() or not predicted_binding_label.any():
+            # can't compute F1 without both positive and negative predictions
+            continue
+        f1_score = sklearn.metrics.f1_score(
+            true_binding_label,
+            predicted_binding_label)
+        f1_scores.append(f1_score)
+    return taus, aucs, f1_scores


 def create_curried_similarity_matrix(allele_to_peptide_to_affinity, min_weight=2.0):
@@ -177,21 +203,40 @@ if __name__ == "__main__":
         for (allele, dataset)
         in allele_datasets.items()
     }
-    curried_sims_dict = create_curried_similarity_matrix(allele_to_peptide_to_affinity)
+    curried_sims_dict = create_curried_similarity_matrix(
+        allele_to_peptide_to_affinity)
     print("Generated similarities between %d alleles" % len(curried_sims_dict))
     results = {}
     for smoothing_coef in args.smoothing_coefs:
-        median_tau = evaluate_smoothing_coef(
-            true_data=allele_datasets,
-            curried_allele_similarities=curried_sims_dict,
-            smoothing_coef=smoothing_coef)
-        print("Coef=%f, median Kendall tau=%f" % (
-            smoothing_coef,
-            median_tau))
-        results[smoothing_coef] = median_tau
+        for exponent in args.similarity_exponents:
+            taus, aucs, f1_scores = evaluate_synthetic_data(
+                true_data=allele_datasets,
+                curried_allele_similarities=curried_sims_dict,
+                smoothing_coef=smoothing_coef,
+                exponent=exponent,
+                max_ic50=args.max_ic50)
+            median_tau = np.median(taus)
+            median_f1 = np.median(f1_scores)
+            median_auc = np.median(aucs)
+            print(
+                "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
+                    exponent,
+                    smoothing_coef,
+                    median_tau,
+                    median_auc,
+                    median_f1))
+            scores = (median_tau, median_auc, median_f1)
+            results[(exponent, smoothing_coef)] = scores
     print("===")
-    (best_coef, best_tau) = max(results.items(), key=lambda x: x[1])
-    print("Best coef = %f (tau = %f)" % (best_coef, best_tau))
+    ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max(
+        results.items(),
+        key=lambda x: x[1][0] * x[1][1] * x[1][2])
+    print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
+        best_exponent,
+        best_coef,
+        median_tau,
+        median_auc,
+        median_f1))
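Note on the evaluation above: the driver now grid-searches every (similarity exponent, smoothing coefficient) pair and keeps the pair maximizing the product of the median Kendall tau, ROC AUC, and F1 across alleles. As a rough, self-contained sketch of those three metrics on the IC50 scale (score_predictions is a hypothetical helper, not part of the commit; it assumes nM units and the 500 nM binder cutoff used above, and orients the AUC score so that stronger binders score higher, whereas the commit passes the regression outputs directly):

import numpy as np
import sklearn.metrics
from scipy import stats

def score_predictions(true_ic50, predicted_ic50, threshold=500.0):
    true_ic50 = np.asarray(true_ic50, dtype=float)
    predicted_ic50 = np.asarray(predicted_ic50, dtype=float)
    # rank agreement between predicted and measured affinities
    tau, _ = stats.kendalltau(predicted_ic50, true_ic50)
    true_label = true_ic50 <= threshold          # binder iff IC50 <= 500 nM
    predicted_label = predicted_ic50 <= threshold
    # negate IC50 so that stronger binders receive higher scores
    auc = sklearn.metrics.roc_auc_score(true_label, -predicted_ic50)
    f1 = sklearn.metrics.f1_score(true_label, predicted_label)
    return tau, auc, f1

# perfect ranking, one borderline peptide misclassified
print(score_predictions(
    true_ic50=[50.0, 400.0, 20000.0],
    predicted_ic50=[80.0, 600.0, 15000.0]))
# -> (1.0, 1.0, 0.666...)

As in the diff, both binder classes must be present for AUC and F1 to be defined, which is why the commit skips the degenerate alleles with a continue.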
@@ -93,6 +93,8 @@ def synthesize_affinities_for_single_allele(
         total = 0.0
         denom = 0.0
         for (allele, y, sample_weight) in affinities:
+            if allele in exclude_alleles:
+                continue
             sim = similarities.get(allele, 0)
             if sim == 0:
                 continue
...
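The guard added above makes exclude_alleles effective inside the accumulation loop, so during leave-one-allele-out evaluation the target allele's own measurements cannot leak into its synthetic affinities. A minimal sketch of the surrounding computation as the --similarity-exponents help text describes it (y_ip * sim(i,j) ** exponent, with a smoothing term for poorly supported peptides); the function name and the placement of the smoothing term in the denominator are assumptions, not the repo's exact code:

def synthesize_affinity(
        affinities,       # [(allele, y, sample_weight), ...] for one peptide
        similarities,     # allele -> similarity to the target allele
        exclude_alleles=(),
        exponent=2.0,
        smoothing=0.005):
    total = 0.0
    denom = 0.0
    for (allele, y, sample_weight) in affinities:
        if allele in exclude_alleles:
            continue  # the guard above: skip the held-out allele's own data
        sim = similarities.get(allele, 0)
        if sim == 0:
            continue
        combined_weight = sim ** exponent * sample_weight
        total += combined_weight * y
        denom += combined_weight
    # smoothing shrinks estimates built on little total weight toward 0
    return total / (denom + smoothing)

# leave-one-allele-out usage: the target allele's measurement is skipped
print(synthesize_affinity(
    affinities=[
        ("HLA-A0201", 0.9, 1.0),   # held-out target allele
        ("HLA-A0206", 0.6, 1.0),
        ("HLA-A6802", 0.5, 0.5),
    ],
    similarities={"HLA-A0201": 1.0, "HLA-A0206": 0.8, "HLA-A6802": 0.4},
    exclude_alleles=["HLA-A0201"]))

Raising the exponent makes distant alleles contribute vanishingly little, which is exactly the knob the new --similarity-exponents grid explores.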
@@ -24,9 +24,12 @@ import numpy as np
 from .common import normalize_allele_name
 from .peptide_encoding import (
-    fixed_length_index_encoding,
+    index_encoding,
     indices_to_hotshot_encoding,
 )
+from .fixed_length_peptides import (
+    fixed_length_from_many_peptides
+)

 AlleleData = namedtuple(
     "AlleleData",
@@ -266,21 +269,44 @@ def load_allele_datasets(
         # convert from a Pandas column to a list, since that's easier to
         # interact with later
         raw_peptides = list(raw_peptides)
-        # convert numberical values from a Pandas column to arrays
-        ic50 = np.array(group[ic50_column_name])
-        Y = np.array(group["regression_output"])
+        # create dictionaries of outputs from which we can look up values
+        # after peptides have been expanded
+        ic50_dict = {
+            peptide: ic50
+            for (peptide, ic50)
+            in zip(raw_peptides, group[ic50_column_name])
+        }
+        Y_dict = {
+            peptide: y
+            for (peptide, y)
+            in zip(raw_peptides, group["regression_output"])
+        }

-        X_index, original_peptides, counts = fixed_length_index_encoding(
-            peptides=raw_peptides,
-            desired_length=peptide_length)
+        fixed_length_peptides, original_peptides, subsequence_counts = \
+            fixed_length_from_many_peptides(
+                peptides=raw_peptides,
+                desired_length=peptide_length,
+                start_offset_shorten=0,
+                end_offset_shorten=0,
+                start_offset_extend=0,
+                end_offset_extend=0)
+        n_samples = len(fixed_length_peptides)
+        assert n_samples == len(original_peptides), \
+            "Mismatch between # of samples (%d) and # of peptides (%d)" % (
+                n_samples, len(original_peptides))
+        assert n_samples == len(subsequence_counts), \
+            "Mismatch between # of samples (%d) and # of counts (%d)" % (
+                n_samples, len(subsequence_counts))
+        X_index = index_encoding(fixed_length_peptides, peptide_length)
         X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
         assert X_binary.shape[0] == X_index.shape[0], \
             ("Mismatch between number of samples for index encoding (%d)"
              " vs. binary encoding (%d)") % (
                 X_binary.shape[0],
                 X_index.shape[0])
-        n_samples = X_binary.shape[0]

         if flatten_binary_encoding:
             # collapse 3D input into 2D matrix
@@ -288,16 +314,27 @@ def load_allele_datasets(
             X_binary = X_binary.reshape((n_samples, n_binary_features))

         # easier to work with counts when they're an array instead of list
-        counts = np.array(counts)
+        subsequence_counts = np.array(subsequence_counts)
+        Y = np.array([Y_dict[p] for p in original_peptides])
+        ic50 = np.array([ic50_dict[p] for p in original_peptides])
+        assert n_samples == len(Y), \
+            "Mismatch between # peptides %d and # regression outputs %d" % (
+                n_samples, len(Y))
+        assert n_samples == len(ic50), \
+            "Mismatch between # of peptides %d and # IC50 outputs %d" % (
+                n_samples, len(ic50))
         allele_groups[allele] = AlleleData(
             X_index=X_index,
             X_binary=X_binary,
             Y=Y,
             ic50=ic50,
-            peptides=raw_peptides,
+            peptides=fixed_length_peptides,
             original_peptides=original_peptides,
             original_lengths=[len(peptide) for peptide in original_peptides],
-            substring_counts=counts,
-            weights=1.0 / counts)
+            substring_counts=subsequence_counts,
+            weights=1.0 / subsequence_counts)
     return allele_groups
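The substantive change to load_allele_datasets: IC50 and regression outputs used to be read column-wise from the Pandas group, which stops being valid once peptides are expanded to a fixed length, since one original row can yield many samples. Keying the outputs by the original peptide and looking them up through the original_peptides provenance list keeps X, Y, and ic50 aligned sample-for-sample. A toy illustration of the pattern (expand_to_fixed_length is a hypothetical stand-in for fixed_length_from_many_peptides, and per-original expansion counts stand in for subsequence_counts):

import numpy as np

def expand_to_fixed_length(peptide, desired_length):
    # toy: keep exact-length peptides; shorten (desired_length + 1)-mers
    # by deleting each position in turn; ignore other lengths
    if len(peptide) == desired_length:
        return [peptide]
    if len(peptide) == desired_length + 1:
        return [peptide[:i] + peptide[i + 1:] for i in range(len(peptide))]
    return []

raw_peptides = ["ACDEFGHIK", "ACDEFGHIKL"]        # a 9-mer and a 10-mer
Y_dict = {"ACDEFGHIK": 0.9, "ACDEFGHIKL": 0.2}    # keyed by original peptide

fixed_length_peptides, original_peptides = [], []
for p in raw_peptides:
    expanded = expand_to_fixed_length(p, desired_length=9)
    fixed_length_peptides.extend(expanded)
    original_peptides.extend([p] * len(expanded))  # provenance per sample

# outputs re-associated after expansion, as in the hunk above
Y = np.array([Y_dict[p] for p in original_peptides])
# one original expanded into k samples contributes weight 1/k per sample
counts = np.array([original_peptides.count(p) for p in original_peptides],
                  dtype=float)
weights = 1.0 / counts
print(len(fixed_length_peptides), Y[:3], weights[:3])
# -> 11 [0.9 0.2 0.2] [1.  0.1 0.1]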
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 import itertools
 import logging
@@ -217,6 +222,7 @@ def fixed_length_from_many_peptides(
                 start_offset=start_offset_shorten,
                 end_offset=end_offset_shorten,
                 alphabet=alphabet)
+
         n_fixed_length = len(fixed_length_peptides)
         all_fixed_length_peptides.extend(fixed_length_peptides)
         original_peptide_sequences.extend([peptide] * n_fixed_length)
...
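The extend([peptide] * n_fixed_length) bookkeeping shown above is what keeps peptides and original_peptides aligned index-for-index in the resulting AlleleData. The evaluation change at the top of this commit relies on that alignment to drop "true" affinities that were themselves derived by shortening or elongating another sample; a toy version of that filter:

peptides = ["ACDEFGHIK", "CDEFGHIKL", "ACDEFGHIL"]
original_peptides = ["ACDEFGHIK", "ACDEFGHIKL", "ACDEFGHIKL"]

# keep only peptides observed directly, not derived from a longer sequence
directly_measured = {
    peptide
    for (peptide, original) in zip(peptides, original_peptides)
    if peptide == original
}
print(directly_measured)  # {'ACDEFGHIK'}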