Commit bb4701a0 authored by Alex Rubinsteyn

Merge pull request #12 from hammerlab/soft-pretraining

Soft-pretraining and lots of other changes
parents 5a856ce8 d2989115
Showing 3573 additions and 284 deletions
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions for computing pairwise similarities between alleles
"""
import numpy as np
import seaborn
from fancyimpute import NuclearNormMinimization
from common import matrix_to_dictionary, curry_dictionary
def compute_similarities_for_single_allele_from_peptide_overlap(
allele_name,
allele_to_peptide_to_affinity,
allele_name_length=None,
min_weight=1.0):
"""
Parameters
----------
allele_name : str
allele_to_peptide_to_affinity : dict
Dictionary from allele names to a dictionary of (peptide -> affinity)
Returns three dictionaries, mapping alleles to
- similarity
- number of overlapping peptides
- total weight of overlapping affinities
"""
sims = {}
overlaps = {}
weights = {}
dataset_a = allele_to_peptide_to_affinity[allele_name]
peptide_set_a = set(dataset_a.keys())
for other_allele, dataset_b in allele_to_peptide_to_affinity.items():
if allele_name_length and len(other_allele) != allele_name_length:
continue
peptide_set_b = set(dataset_b.keys())
intersection = peptide_set_a.intersection(peptide_set_b)
overlaps[other_allele] = len(intersection)
total = 0.0
weight = 0.0
for peptide in intersection:
ya = dataset_a[peptide]
yb = dataset_b[peptide]
minval = min(ya, yb)
maxval = max(ya, yb)
total += minval
weight += maxval
weights[other_allele] = weight
if weight > min_weight:
sims[other_allele] = total / weight
return sims, overlaps, weights
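# Illustrative sketch (not part of the commit): the similarity above is a
# weighted min/max ratio over shared peptides. The numbers below are made up.
dataset_a = {"SIINFEKL": 0.9, "GILGFVFTL": 0.2}
dataset_b = {"SIINFEKL": 0.8, "GILGFVFTL": 0.6}
shared = set(dataset_a) & set(dataset_b)
total = sum(min(dataset_a[p], dataset_b[p]) for p in shared)   # 0.8 + 0.2 = 1.0
weight = sum(max(dataset_a[p], dataset_b[p]) for p in shared)  # 0.9 + 0.6 = 1.5
similarity = total / weight  # ~0.67; identical affinity profiles would give 1.0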
def compute_partial_similarities_from_peptide_overlap(
allele_to_peptide_to_affinity,
min_weight=1.0,
allele_name_length=None):
"""
Determine similarity between pairs of alleles by examining
affinity values for overlapping peptides. Returns curried dictionaries
mapping allele -> allele -> float, where the final value is one of:
- similarity between alleles
- # overlapping peptides
- sum weight of overlapping affinities
"""
sims = {}
overlaps = {}
weights = {}
for allele_name in allele_to_peptide_to_affinity.keys():
if allele_name_length and len(allele_name) != allele_name_length:
continue
curr_sims, curr_overlaps, curr_weights = \
compute_similarities_for_single_allele_from_peptide_overlap(
allele_name=allele_name,
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
allele_name_length=allele_name_length,
min_weight=min_weight)
sims[allele_name] = curr_sims
overlaps[allele_name] = curr_overlaps
weights[allele_name] = curr_weights
return sims, overlaps, weights
def build_incomplete_similarity_matrix(
allele_to_peptide_to_affinity,
curried_sims_dict):
allele_list = list(sorted(allele_to_peptide_to_affinity.keys()))
allele_order = {
allele_name: i
for (i, allele_name) in enumerate(allele_list)
}
n_alleles = len(allele_list)
sims_incomplete_matrix = np.zeros((n_alleles, n_alleles), dtype=float)
for a, ai in allele_order.items():
for b, bi in allele_order.items():
# if allele pair is missing from similarities dict then
# fill its slot with NaN, indicating missing data
similarity = curried_sims_dict.get(a, {}).get(b, np.nan)
sims_incomplete_matrix[(ai, bi)] = similarity
return allele_list, allele_order, sims_incomplete_matrix
def save_heatmap(matrix, allele_names, filename):
seaborn.set_context("paper")
with seaborn.plotting_context(font_scale=1):
figure = seaborn.plt.figure(figsize=(20, 18))
ax = figure.add_axes()
heatmap = seaborn.heatmap(
data=matrix,
xticklabels=allele_names,
yticklabels=allele_names,
linewidths=0,
annot=False,
ax=ax,
fmt=".2g")
heatmap.set_xticklabels(labels=allele_names, rotation=45)
heatmap.set_yticklabels(labels=allele_names, rotation=45)
figure.savefig(filename)
def fill_in_similarities(
curried_raw_sims_dict,
allele_to_peptide_to_affinity,
raw_sims_heatmap_path=None,
complete_sims_heatmap_path=None,
curried_overlap_weights=None,
scalar_error_tolerance=0.0001):
"""
Given an incomplete dictionary of pairwise allele similarities and
a dictionary of binding data, generate the completed similarities
"""
allele_list, allele_order, sims_matrix = build_incomplete_similarity_matrix(
allele_to_peptide_to_affinity,
curried_sims_dict=curried_raw_sims_dict)
missing = np.isnan(sims_matrix)
if curried_overlap_weights:
error_tolerance = np.ones_like(sims_matrix)
for allele_a, a_dict in curried_overlap_weights.items():
for allele_b, weight in a_dict.items():
i = allele_order[allele_a]
j = allele_order[allele_b]
error_tolerance[i, j] = 2.0 ** -weight
print(
"-- Error tolerance distribution: min %f, max %f, median %f" % (
error_tolerance[~missing].min(),
error_tolerance[~missing].max(),
np.median(error_tolerance[~missing])))
else:
error_tolerance = scalar_error_tolerance
if raw_sims_heatmap_path:
save_heatmap(
sims_matrix,
allele_list,
raw_sims_heatmap_path)
print("Completing %s similarities matrix with %d missing entries" % (
sims_matrix.shape,
missing.sum()))
solver = NuclearNormMinimization(
require_symmetric_solution=True,
min_value=0.0,
max_value=1.0,
error_tolerance=error_tolerance)
sims_complete_matrix = solver.complete(sims_matrix)
if complete_sims_heatmap_path:
save_heatmap(
sims_complete_matrix,
allele_list,
complete_sims_heatmap_path)
complete_sims_dict = curry_dictionary(
matrix_to_dictionary(
sims=sims_complete_matrix,
allele_list=allele_list))
return complete_sims_dict
def save_csv(filename, sims, overlap_counts, overlap_weights):
"""
Save pairwise allele similarities in CSV file.
Assumes the dictionaries sims, overlap_counts, and overlap_weights are
'curried', meaning that their keys are single allele names which map
to nested dictionaries from other allele names to numeric values.
As a type signature, this means the dictionaries are of the form
(allele -> allele -> number)
and not
(allele, allele) -> number
"""
with open(filename, "w") as f:
f.write("allele_A,allele_B,similarity,count,weight\n")
for (a, a_row) in sorted(sims.items()):
a_counts = overlap_counts.get(a, {})
a_weights = overlap_weights.get(a, {})
for (b, similarity) in sorted(a_row.items()):
count = a_counts.get(b, 0)
weight = a_weights.get(b, 0.0)
f.write("%s,%s,%0.4f,%d,%0.4f\n" % (
a,
b,
similarity,
count,
weight))
def compute_allele_similarities(allele_to_peptide_to_affinity, min_weight=0.1):
"""
Compute pairwise allele similarities from binding data and complete the
similarity matrix for allele pairs which lack sufficient overlapping data.
Returns:
- dictionary of allele similarities
- dictionary of # overlapping peptides between alleles
- dictionary of "weight" of overlapping peptides between alleles
"""
assert isinstance(allele_to_peptide_to_affinity, dict), \
"Wrong type, expected dict but got %s" % (
type(allele_to_peptide_to_affinity),)
raw_sims_dict, overlap_counts, overlap_weights = \
compute_partial_similarities_from_peptide_overlap(
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
min_weight=min_weight)
complete_sims_dict = fill_in_similarities(
curried_raw_sims_dict=raw_sims_dict,
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
curried_overlap_weights=overlap_weights)
return complete_sims_dict, overlap_counts, overlap_weights
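# Hypothetical usage sketch (not in the commit): the partial-similarity helpers
# return curried dictionaries of the form allele -> allele -> value. The allele
# names and affinities below are invented for illustration.
toy = {
    "HLA-A0201": {"SIINFEKL": 0.9, "GILGFVFTL": 0.3, "LLFGYPVYV": 0.7},
    "HLA-A0203": {"SIINFEKL": 0.8, "GILGFVFTL": 0.4},
    "HLA-B0702": {"LLFGYPVYV": 0.2},
}
toy_sims, toy_overlaps, toy_weights = \
    compute_partial_similarities_from_peptide_overlap(toy, min_weight=0.1)
print(toy_sims["HLA-A0201"].get("HLA-A0203"))  # weighted min/max similarity
print(toy_overlaps["HLA-A0201"]["HLA-B0702"])  # 1 shared peptide
# Pairs whose overlap weight falls below min_weight are absent from toy_sims;
# compute_allele_similarities fills those gaps via nuclear norm minimization.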
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Find best smoothing coefficient for creating synthetic data. The smoothing
coefficient adds artificial weight toward low binding affinities, which dominate
in cases where no similar allele has values for some peptide.
We're determining "best" as most predictive of already known binding affinities,
averaged across all given alleles.
"""
import argparse
from collections import defaultdict
from mhcflurry.data import load_allele_dicts
from scipy import stats
import numpy as np
import sklearn.metrics
from sklearn.cross_validation import KFold
from dataset_paths import PETERS2009_CSV_PATH
from synthetic_data import (
synthesize_affinities_for_single_allele,
create_reverse_lookup_from_simple_dicts
)
from allele_similarities import compute_allele_similarities
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--smoothing-coefs",
default=[0.05, 0.025, 0.01, 0.005, 0.0025, 0.001, 0.0005, 0.0001, 0],
type=lambda s: [float(si.strip()) for si in s.split(",")],
help="Smoothing value used for peptides with low weight across alleles")
parser.add_argument(
"--similarity-exponents",
default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
type=lambda s: [float(si.strip()) for si in s.split(",")],
help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
def generate_cross_validation_datasets(
allele_to_peptide_to_affinity,
n_folds=4):
for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
peptides = list(dataset.keys())
affinities = list(dataset.values())
n_samples = len(peptides)
print("Generating similarities for folds of %s data (n=%d)" % (
allele,
n_samples))
if n_samples < n_folds:
print("Too few samples (%d) for %d-fold cross-validation" % (
n_samples,
n_folds))
continue
kfold_iterator = enumerate(
KFold(n_samples, n_folds=n_folds, shuffle=True))
for fold_number, (train_indices, test_indices) in kfold_iterator:
train_peptides = [peptides[i] for i in train_indices]
train_affinities = [affinities[i] for i in train_indices]
test_peptide_set = set([peptides[i] for i in test_indices])
# copy the affinity data for all alleles other than this one
fold_affinity_dict = {
allele_key: affinity_dict
for (allele_key, affinity_dict)
in allele_to_peptide_to_affinity.items()
if allele_key != allele
}
# include an affinity dictionary for this allele which
# only uses training data
fold_affinity_dict[allele] = {
train_peptide: train_affinity
for (train_peptide, train_affinity)
in zip(train_peptides, train_affinities)
}
allele_similarities, overlap_counts, overlap_weights = \
compute_allele_similarities(
allele_to_peptide_to_affinity=fold_affinity_dict,
min_weight=0.1)
this_allele_similarities = allele_similarities[allele]
yield (
allele,
dataset,
fold_number,
this_allele_similarities,
test_peptide_set
)
def evaluate_synthetic_data(
allele_to_peptide_to_affinity,
smoothing_coef,
exponent,
max_ic50,
n_folds=4):
"""
Use cross-validation over entries of each allele to determine the predictive
accuracy of data synthesis on known affinities.
"""
peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
allele_to_peptide_to_affinity)
taus = defaultdict(list)
f1_scores = defaultdict(list)
aucs = defaultdict(list)
for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \
generate_cross_validation_datasets(
allele_to_peptide_to_affinity,
n_folds=n_folds):
# create a peptide -> list[(allele, affinity, weight)] dictionary
# restricted only to the peptides for which we want to test accuracy
test_reverse_lookup = {
peptide: triplets
for (peptide, triplets) in peptide_to_affinities.items()
if peptide in fold_test_peptides
}
synthetic_values = synthesize_affinities_for_single_allele(
similarities=fold_sims,
peptide_to_affinities=test_reverse_lookup,
smoothing=smoothing_coef,
exponent=exponent,
exclude_alleles=[allele])
synthetic_peptide_set = set(synthetic_values.keys())
# set of peptides for which we have both true and synthetic
# affinity values
combined_peptide_set = synthetic_peptide_set.intersection(
fold_test_peptides)
combined_peptide_list = list(sorted(combined_peptide_set))
if len(combined_peptide_list) < 2:
print("Too few peptides in combined set %s for fold %d/%d of %s" % (
combined_peptide_list,
fold_num + 1,
n_folds,
allele))
continue
synthetic_affinity_list = [
synthetic_values[peptide]
for peptide in combined_peptide_list
]
true_affinity_list = [
allele_to_peptide_to_affinity[allele][peptide]
for peptide in combined_peptide_list
]
assert len(true_affinity_list) == len(synthetic_affinity_list)
if all(x == true_affinity_list[0] for x in true_affinity_list):
continue
tau, _ = stats.kendalltau(
synthetic_affinity_list,
true_affinity_list)
assert not np.isnan(tau)
print("==> %s (CV fold %d/%d) tau = %f" % (
allele,
fold_num + 1,
n_folds,
tau))
taus[allele].append(tau)
true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
true_binding_label = true_ic50s <= 500
if true_binding_label.all() or not true_binding_label.any():
# can't compute AUC or F1 without both negative and positive cases
continue
auc = sklearn.metrics.roc_auc_score(
true_binding_label,
synthetic_affinity_list)
print("==> %s (CV fold %d/%d) AUC = %f" % (
allele,
fold_num + 1,
n_folds,
auc))
aucs[allele].append(auc)
predicted_binding_label = predicted_ic50s <= 500
if predicted_binding_label.all() or not predicted_binding_label.any():
# can't compute F1 without both positive and negative predictions
continue
f1_score = sklearn.metrics.f1_score(
true_binding_label,
predicted_binding_label)
print("==> %s (CV fold %d/%d) F1 = %f" % (
allele,
fold_num + 1,
n_folds,
f1_score))
f1_scores[allele].append(f1_score)
return taus, aucs, f1_scores
if __name__ == "__main__":
args = parser.parse_args()
print(args)
allele_to_peptide_to_affinity = load_allele_dicts(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
regression_output=True)
n_binding_values = sum(
len(allele_dict)
for allele_dict in
allele_to_peptide_to_affinity.values()
)
print("Loaded %d binding values for %d alleles" % (
n_binding_values, len(allele_to_peptide_to_affinity)))
results = {}
for smoothing_coef in args.smoothing_coefs:
for exponent in args.similarity_exponents:
taus, aucs, f1_scores = evaluate_synthetic_data(
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
smoothing_coef=smoothing_coef,
exponent=exponent,
max_ic50=args.max_ic50)
allele_keys = list(sorted(taus.keys()))
tau_allele_means = [
np.mean(taus[allele])
for allele in allele_keys
if taus[allele]
]
auc_allele_means = [
np.mean(aucs[allele])
for allele in allele_keys
if aucs[allele]
]
f1_score_means = [
np.mean(f1_scores[allele])
for allele in allele_keys
if f1_scores[allele]
]
median_tau = np.median(tau_allele_means)
median_f1 = np.median(f1_score_means)
median_auc = np.median(auc_allele_means)
print("\n\n::::::\n")
print(
"Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
exponent,
smoothing_coef,
median_tau,
median_auc,
median_f1))
print("\n^^^^^^\n")
scores = (median_tau, median_auc, median_f1)
results[(exponent, smoothing_coef)] = scores
print("===")
def combine_item_scores(x):
return x[1][0] * x[1][1] * x[1][2]
for ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) in \
sorted(results.items(), key=combine_item_scores):
print("-- exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
best_exponent,
best_coef,
median_tau,
median_auc,
median_f1))
((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max(
results.items(),
key=combine_item_scores)
print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
best_exponent,
best_coef,
median_tau,
median_auc,
median_f1))
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import pandas as pd
def curry_dictionary(key_pair_dict, default_value=0.0):
"""
Transform a dictionary keyed by (key_a, key_b) pairs into a nested mapping
key_a -> key_b -> float
"""
result = defaultdict(dict)
for (a, b), value in key_pair_dict.items():
result[a][b] = value
return result
def uncurry_dictionary(curried_dict):
"""
Transform dictionary from (key_a -> key_b -> float) to
(key_a, key_b) -> float
"""
result = {}
for a, a_dict in curried_dict.items():
for b, value in a_dict.items():
result[(a, b)] = value
return result
def matrix_to_dictionary(sims, allele_list):
sims_dict = {}
for i in range(sims.shape[0]):
a = allele_list[i]
for j in range(sims.shape[1]):
b = allele_list[j]
sims_dict[a, b] = sims[i, j]
return sims_dict
def load_csv_binding_data_as_dict(
csv_path,
mhc_column_name="mhc",
peptide_column_name="sequence",
ic50_column_name="ic50"):
"""
Given a path to a CSV file containing peptide-MHC binding data,
load it as a dictionary mapping each allele to a dictionary of peptide -> IC50
"""
df = pd.read_csv(csv_path)
print("-- Read %d rows from %s" % (len(df), csv_path))
return {
allele: {
peptide: ic50
for (peptide, ic50)
in zip(group[peptide_column_name], group[ic50_column_name])
}
for allele, group in df.groupby(mhc_column_name)
}
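# Illustrative round trip (not part of the commit): the same similarity values
# in pair-keyed form and in curried (nested) form.
pair_keyed = {("HLA-A0201", "HLA-A0203"): 0.8, ("HLA-A0203", "HLA-A0201"): 0.8}
curried = curry_dictionary(pair_keyed)
assert curried["HLA-A0201"]["HLA-A0203"] == 0.8
assert uncurry_dictionary(curried) == pair_keyed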
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import fancyimpute
import numpy as np
import mhcflurry
from dataset_paths import PETERS2009_CSV_PATH
from allele_similarities import (
compute_partial_similarities_from_peptide_overlap,
fill_in_similarities,
save_csv
)
parser = argparse.ArgumentParser()
@@ -13,204 +32,96 @@ parser.add_argument(
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--fill-missing-values",
action="store_true",
default=False)
"--min-overlap-weight",
default=0.05,
help="Minimum overlap weight between pair of alleles",
type=float)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--min-overlap-weight",
default=2.0,
help="Minimum overlap weight between pair of alleles")
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--max-ic50",
default=50000.0)
"--raw-similarities-output-path",
help="CSV file which contains incomplete allele similarities")
parser.add_argument(
"--output-csv",
required=True)
def binary_encode(X, n_indices=20):
n_cols = X.shape[1]
X_encode = np.zeros( (len(X), n_indices * n_cols), dtype=float)
for i in range(len(X)):
for col_idx in range(n_cols):
X_encode[i, col_idx*n_indices + X[i, col_idx]] = True
return X_encode
def prepare_incomplete_matrix(datasets_dict, prefix=None, allele_name_length=5):
if allele_name_length:
datasets_dict = {k:d for (k,d) in datasets_dict.items() if len(k) == allele_name_length}
if prefix:
datasets_dict = {k:d for (k,d) in datasets_dict.items() if k.startswith(prefix)}
if len(datasets_dict) == 0:
raise ValueError("No alleles matched criteria")
all_peptides = set([])
allele_list = []
for k,d in sorted(datasets_dict.items()):
print(k, len(d.peptides))
allele_list.append(k)
for p in d.peptides:
all_peptides.add(p)
peptide_list = list(sorted(all_peptides))
allele_order = {a:i for (i,a) in enumerate(allele_list)}
peptide_order = {p:i for (i,p) in enumerate(peptide_list)}
n_alleles = len(allele_order)
n_peptides = len(peptide_order)
incomplete_affinity_matrix = np.ones((n_alleles, n_peptides), dtype=float) * np.nan
for k,d in datasets_dict.items():
if k not in allele_order:
continue
i = allele_order[k]
for p,y in zip(d.peptides, d.Y):
j = peptide_order[p]
incomplete_affinity_matrix[i,j] = y
print("Total # alleles = %d, peptides = %d" % (n_alleles, n_peptides))
return incomplete_affinity_matrix, allele_order, peptide_order
def get_extra_data(allele, train_datasets, expanded_predictions):
original_dataset = train_datasets[allele]
original_peptides = set(original_dataset.peptides)
expanded_allele_affinities = expanded_predictions[allele]
extra_affinities = {k: v for (k, v) in expanded_allele_affinities.items() if k not in original_peptides}
extra_peptides = list(extra_affinities.keys())
extra_values = list(extra_affinities.values())
extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9)
extra_Y = np.array(extra_values)
return extra_X, extra_Y
def WHO_KNOWS_WHAT():
smoothing = 0.005
exponent = 2
all_predictions = defaultdict(dict)
for allele, allele_idx in allele_order.items():
for peptide in peptide_order.keys():
predictions = reverse_lookups[peptide]
total_value = 0.0
total_denom = 0.0
for (other_allele, y) in predictions:
key = (allele,other_allele)
if key not in completed_sims_dict:
continue
sim = completed_sims_dict[key]
weight = sim ** exponent
total_value += weight * y
total_denom += weight
if total_denom > 0.0:
all_predictions[allele][peptide] = total_value / (smoothing + total_denom)
def data_augmentation(X, Y, extra_X, extra_Y,
fraction=0.5,
niters=10,
extra_sample_weight=0.1,
nb_epoch=50,
nn=True,
hidden_layer_size=5):
n = len(Y)
aucs = []
f1s = []
n_originals = []
for _ in range(niters):
mask = np.random.rand(n) <= fraction
X_train = X[mask]
X_test = X[~mask]
Y_train = Y[mask]
Y_test = Y[~mask]
test_ic50 = 20000 ** (1-Y_test)
test_label = test_ic50 <= 500
if test_label.all() or not test_label.any():
continue
n_original = mask.sum()
print("Keeping %d original training samples" % n_original)
X_train_combined = np.vstack([X_train, extra_X])
Y_train_combined = np.concatenate([Y_train, extra_Y])
print("Combined shape: %s" % (X_train_combined.shape,))
assert len(X_train_combined) == len(Y_train_combined)
# initialize weights to count synthesized and actual data equally
# but weight on synthesized points will decay across training epochs
weight = np.ones(len(Y_train_combined))
if nn:
model = mhcflurry.feedforward.make_embedding_network(
layer_sizes=[hidden_layer_size],
embedding_output_dim=10,
activation="tanh")
for i in range(nb_epoch):
# decay weight as training progresses
weight[n_original:] = extra_sample_weight if extra_sample_weight is not None else 1.0 / (i+1)**2
model.fit(
X_train_combined,
Y_train_combined,
sample_weight=weight,
shuffle=True,
nb_epoch=1,
verbose=0)
pred = model.predict(X_test)
else:
model = sklearn.linear_model.Ridge(alpha=5)
X_train_combined_binary = binary_encode(X_train_combined)
X_test_binary = binary_encode(X_test)
model.fit(X_train_combined_binary, Y_train_combined, sample_weight=weight)
pred = model.predict(X_test_binary)
pred_ic50 = 20000 ** (1-pred)
pred_label = pred_ic50 <= 500
mse = sklearn.metrics.mean_squared_error(Y_test, pred)
auc = sklearn.metrics.roc_auc_score(test_label, pred)
if pred_label.all() or not pred_label.any():
f1 = 0
else:
f1 = sklearn.metrics.f1_score(test_label, pred_label)
print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1))
n_originals.append(n_original)
aucs.append(auc)
f1s.append(f1)
return aucs, f1s, n_originals
"--complete-similarities-output-path",
help="CSV file which contains allele similarities after matrix completion")
parser.add_argument(
"--raw-heatmap-output-path",
help="PNG file to save heatmap of incomplete similarities matrix")
parser.add_argument(
"--complete-heatmap-output-path",
help="PNG file to save heatmap of complete similarities matrix")
def print_dataset_sizes(allele_groups):
print("---\nDataset Sizes\n---")
for (allele_name, g) in sorted(allele_groups.items()):
print("%s: total=%d, 8mer=%d, 9mer=%d, 10mer=%d, 11mer=%d" % (
allele_name,
len(g),
sum(len(k) == 8 for k in g.keys()),
sum(len(k) == 9 for k in g.keys()),
sum(len(k) == 10 for k in g.keys()),
sum(len(k) == 11 for k in g.keys()),
))
print("---")
if __name__ == "__main__":
args = parser.parse_args()
datasets = mhcflurry.data_helpers.load_data(
print(args)
df, peptide_column_name = mhcflurry.data.load_dataframe(
args.binding_data_csv,
binary_encoding=True,
max_ic50=args.max_ic50)
sims = {}
overlaps = {}
weights = {}
for allele_name_a, da in train_datasets.items():
if len(allele_name_a) != 5:
continue
y_dict_a = {peptide: y for (peptide,y) in zip(da.peptides, da.Y)}
seta = set(da.peptides)
for allele_name_b, db in train_datasets.items():
if len(allele_name_b) != 5:
continue
y_dict_b = {peptide: y for (peptide,y) in zip(db.peptides, db.Y)}
setb = set(db.peptides)
intersection = seta.intersection(setb)
overlaps[a,b] = len(intersection)
total = 0.0
weight = 0.0
for peptide in intersection:
ya = y_dict_a[peptide]
yb = y_dict_b[peptide]
minval = min(ya, yb)
maxval = max(ya, yb)
total += minval
weight += maxval
weights[a,b] = weight
if weight > min_weight:
sims[allele_name_a, allele_name_b] = total / weight
else:
sims[allele_name_a, allele_name_b] = np.nan
if args.fill_missing_values:
sims_array = np.zeros((n_alleles, n_alleles), dtype=float)
for i, a in enumerate(allele_names):
for j, b in enumerate(allele_names):
sims_array[i,j] = sims[(a,b)]
max_ic50=args.max_ic50,
only_human=args.only_human)
allele_groups = {
allele_name: {
row[peptide_column_name]: row["regression_output"]
for (_, row) in group.iterrows()
}
for (allele_name, group) in df.groupby("mhc")
}
print_dataset_sizes(allele_groups)
raw_sims_dict, overlap_counts, overlap_weights = \
compute_partial_similarities_from_peptide_overlap(
allele_groups,
min_weight=args.min_overlap_weight)
if args.raw_similarities_output_path:
save_csv(
args.raw_similarities_output_path,
raw_sims_dict,
overlap_counts,
overlap_weights)
complete_sims_dict = fill_in_similarities(
curried_raw_sims_dict=raw_sims_dict,
allele_to_peptide_to_affinity=allele_groups,
raw_sims_heatmap_path=args.raw_heatmap_output_path,
complete_sims_heatmap_path=args.complete_heatmap_output_path)
n_total_entries = sum(len(row) for row in complete_sims_dict.values())
n_original_entries = sum(len(row) for row in raw_sims_dict.values())
print("-- Added %d/%d allele similarities" % (
n_total_entries - n_original_entries,
n_total_entries
))
if args.complete_similarities_output_path:
save_csv(
args.complete_similarities_output_path,
complete_sims_dict,
overlap_counts,
overlap_weights)
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sklearn.cross_validation import KFold
def generate_cross_validation_datasets(
allele_to_peptide_to_affinity,
n_folds=4):
for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
peptides = list(dataset.keys())
affinities = list(dataset.values())
n_samples = len(peptides)
print("Generating similarities for folds of %s data (n=%d)" % (
allele,
n_samples))
if n_samples < n_folds:
print("Too few samples (%d) for %d-fold cross-validation" % (
n_samples,
n_folds))
continue
kfold_iterator = enumerate(
KFold(n_samples, n_folds=n_folds, shuffle=True))
for fold_number, (train_indices, test_indices) in kfold_iterator:
train_peptides = [peptides[i] for i in train_indices]
train_affinities = [affinities[i] for i in train_indices]
test_peptide_set = set([peptides[i] for i in test_indices])
# copy the affinity data for all alleles other than this one
fold_affinity_dict = {
allele_key: affinity_dict
for (allele_key, affinity_dict)
in allele_to_peptide_to_affinity.items()
if allele_key != allele
}
# include an affinity dictionary for this allele which
# only uses training data
fold_affinity_dict[allele] = {
train_peptide: train_affinity
for (train_peptide, train_affinity)
in zip(train_peptides, train_affinities)
}
allele_similarities, overlap_counts, overlap_weights = \
compute_allele_similarities(
allele_to_peptide_to_affinity=fold_affinity_dict,
min_weight=0.1)
this_allele_similarities = allele_similarities[allele]
yield (
allele,
dataset,
fold_number,
this_allele_similarities,
test_peptide_set
)
\ No newline at end of file
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from fancyimpute import (
SoftImpute,
IterativeSVD,
SimilarityWeightedAveraging,
KNN,
MICE,
BiScaler,
SimpleFill,
)
import numpy as np
import pandas as pd
from dataset_paths import PETERS2009_CSV_PATH
from score_set import ScoreSet
from matrix_completion_helpers import (
load_data,
evaluate_predictions,
stratified_cross_validation
)
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--save-incomplete-affinity-matrix",
default=None,
help="Path to CSV which will contains the incomplete affinity matrix")
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--output-file",
default="matrix-completion-accuracy-results.csv")
parser.add_argument(
"--normalize-columns",
default=False,
action="store_true",
help="Center and rescale columns of affinity matrix")
parser.add_argument(
"--normalize-rows",
default=False,
action="store_true",
help="Center and rescale rows of affinity matrix")
parser.add_argument(
"--min-observations-per-peptide",
type=int,
default=2,
help="Drop peptide entries with fewer than this number of measured affinities")
parser.add_argument(
"--n-folds",
default=10,
type=int,
help="Number of cross-validation folds")
parser.add_argument(
"--verbose",
default=False,
action="store_true")
def create_imputation_methods(
verbose=False,
clip_imputed_values=False,
knn_print_interval=20,
knn_params=[1, 3, 5],
softimpute_params=[1, 5, 10],
svd_params=[5, 10, 20]):
min_value = 0 if clip_imputed_values else None
max_value = 1 if clip_imputed_values else None
result_dict = {
"meanFill": SimpleFill("mean"),
"zeroFill": SimpleFill("zero"),
"mice": MICE(
n_burn_in=5,
n_imputations=25,
min_value=min_value,
max_value=max_value,
verbose=verbose),
"similarityWeightedAveraging": SimilarityWeightedAveraging(
orientation="columns",
verbose=verbose),
}
for threshold in softimpute_params:
result_dict["softImpute-%d" % threshold] = SoftImpute(
threshold,
verbose=verbose,
min_value=min_value,
max_value=max_value)
for rank in svd_params:
result_dict["svdImpute-%d" % rank] = IterativeSVD(
rank,
verbose=verbose,
min_value=min_value,
max_value=max_value)
for k in knn_params:
result_dict["knnImpute-%d" % k] = KNN(
k,
orientation="columns",
verbose=verbose,
print_interval=knn_print_interval)
return result_dict
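# Hypothetical smoke test (not in the commit) of the interface this script
# assumes: each imputation method exposes .complete() on a matrix whose missing
# entries are NaN (per the fancyimpute version this repo was written against).
toy_matrix = np.array([
    [0.2, 0.8, np.nan],
    [0.3, np.nan, 0.5],
    [np.nan, 0.7, 0.6],
])
mean_filled = create_imputation_methods(verbose=False)["meanFill"].complete(toy_matrix)
assert np.isfinite(mean_filled).all()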
if __name__ == "__main__":
args = parser.parse_args()
print(args)
imputation_methods = create_imputation_methods(
verbose=args.verbose,
clip_imputed_values=not (args.normalize_rows or args.normalize_columns),
)
print("Imputation methods: %s" % imputation_methods)
X, peptide_list, allele_list = load_data(
binding_data_csv=args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
min_observations_per_allele=args.n_folds,
min_observations_per_peptide=args.min_observations_per_peptide)
observed_mask = np.isfinite(X)
print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % (
X.shape,
observed_mask.sum(),
X.size,
100.0 * observed_mask.sum() / X.size))
if args.save_incomplete_affinity_matrix:
print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
scores = ScoreSet()
kfold = stratified_cross_validation(
X=X,
observed_mask=observed_mask,
n_folds=args.n_folds)
for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold):
X_fold_reduced = X_fold[ok_mesh]
biscaler = BiScaler(
scale_rows=args.normalize_rows,
center_rows=args.normalize_rows,
scale_columns=args.normalize_columns,
center_columns=args.normalize_columns)
X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
for (method_name, solver) in sorted(imputation_methods.items()):
print("CV fold %d/%d, running %s" % (
fold_idx + 1,
args.n_folds,
method_name))
X_completed_reduced_scaled = solver.complete(X_fold_reduced)
X_completed_reduced = biscaler.inverse_transform(
X_completed_reduced_scaled)
X_completed = np.zeros_like(X)
X_completed[ok_mesh] = X_completed_reduced
y_pred = X_completed[test_coords]
mae, tau, auc, f1_score = evaluate_predictions(
y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50)
scores.add_many(
method_name,
mae=mae,
tau=tau,
f1_score=f1_score,
auc=auc)
scores.to_csv(args.output_file)
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from fancyimpute.dictionary_helpers import (
dense_matrix_from_nested_dictionary,
transpose_nested_dictionary,
)
from mhcflurry.data import load_allele_dicts
import sklearn.metrics
from sklearn.cross_validation import StratifiedKFold
from scipy import stats
import numpy as np
def evaluate_predictions(
y_true,
y_pred,
max_ic50):
"""
Return mean absolute error, Kendall tau, AUC and F1 score
"""
if len(y_pred) != len(y_true):
raise ValueError("y_pred must have same number of elements as y_true")
mae = np.mean(np.abs(y_true - y_pred))
# if all predictions are the same
if (y_pred[0] == y_pred).all():
logging.warn("All predicted values were the same: %f" % y_pred[0])
return (mae, 0, 0.5, 0)
tau, _ = stats.kendalltau(
y_pred,
y_true)
if np.isnan(tau):
logging.warn("Value for Kendall's tau was NaN (n=%d)" % (len(y_true),))
tau = 0.0
true_ic50s = max_ic50 ** (1.0 - np.array(y_true))
predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred))
true_binding_label = true_ic50s <= 500
if true_binding_label.all():
logging.warn("All true labels are binders, can't compute AUC")
return (mae, tau, 0.5, 0)
elif not true_binding_label.any():
logging.warn("No true labels are binders, can't compute AUC")
# can't compute AUC or F1 without both negative and positive cases
return (mae, tau, 0.5, 0)
auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred)
predicted_binding_label = predicted_ic50s <= 500
if predicted_binding_label.all():
logging.warn("All predicted labels are binders, can't compute F1")
return (mae, tau, auc, 0)
elif not predicted_binding_label.any():
logging.warn("No predicted labels are binders, can't compute F1")
# can't compute F1 without both positive and negative predictions
return (mae, tau, auc, 0)
f1_score = sklearn.metrics.f1_score(
true_binding_label,
predicted_binding_label)
return mae, tau, auc, f1_score
def prune_data(
X,
peptide_list,
allele_list,
min_observations_per_peptide=1,
min_observations_per_allele=1):
observed_mask = np.isfinite(X)
n_observed_per_peptide = observed_mask.sum(axis=1)
too_few_peptide_observations = (
n_observed_per_peptide < min_observations_per_peptide)
if too_few_peptide_observations.any():
drop_peptide_indices = np.where(too_few_peptide_observations)[0]
keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
print("Dropping %d peptides with <%d observations" % (
len(drop_peptide_indices),
min_observations_per_peptide))
X = X[keep_peptide_indices]
observed_mask = observed_mask[keep_peptide_indices]
peptide_list = [peptide_list[i] for i in keep_peptide_indices]
n_observed_per_allele = observed_mask.sum(axis=0)
too_few_allele_observations = (
n_observed_per_allele < min_observations_per_allele)
if too_few_allele_observations.any():
drop_allele_indices = np.where(too_few_allele_observations)[0]
keep_allele_indices = np.where(~too_few_allele_observations)[0]
print("Dropping %d alleles with <%d observations: %s" % (
len(drop_allele_indices),
min_observations_per_allele,
[allele_list[i] for i in drop_allele_indices]))
X = X[:, keep_allele_indices]
observed_mask = observed_mask[:, keep_allele_indices]
allele_list = [allele_list[i] for i in keep_allele_indices]
return X, peptide_list, allele_list
def load_data(
binding_data_csv,
max_ic50,
only_human=False,
min_observations_per_allele=1,
min_observations_per_peptide=1):
allele_to_peptide_to_affinity = load_allele_dicts(
binding_data_csv,
max_ic50=max_ic50,
only_human=only_human,
regression_output=True)
peptide_to_allele_to_affinity = transpose_nested_dictionary(
allele_to_peptide_to_affinity)
n_binding_values = sum(
len(allele_dict)
for allele_dict in
allele_to_peptide_to_affinity.values()
)
print("Loaded %d binding values for %d alleles" % (
n_binding_values,
len(allele_to_peptide_to_affinity)))
X, peptide_list, allele_list = \
dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity)
return prune_data(
X,
peptide_list,
allele_list,
min_observations_per_peptide=min_observations_per_peptide,
min_observations_per_allele=min_observations_per_allele)
def index_counts(indices):
max_index = indices.max()
counts = np.zeros(max_index + 1, dtype=int)
for index in indices:
counts[index] += 1
return counts
def stratified_cross_validation(X, observed_mask, n_folds=10):
n_observed = observed_mask.sum()
(observed_peptide_index, observed_allele_index) = np.where(observed_mask)
observed_indices = np.ravel_multi_index(
(observed_peptide_index, observed_allele_index),
dims=observed_mask.shape)
assert len(observed_indices) == n_observed
observed_allele_counts = observed_mask.sum(axis=0)
print("# observed per allele: %s" % (observed_allele_counts,))
assert (index_counts(observed_allele_index) == observed_allele_counts).all()
kfold = StratifiedKFold(
observed_allele_index,
n_folds=n_folds,
shuffle=True)
for (_, indirect_test_indices) in kfold:
test_linear_indices = observed_indices[indirect_test_indices]
test_coords = np.unravel_index(
test_linear_indices,
dims=observed_mask.shape)
test_allele_counts = index_counts(test_coords[1])
allele_fractions = test_allele_counts / observed_allele_counts.astype(float)
print("Fraction of each allele in this CV fold: %s" % (allele_fractions,))
X_test_vector = X[test_coords]
X_fold = X.copy()
X_fold[test_coords] = np.nan
empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0
ok_row_mask = ~empty_row_mask
ok_row_indices = np.where(ok_row_mask)[0]
empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0
ok_col_mask = ~empty_col_mask
ok_col_indices = np.where(ok_col_mask)[0]
ok_mesh = np.ix_(ok_row_indices, ok_col_indices)
print("Dropping %d empty rows, %d empty columns" % (
empty_row_mask.sum(),
empty_col_mask.sum()))
yield (X_fold, ok_mesh, test_coords, X_test_vector)
@@ -143,6 +143,8 @@ parser.add_argument(
help="Comma separated list of optimization methods")
if __name__ == "__main__":
args = parser.parse_args()
configs = generate_all_model_configs(
......
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import argparse
import pandas as pd
from dataset_paths import PETERS2009_CSV_PATH
from allele_similarities import compute_allele_similarities
from synthetic_data import (
create_reverse_lookup_from_simple_dicts,
synthesize_affinities_for_all_alleles,
load_sims_dict
)
from mhcflurry.data import load_allele_dicts
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--output-csv",
help="Path for CSV containing synthesized dataset of affinities",
required=True)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--allele-similarity-csv",
required=False)
parser.add_argument(
"--smoothing-coef",
default=0.001,
type=float,
help="Smoothing value used for peptides with low weight across alleles")
parser.add_argument(
"--similarity-exponent",
default=2.0,
type=float,
help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
if __name__ == "__main__":
args = parser.parse_args()
print(args)
allele_to_peptide_to_affinity = load_allele_dicts(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
regression_output=True)
print("Loaded binding data for %d alleles" % (
len(allele_to_peptide_to_affinity),))
reverse_lookup = create_reverse_lookup_from_simple_dicts(
allele_to_peptide_to_affinity)
print("Created reverse lookup dictionary for %d peptides" % len(reverse_lookup))
if args.allele_similarity_csv:
sims_dict = load_sims_dict(
args.allele_similarity_csv,
allele_pair_keys=False)
allele_names = list(sims_dict.keys())
print("Loaded similarities between %d alleles from %s" % (
len(allele_names),
args.allele_similarity_csv))
else:
sims_dict, _, _ = \
compute_allele_similarities(
allele_to_peptide_to_affinity,
min_weight=0.1)
synthetic_data = synthesize_affinities_for_all_alleles(
peptide_to_affinities=reverse_lookup,
pairwise_allele_similarities=sims_dict,
smoothing=args.smoothing_coef,
exponent=args.similarity_exponent,
allele_pair_keys=False)
synthetic_data_dict = OrderedDict([
("mhc", []),
("sequence", []),
("ic50", []),
])
for allele, allele_entries in sorted(synthetic_data.items()):
print("%s (n=%d)" % (allele, len(allele_entries)))
for (peptide, regression_value) in allele_entries.items():
synthetic_data_dict["mhc"].append(allele)
synthetic_data_dict["sequence"].append(peptide)
ic50 = args.max_ic50 ** (1.0 - regression_value)
synthetic_data_dict["ic50"].append(ic50)
synthetic_data_df = pd.DataFrame(synthetic_data_dict)
print("Created dataset with %d synthesized pMHC affinities" % (
len(synthetic_data_df),))
synthetic_data_df.to_csv(args.output_csv, index=False)
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions for creating synthetic pMHC affinities from allele similarities
and true binding data
"""
from collections import defaultdict
import pandas as pd
from common import curry_dictionary
def load_sims_dict(csv_path, allele_pair_keys=True):
"""
Read in allele-to-allele similarities as DataFrame
"""
sims_df = pd.read_csv(csv_path)
# extract dictionary mapping pairs of alleles to similarity
sims_dict = {
(row["allele_A"], row["allele_B"]): row["similarity"]
for (_, row)
in sims_df.iterrows()
}
if not allele_pair_keys:
return curry_dictionary(sims_dict)
else:
return sims_dict
def synthesize_affinities_for_single_allele(
similarities,
peptide_to_affinities,
smoothing=0.005,
exponent=2.0,
exclude_alleles=[]):
"""
Parameters
----------
similarities : dict
Dictionary mapping allele names to their similarity to the
allele of interest.
peptide_to_affinities : dict
Dictionary mapping peptide sequence to list of triplets containing
the following fields:
- allele
- log-scaled affinity normalized to 0.0 = no binding, 1.0 = strong
- sample weight
smoothing : float
exponent : float
exclude_alleles : collection of allele names
Don't use these alleles for synthesizing new affinities
Returns dictionary mapping peptide sequences to affinities.
"""
assert isinstance(similarities, dict), \
"Wrong type for similarities: %s" % type(similarities)
results = {}
for peptide, affinities in peptide_to_affinities.items():
total = 0.0
denom = 0.0
for entry in affinities:
if len(entry) == 2:
(allele, y) = entry
sample_weight = 1.0
else:
assert len(entry) == 3
(allele, y, sample_weight) = entry
if allele in exclude_alleles:
continue
sim = similarities.get(allele, 0)
if sim == 0:
continue
combined_weight = sim ** exponent * sample_weight
total += combined_weight * y
denom += combined_weight
if denom > 0.0:
results[peptide] = total / (smoothing + denom)
return results
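# Worked toy example (not part of the commit) of the weighting above: two
# neighbor alleles report affinities 0.9 and 0.5 for the same peptide.
toy_sims = {"HLA-A0202": 0.8, "HLA-A0301": 0.4}
toy_lookup = {"SIINFEKL": [("HLA-A0202", 0.9, 1.0), ("HLA-A0301", 0.5, 1.0)]}
toy_out = synthesize_affinities_for_single_allele(
    toy_sims, toy_lookup, smoothing=0.005, exponent=2.0)
# weights: 0.8 ** 2 = 0.64 and 0.4 ** 2 = 0.16
# numerator   = 0.64 * 0.9 + 0.16 * 0.5 = 0.656
# denominator = 0.005 + 0.64 + 0.16    = 0.805
# so the synthesized affinity is roughly 0.656 / 0.805 = 0.815
print(toy_out["SIINFEKL"])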
def create_reverse_lookup_from_allele_data_objects(allele_datasets):
"""
Given a dictionary mapping each allele name to an AlleleData object,
create reverse-lookup dictionary mapping each peptide to a list of triplets:
[(allele, regression_output, weight), ...]
"""
peptide_affinities_dict = defaultdict(list)
for allele, dataset in allele_datasets.items():
for peptide, y, weight in zip(
dataset.peptides, dataset.Y, dataset.weights):
entry = (allele, y, weight)
peptide_affinities_dict[peptide].append(entry)
return peptide_affinities_dict
def create_reverse_lookup_from_simple_dicts(affinities):
"""
Create a lookup table from peptides to lists of (allele, affinity, weight)
"""
reverse_lookup = defaultdict(list)
for allele, affinity_dict in affinities.items():
for (peptide, affinity) in affinity_dict.items():
reverse_lookup[peptide].append((allele, affinity, 1.0))
return reverse_lookup
def synthesize_affinities_for_all_alleles(
peptide_to_affinities,
pairwise_allele_similarities,
allele_pair_keys=True,
smoothing=0.005,
exponent=2.0):
"""
peptide_to_affinities : dict
Maps each peptide to list of either
(allele, affinity) or (allele, affinity, weight)
pairwise_allele_similarities : dict
Dictionary from allele -> allele -> value between 0..1
smoothing : float
exponent : float
"""
if allele_pair_keys:
pairwise_allele_similarities = curry_dictionary(
pairwise_allele_similarities)
all_predictions = {}
allele_names = set(pairwise_allele_similarities.keys())
for allele in allele_names:
all_predictions[allele] = synthesize_affinities_for_single_allele(
similarities=pairwise_allele_similarities[allele],
peptide_to_affinities=peptide_to_affinities,
smoothing=smoothing,
exponent=exponent)
return all_predictions
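# Hypothetical end-to-end sketch (not in the commit): curried similarities plus
# a peptide reverse-lookup yield one synthetic affinity dictionary per allele.
example_sims = {"HLA-A0201": {"HLA-A0202": 0.9}, "HLA-A0202": {"HLA-A0201": 0.9}}
example_lookup = {"SIINFEKL": [("HLA-A0201", 0.8, 1.0), ("HLA-A0202", 0.6, 1.0)]}
synthetic = synthesize_affinities_for_all_alleles(
    peptide_to_affinities=example_lookup,
    pairwise_allele_similarities=example_sims,
    allele_pair_keys=False)
# HLA-A0201's synthetic value comes only from HLA-A0202's measurement, since an
# allele's own entry has no self-similarity in example_sims and is skipped.
print(synthetic["HLA-A0201"]["SIINFEKL"])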
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import mhcflurry
from mhcflurry.data import (
load_allele_datasets,
encode_peptide_to_affinity_dict,
)
from arg_parsing import parse_int_list, parse_float_list
from dataset_paths import PETERS2009_CSV_PATH
from common import load_csv_binding_data_as_dict
from training_helpers import (
kfold_cross_validation_of_model_params_with_synthetic_data,
average_prediction_scores_list
)
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--synthetic-data-csv",
required=True,
help="CSV with {mhc, sequence, ic50} columns of synthetic data")
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--embedding-dim-sizes",
default=[4, 8, 16, 32, 64],
type=parse_int_list)
parser.add_argument(
"--hidden-layer-sizes",
default=[4, 8, 16, 32, 64, 128, 256],
type=parse_int_list)
parser.add_argument(
"--dropouts",
default=[0.0, 0.25, 0.5],
type=parse_float_list)
parser.add_argument(
"--activation-functions",
default=["tanh", "relu"],
type=lambda s: [si.strip() for si in s.split(",")])
parser.add_argument(
"--training-epochs",
type=int,
default=250)
parser.add_argument("--log-file")
def get_extra_data(allele, train_datasets, expanded_predictions):
original_dataset = train_datasets[allele]
original_peptides = set(original_dataset.peptides)
expanded_allele_affinities = expanded_predictions[allele]
extra_affinities = {
k: v
for (k, v) in expanded_allele_affinities.items()
if k not in original_peptides
}
extra_peptides = list(extra_affinities.keys())
extra_values = list(extra_affinities.values())
extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9)
extra_Y = np.array(extra_values)
return extra_X, extra_Y
def rescale_ic50(ic50, max_ic50):
log_ic50 = np.log(ic50) / np.log(max_ic50)
return max(0.0, min(1.0, 1.0 - log_ic50))
def load_synthetic_data(csv_path, max_ic50):
synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
csv_path)
return {
allele: {
peptide: rescale_ic50(ic50, max_ic50=max_ic50)
for (peptide, ic50)
in allele_dict.items()
}
for (allele, allele_dict)
in synthetic_allele_to_peptide_to_ic50_dict.items()
}
if __name__ == "__main__":
args = parser.parse_args()
print(args)
print("Loading binding data from %s" % args.binding_data_csv)
allele_datasets = load_allele_datasets(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=False)
print("Loading synthetic data from %s" % args.synthetic_data_csv)
synthetic_affinities = load_synthetic_data(
csv_path=args.synthetic_data_csv,
max_ic50=args.max_ic50)
combined_allele_set = set(allele_datasets.keys()).union(
synthetic_affinities.keys())
combined_allele_list = list(sorted(combined_allele_set))
if args.log_file:
logfile = open(args.log_file, "w")
logfile.write("allele,n_samples,n_unique,n_synth,")
else:
logfile = None
logfile_needs_header = True
for allele in combined_allele_list:
synthetic_allele_dict = synthetic_affinities[allele]
(_, _, Counts_synth, X_synth, _, Y_synth) = \
encode_peptide_to_affinity_dict(synthetic_allele_dict)
synthetic_sample_weights = 1.0 / Counts_synth
scores = {}
source_peptides = allele_datasets[allele].original_peptides
X_original = allele_datasets[allele].X_index
Y_original = allele_datasets[allele].Y
n_samples = len(X_original)
n_unique_samples = len(set(source_peptides))
n_synth_samples = len(synthetic_sample_weights)
for dropout in args.dropouts:
for embedding_dim_size in args.embedding_dim_sizes:
for hidden_layer_size in args.hidden_layer_sizes:
for activation in args.activation_functions:
params = (
("dropout_probability", dropout),
("embedding_dim_size", embedding_dim_size),
("hidden_layer_size", hidden_layer_size),
("activation", activation),
)
print(
"Evaluating allele %s (n=%d, unique=%d): %s" % (
allele,
n_samples,
n_unique_samples,
params))
fold_scores = \
kfold_cross_validation_of_model_params_with_synthetic_data(
X_original=X_original,
Y_original=Y_original,
source_peptides_original=source_peptides,
X_synth=X_synth,
Y_synth=Y_synth,
original_sample_weights=allele_datasets[allele].weights,
synthetic_sample_weights=synthetic_sample_weights,
n_training_epochs=args.training_epochs,
max_ic50=args.max_ic50,
**dict(params))
average_scores = average_prediction_scores_list(fold_scores)
if logfile:
if logfile_needs_header:
for param_name, _ in params:
logfile.write("%s," % param_name)
for score_name in average_scores._fields:
logfile.write("%s," % score_name)
logfile.write("param_id\n")
logfile_needs_header = False
logfile.write("%s,%d,%d,%d," % (
allele,
n_samples,
n_unique_samples,
n_synth_samples))
for _, param_value in params:
logfile.write("%s," % param_value)
for score_name in average_scores._fields:
score_value = average_scores.__dict__[score_name]
logfile.write("%0.4f," % score_value)
logfile.write("%d\n" % len(scores))
logfile.flush()
scores[params] = average_scores
print("%s => %s" % (params, average_scores))
if logfile:
logfile.close()
@@ -12,13 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from collections import OrderedDict, namedtuple
import logging
from itertools import groupby
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.cross_validation import LabelKFold
from scipy import stats
import mhcflurry
from mhcflurry.common import normalize_allele_name
from mhcflurry.data_helpers import indices_to_hotshot_encoding
from mhcflurry.data import indices_to_hotshot_encoding
PredictionScores = namedtuple("PredictionScores", "tau auc f1 accuracy mae")
def encode_allele_dataset(
@@ -81,27 +89,54 @@ def encode_allele_datasets(
return (X_dict, Y_log_ic50_dict, ic50_dict)
def f1_score(true_label, label_pred):
tp = (true_label & label_pred).sum()
fp = ((~true_label) & label_pred).sum()
fn = (true_label & (~label_pred)).sum()
recall = (tp / float(tp + fn)) if (tp + fn) > 0 else 0.0
precision = (tp / float(tp + fp)) if (tp + fp) > 0 else 0.0
if (precision + recall) > 0:
return (2 * precision * recall) / (precision + recall)
else:
return 0.0
def score_predictions(predicted_log_ic50, true_label, max_ic50):
"""Computes accuracy, AUC, and F1 score of predictions"""
auc = roc_auc_score(true_label, predicted_log_ic50)
ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50)
label_pred = (ic50_pred <= 500)
same_mask = true_label == label_pred
def score_predictions(
predicted_log_ic50,
true_log_ic50,
max_ic50):
"""Computes Kendall-tau, AUC, F1 score, and accuracy of predictions
Parameters
----------
predicted_log_ic50 : np.array
true_log_ic50 : np.array
max_ic50 : float
Returns PredictionScores object with fields (tau, auc, f1, accuracy, mae)
"""
mae = np.abs(predicted_log_ic50 - true_log_ic50).mean()
tau, _ = stats.kendalltau(predicted_log_ic50, true_log_ic50)
if np.isnan(tau):
logging.warn("Kendall tau was NaN!")
tau = 0.0
true_ic50s = max_ic50 ** (1.0 - np.array(true_log_ic50))
predicted_ic50s = max_ic50 ** (1.0 - np.array(predicted_log_ic50))
true_binding_label = true_ic50s <= 500
if true_binding_label.all() or not true_binding_label.any():
logging.warn(
("Can't compute AUC, F1, accuracy without both"
" negative and positive ground truth labels"))
return PredictionScores(
tau=tau,
auc=0.5,
f1=0.0,
accuracy=0.0,
mae=mae)
auc = metrics.roc_auc_score(true_binding_label, predicted_log_ic50)
predicted_binding_label = predicted_ic50s <= 500
f1_score = metrics.f1_score(true_binding_label, predicted_binding_label)
same_mask = true_binding_label == predicted_binding_label
accuracy = np.mean(same_mask)
f1 = f1_score(true_label, label_pred)
return accuracy, auc, f1
return PredictionScores(
tau=tau,
auc=auc,
f1=f1_score,
accuracy=accuracy,
mae=mae)
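A minimal usage sketch (not part of the original module; array values are arbitrary), assuming score_predictions is in scope:

import numpy as np

predicted = np.array([0.9, 0.2, 0.7, 0.1])   # hypothetical log-scaled predictions
true = np.array([0.95, 0.1, 0.8, 0.3])       # hypothetical log-scaled ground truth
scores = score_predictions(
    predicted_log_ic50=predicted,
    true_log_ic50=true,
    max_ic50=50000.0)
print(scores.tau, scores.auc, scores.f1, scores.accuracy, scores.mae)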
def train_model_and_return_scores(
......@@ -109,7 +144,7 @@ def train_model_and_return_scores(
X_train,
log_ic50_train,
X_test,
binder_label_test,
log_ic50_test,
n_training_epochs,
minibatch_size,
max_ic50):
......@@ -120,8 +155,339 @@ def train_model_and_return_scores(
verbose=0,
batch_size=minibatch_size)
pred = model.predict(X_test).flatten()
accuracy, auc, f1_score = score_predictions(
return score_predictions(
predicted_log_ic50=pred,
true_label=binder_label_test,
true_log_ic50=log_ic50_test,
max_ic50=max_ic50)
return (accuracy, auc, f1_score)
def train_model_with_synthetic_data(
model,
n_training_epochs,
X_original,
Y_original,
X_synth,
Y_synth,
original_sample_weights=None,
synthetic_sample_weights=None):
n_actual_samples, n_actual_dims = X_original.shape
n_synth_samples, n_synth_dims = X_synth.shape
if original_sample_weights is None:
original_sample_weights = np.ones_like(Y_original)
if synthetic_sample_weights is None:
synthetic_sample_weights = np.ones_like(Y_synth)
total_synth_weights = synthetic_sample_weights.sum()
total_original_weights = original_sample_weights.sum()
print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % (
np.mean(Y_original),
np.mean(Y_synth),
np.mean(original_sample_weights),
np.mean(synthetic_sample_weights)))
combined_weights = np.concatenate([
original_sample_weights,
synthetic_sample_weights
])
assert n_actual_dims == n_synth_dims, \
"Mismatch between # of actual dims %d and synthetic dims %d" % (
n_actual_dims, n_synth_dims)
X_combined = np.vstack([X_original, X_synth])
n_combined_samples = n_actual_samples + n_synth_samples
assert X_combined.shape[0] == n_combined_samples, \
"Expected %d samples but got data array with shape %s" % (
n_actual_samples + n_synth_samples, X_combined.shape)
assert len(combined_weights) == n_combined_samples, \
"Expected combined_weights to have length %d but got shape = %s" % (
n_combined_samples,
combined_weights.shape)
Y_combined = np.concatenate([Y_original, Y_synth])
assert Y_combined.min() >= 0, \
"Y should not contain negative numbers! Y.min() = %f" % (
Y_combined.min(),)
assert Y_combined.max() <= 1, \
"Y should have max value 1.0, got Y.max() = %f" % (
Y_combined.max(),)
for epoch in range(n_training_epochs):
# weights for synthetic points can be shrunk as:
# ~ 1 / (1+epoch)**2
# or
# e ** -epoch
decay_factor = np.exp(-epoch)
# if the contribution of synthetic samples is less than a
# thousandth of the actual data, then stop using it
synth_contribution = total_synth_weights * decay_factor
synth_fraction_contribution = synth_contribution / (
synth_contribution + total_original_weights)
# only use synthetic data if it contributes at least 1/1000th of
# sample weight
use_synth_data = synth_fraction_contribution > 0.001
if use_synth_data:
combined_weights[n_actual_samples:] = (
synthetic_sample_weights * decay_factor)
model.fit(
X_combined,
Y_combined,
sample_weight=combined_weights,
nb_epoch=1,
verbose=0)
else:
model.fit(
X_original,
Y_original,
sample_weight=original_sample_weights,
nb_epoch=1,
verbose=0)
Y_pred = model.predict(X_original)
training_mse = ((Y_original - Y_pred) ** 2).mean()
real_data_percent = (
((1.0 - synth_fraction_contribution) * 100)
if use_synth_data
else 100
)
synth_data_percent = (
(synth_fraction_contribution * 100)
if use_synth_data else 0
)
print(
("-- Epoch %d/%d real data weight %0.2f%%,"
" synth data weight %0.2f%%,"
" Training MSE %0.4f") % (
epoch + 1,
n_training_epochs,
real_data_percent,
synth_data_percent,
training_mse))
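To make the exponential decay schedule above concrete, here is a small standalone illustration; the weight totals are assumed example values, not taken from any dataset:

import numpy as np

total_synth_weights = 10000.0     # assumed synthetic weight total
total_original_weights = 1000.0   # assumed real-data weight total
for epoch in range(12):
    decay_factor = np.exp(-epoch)
    synth_contribution = total_synth_weights * decay_factor
    fraction = synth_contribution / (synth_contribution + total_original_weights)
    # fraction falls from ~0.91 at epoch 0 to ~0.009 at epoch 7 and drops below
    # the 0.001 cutoff around epoch 10, after which only real data is used
    print(epoch, round(fraction, 5))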
def peptide_group_indices(original_peptide_sequences):
"""
Given a list of peptide sequences, some of which may be identical,
generate an index list for each run of consecutive identical entries
(inputs are expected to be grouped by source peptide).
"""
indices_and_sequences = enumerate(original_peptide_sequences)
for _, subiter in groupby(indices_and_sequences, lambda pair: pair[1]):
yield [idx for (idx, _) in subiter]
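A small illustration of the grouping (itertools.groupby only merges consecutive duplicates, which is why inputs should be ordered by source peptide; the sequences below are arbitrary):

sequences = ["SIINFEKL", "SIINFEKL", "GILGFVFTL", "SIINFEKL"]
print(list(peptide_group_indices(sequences)))
# [[0, 1], [2], [3]] -- the trailing duplicate is not merged with the first run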
def collapse_peptide_group_affinities(
predictions,
true_values,
original_peptide_sequences,
combine_multiple_predictions_fn=np.median):
"""
Given predictions and true log-transformed IC50 values for 9mers which may
have been either elongated or shortened from peptide sequences of other
lengths, collapse the log-transformed IC50 values to a single value
per original peptide sequence.
"""
assert len(predictions) == len(true_values), \
("Expected predictions (%d elements) to have same length"
" as true values (%d elements)") % (len(predictions), len(true_values))
assert len(predictions) == len(original_peptide_sequences), \
("Expected predictions (%d elements) to have same length"
" as peptide sequences (%d elements)") % (
len(predictions), len(original_peptide_sequences))
collapsed_predictions = []
collapsed_true_values = []
for peptide_indices in peptide_group_indices(original_peptide_sequences):
if len(peptide_indices) == 1:
idx = peptide_indices[0]
collapsed_predictions.append(predictions[idx])
collapsed_true_values.append(true_values[idx])
else:
collapsed_predictions.append(
combine_multiple_predictions_fn(predictions[peptide_indices]))
collapsed_true_values.append(
combine_multiple_predictions_fn(
true_values[peptide_indices]))
return collapsed_predictions, collapsed_true_values
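An illustrative call (arrays and peptides are invented; the first two rows stand for 9mers expanded from the same longer peptide):

import numpy as np

predictions = np.array([0.8, 0.6, 0.2])
true_values = np.array([0.7, 0.7, 0.1])
peptides = ["AAAWYLKTEV", "AAAWYLKTEV", "SIINFEKL"]
collapsed_pred, collapsed_true = collapse_peptide_group_affinities(
    predictions, true_values, peptides)
# collapsed_pred -> [0.7, 0.2]  (median of the first group, then the singleton)
# collapsed_true -> [0.7, 0.1]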
def kfold_cross_validation_of_model_fn_with_synthetic_data(
make_model_fn,
n_training_epochs,
max_ic50,
X_train,
Y_train,
source_peptides_train,
X_synth,
Y_synth,
original_sample_weights,
synthetic_sample_weights,
n_cross_validation_folds,
combine_multiple_predictions_fn=np.median,
random_seed=0):
"""
Given a function which generates fresh copies of a model, use k-fold
cross validation (stratified by the list of source peptide sequences) to
evaluate the predictive performance of this model type.
Returns a list of (trained model, PredictionScores) pairs, where each
PredictionScores object has the fields:
("tau", "auc", "f1", "accuracy", "mae")
"""
assert len(X_train) == len(Y_train)
assert len(source_peptides_train) == len(X_train)
# randomly shuffle the training data first
indices = np.arange(len(X_train))
np.random.seed(random_seed)
np.random.shuffle(indices)
X_train, Y_train, weights_train = \
X_train[indices], Y_train[indices], original_sample_weights[indices]
source_peptides_train = [source_peptides_train[i] for i in indices]
# we need to do cross-validation carefully since some of our training
# samples may be extracted from the same longer peptide.
# To avoid training vs. test contamination we do k-fold cross validation
# stratified by the "source" peptides of each sample.
cv_iterator = enumerate(
LabelKFold(
labels=source_peptides_train,
n_folds=n_cross_validation_folds))
results = []
for fold_number, (fold_train_index, fold_test_index) in cv_iterator:
model = make_model_fn()
X_train_fold, X_test_fold = \
X_train[fold_train_index], X_train[fold_test_index]
weights_train_fold = weights_train[fold_train_index]
Y_train_fold, Y_test_fold = \
Y_train[fold_train_index], Y_train[fold_test_index]
peptides_train_fold = [
source_peptides_train[i]
for i in fold_train_index
]
peptides_test_fold = [
source_peptides_train[i]
for i in fold_test_index
]
train_model_with_synthetic_data(
model=model,
n_training_epochs=n_training_epochs,
X_original=X_train_fold,
Y_original=Y_train_fold,
X_synth=X_synth,
Y_synth=Y_synth,
original_sample_weights=weights_train_fold,
synthetic_sample_weights=synthetic_sample_weights)
Y_pred = model.predict(X_test_fold).flatten()
# since some 'epitopes' are actually substrings of longer peptides
# we need to coalesce those predictions by calling the function
# `combine_multiple_predictions_fn` on the set of predictions
# associated with one longer peptide
collapsed_predictions, collapsed_true_values = \
collapse_peptide_group_affinities(
predictions=Y_pred,
true_values=Y_test_fold,
original_peptide_sequences=peptides_test_fold,
combine_multiple_predictions_fn=combine_multiple_predictions_fn)
scores = score_predictions(
predicted_log_ic50=collapsed_predictions,
true_log_ic50=collapsed_true_values,
max_ic50=max_ic50)
print("::: CV fold %d/%d (n_samples=%d, n_unique=%d): %s\n\n" % (
fold_number + 1,
n_cross_validation_folds,
len(peptides_train_fold),
len(set(peptides_train_fold)),
scores))
results.append((model, scores))
return results
def average_prediction_scores_list(model_scores_list):
"""
Given a list of (model, PredictionScores) pairs,
returns a single PredictionScores object whose fields are
the average across the list.
"""
n = len(model_scores_list)
return PredictionScores(
tau=sum(x.tau for (_, x) in model_scores_list) / n,
auc=sum(x.auc for (_, x) in model_scores_list) / n,
f1=sum(x.f1 for (_, x) in model_scores_list) / n,
accuracy=sum(x.accuracy for (_, x) in model_scores_list) / n,
mae=sum(x.mae for (_, x) in model_scores_list) / n)
def kfold_cross_validation_of_model_params_with_synthetic_data(
X_original,
Y_original,
source_peptides_original,
X_synth=None,
Y_synth=None,
original_sample_weights=None,
synthetic_sample_weights=None,
n_training_epochs=150,
n_cross_validation_folds=5,
embedding_dim_size=16,
hidden_layer_size=50,
dropout_probability=0.0,
activation="tanh",
max_ic50=50000.0):
"""
Returns a list of (model, PredictionScores) pairs, one per
cross-validation fold; per-fold scores can be averaged with
average_prediction_scores_list.
"""
n_unique_samples = len(set(source_peptides_original))
if n_cross_validation_folds > n_unique_samples:
n_cross_validation_folds = n_unique_samples
if original_sample_weights is None:
original_sample_weights = np.ones(len(X_original), dtype=float)
# if synthetic data is missing then make an empty array so all the
# downstream code still works
if X_synth is None:
X_synth = np.empty((0, X_original.shape[1]), dtype=float)
# if only the target values for synthetic data are missing then
# treat all synthetic entries as negative examples
if Y_synth is None:
Y_synth = np.zeros(len(X_synth), dtype=float)
if synthetic_sample_weights is None:
synthetic_sample_weights = np.ones(len(X_synth), dtype=float)
def make_model():
return mhcflurry.feedforward.make_embedding_network(
peptide_length=9,
embedding_input_dim=20,
embedding_output_dim=embedding_dim_size,
layer_sizes=[hidden_layer_size],
activation=activation,
init="lecun_uniform",
loss="mse",
output_activation="sigmoid",
dropout_probability=dropout_probability,
optimizer=None,
learning_rate=0.001)
model_scores_list = kfold_cross_validation_of_model_fn_with_synthetic_data(
make_model_fn=make_model,
n_training_epochs=n_training_epochs,
max_ic50=max_ic50,
X_train=X_original,
Y_train=Y_original,
source_peptides_train=source_peptides_original,
X_synth=X_synth,
Y_synth=Y_synth,
original_sample_weights=original_sample_weights,
synthetic_sample_weights=synthetic_sample_weights,
n_cross_validation_folds=n_cross_validation_folds)
return model_scores_list
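A hedged sketch of how this cross-validation entry point might be called; allele_data is a hypothetical AlleleData record as produced by load_allele_datasets for a single allele:

model_scores = kfold_cross_validation_of_model_params_with_synthetic_data(
    X_original=allele_data.X_index,
    Y_original=allele_data.Y,
    source_peptides_original=allele_data.original_peptides,
    original_sample_weights=allele_data.weights,
    n_training_epochs=100,
    n_cross_validation_folds=3,
    hidden_layer_size=64)
average_scores = average_prediction_scores_list(model_scores)
print(average_scores)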
......@@ -13,15 +13,19 @@
# limitations under the License.
from . import paths
from . import data_helpers
from . import data
from . import feedforward
from . import common
from .mhc1_binding_predictor import Mhc1BindingPredictor
from . import peptide_encoding
from . import amino_acid
from .class1_binding_predictor import Class1BindingPredictor
__all__ = [
"paths",
"data_helpers",
"data",
"feedforward",
"peptide_encoding",
"amino_acid",
"common",
"Mhc1BindingPredictor"
]
\ No newline at end of file
"Class1BindingPredictor"
]
......@@ -12,7 +12,81 @@
# See the License for the specific language governing permissions and
# limitations under the License.
amino_acids = {
import numpy as np
class Alphabet(object):
"""
Used to track the order of amino acids used for peptide encodings
"""
def __init__(self, **kwargs):
self.letters_to_names = {}
for (k, v) in kwargs.items():
self.add(k, v)
def add(self, letter, name):
assert letter not in self.letters_to_names
assert len(letter) == 1
self.letters_to_names[letter] = name
def letters(self):
return list(sorted(self.letters_to_names.keys()))
def names(self):
return [self.letters_to_names[k] for k in self.letters()]
def index_dict(self):
return {c: i for (i, c) in enumerate(self.letters())}
def copy(self):
return Alphabet(**self.letters_to_names)
def __getitem__(self, k):
return self.letters_to_names[k]
def __setitem__(self, k, v):
self.add(k, v)
def index_encoding_list(self, peptides):
index_dict = self.index_dict()
return [
[index_dict[amino_acid] for amino_acid in peptide]
for peptide in peptides
]
def index_encoding(self, peptides, peptide_length):
"""
Encode a set of equal length peptides as a matrix of their
amino acid indices.
"""
X = np.zeros((len(peptides), peptide_length), dtype=int)
index_dict = self.index_dict()
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
X[i, j] = index_dict[amino_acid]
return X
def hotshot_encoding(
self,
peptides,
peptide_length):
"""
Encode a set of equal length peptides as a binary matrix,
where each letter is transformed into a length 20 vector with a single
element that is 1 (and the others are 0).
"""
shape = (len(peptides), peptide_length, 20)
index_dict = self.index_dict()
X = np.zeros(shape, dtype=bool)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
k = index_dict[amino_acid]
X[i, j, k] = 1
return X
common_amino_acids = Alphabet(**{
"A": "Alanine",
"R": "Arginine",
"N": "Asparagine",
......@@ -33,8 +107,9 @@ amino_acids = {
"W": "Tryptophan",
"Y": "Tyrosine",
"V": "Valine",
}
})
common_amino_acid_letters = common_amino_acids.letters()
amino_acid_letters = list(sorted(amino_acids.keys()))
amino_acid_names = [amino_acids[k] for k in amino_acid_letters]
amino_acid_letter_indices = {c: i for (i, c) in enumerate(amino_acid_letters)}
amino_acids_with_unknown = common_amino_acids.copy()
amino_acids_with_unknown.add("X", "Unknown")
amino_acids_with_unknown_letters = amino_acids_with_unknown.letters()
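A minimal sketch of the Alphabet encodings in use (the peptide strings are arbitrary 9mers):

peptides = ["SIINFEKLA", "AAAAAAAAA"]
X_index = common_amino_acids.index_encoding(peptides, peptide_length=9)
X_hotshot = common_amino_acids.hotshot_encoding(peptides, peptide_length=9)
print(X_index.shape)    # (2, 9), integer indices into the sorted letters
print(X_hotshot.shape)  # (2, 9, 20), one-hot along the last axis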
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Allele specific MHC Class I binding affinity predictor
"""
from __future__ import (
print_function,
division,
absolute_import,
)
import logging
from os import listdir, remove
from os.path import exists, join
import json
import numpy as np
from keras.models import model_from_config
from .common import normalize_allele_name
from .paths import CLASS1_MODEL_DIRECTORY
from .feedforward import make_embedding_network
from .predictor_base import PredictorBase
from .class1_allele_specific_hyperparameters import MAX_IC50
_allele_predictor_cache = {}
class Class1BindingPredictor(PredictorBase):
def __init__(
self,
model,
name=None,
max_ic50=MAX_IC50,
allow_unknown_amino_acids=False,
verbose=False):
PredictorBase.__init__(
self,
name=name,
max_ic50=max_ic50,
allow_unknown_amino_acids=allow_unknown_amino_acids,
verbose=verbose)
self.name = name
self.model = model
@classmethod
def from_disk(
cls,
model_json_path,
weights_hdf_path=None,
name=None,
max_ic50=MAX_IC50):
"""
Load model from stored JSON representation of network and
(optionally) load weights from HDF5 file.
"""
if not exists(model_json_path):
raise ValueError("Model file %s (name = %s) not found" % (
model_json_path, name,))
with open(model_json_path, "r") as f:
config_dict = json.load(f)
model = model_from_config(config_dict)
if weights_hdf_path:
if not exists(weights_hdf_path):
raise ValueError(
"Missing model weights file %s (name = %s)" % (
weights_hdf_path, name))
model.load_weights(weights_hdf_path)
return cls(
model=model,
max_ic50=max_ic50,
name=name)
@classmethod
def from_hyperparameters(
cls,
name=None,
max_ic50=MAX_IC50,
peptide_length=9,
embedding_input_dim=20,
embedding_output_dim=20,
layer_sizes=[50],
activation="tanh",
init="lecun_uniform",
loss="mse",
output_activation="sigmoid",
dropout_probability=0,
learning_rate=0.001,
**kwargs):
"""
Create untrained predictor with the given hyperparameters.
"""
model = make_embedding_network(
peptide_length=peptide_length,
embedding_input_dim=embedding_input_dim,
embedding_output_dim=embedding_output_dim,
layer_sizes=layer_sizes,
activation=activation,
init=init,
loss=loss,
output_activation=output_activation,
dropout_probability=dropout_probability,
learning_rate=learning_rate)
return cls(
name=name,
max_ic50=max_ic50,
model=model,
**kwargs)
def _combine_training_data(
self,
X,
Y,
sample_weights,
X_pretrain,
Y_pretrain,
pretrain_sample_weights,
verbose=False):
"""
Make sure the shapes of given training and pre-training data
conform with each other. Then concatenate the pre-training and the
training data.
Returns (X_combined, Y_combined, weights_combined, n_pretrain_samples)
"""
X = np.asarray(X)
Y = np.asarray(Y)
if verbose:
print("X.shape = %s" % (X.shape,))
n_samples, n_dims = X.shape
if len(Y) != n_samples:
raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % (
n_samples, len(Y)))
if Y.min() < 0:
raise ValueError("Minimum value of Y can't be negative, got %f" % (
Y.min()))
if Y.max() > 1:
raise ValueError("Maximum value of Y can't be greater than 1, got %f" % (
Y.max()))
if sample_weights is None:
sample_weights = np.ones_like(Y)
else:
sample_weights = np.asarray(sample_weights)
if len(sample_weights) != n_samples:
raise ValueError(
"Length of sample_weights (%d) doesn't match number of samples (%d)" % (
len(sample_weights),
n_samples))
if X_pretrain is None or Y_pretrain is None:
X_pretrain = np.empty((0, n_dims), dtype=X.dtype)
Y_pretrain = np.empty((0,), dtype=Y.dtype)
else:
X_pretrain = np.asarray(X_pretrain)
Y_pretrain = np.asarray(Y_pretrain)
if verbose:
print("X_pretrain.shape = %s" % (X_pretrain.shape,))
n_pretrain_samples, n_pretrain_dims = X_pretrain.shape
if n_pretrain_dims != n_dims:
raise ValueError(
"# of dims for pretraining data (%d) doesn't match X.shape[1] = %d" % (
n_pretrain_dims, n_dims))
if len(Y_pretrain) != n_pretrain_samples:
raise ValueError(
"length of Y_pretrain (%d) != length of X_pretrain (%d)" % (
len(Y_pretrain),
len(X_pretrain)))
if len(Y_pretrain) > 0 and Y_pretrain.min() < 0:
raise ValueError("Minimum value of Y_pretrain can't be negative, got %f" % (
Y_pretrain.min()))
if len(Y_pretrain) > 0 and Y_pretrain.max() > 1:
raise ValueError("Maximum value of Y_pretrain can't be greater than 1, got %f" % (
Y_pretrain.max()))
if pretrain_sample_weights is None:
pretrain_sample_weights = np.ones_like(Y_pretrain)
else:
pretrain_sample_weights = np.asarray(pretrain_sample_weights)
if verbose:
print("sample weights mean = %f, pretrain weights mean = %f" % (
sample_weights.mean(),
pretrain_sample_weights.mean()))
X_combined = np.vstack([X_pretrain, X])
Y_combined = np.concatenate([Y_pretrain, Y])
combined_weights = np.concatenate([
pretrain_sample_weights,
sample_weights,
])
return X_combined, Y_combined, combined_weights, n_pretrain_samples
def fit(
self,
X,
Y,
sample_weights=None,
X_pretrain=None,
Y_pretrain=None,
pretrain_sample_weights=None,
n_training_epochs=200,
verbose=False,
batch_size=128):
"""
Train predictive model from index encoding of fixed length 9mer peptides.
Parameters
----------
X : array
Training data with shape (n_samples, n_dims)
Y : array
Training labels with shape (n_samples,)
sample_weights : array
Weight of each training sample with shape (n_samples,)
X_pretrain : array
Extra samples used for soft pretraining of the predictor,
should have same number of dimensions as X. During training the weights
of these samples will decay after each epoch.
Y_pretrain : array
Labels for extra samples, with shape (n_pretrain_samples,)
pretrain_sample_weights : array
Initial weights for the rows of X_pretrain. If not specified then
initialized to ones.
n_training_epochs : int
verbose : bool
batch_size : int
"""
X_combined, Y_combined, combined_weights, n_pretrain = \
self._combine_training_data(
X, Y, sample_weights, X_pretrain, Y_pretrain, pretrain_sample_weights,
verbose=verbose)
total_pretrain_sample_weight = combined_weights[:n_pretrain].sum()
total_train_sample_weight = combined_weights[n_pretrain:].sum()
total_combined_sample_weight = (
total_pretrain_sample_weight + total_train_sample_weight)
if self.verbose:
print("-- Total pretrain weight = %f (%f%%), sample weight = %f (%f%%)" % (
total_pretrain_sample_weight,
100 * total_pretrain_sample_weight / total_combined_sample_weight,
total_train_sample_weight,
100 * total_train_sample_weight / total_combined_sample_weight))
for epoch in range(n_training_epochs):
# weights for synthetic points can be shrunk as:
# ~ 1 / (1+epoch)**2
# or
# e ** -epoch
decay_factor = np.exp(-epoch)
# if the contribution of synthetic samples is less than a
# thousandth of the actual data, then stop using it
pretrain_contribution = total_pretrain_sample_weight * decay_factor
pretrain_fraction_contribution = (
pretrain_contribution / total_combined_sample_weight)
use_pretrain_data = pretrain_fraction_contribution > 0.001
# only use synthetic data if it contributes at least 1/1000th of
# sample weight
if verbose:
real_data_percent = (
((1.0 - pretrain_fraction_contribution) * 100)
if use_pretrain_data
else 100
)
pretrain_data_percent = (
(pretrain_fraction_contribution * 100)
if use_pretrain_data else 0
)
print(
("-- Epoch %d/%d decay=%f, real data weight=%0.2f%%,"
" synth data weight=%0.2f%% (use_pretrain=%s)") % (
epoch + 1,
n_training_epochs,
decay_factor,
real_data_percent,
pretrain_data_percent,
use_pretrain_data))
if use_pretrain_data:
combined_weights[:n_pretrain] *= decay_factor
self.model.fit(
X_combined,
Y_combined,
sample_weight=combined_weights,
nb_epoch=1,
verbose=0,
batch_size=batch_size)
else:
self.model.fit(
X_combined[n_pretrain:],
Y_combined[n_pretrain:],
sample_weight=combined_weights[n_pretrain:],
nb_epoch=1,
verbose=0,
batch_size=batch_size)
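A hedged usage sketch of soft pretraining with this class; X, Y, X_pretrain, and Y_pretrain are assumed to be index-encoded 9mers with log-scaled affinities in [0, 1], produced elsewhere in the library (the allele name and hyperparameters are arbitrary):

predictor = Class1BindingPredictor.from_hyperparameters(
    name="HLA-A0201",
    embedding_output_dim=16,
    layer_sizes=[64])
predictor.fit(
    X, Y,
    X_pretrain=X_pretrain,   # e.g. imputed/synthetic samples
    Y_pretrain=Y_pretrain,
    n_training_epochs=100,
    verbose=True)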
def to_disk(self, model_json_path, weights_hdf_path, overwrite=False):
if exists(model_json_path) and overwrite:
logging.info(
"Removing existing model JSON file '%s'" % (
model_json_path,))
remove(model_json_path)
if exists(model_json_path):
logging.warn(
"Model JSON file '%s' already exists" % (model_json_path,))
else:
logging.info(
"Saving model file %s (name=%s)" % (model_json_path, self.name))
with open(model_json_path, "w") as f:
f.write(self.model.to_json())
if exists(weights_hdf_path) and overwrite:
logging.info(
"Removing existing model weights HDF5 file '%s'" % (
weights_hdf_path,))
remove(weights_hdf_path)
if exists(weights_hdf_path):
logging.warn(
"Model weights HDF5 file '%s' already exists" % (
weights_hdf_path,))
else:
logging.info(
"Saving model weights HDF5 file %s (name=%s)" % (
weights_hdf_path, self.name))
self.model.save_weights(weights_hdf_path)
@classmethod
def from_allele_name(
cls,
allele_name,
model_directory=CLASS1_MODEL_DIRECTORY,
max_ic50=MAX_IC50):
if not exists(model_directory) or len(listdir(model_directory)) == 0:
raise ValueError(
"No MHC prediction models found in %s" % (model_directory,))
allele_name = normalize_allele_name(allele_name)
key = (allele_name, model_directory, max_ic50)
if key in _allele_predictor_cache:
return _allele_predictor_cache[key]
if not exists(model_directory) or len(listdir(model_directory)) == 0:
raise ValueError(
"No MHC prediction models found in %s" % (model_directory,))
model_json_filename = allele_name + ".json"
model_json_path = join(model_directory, model_json_filename)
weights_hdf_filename = allele_name + ".hdf"
weights_hdf_path = join(model_directory, weights_hdf_filename)
predictor = cls.from_disk(
model_json_path=model_json_path,
weights_hdf_path=weights_hdf_path,
name=allele_name,
max_ic50=max_ic50)
_allele_predictor_cache[key] = predictor
return predictor
@classmethod
def supported_alleles(cls, model_directory=CLASS1_MODEL_DIRECTORY):
alleles_with_weights = set([])
alleles_with_models = set([])
for filename in listdir(model_directory):
if filename.endswith(".hdf"):
alleles_with_weights.add(filename.replace(".hdf", ""))
elif filename.endswith(".json"):
alleles_with_models.add(filename.replace(".json", ""))
alleles = alleles_with_models.intersection(alleles_with_weights)
return list(sorted(alleles))
def __repr__(self):
return "Class1BindingPredictor(name=%s, model=%s, max_ic50=%f)" % (
self.name,
self.model,
self.max_ic50)
def __str__(self):
return repr(self)
def predict_encoded(self, X):
max_expected_index = 20 if self.allow_unknown_amino_acids else 19
assert X.max() <= max_expected_index, \
"Got index %d in peptide encoding" % (X.max(),)
return self.model.predict(X, verbose=False).flatten()
......@@ -17,7 +17,6 @@ from __future__ import (
division,
absolute_import,
)
from .amino_acid import amino_acid_letters
def parse_int_list(s):
......@@ -27,35 +26,37 @@ def parse_int_list(s):
def split_uppercase_sequences(s):
return [part.strip().upper() for part in s.split(",")]
MHC_PREFIXES = [
"HLA",
"H-2",
"Mamu",
"Patr",
"Gogo",
"ELA",
]
def normalize_allele_name(allele_name):
def normalize_allele_name(allele_name, default_prefix="HLA"):
"""
Only works for mouse, human, and rhesus monkey alleles.
Only works for a small number of species.
TODO: use the same logic as mhctools for MHC name parsing.
Possibly even worth its own small repo called something like "mhcnames"
"""
allele_name = allele_name.upper()
if allele_name.startswith("MAMU"):
prefix = "Mamu-"
elif allele_name.startswith("H-2") or allele_name.startswith("H2"):
prefix = "H-2-"
else:
prefix = ""
# old school HLA-C serotypes look like "Cw"
allele_name = allele_name.replace("CW", "C")
patterns = [
"HLA-",
"H-2",
"H2",
"MAMU",
"-",
"*",
":"
]
for pattern in patterns:
prefix = default_prefix
for candidate in MHC_PREFIXES:
if (allele_name.startswith(candidate.upper()) or
allele_name.startswith(candidate.replace("-", "").upper())):
prefix = candidate
allele_name = allele_name[len(prefix):]
break
for pattern in MHC_PREFIXES + ["-", "*", ":"]:
allele_name = allele_name.replace(pattern, "")
return "%s%s" % (prefix, allele_name)
return "%s%s" % (prefix + "-" if prefix else "", allele_name)
def split_allele_names(s):
......@@ -64,37 +65,3 @@ def split_allele_names(s):
for part
in s.split(",")
]
def expand_9mer_peptides(peptides, length):
"""
Expand non-9mer peptides using methods from
Accurate approximation method for prediction of class I MHC
affinities for peptides of length 8, 10 and 11 using prediction
tools trained on 9mers.
by Lundegaard et. al.
http://bioinformatics.oxfordjournals.org/content/24/11/1397
"""
assert len(peptides) > 0
if length < 8:
raise ValueError("Invalid peptide length: %d (%s)" % (
length, peptides[0]))
elif length == 9:
return peptides
elif length == 8:
# extend each peptide by inserting every possible amino acid
# between base-1 positions 4-8
return [
peptide[:i] + extra_amino_acid + peptide[i:]
for peptide in peptides
for i in range(3, 8)
for extra_amino_acid in amino_acid_letters
]
else:
# drop interior residues between base-1 positions 4 to last
n_skip = length - 9
return [
peptide[:i] + peptide[i + n_skip:]
for peptide in peptides
for i in range(3, 9)
]
......@@ -17,59 +17,34 @@ from __future__ import (
division,
absolute_import,
)
from collections import namedtuple
from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
from .common import normalize_allele_name
from .amino_acid import amino_acid_letter_indices
AlleleData = namedtuple("AlleleData", "X Y peptides ic50")
def hotshot_encoding(peptides, peptide_length):
"""
Encode a set of equal length peptides as a binary matrix,
where each letter is transformed into a length 20 vector with a single
element that is 1 (and the others are 0).
"""
shape = (len(peptides), peptide_length, 20)
X = np.zeros(shape, dtype=bool)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
k = amino_acid_letter_indices[amino_acid]
X[i, j, k] = 1
return X
def index_encoding(peptides, peptide_length):
"""
Encode a set of equal length peptides as a vector of their
amino acid indices.
"""
X = np.zeros((len(peptides), peptide_length), dtype=int)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
X[i, j] = amino_acid_letter_indices[amino_acid]
return X
from .amino_acid import common_amino_acids
from .peptide_encoding import (
indices_to_hotshot_encoding,
fixed_length_from_many_peptides
)
def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
"""
Given an (n_samples, peptide_length) integer matrix
convert it to a binary encoding of shape:
(n_samples, peptide_length * n_indices)
"""
(n_samples, peptide_length) = X.shape
if not n_indices:
n_indices = X.max() - first_index_value + 1
index_encoding = common_amino_acids.index_encoding
X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool)
for i, row in enumerate(X):
for j, xij in enumerate(row):
X_binary[i, n_indices * j + xij - first_index_value] = 1
return X_binary.astype(float)
AlleleData = namedtuple(
"AlleleData",
[
"X_index", # index-based featue encoding of fixed length peptides
"X_binary", # binary encoding of fixed length peptides
"Y", # regression encoding of IC50 (log scaled between 0..1)
"peptides", # list of fixed length peptide string
"ic50", # IC50 value associated with each entry
"original_peptides", # original peptides may be of different lengths
"original_lengths", # len(original_peptide)
"substring_counts", # how many substrings were extracted from
# each original peptide string
"weights", # 1.0 / count
])
def _infer_csv_separator(filename):
......@@ -168,14 +143,115 @@ def load_dataframe(
regression_output = np.maximum(regression_output, 0.0)
regression_output = np.minimum(regression_output, 1.0)
df["regression_output"] = regression_output
return df
return df, peptide_column_name
def load_allele_dicts(
filename,
max_ic50=50000.0,
regression_output=False,
sep=None,
species_column_name="species",
allele_column_name="mhc",
peptide_column_name=None,
peptide_length_column_name="peptide_length",
ic50_column_name="meas",
only_human=True,
min_allele_size=1):
"""
Parse a CSV of binding data into a dictionary of dictionaries.
The outer key is an allele name, the inner key is a peptide sequence,
and the inner value is either a raw IC50 or a log-transformed value in [0, 1].
"""
binding_df, peptide_column_name = load_dataframe(
filename=filename,
max_ic50=max_ic50,
sep=sep,
species_column_name=species_column_name,
allele_column_name=allele_column_name,
peptide_column_name=peptide_column_name,
peptide_length_column_name=peptide_length_column_name,
ic50_column_name=ic50_column_name,
only_human=only_human)
# map peptides to either the raw IC50 or rescaled log IC50 depending
# on the `regression_output` parameter
output_column_name = (
"regression_output"
if regression_output
else ic50_column_name
)
return {
allele_name: {
peptide: output
for (peptide, output)
in zip(group[peptide_column_name], group[output_column_name])
}
for (allele_name, group)
in binding_df.groupby(allele_column_name)
if len(group) >= min_allele_size
}
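A hedged sketch of calling load_allele_dicts; the CSV path and allele name are hypothetical, and the column layout is assumed to match the defaults above:

allele_dicts = load_allele_dicts(
    "bindingdata.csv",        # hypothetical input file
    max_ic50=50000.0,
    regression_output=True,   # values rescaled to [0, 1] instead of raw IC50
    min_allele_size=25)
affinities_for_a0201 = allele_dicts.get("HLA-A0201", {})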
def encode_peptide_to_affinity_dict(
peptide_to_affinity_dict,
peptide_length=9,
flatten_binary_encoding=True):
"""
Given a dictionary mapping from peptide sequences to affinity values,
returns tuple with the following fields:
- kmer_peptides: fixed length peptide strings
- original_peptides: variable length peptide strings
- counts: how many fixed length peptides were made from this original
- X_index: index encoding of fixed length peptides
- X_binary: binary encoding of fixed length peptides
- Y: affinity values associated with original peptides
"""
raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
kmer_peptides, original_peptides, counts = \
fixed_length_from_many_peptides(
peptides=raw_peptides,
desired_length=peptide_length,
start_offset_shorten=0,
end_offset_shorten=0,
start_offset_extend=0,
end_offset_extend=0)
n_samples = len(kmer_peptides)
assert n_samples == len(original_peptides), \
"Mismatch between # of samples (%d) and # of peptides (%d)" % (
n_samples, len(original_peptides))
assert n_samples == len(counts), \
"Mismatch between # of samples (%d) and # of counts (%d)" % (
n_samples, len(counts))
X_index = index_encoding(kmer_peptides, peptide_length)
X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
assert X_binary.shape[0] == X_index.shape[0], \
("Mismatch between number of samples for index encoding (%d)"
" vs. binary encoding (%d)") % (
X_binary.shape[0],
X_index.shape[0])
if flatten_binary_encoding:
# collapse 3D input into 2D matrix
n_binary_features = peptide_length * 20
X_binary = X_binary.reshape((n_samples, n_binary_features))
# easier to work with counts when they're an array instead of list
counts = np.array(counts)
Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
assert n_samples == len(Y), \
"Mismatch between # peptides %d and # regression outputs %d" % (
n_samples, len(Y))
return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
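An illustrative call on a tiny peptide-to-affinity dictionary (affinity values are arbitrary); longer peptides are cut down to several 9mer substrings, each inheriting the original peptide's affinity:

kmers, originals, counts, X_index, X_binary, Y = \
    encode_peptide_to_affinity_dict(
        {"SIINFEKLA": 0.9, "AAAWYLKTEVA": 0.2},
        peptide_length=9)
# each 9mer derived from the 11mer shares its affinity (0.2) in Y and a
# substring count > 1, which is later used to down-weight those samples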
def load_allele_datasets(
filename,
peptide_length=9,
max_ic50=5000.0,
binary_encoding=True,
use_multiple_peptide_lengths=True,
max_ic50=50000.0,
flatten_binary_encoding=True,
sep=None,
species_column_name="species",
......@@ -202,14 +278,18 @@ def load_allele_datasets(
peptide_length : int
Which length peptides to use (default=9)
use_multiple_peptide_lengths : bool
If a peptide is shorter than `peptide_length`, expand it into many
peptides of the appropriate length by inserting all combinations of
amino acids. Similarly, if a peptide is longer than `peptide_length`,
shorten it by deleting stretches of contiguous amino acids at all
peptide positions.
max_ic50 : float
Treat IC50 scores above this value as all equally bad
(transform them to 0.0 in the rescaled output)
binary_encoding : bool
Encode amino acids of each peptide as indices or binary vectors
flatten_features : bool
flatten_binary_encoding : bool
If False, returns a (n_samples, peptide_length, 20) matrix, otherwise
returns the 2D flattened version of the same data.
......@@ -222,7 +302,7 @@ def load_allele_datasets(
only_human : bool
Only load entries from human MHC alleles
"""
df = load_dataframe(
df, peptide_column_name = load_dataframe(
filename=filename,
max_ic50=max_ic50,
sep=sep,
......@@ -236,21 +316,83 @@ def load_allele_datasets(
allele_groups = {}
for allele, group in df.groupby(allele_column_name):
ic50 = np.array(group[ic50_column_name])
Y = np.array(group["regression_output"])
peptides = list(group[peptide_column_name])
if binary_encoding:
X = hotshot_encoding(peptides, peptide_length=peptide_length)
if flatten_binary_encoding:
# collapse 3D input into 2D matrix
X = X.reshape((X.shape[0], peptide_length * 20))
else:
X = index_encoding(peptides, peptide_length=peptide_length)
assert allele not in allele_groups, \
"Duplicate datasets for %s" % allele
raw_peptides = group[peptide_column_name]
# filter lengths in case user wants to drop peptides that are longer
# or shorter than the desired fixed length
if not use_multiple_peptide_lengths:
drop_mask = raw_peptides.str.len() != peptide_length
group = group[~drop_mask]
raw_peptides = raw_peptides[~drop_mask]
# convert from a Pandas column to a list, since that's easier to
# interact with later
raw_peptides = list(raw_peptides)
# create dictionaries of outputs from which we can look up values
# after peptides have been expanded
ic50_dict = {
peptide: ic50
for (peptide, ic50)
in zip(raw_peptides, group[ic50_column_name])
}
Y_dict = {
peptide: y
for (peptide, y)
in zip(raw_peptides, group["regression_output"])
}
(kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \
encode_peptide_to_affinity_dict(
Y_dict,
peptide_length=peptide_length,
flatten_binary_encoding=flatten_binary_encoding)
ic50 = np.array([ic50_dict[p] for p in original_peptides])
assert len(kmer_peptides) == len(ic50), \
"Mismatch between # of peptides %d and # IC50 outputs %d" % (
len(kmer_peptides), len(ic50))
allele_groups[allele] = AlleleData(
X=X,
X_index=X_index,
X_binary=X_binary,
Y=Y,
ic50=ic50,
peptides=peptides)
peptides=kmer_peptides,
original_peptides=original_peptides,
original_lengths=[len(peptide) for peptide in original_peptides],
substring_counts=counts,
weights=1.0 / counts)
return allele_groups
def collapse_multiple_peptide_entries(allele_datasets):
"""
If an (allele, peptide) pair occurs multiple times then reduce it
to a single entry by taking the weighted average of affinity values.
Returns a dictionary of alleles, each of which maps to a dictionary of
peptides that map to affinity values.
"""
allele_to_peptide_to_affinity = {}
for (allele, dataset) in allele_datasets.items():
multiple_affinities = defaultdict(list)
for (peptide, normalized_affinity, weight) in zip(
dataset.peptides, dataset.Y, dataset.weights):
multiple_affinities[peptide].append((normalized_affinity, weight))
weighted_averages = {}
for peptide, affinity_weight_tuples in multiple_affinities.items():
denom = 0.0
sum_weighted_affinities = 0.0
for affinity, weight in affinity_weight_tuples:
sum_weighted_affinities += affinity * weight
denom += weight
if denom > 0:
weighted_averages[peptide] = sum_weighted_affinities / denom
allele_to_peptide_to_affinity[allele] = weighted_averages
return allele_to_peptide_to_affinity
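For example, if the same (allele, peptide) pair appears twice with normalized affinities 0.8 (weight 1.0) and 0.2 (weight 0.5), the collapsed value is (0.8 * 1.0 + 0.2 * 0.5) / (1.0 + 0.5) = 0.6.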
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
class Ensemble(object):
def __init__(self, models, name=None):
self.name = name
self.models = models
@classmethod
def from_directory(cls, directory_path):
files = os.listdir(directory_path)
......@@ -128,9 +128,10 @@ def make_hotshot_network(
output_activation="sigmoid",
dropout_probability=0.0,
optimizer=None,
learning_rate=0.001):
learning_rate=0.001,
n_amino_acids=20):
return make_network(
input_size=peptide_length * 20,
input_size=peptide_length * n_amino_acids,
layer_sizes=layer_sizes,
activation=activation,
init=init,
......