diff --git a/experiments/allele_similarities.py b/experiments/allele_similarities.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc3def736b3c995a5e228190337e353ac653ca98
--- /dev/null
+++ b/experiments/allele_similarities.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Helper functions for computing pairwise similarities between alleles
+"""
+
+import numpy as np
+import seaborn
+from fancyimpute import NuclearNormMinimization
+
+from common import matrix_to_dictionary, curry_dictionary
+
+
+def compute_similarities_for_single_allele_from_peptide_overlap(
+        allele_name,
+        allele_to_peptide_to_affinity,
+        allele_name_length=None,
+        min_weight=1.0):
+    """
+    Parameters
+    ----------
+    allele_name : str
+
+    allele_to_peptide_to_affinity : dict
+        Dictionary from allele names to a dictionary of (peptide -> affinity)
+
+    Returns three dictionaries, mapping alleles to
+        - similarity
+        - number of overlapping peptides
+        - total weight of overlapping affinities
+    """
+    sims = {}
+    overlaps = {}
+    weights = {}
+    dataset_a = allele_to_peptide_to_affinity[allele_name]
+    peptide_set_a = set(dataset_a.keys())
+    for other_allele, dataset_b in allele_to_peptide_to_affinity.items():
+        if allele_name_length and len(other_allele) != allele_name_length:
+            continue
+        peptide_set_b = set(dataset_b.keys())
+        intersection = peptide_set_a.intersection(peptide_set_b)
+        overlaps[other_allele] = len(intersection)
+        total = 0.0
+        weight = 0.0
+        for peptide in intersection:
+            ya = dataset_a[peptide]
+            yb = dataset_b[peptide]
+            minval = min(ya, yb)
+            maxval = max(ya, yb)
+            total += minval
+            weight += maxval
+        weights[other_allele] = weight
+        if weight > min_weight:
+            sims[other_allele] = total / weight
+    return sims, overlaps, weights
+
+
+def compute_partial_similarities_from_peptide_overlap(
+        allele_to_peptide_to_affinity,
+        min_weight=1.0,
+        allele_name_length=None):
+    """
+    Determine similarity between pairs of alleles by examining
+    affinity values for overlapping peptides.
Returns curried dictionaries + mapping allele -> allele -> float, where the final value is one of: + - similarity between alleles + - # overlapping peptides + - sum weight of overlapping affinities + """ + sims = {} + overlaps = {} + weights = {} + for allele_name in allele_to_peptide_to_affinity.keys(): + if allele_name_length and len(allele_name) != allele_name_length: + continue + curr_sims, curr_overlaps, curr_weights = \ + compute_similarities_for_single_allele_from_peptide_overlap( + allele_name=allele_name, + allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + allele_name_length=allele_name_length, + min_weight=min_weight) + sims[allele_name] = curr_sims + overlaps[allele_name] = curr_overlaps + weights[allele_name] = curr_weights + return sims, overlaps, weights + + +def build_incomplete_similarity_matrix( + allele_to_peptide_to_affinity, + curried_sims_dict): + allele_list = list(sorted(allele_to_peptide_to_affinity.keys())) + allele_order = { + allele_name: i + for (i, allele_name) in enumerate(allele_list) + } + n_alleles = len(allele_list) + sims_incomplete_matrix = np.zeros((n_alleles, n_alleles), dtype=float) + + for a, ai in allele_order.items(): + for b, bi in allele_order.items(): + # if allele pair is missing from similarities dict then + # fill its slot with NaN, indicating missing data + similarity = curried_sims_dict.get(a, {}).get(b, np.nan) + sims_incomplete_matrix[(ai, bi)] = similarity + return allele_list, allele_order, sims_incomplete_matrix + + +def save_heatmap(matrix, allele_names, filename): + seaborn.set_context("paper") + + with seaborn.plotting_context(font_scale=1): + figure = seaborn.plt.figure(figsize=(20, 18)) + ax = figure.add_axes() + heatmap = seaborn.heatmap( + data=matrix, + xticklabels=allele_names, + yticklabels=allele_names, + linewidths=0, + annot=False, + ax=ax, + fmt=".2g") + heatmap.set_xticklabels(labels=allele_names, rotation=45) + heatmap.set_yticklabels(labels=allele_names, rotation=45) + figure.savefig(filename) + + +def fill_in_similarities( + curried_raw_sims_dict, + allele_to_peptide_to_affinity, + raw_sims_heatmap_path=None, + complete_sims_heatmap_path=None, + curried_overlap_weights=None, + scalar_error_tolerance=0.0001): + """ + Given an incomplete dictionary of pairwise allele similarities and + a dictionary of binding data, generate the completed similarities + """ + + allele_list, allele_order, sims_matrix = build_incomplete_similarity_matrix( + allele_to_peptide_to_affinity, + curried_sims_dict=curried_raw_sims_dict) + + missing = np.isnan(sims_matrix) + + if curried_overlap_weights: + error_tolerance = np.ones_like(sims_matrix) + for allele_a, a_dict in curried_overlap_weights.items(): + for allele_b, weight in a_dict.items(): + i = allele_order[allele_a] + j = allele_order[allele_b] + error_tolerance[i, j] = 2.0 ** -weight + print( + "-- Error tolerance distribution: min %f, max %f, median %f" % ( + error_tolerance[~missing].min(), + error_tolerance[~missing].max(), + np.median(error_tolerance[~missing]))) + else: + error_tolerance = scalar_error_tolerance + + if raw_sims_heatmap_path: + save_heatmap( + sims_matrix, + allele_list, + raw_sims_heatmap_path) + + print("Completing %s similarities matrix with %d missing entries" % ( + sims_matrix.shape, + missing.sum())) + + solver = NuclearNormMinimization( + require_symmetric_solution=True, + min_value=0.0, + max_value=1.0, + error_tolerance=error_tolerance) + + sims_complete_matrix = solver.complete(sims_matrix) + + if complete_sims_heatmap_path: + save_heatmap( + 
sims_complete_matrix, + allele_list, + complete_sims_heatmap_path) + + complete_sims_dict = curry_dictionary( + matrix_to_dictionary( + sims=sims_complete_matrix, + allele_list=allele_list)) + return complete_sims_dict + + +def save_csv(filename, sims, overlap_counts, overlap_weights): + """ + Save pairwise allele similarities in CSV file. + + Assumes the dictionaries sims, overlap_counts, and overlap_weights are + 'curried', meaning that their keys are single allele names which map + to nested dictionaries from other allele names to numeric values. + As a type signature, this means the dictionaries are of the form + (allele -> allele -> number) + and not + (allele, allele) -> number + """ + with open(filename, "w") as f: + f.write("allele_A,allele_B,similarity,count,weight\n") + for (a, a_row) in sorted(sims.items()): + a_counts = overlap_counts.get(a, {}) + a_weights = overlap_weights.get(a, {}) + for (b, similarity) in sorted(a_row.items()): + count = a_counts.get(b, 0) + weight = a_weights.get(b, 0.0) + f.write("%s,%s,%0.4f,%d,%0.4f\n" % ( + a, + b, + similarity, + count, + weight)) + + +def compute_allele_similarities(allele_to_peptide_to_affinity, min_weight=0.1): + """ + Compute pairwise allele similarities from binding data, + complete similarity matrix for alleles which lack sufficient data, + returns: + - dictionary of allele similarities + - dictionary of # overlapping peptides between alleles + - dictionary of "weight" of overlapping peptides between alleles + """ + assert isinstance(allele_to_peptide_to_affinity, dict), \ + "Wrong type, expected dict but got %s" % ( + type(allele_to_peptide_to_affinity),) + raw_sims_dict, overlap_counts, overlap_weights = \ + compute_partial_similarities_from_peptide_overlap( + allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + min_weight=min_weight) + + complete_sims_dict = fill_in_similarities( + curried_raw_sims_dict=raw_sims_dict, + allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + curried_overlap_weights=overlap_weights) + + return complete_sims_dict, overlap_counts, overlap_weights diff --git a/experiments/best-synthetic-data-hyperparams.py b/experiments/best-synthetic-data-hyperparams.py new file mode 100755 index 0000000000000000000000000000000000000000..e1c2c7eac0cc150ef6146b1ccaf4cb4301812490 --- /dev/null +++ b/experiments/best-synthetic-data-hyperparams.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Find best smoothing coefficient for creating synthetic data. The smoothing +coefficient adds artifical weight toward low binding affinities, which dominate +in cases where no similar allele has values for some peptide. + +We're determining "best" as most predictive of already known binding affinities, +averaged across all given alleles. 
+"""
+
+import argparse
+from collections import defaultdict
+
+from mhcflurry.data import load_allele_dicts
+from scipy import stats
+import numpy as np
+import sklearn.metrics
+from sklearn.cross_validation import KFold
+
+from dataset_paths import PETERS2009_CSV_PATH
+from synthetic_data import (
+    synthesize_affinities_for_single_allele,
+    create_reverse_lookup_from_simple_dicts
+)
+from allele_similarities import compute_allele_similarities
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--binding-data-csv",
+    default=PETERS2009_CSV_PATH)
+
+parser.add_argument(
+    "--max-ic50",
+    default=50000.0,
+    type=float)
+
+parser.add_argument(
+    "--only-human",
+    default=False,
+    action="store_true")
+
+parser.add_argument(
+    "--smoothing-coefs",
+    default=[0.05, 0.025, 0.01, 0.005, 0.0025, 0.001, 0.0005, 0.0001, 0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
+    help="Smoothing value used for peptides with low weight across alleles")
+
+parser.add_argument(
+    "--similarity-exponents",
+    default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
+    help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
+
+
+def generate_cross_validation_datasets(
+        allele_to_peptide_to_affinity,
+        n_folds=4):
+    for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
+        peptides = list(dataset.keys())
+        affinities = list(dataset.values())
+        n_samples = len(peptides)
+        print("Generating similarities for folds of %s data (n=%d)" % (
+            allele,
+            n_samples))
+
+        if n_samples < n_folds:
+            print("Too few samples (%d) for %d-fold cross-validation" % (
+                n_samples,
+                n_folds))
+            continue
+
+        kfold_iterator = enumerate(
+            KFold(n_samples, n_folds=n_folds, shuffle=True))
+
+        for fold_number, (train_indices, test_indices) in kfold_iterator:
+            train_peptides = [peptides[i] for i in train_indices]
+            train_affinities = [affinities[i] for i in train_indices]
+            test_peptide_set = set([peptides[i] for i in test_indices])
+            # copy the affinity data for all alleles other than this one
+            fold_affinity_dict = {
+                allele_key: affinity_dict
+                for (allele_key, affinity_dict)
+                in allele_to_peptide_to_affinity.items()
+                if allele_key != allele
+            }
+            # include an affinity dictionary for this allele which
+            # only uses training data
+            fold_affinity_dict[allele] = {
+                train_peptide: train_affinity
+                for (train_peptide, train_affinity)
+                in zip(train_peptides, train_affinities)
+            }
+            allele_similarities, overlap_counts, overlap_weights = \
+                compute_allele_similarities(
+                    allele_to_peptide_to_affinity=fold_affinity_dict,
+                    min_weight=0.1)
+            this_allele_similarities = allele_similarities[allele]
+
+            yield (
+                allele,
+                dataset,
+                fold_number,
+                this_allele_similarities,
+                test_peptide_set
+            )
+
+
+def evaluate_synthetic_data(
+        allele_to_peptide_to_affinity,
+        smoothing_coef,
+        exponent,
+        max_ic50,
+        n_folds=4):
+    """
+    Use cross-validation over entries of each allele to determine the predictive
+    accuracy of data synthesis on known affinities.
+ """ + + peptide_to_affinities = create_reverse_lookup_from_simple_dicts( + allele_to_peptide_to_affinity) + + taus = defaultdict(list) + f1_scores = defaultdict(list) + aucs = defaultdict(list) + + for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \ + generate_cross_validation_datasets( + allele_to_peptide_to_affinity, + n_folds=n_folds): + # create a peptide -> list[(allele, affinity, weight)] dictionary + # restricted only to the peptides for which we want to test accuracy + test_reverse_lookup = { + peptide: triplets + for (peptide, triplets) in peptide_to_affinities.items() + if peptide in fold_test_peptides + } + + synthetic_values = synthesize_affinities_for_single_allele( + similarities=fold_sims, + peptide_to_affinities=test_reverse_lookup, + smoothing=smoothing_coef, + exponent=exponent, + exclude_alleles=[allele]) + + synthetic_peptide_set = set(synthetic_values.keys()) + # set of peptides for which we have both true and synthetic + # affinity values + combined_peptide_set = synthetic_peptide_set.intersection( + fold_test_peptides) + + combined_peptide_list = list(sorted(combined_peptide_set)) + + if len(combined_peptide_list) < 2: + print("Too few peptides in combined set %s for fold %d/%d of %s" % ( + combined_peptide_list, + fold_num + 1, + n_folds, + allele)) + continue + + synthetic_affinity_list = [ + synthetic_values[peptide] + for peptide in combined_peptide_list + ] + + true_affinity_list = [ + allele_to_peptide_to_affinity[allele][peptide] + for peptide in combined_peptide_list + ] + assert len(true_affinity_list) == len(synthetic_affinity_list) + if all(x == true_affinity_list[0] for x in true_affinity_list): + continue + + tau, _ = stats.kendalltau( + synthetic_affinity_list, + true_affinity_list) + assert not np.isnan(tau) + print("==> %s (CV fold %d/%d) tau = %f" % ( + allele, + fold_num + 1, + n_folds, + tau)) + taus[allele].append(tau) + + true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list)) + predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list)) + + true_binding_label = true_ic50s <= 500 + if true_binding_label.all() or not true_binding_label.any(): + # can't compute AUC or F1 without both negative and positive cases + continue + auc = sklearn.metrics.roc_auc_score( + true_binding_label, + synthetic_affinity_list) + print("==> %s (CV fold %d/%d) AUC = %f" % ( + allele, + fold_num + 1, + n_folds, + auc)) + aucs[allele].append(auc) + + predicted_binding_label = predicted_ic50s <= 500 + if predicted_binding_label.all() or not predicted_binding_label.any(): + # can't compute F1 without both positive and negative predictions + continue + f1_score = sklearn.metrics.f1_score( + true_binding_label, + predicted_binding_label) + print("==> %s (CV fold %d/%d) F1 = %f" % ( + allele, + fold_num + 1, + n_folds, + f1_score)) + f1_scores[allele].append(f1_score) + + return taus, aucs, f1_scores + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + allele_to_peptide_to_affinity = load_allele_dicts( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human, + regression_output=True) + + n_binding_values = sum( + len(allele_dict) + for allele_dict in + allele_to_peptide_to_affinity.values() + ) + print("Loaded %d binding values for %d alleles" % ( + n_binding_values, len(allele_to_peptide_to_affinity))) + + results = {} + for smoothing_coef in args.smoothing_coefs: + for exponent in args.similarity_exponents: + taus, aucs, f1_scores = evaluate_synthetic_data( + 
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + smoothing_coef=smoothing_coef, + exponent=exponent, + max_ic50=args.max_ic50) + allele_keys = list(sorted(taus.keys())) + tau_allele_means = [ + np.mean(taus[allele]) + for allele in allele_keys + if taus[allele] + ] + auc_allele_means = [ + np.mean(aucs[allele]) + for allele in allele_keys + if aucs[allele] + ] + + f1_score_means = [ + np.mean(f1_scores[allele]) + for allele in allele_keys + if f1_scores[allele] + ] + + median_tau = np.median(tau_allele_means) + median_f1 = np.median(f1_score_means) + median_auc = np.median(auc_allele_means) + print("\n\n::::::\n") + print( + "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % ( + exponent, + smoothing_coef, + median_tau, + median_auc, + median_f1)) + print("\n^^^^^^\n") + + scores = (median_tau, median_auc, median_f1) + results[(exponent, smoothing_coef)] = scores + + print("===") + + def combine_item_scores(x): + return x[1][0] * x[1][1] * x[1][2] + for ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) in \ + sorted(results.items(), key=combine_item_scores): + print("-- exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % ( + best_exponent, + best_coef, + median_tau, + median_auc, + median_f1)) + + ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max( + results.items(), + key=combine_item_scores) + print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % ( + best_exponent, + best_coef, + median_tau, + median_auc, + median_f1)) diff --git a/experiments/common.py b/experiments/common.py new file mode 100644 index 0000000000000000000000000000000000000000..95800f82d9b7e9c9bee9fa56c805f960d08725a4 --- /dev/null +++ b/experiments/common.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from collections import defaultdict +import pandas as pd + + +def curry_dictionary(key_pair_dict, default_value=0.0): + """ + Transform dictionary from pairs of keys to dict -> dict -> float + """ + result = defaultdict(dict) + for (a, b), value in key_pair_dict.items(): + result[a][b] = value + return result + + +def uncurry_dictionary(curried_dict): + """ + Transform dictionary from (key_a -> key_b -> float) to + (key_a, key_b) -> float + """ + result = {} + for a, a_dict in curried_dict.items(): + for b, value in a_dict.items(): + result[(a, b)] = value + return result + + +def matrix_to_dictionary(sims, allele_list): + sims_dict = {} + for i in range(sims.shape[0]): + a = allele_list[i] + for j in range(sims.shape[1]): + b = allele_list[j] + sims_dict[a, b] = sims[i, j] + return sims_dict + + +def load_csv_binding_data_as_dict( + csv_path, + mhc_column_name="mhc", + peptide_column_name="sequence", + ic50_column_name="ic50"): + """ + Given a path to a CSV file containing peptide-MHC binding data, + load it as a dictionary mapping alleles to a dictionary peptide->IC50 + """ + df = pd.read_csv(csv_path) + print("-- Read %d rows from %s" % (len(df), csv_path)) + return { + allele: { + peptide: ic50 + for (peptide, ic50) + in zip(group[peptide_column_name], group[ic50_column_name]) + } + for allele, group in df.groupby(mhc_column_name) + } diff --git a/experiments/compute-allele-similarities.py b/experiments/compute-allele-similarities.py old mode 100644 new mode 100755 index 290e4a7ab50a9b2fb07d99b39f30c8e923d2bc5b..833bfd6a67513927cebb2d9a74081b24933d1750 --- a/experiments/compute-allele-similarities.py +++ b/experiments/compute-allele-similarities.py @@ -1,10 +1,29 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
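# Illustrative sketch (editorial example, assumes the common.py module above is
# importable): the two dictionary layouts these scripts convert between --
# pair-keyed ((allele_a, allele_b) -> value) and "curried"
# (allele_a -> allele_b -> value).
from common import curry_dictionary, uncurry_dictionary

pair_keyed = {
    ("HLA-A0201", "HLA-A0203"): 0.9,
    ("HLA-A0201", "HLA-B0702"): 0.2,
}
curried = curry_dictionary(pair_keyed)
assert curried["HLA-A0201"]["HLA-A0203"] == 0.9
assert uncurry_dictionary(curried) == pair_keyed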
+ import argparse -import fancyimpute -import numpy as np import mhcflurry from dataset_paths import PETERS2009_CSV_PATH +from allele_similarities import ( + compute_partial_similarities_from_peptide_overlap, + fill_in_similarities, + save_csv +) parser = argparse.ArgumentParser() @@ -13,204 +32,96 @@ parser.add_argument( default=PETERS2009_CSV_PATH) parser.add_argument( - "--fill-missing-values", - action="store_true", - default=False) + "--min-overlap-weight", + default=0.05, + help="Minimum overlap weight between pair of alleles", + type=float) +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) parser.add_argument( - "--min-overlap-weight", - default=2.0, - help="Minimum overlap weight between pair of alleles") + "--only-human", + default=False, + action="store_true") parser.add_argument( - "--max-ic50", - default=50000.0) + "--raw-similarities-output-path", + help="CSV file which contains incomplete allele similarities") parser.add_argument( - "--output-csv", - required=True) - -def binary_encode(X, n_indices=20): - n_cols = X.shape[1] - X_encode = np.zeros( (len(X), n_indices * n_cols), dtype=float) - for i in range(len(X)): - for col_idx in range(n_cols): - X_encode[i, col_idx*n_indices + X[i, col_idx]] = True - return X_encode - -def prepare_incomplete_matrix(datasets_dict, prefix=None, allele_name_length=5): - if allele_name_length: - datasets_dict = {k:d for (k,d) in datasets_dict.items() if len(k) == allele_name_length} - if prefix: - datasets_dict = {k:d for (k,d) in datasets_dict.items() if k.startswith(prefix)} - if len(datasets_dict) == 0: - raise ValueError("No alleles matched criteria") - all_peptides = set([]) - allele_list = [] - for k,d in sorted(datasets_dict.items()): - print(k, len(d.peptides)) - allele_list.append(k) - for p in d.peptides: - all_peptides.add(p) - peptide_list = list(sorted(all_peptides)) - allele_order = {a:i for (i,a) in enumerate(allele_list)} - peptide_order = {p:i for (i,p) in enumerate(peptide_list)} - n_alleles = len(allele_order) - n_peptides = len(peptide_order) - incomplete_affinity_matrix = np.ones((n_alleles, n_peptides), dtype=float) * np.nan - for k,d in datasets_dict.items(): - if k not in allele_order: - continue - i = allele_order[k] - for p,y in zip(d.peptides, d.Y): - j = peptide_order[p] - incomplete_affinity_matrix[i,j] = y - print("Total # alleles = %d, peptides = %d" % (n_alleles, n_peptides)) - return incomplete_affinity_matrix, allele_order, peptide_order - -def get_extra_data(allele, train_datasets, expanded_predictions): - original_dataset = train_datasets[allele] - original_peptides = set(original_dataset.peptides) - expanded_allele_affinities = expanded_predictions[allele] - extra_affinities = {k: v for (k, v) in expanded_allele_affinities.items() if k not in original_peptides} - extra_peptides = list(extra_affinities.keys()) - extra_values = list(extra_affinities.values()) - extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9) - extra_Y = np.array(extra_values) - return extra_X, extra_Y - -def WHO_KNOWS_WHAT(): - - - smoothing = 0.005 - exponent = 2 - all_predictions = defaultdict(dict) - for allele, allele_idx in allele_order.items(): - for peptide in peptide_order.keys(): - predictions = reverse_lookups[peptide] - total_value = 0.0 - total_denom = 0.0 - for (other_allele, y) in predictions: - key = (allele,other_allele) - if key not in completed_sims_dict: - continue - sim = completed_sims_dict[key] - weight = sim ** exponent - total_value += weight * y - total_denom += 
weight - if total_denom > 0.0: - all_predictions[allele][peptide] = total_value / (smoothing + total_denom) - - -def data_augmentation(X, Y, extra_X, extra_Y, - fraction=0.5, - niters=10, - extra_sample_weight=0.1, - nb_epoch=50, - nn=True, - hidden_layer_size=5): - n = len(Y) - aucs = [] - f1s = [] - n_originals = [] - for _ in range(niters): - mask = np.random.rand(n) <= fraction - X_train = X[mask] - X_test = X[~mask] - Y_train = Y[mask] - Y_test = Y[~mask] - test_ic50 = 20000 ** (1-Y_test) - test_label = test_ic50 <= 500 - if test_label.all() or not test_label.any(): - continue - n_original = mask.sum() - print("Keeping %d original training samples" % n_original) - X_train_combined = np.vstack([X_train, extra_X]) - Y_train_combined = np.concatenate([Y_train, extra_Y]) - print("Combined shape: %s" % (X_train_combined.shape,)) - assert len(X_train_combined) == len(Y_train_combined) - # initialize weights to count synthesized and actual data equally - # but weight on synthesized points will decay across training epochs - weight = np.ones(len(Y_train_combined)) - if nn: - model = mhcflurry.feedforward.make_embedding_network( - layer_sizes=[hidden_layer_size], - embedding_output_dim=10, - activation="tanh") - for i in range(nb_epoch): - # decay weight as training progresses - weight[n_original:] = extra_sample_weight if extra_sample_weight is not None else 1.0 / (i+1)**2 - model.fit( - X_train_combined, - Y_train_combined, - sample_weight=weight, - shuffle=True, - nb_epoch=1, - verbose=0) - pred = model.predict(X_test) - else: - model = sklearn.linear_model.Ridge(alpha=5) - X_train_combined_binary = binary_encode(X_train_combined) - X_test_binary = binary_encode(X_test) - model.fit(X_train_combined_binary, Y_train_combined, sample_weight=weight) - pred = model.predict(X_test_binary) - pred_ic50 = 20000 ** (1-pred) - pred_label = pred_ic50 <= 500 - mse = sklearn.metrics.mean_squared_error(Y_test, pred) - auc = sklearn.metrics.roc_auc_score(test_label, pred) - - if pred_label.all() or not pred_label.any(): - f1 = 0 - else: - f1 = sklearn.metrics.f1_score(test_label, pred_label) - print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1)) - n_originals.append(n_original) - aucs.append(auc) - f1s.append(f1) - return aucs, f1s, n_originals + "--complete-similarities-output-path", + help="CSV file which contains allele similarities after matrix completion") + +parser.add_argument( + "--raw-heatmap-output-path", + help="PNG file to save heatmap of incomplete similarities matrix") + +parser.add_argument( + "--complete-heatmap-output-path", + help="PNG file to save heatmap of complete similarities matrix") + + +def print_dataset_sizes(allele_groups): + print("---\nDataset Sizes\n---") + for (allele_name, g) in sorted(allele_groups.items()): + print("%s: total=%d, 8mer=%d, 9mer=%d, 10mer=%d, 11mer=%d" % ( + allele_name, + len(g), + sum(len(k) == 8 for k in g.keys()), + sum(len(k) == 9 for k in g.keys()), + sum(len(k) == 10 for k in g.keys()), + sum(len(k) == 11 for k in g.keys()), + )) + print("---") if __name__ == "__main__": args = parser.parse_args() - - datasets = mhcflurry.data_helpers.load_data( + print(args) + df, peptide_column_name = mhcflurry.data.load_dataframe( args.binding_data_csv, - binary_encoding=True, - max_ic50=args.max_ic50) - - sims = {} - overlaps = {} - weights = {} - for allele_name_a, da in train_datasets.items(): - if len(allele_name_a) != 5: - continue - y_dict_a = {peptide: y for (peptide,y) in zip(da.peptides, da.Y)} - seta = set(da.peptides) - for allele_name_b, db in 
train_datasets.items(): - if len(allele_name_b) != 5: - continue - y_dict_b = {peptide: y for (peptide,y) in zip(db.peptides, db.Y)} - setb = set(db.peptides) - intersection = seta.intersection(setb) - overlaps[a,b] = len(intersection) - total = 0.0 - weight = 0.0 - for peptide in intersection: - ya = y_dict_a[peptide] - yb = y_dict_b[peptide] - minval = min(ya, yb) - maxval = max(ya, yb) - total += minval - weight += maxval - weights[a,b] = weight - if weight > min_weight: - sims[allele_name_a, allele_name_b] = total / weight - else: - sims[allele_name_a, allele_name_b] = np.nan - - if args.fill_missing_values: - sims_array = np.zeros((n_alleles, n_alleles), dtype=float) - for i, a in enumerate(allele_names): - for j, b in enumerate(allele_names): - sims_array[i,j] = sims[(a,b)] + max_ic50=args.max_ic50, + only_human=args.only_human) + allele_groups = { + allele_name: { + row[peptide_column_name]: row["regression_output"] + for (_, row) in group.iterrows() + } + for (allele_name, group) in df.groupby("mhc") + } + print_dataset_sizes(allele_groups) + + raw_sims_dict, overlap_counts, overlap_weights = \ + compute_partial_similarities_from_peptide_overlap( + allele_groups, + min_weight=args.min_overlap_weight) + + if args.raw_similarities_output_path: + save_csv( + args.raw_similarities_output_path, + raw_sims_dict, + overlap_counts, + overlap_weights) + + complete_sims_dict = fill_in_similarities( + curried_raw_sims_dict=raw_sims_dict, + allele_to_peptide_to_affinity=allele_groups, + raw_sims_heatmap_path=args.raw_heatmap_output_path, + complete_sims_heatmap_path=args.complete_heatmap_output_path) + + n_total_entries = sum(len(row) for row in complete_sims_dict.values()) + n_original_entries = sum(len(row) for row in raw_sims_dict.values()) + print("-- Added %d/%d allele similarities" % ( + n_total_entries - n_original_entries, + n_total_entries + )) + + if args.complete_similarities_output_path: + save_csv( + args.complete_similarities_output_path, + complete_sims_dict, + overlap_counts, + overlap_weights) diff --git a/experiments/cross_validation_affinity_data.py b/experiments/cross_validation_affinity_data.py new file mode 100644 index 0000000000000000000000000000000000000000..d01fc6d93a30fdeb2fc974394fee3bfd31d58c60 --- /dev/null +++ b/experiments/cross_validation_affinity_data.py @@ -0,0 +1,68 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
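# A minimal sketch of the similarity pipeline used by the script above: compute
# partial similarities from overlapping peptides, then complete the missing
# pairs. Toy data and toy peptide names; assumes allele_similarities.py and its
# fancyimpute dependency are importable (NuclearNormMinimization may be slow).
from allele_similarities import (
    compute_partial_similarities_from_peptide_overlap,
    fill_in_similarities,
)

toy_affinities = {
    "HLA-A0201": {"SIINFEKLM": 0.9, "AAAWYLWEV": 0.7, "QLLPQWLTV": 0.2},
    "HLA-A0203": {"SIINFEKLM": 0.8, "AAAWYLWEV": 0.6},
    "HLA-B0702": {"QLLPQWLTV": 0.3},
}
raw_sims, counts, weights = compute_partial_similarities_from_peptide_overlap(
    toy_affinities, min_weight=0.1)
complete_sims = fill_in_similarities(
    curried_raw_sims_dict=raw_sims,
    allele_to_peptide_to_affinity=toy_affinities)
print(complete_sims["HLA-A0203"]["HLA-B0702"])  # imputed; no shared peptides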
+ +from sklearn.cross_validation import KFold + + +def generate_cross_validation_datasets( + allele_to_peptide_to_affinity, + n_folds=4): + for allele, dataset in sorted(allele_to_peptide_to_affinity.items()): + peptides = list(dataset.keys()) + affinities = list(dataset.values()) + n_samples = len(peptides) + print("Generating similarities for folds of %s data (n=%d)" % ( + allele, + n_samples)) + + if n_samples < n_folds: + print("Too few samples (%d) for %d-fold cross-validation" % ( + n_samples, + n_folds)) + continue + + kfold_iterator = enumerate( + KFold(n_samples, n_folds=n_folds, shuffle=True)) + + for fold_number, (train_indices, test_indices) in kfold_iterator: + train_peptides = [peptides[i] for i in train_indices] + train_affinities = [affinities[i] for i in train_indices] + test_peptide_set = set([peptides[i] for i in test_indices]) + # copy the affinity data for all alleles other than this one + fold_affinity_dict = { + allele_key: affinity_dict + for (allele_key, affinity_dict) + in allele_to_peptide_to_affinity.items() + if allele_key != allele + } + # include an affinity dictionary for this allele which + # only uses training data + fold_affinity_dict[allele] = { + train_peptide: train_affinity + for (train_peptide, train_affinity) + in zip(train_peptides, train_affinities) + } + allele_similarities, overlap_counts, overlap_weights = \ + compute_allele_similarities( + allele_to_peptide_to_affinity=fold_affinity_dict, + min_weight=0.1) + this_allele_similarities = allele_similarities[allele] + + yield ( + allele, + dataset, + fold_number, + this_allele_similarities, + test_peptide_set + ) \ No newline at end of file diff --git a/experiments/matrix-completion-accuracy.py b/experiments/matrix-completion-accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..0fef8576f2cc836411972aff280d25c829730430 --- /dev/null +++ b/experiments/matrix-completion-accuracy.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
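# A small sketch of the per-fold bookkeeping in generate_cross_validation_datasets
# above: every other allele keeps its full data, while the held-out allele is
# restricted to the training peptides of the current fold (toy data, a
# hypothetical fold split, no KFold call here).
toy_affinities = {
    "HLA-A0201": {"SIINFEKLM": 0.9, "AAAWYLWEV": 0.7, "QLLPQWLTV": 0.2},
    "HLA-B0702": {"QLLPQWLTV": 0.3},
}
held_out_allele = "HLA-A0201"
train_peptides = {"SIINFEKLM", "AAAWYLWEV"}  # hypothetical training fold
fold_affinity_dict = {
    allele: affinities
    for (allele, affinities) in toy_affinities.items()
    if allele != held_out_allele
}
fold_affinity_dict[held_out_allele] = {
    peptide: affinity
    for (peptide, affinity) in toy_affinities[held_out_allele].items()
    if peptide in train_peptides
}
assert "QLLPQWLTV" not in fold_affinity_dict[held_out_allele]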
+
+import argparse
+
+from fancyimpute import (
+    SoftImpute,
+    IterativeSVD,
+    SimilarityWeightedAveraging,
+    KNN,
+    MICE,
+    BiScaler,
+    SimpleFill,
+)
+import numpy as np
+import pandas as pd
+
+from dataset_paths import PETERS2009_CSV_PATH
+from score_set import ScoreSet
+from matrix_completion_helpers import (
+    load_data,
+    evaluate_predictions,
+    stratified_cross_validation
+)
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--binding-data-csv",
+    default=PETERS2009_CSV_PATH)
+
+parser.add_argument(
+    "--max-ic50",
+    default=50000.0,
+    type=float)
+
+parser.add_argument(
+    "--save-incomplete-affinity-matrix",
+    default=None,
+    help="Path to CSV which will contain the incomplete affinity matrix")
+
+parser.add_argument(
+    "--only-human",
+    default=False,
+    action="store_true")
+
+parser.add_argument(
+    "--output-file",
+    default="matrix-completion-accuracy-results.csv")
+
+parser.add_argument(
+    "--normalize-columns",
+    default=False,
+    action="store_true",
+    help="Center and rescale columns of affinity matrix")
+
+
+parser.add_argument(
+    "--normalize-rows",
+    default=False,
+    action="store_true",
+    help="Center and rescale rows of affinity matrix")
+
+parser.add_argument(
+    "--min-observations-per-peptide",
+    type=int,
+    default=2,
+    help="Drop peptide entries with fewer than this number of measured affinities")
+
+parser.add_argument(
+    "--n-folds",
+    default=10,
+    type=int,
+    help="Number of cross-validation folds")
+
+parser.add_argument(
+    "--verbose",
+    default=False,
+    action="store_true")
+
+
+def create_imputation_methods(
+        verbose=False,
+        clip_imputed_values=False,
+        knn_print_interval=20,
+        knn_params=[1, 3, 5],
+        softimpute_params=[1, 5, 10],
+        svd_params=[5, 10, 20]):
+    min_value = 0 if clip_imputed_values else None
+    max_value = 1 if clip_imputed_values else None
+    result_dict = {
+        "meanFill": SimpleFill("mean"),
+        "zeroFill": SimpleFill("zero"),
+        "mice": MICE(
+            n_burn_in=5,
+            n_imputations=25,
+            min_value=min_value,
+            max_value=max_value,
+            verbose=verbose),
+        "similarityWeightedAveraging": SimilarityWeightedAveraging(
+            orientation="columns",
+            verbose=verbose),
+    }
+    for threshold in softimpute_params:
+        result_dict["softImpute-%d" % threshold] = SoftImpute(
+            threshold,
+            verbose=verbose,
+            min_value=min_value,
+            max_value=max_value)
+    for rank in svd_params:
+        result_dict["svdImpute-%d" % rank] = IterativeSVD(
+            rank,
+            verbose=verbose,
+            min_value=min_value,
+            max_value=max_value)
+    for k in knn_params:
+        result_dict["knnImpute-%d" % k] = KNN(
+            k,
+            orientation="columns",
+            verbose=verbose,
+            print_interval=knn_print_interval)
+    return result_dict
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print(args)
+    imputation_methods = create_imputation_methods(
+        verbose=args.verbose,
+        clip_imputed_values=not (args.normalize_rows or args.normalize_columns),
+    )
+    print("Imputation methods: %s" % imputation_methods)
+
+    X, peptide_list, allele_list = load_data(
+        binding_data_csv=args.binding_data_csv,
+        max_ic50=args.max_ic50,
+        only_human=args.only_human,
+        min_observations_per_allele=args.n_folds,
+        min_observations_per_peptide=args.min_observations_per_peptide)
+    observed_mask = np.isfinite(X)
+    print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % (
+        X.shape,
+        observed_mask.sum(),
+        X.size,
+        100.0 * observed_mask.sum() / X.size))
+    if args.save_incomplete_affinity_matrix:
+        print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
+        df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
+        df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
+
+    scores = ScoreSet()
+    kfold = stratified_cross_validation(
+        X=X,
+        observed_mask=observed_mask,
+        n_folds=args.n_folds)
+    for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold):
+        X_fold_reduced = X_fold[ok_mesh]
+        biscaler = BiScaler(
+            scale_rows=args.normalize_rows,
+            center_rows=args.normalize_rows,
+            scale_columns=args.normalize_columns,
+            center_columns=args.normalize_columns)
+        X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
+        for (method_name, solver) in sorted(imputation_methods.items()):
+            print("CV fold %d/%d, running %s" % (
+                fold_idx + 1,
+                args.n_folds,
+                method_name))
+            X_completed_reduced_scaled = solver.complete(X_fold_reduced_scaled)
+            X_completed_reduced = biscaler.inverse_transform(
+                X_completed_reduced_scaled)
+            X_completed = np.zeros_like(X)
+            X_completed[ok_mesh] = X_completed_reduced
+            y_pred = X_completed[test_coords]
+            mae, tau, auc, f1_score = evaluate_predictions(
+                y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50)
+            scores.add_many(
+                method_name,
+                mae=mae,
+                tau=tau,
+                f1_score=f1_score,
+                auc=auc)
+    scores.to_csv(args.output_file)
diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py
new file mode 100755
index 0000000000000000000000000000000000000000..3d224bb8e7808cced4d5dd7324904a6633368f59
--- /dev/null
+++ b/experiments/matrix-completion-hyperparameter-search.py
@@ -0,0 +1,507 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
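# Worked sketch of the affinity transform used throughout these experiments:
# regression targets live in [0, 1] and map back to IC50 (nM) via
# ic50 = max_ic50 ** (1 - y), so y = 1 - log(ic50) / log(max_ic50).
# With max_ic50 = 50000, the 500 nM binder threshold corresponds to y ~= 0.426.
import numpy as np

max_ic50 = 50000.0
y = np.array([1.0, 0.4256, 0.0])
ic50 = max_ic50 ** (1.0 - y)  # -> approximately [1, 500, 50000] nM
assert np.allclose(1.0 - np.log(ic50) / np.log(max_ic50), y)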
+ +""" +For each allele, perform 5-fold cross validation to + (1) complete the matrix of affinities to generate auxilliary training data + (2) train multiple hyperparameter to find the best hyperparameters for + each network +""" + +import argparse +from collections import defaultdict + +from fancyimpute import MICE, KNN, SimpleFill, IterativeSVD, SoftImpute +import numpy as np +import pandas as pd +from mhcflurry.peptide_encoding import fixed_length_index_encoding +from mhcflurry.amino_acid import ( + common_amino_acids, + amino_acids_with_unknown, +) +from mhcflurry import Class1BindingPredictor +from sklearn.cross_validation import StratifiedKFold + +from dataset_paths import PETERS2009_CSV_PATH + +from matrix_completion_helpers import load_data, evaluate_predictions + +from arg_parsing import parse_int_list, parse_float_list, parse_string_list + +from matrix_completion_helpers import prune_data + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--save-incomplete-affinity-matrix", + default=None, + help="Path to CSV which will contains the incomplete affinity matrix") + +parser.add_argument( + "--only-human", + default=False, + action="store_true") + +parser.add_argument( + "--output-file", + default="hyperparameter-matrix-completion-results.csv") + +parser.add_argument( + "--n-folds", + default=5, + type=int, + help="Number of cross-validation folds") + +parser.add_argument( + "--max-training-peptide-length", + type=int, + default=15) + +parser.add_argument( + "--verbose", + default=False, + action="store_true") + +parser.add_argument( + "--embedding-dim-sizes", + default=[10, 20, 40], + type=parse_int_list) + +parser.add_argument( + "--first-hidden-layer-sizes", + default=[25, 50, 100], + type=parse_int_list) + + +parser.add_argument( + "--second-hidden-layer-sizes", + default=[0], + type=parse_int_list) + + +parser.add_argument( + "--dropouts", + default=[0.0], + type=parse_float_list) + +parser.add_argument( + "--activation-functions", + default=["tanh"], + type=lambda s: [si.strip() for si in s.split(",")]) + +parser.add_argument( + "--training-epochs", + type=int, + default=100) + +parser.add_argument( + "--impute", + default="mice", + help="Use {'mice', 'knn', 'meanfill', 'none', 'svt', 'svd'} for imputing pre-training data") + +parser.add_argument("--batch-size", type=int, default=64) + +parser.add_argument( + "--unknown-amino-acids", + default=False, + action="store_true", + help="When expanding 8mers into 9mers use 'X' instead of all possible AAs") + +parser.add_argument( + "--pretrain-only-9mers", + default=False, + action="store_true", + help="Exclude expanded samples from e.g. 
8mers or 10mers") + +parser.add_argument( + "--alleles", + default=None, + type=parse_string_list, + help="Only evaluate these alleles (by default use all in the dataset)") + +parser.add_argument("--min-samples-per-cv-fold", type=int, default=5) + + +def print_length_distribution(peptides, values, name): + print("Length distribution for %s (n=%d):" % (name, len(peptides))) + grouped_affinity_dict = defaultdict(list) + for p, v in zip(peptides, values): + grouped_affinity_dict[len(p)].append(v) + for length, affinities in sorted(grouped_affinity_dict.items()): + print("%d => %d (mean affinity %0.4f)" % ( + length, + len(affinities), + np.mean(affinities))) + + +def print_full_dataset_length_distribution( + pMHC_affinity_matrix, + observed_mask, + row_peptide_sequences): + row_idx, col_idx = np.where(observed_mask) + all_observed_peptides = [] + all_observed_values = [] + + for i, j in zip(row_idx, col_idx): + all_observed_peptides.append(row_peptide_sequences[i]) + all_observed_values.append(pMHC_affinity_matrix[i, j]) + + print_length_distribution(all_observed_peptides, all_observed_values, "ALL") + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + # initially load all the data, we're going to later prune it for matrix + # completion + pMHC_affinity_matrix, peptide_list, allele_list = load_data( + binding_data_csv=args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human, + min_observations_per_peptide=1, + min_observations_per_allele=1) + observed_mask = np.isfinite(pMHC_affinity_matrix) + print_full_dataset_length_distribution( + pMHC_affinity_matrix=pMHC_affinity_matrix, + observed_mask=observed_mask, + row_peptide_sequences=peptide_list) + + n_observed_per_allele = observed_mask.sum(axis=0) + + print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % ( + pMHC_affinity_matrix.shape, + observed_mask.sum(), + pMHC_affinity_matrix.size, + 100.0 * observed_mask.sum() / pMHC_affinity_matrix.size)) + + if args.verbose: + print("\n----\n# observed per allele:") + for allele_idx, allele in sorted(enumerate(allele_list), key=lambda x: x[1]): + print("--- %s: %d" % (allele, n_observed_per_allele[allele_idx])) + + if args.save_incomplete_affinity_matrix: + print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix) + df = pd.DataFrame(pMHC_affinity_matrix, columns=allele_list, index=peptide_list) + df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") + + if args.output_file: + output_file = open(args.output_file, "w") + fields = [ + "allele", + "cv_fold", + "peptide_count", + "sample_count", + "dropout_probability", + "embedding_dim_size", + "hidden_layer_size1", + "hidden_layer_size2", + "activation", + "mae", + "tau", + "auc", + "f1" + ] + header_line = ",".join(fields) + output_file.write(header_line + "\n") + else: + output_file = None + if args.unknown_amino_acids: + index_encoding = amino_acids_with_unknown.index_encoding + else: + index_encoding = common_amino_acids.index_encoding + + impute_method_name = args.impute.lower() + if impute_method_name.startswith("mice"): + imputer = MICE(n_burn_in=5, n_imputations=20, min_value=0, max_value=1) + elif impute_method_name.startswith("knn"): + imputer = KNN(k=1, orientation="columns", print_interval=10) + elif impute_method_name.startswith("svd"): + imputer = IterativeSVD(rank=10) + elif impute_method_name.startswith("svt"): + imputer = SoftImpute() + elif impute_method_name.startswith("mean"): + imputer = SimpleFill("mean") + elif impute_method_name == 
"none": + imputer = None + else: + raise ValueError("Invalid imputation method: %s" % impute_method_name) + + predictors = {} + initial_weights = {} + initial_optimizer_states = {} + + def generate_predictors(): + """ + Generator of all possible predictors generated by combinations of + hyperparameters. + + To avoid recompiling and initializing a lot of neural networks + just save all the models up front along with their initial weight matrices + """ + for dropout in args.dropouts: + for embedding_dim_size in args.embedding_dim_sizes: + for hidden_layer_size1 in args.first_hidden_layer_sizes: + for hidden_layer_size2 in args.second_hidden_layer_sizes: + for activation in args.activation_functions: + key = "%0.2f,%d,%d,%d,%s" % ( + dropout, + embedding_dim_size, + hidden_layer_size1, + hidden_layer_size2, + activation + ) + if key not in predictors: + layer_sizes = [hidden_layer_size1] + if hidden_layer_size2: + layer_sizes.append(hidden_layer_size2) + + predictor = Class1BindingPredictor.from_hyperparameters( + embedding_output_dim=embedding_dim_size, + layer_sizes=layer_sizes, + activation=activation, + output_activation="sigmoid", + dropout_probability=dropout, + verbose=args.verbose, + allow_unknown_amino_acids=args.unknown_amino_acids, + embedding_input_dim=21 if args.unknown_amino_acids else 20, + ) + weights = predictor.model.get_weights() + opt_state = predictor.model.optimizer.get_state() + predictors[key] = predictor + initial_weights[key] = weights + initial_optimizer_states[key] = opt_state + else: + predictor = predictors[key] + weights = initial_weights[key] + opt_state = initial_optimizer_states[key] + # reset the predictor to its initial condition + predictor.model.set_weights([ + w.copy() for w in weights]) + predictor.model.optimizer.set_state([ + s.copy() for s in opt_state]) + yield (key, predictor) + + min_samples_per_allele = args.min_samples_per_cv_fold * args.n_folds + for allele_idx, allele in enumerate(allele_list): + if args.alleles and not any( + pattern in allele for pattern in args.alleles): + # if user specifies an allele list then skip anything which isn't included + continue + if n_observed_per_allele[allele_idx] < min_samples_per_allele: + print("-- Skipping allele %s which only has %d samples (need %d)" % ( + allele, + n_observed_per_allele[allele_idx], + min_samples_per_allele)) + continue + column = pMHC_affinity_matrix[:, allele_idx] + observed_row_indices = np.where(observed_mask[:, allele_idx])[0] + + # drop indices which are for peptides that are extremely long, + # like a 26-mer + observed_row_indices = np.array([ + i + for i in observed_row_indices + if len(peptide_list[i]) <= args.max_training_peptide_length + ]) + observed_values = column[observed_row_indices] + std = np.std(observed_values) + if std < 0.001: + print("-- Skipping allele %s due to insufficient variance in affinities" % ( + allele)) + continue + + print( + "Evaluating allele %s (n=%d)" % ( + allele, + len(observed_row_indices))) + ic50_values = args.max_ic50 ** (1 - observed_values) + + below_500nM = ic50_values <= 500 + if below_500nM.all(): + print("-- Skipping %s due to all negative predictions" % allele) + continue + if below_500nM.sum() < args.n_folds: + print("-- Skipping %s due to insufficient positive examples (%d)" % ( + allele, + below_500nM.sum())) + continue + + observed_peptides = [peptide_list[i] for i in observed_row_indices] + print_length_distribution(observed_peptides, observed_values, name=allele) + + # k-fold cross validation stratified to keep an even balance 
of low + # vs. high-binding peptides in each fold + for fold_idx, (train_indices, test_indices) in enumerate(StratifiedKFold( + y=below_500nM, + n_folds=args.n_folds, + shuffle=True)): + + train_peptides_fold = [observed_peptides[i] for i in train_indices] + train_values_fold = [observed_values[i] for i in train_indices] + train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)} + + test_peptides = [observed_peptides[i] for i in test_indices] + test_values = [observed_values[i] for i in test_indices] + test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} + if imputer is None: + X_pretrain = np.array([], dtype=int).reshape((0, 9)) + Y_pretrain = np.array([], dtype=float) + pretrain_sample_weights = np.array([], dtype=float) + else: + # drop the test peptides from the full matrix and then + # run completion on it to get synthesized affinities + pMHC_affinities_fold_incomplete = pMHC_affinity_matrix.copy() + test_indices_among_all_rows = observed_row_indices[test_indices] + pMHC_affinities_fold_incomplete[ + test_indices_among_all_rows, allele_idx] = np.nan + + # drop peptides with fewer than 2 measurements and alleles + # with fewer than 10 peptides + pMHC_affinities_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \ + prune_data( + X=pMHC_affinities_fold_incomplete, + peptide_list=peptide_list, + allele_list=allele_list, + min_observations_per_peptide=2, + min_observations_per_allele=args.min_samples_per_cv_fold) + + pMHC_affinities_fold_pruned_imputed = imputer.complete( + pMHC_affinities_fold_pruned) + + if args.pretrain_only_9mers: + # if we're expanding 8mers to >100 9mers + # then the pretraining dataset becomes + # unmanageably large, so let's only use 9mers for pre-training + + # In the future: we'll use an neural network that + # takes multiple input lengths + ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold]) + ninemer_indices = np.where(ninemer_mask)[0] + pMHC_affinities_fold_pruned_imputed = \ + pMHC_affinities_fold_pruned_imputed[ninemer_indices] + pruned_peptides_fold = [pruned_peptides_fold[i] for i in ninemer_indices] + + # since we may have dropped some of the columns in the completed + # pMHC matrix need to find which column corresponds to the + # same name we're currently predicting + if allele in pruned_alleles_fold: + pMHC_fold_allele_idx = pruned_alleles_fold.index(allele) + pMHC_allele_values = pMHC_affinities_fold_pruned_imputed[ + :, pMHC_fold_allele_idx] + if pMHC_allele_values.std() == 0: + print("WARNING: unexpected zero-variance in pretraining affinity values") + else: + print( + "WARNING: Couldn't find allele %s in pre-training matrix" % allele) + column = pMHC_affinities_fold_incomplete[:, allele_idx] + column_mean = np.nanmean(column) + print("-- Setting pre-training target value to nanmean = %f" % column_mean) + pMHC_allele_values = np.ones(len(pruned_peptides_fold)) * column_mean + + assert len(pruned_peptides_fold) == len(pMHC_allele_values) + + # dictionary mapping peptides to imputed affinity values + pretrain_dict = { + pi: yi + for (pi, yi) + in zip(pruned_peptides_fold, pMHC_allele_values) + } + + X_pretrain, pretrain_row_peptides, pretrain_counts = \ + fixed_length_index_encoding( + peptides=pruned_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + pretrain_sample_weights = 1.0 / np.array(pretrain_counts) + Y_pretrain = np.array( + [pretrain_dict[p] for p in pretrain_row_peptides]) + + X_train, training_row_peptides, training_counts = \ + 
fixed_length_index_encoding( + peptides=train_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + training_sample_weights = 1.0 / np.array(training_counts) + Y_train = np.array([train_dict_fold[p] for p in training_row_peptides]) + + n_pretrain = len(X_pretrain) + n_train_unique = len(train_peptides_fold) + n_train = len(X_train) + for key, predictor in generate_predictors(): + print("\n-----") + print( + ("Training CV fold %d/%d for %s " + "(# peptides = %d, # samples=%d, # pretrain samples=%d)" + " with parameters: %s") % ( + fold_idx + 1, + args.n_folds, + allele, + n_train_unique, + n_train, + n_pretrain, + key)) + print("-----") + predictor.fit( + X=X_train, + Y=Y_train, + sample_weights=training_sample_weights, + X_pretrain=X_pretrain, + Y_pretrain=Y_pretrain, + pretrain_sample_weights=pretrain_sample_weights, + n_training_epochs=args.training_epochs, + verbose=args.verbose, + batch_size=args.batch_size) + y_pred = predictor.predict_peptides(test_peptides) + if args.verbose: + print("-- mean(Y) = %f, mean(Y_pred) = %f" % ( + Y_train.mean(), + y_pred.mean())) + mae, tau, auc, f1_score = evaluate_predictions( + y_true=test_values, + y_pred=y_pred, + max_ic50=args.max_ic50) + + cv_fold_field_values = [ + allele, + str(fold_idx), + str(n_train_unique), + str(n_train), + ] + accuracy_field_values = [ + "%0.4f" % mae, + "%0.4f" % tau, + "%0.4f" % auc, + "%0.4f" % f1_score + ] + output_line = ( + ",".join(cv_fold_field_values) + + "," + key + + "," + ",".join(accuracy_field_values) + + "\n" + ) + print("CV fold result: %s" % output_line) + if output_file: + output_file.write(output_line) + output_file.flush() diff --git a/experiments/matrix_completion_helpers.py b/experiments/matrix_completion_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..bb6d803e7a509e6582d78822536c522b0a753e6a --- /dev/null +++ b/experiments/matrix_completion_helpers.py @@ -0,0 +1,205 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
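# Small sketch of the np.ix_ masking pattern used by the matrix-completion
# cross-validation (see stratified_cross_validation below and the accuracy
# script above): rows/columns left with no observations in a fold are dropped,
# and completed values are scattered back into the full-shape matrix.
import numpy as np

X = np.array([[0.9, np.nan],
              [np.nan, 0.2],
              [np.nan, np.nan]])
ok_rows = np.where(np.isfinite(X).sum(axis=1) > 0)[0]
ok_cols = np.where(np.isfinite(X).sum(axis=0) > 0)[0]
ok_mesh = np.ix_(ok_rows, ok_cols)
X_reduced = X[ok_mesh]          # 2x2 submatrix without the all-NaN row
X_back = np.full_like(X, np.nan)
X_back[ok_mesh] = X_reduced     # scatter back into the original shape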
+
+import logging
+
+from fancyimpute.dictionary_helpers import (
+    dense_matrix_from_nested_dictionary,
+    transpose_nested_dictionary,
+)
+from mhcflurry.data import load_allele_dicts
+import sklearn.metrics
+from sklearn.cross_validation import StratifiedKFold
+from scipy import stats
+import numpy as np
+
+
+def evaluate_predictions(
+        y_true,
+        y_pred,
+        max_ic50):
+    """
+    Return mean absolute error, Kendall tau, AUC and F1 score
+    """
+    if len(y_pred) != len(y_true):
+        raise ValueError("y_pred must have same number of elements as y_true")
+
+    mae = np.mean(np.abs(y_true - y_pred))
+
+    # if all predictions are the same
+    if (y_pred[0] == y_pred).all():
+        logging.warn("All predicted values were the same: %f" % y_pred[0])
+        return (mae, 0, 0.5, 0)
+
+    tau, _ = stats.kendalltau(
+        y_pred,
+        y_true)
+    if np.isnan(tau):
+        logging.warn("Value for Kendall's tau was NaN (n=%d)" % (len(y_true),))
+        tau = 0.0
+
+    true_ic50s = max_ic50 ** (1.0 - np.array(y_true))
+    predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred))
+
+    true_binding_label = true_ic50s <= 500
+    if true_binding_label.all():
+        logging.warn("All true labels are binders, can't compute AUC")
+        return (mae, tau, 0.5, 0)
+    elif not true_binding_label.any():
+        logging.warn("No true labels are binders, can't compute AUC")
+        # can't compute AUC or F1 without both negative and positive cases
+        return (mae, tau, 0.5, 0)
+
+    auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred)
+
+    predicted_binding_label = predicted_ic50s <= 500
+    if predicted_binding_label.all():
+        logging.warn("All predicted labels are binders, can't compute F1")
+        return (mae, tau, auc, 0)
+    elif not predicted_binding_label.any():
+        logging.warn("No predicted labels are binders, can't compute F1")
+        # can't compute F1 without both positive and negative predictions
+        return (mae, tau, auc, 0)
+
+    f1_score = sklearn.metrics.f1_score(
+        true_binding_label,
+        predicted_binding_label)
+
+    return mae, tau, auc, f1_score
+
+
+def prune_data(
+        X,
+        peptide_list,
+        allele_list,
+        min_observations_per_peptide=1,
+        min_observations_per_allele=1):
+    observed_mask = np.isfinite(X)
+    n_observed_per_peptide = observed_mask.sum(axis=1)
+    too_few_peptide_observations = (
+        n_observed_per_peptide < min_observations_per_peptide)
+    if too_few_peptide_observations.any():
+        drop_peptide_indices = np.where(too_few_peptide_observations)[0]
+        keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
+        print("Dropping %d peptides with <%d observations" % (
+            len(drop_peptide_indices),
+            min_observations_per_peptide))
+        X = X[keep_peptide_indices]
+        observed_mask = observed_mask[keep_peptide_indices]
+        peptide_list = [peptide_list[i] for i in keep_peptide_indices]
+
+    n_observed_per_allele = observed_mask.sum(axis=0)
+    too_few_allele_observations = (
+        n_observed_per_allele < min_observations_per_allele)
+    if too_few_allele_observations.any():
+        drop_allele_indices = np.where(too_few_allele_observations)[0]
+        keep_allele_indices = np.where(~too_few_allele_observations)[0]
+
+        print("Dropping %d alleles with <%d observations: %s" % (
+            len(drop_allele_indices),
+            min_observations_per_allele,
+            [allele_list[i] for i in drop_allele_indices]))
+        X = X[:, keep_allele_indices]
+        observed_mask = observed_mask[:, keep_allele_indices]
+        allele_list = [allele_list[i] for i in keep_allele_indices]
+    return X, peptide_list, allele_list
+
+
+def load_data(
+        binding_data_csv,
+        max_ic50,
+        only_human=False,
+        min_observations_per_allele=1,
+        min_observations_per_peptide=1):
allele_to_peptide_to_affinity = load_allele_dicts( + binding_data_csv, + max_ic50=max_ic50, + only_human=only_human, + regression_output=True) + peptide_to_allele_to_affinity = transpose_nested_dictionary( + allele_to_peptide_to_affinity) + n_binding_values = sum( + len(allele_dict) + for allele_dict in + allele_to_peptide_to_affinity.values() + ) + print("Loaded %d binding values for %d alleles" % ( + n_binding_values, + len(allele_to_peptide_to_affinity))) + X, peptide_list, allele_list = \ + dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity) + return prune_data( + X, + peptide_list, + allele_list, + min_observations_per_peptide=min_observations_per_peptide, + min_observations_per_allele=min_observations_per_allele) + + +def index_counts(indices): + max_index = indices.max() + counts = np.zeros(max_index + 1, dtype=int) + for index in indices: + counts[index] += 1 + return counts + + +def stratified_cross_validation(X, observed_mask, n_folds=10): + n_observed = observed_mask.sum() + + (observed_peptide_index, observed_allele_index) = np.where(observed_mask) + observed_indices = np.ravel_multi_index( + (observed_peptide_index, observed_allele_index), + dims=observed_mask.shape) + + assert len(observed_indices) == n_observed + + observed_allele_counts = observed_mask.sum(axis=0) + print("# observed per allele: %s" % (observed_allele_counts,)) + assert (index_counts(observed_allele_index) == observed_allele_counts).all() + + kfold = StratifiedKFold( + observed_allele_index, + n_folds=n_folds, + shuffle=True) + + for (_, indirect_test_indices) in kfold: + test_linear_indices = observed_indices[indirect_test_indices] + test_coords = np.unravel_index( + test_linear_indices, + dims=observed_mask.shape) + + test_allele_counts = index_counts(test_coords[1]) + allele_fractions = test_allele_counts / observed_allele_counts.astype(float) + print("Fraction of each allele in this CV fold: %s" % (allele_fractions,)) + + X_test_vector = X[test_coords] + + X_fold = X.copy() + X_fold[test_coords] = np.nan + + empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0 + ok_row_mask = ~empty_row_mask + ok_row_indices = np.where(ok_row_mask)[0] + + empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0 + ok_col_mask = ~empty_col_mask + ok_col_indices = np.where(ok_col_mask)[0] + + ok_mesh = np.ix_(ok_row_indices, ok_col_indices) + + print("Dropping %d empty rows, %d empty columns" % ( + empty_row_mask.sum(), + empty_col_mask.sum())) + yield (X_fold, ok_mesh, test_coords, X_test_vector) diff --git a/experiments/model-selection.py b/experiments/model-selection.py index c46d735d1255bce7d2c98fdc113e71473dd26632..df52f7e30918d7ca3f88e2fb64cf60ecf1419946 100755 --- a/experiments/model-selection.py +++ b/experiments/model-selection.py @@ -143,6 +143,8 @@ parser.add_argument( help="Comma separated list of optimization methods") + + if __name__ == "__main__": args = parser.parse_args() configs = generate_all_model_configs( diff --git a/experiments/synthesize-augmented-dataset.py b/experiments/synthesize-augmented-dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..10cf705f868415901a53c66ecc4bc6a42f4f86a4 --- /dev/null +++ b/experiments/synthesize-augmented-dataset.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +import argparse + +import pandas as pd + +from dataset_paths import PETERS2009_CSV_PATH +from allele_similarities import compute_allele_similarities +from synthetic_data import ( + create_reverse_lookup_from_simple_dicts, + synthesize_affinities_for_all_alleles, + load_sims_dict +) + +from mhcflurry.data import load_allele_dicts + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--output-csv", + help="Path for CSV containing synthesized dataset of affinities", + required=True) + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--only-human", + default=False, + action="store_true") + +parser.add_argument( + "--allele-similarity-csv", + required=False) + +parser.add_argument( + "--smoothing-coef", + default=0.001, + type=float, + help="Smoothing value used for peptides with low weight across alleles") + +parser.add_argument( + "--similarity-exponent", + default=2.0, + type=float, + help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent") + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + allele_to_peptide_to_affinity = load_allele_dicts( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human, + regression_output=True) + + print("Loaded binding data for %d alleles" % ( + len(allele_to_peptide_to_affinity),)) + + reverse_lookup = create_reverse_lookup_from_simple_dicts( + allele_to_peptide_to_affinity) + print("Created reverse lookup dictionary for %d peptides" % len(reverse_lookup)) + + if args.allele_similarity_csv: + sims_dict = load_sims_dict( + args.allele_similarity_csv, + allele_pair_keys=False) + allele_names = list(sims_dict.keys()) + print("Loaded similarities between %d alleles from %s" % ( + len(allele_names), + args.allele_similarity_csv)) + else: + sims_dict, _, _ = \ + compute_allele_similarities( + allele_to_peptide_to_affinity, + min_weight=0.1) + + synthetic_data = synthesize_affinities_for_all_alleles( + peptide_to_affinities=reverse_lookup, + pairwise_allele_similarities=sims_dict, + smoothing=args.smoothing_coef, + exponent=args.similarity_exponent, + allele_pair_keys=False) + + synthetic_data_dict = OrderedDict([ + ("mhc", []), + ("sequence", []), + ("ic50", []), + ]) + + for allele, allele_entries in sorted(synthetic_data.items()): + print("%s (n=%d)" % (allele, len(allele_entries))) + for (peptide, regression_value) in allele_entries.items(): + synthetic_data_dict["mhc"].append(allele) + synthetic_data_dict["sequence"].append(peptide) + ic50 = args.max_ic50 ** (1.0 - regression_value) + synthetic_data_dict["ic50"].append(ic50) + + synthetic_data_df = pd.DataFrame(synthetic_data_dict) + + print("Created dataset with %d synthesized pMHC affinities" % ( + len(synthetic_data_df),)) + + synthetic_data_df.to_csv(args.output_csv, index=False) diff --git a/experiments/synthetic_data.py b/experiments/synthetic_data.py new file mode 100644 index 
0000000000000000000000000000000000000000..130b8e2ff73fd0642871aedc9a6c31e156c0ceea --- /dev/null +++ b/experiments/synthetic_data.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Helper functions for creating synthetic pMHC affinities from allele similarities +and true binding data +""" + +from collections import defaultdict + +import pandas as pd + +from common import curry_dictionary + + +def load_sims_dict(csv_path, allele_pair_keys=True): + """ + Read in allele-to-allele similarities as DataFrame + """ + sims_df = pd.read_csv(csv_path) + # extract dictionary mapping pairs of alleles to similarity + sims_dict = { + (row["allele_A"], row["allele_B"]): row["similarity"] + for (_, row) + in sims_df.iterrows() + } + if not allele_pair_keys: + return curry_dictionary(sims_dict) + else: + return sims_dict + + +def synthesize_affinities_for_single_allele( + similarities, + peptide_to_affinities, + smoothing=0.005, + exponent=2.0, + exclude_alleles=[]): + """ + Parameters + ---------- + similarities : dict + Dictionary mapping allele names to their similarity to the + allele of interest. + + peptide_to_affinities : dict + Dictionary mapping peptide sequence to list of triplets containing + the following fields: + - allele + - log-scaled affinity normalized to 0.0 = no binding, 1.0 = strong + - sample weight + + smoothing : float + + exponent : float + + exclude_alleles : collection of allele names + Don't use these alleles for synthesizing new affinities + + Returns dictionary mapping peptide sequences to affinities. + """ + assert isinstance(similarities, dict), \ + "Wrong type for similarities: %s" % type(similarities) + results = {} + for peptide, affinities in peptide_to_affinities.items(): + total = 0.0 + denom = 0.0 + for entry in affinities: + + if len(entry) == 2: + (allele, y) = entry + sample_weight = 1.0 + else: + assert len(entry) == 3 + (allele, y, sample_weight) = entry + if allele in exclude_alleles: + continue + sim = similarities.get(allele, 0) + if sim == 0: + continue + combined_weight = sim ** exponent * sample_weight + total += combined_weight * y + denom += combined_weight + if denom > 0.0: + results[peptide] = total / (smoothing + denom) + return results + + +def create_reverse_lookup_from_allele_data_objects(allele_datasets): + """ + Given a dictionary mapping each allele name to an AlleleData object, + create reverse-lookup dictionary mapping each peptide to a list of triplets: + [(allele, regression_output, weight), ...] 
+ """ + peptide_affinities_dict = defaultdict(list) + for allele, dataset in allele_datasets.items(): + for peptide, y, weight in zip( + dataset.peptides, dataset.Y, dataset.weights): + entry = (allele, y, weight) + peptide_affinities_dict[peptide].append(entry) + return peptide_affinities_dict + + +def create_reverse_lookup_from_simple_dicts(affinities): + """ + Create a lookup table from peptides to lists of (allele, affinity, weight) + """ + reverse_lookup = defaultdict(list) + for allele, affinity_dict in affinities.items(): + for (peptide, affinity) in affinity_dict.items(): + reverse_lookup[peptide].append((allele, affinity, 1.0)) + return reverse_lookup + + +def synthesize_affinities_for_all_alleles( + peptide_to_affinities, + pairwise_allele_similarities, + allele_pair_keys=True, + smoothing=0.005, + exponent=2.0): + """ + peptide_to_affinities : dict + Maps each peptide to list of either + (allele, affinity) or (allele, affinity, weight) + + pairwise_allele_similarities : dict + Dictionary from allele -> allele -> value between 0..1 + + smoothing : float + + exponent : float + """ + if allele_pair_keys: + pairwise_allele_similarities = curry_dictionary( + pairwise_allele_similarities) + + all_predictions = {} + + allele_names = set(pairwise_allele_similarities.keys()) + for allele in allele_names: + all_predictions[allele] = synthesize_affinities_for_single_allele( + similarities=pairwise_allele_similarities[allele], + peptide_to_affinities=peptide_to_affinities, + smoothing=smoothing, + exponent=exponent) + return all_predictions diff --git a/experiments/train-models-with-synthetic-pretraining.py b/experiments/train-models-with-synthetic-pretraining.py new file mode 100644 index 0000000000000000000000000000000000000000..9d0d050e7b906b47488e32c12aea29a12125ac0c --- /dev/null +++ b/experiments/train-models-with-synthetic-pretraining.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
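The synthesis rule in synthesize_affinities_for_single_allele above is a smoothed, similarity-weighted average: each observation of a peptide on a neighboring allele contributes sim ** exponent * sample_weight, and the smoothing term in the denominator pulls peptides with little supporting weight toward zero. A worked example with invented alleles and values:

# peptide observed on two other alleles: (allele, regression value, weight)
observations = [("HLA-A0205", 0.8, 1.0), ("HLA-B5701", 0.3, 1.0)]
similarities = {"HLA-A0205": 0.9, "HLA-B5701": 0.2}
exponent, smoothing = 2.0, 0.001

total = 0.0
denom = 0.0
for allele, y, weight in observations:
    combined = similarities[allele] ** exponent * weight
    total += combined * y
    denom += combined

synthesized = total / (smoothing + denom)
# (0.81 * 0.8 + 0.04 * 0.3) / (0.001 + 0.85) is roughly 0.776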
+ +import argparse + +import numpy as np + +import mhcflurry +from mhcflurry.data import ( + load_allele_datasets, + encode_peptide_to_affinity_dict, +) + + +from arg_parsing import parse_int_list, parse_float_list +from dataset_paths import PETERS2009_CSV_PATH +from common import load_csv_binding_data_as_dict +from training_helpers import ( + kfold_cross_validation_of_model_params_with_synthetic_data, + average_prediction_scores_list +) + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--synthetic-data-csv", + required=True, + help="CSV with {mhc, sequence, ic50} columns of synthetic data") + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--embedding-dim-sizes", + default=[4, 8, 16, 32, 64], + type=parse_int_list) + +parser.add_argument( + "--hidden-layer-sizes", + default=[4, 8, 16, 32, 64, 128, 256], + type=parse_int_list) + +parser.add_argument( + "--dropouts", + default=[0.0, 0.25, 0.5], + type=parse_float_list) + +parser.add_argument( + "--activation-functions", + default=["tanh", "relu"], + type=lambda s: [si.strip() for si in s.split(",")]) + +parser.add_argument( + "--training-epochs", + type=int, + default=250) + +parser.add_argument("--log-file") + + +def get_extra_data(allele, train_datasets, expanded_predictions): + original_dataset = train_datasets[allele] + original_peptides = set(original_dataset.peptides) + expanded_allele_affinities = expanded_predictions[allele] + extra_affinities = { + k: v + for (k, v) in expanded_allele_affinities.items() + if k not in original_peptides + } + extra_peptides = list(extra_affinities.keys()) + extra_values = list(extra_affinities.values()) + extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9) + extra_Y = np.array(extra_values) + return extra_X, extra_Y + + +def rescale_ic50(ic50, max_ic50): + log_ic50 = np.log(ic50) / np.log(args.max_ic50) + return max(0.0, min(1.0, 1.0 - log_ic50)) + + +def load_synthetic_data(csv_path, max_ic50): + synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict( + csv_path) + return { + allele: { + peptide: rescale_ic50(ic50, max_ic50=max_ic50) + for (peptide, ic50) + in allele_dict.items() + } + for (allele, allele_dict) + in synthetic_allele_to_peptide_to_ic50_dict.items() + } + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + print("Loading binding data from %s" % args.binding_data_csv) + allele_datasets = load_allele_datasets( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=False) + print("Loading synthetic data from %s" % args.synthetic_data_csv) + + synthetic_affinities = load_synthetic_data( + csv_path=args.synthetic_data_csv, + max_ic50=args.max_ic50) + + combined_allele_set = set(allele_datasets.keys()).union( + synthetic_affinities.keys()) + + combined_allele_list = list(sorted(combined_allele_set)) + + if args.log_file: + logfile = open(args.log_file, "w") + logfile.write("allele,n_samples,n_unique,n_synth,") + else: + logfile = None + logfile_needs_header = True + + for allele in combined_allele_list: + synthetic_allele_dict = synthetic_affinities[allele] + (_, _, Counts_synth, X_synth, _, Y_synth) = \ + encode_peptide_to_affinity_dict(synthetic_allele_dict) + synthetic_sample_weights = 1.0 / Counts_synth + scores = {} + source_peptides = allele_datasets[allele].original_peptides + X_original = allele_datasets[allele].X_index + Y_original = allele_datasets[allele].Y 
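        # per AlleleData (see mhcflurry/data.py): X_index is the index encoding
        # of the expanded 9-mers, Y holds regression targets in [0, 1], and
        # .weights (passed to cross-validation below) are 1 / substring_count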
+ n_samples = len(X_original) + n_unique_samples = len(set(source_peptides)) + n_synth_samples = len(synthetic_sample_weights) + for dropout in args.dropouts: + for embedding_dim_size in args.embedding_dim_sizes: + for hidden_layer_size in args.hidden_layer_sizes: + for activation in args.activation_functions: + params = ( + ("dropout_probability", dropout), + ("embedding_dim_size", embedding_dim_size), + ("hidden_layer_size", hidden_layer_size), + ("activation", activation), + ) + + print( + "Evaluating allele %s (n=%d, unique=%d): %s" % ( + allele, + n_samples, + n_unique_samples, + params)) + fold_scores = \ + kfold_cross_validation_of_model_params_with_synthetic_data( + X_original=X_original, + Y_original=Y_original, + source_peptides_original=source_peptides, + X_synth=X_synth, + Y_synth=Y_synth, + original_sample_weights=allele_datasets[allele].weights, + synthetic_sample_weights=synthetic_sample_weights, + n_training_epochs=args.training_epochs, + max_ic50=args.max_ic50, + **dict(params)) + average_scores = average_prediction_scores_list(fold_scores) + if logfile: + if logfile_needs_header: + for param_name, _ in params: + logfile.write("%s," % param_name) + for score_name in average_scores._fields: + logfile.write("%s," % score_name) + logfile.write("param_id\n") + logfile_needs_header = False + logfile.write("%s,%d,%d,%d," % ( + allele, + n_samples, + n_unique_samples, + n_synth_samples)) + for _, param_value in params: + logfile.write("%s," % param_value) + for score_name in average_scores._fields: + score_value = average_scores.__dict__[score_name] + logfile.write("%0.4f," % score_value) + logfile.write("%d\n" % len(scores)) + logfile.flush() + + scores[params] = average_scores + print("%s => %s" % (params, average_scores)) + if logfile: + logfile.close() diff --git a/experiments/training_helpers.py b/experiments/training_helpers.py index 0b9396247a7d4988084ba442c04e8da56cafff11..17773cdd9c7428ff5dc1370c7698c49b37b6bfc1 100644 --- a/experiments/training_helpers.py +++ b/experiments/training_helpers.py @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
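For reference, the log file assembled in the loop above ends up with one row per allele and hyperparameter combination; a hypothetical header and row (values invented) would look roughly like:

# allele,n_samples,n_unique,n_synth,dropout_probability,embedding_dim_size,hidden_layer_size,activation,tau,auc,f1,accuracy,mae,param_id
# HLA-A0201,1385,1104,52341,0.25,32,64,relu,0.5120,0.9012,0.7844,0.8133,0.1132,0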
-from collections import OrderedDict +from collections import OrderedDict, namedtuple +import logging +from itertools import groupby import numpy as np -from sklearn.metrics import roc_auc_score +from sklearn import metrics +from sklearn.cross_validation import LabelKFold +from scipy import stats +import mhcflurry from mhcflurry.common import normalize_allele_name -from mhcflurry.data_helpers import indices_to_hotshot_encoding +from mhcflurry.data import indices_to_hotshot_encoding + + +PredictionScores = namedtuple("PredictionScores", "tau auc f1 accuracy mae") def encode_allele_dataset( @@ -81,27 +89,54 @@ def encode_allele_datasets( return (X_dict, Y_log_ic50_dict, ic50_dict) -def f1_score(true_label, label_pred): - tp = (true_label & label_pred).sum() - fp = ((~true_label) & label_pred).sum() - fn = (true_label & (~label_pred)).sum() - recall = (tp / float(tp + fn)) if (tp + fn) > 0 else 0.0 - precision = (tp / float(tp + fp)) if (tp + fp) > 0 else 0.0 - if (precision + recall) > 0: - return (2 * precision * recall) / (precision + recall) - else: - return 0.0 - - -def score_predictions(predicted_log_ic50, true_label, max_ic50): - """Computes accuracy, AUC, and F1 score of predictions""" - auc = roc_auc_score(true_label, predicted_log_ic50) - ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50) - label_pred = (ic50_pred <= 500) - same_mask = true_label == label_pred +def score_predictions( + predicted_log_ic50, + true_log_ic50, + max_ic50): + """Computes Kendall-tau, AUC, F1 score, and accuracy of predictions + + Parameters + ---------- + predicted_log_ic50 : np.array + + true_log_ic50 : np.array + + max_ic50 : float + + Returns PredictionScores object with fields (tau, auc, f1, accuracy) + """ + mae = np.abs(predicted_log_ic50 - true_log_ic50).mean() + tau, _ = stats.kendalltau(predicted_log_ic50, true_log_ic50) + if np.isnan(tau): + logging.warn("Kendall tau was NaN!") + tau = 0.0 + + true_ic50s = max_ic50 ** (1.0 - np.array(true_log_ic50)) + predicted_ic50s = max_ic50 ** (1.0 - np.array(predicted_log_ic50)) + + true_binding_label = true_ic50s <= 500 + if true_binding_label.all() or not true_binding_label.any(): + logging.warn( + ("Can't compute AUC, F1, accuracy without both" + " negative and positive ground truth labels")) + return PredictionScores( + tau=tau, + auc=0.5, + f1=0.0, + accuracy=0.0, + mae=mae) + + auc = metrics.roc_auc_score(true_binding_label, predicted_log_ic50) + predicted_binding_label = predicted_ic50s <= 500 + f1_score = metrics.f1_score(true_binding_label, predicted_binding_label) + same_mask = true_binding_label == predicted_binding_label accuracy = np.mean(same_mask) - f1 = f1_score(true_label, label_pred) - return accuracy, auc, f1 + return PredictionScores( + tau=tau, + auc=auc, + f1=f1_score, + accuracy=accuracy, + mae=mae) def train_model_and_return_scores( @@ -109,7 +144,7 @@ def train_model_and_return_scores( X_train, log_ic50_train, X_test, - binder_label_test, + log_ic50_test, n_training_epochs, minibatch_size, max_ic50): @@ -120,8 +155,339 @@ def train_model_and_return_scores( verbose=0, batch_size=minibatch_size) pred = model.predict(X_test).flatten() - accuracy, auc, f1_score = score_predictions( + return score_predictions( predicted_log_ic50=pred, - true_label=binder_label_test, + true_log_ic50=log_ic50_test, max_ic50=max_ic50) - return (accuracy, auc, f1_score) + + +def train_model_with_synthetic_data( + model, + n_training_epochs, + X_original, + Y_original, + X_synth, + Y_synth, + original_sample_weights=None, + synthetic_sample_weights=None): + 
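    """
    Fit the model on the concatenation of real and synthetic samples.

    At each epoch the synthetic samples' weights are scaled by exp(-epoch);
    once their total contribution drops below 0.1% of the combined sample
    weight, training continues on the real data alone. With equal starting
    weight totals that happens after roughly seven epochs, since
    exp(-7) is about 0.0009.
    """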
n_actual_samples, n_actual_dims = X_original.shape + n_synth_samples, n_synth_dims = X_synth.shape + if original_sample_weights is None: + original_sample_weights = np.ones_like(Y_original) + + if synthetic_sample_weights is None: + synthetic_sample_weights = np.ones_like(Y_synth) + + total_synth_weights = synthetic_sample_weights.sum() + total_original_weights = original_sample_weights.sum() + print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % ( + np.mean(Y_original), + np.mean(Y_synth), + np.mean(original_sample_weights), + np.mean(synthetic_sample_weights))) + + combined_weights = np.concatenate([ + original_sample_weights, + synthetic_sample_weights + ]) + + assert n_actual_dims == n_synth_dims, \ + "Mismatch between # of actual dims %d and synthetic dims %d" % ( + n_actual_dims, n_synth_dims) + + X_combined = np.vstack([X_original, X_synth]) + n_combined_samples = n_actual_samples + n_synth_samples + + assert X_combined.shape[0] == n_combined_samples, \ + "Expected %d samples but got data array with shape %s" % ( + n_actual_samples + n_synth_samples, X_combined.shape) + + assert len(combined_weights) == n_combined_samples, \ + "Expected combined_weights to have length %d but got shape = %s" % ( + n_combined_samples, + combined_weights.shape) + + Y_combined = np.concatenate([Y_original, Y_synth]) + assert Y_combined.min() >= 0, \ + "Y should not contain negative numbers! Y.min() = %f" % ( + Y_combined.min(),) + assert Y_combined.max() <= 1, \ + "Y should have max value 1.0, got Y.max() = %f" % ( + Y_combined.max(),) + + for epoch in range(n_training_epochs): + # weights for synthetic points can be shrunk as: + # ~ 1 / (1+epoch)**2 + # or + # e ** -epoch + decay_factor = np.exp(-epoch) + # if the contribution of synthetic samples is less than a + # thousandth of the actual data, then stop using it + synth_contribution = total_synth_weights * decay_factor + synth_fraction_contribution = synth_contribution / ( + synth_contribution + total_original_weights) + # only use synthetic data if it contributes at least 1/1000th of + # sample weight + use_synth_data = synth_fraction_contribution > 0.001 + if use_synth_data: + combined_weights[n_actual_samples:] = ( + synthetic_sample_weights * decay_factor) + model.fit( + X_combined, + Y_combined, + sample_weight=combined_weights, + nb_epoch=1, + verbose=0) + else: + model.fit( + X_original, + Y_original, + sample_weight=original_sample_weights, + nb_epoch=1, + verbose=0) + + Y_pred = model.predict(X_original) + training_mse = ((Y_original - Y_pred) ** 2).mean() + + synth_data_percent = ( + ((1.0 - synth_fraction_contribution) * 100) + if use_synth_data + else 100 + ) + real_data_percent = ( + (synth_fraction_contribution * 100) + if use_synth_data else 0 + ) + print( + ("-- Epoch %d/%d real data weight %0.2f%%," + " synth data weight %0.2f%%," + " Training MSE %0.4f") % ( + epoch + 1, + n_training_epochs, + real_data_percent, + synth_data_percent, + training_mse)) + + +def peptide_group_indices(original_peptide_sequences): + """ + Given a list of peptide sequences, some of which are identical, + generate a sequence of index lists for the occurrence of distinct + entries. 
+ """ + indices_and_sequences = enumerate(original_peptide_sequences) + for _, subiter in groupby(indices_and_sequences, lambda pair: pair[1]): + yield [idx for (idx, _) in subiter] + + +def collapse_peptide_group_affinities( + predictions, + true_values, + original_peptide_sequences, + combine_multiple_predictions_fn=np.median): + """ + Given predictions and true log-transformed IC50 values for 9mers which may + have been either elongated or shortened from peptide sequences of other + lengths, collapse the log-transformed IC50 values to a single value + per original peptide sequence. + """ + assert len(predictions) == len(true_values), \ + ("Expected predictions (%d elements) to have same length" + " as true values (%d elements)") % (len(predictions), len(true_values)) + assert len(predictions) == len(original_peptide_sequences), \ + ("Expected predictions (%d elements) to have same length" + " as peptide sequences (%d elements)") % ( + len(predictions), len(original_peptide_sequences)) + + collapsed_predictions = [] + collapsed_true_values = [] + for peptide_indices in peptide_group_indices(original_peptide_sequences): + if len(peptide_indices) == 1: + idx = peptide_indices[0] + collapsed_predictions.append(predictions[idx]) + collapsed_true_values.append(true_values[idx]) + else: + collapsed_predictions.append( + combine_multiple_predictions_fn(predictions[peptide_indices])) + collapsed_true_values.append( + combine_multiple_predictions_fn( + true_values[peptide_indices])) + return collapsed_predictions, collapsed_true_values + + +def kfold_cross_validation_of_model_fn_with_synthetic_data( + make_model_fn, + n_training_epochs, + max_ic50, + X_train, + Y_train, + source_peptides_train, + X_synth, + Y_synth, + original_sample_weights, + synthetic_sample_weights, + n_cross_validation_folds, + combine_multiple_predictions_fn=np.median, + random_seed=0): + """ + Given a function which generates fresh copies of a model, use k-fold + cross validation (stratified by the list of source peptide sequences) to + evaluate the predictive performance of this model type. + + Returns a list of pairs, the first element is a trained model and the second + element is a PredictionScores object with the following fields: + ("tau", "f1", "auc", "accuracy") + """ + assert len(X_train) == len(Y_train) + assert len(source_peptides_train) == len(X_train) + # randomly shuffle the training data first + indices = np.arange(len(X_train)) + np.random.seed(random_seed) + np.random.shuffle(indices) + X_train, Y_train, weights_train = \ + X_train[indices], Y_train[indices], original_sample_weights[indices] + source_peptides_train = [source_peptides_train[i] for i in indices] + + # we need to do cross-validation carefully since some of our training + # samples may be extracted from the same longer peptide. + # To avoid training vs. test contamination we do k-fold cross validation + # stratified by the "source" peptides of each sample. 
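    # Illustration (toy labels): with source peptides
    #   ["AAAWAAAAV", "AAAWAAAAV", "SIINFEKLM", "KLVVVGAGV", ...]
    # LabelKFold keeps both "AAAWAAAAV" rows on the same side of every split,
    # so a model is never tested on a 9-mer extracted from a peptide it was
    # trained on.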
+ cv_iterator = enumerate( + LabelKFold( + labels=source_peptides_train, + n_folds=n_cross_validation_folds)) + results = [] + for fold_number, (fold_train_index, fold_test_index) in cv_iterator: + model = make_model_fn() + X_train_fold, X_test_fold = \ + X_train[fold_train_index], X_train[fold_test_index] + weights_train_fold = weights_train[fold_train_index] + Y_train_fold, Y_test_fold = \ + Y_train[fold_train_index], Y_train[fold_test_index] + peptides_train_fold = [ + source_peptides_train[i] + for i in fold_train_index + ] + peptides_test_fold = [ + source_peptides_train[i] + for i in fold_test_index + ] + + train_model_with_synthetic_data( + model=model, + n_training_epochs=n_training_epochs, + X_original=X_train_fold, + Y_original=Y_train_fold, + X_synth=X_synth, + Y_synth=Y_synth, + original_sample_weights=weights_train_fold, + synthetic_sample_weights=synthetic_sample_weights) + Y_pred = model.predict(X_test_fold).flatten() + # since some 'epitopes' are actually substrings of longer peptides + # we need to coalesce those predictions by calling the function + # `combine_multiple_predictions_fn` on the set of predictions + # associated with one longer peptide + collapsed_predictions, collapsed_true_values = \ + collapse_peptide_group_affinities( + predictions=Y_pred, + true_values=Y_test_fold, + original_peptide_sequences=peptides_test_fold, + combine_multiple_predictions_fn=combine_multiple_predictions_fn) + scores = score_predictions( + predicted_log_ic50=collapsed_predictions, + true_log_ic50=collapsed_true_values, + max_ic50=max_ic50) + print("::: CV fold %d/%d (n_samples=%d, n_unique=%d): %s\n\n" % ( + fold_number + 1, + n_cross_validation_folds, + len(peptides_train_fold), + len(set(peptides_train_fold)), + scores)) + results.append((model, scores)) + return results + + +def average_prediction_scores_list(model_scores_list): + """ + Given a list of (model, PredictionScores) pairs, + returns a single PredictionScores object whose fields are + the average across the list. + """ + n = len(model_scores_list) + return PredictionScores( + tau=sum(x.tau for (_, x) in model_scores_list) / n, + auc=sum(x.auc for (_, x) in model_scores_list) / n, + f1=sum(x.f1 for (_, x) in model_scores_list) / n, + accuracy=sum(x.accuracy for (_, x) in model_scores_list) / n) + + +def kfold_cross_validation_of_model_params_with_synthetic_data( + X_original, + Y_original, + source_peptides_original, + X_synth=None, + Y_synth=None, + original_sample_weights=None, + synthetic_sample_weights=None, + n_training_epochs=150, + n_cross_validation_folds=5, + embedding_dim_size=16, + hidden_layer_size=50, + dropout_probability=0.0, + activation="tanh", + max_ic50=50000.0): + """ + Returns: + - PredictionScores object with average Kendall-tau, AUC, F1, + and accuracy across all cross-validation folds. 
+ - list of all (model, PredictionScores) objects from cross-validation + folds + """ + n_unique_samples = len(set(source_peptides_original)) + + if n_cross_validation_folds > n_unique_samples: + n_cross_validation_folds = n_unique_samples + + if original_sample_weights is None: + original_sample_weights = np.ones(len(X_original), dtype=float) + + # if synthetic data is missing then make an empty array so all the + # downstream code still works + if X_synth is None: + X_synth = np.array([[]]) + + # if only the target values for synthetic data are missing then + # treat all synthetic entries as negative examples + if Y_synth is None: + Y_synth = np.zeros(len(X_synth), dtype=float) + + if synthetic_sample_weights is None: + synthetic_sample_weights = np.ones(len(X_synth), dtype=float) + + def make_model(): + return mhcflurry.feedforward.make_embedding_network( + peptide_length=9, + embedding_input_dim=20, + embedding_output_dim=embedding_dim_size, + layer_sizes=[hidden_layer_size], + activation=activation, + init="lecun_uniform", + loss="mse", + output_activation="sigmoid", + dropout_probability=dropout_probability, + optimizer=None, + learning_rate=0.001) + + model_scores_list = kfold_cross_validation_of_model_fn_with_synthetic_data( + make_model_fn=make_model, + n_training_epochs=n_training_epochs, + max_ic50=max_ic50, + X_train=X_original, + Y_train=Y_original, + source_peptides_train=source_peptides_original, + X_synth=X_synth, + Y_synth=Y_synth, + original_sample_weights=original_sample_weights, + synthetic_sample_weights=synthetic_sample_weights, + n_cross_validation_folds=n_cross_validation_folds) + return model_scores_list diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index c1a908084f56c3d68d61ac181d2bc2d0ac642b35..4ab1d343a2cb6462e2e50506c00e727a731fd75f 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -13,15 +13,19 @@ # limitations under the License. from . import paths -from . import data_helpers +from . import data from . import feedforward from . import common -from .mhc1_binding_predictor import Mhc1BindingPredictor +from . import peptide_encoding +from . import amino_acid +from .class1_binding_predictor import Class1BindingPredictor __all__ = [ "paths", - "data_helpers", + "data", "feedforward", + "peptide_encoding", + "amino_acid", "common", - "Mhc1BindingPredictor" -] \ No newline at end of file + "Class1BindingPredictor" +] diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index c7607d4d712ff1929056e99836ef2aa06696a44e..f40225612f6373c6609b3aed60fa23de14c2c90b 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -12,7 +12,81 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-amino_acids = { +import numpy as np + + +class Alphabet(object): + """ + Used to track the order of amino acids used for peptide encodings + """ + + def __init__(self, **kwargs): + self.letters_to_names = {} + for (k, v) in kwargs.items(): + self.add(k, v) + + def add(self, letter, name): + assert letter not in self.letters_to_names + assert len(letter) == 1 + self.letters_to_names[letter] = name + + def letters(self): + return list(sorted(self.letters_to_names.keys())) + + def names(self): + return [self.letters_to_names[k] for k in self.letters()] + + def index_dict(self): + return {c: i for (i, c) in enumerate(self.letters())} + + def copy(self): + return Alphabet(**self.letters_to_names) + + def __getitem__(self, k): + return self.letters_to_names[k] + + def __setitem__(self, k, v): + self.add(k, v) + + def index_encoding_list(self, peptides): + index_dict = self.index_dict() + return [ + [index_dict[amino_acid] for amino_acid in peptide] + for peptide in peptides + ] + + def index_encoding(self, peptides, peptide_length): + """ + Encode a set of equal length peptides as a matrix of their + amino acid indices. + """ + X = np.zeros((len(peptides), peptide_length), dtype=int) + index_dict = self.index_dict() + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + X[i, j] = index_dict[amino_acid] + return X + + def hotshot_encoding( + self, + peptides, + peptide_length): + """ + Encode a set of equal length peptides as a binary matrix, + where each letter is transformed into a length 20 vector with a single + element that is 1 (and the others are 0). + """ + shape = (len(peptides), peptide_length, 20) + index_dict = self.index_dict() + X = np.zeros(shape, dtype=bool) + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + k = index_dict[amino_acid] + X[i, j, k] = 1 + return X + + +common_amino_acids = Alphabet(**{ "A": "Alanine", "R": "Arginine", "N": "Asparagine", @@ -33,8 +107,9 @@ amino_acids = { "W": "Tryptophan", "Y": "Tyrosine", "V": "Valine", -} +}) +common_amino_acid_letters = common_amino_acids.letters() -amino_acid_letters = list(sorted(amino_acids.keys())) -amino_acid_names = [amino_acids[k] for k in amino_acid_letters] -amino_acid_letter_indices = {c: i for (i, c) in enumerate(amino_acid_letters)} +amino_acids_with_unknown = common_amino_acids.copy() +amino_acids_with_unknown.add("X", "Unknown") +amino_acids_with_unknown_letters = amino_acids_with_unknown.letters() diff --git a/mhcflurry/class1.py b/mhcflurry/class1.py deleted file mode 100644 index aa099d624d9c0a77ae461e4bab94a68063209f5c..0000000000000000000000000000000000000000 --- a/mhcflurry/class1.py +++ /dev/null @@ -1,24 +0,0 @@ -import pandas as pd -import os - -from .paths import CLASS1_MODEL_DIRECTORY -from .mhc1_binding_predictor import Mhc1BindingPredictor - -def predict(alleles, peptides): - allele_dataframes = [] - for allele in alleles: - model = Mhc1BindingPredictor(allele=allele) - df = model.predict_peptides(peptides) - allele_dataframes.append(df) - return pd.concat(allele_dataframes) - -def supported_alleles(): - alleles = [] - for filename in os.listdir(CLASS1_MODEL_DIRECTORY): - allele = filename.replace(".hdf", "") - if len(allele) < 5: - # skipping serotype names like A2 or B7 - continue - allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:]) - alleles.append(allele) - return alleles diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py new file mode 100644 index 
0000000000000000000000000000000000000000..7ad212f9cd4a42f1e62fb0aa05650340df204776 --- /dev/null +++ b/mhcflurry/class1_binding_predictor.py @@ -0,0 +1,424 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Allele specific MHC Class I binding affinity predictor +""" +from __future__ import ( + print_function, + division, + absolute_import, +) +import logging +from os import listdir, remove +from os.path import exists, join + +import json + +import numpy as np +from keras.models import model_from_config + + +from .common import normalize_allele_name +from .paths import CLASS1_MODEL_DIRECTORY +from .feedforward import make_embedding_network +from .predictor_base import PredictorBase + +from .class1_allele_specific_hyperparameters import MAX_IC50 + +_allele_predictor_cache = {} + +class Class1BindingPredictor(PredictorBase): + def __init__( + self, + model, + name=None, + max_ic50=MAX_IC50, + allow_unknown_amino_acids=False, + verbose=False): + PredictorBase.__init__( + self, + name=name, + max_ic50=max_ic50, + allow_unknown_amino_acids=allow_unknown_amino_acids, + verbose=verbose) + self.name = name + self.model = model + + @classmethod + def from_disk( + cls, + model_json_path, + weights_hdf_path=None, + name=None, + max_ic50=MAX_IC50): + """ + Load model from stored JSON representation of network and + (optionally) load weights from HDF5 file. + """ + if not exists(model_json_path): + raise ValueError("Model file %s (name = %s) not found" % ( + model_json_path, name,)) + + with open(model_json_path, "r") as f: + config_dict = json.load(f) + + model = model_from_config(config_dict) + + if weights_hdf_path: + if not exists(weights_hdf_path): + raise ValueError( + "Missing model weights file %s (name = %s)" % ( + weights_hdf_path, name)) + + model.load_weights(weights_hdf_path) + + return cls.__init__( + model=model, + max_ic50=max_ic50, + name=name) + + @classmethod + def from_hyperparameters( + cls, + name=None, + max_ic50=MAX_IC50, + peptide_length=9, + embedding_input_dim=20, + embedding_output_dim=20, + layer_sizes=[50], + activation="tanh", + init="lecun_uniform", + loss="mse", + output_activation="sigmoid", + dropout_probability=0, + learning_rate=0.001, + **kwargs): + """ + Create untrained predictor with the given hyperparameters. + """ + model = make_embedding_network( + peptide_length=peptide_length, + embedding_input_dim=embedding_input_dim, + embedding_output_dim=embedding_output_dim, + layer_sizes=layer_sizes, + activation=activation, + init=init, + loss=loss, + output_activation=output_activation, + dropout_probability=dropout_probability, + learning_rate=learning_rate) + return cls( + name=name, + max_ic50=max_ic50, + model=model, + **kwargs) + + def _combine_training_data( + self, + X, + Y, + sample_weights, + X_pretrain, + Y_pretrain, + pretrain_sample_weights, + verbose=False): + """ + Make sure the shapes of given training and pre-training data + conform with each other. 
Then concatenate the pre-training and the + training data. + + Returns (X_combined, Y_combined, weights_combined, n_pretrain_samples) + """ + X = np.asarray(X) + Y = np.asarray(Y) + + if verbose: + print("X.shape = %s" % (X.shape,)) + + n_samples, n_dims = X.shape + + if len(Y) != n_samples: + raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % ( + n_samples, len(Y))) + + if Y.min() < 0: + raise ValueError("Minimum value of Y can't be negative, got %f" % ( + Y.min())) + if Y.max() > 1: + raise ValueError("Maximum value of Y can't be greater than 1, got %f" % ( + Y.max())) + + if sample_weights is None: + sample_weights = np.ones_like(Y) + else: + sample_weights = np.asarray(sample_weights) + + if len(sample_weights) != n_samples: + raise ValueError( + "Length of sample_weights (%d) doesn't match number of samples (%d)" % ( + len(sample_weights), + n_samples)) + + if X_pretrain is None or Y_pretrain is None: + X_pretrain = np.empty((0, n_dims), dtype=X.dtype) + Y_pretrain = np.empty((0,), dtype=Y.dtype) + else: + X_pretrain = np.asarray(X_pretrain) + Y_pretrain = np.asarray(Y_pretrain) + + if verbose: + print("X_pretrain.shape = %s" % (X_pretrain.shape,)) + n_pretrain_samples, n_pretrain_dims = X_pretrain.shape + if n_pretrain_dims != n_dims: + raise ValueError( + "# of dims for pretraining data (%d) doesn't match X.shape[1] = %d" % ( + n_pretrain_dims, n_dims)) + + if len(Y_pretrain) != n_pretrain_samples: + raise ValueError( + "length of Y_pretrain (%d) != length of X_pretrain (%d)" % ( + len(Y_pretrain), + len(X_pretrain))) + + if len(Y_pretrain) > 0 and Y_pretrain.min() < 0: + raise ValueError("Minimum value of Y_pretrain can't be negative, got %f" % ( + Y.min())) + if len(Y_pretrain) > 0 and Y_pretrain.max() > 1: + raise ValueError("Maximum value of Y_pretrain can't be greater than 1, got %f" % ( + Y.max())) + + if pretrain_sample_weights is None: + pretrain_sample_weights = np.ones_like(Y_pretrain) + else: + pretrain_sample_weights = np.asarray(pretrain_sample_weights) + if verbose: + print("sample weights mean = %f, pretrain weights mean = %f" % ( + sample_weights.mean(), + pretrain_sample_weights.mean())) + X_combined = np.vstack([X_pretrain, X]) + Y_combined = np.concatenate([Y_pretrain, Y]) + combined_weights = np.concatenate([ + pretrain_sample_weights, + sample_weights, + ]) + return X_combined, Y_combined, combined_weights, n_pretrain_samples + + def fit( + self, + X, + Y, + sample_weights=None, + X_pretrain=None, + Y_pretrain=None, + pretrain_sample_weights=None, + n_training_epochs=200, + verbose=False, + batch_size=128): + """ + Train predictive model from index encoding of fixed length 9mer peptides. + + Parameters + ---------- + X : array + Training data with shape (n_samples, n_dims) + + Y : array + Training labels with shape (n_samples,) + + sample_weights : array + Weight of each training sample with shape (n_samples,) + + X_pretrain : array + Extra samples used for soft pretraining of the predictor, + should have same number of dimensions as X. During training the weights + of these samples will decay after each epoch. + + Y_pretrain : array + Labels for extra samples, shape + + pretrain_sample_weights : array + Initial weights for the rows of X_pretrain. If not specified then + initialized to ones. 
+ + n_training_epochs : int + + verbose : bool + + batch_size : int + """ + X_combined, Y_combined, combined_weights, n_pretrain = \ + self._combine_training_data( + X, Y, sample_weights, X_pretrain, Y_pretrain, pretrain_sample_weights, + verbose=verbose) + + total_pretrain_sample_weight = combined_weights[:n_pretrain].sum() + total_train_sample_weight = combined_weights[n_pretrain:].sum() + total_combined_sample_weight = ( + total_pretrain_sample_weight + total_train_sample_weight) + + if self.verbose: + print("-- Total pretrain weight = %f (%f%%), sample weight = %f (%f%%)" % ( + total_pretrain_sample_weight, + 100 * total_pretrain_sample_weight / total_combined_sample_weight, + total_train_sample_weight, + 100 * total_train_sample_weight / total_combined_sample_weight)) + + for epoch in range(n_training_epochs): + # weights for synthetic points can be shrunk as: + # ~ 1 / (1+epoch)**2 + # or + # e ** -epoch + decay_factor = np.exp(-epoch) + # if the contribution of synthetic samples is less than a + # thousandth of the actual data, then stop using it + pretrain_contribution = total_pretrain_sample_weight * decay_factor + pretrain_fraction_contribution = ( + pretrain_contribution / total_combined_sample_weight) + + use_pretrain_data = pretrain_fraction_contribution > 0.001 + + # only use synthetic data if it contributes at least 1/1000th of + # sample weight + if verbose: + + real_data_percent = ( + ((1.0 - pretrain_fraction_contribution) * 100) + if use_pretrain_data + else 100 + ) + pretrain_data_percent = ( + (pretrain_fraction_contribution * 100) + if use_pretrain_data else 0 + ) + print( + ("-- Epoch %d/%d decay=%f, real data weight=%0.2f%%," + " synth data weight=%0.2f%% (use_pretrain=%s)") % ( + epoch + 1, + n_training_epochs, + decay_factor, + real_data_percent, + pretrain_data_percent, + use_pretrain_data)) + + if use_pretrain_data: + combined_weights[:n_pretrain] *= decay_factor + self.model.fit( + X_combined, + Y_combined, + sample_weight=combined_weights, + nb_epoch=1, + verbose=0, + batch_size=batch_size) + else: + self.model.fit( + X_combined[n_pretrain:], + Y_combined[n_pretrain:], + sample_weight=combined_weights[n_pretrain:], + nb_epoch=1, + verbose=0, + batch_size=batch_size) + + def to_disk(self, model_json_path, weights_hdf_path, overwrite=False): + if exists(model_json_path) and overwrite: + logging.info( + "Removing existing model JSON file '%s'" % ( + model_json_path,)) + remove(model_json_path) + + if exists(model_json_path): + logging.warn( + "Model JSON file '%s' already exists" % (model_json_path,)) + else: + logging.info( + "Saving model file %s (name=%s)" % (model_json_path, self.name)) + with open(model_json_path, "w") as f: + f.write(self.model.to_json()) + + if exists(weights_hdf_path) and overwrite: + logging.info( + "Removing existing model weights HDF5 file '%s'" % ( + weights_hdf_path,)) + remove(weights_hdf_path) + + if exists(weights_hdf_path): + logging.warn( + "Model weights HDF5 file '%s' already exists" % ( + weights_hdf_path,)) + else: + logging.info( + "Saving model weights HDF5 file %s (name=%s)" % ( + weights_hdf_path, self.name)) + self.model.save_weights(weights_hdf_path) + + @classmethod + def from_allele_name( + cls, + allele_name, + model_directory=CLASS1_MODEL_DIRECTORY, + max_ic50=MAX_IC50): + if not exists(model_directory) or len(listdir(model_directory)) == 0: + raise ValueError( + "No MHC prediction models found in %s" % (model_directory,)) + + allele_name = normalize_allele_name(allele_name) + key = (allele_name, model_directory, 
max_ic50) + if key in _allele_predictor_cache: + return _allele_predictor_cache[key] + + if not exists(model_directory) or len(listdir(model_directory)) == 0: + raise ValueError( + "No MHC prediction models found in %s" % (model_directory,)) + + model_json_filename = allele_name + ".json" + model_json_path = join(model_directory, model_json_filename) + + weights_hdf_filename = allele_name + ".hdf" + weights_hdf_path = join(model_directory, weights_hdf_filename) + + predictor = cls.from_disk( + model_json_path=model_json_path, + weights_hdf_path=weights_hdf_path, + name=allele_name, + max_ic50=max_ic50) + + _allele_predictor_cache[key] = predictor + return predictor + + @classmethod + def supported_alleles(cls, model_directory=CLASS1_MODEL_DIRECTORY): + alleles_with_weights = set([]) + alleles_with_models = set([]) + for filename in listdir(model_directory): + if filename.endswith(".hdf"): + alleles_with_weights.add(filename.replace(".hdf", "")) + elif filename.endswith(".json"): + alleles_with_models.add(filename.replace(".json", "")) + alleles = alleles_with_models.intersection(alleles_with_weights) + return list(sorted(alleles)) + + def __repr__(self): + return "Class1BindingPredictor(name=%s, model=%s, max_ic50=%f)" % ( + self.name, + self.model, + self.max_ic50) + + def __str__(self): + return repr(self) + + def predict_encoded(self, X): + max_expected_index = 20 if self.allow_unknown_amino_acids else 19 + assert X.max() <= max_expected_index, \ + "Got index %d in peptide encoding" % (X.max(),) + return self.model.predict(X, verbose=False).flatten() diff --git a/mhcflurry/common.py b/mhcflurry/common.py index 9333786437dfd8a9b422d2277f3cd199dea1a3ce..6ab2f9fd64aa394bd0c2a95adbdcdbeb0d8b08b4 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -17,7 +17,6 @@ from __future__ import ( division, absolute_import, ) -from .amino_acid import amino_acid_letters def parse_int_list(s): @@ -27,35 +26,37 @@ def parse_int_list(s): def split_uppercase_sequences(s): return [part.strip().upper() for part in s.split(",")] +MHC_PREFIXES = [ + "HLA", + "H-2", + "Mamu", + "Patr", + "Gogo", + "ELA", +] -def normalize_allele_name(allele_name): + +def normalize_allele_name(allele_name, default_prefix="HLA"): """ - Only works for mouse, human, and rhesus monkey alleles. + Only works for a small number of species. TODO: use the same logic as mhctools for MHC name parsing. 
Possibly even worth its own small repo called something like "mhcnames" """ allele_name = allele_name.upper() - if allele_name.startswith("MAMU"): - prefix = "Mamu-" - elif allele_name.startswith("H-2") or allele_name.startswith("H2"): - prefix = "H-2-" - else: - prefix = "" # old school HLA-C serotypes look like "Cw" allele_name = allele_name.replace("CW", "C") - patterns = [ - "HLA-", - "H-2", - "H2", - "MAMU", - "-", - "*", - ":" - ] - for pattern in patterns: + + prefix = default_prefix + for candidate in MHC_PREFIXES: + if (allele_name.startswith(candidate.upper()) or + allele_name.startswith(candidate.replace("-", "").upper())): + prefix = candidate + allele_name = allele_name[len(prefix):] + break + for pattern in MHC_PREFIXES + ["-", "*", ":"]: allele_name = allele_name.replace(pattern, "") - return "%s%s" % (prefix, allele_name) + return "%s%s" % (prefix + "-" if prefix else "", allele_name) def split_allele_names(s): @@ -64,37 +65,3 @@ def split_allele_names(s): for part in s.split(",") ] - - -def expand_9mer_peptides(peptides, length): - """ - Expand non-9mer peptides using methods from - Accurate approximation method for prediction of class I MHC - affinities for peptides of length 8, 10 and 11 using prediction - tools trained on 9mers. - by Lundegaard et. al. - http://bioinformatics.oxfordjournals.org/content/24/11/1397 - """ - assert len(peptides) > 0 - if length < 8: - raise ValueError("Invalid peptide length: %d (%s)" % ( - length, peptides[0])) - elif length == 9: - return peptides - elif length == 8: - # extend each peptide by inserting every possible amino acid - # between base-1 positions 4-8 - return [ - peptide[:i] + extra_amino_acid + peptide[i:] - for peptide in peptides - for i in range(3, 8) - for extra_amino_acid in amino_acid_letters - ] - else: - # drop interior residues between base-1 positions 4 to last - n_skip = length - 9 - return [ - peptide[:i] + peptide[i + n_skip:] - for peptide in peptides - for i in range(3, 9) - ] diff --git a/mhcflurry/data.py b/mhcflurry/data.py new file mode 100644 index 0000000000000000000000000000000000000000..895edecdfe9db51affda0233f1defc59549eaafe --- /dev/null +++ b/mhcflurry/data.py @@ -0,0 +1,398 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
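A few examples of what the rewritten normalize_allele_name above produces, worked through from the code (a sketch, not an exhaustive specification of supported names):

from mhcflurry.common import normalize_allele_name

normalize_allele_name("HLA-A*02:01")   # -> "HLA-A0201"
normalize_allele_name("A*02:01")       # -> "HLA-A0201" (default prefix applied)
normalize_allele_name("HLA-Cw*07:02")  # -> "HLA-C0702" (old "Cw" serotype spelling)
normalize_allele_name("Mamu-A*01")     # -> "Mamu-A01"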
+ +from __future__ import ( + print_function, + division, + absolute_import, +) +from collections import namedtuple, defaultdict + +import pandas as pd +import numpy as np + +from .common import normalize_allele_name +from .amino_acid import common_amino_acids +from .peptide_encoding import ( + indices_to_hotshot_encoding, + fixed_length_from_many_peptides +) + +index_encoding = common_amino_acids.index_encoding + +AlleleData = namedtuple( + "AlleleData", + [ + "X_index", # index-based featue encoding of fixed length peptides + "X_binary", # binary encoding of fixed length peptides + "Y", # regression encoding of IC50 (log scaled between 0..1) + "peptides", # list of fixed length peptide string + "ic50", # IC50 value associated with each entry + "original_peptides", # original peptides may be of different lengths + "original_lengths", # len(original_peptide) + "substring_counts", # how many substrings were extracted from + # each original peptide string + "weights", # 1.0 / count + ]) + + +def _infer_csv_separator(filename): + """ + Determine if file is separated by comma, tab, or whitespace. + Default to whitespace if the others are not detected. + + Returns (sep, delim_whitespace) + """ + for candidate in [",", "\t"]: + with open(filename, "r") as f: + for line in f: + if line.startswith("#"): + continue + if candidate in line: + return candidate, False + return None, True + + +def load_dataframe( + filename, + peptide_length=None, + max_ic50=50000.0, + sep=None, + species_column_name="species", + allele_column_name="mhc", + peptide_column_name=None, + peptide_length_column_name="peptide_length", + ic50_column_name="meas", + only_human=True): + """ + Load a dataframe of peptide-MHC affinity measurements + + filename : str + TSV filename with columns: + - 'species' + - 'mhc' + - 'peptide_length' + - 'sequence' + - 'meas' + + peptide_length : int, optional + Which length peptides to use (default=load all lengths) + + max_ic50 : float + Treat IC50 scores above this value as all equally bad + (transform them to 0.0 in the regression output) + + sep : str, optional + Separator in CSV file, default is to let Pandas infer + + peptide_column_name : str, optional + Default behavior is to try {"sequence", "peptide", "peptide_sequence"} + + only_human : bool + Only load entries from human MHC alleles + + Returns DataFrame augmented with extra columns: + - "log_ic50" : log(ic50) / log(max_ic50) + - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1] + """ + if sep is None: + sep, delim_whitespace = _infer_csv_separator(filename) + else: + delim_whitespace = False + df = pd.read_csv( + filename, + sep=sep, + delim_whitespace=delim_whitespace, + engine="c") + # hack: get around variability of column naming by checking if + # the peptide_column_name is actually present and if not try "peptide" + if peptide_column_name is None: + columns = set(df.keys()) + for candidate in ["sequence", "peptide", "peptide_sequence"]: + if candidate in columns: + peptide_column_name = candidate + break + if peptide_column_name is None: + raise ValueError( + "Couldn't find peptide column name, candidates: %s" % ( + columns)) + if only_human: + human_mask = df[species_column_name] == "human" + df = df[human_mask] + if peptide_length is not None: + length_mask = df[peptide_length_column_name] == peptide_length + df = df[length_mask] + + df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) + ic50 = np.array(df[ic50_column_name]) + log_ic50 = np.log(ic50) / np.log(max_ic50) + 
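    # Worked example of the transform (numbers only for intuition):
    # with max_ic50 = 50000,
    #   ic50 = 1 nM    -> log_ic50 = 0.0    -> regression_output = 1.0
    #   ic50 = 500 nM  -> log_ic50 ~= 0.574 -> regression_output ~= 0.426
    #   ic50 >= 50000  -> log_ic50 >= 1.0   -> clamped to 0.0 below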
df["log_ic50"] = log_ic50 + regression_output = 1.0 - log_ic50 + # clamp to values between 0, 1 + regression_output = np.maximum(regression_output, 0.0) + regression_output = np.minimum(regression_output, 1.0) + df["regression_output"] = regression_output + return df, peptide_column_name + + +def load_allele_dicts( + filename, + max_ic50=50000.0, + regression_output=False, + sep=None, + species_column_name="species", + allele_column_name="mhc", + peptide_column_name=None, + peptide_length_column_name="peptide_length", + ic50_column_name="meas", + only_human=True, + min_allele_size=1): + """ + Parsing CSV of binding data into dictionary of dictionaries. + The outer key is an allele name, the inner key is a peptide sequence, + and the inner value is an IC50 or log-transformed value between [0,1] + """ + binding_df, peptide_column_name = load_dataframe( + filename=filename, + max_ic50=max_ic50, + sep=sep, + species_column_name=species_column_name, + allele_column_name=allele_column_name, + peptide_column_name=peptide_column_name, + peptide_length_column_name=peptide_length_column_name, + ic50_column_name=ic50_column_name, + only_human=only_human) + # map peptides to either the raw IC50 or rescaled log IC50 depending + # on the `regression_output` parameter + output_column_name = ( + "regression_output" + if regression_output + else ic50_column_name + ) + return { + allele_name: { + peptide: output + for (peptide, output) + in zip(group[peptide_column_name], group[output_column_name]) + } + for (allele_name, group) + in binding_df.groupby(allele_column_name) + if len(group) >= min_allele_size + } + + +def encode_peptide_to_affinity_dict( + peptide_to_affinity_dict, + peptide_length=9, + flatten_binary_encoding=True): + """ + Given a dictionary mapping from peptide sequences to affinity values, + returns tuple with the following fields: + - kmer_peptides: fixed length peptide strings + - original_peptides: variable length peptide strings + - counts: how many fixed length peptides were made from this original + - X_index: index encoding of fixed length peptides + - X_binary: binary encoding of fixed length peptides + - Y: affinity values associated with original peptides + """ + raw_peptides = list(sorted(peptide_to_affinity_dict.keys())) + kmer_peptides, original_peptides, counts = \ + fixed_length_from_many_peptides( + peptides=raw_peptides, + desired_length=peptide_length, + start_offset_shorten=0, + end_offset_shorten=0, + start_offset_extend=0, + end_offset_extend=0) + n_samples = len(kmer_peptides) + assert n_samples == len(original_peptides), \ + "Mismatch between # of samples (%d) and # of peptides (%d)" % ( + n_samples, len(original_peptides)) + assert n_samples == len(counts), \ + "Mismatch between # of samples (%d) and # of counts (%d)" % ( + n_samples, len(counts)) + + X_index = index_encoding(kmer_peptides, peptide_length) + X_binary = indices_to_hotshot_encoding(X_index, n_indices=20) + + assert X_binary.shape[0] == X_index.shape[0], \ + ("Mismatch between number of samples for index encoding (%d)" + " vs. 
binary encoding (%d)") % ( + X_binary.shape[0], + X_index.shape[0]) + + if flatten_binary_encoding: + # collapse 3D input into 2D matrix + n_binary_features = peptide_length * 20 + X_binary = X_binary.reshape((n_samples, n_binary_features)) + + # easier to work with counts when they're an array instead of list + counts = np.array(counts) + + Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides]) + assert n_samples == len(Y), \ + "Mismatch between # peptides %d and # regression outputs %d" % ( + n_samples, len(Y)) + return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) + + +def load_allele_datasets( + filename, + peptide_length=9, + use_multiple_peptide_lengths=True, + max_ic50=50000.0, + flatten_binary_encoding=True, + sep=None, + species_column_name="species", + allele_column_name="mhc", + peptide_column_name=None, + peptide_length_column_name="peptide_length", + ic50_column_name="meas", + only_human=True): + """ + Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides + and log-transforms the IC50 measurement. Returns dictionary mapping allele + names to AlleleData objects (containing fields X, Y, ic50) + + Parameters + ---------- + filename : str + TSV filename with columns: + - 'species' + - 'mhc' + - 'peptide_length' + - 'sequence' + - 'meas' + + peptide_length : int + Which length peptides to use (default=9) + + use_multiple_peptide_lengths : bool + If a peptide is shorter than `peptide_length`, expand it into many + peptides of the appropriate length by inserting all combinations of + amino acids. Similarly, if a peptide is longer than `peptide_length`, + shorten it by deleting stretches of contiguous amino acids at all + peptide positions. + + max_ic50 : float + Treat IC50 scores above this value as all equally bad + (transform them to 0.0 in the rescaled output) + + flatten_binary_encoding : bool + If False, returns a (n_samples, peptide_length, 20) matrix, otherwise + returns the 2D flattened version of the same data. 
+ + sep : str, optional + Separator in CSV file, default is to let Pandas infer + + peptide_column_name : str, optional + Default behavior is to try {"sequence", "peptide", "peptide_sequence"} + + only_human : bool + Only load entries from human MHC alleles + """ + df, peptide_column_name = load_dataframe( + filename=filename, + max_ic50=max_ic50, + sep=sep, + peptide_length=peptide_length, + species_column_name=species_column_name, + allele_column_name=allele_column_name, + peptide_column_name=peptide_column_name, + peptide_length_column_name=peptide_length_column_name, + ic50_column_name=ic50_column_name, + only_human=only_human) + + allele_groups = {} + for allele, group in df.groupby(allele_column_name): + assert allele not in allele_groups, \ + "Duplicate datasets for %s" % allele + + raw_peptides = group[peptide_column_name] + + # filter lengths in case user wants to drop peptides that are longer + # or shorter than the desired fixed length + if not use_multiple_peptide_lengths: + drop_mask = raw_peptides.str.len() != peptide_length + group = group[~drop_mask] + raw_peptides = raw_peptides[~drop_mask] + + # convert from a Pandas column to a list, since that's easier to + # interact with later + raw_peptides = list(raw_peptides) + + # create dictionaries of outputs from which we can look up values + # after peptides have been expanded + ic50_dict = { + peptide: ic50 + for (peptide, ic50) + in zip(raw_peptides, group[ic50_column_name]) + } + + Y_dict = { + peptide: y + for (peptide, y) + in zip(raw_peptides, group["regression_output"]) + } + + (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \ + encode_peptide_to_affinity_dict( + Y_dict, + peptide_length=peptide_length, + flatten_binary_encoding=flatten_binary_encoding) + + ic50 = np.array([ic50_dict[p] for p in original_peptides]) + + assert len(kmer_peptides) == len(ic50), \ + "Mismatch between # of peptides %d and # IC50 outputs %d" % ( + len(kmer_peptides), len(ic50)) + + allele_groups[allele] = AlleleData( + X_index=X_index, + X_binary=X_binary, + Y=Y, + ic50=ic50, + peptides=kmer_peptides, + original_peptides=original_peptides, + original_lengths=[len(peptide) for peptide in original_peptides], + substring_counts=counts, + weights=1.0 / counts) + return allele_groups + + +def collapse_multiple_peptide_entries(allele_datasets): + """ + If an (allele, peptide) pair occurs multiple times then reduce it + to a single entry by taking the weighted average of affinity values. + + Returns a dictionary of alleles, each of which maps to a dictionary of + peptides that map to affinity values. 
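+
+    Worked example (hypothetical numbers, for illustration only): if the same
+    peptide occurs twice for an allele with normalized affinities 0.8
+    (weight 1.0) and 0.5 (weight 0.5), the collapsed value is
+    (0.8 * 1.0 + 0.5 * 0.5) / (1.0 + 0.5) = 0.7.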
+ """ + allele_to_peptide_to_affinity = {} + for (allele, dataset) in allele_datasets.items(): + multiple_affinities = defaultdict(list) + for (peptide, normalized_affinity, weight) in zip( + dataset.peptides, dataset.Y, dataset.weights): + multiple_affinities[peptide].append((normalized_affinity, weight)) + weighted_averages = {} + for peptide, affinity_weight_tuples in multiple_affinities.items(): + denom = 0.0 + sum_weighted_affinities = 0.0 + for affinity, weight in affinity_weight_tuples: + sum_weighted_affinities += affinity * weight + denom += weight + if denom > 0: + weighted_averages[peptide] = sum_weighted_affinities / denom + allele_to_peptide_to_affinity[allele] = weighted_averages + return allele_to_peptide_to_affinity diff --git a/mhcflurry/data_helpers.py b/mhcflurry/data_helpers.py deleted file mode 100644 index 9fd8834dba39c38e946b8f164d60e0482eb1c09c..0000000000000000000000000000000000000000 --- a/mhcflurry/data_helpers.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import ( - print_function, - division, - absolute_import, -) -from collections import namedtuple - -import pandas as pd -import numpy as np - -from .common import normalize_allele_name -from .amino_acid import amino_acid_letter_indices - -AlleleData = namedtuple("AlleleData", "X Y peptides ic50") - - -def hotshot_encoding(peptides, peptide_length): - """ - Encode a set of equal length peptides as a binary matrix, - where each letter is transformed into a length 20 vector with a single - element that is 1 (and the others are 0). - """ - shape = (len(peptides), peptide_length, 20) - X = np.zeros(shape, dtype=bool) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - k = amino_acid_letter_indices[amino_acid] - X[i, j, k] = 1 - return X - - -def index_encoding(peptides, peptide_length): - """ - Encode a set of equal length peptides as a vector of their - amino acid indices. - """ - X = np.zeros((len(peptides), peptide_length), dtype=int) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - X[i, j] = amino_acid_letter_indices[amino_acid] - return X - - -def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): - """ - Given an (n_samples, peptide_length) integer matrix - convert it to a binary encoding of shape: - (n_samples, peptide_length * n_indices) - """ - (n_samples, peptide_length) = X.shape - if not n_indices: - n_indices = X.max() - first_index_value + 1 - - X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) - for i, row in enumerate(X): - for j, xij in enumerate(row): - X_binary[i, n_indices * j + xij - first_index_value] = 1 - return X_binary.astype(float) - - -def _infer_csv_separator(filename): - """ - Determine if file is separated by comma, tab, or whitespace. - Default to whitespace if the others are not detected. 
- - Returns (sep, delim_whitespace) - """ - for candidate in [",", "\t"]: - with open(filename, "r") as f: - for line in f: - if line.startswith("#"): - continue - if candidate in line: - return candidate, False - return None, True - - -def load_dataframe( - filename, - peptide_length=None, - max_ic50=50000.0, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - peptide_length_column_name="peptide_length", - ic50_column_name="meas", - only_human=True): - """ - Load a dataframe of peptide-MHC affinity measurements - - filename : str - TSV filename with columns: - - 'species' - - 'mhc' - - 'peptide_length' - - 'sequence' - - 'meas' - - peptide_length : int, optional - Which length peptides to use (default=load all lengths) - - max_ic50 : float - Treat IC50 scores above this value as all equally bad - (transform them to 0.0 in the regression output) - - sep : str, optional - Separator in CSV file, default is to let Pandas infer - - peptide_column_name : str, optional - Default behavior is to try {"sequence", "peptide", "peptide_sequence"} - - only_human : bool - Only load entries from human MHC alleles - - Returns DataFrame augmented with extra columns: - - "log_ic50" : log(ic50) / log(max_ic50) - - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1] - """ - if sep is None: - sep, delim_whitespace = _infer_csv_separator(filename) - else: - delim_whitespace = False - df = pd.read_csv( - filename, - sep=sep, - delim_whitespace=delim_whitespace, - engine="c") - # hack: get around variability of column naming by checking if - # the peptide_column_name is actually present and if not try "peptide" - if peptide_column_name is None: - columns = set(df.keys()) - for candidate in ["sequence", "peptide", "peptide_sequence"]: - if candidate in columns: - peptide_column_name = candidate - break - if peptide_column_name is None: - raise ValueError( - "Couldn't find peptide column name, candidates: %s" % ( - columns)) - if only_human: - human_mask = df[species_column_name] == "human" - df = df[human_mask] - if peptide_length is not None: - length_mask = df[peptide_length_column_name] == peptide_length - df = df[length_mask] - - df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) - ic50 = np.array(df[ic50_column_name]) - log_ic50 = np.log(ic50) / np.log(max_ic50) - df["log_ic50"] = log_ic50 - regression_output = 1.0 - log_ic50 - # clamp to values between 0, 1 - regression_output = np.maximum(regression_output, 0.0) - regression_output = np.minimum(regression_output, 1.0) - df["regression_output"] = regression_output - return df - - -def load_allele_datasets( - filename, - peptide_length=9, - max_ic50=5000.0, - binary_encoding=True, - flatten_binary_encoding=True, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - peptide_length_column_name="peptide_length", - ic50_column_name="meas", - only_human=True): - """ - Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides - and log-transforms the IC50 measurement. 
Returns dictionary mapping allele - names to AlleleData objects (containing fields X, Y, ic50) - - Parameters - ---------- - filename : str - TSV filename with columns: - - 'species' - - 'mhc' - - 'peptide_length' - - 'sequence' - - 'meas' - - peptide_length : int - Which length peptides to use (default=9) - - max_ic50 : float - Treat IC50 scores above this value as all equally bad - (transform them to 0.0 in the rescaled output) - - binary_encoding : bool - Encode amino acids of each peptide as indices or binary vectors - - flatten_features : bool - If False, returns a (n_samples, peptide_length, 20) matrix, otherwise - returns the 2D flattened version of the same data. - - sep : str, optional - Separator in CSV file, default is to let Pandas infer - - peptide_column_name : str, optional - Default behavior is to try {"sequence", "peptide", "peptide_sequence"} - - only_human : bool - Only load entries from human MHC alleles - """ - df = load_dataframe( - filename=filename, - max_ic50=max_ic50, - sep=sep, - peptide_length=peptide_length, - species_column_name=species_column_name, - allele_column_name=allele_column_name, - peptide_column_name=peptide_column_name, - peptide_length_column_name=peptide_length_column_name, - ic50_column_name=ic50_column_name, - only_human=only_human) - - allele_groups = {} - for allele, group in df.groupby(allele_column_name): - ic50 = np.array(group[ic50_column_name]) - Y = np.array(group["regression_output"]) - peptides = list(group[peptide_column_name]) - if binary_encoding: - X = hotshot_encoding(peptides, peptide_length=peptide_length) - if flatten_binary_encoding: - # collapse 3D input into 2D matrix - X = X.reshape((X.shape[0], peptide_length * 20)) - else: - X = index_encoding(peptides, peptide_length=peptide_length) - assert allele not in allele_groups, \ - "Duplicate datasets for %s" % allele - allele_groups[allele] = AlleleData( - X=X, - Y=Y, - ic50=ic50, - peptides=peptides) - return allele_groups diff --git a/mhcflurry/ensemble.py b/mhcflurry/ensemble.py new file mode 100644 index 0000000000000000000000000000000000000000..a0373607d68a7680e2905c3b23d053a6fe0a89b6 --- /dev/null +++ b/mhcflurry/ensemble.py @@ -0,0 +1,27 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
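+
+"""
+Grouping of multiple trained models under a single name.
+
+`Ensemble.from_directory` is currently a stub: it lists the files in
+`directory_path` but does not yet construct any models from them.
+"""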
+ +import os + +class Ensemble(object): + def __init__(self, models, name=None): + self.name = name + self.models = models + + @classmethod + def from_directory(cls, directory_path): + files = os.listdir(directory_path) + + + diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py index 7f09c73def7612508d33f44981663b7f8aa785f9..f2c7876bc520d6f6513b4922335293fba07aabfe 100644 --- a/mhcflurry/feedforward.py +++ b/mhcflurry/feedforward.py @@ -128,9 +128,10 @@ def make_hotshot_network( output_activation="sigmoid", dropout_probability=0.0, optimizer=None, - learning_rate=0.001): + learning_rate=0.001, + n_amino_acids=20): return make_network( - input_size=peptide_length * 20, + input_size=peptide_length * n_amino_acids, layer_sizes=layer_sizes, activation=activation, init=init, diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py deleted file mode 100644 index ffde0cbeeb04c00b58735b33fd8dc3b18f9bb17b..0000000000000000000000000000000000000000 --- a/mhcflurry/mhc1_binding_predictor.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Allele specific MHC Class I binding affinity predictor -""" -from __future__ import ( - print_function, - division, - absolute_import, -) -from os import listdir -from os.path import exists, join -from itertools import groupby -import json - -import numpy as np -import pandas as pd -from keras.models import model_from_config - -from .class1_allele_specific_hyperparameters import MAX_IC50 -from .data_helpers import index_encoding, normalize_allele_name -from .paths import CLASS1_MODEL_DIRECTORY -from .common import expand_9mer_peptides - -_allele_model_cache = {} - - -class Mhc1BindingPredictor(object): - def __init__( - self, - allele, - model_directory=CLASS1_MODEL_DIRECTORY, - max_ic50=MAX_IC50): - self.max_ic50 = max_ic50 - if not exists(model_directory) or len(listdir(model_directory)) == 0: - raise ValueError( - "No MHC prediction models found in %s" % (model_directory,)) - original_allele_name = allele - allele = self.allele = normalize_allele_name(allele) - if self.allele not in _allele_model_cache: - json_filename = self.allele + ".json" - json_path = join(model_directory, json_filename) - if not exists(json_path): - raise ValueError("Unsupported allele: %s" % ( - original_allele_name,)) - - hdf_filename = self.allele + ".hdf" - hdf_path = join(model_directory, hdf_filename) - - if not exists(hdf_path): - raise ValueError("Missing model weights for allele %s" % ( - original_allele_name,)) - - with open(json_path, "r") as f: - config_dict = json.load(f) - self.model = model_from_config(config_dict) - self.model.load_weights(hdf_path) - _allele_model_cache[self.allele] = self.model - self.model.compile(loss="mse", optimizer="rmsprop") - else: - self.model = _allele_model_cache[self.allele] - - def __repr__(self): - return "Mhc1BindingPredictor(allele=%s, model_directory=%s)" % ( - self.allele, - self.model_directory) - - def 
__str__(self): - return repr(self) - - def _log_to_ic50(self, log_value): - """ - Convert neural network output to IC50 values between 0.0 and - self.max_ic50 (typically 5000, 20000 or w0) - """ - return self.max_ic50 ** (1.0 - log_value) - - def _predict_9mer_peptides(self, peptides): - """ - Predict binding affinity for 9mer peptides - """ - if any(len(peptide) != 9 for peptide in peptides): - raise ValueError("Can only predict 9mer peptides") - X = index_encoding(peptides, peptide_length=9) - return self.model.predict(X, verbose=False).flatten() - - def _predict_9mer_peptides_ic50(self, peptides): - log_y = self._predict_9mer_peptides(peptides) - return self._log_to_ic50(log_y) - - def predict_peptides(self, peptides): - column_names = [ - "Allele", - "Peptide", - "Prediction", - ] - results = {} - for column_name in column_names: - results[column_name] = [] - - for length, group_peptides in groupby(peptides, lambda x: len(x)): - group_peptides = list(group_peptides) - expanded_peptides = expand_9mer_peptides(group_peptides, length) - n_group = len(group_peptides) - n_expanded = len(expanded_peptides) - expansion_factor = int(n_expanded / n_group) - raw_y = self._predict_9mer_peptides(expanded_peptides) - median_y = np.zeros(n_group) - # take the median of each group of log(IC50) values - for i in range(n_group): - start = i * expansion_factor - end = (i + 1) * expansion_factor - median_y[i] = np.median(raw_y[start:end]) - ic50 = self._log_to_ic50(median_y) - results["Allele"].extend([self.allele] * n_group) - results["Peptide"].extend(group_peptides) - results["Prediction"].extend(ic50) - return pd.DataFrame(results, columns=column_names) diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..d07a41b8407179e62e4d8aebd39784be4c62b6b6 --- /dev/null +++ b/mhcflurry/peptide_encoding.py @@ -0,0 +1,299 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging + +import numpy as np + +from .amino_acid import common_amino_acids, amino_acids_with_unknown + +common_amino_acid_letters = common_amino_acids.letters() + + +def all_kmers(k, alphabet=common_amino_acid_letters): + """ + Generates all k-mer peptide sequences + + Parameters + ---------- + k : int + + alphabet : str | list of characters + """ + alphabets = [alphabet] * k + return [ + "".join(combination) + for combination + in itertools.product(*alphabets) + ] + + +class CombinatorialExplosion(Exception): + pass + + +def extend_peptide( + peptide, + desired_length, + start_offset, + end_offset, + insert_amino_acid_letters=common_amino_acid_letters): + """Extend peptide by inserting every possible amino acid combination + if we're trying to e.g. turn an 8mer into 9mers. + + Parameters + ---------- + peptide : str + + desired_length : int + + start_offset : int + How many characters (from the position before the start of the string) + to skip before inserting characters. 
+
+
+    end_offset : int
+        Last character position from the end where we insert new characters,
+        where 0 is the position after the last character.
+
+    insert_amino_acid_letters : str | list of characters
+    """
+    n = len(peptide)
+    assert n < desired_length, \
+        "%s (length = %d) is too long! Must be shorter than %d" % (
+            peptide, n, desired_length)
+    n_missing = desired_length - n
+    if n_missing > 3:
+        raise CombinatorialExplosion(
+            "Cannot transform %s of length %d into a %d-mer peptide" % (
+                peptide, n, desired_length))
+    return [
+        peptide[:i] + extra + peptide[i:]
+        for i in range(start_offset, n - end_offset + 1)
+        for extra in all_kmers(
+            n_missing,
+            alphabet=insert_amino_acid_letters)
+    ]
+
+
+def shorten_peptide(
+        peptide,
+        desired_length,
+        start_offset,
+        end_offset,
+        insert_amino_acid_letters=common_amino_acid_letters):
+    """Shorten peptide if trying to e.g. turn a 10mer into 9mers
+
+    Parameters
+    ----------
+
+    peptide : str
+
+    desired_length : int
+
+    start_offset : int
+
+    end_offset : int
+
+    insert_amino_acid_letters : str | list of characters
+    """
+    n = len(peptide)
+    assert n > desired_length, \
+        "%s (length = %d) is too short! Must be longer than %d" % (
+            peptide, n, desired_length)
+    n_skip = n - desired_length
+    assert n_skip > 0, \
+        "Expected length of peptide %s %d to be greater than %d" % (
+            peptide, n, desired_length)
+    end_range = n - end_offset - n_skip + 1
+    return [
+        peptide[:i] + peptide[i + n_skip:]
+        for i in range(start_offset, end_range)
+    ]
+
+
+def fixed_length_from_many_peptides(
+        peptides,
+        desired_length,
+        start_offset_extend=2,
+        end_offset_extend=1,
+        start_offset_shorten=2,
+        end_offset_shorten=0,
+        insert_amino_acid_letters=common_amino_acid_letters):
+    """
+    Create a set of fixed-length k-mer peptides from a collection of varying
+    length peptides.
+
+    Shorter peptides are filled in using all possible amino acids at any
+    insertion site between start_offset_extend and -end_offset_extend,
+    where start_offset_extend=0 represents insertions before the string
+    and end_offset_extend=0 represents insertions after the string's ending.
+
+    Longer peptides are shortened by deleting contiguous residues, starting
+    from start_offset_shorten and ending with -end_offset_shorten. Unlike
+    peptide extensions, the offsets for shortening a peptide range between
+    the first and last positions (rather than between the positions *before*
+    the string starts and the position *after*).
+
+    We can recreate the methods from:
+    Accurate approximation method for prediction of class I MHC
+    affinities for peptides of length 8, 10 and 11 using prediction
+    tools trained on 9mers.
+    by Lundegaard et al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
+    with the following settings:
+        - desired_length = 9
+        - start_offset_extend = 3
+        - end_offset_extend = 2
+        - start_offset_shorten = 3
+        - end_offset_shorten = 1
+
+    Returns three lists:
+        - a list of fixed length peptides (all of length `desired_length`)
+        - a list of the original peptides from which subsequences were
+          contracted or lengthened
+        - a list of integers indicating the number of fixed length peptides
+          generated from each variable length peptide.
+ + Example: + kmers, original, counts = fixed_length_from_many_peptides( + peptides=["ABC", "A"] + desired_length=2, + start_offset_extend=0, + end_offset_extend=0, + start_offset_shorten=0, + end_offset_shorten=0, + insert_amino_acid_letters="ABC") + kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"] + original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"] + counts == [3, 3, 3, 6, 6, 6, 6, 6, 6] + + Parameters + ---------- + peptide : list of str + + desired_length : int + + start_offset_extend : int + + end_offset_extend : int + + start_offset_shorten : int + + end_offset_shorten : int + + insert_amino_acid_letters : str | list of characters + """ + all_fixed_length_peptides = [] + original_peptide_sequences = [] + counts = [] + for peptide in peptides: + n = len(peptide) + if n == desired_length: + fixed_length_peptides = [peptide] + elif n < desired_length: + try: + fixed_length_peptides = extend_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_extend, + end_offset=end_offset_extend, + insert_amino_acid_letters=insert_amino_acid_letters) + except CombinatorialExplosion: + logging.warn( + "Peptide %s is too short to be extended to length %d" % ( + peptide, desired_length)) + continue + else: + fixed_length_peptides = shorten_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_shorten, + end_offset=end_offset_shorten, + insert_amino_acid_letters=insert_amino_acid_letters) + + n_fixed_length = len(fixed_length_peptides) + all_fixed_length_peptides.extend(fixed_length_peptides) + original_peptide_sequences.extend([peptide] * n_fixed_length) + counts.extend([n_fixed_length] * n_fixed_length) + return all_fixed_length_peptides, original_peptide_sequences, counts + + +def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): + """ + Given an (n_samples, peptide_length) integer matrix + convert it to a binary encoding of shape: + (n_samples, peptide_length * n_indices) + """ + (n_samples, peptide_length) = X.shape + if not n_indices: + n_indices = X.max() - first_index_value + 1 + X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) + for i, row in enumerate(X): + for j, xij in enumerate(row): + X_binary[i, n_indices * j + xij - first_index_value] = 1 + return X_binary.astype(float) + + +def fixed_length_index_encoding( + peptides, + desired_length, + start_offset_shorten=0, + end_offset_shorten=0, + start_offset_extend=0, + end_offset_extend=0, + allow_unknown_amino_acids=False): + """ + Take peptides of varying lengths, chop them into substrings of fixed + length and apply index encoding to these substrings. + + If a string is longer than the desired length, then it's reduced to + the desired length by deleting characters at all possible positions. When + positions at the start or end of a string should be exempt from deletion + then the number of exempt characters can be controlled via + `start_offset_shorten` and `end_offset_shorten`. + + If a string is shorter than the desired length then it is filled + with all possible characters of the alphabet at all positions. The + parameters `start_offset_extend` and `end_offset_extend` control whether + certain positions are excluded from insertion. The positions are + in a "inter-residue" coordinate system, where `start_offset_extend` = 0 + refers to the position *before* the start of a peptide and, similarly, + `end_offset_extend` = 0 refers to the position *after* the peptide. 
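+
+    For example (illustration only): encoding an 8mer with desired_length=9
+    and allow_unknown_amino_acids=True inserts the unknown residue "X" at
+    each of the 9 possible positions, yielding 9 feature vectors that all
+    point back to the same original peptide and each carry a count of 9.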
+
+    Returns feature matrix X, a list of original peptides for each feature
+    vector, and a list of integer counts indicating how many rows share a
+    particular original peptide. When two rows are expanded out of a single
+    original peptide, they will both have a count of 2. These counts can
+    be useful for down-weighting the importance of multiple feature vectors
+    which originate from the same sample.
+    """
+    if allow_unknown_amino_acids:
+        insert_letters = ["X"]
+        index_encoding = amino_acids_with_unknown.index_encoding
+    else:
+        insert_letters = common_amino_acid_letters
+        index_encoding = common_amino_acids.index_encoding
+
+    fixed_length, original, counts = fixed_length_from_many_peptides(
+        peptides=peptides,
+        desired_length=desired_length,
+        start_offset_shorten=start_offset_shorten,
+        end_offset_shorten=end_offset_shorten,
+        start_offset_extend=start_offset_extend,
+        end_offset_extend=end_offset_extend,
+        insert_amino_acid_letters=insert_letters)
+    X = index_encoding(fixed_length, desired_length)
+    return X, original, counts
diff --git a/mhcflurry/predict.py b/mhcflurry/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aef5dcafb05b51b0126cfdf728fef435984c040
--- /dev/null
+++ b/mhcflurry/predict.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import OrderedDict
+
+import pandas as pd
+
+from .paths import CLASS1_MODEL_DIRECTORY
+from .mhc1_binding_predictor import Mhc1BindingPredictor
+
+def predict(alleles, peptides):
+    allele_dataframes = OrderedDict()
+    for allele in alleles:
+        model = Mhc1BindingPredictor(allele=allele)
+        result_df = model.predict_peptides(peptides)
+        allele_dataframes[allele] = result_df
+    return pd.concat(list(allele_dataframes.values()))
diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9939f55361a56af08458f4586a98bf463d0f33cc
--- /dev/null
+++ b/mhcflurry/predictor_base.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
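+
+"""
+Shared behavior for all predictors: converting normalized network outputs
+back into IC50 values (`log_to_ic50`) and expanding variable-length peptides
+into fixed-length 9mer index encodings before prediction (`encode_peptides`).
+"""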
+ +from collections import defaultdict + +import numpy as np + +from .peptide_encoding import fixed_length_index_encoding +from .amino_acid import ( + amino_acids_with_unknown, + common_amino_acids +) + + +class PredictorBase(object): + """ + Base class for all mhcflurry predictors (including the Ensemble class) + """ + + def __init__( + self, + name, + max_ic50, + allow_unknown_amino_acids, + verbose): + self.name = name + self.max_ic50 = max_ic50 + self.allow_unknown_amino_acids = allow_unknown_amino_acids + self.verbose = verbose + + def log_to_ic50(self, log_value): + """ + Convert neural network output to IC50 values between 0.0 and + self.max_ic50 (typically 5000, 20000 or 50000) + """ + return self.max_ic50 ** (1.0 - log_value) + + def encode_9mer_peptides(self, peptides): + if self.allow_unknown_amino_acids: + return amino_acids_with_unknown.index_encoding(peptides, 9) + else: + return common_amino_acids.index_encoding(peptides, 9) + + def encode_peptides(self, peptides): + """ + Parameters + ---------- + peptides : str list + Peptide strings of any length + + Encode peptides of any length into fixed length vectors. + Returns 2d array of encoded peptides and 1d array indicating the + original peptide index for each row. + """ + indices = [] + encoded_matrices = [] + for i, peptide in enumerate(peptides): + matrix, _, _ = fixed_length_index_encoding( + peptides=[peptide], + desired_length=9, + allow_unknown_amino_acids=self.allow_unknown_amino_acids) + encoded_matrices.append(matrix) + indices.extend([i] * len(matrix)) + combined_matrix = np.concatenate(encoded_matrices) + index_array = np.array(indices) + expected_shape = (len(index_array), 9) + assert combined_matrix.shape == expected_shape, \ + "Expected shape %s but got %s" % (expected_shape, combined_matrix.shape) + return combined_matrix, index_array + + def predict_9mer_peptides(self, peptides): + """ + Predict binding affinity for 9mer peptides + """ + if any(len(peptide) != 9 for peptide in peptides): + raise ValueError("Can only predict 9mer peptides") + X, _ = self.encode_peptides(peptides) + return self.predict_encoded(X) + + def predict_9mer_peptides_ic50(self, peptides): + return self.log_to_ic50(self.predict_9mer_peptides(peptides)) + + def predict_peptides_ic50(self, peptides): + """ + Predict IC50 affinities for peptides of any length + """ + return self.log_to_ic50( + self.predict_peptides(peptides)) + + def predict_encoded(self, X): + raise ValueError("Not yet implemented for %s!" % ( + self.__class__.__name__,)) + + def predict_peptides( + self, + peptides, + combine_fn=np.mean): + """ + Given a list of peptides of any length, returns an array of predicted + normalized affinity values. Unlike IC50, a higher value here + means a stronger affinity. Peptides of lengths other than 9 are + transformed into a set of 9mers either by deleting or inserting + amino acid characters. The prediction for a single peptide will be + the average of expanded 9mers. 
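+
+        For example (illustration only), an 8mer that expands into several
+        9mers is reported as the mean of those 9mer predictions under the
+        default combine_fn=np.mean, while combine_fn=np.max would instead
+        keep the strongest predicted value.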
+ """ + input_matrix, original_peptide_indices = self.encode_peptides(peptides) + # non-9mer peptides get multiple predictions, which are then combined + # with the combine_fn argument + multiple_predictions_dict = defaultdict(list) + fixed_length_predictions = self.predict_encoded(input_matrix) + for i, yi in enumerate(fixed_length_predictions): + original_peptide_index = original_peptide_indices[i] + original_peptide = peptides[original_peptide_index] + multiple_predictions_dict[original_peptide].append(yi) + + combined_predictions_dict = { + p: combine_fn(ys) + for (p, ys) in multiple_predictions_dict.items() + } + return np.array([combined_predictions_dict[p] for p in peptides]) diff --git a/mhcflurry/score_set.py b/mhcflurry/score_set.py new file mode 100644 index 0000000000000000000000000000000000000000..5f833a3c8715eb0e444b70ebd8ad414365da1f3e --- /dev/null +++ b/mhcflurry/score_set.py @@ -0,0 +1,104 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import ( + print_function, + division, + absolute_import, +) +from collections import OrderedDict + +import numpy as np + + +class ScoreSet(object): + """ + Useful for keeping a collection of score dictionaries + which map name->score type->list of values. 
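+
+    Example usage (illustrative only, names are made up):
+
+        scores = ScoreSet(index="allele")
+        scores.add_many("HLA-A0201", auc=0.93, f1=0.81)
+        scores.add_many("HLA-A0201", auc=0.90, f1=0.78)
+        scores.to_csv("scores.csv")  # one row per group, mean and std per score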
+ """ + def __init__(self, verbose=True, index="name"): + self.groups = {} + self.verbose = verbose + if isinstance(index, (list, tuple)): + index = ",".join("%s" % item for item in index) + self.index = index + + def add_many(self, group, **kwargs): + for (k, v) in sorted(kwargs.items()): + self.add(group, k, v) + + def add(self, group, score_type, value): + if isinstance(group, (list, tuple)): + group = ",".join("%s" % item for item in group) + if group not in self.groups: + self.groups[group] = {} + if score_type not in self.groups[group]: + self.groups[group][score_type] = [] + self.groups[group][score_type].append(value) + if self.verbose: + print("--> %s:%s %0.4f" % (group, score_type, value)) + + def score_types(self): + result = set([]) + for (g, d) in sorted(self.groups.items()): + for score_type in sorted(d.keys()): + result.add(score_type) + return list(sorted(result)) + + def _reduce_scores(self, reduce_fn): + score_types = self.score_types() + return { + group: + OrderedDict([ + (score_type, reduce_fn(score_dict[score_type])) + for score_type + in score_types + ]) + for (group, score_dict) + in self.groups.items() + } + + def averages(self): + return self._reduce_scores(np.mean) + + def stds(self): + return self._reduce_scores(np.std) + + def to_csv(self, filename): + with open(filename, "w") as f: + header_list = [self.index] + score_types = self.score_types() + for score_type in score_types: + header_list.append(score_type) + header_list.append(score_type + "_std") + + header_line = ",".join(header_list) + "\n" + if self.verbose: + print(header_line) + f.write(header_line) + + score_averages = self.averages() + score_stds = self.stds() + + for name in sorted(score_averages.keys()): + line_elements = [name] + for score_type in score_types: + line_elements.append( + "%0.4f" % score_averages[name][score_type]) + line_elements.append( + "%0.4f" % score_stds[name][score_type]) + line = ",".join(line_elements) + "\n" + if self.verbose: + print(line) + f.write(line) diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py index 486e8fe3ea90503cafe4c5c0fd9864ce53096196..7d7e5fc2fe826a7fd30795e0f9d2fbf96f4eebc4 100755 --- a/scripts/create-combined-class1-dataset.py +++ b/scripts/create-combined-class1-dataset.py @@ -29,33 +29,39 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME) parser = argparse.ArgumentParser() -parser.add_argument("--ic50-fraction-tolerance", +parser.add_argument( + "--ic50-fraction-tolerance", default=0.01, type=float, help=( "How much can the IEDB and NetMHCpan IC50 differ and still be" " considered compatible (as a fraction of the NetMHCpan value)")) -parser.add_argument("--min-assay-overlap-size", +parser.add_argument( + "--min-assay-overlap-size", type=int, default=1, help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data") -parser.add_argument("--min-assay-fraction-same", +parser.add_argument( + "--min-assay-fraction-same", type=float, help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data", default=0.9) -parser.add_argument("--iedb-pickle-path", +parser.add_argument( + "--iedb-pickle-path", default=IEDB_PICKLE_PATH, help="Path to .pickle file containing dictionary of IEDB assay datasets") -parser.add_argument("--netmhcpan-csv-path", +parser.add_argument( + "--netmhcpan-csv-path", default=PETERS_CSV_PATH, help="Path to CSV with NetMHCpan dataset from 2013 Peters paper") -parser.add_argument("--output-csv-path", +parser.add_argument( + 
"--output-csv-path", default=CLASS1_DATA_CSV_PATH, help="Path to CSV of combined assay results") @@ -137,7 +143,7 @@ if __name__ == "__main__": for (allele, count) in new_allele_counts.most_common(): print("%s: %d" % (allele, count)) print("Combined DataFrame size: %d (+%d)" % ( - len(combined_df), - len(combined_df) - len(nielsen_data))) + len(combined_df), + len(combined_df) - len(nielsen_data))) print("Writing %s..." % args.output_csv_path) combined_df.to_csv(args.output_csv_path, index=False) diff --git a/scripts/mhcflurry-class1-web-server.py b/scripts/mhcflurry-class1-web-server.py index 4fe36f34f444786e24bf8170decc851c0ec83cdc..128fe452c1f115e350ab7805afcb1e795f56a786 100755 --- a/scripts/mhcflurry-class1-web-server.py +++ b/scripts/mhcflurry-class1-web-server.py @@ -29,6 +29,7 @@ from mhcflurry.common import ( ) from mhcflurry.class1 import predict, supported_alleles + parser = argparse.ArgumentParser() parser.add_argument("--host", default="0.0.0.0") @@ -52,6 +53,7 @@ def get_binding_value(): return "ERROR: %s" % e.args[0] return result_df.to_csv(sep="\t", index=False, float_format="%0.4f") + @get('/alleles') def get_supported_alleles(): peptide_lengths = "8,9,10,11,12" @@ -61,6 +63,7 @@ def get_supported_alleles(): ] return "\n".join(strings) + if __name__ == "__main__": args = parser.parse_args() - run(host=args.host, port=args.port, debug=args.debug) + run(host=args.host, port=args.port, debug=args.debug, server="cherrypy") diff --git a/setup.py b/setup.py index f502b92b81f7b25d2b2d3e4fb4353ed39ee3f772..334c4e4818fd0a02682c499c71f89fdad3edb199 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,8 @@ if __name__ == '__main__': 'appdirs', 'theano', 'keras', + # using for multi-threaded web server + 'cherrypy' ], long_description=readme, packages=['mhcflurry'], diff --git a/test/test_class1_binding_predictor.py b/test/test_class1_binding_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..c824ce1a8bcf39df50d0611405d4d9c92e251f58 --- /dev/null +++ b/test/test_class1_binding_predictor.py @@ -0,0 +1,115 @@ +import numpy as np + +from mhcflurry import Class1BindingPredictor + + +class Dummy9merIndexEncodingModel(object): + """ + Dummy molde used for testing the pMHC binding predictor. 
+    """
+    def predict(self, X, verbose=False):
+        assert isinstance(X, np.ndarray)
+        assert len(X.shape) == 2
+        n_rows, n_cols = X.shape
+        assert n_cols == 9, "Expected 9mer index input, got %d columns" % (
+            n_cols,)
+        return np.zeros(n_rows, dtype=float)
+
+
+def test_always_zero_9mer_inputs():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    test_9mer_peptides = [
+        "SIISIISII",
+        "AAAAAAAAA",
+    ]
+
+    n_expected = len(test_9mer_peptides)
+    y = predictor.predict_peptides(test_9mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    # call the predict method for 9mers directly
+    y = predictor.predict_9mer_peptides(test_9mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    ic50 = predictor.predict_peptides_ic50(test_9mer_peptides)
+    assert len(ic50) == n_expected
+    assert np.all(ic50 == predictor.max_ic50), ic50
+
+
+def test_always_zero_8mer_inputs():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    test_8mer_peptides = [
+        "SIISIISI",
+        "AAAAAAAA",
+    ]
+
+    n_expected = len(test_8mer_peptides)
+    y = predictor.predict_peptides(test_8mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    ic50 = predictor.predict_peptides_ic50(test_8mer_peptides)
+    assert len(ic50) == n_expected
+    assert np.all(ic50 == predictor.max_ic50), ic50
+
+
+def test_always_zero_10mer_inputs():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    test_10mer_peptides = [
+        "SIISIISIYY",
+        "AAAAAAAAYY",
+    ]
+
+    n_expected = len(test_10mer_peptides)
+    y = predictor.predict_peptides(test_10mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    ic50 = predictor.predict_peptides_ic50(test_10mer_peptides)
+    assert len(ic50) == n_expected
+    assert np.all(ic50 == predictor.max_ic50), ic50
+
+
+def test_encode_peptides_9mer():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    X = predictor.encode_9mer_peptides(["AAASSSYYY"])
+    assert X.shape[0] == 1, X.shape
+    assert X.shape[1] == 9, X.shape
+
+    X, indices = predictor.encode_peptides(["AAASSSYYY"])
+    assert len(indices) == 1
+    assert indices[0] == 0
+    assert X.shape[0] == 1, X.shape
+    assert X.shape[1] == 9, X.shape
+
+
+def test_encode_peptides_8mer():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    X, indices = predictor.encode_peptides(["AAASSSYY"])
+    assert len(indices) == 9
+    assert (indices == 0).all()
+    assert X.shape[0] == 9, (X.shape, X)
+    assert X.shape[1] == 9, (X.shape, X)
+
+
+def test_encode_peptides_10mer():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    X, indices = predictor.encode_peptides(["AAASSSYYFF"])
+    assert len(indices) == 10
+    assert (indices == 0).all()
+    assert X.shape[0] == 10, (X.shape, X)
+    assert X.shape[1] == 9, (X.shape, X)
diff --git a/test/test_fixed_length_peptides.py b/test/test_fixed_length_peptides.py
new file mode 100644
index 0000000000000000000000000000000000000000..733ef8a8f49737762df93479457c9b38399a5c7d
--- /dev/null
+++ b/test/test_fixed_length_peptides.py
@@ -0,0 +1,109 @@
+from mhcflurry.peptide_encoding import (
+    all_kmers,
+    extend_peptide,
+    shorten_peptide,
+    fixed_length_from_many_peptides,
+)
+from nose.tools import eq_
+
+
+def test_all_kmers():
+    kmers = all_kmers(2, alphabet=["A", "B"])
+
assert len(kmers) == 4, kmers + eq_(set(kmers), {"AA", "AB", "BA", "BB"}) + + +def test_all_kmers_string_alphabet(): + kmers = all_kmers(2, alphabet="AB") + assert len(kmers) == 4, kmers + eq_(set(kmers), {"AA", "AB", "BA", "BB"}) + + +def test_extend_peptide_all_positions(): + # insert 0 or 1 at every position + results = extend_peptide( + "111", + desired_length=4, + start_offset=0, + end_offset=0, + insert_amino_acid_letters="01") + + expected = [ + "0111", + "1111", + "1011", + "1111", + "1101", + "1111", + "1110", + "1111", + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=0, + end_offset=0, + insert_amino_acid_letters="012") + + expected = [ + "12", + "02", + "01" + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions_except_first(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=1, + end_offset=0, + insert_amino_acid_letters="012") + + expected = [ + "02", + "01", + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions_except_last(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=0, + end_offset=1, + insert_amino_acid_letters="012") + + expected = [ + "12", + "02", + ] + eq_(results, expected) + + +def test_fixed_length_from_many_peptides(): + kmers, original, counts = fixed_length_from_many_peptides( + peptides=["ABC", "A"], + desired_length=2, + start_offset_extend=0, + end_offset_extend=0, + start_offset_shorten=0, + end_offset_shorten=0, + insert_amino_acid_letters="ABC") + print(kmers) + print(original) + print(counts) + eq_(len(kmers), len(original)) + eq_(len(kmers), len(counts)) + eq_(kmers, ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]) + eq_(original, ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]) + eq_(counts, [3, 3, 3, 6, 6, 6, 6, 6, 6])
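+
+
+def test_fixed_length_from_many_peptides_exact_length():
+    # Illustrative extra case (assumed behavior, based on the implementation
+    # above): peptides that already have the desired length pass through
+    # unchanged with a count of 1.
+    kmers, original, counts = fixed_length_from_many_peptides(
+        peptides=["AB"],
+        desired_length=2,
+        insert_amino_acid_letters="AB")
+    eq_(kmers, ["AB"])
+    eq_(original, ["AB"])
+    eq_(counts, [1])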