From 8c83236e33d17455b1d2a7f5d7479c9d2bb5b33b Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Fri, 20 Nov 2015 18:43:31 -0500 Subject: [PATCH] moving more flexible fixed length peptide helpers into main repo from experiments --- experiments/allele-similarities.py | 244 ++++++++++++++++++++ experiments/compute-allele-similarities.py | 216 ----------------- experiments/synthesize-augmented-dataset.py | 173 ++++++++++++++ mhcflurry/__init__.py | 4 +- mhcflurry/common.py | 35 --- mhcflurry/data_helpers.py | 39 ++++ mhcflurry/fixed_length_peptides.py | 182 +++++++++++++++ mhcflurry/mhc1_binding_predictor.py | 7 +- mhcflurry/peptide_encoding.py | 80 +++++++ 9 files changed, 725 insertions(+), 255 deletions(-) create mode 100644 experiments/allele-similarities.py delete mode 100644 experiments/compute-allele-similarities.py create mode 100644 experiments/synthesize-augmented-dataset.py create mode 100644 mhcflurry/fixed_length_peptides.py create mode 100644 mhcflurry/peptide_encoding.py diff --git a/experiments/allele-similarities.py b/experiments/allele-similarities.py new file mode 100644 index 00000000..b75de847 --- /dev/null +++ b/experiments/allele-similarities.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +from fancyimpute import ConvexSolver +import numpy as np +import mhcflurry +import seaborn + +from dataset_paths import PETERS2009_CSV_PATH + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--min-overlap-weight", + default=1.0, + help="Minimum overlap weight between pair of alleles", + type=float) + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--only-human", + default=False, + action="store_true") + +parser.add_argument( + "--raw-similarities-output-path", + help="CSV file which contains incomplete allele similarities") + +parser.add_argument( + "--complete-similarities-output-path", + help="CSV file which contains allele similarities after matrix completion") + +parser.add_argument( + "--raw-heatmap-output-path", + help="PNG file to save heatmap of incomplete similarities matrix") + +parser.add_argument( + "--complete-heatmap-output-path", + help="PNG file to save heatmap of complete similarities matrix") + + +def compute_similarities( + allele_groups, + min_weight=1.0, + allele_name_length=None): + sims = {} + overlaps = {} + weights = {} + for a, da in allele_groups.items(): + if allele_name_length and len(a) != allele_name_length: + continue + peptide_set_a = set(da.keys()) + for b, db in allele_groups.items(): + if allele_name_length and len(b) != allele_name_length: + continue + peptide_set_b = set(db.keys()) + intersection = peptide_set_a.intersection(peptide_set_b) + overlaps[(a, b)] = len(intersection) + total = 0.0 + weight = 0.0 + for peptide in intersection: + ya = da[peptide] + yb = db[peptide] + minval = min(ya, yb) + maxval = max(ya, yb) + total += minval + weight += maxval + weights[(a, b)] = weight + if weight > min_weight: + sims[(a, b)] = total / weight + return sims, overlaps, weights + + +def save_heatmap(matrix, allele_names, filename): + seaborn.set_context("paper") + + with seaborn.plotting_context(font_scale=1): + figure = seaborn.plt.figure(figsize=(20, 18)) + ax = figure.add_axes() + seaborn.heatmap( + data=matrix, + xticklabels=allele_names, + yticklabels=allele_names, + linewidths=0, + annot=False, + ax=ax, + fmt=".2g") + figure.savefig(filename) + + +def save_csv(filename, sims, overlap_counts, overlap_weights): + with open(filename, "w") as f: + f.write("allele_A,allele_B,similarity,count,weight\n") + for (a, b), s in sorted(sims.items()): + count = overlap_counts.get((a, b), 0) + weight = overlap_weights.get((a, b), 0.0) + f.write("%s,%s,%0.4f,%d,%0.4f\n" % (a, b, s, count, weight)) + + +def print_dataset_sizes(allele_groups): + print("---\nDataset Sizes\n---") + for (allele_name, g) in sorted(allele_groups.items()): + print("%s: total=%d, 8mer=%d, 9mer=%d, 10mer=%d, 11mer=%d" % ( + allele_name, + len(g), + sum(len(k) == 8 for k in g.keys()), + sum(len(k) == 9 for k in g.keys()), + sum(len(k) == 10 for k in g.keys()), + sum(len(k) == 11 for k in g.keys()), + )) + print("---") + + +def build_incomplete_similarity_matrix(allele_groups, sims): + allele_list = list(sorted(allele_groups.keys())) + allele_order = { + allele_name: i + for (i, allele_name) in enumerate(sorted(allele_groups.keys())) + } + n_alleles = len(allele_list) + sims_incomplete_matrix = np.zeros((n_alleles, n_alleles), dtype=float) + + for a, ai in allele_order.items(): + for b, bi in allele_order.items(): + # if allele pair is missing from similarities dict then + # fill its slot with NaN, indicating missing data + 
sims_incomplete_matrix[ai, bi] = sims.get((a, b), np.nan) + return allele_list, allele_order, sims_incomplete_matrix + + +def matrix_to_dictionary(sims, allele_list): + sims_dict = {} + for i in range(sims.shape[0]): + a = allele_list[i] + for j in range(sims.shape[1]): + b = allele_list[j] + sims_dict[a, b] = sims[i, j] + return sims_dict + + +def fill_in_similarities( + raw_sims_dict, + allele_datasets, + raw_sims_heatmap_path=None, + complete_sims_heatmap_path=None): + allele_list, allele_order, sims_matrix = build_incomplete_similarity_matrix( + allele_datasets, + sims=raw_sims_dict) + + if raw_sims_heatmap_path: + save_heatmap( + sims_matrix, + allele_list, + raw_sims_heatmap_path) + + print("Completing %s similarities matrix with %d missing entries" % ( + sims_matrix.shape, + np.isnan(sims_matrix).sum())) + + solver = ConvexSolver( + require_symmetric_solution=True, + min_value=0.0, + max_value=1.0, + error_tolerance=0.0001) + + sims_complete_matrix = solver.complete(sims_matrix) + + if complete_sims_heatmap_path: + save_heatmap( + sims_complete_matrix, + allele_list, + complete_sims_heatmap_path) + + complete_sims_dict = matrix_to_dictionary( + sims=sims_complete_matrix, + allele_list=allele_list) + return complete_sims_dict + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + df = mhcflurry.data_helpers.load_dataframe( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human) + allele_groups = { + allele_name: { + row["sequence"]: row["regression_output"] + for (_, row) in group.iterrows() + } + for (allele_name, group) in df.groupby("mhc") + } + print_dataset_sizes(allele_groups) + + raw_sims_dict, overlap_counts, overlap_weights = compute_similarities( + allele_groups, + min_weight=args.min_overlap_weight) + + if args.raw_similarities_output_path: + save_csv( + args.raw_similarities_output_path, + raw_sims_dict, + overlap_counts, + overlap_weights) + + complete_sims_dict = fill_in_similarities( + raw_sims_dict=raw_sims_dict, + allele_datasets=allele_groups, + raw_sims_heatmap_path=args.raw_heatmap_output_path, + complete_sims_heatmap_path=args.complete_heatmap_output_path) + + print("-- Added %d/%d allele similarities" % ( + len(complete_sims_dict) - len(raw_sims_dict), + len(complete_sims_dict))) + + if args.complete_similarities_output_path: + save_csv( + args.complete_similarities_output_path, + complete_sims_dict, + overlap_counts, + overlap_weights) diff --git a/experiments/compute-allele-similarities.py b/experiments/compute-allele-similarities.py deleted file mode 100644 index 290e4a7a..00000000 --- a/experiments/compute-allele-similarities.py +++ /dev/null @@ -1,216 +0,0 @@ -import argparse - -import fancyimpute -import numpy as np -import mhcflurry - -from dataset_paths import PETERS2009_CSV_PATH - -parser = argparse.ArgumentParser() - -parser.add_argument( - "--binding-data-csv", - default=PETERS2009_CSV_PATH) - -parser.add_argument( - "--fill-missing-values", - action="store_true", - default=False) - - -parser.add_argument( - "--min-overlap-weight", - default=2.0, - help="Minimum overlap weight between pair of alleles") - -parser.add_argument( - "--max-ic50", - default=50000.0) - -parser.add_argument( - "--output-csv", - required=True) - -def binary_encode(X, n_indices=20): - n_cols = X.shape[1] - X_encode = np.zeros( (len(X), n_indices * n_cols), dtype=float) - for i in range(len(X)): - for col_idx in range(n_cols): - X_encode[i, col_idx*n_indices + X[i, col_idx]] = True - return X_encode - -def 
prepare_incomplete_matrix(datasets_dict, prefix=None, allele_name_length=5): - if allele_name_length: - datasets_dict = {k:d for (k,d) in datasets_dict.items() if len(k) == allele_name_length} - if prefix: - datasets_dict = {k:d for (k,d) in datasets_dict.items() if k.startswith(prefix)} - if len(datasets_dict) == 0: - raise ValueError("No alleles matched criteria") - all_peptides = set([]) - allele_list = [] - for k,d in sorted(datasets_dict.items()): - print(k, len(d.peptides)) - allele_list.append(k) - for p in d.peptides: - all_peptides.add(p) - peptide_list = list(sorted(all_peptides)) - allele_order = {a:i for (i,a) in enumerate(allele_list)} - peptide_order = {p:i for (i,p) in enumerate(peptide_list)} - n_alleles = len(allele_order) - n_peptides = len(peptide_order) - incomplete_affinity_matrix = np.ones((n_alleles, n_peptides), dtype=float) * np.nan - for k,d in datasets_dict.items(): - if k not in allele_order: - continue - i = allele_order[k] - for p,y in zip(d.peptides, d.Y): - j = peptide_order[p] - incomplete_affinity_matrix[i,j] = y - print("Total # alleles = %d, peptides = %d" % (n_alleles, n_peptides)) - return incomplete_affinity_matrix, allele_order, peptide_order - -def get_extra_data(allele, train_datasets, expanded_predictions): - original_dataset = train_datasets[allele] - original_peptides = set(original_dataset.peptides) - expanded_allele_affinities = expanded_predictions[allele] - extra_affinities = {k: v for (k, v) in expanded_allele_affinities.items() if k not in original_peptides} - extra_peptides = list(extra_affinities.keys()) - extra_values = list(extra_affinities.values()) - extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9) - extra_Y = np.array(extra_values) - return extra_X, extra_Y - -def WHO_KNOWS_WHAT(): - - - smoothing = 0.005 - exponent = 2 - all_predictions = defaultdict(dict) - for allele, allele_idx in allele_order.items(): - for peptide in peptide_order.keys(): - predictions = reverse_lookups[peptide] - total_value = 0.0 - total_denom = 0.0 - for (other_allele, y) in predictions: - key = (allele,other_allele) - if key not in completed_sims_dict: - continue - sim = completed_sims_dict[key] - weight = sim ** exponent - total_value += weight * y - total_denom += weight - if total_denom > 0.0: - all_predictions[allele][peptide] = total_value / (smoothing + total_denom) - - -def data_augmentation(X, Y, extra_X, extra_Y, - fraction=0.5, - niters=10, - extra_sample_weight=0.1, - nb_epoch=50, - nn=True, - hidden_layer_size=5): - n = len(Y) - aucs = [] - f1s = [] - n_originals = [] - for _ in range(niters): - mask = np.random.rand(n) <= fraction - X_train = X[mask] - X_test = X[~mask] - Y_train = Y[mask] - Y_test = Y[~mask] - test_ic50 = 20000 ** (1-Y_test) - test_label = test_ic50 <= 500 - if test_label.all() or not test_label.any(): - continue - n_original = mask.sum() - print("Keeping %d original training samples" % n_original) - X_train_combined = np.vstack([X_train, extra_X]) - Y_train_combined = np.concatenate([Y_train, extra_Y]) - print("Combined shape: %s" % (X_train_combined.shape,)) - assert len(X_train_combined) == len(Y_train_combined) - # initialize weights to count synthesized and actual data equally - # but weight on synthesized points will decay across training epochs - weight = np.ones(len(Y_train_combined)) - if nn: - model = mhcflurry.feedforward.make_embedding_network( - layer_sizes=[hidden_layer_size], - embedding_output_dim=10, - activation="tanh") - for i in range(nb_epoch): - # decay weight as training 
progresses - weight[n_original:] = extra_sample_weight if extra_sample_weight is not None else 1.0 / (i+1)**2 - model.fit( - X_train_combined, - Y_train_combined, - sample_weight=weight, - shuffle=True, - nb_epoch=1, - verbose=0) - pred = model.predict(X_test) - else: - model = sklearn.linear_model.Ridge(alpha=5) - X_train_combined_binary = binary_encode(X_train_combined) - X_test_binary = binary_encode(X_test) - model.fit(X_train_combined_binary, Y_train_combined, sample_weight=weight) - pred = model.predict(X_test_binary) - pred_ic50 = 20000 ** (1-pred) - pred_label = pred_ic50 <= 500 - mse = sklearn.metrics.mean_squared_error(Y_test, pred) - auc = sklearn.metrics.roc_auc_score(test_label, pred) - - if pred_label.all() or not pred_label.any(): - f1 = 0 - else: - f1 = sklearn.metrics.f1_score(test_label, pred_label) - print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1)) - n_originals.append(n_original) - aucs.append(auc) - f1s.append(f1) - return aucs, f1s, n_originals - - -if __name__ == "__main__": - args = parser.parse_args() - - datasets = mhcflurry.data_helpers.load_data( - args.binding_data_csv, - binary_encoding=True, - max_ic50=args.max_ic50) - - sims = {} - overlaps = {} - weights = {} - for allele_name_a, da in train_datasets.items(): - if len(allele_name_a) != 5: - continue - y_dict_a = {peptide: y for (peptide,y) in zip(da.peptides, da.Y)} - seta = set(da.peptides) - for allele_name_b, db in train_datasets.items(): - if len(allele_name_b) != 5: - continue - y_dict_b = {peptide: y for (peptide,y) in zip(db.peptides, db.Y)} - setb = set(db.peptides) - intersection = seta.intersection(setb) - overlaps[a,b] = len(intersection) - total = 0.0 - weight = 0.0 - for peptide in intersection: - ya = y_dict_a[peptide] - yb = y_dict_b[peptide] - minval = min(ya, yb) - maxval = max(ya, yb) - total += minval - weight += maxval - weights[a,b] = weight - if weight > min_weight: - sims[allele_name_a, allele_name_b] = total / weight - else: - sims[allele_name_a, allele_name_b] = np.nan - - if args.fill_missing_values: - sims_array = np.zeros((n_alleles, n_alleles), dtype=float) - for i, a in enumerate(allele_names): - for j, b in enumerate(allele_names): - sims_array[i,j] = sims[(a,b)] diff --git a/experiments/synthesize-augmented-dataset.py b/experiments/synthesize-augmented-dataset.py new file mode 100644 index 00000000..12c4e9be --- /dev/null +++ b/experiments/synthesize-augmented-dataset.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import defaultdict +import argparse + +import numpy as np +import mhcflurry +import sklearn.metrics +import pandas as pd + +from dataset_paths import PETERS2009_CSV_PATH + +EMBEDDING_DIM_SIZES = [5, 10, 20, 40] +HIDDEN_LAYER_SIZES = [4, 8, 16, 32, 64, 128, 256] + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--only-human", + default=False, + action="store_true") + +parser.add_argument( + "--allele-similarity-csv", + required=True) + +def get_extra_data(allele, train_datasets, expanded_predictions): + original_dataset = train_datasets[allele] + original_peptides = set(original_dataset.peptides) + expanded_allele_affinities = expanded_predictions[allele] + extra_affinities = { + k: v + for (k, v) in expanded_allele_affinities.items() + if k not in original_peptides + } + extra_peptides = list(extra_affinities.keys()) + extra_values = list(extra_affinities.values()) + extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9) + extra_Y = np.array(extra_values) + return extra_X, extra_Y + + +def augment_affinity_predictions( + sims_dict, + peptide_to_affinities, + smoothing=0.005, + exponent=2.0): + all_predictions = defaultdict(dict) + for allele in {allele for (allele, _) in sims_dict.keys()}: + for peptide, affinities in peptide_to_affinities.items(): + total = 0.0 + denom = 0.0 + for (other_allele, y) in affinities: + key = (allele, other_allele) + sim = sims_dict.get(key, 0) + if sim == 0: + continue + weight = sim ** exponent + total += weight * y + denom += weight + if denom > 0.0: + all_predictions[allele][peptide] = total / (smoothing + denom) + return all_predictions + + +def data_augmentation( + X, + Y, + extra_X, + extra_Y, + fraction=0.5, + niters=10, + extra_sample_weight=0.1, + nb_epoch=50, + nn=True, + hidden_layer_size=5, + max_ic50=50000.0): + n = len(Y) + aucs = [] + f1s = [] + n_originals = [] + for _ in range(niters): + mask = np.random.rand(n) <= fraction + X_train = X[mask] + X_test = X[~mask] + Y_train = Y[mask] + Y_test = Y[~mask] + test_ic50 = max_ic50 ** (1 - Y_test) + test_label = test_ic50 <= 500 + if test_label.all() or not test_label.any(): + continue + n_original = mask.sum() + print("Keeping %d original training samples" % n_original) + X_train_combined = np.vstack([X_train, extra_X]) + Y_train_combined = np.concatenate([Y_train, extra_Y]) + print("Combined shape: %s" % (X_train_combined.shape,)) + assert len(X_train_combined) == len(Y_train_combined) + # initialize weights to count synthesized and actual data equally + # but weight on synthesized points will decay across training epochs + weight = np.ones(len(Y_train_combined)) + model = mhcflurry.feedforward.make_embedding_network( + layer_sizes=[hidden_layer_size], + embedding_output_dim=10, + activation="tanh") + for i in range(nb_epoch): + # decay weight as training progresses + weight[n_original:] = ( + extra_sample_weight + if extra_sample_weight is not None + else 1.0 / (i + 1) ** 2 + ) + model.fit( + X_train_combined, + Y_train_combined, + sample_weight=weight, + shuffle=True, + nb_epoch=1, + verbose=0) + pred = model.predict(X_test) + + pred_ic50 = max_ic50 ** (1 - pred) + pred_label = pred_ic50 <= 500 + mse = sklearn.metrics.mean_squared_error(Y_test, pred) + auc = sklearn.metrics.roc_auc_score(test_label, pred) + + if pred_label.all() or not pred_label.any(): + f1 = 0 + else: + f1 = 
sklearn.metrics.f1_score(test_label, pred_label) + print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1)) + n_originals.append(n_original) + aucs.append(auc) + f1s.append(f1) + return aucs, f1s, n_originals + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + sims_df = pd.read_csv(args.allele_similarity_csv) + df = mhcflurry.data_helpers.load_dataframe( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human) + allele_groups = { + allele_name: { + row["sequence"]: row["regression_output"] + for (_, row) in group.iterrows() + } + for (allele_name, group) in df.groupby("mhc") + } diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index c1a90808..5f7b272d 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -16,12 +16,14 @@ from . import paths from . import data_helpers from . import feedforward from . import common +from . import fixed_length_peptides from .mhc1_binding_predictor import Mhc1BindingPredictor __all__ = [ "paths", "data_helpers", "feedforward", + "fixed_length_peptides", "common", "Mhc1BindingPredictor" -] \ No newline at end of file +] diff --git a/mhcflurry/common.py b/mhcflurry/common.py index 93337864..8623aa97 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -17,7 +17,6 @@ from __future__ import ( division, absolute_import, ) -from .amino_acid import amino_acid_letters def parse_int_list(s): @@ -64,37 +63,3 @@ def split_allele_names(s): for part in s.split(",") ] - - -def expand_9mer_peptides(peptides, length): - """ - Expand non-9mer peptides using methods from - Accurate approximation method for prediction of class I MHC - affinities for peptides of length 8, 10 and 11 using prediction - tools trained on 9mers. - by Lundegaard et. al. - http://bioinformatics.oxfordjournals.org/content/24/11/1397 - """ - assert len(peptides) > 0 - if length < 8: - raise ValueError("Invalid peptide length: %d (%s)" % ( - length, peptides[0])) - elif length == 9: - return peptides - elif length == 8: - # extend each peptide by inserting every possible amino acid - # between base-1 positions 4-8 - return [ - peptide[:i] + extra_amino_acid + peptide[i:] - for peptide in peptides - for i in range(3, 8) - for extra_amino_acid in amino_acid_letters - ] - else: - # drop interior residues between base-1 positions 4 to last - n_skip = length - 9 - return [ - peptide[:i] + peptide[i + n_skip:] - for peptide in peptides - for i in range(3, 9) - ] diff --git a/mhcflurry/data_helpers.py b/mhcflurry/data_helpers.py index 9fd8834d..be40e8c6 100644 --- a/mhcflurry/data_helpers.py +++ b/mhcflurry/data_helpers.py @@ -171,6 +171,45 @@ def load_dataframe( return df +def load_allele_dicts( + filename, + max_ic50=50000.0, + regression_output=False, + sep=None, + species_column_name="species", + allele_column_name="mhc", + peptide_column_name=None, + peptide_length_column_name="peptide_length", + ic50_column_name="meas", + only_human=True): + """ + Parsing CSV of binding data into dictionary of dictionaries. 
+ The outer key is an allele name, the inner key is a peptide sequence, + and the inner value is an IC50 or log-transformed value between [0,1] + """ + binding_df = load_dataframe( + filename=filename, + max_ic50=max_ic50, + sep=sep, + species_column_name=species_column_name, + allele_column_name=allele_column_name, + peptide_column_name=peptide_column_name, + peptide_length_column_name=peptide_length_column_name, + ic50_column_name=ic50_column_name, + only_human=only_human) + return { + allele_name: { + row[peptide_column_name]: ( + row["regression_output"] + if regression_output + else row[ic50_column_name] + ) + for (_, row) in group.iterrows() + } + for (allele_name, group) in binding_df.groupby(allele_column_name) + } + + def load_allele_datasets( filename, peptide_length=9, diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py new file mode 100644 index 00000000..d5ef0a69 --- /dev/null +++ b/mhcflurry/fixed_length_peptides.py @@ -0,0 +1,182 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +from .amino_acid import amino_acid_letters + + +def all_kmers(k, alphabet=amino_acid_letters): + """ + Generates all k-mer peptide sequences + """ + return [ + "".join(combination) + for combination + in itertools.product(list(alphabet) * k) + ] + + +def extend_peptide( + peptide, + desired_length, + start_offset, + end_offset, + alphabet=amino_acid_letters): + """Extend peptide by inserting every possible amino acid combination + if we're trying to e.g. turn an 8mer into 9mers + """ + n = len(peptide) + assert n < desired_length + n_missing = desired_length - n + if n_missing > 3: + raise ValueError( + "Cannot transform %s of length %d into a %d-mer peptide" % ( + peptide, n, desired_length)) + return [ + peptide[:i] + extra + peptide[i:] + for i in range(start_offset, n - end_offset) + for extra in all_kmers(n_missing, alphabet=alphabet) + ] + + +def shorten_peptide( + peptide, + desired_length, + start_offset, + end_offset, + alphabet=amino_acid_letters): + """Shorten peptide if trying to e.g. turn 10mer into 9mers""" + n = len(peptide) + assert n > desired_length + n_skip = n - desired_length + assert n_skip > 0, \ + "Expected length of peptide %s %d to be greater than %d" % ( + peptide, n, desired_length) + end_range = n - end_offset - n_skip + 1 + return [ + peptide[:i] + peptide[i + n_skip:] + for i in range(start_offset, end_range) + ] + + +def fixed_length_from_single_peptide( + peptide, + desired_length, + start_offset_extend=2, + end_offset_extend=1, + start_offset_shorten=2, + end_offset_shorten=0, + alphabet=amino_acid_letters): + """ + Create a set of fixed-length k-mer peptides from a single peptide of any + length. Shorter peptides are filled in using all possible amino acids at any + insertion site between (start_offset, -end_offset). 
+ + We can recreate the methods from: + Accurate approximation method for prediction of class I MHC + affinities for peptides of length 8, 10 and 11 using prediction + tools trained on 9mers. + by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397) + with the following settings: + - desired_length = 9 + - start_offset_extend = 2 + - end_offset_extend = 1 + - start_offset_shorten = 2 + - end_offset_shorten = 0 + """ + n = len(peptide) + if n == desired_length: + return [peptide] + elif n < desired_length: + return extend_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_extend, + end_offset=end_offset_extend, + alphabet=alphabet) + else: + return shorten_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_shorten, + end_offset=end_offset_shorten, + alphabet=alphabet) + + +def fixed_length_from_many_peptides( + peptides, + desired_length, + start_offset_extend=2, + end_offset_extend=1, + start_offset_shorten=2, + end_offset_shorten=0, + alphabet=amino_acid_letters): + """ + Create a set of fixed-length k-mer peptides from a collection of varying + length peptides. Shorter peptides are filled in using all possible amino + acids at any insertion site between + [start_offset_extend, length - end_offset_extend). + Longer peptides are made smaller by deleting contiguous residues between + [start_offset_shorten, length - end_offset_shorten) + + We can recreate the methods from: + Accurate approximation method for prediction of class I MHC + affinities for peptides of length 8, 10 and 11 using prediction + tools trained on 9mers. + by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397) + with the following settings: + - desired_length = 9 + - start_offset_extend = 2 + - end_offset_extend = 1 + - start_offset_shorten = 2 + - end_offset_shorten = 0 + + Returns three lists: + - a list of fixed length peptides (all of length `desired_length`) + - a list of the original peptides from which subsequences were + contracted or lengthened + - a list of integers indicating the number of fixed length peptides + generated from each variable length peptide. 
+
+    Example:
+        kmers, original, counts = fixed_length_from_many_peptides(
+            peptides=["ABC", "A"],
+            desired_length=2,
+            start_offset_extend=0,
+            end_offset_extend=0,
+            start_offset_shorten=0,
+            end_offset_shorten=0,
+            alphabet="ABC")
+        kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]
+        original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]
+        counts == [3, 3, 3, 6, 6, 6, 6, 6, 6]
+    """
+    fixed_length_peptides = []
+    original_peptide_sequences = []
+    number_expanded = []
+    for peptide in peptides:
+        fixed_length_for_peptide = fixed_length_from_single_peptide(
+            peptide,
+            desired_length=desired_length,
+            start_offset_extend=start_offset_extend,
+            end_offset_extend=end_offset_extend,
+            start_offset_shorten=start_offset_shorten,
+            end_offset_shorten=end_offset_shorten,
+            alphabet=alphabet)
+        n_fixed_length = len(fixed_length_for_peptide)
+        fixed_length_peptides.extend(fixed_length_for_peptide)
+        original_peptide_sequences.extend([peptide] * n_fixed_length)
+        number_expanded.extend([n_fixed_length] * n_fixed_length)
+    return fixed_length_peptides, original_peptide_sequences, number_expanded
diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py
index ffde0cbe..b037b2d6 100644
--- a/mhcflurry/mhc1_binding_predictor.py
+++ b/mhcflurry/mhc1_binding_predictor.py
@@ -32,7 +32,7 @@ from keras.models import model_from_config
 from .class1_allele_specific_hyperparameters import MAX_IC50
 from .data_helpers import index_encoding, normalize_allele_name
 from .paths import CLASS1_MODEL_DIRECTORY
-from .common import expand_9mer_peptides
+from .fixed_length_peptides import fixed_length_from_many_peptides
 
 _allele_model_cache = {}
 
@@ -111,8 +111,9 @@ class Mhc1BindingPredictor(object):
             results[column_name] = []
 
         for length, group_peptides in groupby(peptides, lambda x: len(x)):
-            group_peptides = list(group_peptides)
-            expanded_peptides = expand_9mer_peptides(group_peptides, length)
+            expanded_peptides, _, _ = fixed_length_from_many_peptides(
+                peptides=list(group_peptides),
+                desired_length=length)
             n_group = len(group_peptides)
             n_expanded = len(expanded_peptides)
             expansion_factor = int(n_expanded / n_group)
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
new file mode 100644
index 00000000..db20c8ea
--- /dev/null
+++ b/mhcflurry/peptide_encoding.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from .amino_acid import amino_acid_letter_indices
+
+
+def hotshot_encoding(peptides, peptide_length):
+    """
+    Encode a set of equal length peptides as a binary matrix,
+    where each letter is transformed into a length 20 vector with a single
+    element that is 1 (and the others are 0).
+ """ + shape = (len(peptides), peptide_length, 20) + X = np.zeros(shape, dtype=bool) + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + k = amino_acid_letter_indices[amino_acid] + X[i, j, k] = 1 + return X + + +def index_encoding(peptides, peptide_length): + """ + Encode a set of equal length peptides as a vector of their + amino acid indices. + """ + X = np.zeros((len(peptides), peptide_length), dtype=int) + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + X[i, j] = amino_acid_letter_indices[amino_acid] + return X + + +def index_encoding_of_substrings( + peptides, + substring_length, + delete_exclude_start=0, + delete_exclude_end=0): + """ + Take peptides of varying lengths, chop them into substrings of fixed + length and apply index encoding to these substrings. + + If a string is longer than the substring length, then it's reduced to + the desired length by deleting characters at all possible positions. + If positions at the start or end of a string should be exempt from deletion + then the number of exempt characters can be controlled via + `delete_exclude_start` and `delete_exclude_end`. + + Returns feature matrix X and a vector of substring counts. + """ + pass + + +def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): + """ + Given an (n_samples, peptide_length) integer matrix + convert it to a binary encoding of shape: + (n_samples, peptide_length * n_indices) + """ + (n_samples, peptide_length) = X.shape + if not n_indices: + n_indices = X.max() - first_index_value + 1 + + X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) + for i, row in enumerate(X): + for j, xij in enumerate(row): + X_binary[i, n_indices * j + xij - first_index_value] = 1 + return X_binary.astype(float) -- GitLab
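A few self-contained sketches of the techniques this patch introduces follow; they are illustrative only, not part of the commit. First, experiments/allele-similarities.py scores a pair of alleles by comparing their measurements on shared peptides: similarity = sum(min(ya, yb)) / sum(max(ya, yb)), with pairs whose overlap weight sum(max(ya, yb)) is too small left missing, to be filled in later by matrix completion. A minimal sketch of that metric (the peptides and values are made up):

    import numpy as np

    def allele_similarity(affinities_a, affinities_b, min_weight=1.0):
        # affinities_* map peptide -> regression output in [0, 1]
        shared = set(affinities_a) & set(affinities_b)
        numer = sum(min(affinities_a[p], affinities_b[p]) for p in shared)
        denom = sum(max(affinities_a[p], affinities_b[p]) for p in shared)
        # too little overlapping evidence -> leave the entry missing (NaN),
        # to be recovered afterwards by matrix completion
        return numer / denom if denom > min_weight else np.nan

    a = {"SIINFEKL": 0.9, "SIINFEKLL": 0.6}
    b = {"SIINFEKL": 0.7, "SIINFEKLL": 0.8}
    print(allele_similarity(a, b))  # (0.7 + 0.6) / (0.9 + 0.8) ~ 0.76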
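experiments/synthesize-augmented-dataset.py then uses those similarities to synthesize training data: an allele's affinity for a peptide it was never measured against is estimated as a similarity-weighted average of other alleles' measurements for that peptide, with weights sim ** exponent and a smoothing term in the denominator (see augment_affinity_predictions above). A sketch of that weighting, with made-up allele names and numbers:

    def synthesize_affinity(measurements, sims, smoothing=0.005, exponent=2.0):
        # measurements: list of (other_allele, affinity in [0, 1]) for one peptide
        # sims: similarity of the target allele to each other allele
        total = 0.0
        denom = 0.0
        for other_allele, y in measurements:
            weight = sims.get(other_allele, 0.0) ** exponent
            total += weight * y
            denom += weight
        return total / (smoothing + denom) if denom > 0.0 else None

    print(synthesize_affinity(
        measurements=[("HLA-A0201", 0.8), ("HLA-A0301", 0.3)],
        sims={"HLA-A0201": 0.9, "HLA-A0301": 0.4}))  # ~0.71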
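The core library addition, mhcflurry/fixed_length_peptides.py, turns peptides of any length into fixed-length k-mers: short peptides are extended by inserting every possible residue combination at interior positions, long peptides are shortened by deleting a contiguous interior run. A minimal sketch of that idea (function names and test peptides here are illustrative; k-mers are enumerated with itertools.product(..., repeat=k)):

    import itertools

    AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"  # standard 20-letter alphabet

    def all_kmers(k, alphabet=AMINO_ACIDS):
        # every length-k string over the alphabet
        return ["".join(p) for p in itertools.product(alphabet, repeat=k)]

    def extend_peptide(peptide, desired_length, start_offset, end_offset, alphabet=AMINO_ACIDS):
        # fill a too-short peptide by inserting every possible gap sequence
        # before positions start_offset .. len(peptide) - end_offset - 1
        n_missing = desired_length - len(peptide)
        return [
            peptide[:i] + extra + peptide[i:]
            for i in range(start_offset, len(peptide) - end_offset)
            for extra in all_kmers(n_missing, alphabet)
        ]

    def shorten_peptide(peptide, desired_length, start_offset, end_offset):
        # shrink a too-long peptide by deleting a contiguous run of interior residues
        n_skip = len(peptide) - desired_length
        return [
            peptide[:i] + peptide[i + n_skip:]
            for i in range(start_offset, len(peptide) - end_offset - n_skip + 1)
        ]

    # Lundegaard-style defaults used in the patch: desired_length=9,
    # extension offsets (2, 1), shortening offsets (2, 0)
    print(len(extend_peptide("SIINFEKL", 9, 2, 1)))      # 5 insertion sites * 20 letters = 100 9mers
    print(len(shorten_peptide("AAASIINFEKL", 9, 2, 0)))  # 8 deletion positions -> 8 9mers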
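Finally, mhcflurry/peptide_encoding.py carries two representations of a fixed-length peptide: an index encoding (one integer per residue) and its one-hot ("hotshot") expansion. A toy shape check, assuming the standard 20-letter alphabet and zero-based indices:

    import numpy as np

    LETTER_INDEX = {aa: i for i, aa in enumerate("ACDEFGHIKLMNPQRSTVWY")}

    def index_encoding(peptides, peptide_length):
        # one integer index per residue
        X = np.zeros((len(peptides), peptide_length), dtype=int)
        for i, peptide in enumerate(peptides):
            for j, amino_acid in enumerate(peptide):
                X[i, j] = LETTER_INDEX[amino_acid]
        return X

    def indices_to_hotshot(X, n_indices=20):
        # expand each index column into a block of n_indices binary columns
        n_samples, peptide_length = X.shape
        out = np.zeros((n_samples, peptide_length * n_indices), dtype=float)
        for i, row in enumerate(X):
            for j, idx in enumerate(row):
                out[i, n_indices * j + idx] = 1.0
        return out

    X = index_encoding(["SIINFEKLM"], peptide_length=9)
    print(X.shape)                      # (1, 9)
    print(indices_to_hotshot(X).shape)  # (1, 180)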