Commit 8c83236e authored by Alex Rubinsteyn

moving more flexible fixed length peptide helpers into main repo from experiments

parent 5a856ce8
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from fancyimpute import ConvexSolver
import numpy as np
import mhcflurry
import seaborn
from dataset_paths import PETERS2009_CSV_PATH
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--min-overlap-weight",
default=1.0,
help="Minimum overlap weight between pair of alleles",
type=float)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--raw-similarities-output-path",
help="CSV file which contains incomplete allele similarities")
parser.add_argument(
"--complete-similarities-output-path",
help="CSV file which contains allele similarities after matrix completion")
parser.add_argument(
"--raw-heatmap-output-path",
help="PNG file to save heatmap of incomplete similarities matrix")
parser.add_argument(
"--complete-heatmap-output-path",
help="PNG file to save heatmap of complete similarities matrix")
def compute_similarities(
allele_groups,
min_weight=1.0,
allele_name_length=None):
sims = {}
overlaps = {}
weights = {}
for a, da in allele_groups.items():
if allele_name_length and len(a) != allele_name_length:
continue
peptide_set_a = set(da.keys())
for b, db in allele_groups.items():
if allele_name_length and len(b) != allele_name_length:
continue
peptide_set_b = set(db.keys())
intersection = peptide_set_a.intersection(peptide_set_b)
overlaps[(a, b)] = len(intersection)
total = 0.0
weight = 0.0
for peptide in intersection:
ya = da[peptide]
yb = db[peptide]
minval = min(ya, yb)
maxval = max(ya, yb)
total += minval
weight += maxval
weights[(a, b)] = weight
if weight > min_weight:
sims[(a, b)] = total / weight
return sims, overlaps, weights
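# Illustrative sketch (hypothetical alleles and affinities, not part of the
# original script): the similarity computed above is
# sum(min(ya, yb)) / sum(max(ya, yb)) over peptides measured for both alleles,
# so identical affinity dictionaries give a similarity of 1.0.
def _example_compute_similarities():
    allele_groups = {
        "HLA-A0201": {"SIINFEKLL": 0.9, "AAAAAAAAA": 0.2},
        "HLA-A0203": {"SIINFEKLL": 0.8, "AAAAAAAAA": 0.4},
    }
    sims, overlaps, weights = compute_similarities(allele_groups, min_weight=0.5)
    # overlaps[("HLA-A0201", "HLA-A0203")] == 2 shared peptides
    # sims[("HLA-A0201", "HLA-A0203")] == (0.8 + 0.2) / (0.9 + 0.4) ~= 0.77
    return sims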
def save_heatmap(matrix, allele_names, filename):
seaborn.set_context("paper")
with seaborn.plotting_context(font_scale=1):
figure = seaborn.plt.figure(figsize=(20, 18))
        ax = figure.gca()
seaborn.heatmap(
data=matrix,
xticklabels=allele_names,
yticklabels=allele_names,
linewidths=0,
annot=False,
ax=ax,
fmt=".2g")
figure.savefig(filename)
def save_csv(filename, sims, overlap_counts, overlap_weights):
with open(filename, "w") as f:
f.write("allele_A,allele_B,similarity,count,weight\n")
for (a, b), s in sorted(sims.items()):
count = overlap_counts.get((a, b), 0)
weight = overlap_weights.get((a, b), 0.0)
f.write("%s,%s,%0.4f,%d,%0.4f\n" % (a, b, s, count, weight))
def print_dataset_sizes(allele_groups):
print("---\nDataset Sizes\n---")
for (allele_name, g) in sorted(allele_groups.items()):
print("%s: total=%d, 8mer=%d, 9mer=%d, 10mer=%d, 11mer=%d" % (
allele_name,
len(g),
sum(len(k) == 8 for k in g.keys()),
sum(len(k) == 9 for k in g.keys()),
sum(len(k) == 10 for k in g.keys()),
sum(len(k) == 11 for k in g.keys()),
))
print("---")
def build_incomplete_similarity_matrix(allele_groups, sims):
allele_list = list(sorted(allele_groups.keys()))
allele_order = {
allele_name: i
for (i, allele_name) in enumerate(sorted(allele_groups.keys()))
}
n_alleles = len(allele_list)
sims_incomplete_matrix = np.zeros((n_alleles, n_alleles), dtype=float)
for a, ai in allele_order.items():
for b, bi in allele_order.items():
# if allele pair is missing from similarities dict then
# fill its slot with NaN, indicating missing data
sims_incomplete_matrix[ai, bi] = sims.get((a, b), np.nan)
return allele_list, allele_order, sims_incomplete_matrix
def matrix_to_dictionary(sims, allele_list):
sims_dict = {}
for i in range(sims.shape[0]):
a = allele_list[i]
for j in range(sims.shape[1]):
b = allele_list[j]
sims_dict[a, b] = sims[i, j]
return sims_dict
def fill_in_similarities(
raw_sims_dict,
allele_datasets,
raw_sims_heatmap_path=None,
complete_sims_heatmap_path=None):
allele_list, allele_order, sims_matrix = build_incomplete_similarity_matrix(
allele_datasets,
sims=raw_sims_dict)
if raw_sims_heatmap_path:
save_heatmap(
sims_matrix,
allele_list,
raw_sims_heatmap_path)
print("Completing %s similarities matrix with %d missing entries" % (
sims_matrix.shape,
np.isnan(sims_matrix).sum()))
solver = ConvexSolver(
require_symmetric_solution=True,
min_value=0.0,
max_value=1.0,
error_tolerance=0.0001)
sims_complete_matrix = solver.complete(sims_matrix)
if complete_sims_heatmap_path:
save_heatmap(
sims_complete_matrix,
allele_list,
complete_sims_heatmap_path)
complete_sims_dict = matrix_to_dictionary(
sims=sims_complete_matrix,
allele_list=allele_list)
return complete_sims_dict
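# Hedged sketch (reuses only the ConvexSolver calls shown above, with a
# hypothetical 2x2 input): with require_symmetric_solution=True, completing a
# matrix whose (1, 0) entry is observed but whose (0, 1) entry is missing
# should recover roughly the observed value in the missing slot.
def _example_completion():
    partial = np.array([
        [1.0, np.nan],
        [0.7, 1.0],
    ])
    solver = ConvexSolver(
        require_symmetric_solution=True,
        min_value=0.0,
        max_value=1.0,
        error_tolerance=0.0001)
    completed = solver.complete(partial)
    # completed[0, 1] should be close to 0.7 and the result close to symmetric
    return completed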
if __name__ == "__main__":
args = parser.parse_args()
print(args)
df = mhcflurry.data_helpers.load_dataframe(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human)
allele_groups = {
allele_name: {
row["sequence"]: row["regression_output"]
for (_, row) in group.iterrows()
}
for (allele_name, group) in df.groupby("mhc")
}
print_dataset_sizes(allele_groups)
raw_sims_dict, overlap_counts, overlap_weights = compute_similarities(
allele_groups,
min_weight=args.min_overlap_weight)
if args.raw_similarities_output_path:
save_csv(
args.raw_similarities_output_path,
raw_sims_dict,
overlap_counts,
overlap_weights)
complete_sims_dict = fill_in_similarities(
raw_sims_dict=raw_sims_dict,
allele_datasets=allele_groups,
raw_sims_heatmap_path=args.raw_heatmap_output_path,
complete_sims_heatmap_path=args.complete_heatmap_output_path)
print("-- Added %d/%d allele similarities" % (
len(complete_sims_dict) - len(raw_sims_dict),
len(complete_sims_dict)))
if args.complete_similarities_output_path:
save_csv(
args.complete_similarities_output_path,
complete_sims_dict,
overlap_counts,
overlap_weights)
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import argparse
import fancyimpute
import numpy as np
import mhcflurry
import sklearn.metrics
import pandas as pd
from dataset_paths import PETERS2009_CSV_PATH
EMBEDDING_DIM_SIZES = [5, 10, 20, 40]
HIDDEN_LAYER_SIZES = [4, 8, 16, 32, 64, 128, 256]
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -13,103 +34,70 @@ parser.add_argument(
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--fill-missing-values",
action="store_true",
default=False)
parser.add_argument(
"--min-overlap-weight",
default=2.0,
help="Minimum overlap weight between pair of alleles")
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--max-ic50",
default=50000.0)
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--output-csv",
"--allele-similarity-csv",
required=True)
def binary_encode(X, n_indices=20):
n_cols = X.shape[1]
X_encode = np.zeros( (len(X), n_indices * n_cols), dtype=float)
for i in range(len(X)):
for col_idx in range(n_cols):
X_encode[i, col_idx*n_indices + X[i, col_idx]] = True
return X_encode
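# Quick illustration (hypothetical index-encoded input): each row of X holds
# integer residue indices in [0, 20), and each index is expanded into a block
# of n_indices indicator columns.
#
#     >>> binary_encode(np.array([[0, 19]]), n_indices=20).shape
#     (1, 40)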
def prepare_incomplete_matrix(datasets_dict, prefix=None, allele_name_length=5):
    if allele_name_length:
        datasets_dict = {
            k: d for (k, d) in datasets_dict.items()
            if len(k) == allele_name_length
        }
    if prefix:
        datasets_dict = {
            k: d for (k, d) in datasets_dict.items()
            if k.startswith(prefix)
        }
    if len(datasets_dict) == 0:
        raise ValueError("No alleles matched criteria")
    all_peptides = set([])
    allele_list = []
    for k, d in sorted(datasets_dict.items()):
        print(k, len(d.peptides))
        allele_list.append(k)
        for p in d.peptides:
            all_peptides.add(p)
    peptide_list = list(sorted(all_peptides))
    allele_order = {a: i for (i, a) in enumerate(allele_list)}
    peptide_order = {p: i for (i, p) in enumerate(peptide_list)}
    n_alleles = len(allele_order)
    n_peptides = len(peptide_order)
    incomplete_affinity_matrix = np.full(
        (n_alleles, n_peptides), np.nan, dtype=float)
    for k, d in datasets_dict.items():
        if k not in allele_order:
            continue
        i = allele_order[k]
        for p, y in zip(d.peptides, d.Y):
            j = peptide_order[p]
            incomplete_affinity_matrix[i, j] = y
    print("Total # alleles = %d, peptides = %d" % (n_alleles, n_peptides))
    return incomplete_affinity_matrix, allele_order, peptide_order
def get_extra_data(allele, train_datasets, expanded_predictions):
original_dataset = train_datasets[allele]
original_peptides = set(original_dataset.peptides)
expanded_allele_affinities = expanded_predictions[allele]
extra_affinities = {k: v for (k, v) in expanded_allele_affinities.items() if k not in original_peptides}
extra_affinities = {
k: v
for (k, v) in expanded_allele_affinities.items()
if k not in original_peptides
}
extra_peptides = list(extra_affinities.keys())
extra_values = list(extra_affinities.values())
extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9)
extra_Y = np.array(extra_values)
return extra_X, extra_Y
def WHO_KNOWS_WHAT():
smoothing = 0.005
exponent = 2
def augment_affinity_predictions(
sims_dict,
peptide_to_affinities,
smoothing=0.005,
exponent=2.0):
all_predictions = defaultdict(dict)
for allele, allele_idx in allele_order.items():
for peptide in peptide_order.keys():
predictions = reverse_lookups[peptide]
total_value = 0.0
total_denom = 0.0
for (other_allele, y) in predictions:
key = (allele,other_allele)
if key not in completed_sims_dict:
for allele in {allele for (allele, _) in sims_dict.keys()}:
for peptide, affinities in peptide_to_affinities.items():
total = 0.0
denom = 0.0
for (other_allele, y) in affinities:
key = (allele, other_allele)
sim = sims_dict.get(key, 0)
if sim == 0:
continue
sim = completed_sims_dict[key]
weight = sim ** exponent
total_value += weight * y
total_denom += weight
if total_denom > 0.0:
all_predictions[allele][peptide] = total_value / (smoothing + total_denom)
def data_augmentation(X, Y, extra_X, extra_Y,
fraction=0.5,
niters=10,
extra_sample_weight=0.1,
nb_epoch=50,
nn=True,
hidden_layer_size=5):
total += weight * y
denom += weight
if denom > 0.0:
all_predictions[allele][peptide] = total / (smoothing + denom)
return all_predictions
def data_augmentation(
X,
Y,
extra_X,
extra_Y,
fraction=0.5,
niters=10,
extra_sample_weight=0.1,
nb_epoch=50,
nn=True,
hidden_layer_size=5,
max_ic50=50000.0):
n = len(Y)
aucs = []
f1s = []
@@ -120,7 +108,7 @@ def data_augmentation(X, Y, extra_X, extra_Y,
X_test = X[~mask]
Y_train = Y[mask]
Y_test = Y[~mask]
test_ic50 = 20000 ** (1-Y_test)
test_ic50 = max_ic50 ** (1 - Y_test)
test_label = test_ic50 <= 500
if test_label.all() or not test_label.any():
continue
@@ -133,30 +121,28 @@ def data_augmentation(X, Y, extra_X, extra_Y,
# initialize weights to count synthesized and actual data equally
# but weight on synthesized points will decay across training epochs
weight = np.ones(len(Y_train_combined))
if nn:
model = mhcflurry.feedforward.make_embedding_network(
layer_sizes=[hidden_layer_size],
embedding_output_dim=10,
activation="tanh")
for i in range(nb_epoch):
# decay weight as training progresses
weight[n_original:] = extra_sample_weight if extra_sample_weight is not None else 1.0 / (i+1)**2
model.fit(
X_train_combined,
Y_train_combined,
sample_weight=weight,
shuffle=True,
nb_epoch=1,
verbose=0)
pred = model.predict(X_test)
else:
model = sklearn.linear_model.Ridge(alpha=5)
X_train_combined_binary = binary_encode(X_train_combined)
X_test_binary = binary_encode(X_test)
model.fit(X_train_combined_binary, Y_train_combined, sample_weight=weight)
pred = model.predict(X_test_binary)
pred_ic50 = 20000 ** (1-pred)
pred_label = pred_ic50 <= 500
model = mhcflurry.feedforward.make_embedding_network(
layer_sizes=[hidden_layer_size],
embedding_output_dim=10,
activation="tanh")
for i in range(nb_epoch):
# decay weight as training progresses
weight[n_original:] = (
extra_sample_weight
if extra_sample_weight is not None
else 1.0 / (i + 1) ** 2
)
model.fit(
X_train_combined,
Y_train_combined,
sample_weight=weight,
shuffle=True,
nb_epoch=1,
verbose=0)
pred = model.predict(X_test)
pred_ic50 = max_ic50 ** (1 - pred)
pred_label = pred_ic50 <= 500
mse = sklearn.metrics.mean_squared_error(Y_test, pred)
auc = sklearn.metrics.roc_auc_score(test_label, pred)
@@ -170,47 +156,18 @@ def data_augmentation(X, Y, extra_X, extra_Y,
f1s.append(f1)
return aucs, f1s, n_originals
if __name__ == "__main__":
args = parser.parse_args()
datasets = mhcflurry.data_helpers.load_data(
print(args)
sims_df = pd.read_csv(args.allele_similarity_csv)
df = mhcflurry.data_helpers.load_dataframe(
args.binding_data_csv,
binary_encoding=True,
max_ic50=args.max_ic50)
sims = {}
overlaps = {}
weights = {}
for allele_name_a, da in train_datasets.items():
if len(allele_name_a) != 5:
continue
y_dict_a = {peptide: y for (peptide,y) in zip(da.peptides, da.Y)}
seta = set(da.peptides)
for allele_name_b, db in train_datasets.items():
if len(allele_name_b) != 5:
continue
y_dict_b = {peptide: y for (peptide,y) in zip(db.peptides, db.Y)}
setb = set(db.peptides)
intersection = seta.intersection(setb)
overlaps[a,b] = len(intersection)
total = 0.0
weight = 0.0
for peptide in intersection:
ya = y_dict_a[peptide]
yb = y_dict_b[peptide]
minval = min(ya, yb)
maxval = max(ya, yb)
total += minval
weight += maxval
weights[a,b] = weight
if weight > min_weight:
sims[allele_name_a, allele_name_b] = total / weight
else:
sims[allele_name_a, allele_name_b] = np.nan
if args.fill_missing_values:
sims_array = np.zeros((n_alleles, n_alleles), dtype=float)
for i, a in enumerate(allele_names):
for j, b in enumerate(allele_names):
sims_array[i,j] = sims[(a,b)]
max_ic50=args.max_ic50,
only_human=args.only_human)
allele_groups = {
allele_name: {
row["sequence"]: row["regression_output"]
for (_, row) in group.iterrows()
}
for (allele_name, group) in df.groupby("mhc")
}
@@ -16,12 +16,14 @@ from . import paths
from . import data_helpers
from . import feedforward
from . import common
from . import fixed_length_peptides
from .mhc1_binding_predictor import Mhc1BindingPredictor
__all__ = [
"paths",
"data_helpers",
"feedforward",
"fixed_length_peptides",
"common",
"Mhc1BindingPredictor"
]
\ No newline at end of file
]
@@ -17,7 +17,6 @@ from __future__ import (
division,
absolute_import,
)
from .amino_acid import amino_acid_letters
def parse_int_list(s):
@@ -64,37 +63,3 @@ def split_allele_names(s):
for part
in s.split(",")
]
def expand_9mer_peptides(peptides, length):
"""
Expand non-9mer peptides using methods from
Accurate approximation method for prediction of class I MHC
affinities for peptides of length 8, 10 and 11 using prediction
tools trained on 9mers.
    by Lundegaard et al.
http://bioinformatics.oxfordjournals.org/content/24/11/1397
"""
assert len(peptides) > 0
if length < 8:
raise ValueError("Invalid peptide length: %d (%s)" % (
length, peptides[0]))
elif length == 9:
return peptides
elif length == 8:
# extend each peptide by inserting every possible amino acid
# between base-1 positions 4-8
return [
peptide[:i] + extra_amino_acid + peptide[i:]
for peptide in peptides
for i in range(3, 8)
for extra_amino_acid in amino_acid_letters
]
else:
# drop interior residues between base-1 positions 4 to last
n_skip = length - 9
return [
peptide[:i] + peptide[i + n_skip:]
for peptide in peptides
for i in range(3, 9)
]
@@ -171,6 +171,45 @@ def load_dataframe(
return df
def load_allele_dicts(
filename,
max_ic50=50000.0,
regression_output=False,
sep=None,
species_column_name="species",
allele_column_name="mhc",
peptide_column_name=None,
peptide_length_column_name="peptide_length",
ic50_column_name="meas",
only_human=True):
"""
    Parse a CSV of binding data into a dictionary of dictionaries.
    The outer key is an allele name, the inner key is a peptide sequence,
    and the inner value is either an IC50 or a log-transformed affinity in [0, 1].
"""
binding_df = load_dataframe(
filename=filename,
max_ic50=max_ic50,
sep=sep,
species_column_name=species_column_name,
allele_column_name=allele_column_name,
peptide_column_name=peptide_column_name,
peptide_length_column_name=peptide_length_column_name,
ic50_column_name=ic50_column_name,
only_human=only_human)
return {
allele_name: {
row[peptide_column_name]: (
row["regression_output"]
if regression_output
else row[ic50_column_name]
)
for (_, row) in group.iterrows()
}
for (allele_name, group) in binding_df.groupby(allele_column_name)
}
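# Hypothetical usage sketch (the file path and explicit peptide column name are
# illustrative assumptions, not defaults of this module):
#
#     allele_to_peptide_to_affinity = load_allele_dicts(
#         "bdata.2009.mhci.public.1.txt",
#         sep="\t",
#         peptide_column_name="sequence",
#         regression_output=True)
#     allele_to_peptide_to_affinity["HLA-A-0201"]["SIINFEKLL"]  # value in [0, 1]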
def load_allele_datasets(
filename,
peptide_length=9,
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
from .amino_acid import amino_acid_letters
def all_kmers(k, alphabet=amino_acid_letters):
"""
Generates all k-mer peptide sequences
"""
    return [
        "".join(combination)
        # Cartesian product of the alphabet with itself k times
        for combination in itertools.product(alphabet, repeat=k)
    ]
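# Quick illustration (assumes the default 20-letter amino acid alphabet):
#
#     >>> len(all_kmers(1)), len(all_kmers(2))
#     (20, 400)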
def extend_peptide(
peptide,
desired_length,
start_offset,
end_offset,
alphabet=amino_acid_letters):
"""Extend peptide by inserting every possible amino acid combination
if we're trying to e.g. turn an 8mer into 9mers
"""
n = len(peptide)
assert n < desired_length
n_missing = desired_length - n
if n_missing > 3:
raise ValueError(
"Cannot transform %s of length %d into a %d-mer peptide" % (
peptide, n, desired_length))
return [
peptide[:i] + extra + peptide[i:]
for i in range(start_offset, n - end_offset)
for extra in all_kmers(n_missing, alphabet=alphabet)
]
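# Quick illustration (hypothetical 8-mer, default 20-letter alphabet): with
# start_offset=2 and end_offset=1 there are 5 interior insertion points, so an
# 8-mer yields 5 * 20 = 100 candidate 9-mers.
#
#     >>> len(extend_peptide("SIINFEKL", 9, start_offset=2, end_offset=1))
#     100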
def shorten_peptide(
peptide,
desired_length,
start_offset,
end_offset,
alphabet=amino_acid_letters):
"""Shorten peptide if trying to e.g. turn 10mer into 9mers"""
n = len(peptide)
assert n > desired_length
n_skip = n - desired_length
assert n_skip > 0, \
"Expected length of peptide %s %d to be greater than %d" % (
peptide, n, desired_length)
end_range = n - end_offset - n_skip + 1
return [
peptide[:i] + peptide[i + n_skip:]
for i in range(start_offset, end_range)
]
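# Quick illustration (hypothetical 10-mer): deleting a single interior residue
# at positions 2 through 9 gives 8 candidate 9-mers.
#
#     >>> len(shorten_peptide("AAGIGILTVK", 9, start_offset=2, end_offset=0))
#     8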
def fixed_length_from_single_peptide(
peptide,
desired_length,
start_offset_extend=2,
end_offset_extend=1,
start_offset_shorten=2,
end_offset_shorten=0,
alphabet=amino_acid_letters):
"""
Create a set of fixed-length k-mer peptides from a single peptide of any
length. Shorter peptides are filled in using all possible amino acids at any
    insertion site in [start_offset_extend, len(peptide) - end_offset_extend).
We can recreate the methods from:
Accurate approximation method for prediction of class I MHC
affinities for peptides of length 8, 10 and 11 using prediction
tools trained on 9mers.
    by Lundegaard et al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
with the following settings:
- desired_length = 9
- start_offset_extend = 2
- end_offset_extend = 1
- start_offset_shorten = 2
- end_offset_shorten = 0
"""
n = len(peptide)
if n == desired_length:
return [peptide]
elif n < desired_length:
return extend_peptide(
peptide=peptide,
desired_length=desired_length,
start_offset=start_offset_extend,
end_offset=end_offset_extend,
alphabet=alphabet)
else:
return shorten_peptide(
peptide=peptide,
desired_length=desired_length,
start_offset=start_offset_shorten,
end_offset=end_offset_shorten,
alphabet=alphabet)
def fixed_length_from_many_peptides(
peptides,
desired_length,
start_offset_extend=2,
end_offset_extend=1,
start_offset_shorten=2,
end_offset_shorten=0,
alphabet=amino_acid_letters):
"""
Create a set of fixed-length k-mer peptides from a collection of varying
length peptides. Shorter peptides are filled in using all possible amino
acids at any insertion site between
[start_offset_extend, length - end_offset_extend).
Longer peptides are made smaller by deleting contiguous residues between
[start_offset_shorten, length - end_offset_shorten)
We can recreate the methods from:
Accurate approximation method for prediction of class I MHC
affinities for peptides of length 8, 10 and 11 using prediction
tools trained on 9mers.
    by Lundegaard et al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
with the following settings:
- desired_length = 9
- start_offset_extend = 2
- end_offset_extend = 1
- start_offset_shorten = 2
- end_offset_shorten = 0
Returns three lists:
- a list of fixed length peptides (all of length `desired_length`)
- a list of the original peptides from which subsequences were
contracted or lengthened
- a list of integers indicating the number of fixed length peptides
generated from each variable length peptide.
Example:
kmers, original, counts = fixed_length_from_many_peptides(
peptides=["ABC", "A"]
desired_length=2,
start_offset_extend=0,
end_offset_extend=0,
start_offset_shorten=0,
end_offset_shorten=0,
alphabet="ABC")
kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]
original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]
counts == [3, 3, 3, 6, 6, 6, 6, 6, 6]
"""
fixed_length_peptides = []
original_peptide_sequences = []
number_expanded = []
for peptide in peptides:
        # use a distinct local name so the accumulator list is not clobbered
        kmers = fixed_length_from_single_peptide(
            peptide,
            desired_length=desired_length,
            start_offset_extend=start_offset_extend,
            end_offset_extend=end_offset_extend,
            start_offset_shorten=start_offset_shorten,
            end_offset_shorten=end_offset_shorten,
            alphabet=alphabet)
        n_fixed_length = len(kmers)
        fixed_length_peptides.extend(kmers)
        original_peptide_sequences.extend([peptide] * n_fixed_length)
        number_expanded.extend([n_fixed_length] * n_fixed_length)
return fixed_length_peptides, original_peptide_sequences, number_expanded
@@ -32,7 +32,7 @@ from keras.models import model_from_config
from .class1_allele_specific_hyperparameters import MAX_IC50
from .data_helpers import index_encoding, normalize_allele_name
from .paths import CLASS1_MODEL_DIRECTORY
from .common import expand_9mer_peptides
from .fixed_length_peptides import fixed_length_from_many_peptides
_allele_model_cache = {}
@@ -111,8 +111,9 @@ class Mhc1BindingPredictor(object):
results[column_name] = []
for length, group_peptides in groupby(peptides, lambda x: len(x)):
group_peptides = list(group_peptides)
expanded_peptides = expand_9mer_peptides(group_peptides, length)
expanded_peptides, _, _ = fixed_length_from_many_peptides(
peptides=list(group_peptides),
desired_length=length)
n_group = len(group_peptides)
n_expanded = len(expanded_peptides)
expansion_factor = int(n_expanded / n_group)
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from .amino_acid import amino_acid_letter_indices
def hotshot_encoding(peptides, peptide_length):
"""
Encode a set of equal length peptides as a binary matrix,
where each letter is transformed into a length 20 vector with a single
element that is 1 (and the others are 0).
"""
shape = (len(peptides), peptide_length, 20)
X = np.zeros(shape, dtype=bool)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
k = amino_acid_letter_indices[amino_acid]
X[i, j, k] = 1
return X
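# Quick illustration: encoding a single 9-mer produces a boolean array of
# shape (1, 9, 20) with exactly one True per residue position.
#
#     >>> hotshot_encoding(["SIINFEKLL"], peptide_length=9).shape
#     (1, 9, 20)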
def index_encoding(peptides, peptide_length):
"""
Encode a set of equal length peptides as a vector of their
amino acid indices.
"""
X = np.zeros((len(peptides), peptide_length), dtype=int)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
X[i, j] = amino_acid_letter_indices[amino_acid]
return X
def index_encoding_of_substrings(
peptides,
substring_length,
delete_exclude_start=0,
delete_exclude_end=0):
"""
Take peptides of varying lengths, chop them into substrings of fixed
length and apply index encoding to these substrings.
If a string is longer than the substring length, then it's reduced to
the desired length by deleting characters at all possible positions.
If positions at the start or end of a string should be exempt from deletion
then the number of exempt characters can be controlled via
`delete_exclude_start` and `delete_exclude_end`.
Returns feature matrix X and a vector of substring counts.
"""
pass
def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
"""
Given an (n_samples, peptide_length) integer matrix
convert it to a binary encoding of shape:
(n_samples, peptide_length * n_indices)
"""
(n_samples, peptide_length) = X.shape
if not n_indices:
n_indices = X.max() - first_index_value + 1
X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool)
for i, row in enumerate(X):
for j, xij in enumerate(row):
X_binary[i, n_indices * j + xij - first_index_value] = 1
return X_binary.astype(float)
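# Quick illustration: index encoding followed by hotshot expansion flattens
# each peptide into a single row of length peptide_length * n_indices.
#
#     >>> X = index_encoding(["SIINFEKLL"], peptide_length=9)
#     >>> indices_to_hotshot_encoding(X, n_indices=20).shape
#     (1, 180)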