Skip to content
Snippets Groups Projects
Commit f1d17bf0 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

better random negative peptides

parent ee1e1014
No related merge requests found
...@@ -18,6 +18,7 @@ from .regression_target import to_ic50, from_ic50 ...@@ -18,6 +18,7 @@ from .regression_target import to_ic50, from_ic50
from .common import random_peptides, amino_acid_distribution from .common import random_peptides, amino_acid_distribution
from .custom_loss import get_loss from .custom_loss import get_loss
from .data_dependent_weights_initialization import lsuv_init from .data_dependent_weights_initialization import lsuv_init
from .random_negative_peptides import RandomNegativePeptides
DEFAULT_PREDICT_BATCH_SIZE = 4096 DEFAULT_PREDICT_BATCH_SIZE = 4096
...@@ -91,16 +92,10 @@ class Class1NeuralNetwork(object): ...@@ -91,16 +92,10 @@ class Class1NeuralNetwork(object):
early_stopping=True, early_stopping=True,
minibatch_size=128, minibatch_size=128,
data_dependent_initialization_method=None, data_dependent_initialization_method=None,
random_negative_rate=0.0,
random_negative_constant=25,
random_negative_affinity_min=20000.0, random_negative_affinity_min=20000.0,
random_negative_affinity_max=50000.0, random_negative_affinity_max=50000.0,
random_negative_match_distribution=True, random_negative_output_indices=None).extend(
random_negative_distribution_smoothing=0.0, RandomNegativePeptides.hyperparameter_defaults)
random_negative_output_indices=None,
random_negative_method="by_length",
random_negative_binder_threshold=None,
random_negative_lengths=[8,9,10,11,12,13,14,15])
""" """
Hyperparameters for neural network training. Hyperparameters for neural network training.
""" """
...@@ -677,202 +672,6 @@ class Class1NeuralNetwork(object): ...@@ -677,202 +672,6 @@ class Class1NeuralNetwork(object):
fit_info["num_points"] = mutable_generator_state["yielded_values"] fit_info["num_points"] = mutable_generator_state["yielded_values"]
self.fit_info.append(dict(fit_info)) self.fit_info.append(dict(fit_info))
def random_negatives_generator(
self,
encodable_peptides,
affinities,
allele_encoding,
inequalities):
random_negative_lengths = self.hyperparameters['random_negative_lengths']
df = pandas.DataFrame({
"peptide": encodable_peptides.sequences,
"affinity": affinities,
})
if allele_encoding is not None:
df["allele"] = allele_encoding.alleles
df["length"] = df.peptide.str.len()
if inequalities is None:
df["inequality"] = "="
else:
df["inequality"] = inequalities
df_nonbinders = None
if self.hyperparameters['random_negative_binder_threshold']:
df_nonbinders = df.loc[
(df.inequality != "<") &
(df.affinity > self.hyperparameters[
'random_negative_binder_threshold'
])
]
df = df.loc[
(df.inequality != ">") &
(df.affinity <= self.hyperparameters[
'random_negative_binder_threshold'
])
]
aa_distribution = None
if self.hyperparameters['random_negative_match_distribution']:
aa_distribution = amino_acid_distribution(
encodable_peptides.sequences,
smoothing=self.hyperparameters[
'random_negative_distribution_smoothing'
])
logging.info(
"Using amino acid distribution for random negative:\n%s" % (
str(aa_distribution.to_dict())))
random_negative_alleles = None
if self.hyperparameters["random_negative_method"] == "by_length":
# Different numbers of random negatives per length. Alleles are
# sampled proportionally to the number of times they are used in
# the training data.
length_to_num_random_negative = {}
random_negative_lengths = self.hyperparameters[
'random_negative_lengths'
]
length_counts = df.length.value_counts().to_dict()
for length in random_negative_lengths:
length_to_num_random_negative[length] = int(
length_counts.get(length, 0) *
self.hyperparameters['random_negative_rate'] +
self.hyperparameters['random_negative_constant'])
length_to_num_random_negative = pandas.Series(
length_to_num_random_negative)
total_random_negatives = length_to_num_random_negative.sum()
logging.info("Random negative counts per length:\n%s" % (
str(length_to_num_random_negative.to_dict())))
if allele_encoding is not None:
random_negative_alleles = df.allele.sample(
n=total_random_negatives, replace=True).values
def sample_peptides():
peptides = []
for (length, count) in length_to_num_random_negative.items():
peptides.extend(
random_peptides(
count,
length=length,
distribution=aa_distribution))
random.shuffle(peptides) # important
return EncodableSequences.create(peptides)
elif self.hyperparameters["random_negative_method"] == "by_allele":
# For each allele, a particular number of random negatives are used
# for all lengths. Across alleles, the number of random negatives
# varies; within an allele, the number of random negatives for each
# length is a constant
allele_to_num_per_length = {}
total_random_peptides_per_length = 0
for (allele, sub_df) in df.groupby("allele"):
num_for_allele = len(sub_df) * (
self.hyperparameters['random_negative_rate']
) + self.hyperparameters['random_negative_constant']
num_per_length = int(math.ceil(
num_for_allele / len(random_negative_lengths)))
total_random_peptides_per_length += num_per_length
allele_to_num_per_length[allele] = num_per_length
if allele_encoding is not None:
random_negative_alleles = []
for _ in random_negative_lengths:
for (allele, num) in allele_to_num_per_length.items():
random_negative_alleles.extend([allele] * num)
numpy.testing.assert_equal(
len(random_negative_alleles),
total_random_peptides_per_length * len(random_negative_lengths))
logging.info(
"Random negative counts for each length by allele:\n%s" % (
str(allele_to_num_per_length)))
def sample_peptides():
peptides = []
for length in random_negative_lengths:
peptides.extend(
random_peptides(
total_random_peptides_per_length,
length=length,
distribution=aa_distribution))
# important NOT to shuffle peptides, since they correspond with
# specific alleles.
return EncodableSequences.create(peptides)
elif self.hyperparameters["random_negative_method"] == "by_allele_equalize_nonbinders":
assert df_nonbinders is not None
allele_and_length_to_num_random_negative = {}
# First pass
allele_to_num_per_length_first_pass = {}
for (allele, sub_df) in df.groupby("allele"):
num_for_allele = len(sub_df) * (
self.hyperparameters['random_negative_rate']
) + self.hyperparameters['random_negative_constant']
num_per_length = int(math.ceil(
num_for_allele / len(random_negative_lengths)))
allele_to_num_per_length_first_pass[allele] = num_per_length
# Second pass
for (allele, sub_df) in df_nonbinders.groupby("allele"):
nonbinders_by_length = sub_df.peptide.str.len().value_counts()
nonbinders_by_length = nonbinders_by_length.reindex(
random_negative_lengths
).fillna(0)
total_nonbinders_by_length = (
nonbinders_by_length +
allele_to_num_per_length_first_pass[allele])
max_total_nonbinders_by_length = total_nonbinders_by_length.max()
for (length, total) in total_nonbinders_by_length.items():
allele_and_length_to_num_random_negative[
(allele, length)
] = math.ceil(
allele_to_num_per_length_first_pass[allele] +
max_total_nonbinders_by_length -
total)
plan_df = []
for (
(allele, length), num_random_negative) in (
allele_and_length_to_num_random_negative.items()):
plan_df.append((allele, length, num_random_negative))
plan_df = pandas.DataFrame(
plan_df, columns=["allele", "length", "num"])
logging.info("Random negative plan:\n%s", plan_df)
if allele_encoding is not None:
random_negative_alleles = []
for _, row in plan_df.iterrows():
random_negative_alleles.extend([row.allele] * int(row.num))
def sample_peptides():
peptides = []
for _, row in plan_df.iterrows():
peptides.extend(
random_peptides(
row.num,
length=row.length,
distribution=aa_distribution))
# important NOT to shuffle peptides, since they correspond with
# specific alleles.
return EncodableSequences.create(peptides)
else:
raise NotImplementedError(
self.hyperparameters["random_negative_method"])
random_negative_allele_encoding = None
if random_negative_alleles is not None:
random_negative_allele_encoding = AlleleEncoding(
random_negative_alleles, borrow_from=allele_encoding)
yield random_negative_allele_encoding
while True:
yield sample_peptides()
def fit( def fit(
self, self,
peptides, peptides,
...@@ -937,14 +736,21 @@ class Class1NeuralNetwork(object): ...@@ -937,14 +736,21 @@ class Class1NeuralNetwork(object):
peptide_encoding = self.peptides_to_network_input(encodable_peptides) peptide_encoding = self.peptides_to_network_input(encodable_peptides)
fit_info = collections.defaultdict(list) fit_info = collections.defaultdict(list)
random_negatives_generator = self.random_negatives_generator( random_negatives_planner = RandomNegativePeptides(
encodable_peptides=encodable_peptides, **RandomNegativePeptides.hyperparameter_defaults.subselect(
self.hyperparameters))
random_negatives_planner.plan(
peptides=encodable_peptides.sequences,
affinities=affinities, affinities=affinities,
allele_encoding=allele_encoding, alleles=allele_encoding.alleles if allele_encoding else None,
inequalities=inequalities) inequalities=inequalities)
random_negatives_allele_encoding = next(random_negatives_generator)
num_random_negatives = len( random_negatives_allele_encoding = None
next(random_negatives_generator).sequences) if allele_encoding is not None:
random_negatives_allele_encoding = AlleleEncoding(
random_negatives_planner.get_alleles(),
borrow_from=allele_encoding)
num_random_negatives = random_negatives_planner.get_total_count()
y_values = from_ic50(numpy.array(affinities, copy=False)) y_values = from_ic50(numpy.array(affinities, copy=False))
assert numpy.isnan(y_values).sum() == 0, y_values assert numpy.isnan(y_values).sum() == 0, y_values
...@@ -1107,7 +913,8 @@ class Class1NeuralNetwork(object): ...@@ -1107,7 +913,8 @@ class Class1NeuralNetwork(object):
last_progress_print = None last_progress_print = None
x_dict_with_random_negatives = {} x_dict_with_random_negatives = {}
for i in range(self.hyperparameters['max_epochs']): for i in range(self.hyperparameters['max_epochs']):
random_negative_peptides = next(random_negatives_generator) random_negative_peptides = EncodableSequences.create(
random_negatives_planner.get_peptides())
random_negative_peptides_encoding = ( random_negative_peptides_encoding = (
self.peptides_to_network_input(random_negative_peptides)) self.peptides_to_network_input(random_negative_peptides))
......
import logging
import math
import numpy
import pandas
from .hyperparameters import HyperparameterDefaults
from .common import amino_acid_distribution, random_peptides
class RandomNegativePeptides(object):
hyperparameter_defaults = HyperparameterDefaults(
random_negative_rate=0.0,
random_negative_constant=25,
random_negative_match_distribution=True,
random_negative_distribution_smoothing=0.0,
random_negative_method="by_length",
random_negative_binder_threshold=None,
random_negative_lengths=[8,9,10,11,12,13,14,15])
def __init__(self, **hyperparameters):
self.hyperparameters = self.hyperparameter_defaults.with_defaults(
hyperparameters)
self.plan_df = None
self.aa_distribution = None
def plan(self, peptides, affinities, alleles=None, inequalities=None):
peptides = pandas.Series(peptides, copy=False)
peptide_lengths = peptides.str.len()
if self.hyperparameters['random_negative_match_distribution']:
self.aa_distribution = amino_acid_distribution(
peptides.values,
smoothing=self.hyperparameters[
'random_negative_distribution_smoothing'
])
logging.info(
"Using amino acid distribution for random negative:\n%s" % (
str(self.aa_distribution.to_dict())))
df_all = pandas.DataFrame({
'length': peptide_lengths,
'affinity': affinities,
})
df_all["allele"] = "" if alleles is None else alleles
df_all["inequality"] = "=" if inequalities is None else inequalities
df_binders = None
df_nonbinders = None
if self.hyperparameters['random_negative_binder_threshold']:
df_nonbinders = df_all.loc[
(df_all.inequality != "<") &
(df_all.affinity > self.hyperparameters[
'random_negative_binder_threshold'
])
]
df_binders = df_all.loc[
(df_all.inequality != ">") &
(df_all.affinity <= self.hyperparameters[
'random_negative_binder_threshold'
])
]
method = self.hyperparameters['random_negative_method']
function = {
'by_length': self.plan_by_length,
'by_allele': self.plan_by_allele,
'by_allele_equalize_nonbinders':
self.plan_by_allele_equalize_nonbinders,
}[method]
function(df_all, df_binders, df_nonbinders)
assert self.plan_df is not None
logging.info("Random negative plan [%s]:\n%s", method, self.plan_df)
return self.plan_df
def plan_by_length(self, df_all, df_binders=None, df_nonbinders=None):
"""
Different numbers of random negatives per length. Alleles are sampled
proportionally to the number of times they are used in the training
data.
Used for allele-specific predictors. Does not work well for pan-allele.
"""
assert list(df_all.allele.unique()) == [""], (
"by_length only recommended for allele specific prediction")
df = df_all if df_binders is None else df_binders
lengths = self.hyperparameters['random_negative_lengths']
length_to_num_random_negative = {}
length_counts = df.length.value_counts().to_dict()
for length in lengths:
length_to_num_random_negative[length] = int(
length_counts.get(length, 0) *
self.hyperparameters['random_negative_rate'] +
self.hyperparameters['random_negative_constant'])
plan_df = pandas.DataFrame(index=sorted(df.allele.unique()))
for length in lengths:
plan_df[length] = length_to_num_random_negative[length]
self.plan_df = plan_df.astype(int)
def plan_by_allele(self, df_all, df_binders=None, df_nonbinders=None):
"""
For each allele, a particular number of random negatives are used
for all lengths. Across alleles, the number of random negatives
varies; within an allele, the number of random negatives for each
length is a constant
"""
allele_to_num_per_length = {}
total_random_peptides_per_length = 0
df = df_all if df_binders is None else df_binders
lengths = self.hyperparameters['random_negative_lengths']
all_alleles = df_all.allele.unique()
for allele in all_alleles:
sub_df = df.loc[df.allele == allele]
num_for_allele = len(sub_df) * (
self.hyperparameters['random_negative_rate']
) + self.hyperparameters['random_negative_constant']
num_per_length = int(math.ceil(
num_for_allele / len(lengths)))
total_random_peptides_per_length += num_per_length
allele_to_num_per_length[allele] = num_per_length
plan_df = pandas.DataFrame(index=sorted(df.allele.unique()))
for length in lengths:
plan_df[length] = plan_df.index.map(allele_to_num_per_length)
self.plan_df = plan_df.astype(int)
def plan_by_allele_equalize_nonbinders(
self, df_all, df_binders, df_nonbinders):
"""
"""
assert df_binders is not None
assert df_nonbinders is not None
lengths = self.hyperparameters['random_negative_lengths']
self.plan_by_allele(df_all, df_binders, df_nonbinders)
first_pass_plan = self.plan_df
self.plan_df = None
new_plan = first_pass_plan.copy()
new_plan[:] = numpy.nan
for (allele, first_pass_per_length) in first_pass_plan.iterrows():
real_nonbinders_by_length = df_nonbinders.loc[
df_nonbinders.allele == allele
].length.value_counts().reindex(lengths).fillna(0)
total_nonbinders_by_length = (
real_nonbinders_by_length + first_pass_per_length)
new_plan.loc[allele] = first_pass_per_length + (
total_nonbinders_by_length.max() - total_nonbinders_by_length)
self.plan_df = new_plan.astype(int)
def get_alleles(self):
assert self.plan_df is not None, "Call plan() first"
alleles = []
for allele, row in self.plan_df.iterrows():
alleles.extend([allele] * int(row.sum()))
assert len(alleles) == self.get_total_count()
return alleles
def get_peptides(self):
assert self.plan_df is not None, "Call plan() first"
peptides = []
for allele, row in self.plan_df.iterrows():
for (length, num) in row.items():
peptides.extend(
random_peptides(
num,
length=length,
distribution=self.aa_distribution))
assert len(peptides) == self.get_total_count()
return peptides
def get_total_count(self):
return self.plan_df.sum().sum()
\ No newline at end of file
import logging
logging.getLogger('matplotlib').disabled = True
logging.getLogger('tensorflow').disabled = True
from mhcflurry import amino_acid from mhcflurry import amino_acid
from nose.tools import eq_ from nose.tools import eq_
from numpy.testing import assert_equal from numpy.testing import assert_equal
...@@ -9,20 +13,16 @@ from mhcflurry import Class1NeuralNetwork ...@@ -9,20 +13,16 @@ from mhcflurry import Class1NeuralNetwork
from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry.encodable_sequences import EncodableSequences
from mhcflurry.allele_encoding import AlleleEncoding from mhcflurry.allele_encoding import AlleleEncoding
from mhcflurry.common import random_peptides from mhcflurry.common import random_peptides
from mhcflurry.random_negative_peptides import RandomNegativePeptides
def test_random_negative_peptides_by_allele(): def test_random_negative_peptides_by_allele():
network = Class1NeuralNetwork( planner = RandomNegativePeptides(
random_negative_method="by_allele", random_negative_method="by_allele",
random_negative_binder_threshold=500, random_negative_binder_threshold=500,
random_negative_rate=1.0, random_negative_rate=1.0,
random_negative_constant=2) random_negative_constant=2)
allele_to_sequence = {
"HLA-A*02:01": "AAAAA",
"HLA-B*44:02": "CCCCC",
"HLA-C*07:02": "EEEEE",
}
data_rows = [ data_rows = [
("HLA-A*02:01", "SIINFEKL", 400, "="), ("HLA-A*02:01", "SIINFEKL", 400, "="),
("HLA-A*02:01", "SIINFEKLL", 300, "="), ("HLA-A*02:01", "SIINFEKLL", 300, "="),
...@@ -42,25 +42,18 @@ def test_random_negative_peptides_by_allele(): ...@@ -42,25 +42,18 @@ def test_random_negative_peptides_by_allele():
columns=["allele", "peptide", "affinity", "inequality"]) columns=["allele", "peptide", "affinity", "inequality"])
data["length"] = data.peptide.str.len() data["length"] = data.peptide.str.len()
peptides = EncodableSequences.create(data.peptide.values) planner.plan(
allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence) peptides=data.peptide.values,
affinities=data.affinity.values,
results = network.random_negatives_generator( alleles=data.allele.values,
peptides, inequalities=data.inequality.values)
data.affinity.values, result_df = pandas.DataFrame({
allele_encoding, "allele": planner.get_alleles(),
data.inequality.values "peptide": planner.get_peptides(),
)
result_allele_encoding = next(results)
first_peptide_sample = next(results)
second_peptide_sample = next(results)
result_df1 = pandas.DataFrame({
"allele": result_allele_encoding.alleles,
"peptide": first_peptide_sample.sequences,
}) })
result_df1["length"] = result_df1.peptide.str.len()
random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack() result_df["length"] = result_df.peptide.str.len()
random_negatives = result_df.groupby(["allele", "length"]).peptide.count().unstack()
real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0) real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
real_binders = data.loc[ real_binders = data.loc[
data.affinity <= 500 data.affinity <= 500
...@@ -77,23 +70,20 @@ def test_random_negative_peptides_by_allele(): ...@@ -77,23 +70,20 @@ def test_random_negative_peptides_by_allele():
def test_random_negative_peptides_by_allele(): def test_random_negative_peptides_by_allele():
network = Class1NeuralNetwork( planner = RandomNegativePeptides(
random_negative_method="by_allele_equalize_nonbinders", random_negative_method="by_allele_equalize_nonbinders",
random_negative_binder_threshold=500, random_negative_binder_threshold=500,
random_negative_rate=1.0, random_negative_rate=1.0,
random_negative_constant=2) random_negative_constant=2)
allele_to_sequence = {
"HLA-A*02:01": "AAAAA",
"HLA-B*44:02": "CCCCC",
"HLA-C*07:02": "EEEEE",
}
data_rows = [ data_rows = [
("HLA-A*02:01", "SIINFEKL", 400, "="), ("HLA-A*02:01", "SIINFEKL", 400, "="),
("HLA-A*02:01", "SIINFEKLL", 300, "="), ("HLA-A*02:01", "SIINFEKLL", 300, "="),
("HLA-A*02:01", "SIINFEKLL", 300, "="), ("HLA-A*02:01", "SIINFEKLL", 300, "="),
("HLA-A*02:01", "SIINFEKLQ", 1000, "="), ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"), ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
("HLA-C*01:02", "SIINFEKLQ", 100, "="), # only binders
("HLA-C*07:02", "SIINFEKLL", 1000, "=") # only non-binders
] ]
for peptide in random_peptides(1000, length=9): for peptide in random_peptides(1000, length=9):
data_rows.append(("HLA-B*44:02", peptide, 100, "=")) data_rows.append(("HLA-B*44:02", peptide, 100, "="))
...@@ -107,25 +97,17 @@ def test_random_negative_peptides_by_allele(): ...@@ -107,25 +97,17 @@ def test_random_negative_peptides_by_allele():
columns=["allele", "peptide", "affinity", "inequality"]) columns=["allele", "peptide", "affinity", "inequality"])
data["length"] = data.peptide.str.len() data["length"] = data.peptide.str.len()
peptides = EncodableSequences.create(data.peptide.values) planner.plan(
allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence) peptides=data.peptide.values,
affinities=data.affinity.values,
results = network.random_negatives_generator( alleles=data.allele.values,
peptides, inequalities=data.inequality.values)
data.affinity.values, result_df = pandas.DataFrame({
allele_encoding, "allele": planner.get_alleles(),
data.inequality.values "peptide": planner.get_peptides(),
)
result_allele_encoding = next(results)
first_peptide_sample = next(results)
second_peptide_sample = next(results)
result_df1 = pandas.DataFrame({
"allele": result_allele_encoding.alleles,
"peptide": first_peptide_sample.sequences,
}) })
result_df1["length"] = result_df1.peptide.str.len() result_df["length"] = result_df.peptide.str.len()
random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack() random_negatives = result_df.groupby(["allele", "length"]).peptide.count().unstack()
real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0) real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
real_binders = data.loc[ real_binders = data.loc[
data.affinity <= 500 data.affinity <= 500
...@@ -136,7 +118,11 @@ def test_random_negative_peptides_by_allele(): ...@@ -136,7 +118,11 @@ def test_random_negative_peptides_by_allele():
for length in random_negatives.columns: for length in random_negatives.columns:
if length not in real_nonbinders.columns: if length not in real_nonbinders.columns:
real_nonbinders[length] = 0 real_nonbinders[length] = 0
total_nonbinders = random_negatives + real_nonbinders total_nonbinders = (
random_negatives.reindex(real_data.index).fillna(0) +
real_nonbinders.reindex(real_data.index).fillna(0))
assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all(), total_nonbinders
assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all(), total_nonbinders
assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all() assert not total_nonbinders.isnull().any().any()
assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment