Skip to content
Snippets Groups Projects
Commit a0b0a8a9 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

add random negative peptide method by_allele_equalize_nonbinders

parent 4192c780
No related branches found
No related tags found
No related merge requests found
...@@ -41,7 +41,7 @@ base_hyperparameters = { ...@@ -41,7 +41,7 @@ base_hyperparameters = {
'random_negative_distribution_smoothing': 0.0, 'random_negative_distribution_smoothing': 0.0,
'random_negative_match_distribution': True, 'random_negative_match_distribution': True,
'random_negative_rate': 1.0, 'random_negative_rate': 1.0,
'random_negative_method': 'by_allele', 'random_negative_method': 'by_allele_equalize_nonbinders',
'train_data': { 'train_data': {
'pretrain': True, 'pretrain': True,
'pretrain_peptides_per_epoch': 64, 'pretrain_peptides_per_epoch': 64,
......
...@@ -6,6 +6,7 @@ import itertools ...@@ -6,6 +6,7 @@ import itertools
import os import os
import logging import logging
import random import random
import math
import numpy import numpy
import pandas import pandas
...@@ -696,7 +697,14 @@ class Class1NeuralNetwork(object): ...@@ -696,7 +697,14 @@ class Class1NeuralNetwork(object):
df["inequality"] = "=" df["inequality"] = "="
else: else:
df["inequality"] = inequalities df["inequality"] = inequalities
df_nonbinders = None
if self.hyperparameters['random_negative_binder_threshold']: if self.hyperparameters['random_negative_binder_threshold']:
df_nonbinders = df.loc[
(df.inequality != "<") &
(df.affinity > self.hyperparameters[
'random_negative_binder_threshold'
])
]
df = df.loc[ df = df.loc[
(df.inequality != ">") & (df.inequality != ">") &
(df.affinity <= self.hyperparameters[ (df.affinity <= self.hyperparameters[
...@@ -762,8 +770,8 @@ class Class1NeuralNetwork(object): ...@@ -762,8 +770,8 @@ class Class1NeuralNetwork(object):
num_for_allele = len(sub_df) * ( num_for_allele = len(sub_df) * (
self.hyperparameters['random_negative_rate'] self.hyperparameters['random_negative_rate']
) + self.hyperparameters['random_negative_constant'] ) + self.hyperparameters['random_negative_constant']
num_per_length = int( num_per_length = int(math.ceil(
num_for_allele / len(random_negative_lengths)) num_for_allele / len(random_negative_lengths)))
total_random_peptides_per_length += num_per_length total_random_peptides_per_length += num_per_length
allele_to_num_per_length[allele] = num_per_length allele_to_num_per_length[allele] = num_per_length
...@@ -792,6 +800,66 @@ class Class1NeuralNetwork(object): ...@@ -792,6 +800,66 @@ class Class1NeuralNetwork(object):
# important NOT to shuffle peptides, since they correspond with # important NOT to shuffle peptides, since they correspond with
# specific alleles. # specific alleles.
return EncodableSequences.create(peptides) return EncodableSequences.create(peptides)
elif self.hyperparameters["random_negative_method"] == "by_allele_equalize_nonbinders":
assert df_nonbinders is not None
allele_and_length_to_num_random_negative = {}
# First pass
allele_to_num_per_length_first_pass = {}
for (allele, sub_df) in df.groupby("allele"):
num_for_allele = len(sub_df) * (
self.hyperparameters['random_negative_rate']
) + self.hyperparameters['random_negative_constant']
num_per_length = int(math.ceil(
num_for_allele / len(random_negative_lengths)))
allele_to_num_per_length_first_pass[allele] = num_per_length
# Second pass
for (allele, sub_df) in df_nonbinders.groupby("allele"):
nonbinders_by_length = sub_df.peptide.str.len().value_counts()
nonbinders_by_length = nonbinders_by_length.reindex(
random_negative_lengths
).fillna(0)
total_nonbinders_by_length = (
nonbinders_by_length +
allele_to_num_per_length_first_pass[allele])
max_total_nonbinders_by_length = total_nonbinders_by_length.max()
for (length, total) in total_nonbinders_by_length.items():
allele_and_length_to_num_random_negative[
(allele, length)
] = math.ceil(
allele_to_num_per_length_first_pass[allele] +
max_total_nonbinders_by_length -
total)
plan_df = []
for (
(allele, length), num_random_negative) in (
allele_and_length_to_num_random_negative.items()):
plan_df.append((allele, length, num_random_negative))
plan_df = pandas.DataFrame(
plan_df, columns=["allele", "length", "num"])
logging.info("Random negative plan:\n%s", plan_df)
if allele_encoding is not None:
random_negative_alleles = []
for _, row in plan_df.iterrows():
random_negative_alleles.extend([row.allele] * int(row.num))
def sample_peptides():
peptides = []
for _, row in plan_df.iterrows():
peptides.extend(
random_peptides(
row.num,
length=row.length,
distribution=aa_distribution))
# important NOT to shuffle peptides, since they correspond with
# specific alleles.
return EncodableSequences.create(peptides)
else: else:
raise NotImplementedError( raise NotImplementedError(
self.hyperparameters["random_negative_method"]) self.hyperparameters["random_negative_method"])
......
from mhcflurry import amino_acid
from nose.tools import eq_
from numpy.testing import assert_equal
import numpy
import pandas
import math
from mhcflurry import Class1NeuralNetwork
from mhcflurry.encodable_sequences import EncodableSequences
from mhcflurry.allele_encoding import AlleleEncoding
from mhcflurry.common import random_peptides
def test_random_negative_peptides_by_allele():
network = Class1NeuralNetwork(
random_negative_method="by_allele",
random_negative_binder_threshold=500,
random_negative_rate=1.0,
random_negative_constant=2)
allele_to_sequence = {
"HLA-A*02:01": "AAAAA",
"HLA-B*44:02": "CCCCC",
"HLA-C*07:02": "EEEEE",
}
data_rows = [
("HLA-A*02:01", "SIINFEKL", 400, "="),
("HLA-A*02:01", "SIINFEKLL", 300, "="),
("HLA-A*02:01", "SIINFEKLL", 300, "="),
("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
]
for peptide in random_peptides(1000, length=9):
data_rows.append(("HLA-B*44:02", peptide, 100, "="))
for peptide in random_peptides(1000, length=9):
data_rows.append(("HLA-B*44:02", peptide, 1000, "="))
for peptide in random_peptides(5, length=10):
data_rows.append(("HLA-B*44:02", peptide, 100, "="))
data = pandas.DataFrame(
data_rows,
columns=["allele", "peptide", "affinity", "inequality"])
data["length"] = data.peptide.str.len()
peptides = EncodableSequences.create(data.peptide.values)
allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
results = network.random_negatives_generator(
peptides,
data.affinity.values,
allele_encoding,
data.inequality.values
)
result_allele_encoding = next(results)
first_peptide_sample = next(results)
second_peptide_sample = next(results)
result_df1 = pandas.DataFrame({
"allele": result_allele_encoding.alleles,
"peptide": first_peptide_sample.sequences,
})
result_df1["length"] = result_df1.peptide.str.len()
random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
real_binders = data.loc[
data.affinity <= 500
].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
real_nonbinders = data.loc[
data.affinity > 500
].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
total_nonbinders = random_negatives + real_nonbinders
assert (random_negatives.loc["HLA-A*02:01"] == 1.0).all()
assert (random_negatives.loc["HLA-B*44:02"] == math.ceil(1007 / 8)).all(), (
random_negatives.loc["HLA-B*44:02"], math.ceil(1007 / 8))
def test_random_negative_peptides_by_allele():
network = Class1NeuralNetwork(
random_negative_method="by_allele_equalize_nonbinders",
random_negative_binder_threshold=500,
random_negative_rate=1.0,
random_negative_constant=2)
allele_to_sequence = {
"HLA-A*02:01": "AAAAA",
"HLA-B*44:02": "CCCCC",
"HLA-C*07:02": "EEEEE",
}
data_rows = [
("HLA-A*02:01", "SIINFEKL", 400, "="),
("HLA-A*02:01", "SIINFEKLL", 300, "="),
("HLA-A*02:01", "SIINFEKLL", 300, "="),
("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
]
for peptide in random_peptides(1000, length=9):
data_rows.append(("HLA-B*44:02", peptide, 100, "="))
for peptide in random_peptides(1000, length=9):
data_rows.append(("HLA-B*44:02", peptide, 1000, "="))
for peptide in random_peptides(5, length=10):
data_rows.append(("HLA-B*44:02", peptide, 100, "="))
data = pandas.DataFrame(
data_rows,
columns=["allele", "peptide", "affinity", "inequality"])
data["length"] = data.peptide.str.len()
peptides = EncodableSequences.create(data.peptide.values)
allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
results = network.random_negatives_generator(
peptides,
data.affinity.values,
allele_encoding,
data.inequality.values
)
result_allele_encoding = next(results)
first_peptide_sample = next(results)
second_peptide_sample = next(results)
result_df1 = pandas.DataFrame({
"allele": result_allele_encoding.alleles,
"peptide": first_peptide_sample.sequences,
})
result_df1["length"] = result_df1.peptide.str.len()
random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
real_binders = data.loc[
data.affinity <= 500
].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
real_nonbinders = data.loc[
data.affinity > 500
].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
for length in random_negatives.columns:
if length not in real_nonbinders.columns:
real_nonbinders[length] = 0
total_nonbinders = random_negatives + real_nonbinders
assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all()
assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment