Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
a3328a63
Commit
a3328a63
authored
5 years ago
by
Tim O'Donnell
Browse files
Options
Downloads
Patches
Plain Diff
Better random negative peptides generation
parent
2bd4c3af
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
mhcflurry/class1_neural_network.py
+157
-60
157 additions, 60 deletions
mhcflurry/class1_neural_network.py
with
157 additions
and
60 deletions
mhcflurry/class1_neural_network.py
+
157
−
60
View file @
a3328a63
...
@@ -14,6 +14,7 @@ import pandas
...
@@ -14,6 +14,7 @@ import pandas
from
.hyperparameters
import
HyperparameterDefaults
from
.hyperparameters
import
HyperparameterDefaults
from
.encodable_sequences
import
EncodableSequences
,
EncodingError
from
.encodable_sequences
import
EncodableSequences
,
EncodingError
from
.allele_encoding
import
AlleleEncoding
from
.regression_target
import
to_ic50
,
from_ic50
from
.regression_target
import
to_ic50
,
from_ic50
from
.common
import
random_peptides
,
amino_acid_distribution
from
.common
import
random_peptides
,
amino_acid_distribution
from
.custom_loss
import
get_loss
from
.custom_loss
import
get_loss
...
@@ -97,7 +98,10 @@ class Class1NeuralNetwork(object):
...
@@ -97,7 +98,10 @@ class Class1NeuralNetwork(object):
random_negative_affinity_max
=
50000.0
,
random_negative_affinity_max
=
50000.0
,
random_negative_match_distribution
=
True
,
random_negative_match_distribution
=
True
,
random_negative_distribution_smoothing
=
0.0
,
random_negative_distribution_smoothing
=
0.0
,
random_negative_output_indices
=
None
)
random_negative_output_indices
=
None
,
random_negative_method
=
"
by_length
"
,
random_negative_binder_threshold
=
None
,
random_negative_lengths
=
[
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
])
"""
"""
Hyperparameters for neural network training.
Hyperparameters for neural network training.
"""
"""
...
@@ -674,6 +678,132 @@ class Class1NeuralNetwork(object):
...
@@ -674,6 +678,132 @@ class Class1NeuralNetwork(object):
fit_info
[
"
num_points
"
]
=
mutable_generator_state
[
"
yielded_values
"
]
fit_info
[
"
num_points
"
]
=
mutable_generator_state
[
"
yielded_values
"
]
self
.
fit_info
.
append
(
dict
(
fit_info
))
self
.
fit_info
.
append
(
dict
(
fit_info
))
def
random_negatives_generator
(
self
,
encodable_peptides
,
affinities
,
allele_encoding
,
inequalities
):
random_negative_lengths
=
self
.
hyperparameters
[
'
random_negative_lengths
'
]
df
=
pandas
.
DataFrame
({
"
peptide
"
:
encodable_peptides
.
sequences
,
"
affinity
"
:
affinities
,
})
if
allele_encoding
is
not
None
:
df
[
"
allele
"
]
=
allele_encoding
.
alleles
df
[
"
length
"
]
=
df
.
peptide
.
str
.
len
()
if
inequalities
is
None
:
df
[
"
inequality
"
]
=
"
=
"
else
:
df
[
"
inequality
"
]
=
inequalities
if
self
.
hyperparameters
[
'
random_negative_binder_threshold
'
]:
df
=
df
.
loc
[
(
df
.
inequality
!=
"
>
"
)
&
(
df
.
affinity
<
self
.
hyperparameters
[
'
random_negative_binder_threshold
'
])
]
aa_distribution
=
None
if
self
.
hyperparameters
[
'
random_negative_match_distribution
'
]:
aa_distribution
=
amino_acid_distribution
(
encodable_peptides
.
sequences
,
smoothing
=
self
.
hyperparameters
[
'
random_negative_distribution_smoothing
'
])
logging
.
info
(
"
Using amino acid distribution for random negative:
\n
%s
"
%
(
str
(
aa_distribution
.
to_dict
())))
random_negative_alleles
=
None
if
self
.
hyperparameters
[
"
random_negative_method
"
]
==
"
by_length
"
:
# Different numbers of random negatives per length. Alleles are
# sampled proportionally to the number of times they are used in
# the training data.
length_to_num_random_negative
=
{}
random_negative_lengths
=
self
.
hyperparameters
[
'
random_negative_lengths
'
]
length_counts
=
df
.
length
.
value_counts
().
to_dict
()
for
length
in
random_negative_lengths
:
length_to_num_random_negative
[
length
]
=
int
(
length_counts
.
get
(
length
,
0
)
*
self
.
hyperparameters
[
'
random_negative_rate
'
]
+
self
.
hyperparameters
[
'
random_negative_constant
'
])
length_to_num_random_negative
=
pandas
.
Series
(
length_to_num_random_negative
)
total_random_negatives
=
length_to_num_random_negative
.
sum
()
logging
.
info
(
"
Random negative counts per length:
\n
%s
"
%
(
str
(
length_to_num_random_negative
.
to_dict
())))
if
allele_encoding
is
not
None
:
random_negative_alleles
=
df
.
allele
.
sample
(
n
=
total_random_negatives
,
replace
=
True
).
values
def
sample_peptides
():
peptides
=
[]
for
(
length
,
count
)
in
length_to_num_random_negative
.
items
():
peptides
.
extend
(
random_peptides
(
count
,
length
=
length
,
distribution
=
aa_distribution
))
random
.
shuffle
(
peptides
)
# important
return
EncodableSequences
.
create
(
peptides
)
elif
self
.
hyperparameters
[
"
random_negative_method
"
]
==
"
by_allele
"
:
# For each allele, a particular number of random negatives are used
# for all lengths. Across alleles, the number of random negatives
# varies; within an allele, the number of random negatives for each
# length is a constant
allele_to_num_per_length
=
{}
total_random_peptides_per_length
=
0
for
(
allele
,
sub_df
)
in
df
.
groupby
(
"
allele
"
):
num_for_allele
=
len
(
sub_df
)
*
(
self
.
hyperparameters
[
'
random_negative_rate
'
]
)
+
self
.
hyperparameters
[
'
random_negative_constant
'
]
num_per_length
=
int
(
num_for_allele
/
len
(
random_negative_lengths
))
total_random_peptides_per_length
+=
num_per_length
allele_to_num_per_length
[
allele
]
=
num_per_length
for
_
in
random_negative_lengths
:
for
(
allele
,
num
)
in
allele_to_num_per_length
.
items
():
random_negative_alleles
.
append
([
allele
]
*
num
)
numpy
.
testing
.
assert_equal
(
len
(
random_negative_alleles
),
total_random_peptides_per_length
*
len
(
random_negative_lengths
))
logging
.
info
(
"
Random negative counts for each length by allele:
\n
%s
"
%
(
str
(
allele_to_num_per_length
)))
def
sample_peptides
():
peptides
=
[]
for
length
in
random_negative_lengths
:
peptides
.
extend
(
random_peptides
(
total_random_peptides_per_length
,
length
=
length
,
distribution
=
aa_distribution
))
# important NOT to shuffle peptides.
return
EncodableSequences
.
create
(
peptides
)
else
:
raise
NotImplementedError
(
self
.
hyperparameters
[
"
random_negative_method
"
])
random_negative_allele_encoding
=
None
if
random_negative_alleles
:
random_negative_allele_encoding
=
AlleleEncoding
(
random_negative_alleles
,
borrow_from
=
allele_encoding
)
yield
random_negative_allele_encoding
while
True
:
yield
sample_peptides
()
def
fit
(
def
fit
(
self
,
self
,
peptides
,
peptides
,
...
@@ -738,29 +868,14 @@ class Class1NeuralNetwork(object):
...
@@ -738,29 +868,14 @@ class Class1NeuralNetwork(object):
peptide_encoding
=
self
.
peptides_to_network_input
(
encodable_peptides
)
peptide_encoding
=
self
.
peptides_to_network_input
(
encodable_peptides
)
fit_info
=
collections
.
defaultdict
(
list
)
fit_info
=
collections
.
defaultdict
(
list
)
length_counts
=
(
random_negatives_generator
=
self
.
random_negatives_generator
(
pandas
.
Series
(
encodable_peptides
.
sequences
)
encodable_peptides
=
encodable_peptides
,
.
str
.
len
().
value_counts
().
to_dict
())
affinities
=
affinities
,
allele_encoding
=
allele_encoding
,
num_random_negative
=
{}
inequalities
=
inequalities
)
for
length
in
range
(
8
,
16
):
random_negatives_allele_encoding
=
next
(
random_negatives_generator
)
num_random_negative
[
length
]
=
int
(
num_random_negatives
=
len
(
length_counts
.
get
(
length
,
0
)
*
next
(
random_negatives_generator
).
sequences
)
self
.
hyperparameters
[
'
random_negative_rate
'
]
+
self
.
hyperparameters
[
'
random_negative_constant
'
])
num_random_negative
=
pandas
.
Series
(
num_random_negative
)
logging
.
info
(
"
Random negative counts per length:
\n
%s
"
%
(
str
(
num_random_negative
.
to_dict
())))
aa_distribution
=
None
if
self
.
hyperparameters
[
'
random_negative_match_distribution
'
]:
aa_distribution
=
amino_acid_distribution
(
encodable_peptides
.
sequences
,
smoothing
=
self
.
hyperparameters
[
'
random_negative_distribution_smoothing
'
])
logging
.
info
(
"
Using amino acid distribution for random negative:
\n
%s
"
%
(
str
(
aa_distribution
.
to_dict
())))
y_values
=
from_ic50
(
numpy
.
array
(
affinities
,
copy
=
False
))
y_values
=
from_ic50
(
numpy
.
array
(
affinities
,
copy
=
False
))
assert
numpy
.
isnan
(
y_values
).
sum
()
==
0
,
y_values
assert
numpy
.
isnan
(
y_values
).
sum
()
==
0
,
y_values
...
@@ -853,15 +968,14 @@ class Class1NeuralNetwork(object):
...
@@ -853,15 +968,14 @@ class Class1NeuralNetwork(object):
y_dict_with_random_negatives
=
{
y_dict_with_random_negatives
=
{
"
output
"
:
numpy
.
concatenate
([
"
output
"
:
numpy
.
concatenate
([
numpy
.
tile
(
numpy
.
tile
(
random_negative_target
,
int
(
num_random_negative
.
sum
())
),
random_negative_target
,
num_random_negative
s
),
y_values
,
y_values
,
]),
]),
}
}
# Note: we are using "<" here not ">" because the inequalities are
# Note: we are using "<" here not ">" because the inequalities are
# now in target-space (0-1) not affinity-space.
# now in target-space (0-1) not affinity-space.
adjusted_inequalities_with_random_negatives
=
(
adjusted_inequalities_with_random_negatives
=
(
[
"
<
"
]
*
int
(
num_random_negative
.
sum
())
+
[
"
<
"
]
*
num_random_negatives
+
list
(
adjusted_inequalities
))
list
(
adjusted_inequalities
))
else
:
else
:
# Randomly sample random negative affinities
# Randomly sample random negative affinities
y_dict_with_random_negatives
=
{
y_dict_with_random_negatives
=
{
...
@@ -872,7 +986,7 @@ class Class1NeuralNetwork(object):
...
@@ -872,7 +986,7 @@ class Class1NeuralNetwork(object):
'
random_negative_affinity_min
'
],
'
random_negative_affinity_min
'
],
self
.
hyperparameters
[
self
.
hyperparameters
[
'
random_negative_affinity_max
'
],
'
random_negative_affinity_max
'
],
int
(
num_random_negative
.
sum
())
)),
num_random_negative
s
)),
y_values
,
y_values
,
]),
]),
}
}
...
@@ -881,7 +995,7 @@ class Class1NeuralNetwork(object):
...
@@ -881,7 +995,7 @@ class Class1NeuralNetwork(object):
y_dict_with_random_negatives
)
y_dict_with_random_negatives
)
if
sample_weights
is
not
None
:
if
sample_weights
is
not
None
:
sample_weights_with_random_negatives
=
numpy
.
concatenate
([
sample_weights_with_random_negatives
=
numpy
.
concatenate
([
numpy
.
ones
(
int
(
num_random_negative
.
sum
())
),
numpy
.
ones
(
num_random_negative
s
),
sample_weights
])
sample_weights
])
else
:
else
:
sample_weights_with_random_negatives
=
None
sample_weights_with_random_negatives
=
None
...
@@ -893,7 +1007,7 @@ class Class1NeuralNetwork(object):
...
@@ -893,7 +1007,7 @@ class Class1NeuralNetwork(object):
else
list
(
range
(
0
,
self
.
hyperparameters
[
'
num_outputs
'
])))
else
list
(
range
(
0
,
self
.
hyperparameters
[
'
num_outputs
'
])))
output_indices_with_random_negatives
=
numpy
.
concatenate
([
output_indices_with_random_negatives
=
numpy
.
concatenate
([
pandas
.
Series
(
random_negative_output_indices
,
dtype
=
int
).
sample
(
pandas
.
Series
(
random_negative_output_indices
,
dtype
=
int
).
sample
(
n
=
int
(
num_random_negative
.
sum
())
,
replace
=
True
).
values
,
n
=
num_random_negative
s
,
replace
=
True
).
values
,
output_indices
output_indices
])
])
else
:
else
:
...
@@ -924,32 +1038,24 @@ class Class1NeuralNetwork(object):
...
@@ -924,32 +1038,24 @@ class Class1NeuralNetwork(object):
last_progress_print
=
None
last_progress_print
=
None
x_dict_with_random_negatives
=
{}
x_dict_with_random_negatives
=
{}
for
i
in
range
(
self
.
hyperparameters
[
'
max_epochs
'
]):
for
i
in
range
(
self
.
hyperparameters
[
'
max_epochs
'
]):
random_negative_peptides_list
=
[]
random_negative_peptides
=
next
(
random_negatives_generator
)
for
(
length
,
count
)
in
num_random_negative
.
iteritems
():
random_negative_peptides_list
.
extend
(
random_peptides
(
count
,
length
=
length
,
distribution
=
aa_distribution
))
random
.
shuffle
(
random_negative_peptides_list
)
random_negative_peptides
=
EncodableSequences
.
create
(
random_negative_peptides_list
)
random_negative_peptides_encoding
=
(
random_negative_peptides_encoding
=
(
self
.
peptides_to_network_input
(
random_negative_peptides
))
self
.
peptides_to_network_input
(
random_negative_peptides
))
if
not
x_dict_with_random_negatives
:
if
not
x_dict_with_random_negatives
:
if
len
(
random_negative_peptides
)
>
0
:
if
len
(
random_negative_peptides
)
>
0
:
x_dict_with_random_negatives
[
"
peptide
"
]
=
numpy
.
concatenate
([
x_dict_with_random_negatives
[
"
peptide
"
]
=
numpy
.
concatenate
([
random_negative_peptides_encoding
,
random_negative_peptides_encoding
,
peptide_encoding
,
x_dict_without_random_negatives
[
'
peptide
'
]
,
])
])
if
'
allele
'
in
x_dict_without_random_negatives
:
if
'
allele
'
in
x_dict_without_random_negatives
:
x_dict_with_random_negatives
[
'
allele
'
]
=
numpy
.
concatenate
([
x_dict_with_random_negatives
[
x_dict_without_random_negatives
[
'
allele
'
][
'
allele
'
numpy
.
random
.
choice
(
]
=
numpy
.
concatenate
([
x_dict_without_random_negatives
[
self
.
allele_encoding_to_network_input
(
'
allele
'
].
shape
[
0
],
random_negatives_allele_encoding
)[
0
],
size
=
len
(
random_negative_peptides_list
))],
x_dict_without_random_negatives
[
'
allele
'
]
x_dict_without_random_negatives
[
'
allele
'
]
])
])
else
:
else
:
...
@@ -959,18 +1065,9 @@ class Class1NeuralNetwork(object):
...
@@ -959,18 +1065,9 @@ class Class1NeuralNetwork(object):
# Update x_dict_with_random_negatives in place.
# Update x_dict_with_random_negatives in place.
# This is more memory efficient than recreating it as above.
# This is more memory efficient than recreating it as above.
if
len
(
random_negative_peptides
)
>
0
:
if
len
(
random_negative_peptides
)
>
0
:
x_dict_with_random_negatives
[
"
peptide
"
][:
len
(
random_negative_peptides
)]
=
(
x_dict_with_random_negatives
[
random_negative_peptides_encoding
"
peptide
"
)
][:
num_random_negatives
]
=
random_negative_peptides_encoding
if
'
allele
'
in
x_dict_with_random_negatives
:
x_dict_with_random_negatives
[
'
allele
'
][:
len
(
random_negative_peptides
)]
=
(
x_dict_with_random_negatives
[
'
allele
'
][
len
(
random_negative_peptides
)
+
numpy
.
random
.
choice
(
x_dict_with_random_negatives
[
'
allele
'
].
shape
[
0
]
-
len
(
random_negative_peptides
),
size
=
len
(
random_negative_peptides
))
]
)
if
needs_initialization
:
if
needs_initialization
:
self
.
data_dependent_weights_initialization
(
self
.
data_dependent_weights_initialization
(
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment