Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
63482a3f
Commit
63482a3f
authored
5 years ago
by
Tim O'Donnell
Browse files
Options
Downloads
Patches
Plain Diff
fix
parent
db82ae8d
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
mhcflurry/batch_generator.py
+64
-13
64 additions, 13 deletions
mhcflurry/batch_generator.py
mhcflurry/class1_ligandome_predictor.py
+40
-180
40 additions, 180 deletions
mhcflurry/class1_ligandome_predictor.py
test/test_batch_generator.py
+1
-2
1 addition, 2 deletions
test/test_batch_generator.py
with
105 additions
and
195 deletions
mhcflurry/
multiallelic_mass_spec_
batch_generator.py
→
mhcflurry/batch_generator.py
+
64
−
13
View file @
63482a3f
...
...
@@ -8,7 +8,16 @@ from .hyperparameters import HyperparameterDefaults
class
BatchPlan
(
object
):
def
__init__
(
self
,
equivalence_classes
,
batch_compositions
):
def
__init__
(
self
,
equivalence_classes
,
batch_compositions
,
equivalence_class_labels
=
None
):
"""
Parameters
----------
equivalence_classes
batch_compositions
equivalence_class_labels : list of string, optional
Used only for summary().
"""
# batch_compositions is (num batches_generator, batch size)
self
.
equivalence_classes
=
equivalence_classes
# indices into points
...
...
@@ -23,6 +32,9 @@ class BatchPlan(object):
indices_into_equivalence_classes
.
append
(
numpy
.
array
(
indices
,
dtype
=
int
))
self
.
indices_into_equivalence_classes
=
indices_into_equivalence_classes
self
.
equivalence_class_labels
=
(
numpy
.
array
(
equivalence_class_labels
)
if
equivalence_class_labels
is
not
None
else
None
)
def
batch_indices_generator
(
self
,
epochs
=
1
):
batch_nums
=
numpy
.
arange
(
len
(
self
.
batch_compositions
))
...
...
@@ -54,21 +66,35 @@ class BatchPlan(object):
def
summary
(
self
,
indent
=
0
):
lines
=
[]
lines
.
append
(
"
Equivalence class sizes:
"
)
lines
.
append
(
pandas
.
Series
(
[
len
(
c
)
for
c
in
self
.
equivalence_classes
]))
lines
.
append
(
"
Batch compositions:
"
)
lines
.
append
(
self
.
batch_compositions
)
equivalence_class_labels
=
self
.
equivalence_class_labels
if
equivalence_class_labels
is
None
:
equivalence_class_labels
=
(
"
class-
"
+
numpy
.
arange
(
self
.
equivalence_classes
).
astype
(
"
str
"
))
i
=
0
while
i
<
len
(
self
.
batch_compositions
):
composition
=
self
.
batch_compositions
[
i
]
label_counts
=
pandas
.
Series
(
equivalence_class_labels
[
composition
]).
value_counts
()
lines
.
append
(
(
"
Batch %5d:
"
%
i
)
+
"
,
"
.
join
(
"
{key}[{value}]
"
.
format
(
key
=
key
,
value
=
value
)
for
(
key
,
value
)
in
label_counts
.
iteritems
()))
if
i
==
5
:
lines
.
append
(
"
...
"
)
i
=
len
(
self
.
batch_compositions
)
-
4
i
+=
1
indent_spaces
=
"
"
*
indent
return
"
\n
"
.
join
([
indent_spaces
+
str
(
line
)
for
line
in
lines
])
@property
def
num_batches
(
self
):
return
self
.
batch_compositions
.
shape
[
0
]
return
len
(
self
.
batch_compositions
)
@property
def
batch_size
(
self
):
return
self
.
batch_compositions
.
shape
[
1
]
return
max
(
len
(
b
)
for
b
in
self
.
batch_compositions
)
class
MultiallelicMassSpecBatchGenerator
(
object
):
...
...
@@ -100,6 +126,15 @@ class MultiallelicMassSpecBatchGenerator(object):
df
[
"
first_allele
"
]
=
df
.
alleles
.
str
.
get
(
0
)
df
[
"
unused
"
]
=
True
df
[
"
idx
"
]
=
df
.
index
equivalence_class_to_label
=
dict
(
(
idx
,
(
"
{first_allele} {binder}
"
if
row
.
is_affinity
else
"
{experiment_name} {binder}
"
).
format
(
binder
=
"
binder
"
if
row
.
is_binder
else
"
nonbinder
"
,
**
row
.
to_dict
()))
for
(
idx
,
row
)
in
df
.
drop_duplicates
(
"
equivalence_class
"
).
set_index
(
"
equivalence_class
"
).
iterrows
())
df
=
df
.
sample
(
frac
=
1.0
)
#df["key"] = df.is_binder ^ (numpy.arange(len(df)) % 2).astype(bool)
#df = df.sort_values("key")
...
...
@@ -171,14 +206,19 @@ class MultiallelicMassSpecBatchGenerator(object):
]
return
BatchPlan
(
equivalence_classes
=
equivalence_classes
,
batch_compositions
=
batch_compositions
)
batch_compositions
=
batch_compositions
,
equivalence_class_labels
=
[
equivalence_class_to_label
[
i
]
for
i
in
range
(
len
(
class_to_indices
))
])
def
plan
(
self
,
affinities_mask
,
experiment_names
,
alleles_matrix
,
is_binder
):
is_binder
,
potential_validation_mask
=
None
):
affinities_mask
=
numpy
.
array
(
affinities_mask
,
copy
=
False
,
dtype
=
bool
)
experiment_names
=
numpy
.
array
(
experiment_names
,
copy
=
False
)
alleles_matrix
=
numpy
.
array
(
alleles_matrix
,
copy
=
False
)
...
...
@@ -190,10 +230,13 @@ class MultiallelicMassSpecBatchGenerator(object):
numpy
.
testing
.
assert_equal
(
len
(
is_binder
),
n
)
numpy
.
testing
.
assert_equal
(
affinities_mask
,
pandas
.
isnull
(
experiment_names
))
if
potential_validation_mask
is
not
None
:
numpy
.
testing
.
assert_equal
(
len
(
potential_validation_mask
),
n
)
validation_items
=
numpy
.
random
.
choice
(
n
,
int
(
self
.
hyperparameters
[
'
batch_generator_validation_split
'
]
*
n
))
n
if
potential_validation_mask
is
None
else
numpy
.
where
(
potential_validation_mask
)[
0
],
int
(
self
.
hyperparameters
[
'
batch_generator_validation_split
'
]
*
n
))
validation_mask
=
numpy
.
zeros
(
n
,
dtype
=
bool
)
validation_mask
[
validation_items
]
=
True
...
...
@@ -216,7 +259,7 @@ class MultiallelicMassSpecBatchGenerator(object):
def
summary
(
self
):
return
(
"
Train:
"
+
self
.
train_batch_plan
.
summary
(
indent
=
1
)
+
"
Train:
\n
"
+
self
.
train_batch_plan
.
summary
(
indent
=
1
)
+
"
\n
***
\n
Test:
"
+
self
.
test_batch_plan
.
summary
(
indent
=
1
))
def
get_train_and_test_generators
(
self
,
x_dict
,
y_list
,
epochs
=
1
):
...
...
@@ -225,3 +268,11 @@ class MultiallelicMassSpecBatchGenerator(object):
test_generator
=
self
.
test_batch_plan
.
batches_generator
(
x_dict
,
y_list
,
epochs
=
epochs
)
return
(
train_generator
,
test_generator
)
@property
def
num_train_batches
(
self
):
return
self
.
train_batch_plan
.
num_batches
@property
def
num_test_batches
(
self
):
return
self
.
test_batch_plan
.
num_batches
This diff is collapsed.
Click to expand it.
mhcflurry/class1_ligandome_predictor.py
+
40
−
180
View file @
63482a3f
...
...
@@ -16,6 +16,7 @@ from .regression_target import from_ic50, to_ic50
from
.random_negative_peptides
import
RandomNegativePeptides
from
.allele_encoding
import
MultipleAlleleEncoding
,
AlleleEncoding
from
.auxiliary_input
import
AuxiliaryInputEncoder
from
.batch_generator
import
MultiallelicMassSpecBatchGenerator
from
.custom_loss
import
(
MSEWithInequalities
,
MultiallelicMassSpecLoss
,
...
...
@@ -39,11 +40,10 @@ class Class1LigandomePredictor(object):
fit_hyperparameter_defaults
=
HyperparameterDefaults
(
max_epochs
=
500
,
validation_split
=
0.1
,
early_stopping
=
True
,
minibatch_size
=
128
,
random_negative_affinity_min
=
20000.0
,).
extend
(
RandomNegativePeptides
.
hyperparameter_defaults
RandomNegativePeptides
.
hyperparameter_defaults
).
extend
(
MultiallelicMassSpecBatchGenerator
.
hyperparameter_defaults
)
"""
Hyperparameters for neural network training.
...
...
@@ -366,12 +366,6 @@ class Class1LigandomePredictor(object):
peptide_input
=
self
.
peptides_to_network_input
(
encodable_peptides
)
validation_items
=
numpy
.
random
.
choice
(
len
(
labels
),
int
(
self
.
hyperparameters
[
'
validation_split
'
]
*
len
(
labels
)))
validation_mask
=
numpy
.
zeros
(
len
(
labels
),
dtype
=
bool
)
validation_mask
[
validation_items
]
=
True
# Optional optimization
(
allele_encoding_input
,
allele_representations
)
=
(
self
.
allele_encoding_to_network_input
(
allele_encoding
))
...
...
@@ -403,10 +397,6 @@ class Class1LigandomePredictor(object):
allele_encoding
.
max_alleles_per_experiment
),
borrow_from
=
allele_encoding
.
allele_encoding
)
num_random_negatives
=
random_negatives_planner
.
get_total_count
()
validation_mask_with_random_negatives
=
numpy
.
concatenate
([
numpy
.
tile
(
False
,
num_random_negatives
),
validation_mask
])
# Reverse inequalities because from_ic50() flips the direction
# (i.e. lower affinity results in higher y values).
...
...
@@ -466,6 +456,37 @@ class Class1LigandomePredictor(object):
if
verbose
:
self
.
network
.
summary
()
batch_generator
=
MultiallelicMassSpecBatchGenerator
(
MultiallelicMassSpecBatchGenerator
.
hyperparameter_defaults
.
subselect
(
self
.
hyperparameters
))
start
=
time
.
time
()
batch_generator
.
plan
(
affinities_mask
=
numpy
.
concatenate
([
numpy
.
tile
(
True
,
num_random_negatives
),
affinities_mask
]),
experiment_names
=
numpy
.
concatenate
([
numpy
.
tile
(
None
,
num_random_negatives
),
allele_encoding
.
experiment_names
]),
alleles_matrix
=
numpy
.
concatenate
([
random_negatives_allele_encoding
.
alleles
,
allele_encoding
.
alleles
,
]),
is_binder
=
numpy
.
concatenate
([
numpy
.
tile
(
False
,
num_random_negatives
),
numpy
.
where
(
affinities_mask
,
labels
,
to_ic50
(
labels
))
<
1000.0
]),
potential_validation_mask
=
numpy
.
concatenate
([
numpy
.
tile
(
False
,
num_random_negatives
),
numpy
.
tile
(
True
,
len
(
labels
))
]),
)
if
verbose
:
print
(
"
Generated batch generation plan in %0.2f sec.
"
%
(
time
.
time
()
-
start
))
print
(
batch_generator
.
summary
())
min_val_loss_iteration
=
None
min_val_loss
=
None
last_progress_print
=
0
...
...
@@ -519,27 +540,22 @@ class Class1LigandomePredictor(object):
"
peptide
"
][:
num_random_negatives
]
=
random_negative_peptides_encoding
(
train_generator
,
t
rain_batches
,
test_generator
,
test_batches
)
=
(
self
.
train_and_test_generators
(
(
train_generator
,
t
est_generator
)
=
(
batch_generator
.
get_
train_and_test_generators
(
x_dict
=
x_dict_with_random_negatives
,
y_list
=
[
encoded_y1
,
encoded_y2
,
encoded_y2
],
batch_size
=
self
.
hyperparameters
[
'
minibatch_size
'
],
validation_mask
=
validation_mask_with_random_negatives
,
experiment_names
=
numpy
.
concatenate
([
numpy
.
tile
(
None
,
num_random_negatives
),
allele_encoding
.
experiment_names
])))
epochs
=
1
))
self
.
assert_allele_representations_hash
(
allele_representations_hash
)
fit_history
=
self
.
network
.
fit_generator
(
train_generator
,
steps_per_epoch
=
train_batches
,
steps_per_epoch
=
batch_generator
.
num_
train_batches
,
epochs
=
i
+
1
,
initial_epoch
=
i
,
verbose
=
verbose
,
use_multiprocessing
=
False
,
workers
=
0
,
validation_data
=
test_generator
,
validation_steps
=
test_batches
)
validation_steps
=
batch_generator
.
num_
test_batches
)
"""
fit_history = self.network.fit(
...
...
@@ -575,7 +591,7 @@ class Class1LigandomePredictor(object):
min_val_loss_iteration
)).
strip
())
last_progress_print
=
time
.
time
()
if
self
.
hyperparameters
[
'
validation_split
'
]
:
if
batch_generator
.
num_test_batches
:
#import ipdb ; ipdb.set_trace()
val_loss
=
fit_info
[
'
val_loss
'
][
-
1
]
if
min_val_loss
is
None
or
(
...
...
@@ -609,162 +625,6 @@ class Class1LigandomePredictor(object):
fit_info
[
"
num_points
"
]
=
len
(
labels
)
self
.
fit_info
.
append
(
dict
(
fit_info
))
@classmethod
def
train_and_test_generators
(
cls
,
x_dict
,
y_list
,
batch_size
,
validation_mask
,
experiment_names
):
points
=
len
(
y_list
[
0
])
train_x_dict
=
{}
test_x_dict
=
{}
for
(
key
,
value
)
in
x_dict
.
items
():
train_x_dict
[
key
]
=
value
[
~
validation_mask
]
test_x_dict
[
key
]
=
value
[
validation_mask
]
train_y_list
=
[]
test_y_list
=
[]
for
value
in
y_list
:
train_y_list
.
append
(
value
[
~
validation_mask
])
test_y_list
.
append
(
value
[
validation_mask
])
train_generator
=
cls
.
batch_generator
(
x_dict
=
train_x_dict
,
y_list
=
train_y_list
,
batch_size
=
batch_size
,
experiment_names
=
experiment_names
[
~
validation_mask
])
test_generator
=
cls
.
batch_generator
(
x_dict
=
test_x_dict
,
y_list
=
test_y_list
,
batch_size
=
batch_size
,
experiment_names
=
experiment_names
[
validation_mask
])
train_batches
=
next
(
train_generator
)
test_batches
=
next
(
test_generator
)
return
(
train_generator
,
train_batches
,
test_generator
,
test_batches
)
@staticmethod
def
batch_generator
(
x_dict
,
y_list
,
batch_size
,
experiment_names
,
affinity_fraction_for_mass_spec_batches
=
0.5
):
# Each batch should have a mix of:
# - random negative peptides
# - affinity measurements (binder + non-binder)
# - multiallelic mass spec
start
=
time
.
time
()
df
=
pandas
.
DataFrame
({
"
experiment
"
:
experiment_names
})
df
[
"
unused
"
]
=
True
df
[
"
mass_spec_label
"
]
=
y_list
[
1
]
assert
set
(
df
.
loc
[
~
df
.
experiment
.
isnull
()].
mass_spec_label
.
unique
())
==
{
0.0
,
1.0
},
df
.
loc
[
~
df
.
experiment
.
isnull
()].
mass_spec_label
.
unique
()
hit_rate
=
df
.
loc
[
~
df
.
experiment
.
isnull
()].
mass_spec_label
.
mean
()
affinities_per_batch
=
int
(
affinity_fraction_for_mass_spec_batches
*
batch_size
)
mass_spec_per_batch
=
batch_size
-
affinities_per_batch
hits_per_mass_spec_batch
=
int
(
hit_rate
*
mass_spec_per_batch
)
decoys_per_mass_spec_batch
=
(
mass_spec_per_batch
-
hits_per_mass_spec_batch
)
print
(
"
affinity count
"
,
affinities_per_batch
)
print
(
"
mass_spec count
"
,
mass_spec_per_batch
,
hits_per_mass_spec_batch
,
decoys_per_mass_spec_batch
)
# Mixed mass spec / affinity batches_generator
experiments
=
df
.
experiment
.
unique
()
batch_indices
=
[]
batch_descriptions
=
[]
for
experiment
in
experiments
:
if
experiment
is
None
:
continue
while
True
:
experiment_df
=
df
.
loc
[
df
.
unused
&
(
df
.
experiment
==
experiment
)]
if
len
(
experiment_df
)
==
0
:
break
affinities_df
=
df
.
loc
[
df
.
unused
&
df
.
experiment
.
isnull
()]
affinities_for_this_batch
=
min
(
affinities_per_batch
,
len
(
affinities_df
))
mass_spec_for_this_batch
=
(
batch_size
-
affinities_for_this_batch
)
if
len
(
experiment_df
)
<
mass_spec_for_this_batch
:
mass_spec_for_this_batch
=
len
(
experiment_df
)
affinities_for_this_batch
=
(
batch_size
-
mass_spec_for_this_batch
)
if
affinities_for_this_batch
<
len
(
affinities_df
):
# For mass spec, we only do whole batches_generator, since it's
# unclear how our pairwise loss would interact with
# a smaller batch.
break
mass_spec_labels
=
y_list
[
1
][
experiment_df
.
index
.
values
]
assert
((
mass_spec_labels
==
0
)
|
(
mass_spec_labels
==
1
)).
all
(),
mass_spec_labels
to_use_list
=
[]
# sample hits
to_use
=
experiment_df
.
sample
(
n
=
hits_per_mass_spec_batch
,
weights
=
experiment_df
.
mass_spec_label
+
1e-10
,
replace
=
False
)
to_use_list
.
append
(
to_use
.
index
.
values
)
# sample decoys
to_use
=
experiment_df
.
loc
[
~
experiment_df
.
index
.
isin
(
to_use
.
index
)
].
sample
(
n
=
decoys_per_mass_spec_batch
,
weights
=
(
1
-
experiment_df
.
mass_spec_label
)
+
1e-10
,
replace
=
False
)
to_use_list
.
append
(
to_use
.
index
.
values
)
# sample affinities
to_use
=
affinities_df
.
sample
(
n
=
affinities_for_this_batch
,
replace
=
False
)
to_use_list
.
append
(
to_use
.
index
.
values
)
to_use_indices
=
numpy
.
concatenate
(
to_use_list
)
df
.
loc
[
to_use_indices
,
"
unused
"
]
=
False
batch_indices
.
append
(
to_use_indices
)
batch_descriptions
.
append
(
"
multiallelic-mass-spec
"
)
# Affinities-only batches_generator
affinities_df
=
df
.
loc
[
df
.
unused
&
df
.
experiment
.
isnull
()]
while
len
(
affinities_df
)
>
0
:
if
len
(
affinities_df
)
<=
batch_size
:
to_use
=
affinities_df
else
:
to_use
=
affinities_df
.
sample
(
n
=
batch_size
,
replace
=
False
)
df
.
loc
[
to_use
.
index
,
"
unused
"
]
=
False
batch_indices
.
append
(
to_use
.
index
)
affinities_df
=
df
.
loc
[
df
.
unused
&
df
.
experiment
.
isnull
()]
batch_descriptions
.
append
(
"
affinities-only
"
)
numpy
.
random
.
shuffle
(
batch_indices
)
print
(
"
Planning %d batches_generator took
"
%
len
(
batch_indices
),
time
.
time
()
-
start
,
"
sec
"
)
print
(
"
remaining unused:
"
)
print
(
df
.
loc
[
df
.
unused
].
experiment
.
fillna
(
"
[affinity]
"
).
value_counts
())
print
(
"
batch descriptions
"
)
print
(
pandas
.
Series
(
batch_descriptions
).
value_counts
())
#import ipdb ; ipdb.set_trace()
yield
len
(
batch_indices
)
for
indices
in
batch_indices
:
x_dict_batch
=
{}
for
(
key
,
value
)
in
x_dict
.
items
():
x_dict_batch
[
key
]
=
value
[
indices
]
y_list_batch
=
[]
for
value
in
y_list
:
y_list_batch
.
append
(
value
[
indices
])
yield
(
x_dict_batch
,
y_list_batch
)
#import ipdb ; ipdb.set_trace()
#yield None
def
predict
(
self
,
peptides
,
...
...
This diff is collapsed.
Click to expand it.
test/test_
multiallelic_mass_spec_
batch_generator.py
→
test/test_batch_generator.py
+
1
−
2
View file @
63482a3f
import
pandas
import
numpy
from
mhcflurry.
multiallelic_mass_spec_
batch_generator
import
(
from
mhcflurry.batch_generator
import
(
MultiallelicMassSpecBatchGenerator
)
from
numpy.testing
import
assert_equal
...
...
@@ -56,7 +56,6 @@ def test_basic():
for
((
kind
,
batch_num
),
batch_df
)
in
df
.
groupby
([
"
kind
"
,
"
batch
"
]):
if
not
batch_df
.
affinities_mask
.
all
():
print
(
batch_df
)
# Test each batch has at most one multiallelic ms experiment.
assert_equal
(
batch_df
.
loc
[
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment