Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
dfe14dd9
Commit
dfe14dd9
authored
5 years ago
by
Tim O'Donnell
Browse files
Options
Downloads
Patches
Plain Diff
Delete BatchGenerator class
parent
c64db777
Loading
Loading
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
mhcflurry/batch_generator.py
+0
-336
0 additions, 336 deletions
mhcflurry/batch_generator.py
test/test_batch_generator.py
+0
-120
0 additions, 120 deletions
test/test_batch_generator.py
with
0 additions
and
456 deletions
mhcflurry/batch_generator.py
deleted
100644 → 0
+
0
−
336
View file @
c64db777
import
collections
import
numpy
import
pandas
from
.hyperparameters
import
HyperparameterDefaults
class BatchPlan(object):
    """
    A precomputed assignment of data points to mini-batches.

    Points are partitioned into equivalence classes. A batch is described by
    the sequence of equivalence classes it draws from; the concrete points
    are chosen at iteration time after shuffling each class, so every epoch
    sees the same per-batch class mix but different points.
    """

    def __init__(
            self,
            equivalence_classes,
            batch_compositions,
            equivalence_class_labels=None):
        """
        Parameters
        ----------
        equivalence_classes : list of numpy.array
            One array per equivalence class, holding indices into the data
            points. These arrays are shuffled IN PLACE by
            batch_indices_generator().
        batch_compositions : list of sequences of int
            One entry per batch; each entry lists, per batch slot, the index
            of the equivalence class (into `equivalence_classes`) that slot
            draws from.
        equivalence_class_labels : list of string, optional
            Used only for summary().
        """
        self.equivalence_classes = equivalence_classes  # indices into points
        # indices into equivalence_classes
        self.batch_compositions = batch_compositions

        # For every slot of every batch, precompute which position within
        # its equivalence class the slot reads. Positions are handed out
        # consecutively per class, so no position is used twice.
        indices_into_equivalence_classes = []
        next_index = collections.defaultdict(int)
        for batch_composition in batch_compositions:
            indices = []
            for equivalence_class in batch_composition:
                indices.append(next_index[equivalence_class])
                next_index[equivalence_class] += 1
            indices_into_equivalence_classes.append(
                numpy.array(indices, dtype=int))
        self.indices_into_equivalence_classes = (
            indices_into_equivalence_classes)

        self.equivalence_class_labels = (
            numpy.array(equivalence_class_labels)
            if equivalence_class_labels is not None else None)

    def batch_indices_generator(self, epochs=1):
        """
        Yield, batch by batch, the list of point indices making up a batch.

        At the start of each epoch every equivalence class array is shuffled
        in place and the order of the batches is shuffled as well (both via
        numpy.random).

        Parameters
        ----------
        epochs : int
            Number of passes over all batches.
        """
        batch_nums = numpy.arange(len(self.batch_compositions))
        for epoch in range(epochs):
            # Shuffle equivalence classes
            for arr in self.equivalence_classes:
                numpy.random.shuffle(arr)
            numpy.random.shuffle(batch_nums)
            for batch_num in batch_nums:
                class_indices = self.batch_compositions[batch_num]
                indices_into_classes = (
                    self.indices_into_equivalence_classes[batch_num])
                batch_indices = [
                    self.equivalence_classes[i][j]
                    for (i, j) in zip(class_indices, indices_into_classes)
                ]
                yield batch_indices

    def batches_generator(self, x_dict, y_list, epochs=1):
        """
        Yield (batch_x_dict, batch_y_list) pairs for each planned batch.

        Parameters
        ----------
        x_dict : dict of string -> numpy.array
            Inputs; each value is indexed with the batch's point indices.
        y_list : list of numpy.array
            Outputs; each is indexed with the batch's point indices.
        epochs : int

        Raises
        ------
        AssertionError
            If any selected input or output value is NaN.
        """
        for indices in self.batch_indices_generator(epochs=epochs):
            batch_x_dict = {}
            for (item, value) in x_dict.items():
                selected = value[indices]
                assert not numpy.isnan(selected).any(), (item, value)
                batch_x_dict[item] = selected
            batch_y_list = []
            for value in y_list:
                selected = value[indices]
                assert not numpy.isnan(selected).any(), (
                    len(batch_y_list), value)
                batch_y_list.append(selected)
            yield (batch_x_dict, batch_y_list)

    def summary(self, indent=0):
        """
        Return a multi-line string listing the class makeup of the batches.

        Batches 0-5 are always listed. When there are more than nine
        batches, the middle ones are replaced by a "..." line and only the
        last three are listed after it.

        Parameters
        ----------
        indent : int
            Number of indentation units prepended to every line.
        """
        lines = []
        equivalence_class_labels = self.equivalence_class_labels
        if equivalence_class_labels is None:
            equivalence_class_labels = numpy.array([
                "class-%d" % i for i in range(len(self.equivalence_classes))
            ])

        i = 0
        while i < len(self.batch_compositions):
            composition = self.batch_compositions[i]
            label_counts = pandas.Series(
                equivalence_class_labels[composition]).value_counts()
            # Series.items() replaces Series.iteritems(), which was removed
            # in pandas 2.0.
            lines.append(
                ("Batch %5d: " % i) + ", ".join(
                    "{key}[{value}]".format(key=key, value=value)
                    for (key, value) in label_counts.items()))
            if i == 5 and len(self.batch_compositions) > i + 3:
                lines.append("...")
                # Together with the increment below this jumps to the
                # third-from-last batch.
                i = len(self.batch_compositions) - i + 1
            i += 1

        indent_spaces = "    " * indent
        return "\n".join([indent_spaces + str(line) for line in lines])

    @property
    def num_batches(self):
        """Number of batches in the plan."""
        return len(self.batch_compositions)

    @property
    def batch_size(self):
        """Size of the largest batch in the plan."""
        return max(len(b) for b in self.batch_compositions)
class BatchGenerator(object):
    """
    Base class and registry for batch generators.

    Concrete generators register themselves under a name via
    register_implementation() and are instantiated through create().
    Subclasses implement plan(), which is expected to populate
    `train_batch_plan` and `test_batch_plan`.
    """

    # Maps implementation name -> generator class.
    implementations = {}

    hyperparameter_defaults = HyperparameterDefaults(
        batch_generator="simple",
        batch_generator_validation_split=0.1,
        batch_generator_batch_size=128)

    @staticmethod
    def register_implementation(name, klass):
        """
        Register `klass` under `name` and fold its hyperparameter defaults
        into the shared BatchGenerator.hyperparameter_defaults.
        """
        BatchGenerator.implementations[name] = klass
        combined_defaults = BatchGenerator.hyperparameter_defaults.extend(
            klass.hyperparameter_defaults)
        BatchGenerator.hyperparameter_defaults = combined_defaults

    @staticmethod
    def create(hyperparameters):
        """
        Instantiate the implementation named by
        hyperparameters['batch_generator'].
        """
        implementation = BatchGenerator.implementations[
            hyperparameters['batch_generator']]
        return implementation(hyperparameters)

    def __init__(self, hyperparameters):
        # with_defaults() is defined on HyperparameterDefaults; presumably
        # it fills in unspecified values -- verify against that class.
        defaults = BatchGenerator.hyperparameter_defaults
        self.hyperparameters = defaults.with_defaults(hyperparameters)
        self.train_batch_plan = None
        self.test_batch_plan = None

    def plan(self, *args, **kwargs):
        """Build the train and test batch plans. Subclasses override."""
        raise NotImplementedError()

    def summary(self):
        """Return the text summaries of the train and test batch plans."""
        train_text = self.train_batch_plan.summary(indent=1)
        test_text = self.test_batch_plan.summary(indent=1)
        return "Train:\n" + train_text + "\n***\nTest: " + test_text

    def get_train_and_test_generators(self, x_dict, y_list, epochs=1):
        """
        Return a (train_generator, test_generator) pair built from the two
        batch plans over the same x_dict / y_list.
        """
        train_generator = self.train_batch_plan.batches_generator(
            x_dict, y_list, epochs=epochs)
        test_generator = self.test_batch_plan.batches_generator(
            x_dict, y_list, epochs=epochs)
        return (train_generator, test_generator)

    @property
    def num_train_batches(self):
        """Number of batches in the train plan."""
        return self.train_batch_plan.num_batches

    @property
    def num_test_batches(self):
        """Number of batches in the test plan."""
        return self.test_batch_plan.num_batches
class SimpleBatchGenerator(BatchGenerator):
    """
    Batch generator that treats all points as one equivalence class:
    a random validation subset is held out and both subsets are cut into
    fixed-size batches.
    """

    hyperparameter_defaults = HyperparameterDefaults()

    def __init__(self, hyperparameters):
        BatchGenerator.__init__(self, hyperparameters)

    def plan(self, num, validation_weights=None, **kwargs):
        """
        Split `num` points into train and validation sets and plan batches.

        Parameters
        ----------
        num : int
            Total number of points.
        validation_weights : sequence of float, optional
            Per-point weights, normalized here to sum to 1 and used as the
            sampling probabilities for the validation set.
        **kwargs
            Accepted and ignored.
        """
        if validation_weights is not None:
            validation_weights = numpy.array(
                validation_weights, copy=True, dtype=float)
            numpy.testing.assert_equal(len(validation_weights), num)
            validation_weights /= validation_weights.sum()

        split = self.hyperparameters['batch_generator_validation_split']
        num_validation = int((split) * num)
        validation_items = numpy.random.choice(
            num, num_validation, replace=False, p=validation_weights)

        # Sampling is without replacement, so there must be no duplicates.
        held_out = set(validation_items)
        numpy.testing.assert_equal(len(validation_items), len(held_out))

        training_items = numpy.array(
            [point for point in range(num) if point not in held_out],
            dtype=int)
        numpy.testing.assert_equal(
            len(validation_items) + len(training_items), num)

        def simple_compositions(
                num,
                num_per_batch=self.hyperparameters[
                    'batch_generator_batch_size']):
            # Every slot draws from equivalence class 0, so a composition
            # is an all-zeros array of the batch's length.
            full_batch = numpy.zeros(num_per_batch, dtype=int)
            (num_full, leftover) = divmod(num, num_per_batch)
            result = [full_batch] * num_full
            if leftover != 0:
                result.append(numpy.zeros(leftover, dtype=int))
            numpy.testing.assert_equal(
                sum(len(composition) for composition in result), num)
            return result

        self.train_batch_plan = BatchPlan(
            equivalence_classes=[training_items],
            batch_compositions=simple_compositions(len(training_items)))
        self.test_batch_plan = BatchPlan(
            equivalence_classes=[validation_items],
            batch_compositions=simple_compositions(len(validation_items)))


BatchGenerator.register_implementation("simple", SimpleBatchGenerator)
class MultiallelicMassSpecBatchGenerator(BatchGenerator):
    """
    Batch generator mixing affinity measurements with multiallelic mass
    spec data such that each batch contains points from at most one mass
    spec experiment, preferentially paired with affinity measurements for
    that experiment's alleles.
    """

    # Hyperparameters for batch generation for the presentation predictor.
    hyperparameter_defaults = HyperparameterDefaults(
        batch_generator_affinity_fraction=0.5)

    def __init__(self, hyperparameters):
        BatchGenerator.__init__(self, hyperparameters)
        self.equivalence_classes = None
        self.batch_indices = None

    @staticmethod
    def plan_from_dataframe(df, hyperparameters):
        """
        Build a BatchPlan from a dataframe of points.

        Parameters
        ----------
        df : pandas.DataFrame
            Must have columns `alleles` (tuple per row), `is_affinity`,
            `experiment_name` and `is_binder`. Helper columns
            (`first_allele`, `equivalence_key`, `equivalence_class`, `idx`)
            are added to the passed dataframe in place.
        hyperparameters : dict-like
            Reads `batch_generator_affinity_fraction` and
            `batch_generator_batch_size`.

        Returns
        -------
        BatchPlan
        """
        affinity_fraction = hyperparameters[
            "batch_generator_affinity_fraction"]
        batch_size = hyperparameters["batch_generator_batch_size"]

        # Points are equivalent when they share the allele (affinity data)
        # or experiment (mass spec data) and the binder/nonbinder status.
        df["first_allele"] = df.alleles.str.get(0)
        df["equivalence_key"] = numpy.where(
            df.is_affinity,
            df.first_allele,
            df.experiment_name,
        ) + " " + df.is_binder.map({True: "binder", False: "nonbinder"})
        (df["equivalence_class"], equivalence_class_labels) = (
            df.equivalence_key.factorize())
        df["idx"] = df.index
        df = df.sample(frac=1.0)

        affinities_per_batch = int(affinity_fraction * batch_size)

        remaining_affinities_df = df.loc[df.is_affinity].copy()

        # First do mixed affinity / multiallelic ms batches_generator.
        batch_compositions = []
        for (experiment, experiment_df) in df.loc[~df.is_affinity].groupby(
                "experiment_name"):
            # Unpacking enforces that an experiment has a single allele set.
            (experiment_alleles,) = experiment_df.alleles.unique()

            # Whenever possible we try to use affinities with the same
            # alleles as the mass spec experiment. assign() builds a new
            # frame, avoiding assignment into an iloc slice left over from
            # the previous experiment.
            remaining_affinities_df = remaining_affinities_df.assign(
                matches_allele=remaining_affinities_df.first_allele.isin(
                    experiment_alleles)
            ).sort_values("matches_allele", ascending=False)

            while len(experiment_df) > 0:
                affinities_for_this_batch = min(
                    affinities_per_batch, len(remaining_affinities_df))
                mass_spec_for_this_batch = (
                    batch_size - affinities_for_this_batch)
                if len(experiment_df) < mass_spec_for_this_batch:
                    # Not enough mass spec left; top up with affinities.
                    mass_spec_for_this_batch = len(experiment_df)
                    affinities_for_this_batch = (
                        batch_size - mass_spec_for_this_batch)

                batch_composition = []

                # take mass spec
                to_use = experiment_df.iloc[:mass_spec_for_this_batch]
                experiment_df = experiment_df.iloc[mass_spec_for_this_batch:]
                batch_composition.extend(to_use.equivalence_class.values)

                # take affinities (iloc silently yields fewer rows when
                # not enough remain)
                to_use = remaining_affinities_df.iloc[
                    :affinities_for_this_batch
                ]
                remaining_affinities_df = remaining_affinities_df.iloc[
                    affinities_for_this_batch:
                ]
                batch_composition.extend(to_use.equivalence_class.values)
                batch_compositions.append(batch_composition)

        # Affinities-only batches
        while len(remaining_affinities_df) > 0:
            to_use = remaining_affinities_df.iloc[:batch_size]
            remaining_affinities_df = remaining_affinities_df.iloc[
                batch_size:]
            batch_compositions.append(to_use.equivalence_class.values)

        class_to_indices = df.groupby("equivalence_class").idx.unique()
        equivalence_classes = [
            class_to_indices[i]
            for i in range(len(class_to_indices))
        ]
        return BatchPlan(
            equivalence_classes=equivalence_classes,
            batch_compositions=batch_compositions,
            equivalence_class_labels=equivalence_class_labels)

    def plan(
            self,
            affinities_mask,
            experiment_names,
            alleles_matrix,
            is_binder,
            validation_weights=None,
            num=None):
        """
        Split the points into train and validation sets and build a batch
        plan for each.

        Parameters
        ----------
        affinities_mask : sequence of bool
            True for affinity measurements, False for mass spec points.
        experiment_names : sequence
            Experiment name per point; overwritten with None for affinity
            points.
        alleles_matrix : sequence of sequences
            Alleles per point; None entries in a row are dropped.
        is_binder : sequence of bool
        validation_weights : sequence of float, optional
            Normalized here to sum to 1 and used as sampling probabilities
            for the validation set.
        num : int, optional
            If given, must equal the number of points.
        """
        # numpy.asarray copies only when required. numpy.array(copy=False)
        # raises ValueError under NumPy 2 whenever a copy is needed (e.g.
        # for plain list input).
        affinities_mask = numpy.asarray(affinities_mask, dtype=bool)
        experiment_names = numpy.asarray(experiment_names)
        alleles_matrix = numpy.asarray(alleles_matrix)
        is_binder = numpy.asarray(is_binder, dtype=bool)
        n = len(experiment_names)
        if num is not None:
            numpy.testing.assert_equal(num, n)

        numpy.testing.assert_equal(len(affinities_mask), n)
        numpy.testing.assert_equal(len(alleles_matrix), n)
        numpy.testing.assert_equal(len(is_binder), n)

        if validation_weights is not None:
            validation_weights = numpy.array(
                validation_weights, copy=True, dtype=float)
            numpy.testing.assert_equal(len(validation_weights), n)
            validation_weights /= validation_weights.sum()

        validation_items = numpy.random.choice(
            n,
            int((self.hyperparameters[
                'batch_generator_validation_split']) * n),
            replace=False,
            p=validation_weights)
        validation_mask = numpy.zeros(n, dtype=bool)
        validation_mask[validation_items] = True

        df = pandas.DataFrame({
            "is_affinity": affinities_mask,
            "experiment_name": experiment_names,
            "is_binder": is_binder,
            "is_validation": validation_mask,
            # `!= None` is deliberate: it is an elementwise comparison on
            # the numpy row, which `is not None` would not be.
            "alleles": [
                tuple(row[row != None])  # noqa: E711
                for row in alleles_matrix
            ],
        })
        df.loc[df.is_affinity, "experiment_name"] = None

        train_df = df.loc[~df.is_validation].copy()
        test_df = df.loc[df.is_validation].copy()

        self.train_batch_plan = self.plan_from_dataframe(
            train_df, self.hyperparameters)
        self.test_batch_plan = self.plan_from_dataframe(
            test_df, self.hyperparameters)


BatchGenerator.register_implementation(
    "multiallelic_mass_spec", MultiallelicMassSpecBatchGenerator)
This diff is collapsed.
Click to expand it.
test/test_batch_generator.py
deleted
100644 → 0
+
0
−
120
View file @
c64db777
import
logging
logging
.
getLogger
(
'
matplotlib
'
).
disabled
=
True
logging
.
getLogger
(
'
tensorflow
'
).
disabled
=
True
import
os
import
collections
import
time
import
cProfile
import
pstats
import
pandas
import
numpy
from
mhcflurry.downloads
import
get_path
from
mhcflurry.batch_generator
import
(
MultiallelicMassSpecBatchGenerator
)
from
mhcflurry.regression_target
import
to_ic50
from
mhcflurry
import
Class1AffinityPredictor
from
numpy.testing
import
assert_equal
from
nose.tools
import
assert_greater
,
assert_less
def data_path(name):
    '''
    Return the path of `name` inside the "data" directory that sits next
    to this file. `name` should be given relative to that directory.
    '''
    this_directory = os.path.dirname(__file__)
    return os.path.join(this_directory, "data", name)
def test_basic_repeat():
    """Run test_basic 100 times; its batch planning is randomized."""
    repetitions = 100
    for _attempt in range(repetitions):
        test_basic()
def test_basic():
    """
    Plan batches for a small mixed affinity / mass spec dataset and check
    batch counts, the train/test split sizes, that no batch mixes mass spec
    experiments, and that affinity points are preferentially co-located
    with the experiment sharing their allele.
    """
    batch_size = 7
    validation_split = 0.2
    hyperparameters = dict(
        batch_generator_validation_split=validation_split,
        batch_generator_batch_size=batch_size,
        batch_generator_affinity_fraction=0.5)
    planner = MultiallelicMassSpecBatchGenerator(
        hyperparameters=hyperparameters)

    exp1_alleles = ["HLA-A*03:01", "HLA-B*07:02", "HLA-C*02:01"]
    exp2_alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*02:01"]

    # 14 affinity points followed by 2 exp1 and 4 exp2 mass spec points.
    affinity_alleles_rows = [["HLA-C*07:01", None, None]] * 10 + [
        ["HLA-A*02:01", None, None],
        ["HLA-A*02:01", None, None],
        ["HLA-A*03:01", None, None],
        ["HLA-A*03:01", None, None],
    ]
    mass_spec_alleles_rows = [
        exp1_alleles,
        exp1_alleles,
        exp2_alleles,
        exp2_alleles,
        exp2_alleles,
        exp2_alleles,
    ]
    base_df = pandas.DataFrame(dict(
        affinities_mask=([True] * 14) + ([False] * 6),
        experiment_names=([None] * 14) + (["exp1"] * 2) + (["exp2"] * 4),
        alleles_matrix=affinity_alleles_rows + mass_spec_alleles_rows,
        is_binder=[False, True] * 5 + [
            True, True, False, False, True,
            False, True, False, True, False,
        ]))
    # Four copies of the base data, re-indexed 0..N-1.
    df = pandas.concat([base_df] * 4, ignore_index=True)
    total = len(df)

    planner.plan(**df.to_dict("list"))

    assert_equal(
        planner.num_train_batches,
        numpy.ceil(total * (1 - validation_split) / batch_size))
    assert_equal(
        planner.num_test_batches,
        numpy.ceil(total * validation_split / batch_size))

    (train_iter, test_iter) = planner.get_train_and_test_generators(
        x_dict={"idx": numpy.arange(total)},
        y_list=[])

    # Record for every point which split and batch it ended up in.
    for (kind, batches) in (("train", train_iter), ("test", test_iter)):
        for (batch_number, (x_item, y_item)) in enumerate(batches):
            idx = x_item["idx"]
            df.loc[idx, "kind"] = kind
            df.loc[idx, "idx"] = idx
            df.loc[idx, "batch"] = batch_number
    df["idx"] = df.idx.astype(int)
    df["batch"] = df.batch.astype(int)

    kind_counts = df.kind.value_counts()
    assert_equal(kind_counts["test"], len(df) * validation_split)
    assert_equal(kind_counts["train"], len(df) * (1 - validation_split))

    experiment_allele_colocations = collections.defaultdict(int)
    for ((kind, batch_num), batch_df) in df.groupby(["kind", "batch"]):
        is_affinity = batch_df.affinities_mask
        if is_affinity.all():
            continue

        # Test each batch has at most one multiallelic ms experiment.
        names = batch_df.loc[~is_affinity].experiment_names.unique()
        assert_equal(len(names), 1)
        (experiment,) = names

        if not is_affinity.any():
            continue

        # Test experiments are matched to the correct affinity alleles.
        affinity_alleles = batch_df.loc[
            is_affinity
        ].alleles_matrix.str.get(0).values
        for allele in affinity_alleles:
            experiment_allele_colocations[(experiment, allele)] += 1

    assert_greater(
        experiment_allele_colocations[('exp1', 'HLA-A*03:01')],
        experiment_allele_colocations[('exp1', 'HLA-A*02:01')])
    assert_less(
        experiment_allele_colocations[('exp2', 'HLA-A*03:01')],
        experiment_allele_colocations[('exp2', 'HLA-A*02:01')])
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment