Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
80d8c161
Commit
80d8c161
authored
9 years ago
by
Alex Rubinsteyn
Browse files
Options
Downloads
Patches
Plain Diff
run each fold of data over every possible predictor, should be faster
parent
8308c397
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
experiments/matrix-completion-hyperparameter-search.py
+127
-112
127 additions, 112 deletions
experiments/matrix-completion-hyperparameter-search.py
with
127 additions
and
112 deletions
experiments/matrix-completion-hyperparameter-search.py
+
127
−
112
View file @
80d8c161
...
...
@@ -129,6 +129,11 @@ parser.add_argument(
action
=
"
store_true
"
,
help
=
"
When expanding 8mers into 9mers use
'
X
'
instead of all possible AAs
"
)
parser
.
add_argument
(
"
--pretrain-only-9mers
"
,
default
=
False
,
action
=
"
store_true
"
,
help
=
"
Exclude expanded samples from e.g. 8mers or 10mers
"
)
parser
.
add_argument
(
"
--alleles
"
,
...
...
@@ -136,6 +141,8 @@ parser.add_argument(
type
=
parse_string_list
,
help
=
"
Only evaluate these alleles (by default use all in the dataset)
"
)
parser
.
add_argument
(
"
--min-samples-per-cv-fold
"
,
type
=
int
,
default
=
5
)
def
print_length_distribution
(
peptides
,
values
,
name
):
print
(
"
Length distribution for %s (n=%d):
"
%
(
name
,
len
(
peptides
)))
...
...
@@ -251,7 +258,6 @@ if __name__ == "__main__":
activation
)
if
key
not
in
predictors
:
layer_sizes
=
[
hidden_layer_size1
]
if
hidden_layer_size2
:
layer_sizes
.
append
(
hidden_layer_size2
)
...
...
@@ -268,12 +274,21 @@ if __name__ == "__main__":
)
weights
=
predictor
.
model
.
get_weights
()
opt_state
=
predictor
.
model
.
optimizer
.
get_state
()
yield
(
key
,
predictor
,
weights
,
opt_state
)
# want at least 5 samples in each fold of CV
# to make meaningful estimates of accuracy
min_samples_per_cv_fold
=
5
min_samples_per_allele
=
min_samples_per_cv_fold
*
args
.
n_folds
predictors
[
key
]
=
predictor
initial_weights
[
key
]
=
weights
initial_optimizer_states
[
key
]
=
opt_state
else
:
predictor
=
predictors
[
key
]
weights
=
initial_weights
[
key
]
opt_state
=
initial_optimizer_states
[
key
]
# reset the predictor to its initial condition
predictor
.
model
.
set_weights
([
w
.
copy
()
for
w
in
weights
])
predictor
.
model
.
optimizer
.
set_state
([
s
.
copy
()
for
s
in
opt_state
])
yield
(
key
,
predictor
)
min_samples_per_allele
=
args
.
min_samples_per_cv_fold
*
args
.
n_folds
for
allele_idx
,
allele
in
enumerate
(
allele_list
):
if
args
.
alleles
and
allele
not
in
args
.
alleles
:
# if user specifies an allele list then skip anything which isn't included
...
...
@@ -320,110 +335,111 @@ if __name__ == "__main__":
observed_peptides
=
[
peptide_list
[
i
]
for
i
in
observed_row_indices
]
print_length_distribution
(
observed_peptides
,
observed_values
,
name
=
allele
)
for
key
,
predictor
,
predictor_weights
,
optimizer_state
in
generate_predictors
():
# k-fold cross validation stratified to keep an even balance of low
# vs. high-binding peptides in each fold
for
fold_idx
,
(
train_indices
,
test_indices
)
in
enumerate
(
StratifiedKFold
(
y
=
below_500nM
,
n_folds
=
args
.
n_folds
,
shuffle
=
True
)):
predictor
.
model
.
set_weights
([
w
.
copy
()
for
w
in
predictor_weights
])
predictor
.
model
.
optimizer
.
set_state
([
s
.
copy
()
for
s
in
optimizer_state
])
train_peptides_fold
=
[
observed_peptides
[
i
]
for
i
in
train_indices
]
train_values_fold
=
[
observed_values
[
i
]
for
i
in
train_indices
]
train_dict_fold
=
{
k
:
v
for
(
k
,
v
)
in
zip
(
train_peptides_fold
,
train_values_fold
)}
if
args
.
verbose
:
print
(
"
Training peptides for CV fold %d/%d:
"
%
(
fold_idx
+
1
,
args
.
n_folds
))
for
p
in
train_peptides_fold
:
aff
=
train_dict_fold
[
p
]
print
(
"
-- %s: %f (%f nM)
"
%
(
p
,
aff
,
args
.
max_ic50
**
(
1
-
aff
)))
test_peptides
=
[
observed_peptides
[
i
]
for
i
in
test_indices
]
test_values
=
[
observed_values
[
i
]
for
i
in
test_indices
]
test_dict
=
{
k
:
v
for
(
k
,
v
)
in
zip
(
test_peptides
,
test_values
)}
# drop the test peptides from the full matrix and then
# run completion on it to get synthesized affinities
pMHC_affinities_fold_incomplete
=
pMHC_affinity_matrix
.
copy
()
test_indices_among_all_rows
=
observed_row_indices
[
test_indices
]
pMHC_affinities_fold_incomplete
[
test_indices_among_all_rows
,
allele_idx
]
=
np
.
nan
# drop peptides with fewer than 2 measurements and alleles
# with fewer than 10 peptides
pMHC_affinities_fold_pruned
,
pruned_peptides_fold
,
pruned_alleles_fold
=
\
prune_data
(
X
=
pMHC_affinities_fold_incomplete
,
peptide_list
=
peptide_list
,
allele_list
=
allele_list
,
min_observations_per_peptide
=
2
,
min_observations_per_allele
=
min_samples_per_cv_fold
)
pMHC_affinities_fold_pruned_imputed
=
imputer
.
complete
(
pMHC_affinities_fold_pruned
)
if
not
args
.
unknown_amino_acids
:
# if we're expanding 8mers to >100 9mers
# then the pretraining dataset becomes
# unmanageably large, so let's only use 9mers for pre-training
# In the future: we'll use an neural network that
# takes multiple input lengths
ninemer_mask
=
np
.
array
([
len
(
p
)
==
9
for
p
in
pruned_peptides_fold
])
ninemer_indices
=
np
.
where
(
ninemer_mask
)[
0
]
pMHC_affinities_fold_pruned_imputed
=
\
pMHC_affinities_fold_pruned_imputed
[
ninemer_indices
]
pruned_peptides_fold
=
[
pruned_peptides_fold
[
i
]
for
i
in
ninemer_indices
]
# since we may have dropped some of the columns in the completed
# pMHC matrix need to find which column corresponds to the
# same name we're currently predicting
if
allele
in
pruned_alleles_fold
:
pMHC_fold_allele_idx
=
pruned_alleles_fold
.
index
(
allele
)
pMHC_allele_values
=
pMHC_affinities_fold_pruned_imputed
[
:,
pMHC_fold_allele_idx
]
if
pMHC_allele_values
.
std
()
==
0
:
print
(
"
WARNING: unexpected zero-variance in pretraining affinity values
"
)
else
:
print
(
"
WARNING: Couldn
'
t find allele %s in pre-training matrix
"
%
allele
)
column
=
pMHC_affinities_fold_incomplete
[:,
allele_idx
]
column_mean
=
np
.
nanmean
(
column
)
print
(
"
-- Setting pre-training target value to nanmean = %f
"
%
column_mean
)
pMHC_allele_values
=
np
.
ones
(
len
(
pruned_peptides_fold
))
*
column_mean
assert
False
assert
len
(
pruned_peptides_fold
)
==
len
(
pMHC_allele_values
)
# dictionary mapping peptides to imputed affinity values
pretrain_dict
=
{
pi
:
yi
for
(
pi
,
yi
)
in
zip
(
pruned_peptides_fold
,
pMHC_allele_values
)
}
X_pretrain
,
pretrain_row_peptides
,
pretrain_counts
=
\
fixed_length_index_encoding
(
peptides
=
pruned_peptides_fold
,
desired_length
=
9
,
allow_unknown_amino_acids
=
args
.
unknown_amino_acids
)
pretrain_sample_weights
=
1.0
/
np
.
array
(
pretrain_counts
)
Y_pretrain
=
np
.
array
(
[
pretrain_dict
[
p
]
for
p
in
pretrain_row_peptides
])
X_train
,
training_row_peptides
,
training_counts
=
\
fixed_length_index_encoding
(
peptides
=
train_peptides_fold
,
desired_length
=
9
,
allow_unknown_amino_acids
=
args
.
unknown_amino_acids
)
training_sample_weights
=
1.0
/
np
.
array
(
training_counts
)
Y_train
=
np
.
array
([
train_dict_fold
[
p
]
for
p
in
training_row_peptides
])
# k-fold cross validation stratified to keep an even balance of low
# vs. high-binding peptides in each fold
for
fold_idx
,
(
train_indices
,
test_indices
)
in
enumerate
(
StratifiedKFold
(
y
=
below_500nM
,
n_folds
=
args
.
n_folds
,
shuffle
=
True
)):
train_peptides_fold
=
[
observed_peptides
[
i
]
for
i
in
train_indices
]
train_values_fold
=
[
observed_values
[
i
]
for
i
in
train_indices
]
train_dict_fold
=
{
k
:
v
for
(
k
,
v
)
in
zip
(
train_peptides_fold
,
train_values_fold
)}
"""
if args.verbose:
print(
"
Training peptides for CV fold %d/%d:
"
% (
fold_idx + 1,
args.n_folds))
for p in train_peptides_fold:
aff = train_dict_fold[p]
print(
"
-- %s: %f (%f nM)
"
% (
p,
aff,
args.max_ic50 ** (1 - aff)))
"""
test_peptides
=
[
observed_peptides
[
i
]
for
i
in
test_indices
]
test_values
=
[
observed_values
[
i
]
for
i
in
test_indices
]
test_dict
=
{
k
:
v
for
(
k
,
v
)
in
zip
(
test_peptides
,
test_values
)}
# drop the test peptides from the full matrix and then
# run completion on it to get synthesized affinities
pMHC_affinities_fold_incomplete
=
pMHC_affinity_matrix
.
copy
()
test_indices_among_all_rows
=
observed_row_indices
[
test_indices
]
pMHC_affinities_fold_incomplete
[
test_indices_among_all_rows
,
allele_idx
]
=
np
.
nan
# drop peptides with fewer than 2 measurements and alleles
# with fewer than 10 peptides
pMHC_affinities_fold_pruned
,
pruned_peptides_fold
,
pruned_alleles_fold
=
\
prune_data
(
X
=
pMHC_affinities_fold_incomplete
,
peptide_list
=
peptide_list
,
allele_list
=
allele_list
,
min_observations_per_peptide
=
2
,
min_observations_per_allele
=
args
.
min_samples_per_cv_fold
)
pMHC_affinities_fold_pruned_imputed
=
imputer
.
complete
(
pMHC_affinities_fold_pruned
)
if
args
.
pretrain_only_9mers
:
# if we're expanding 8mers to >100 9mers
# then the pretraining dataset becomes
# unmanageably large, so let's only use 9mers for pre-training
# In the future: we'll use an neural network that
# takes multiple input lengths
ninemer_mask
=
np
.
array
([
len
(
p
)
==
9
for
p
in
pruned_peptides_fold
])
ninemer_indices
=
np
.
where
(
ninemer_mask
)[
0
]
pMHC_affinities_fold_pruned_imputed
=
\
pMHC_affinities_fold_pruned_imputed
[
ninemer_indices
]
pruned_peptides_fold
=
[
pruned_peptides_fold
[
i
]
for
i
in
ninemer_indices
]
# since we may have dropped some of the columns in the completed
# pMHC matrix need to find which column corresponds to the
# same name we're currently predicting
if
allele
in
pruned_alleles_fold
:
pMHC_fold_allele_idx
=
pruned_alleles_fold
.
index
(
allele
)
pMHC_allele_values
=
pMHC_affinities_fold_pruned_imputed
[
:,
pMHC_fold_allele_idx
]
if
pMHC_allele_values
.
std
()
==
0
:
print
(
"
WARNING: unexpected zero-variance in pretraining affinity values
"
)
else
:
print
(
"
WARNING: Couldn
'
t find allele %s in pre-training matrix
"
%
allele
)
column
=
pMHC_affinities_fold_incomplete
[:,
allele_idx
]
column_mean
=
np
.
nanmean
(
column
)
print
(
"
-- Setting pre-training target value to nanmean = %f
"
%
column_mean
)
pMHC_allele_values
=
np
.
ones
(
len
(
pruned_peptides_fold
))
*
column_mean
assert
False
assert
len
(
pruned_peptides_fold
)
==
len
(
pMHC_allele_values
)
# dictionary mapping peptides to imputed affinity values
pretrain_dict
=
{
pi
:
yi
for
(
pi
,
yi
)
in
zip
(
pruned_peptides_fold
,
pMHC_allele_values
)
}
X_pretrain
,
pretrain_row_peptides
,
pretrain_counts
=
\
fixed_length_index_encoding
(
peptides
=
pruned_peptides_fold
,
desired_length
=
9
,
allow_unknown_amino_acids
=
args
.
unknown_amino_acids
)
pretrain_sample_weights
=
1.0
/
np
.
array
(
pretrain_counts
)
Y_pretrain
=
np
.
array
(
[
pretrain_dict
[
p
]
for
p
in
pretrain_row_peptides
])
X_train
,
training_row_peptides
,
training_counts
=
\
fixed_length_index_encoding
(
peptides
=
train_peptides_fold
,
desired_length
=
9
,
allow_unknown_amino_acids
=
args
.
unknown_amino_acids
)
training_sample_weights
=
1.0
/
np
.
array
(
training_counts
)
Y_train
=
np
.
array
([
train_dict_fold
[
p
]
for
p
in
training_row_peptides
])
for
key
,
predictor
in
generate_predictors
():
print
(
"
\n
-----
"
)
print
(
(
"
Training CV fold %d/%d for %s (# peptides = %d, # samples=%d)
"
...
...
@@ -435,7 +451,6 @@ if __name__ == "__main__":
len
(
X_train
),
key
))
print
(
"
-----
"
)
predictor
.
fit
(
X
=
X_train
,
Y
=
Y_train
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment