Project: Patrick Skillman-Lawrence / mhc_rank
Commit: e3e6f5eb, authored 9 years ago by Alex Rubinsteyn
Commit message:
pulling together more bits of isolated training scripts into helpers file
Parent commit: 70af16e2
Changes: 2 files, 144 additions and 84 deletions

  experiments/train-models-with-synthetic-pretraining.py   41 additions, 52 deletions
  experiments/training_helpers.py                          103 additions, 32 deletions
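Both diffs below revolve around one idea: measured ("actual") binding data and synthetic (imputed) data are stacked into a single training set, with per-sample weights so the synthetic points count for less. For orientation, here is a minimal, self-contained numpy sketch of that combination step; the toy arrays, sizes, and counts are made up for illustration and are not taken from the commit.

    import numpy as np

    # Toy stand-ins for the encoded peptide data (illustrative only).
    X_original = np.random.randint(0, 20, size=(5, 9))   # 5 measured 9-mers, index-encoded
    Y_original = np.random.uniform(0.0, 1.0, size=5)     # rescaled affinities in [0, 1]
    X_synth = np.random.randint(0, 20, size=(8, 9))      # 8 synthetic 9-mers
    Y_synth = np.random.uniform(0.0, 1.0, size=8)

    original_sample_weights = np.ones(5)
    # In the commit, synthetic weights are 1.0 / Counts_synth, where Counts_synth
    # comes from encode_peptide_to_affinity_dict; here we just fake some counts.
    Counts_synth = np.random.randint(1, 4, size=8)
    synthetic_sample_weights = 1.0 / Counts_synth

    # The same stacking the helper performs before model.fit(...).
    X_combined = np.vstack([X_original, X_synth])
    Y_combined = np.concatenate([Y_original, Y_synth])
    combined_weights = np.concatenate([original_sample_weights, synthetic_sample_weights])

    assert X_combined.shape[0] == len(Y_combined) == len(combined_weights)
    assert 0.0 <= Y_combined.min() and Y_combined.max() <= 1.0
    print(X_combined.shape, Y_combined.shape, combined_weights.shape)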
experiments/train-models-with-synthetic-pretraining.py  (41 additions, 52 deletions)

@@ -29,6 +29,7 @@ from mhcflurry.data import (
 from arg_parsing import parse_int_list, parse_float_list
 from dataset_paths import PETERS2009_CSV_PATH
 from common import load_csv_binding_data_as_dict
+from training_helpers import create_and_evaluate_model_with_synthetic_data

 parser = argparse.ArgumentParser()
...
@@ -159,6 +160,25 @@ def data_augmentation(
     return aucs, f1s, n_originals


+def rescale_ic50(ic50, max_ic50):
+    log_ic50 = np.log(ic50) / np.log(args.max_ic50)
+    return max(0.0, min(1.0, 1.0 - log_ic50))
+
+
+def load_synthetic_data(csv_path, max_ic50):
+    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(csv_path)
+    return {
+        allele: {
+            peptide: rescale_ic50(ic50, max_ic50=max_ic50)
+            for (peptide, ic50) in allele_dict.items()
+        }
+        for (allele, allele_dict) in synthetic_allele_to_peptide_to_ic50_dict.items()
+    }
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     print(args)
...
@@ -169,70 +189,39 @@ if __name__ == "__main__":
         max_ic50=args.max_ic50,
         only_human=False)
-    print("Loading synthetic data from %s" % args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
-        args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_y_dict = {
-        allele: {
-            peptide: max(0.0, min(1.0, 1.0 - np.log(ic50) / np.log(args.max_ic50)))
-            for (peptide, ic50) in allele_dict.items()
-        }
-        for (allele, allele_dict) in synthetic_allele_to_peptide_to_ic50_dict.items()
-    }
+    synthetic_affinities = load_synthetic_data(
+        csv_path=args.synthetic_data_csv,
+        max_ic50=args.max_ic50)
     combined_allele_set = set(allele_datasets.keys()).union(
-        synthetic_allele_to_peptide_to_y_dict.keys())
+        synthetic_affinities.keys())
     combined_allele_list = list(sorted(combined_allele_set))
     for allele in combined_allele_list:
-        actual_dataset = allele_datasets[allele]
-        X_actual = actual_dataset.X_index
-        weights_actual = actual_dataset.weights
-        Y_actual = actual_dataset.Y
-        synthetic_dict = synthetic_allele_to_peptide_to_y_dict[allele]
-        _, _, C_synth, X_synth, _, Y_synth = encode_peptide_to_affinity_dict(synthetic_dict)
-        n_actual_samples, n_actual_dims = X_actual.shape
-        n_synth_samples, n_synth_dims = X_synth.shape
-        assert n_actual_dims == n_synth_dims, \
-            "Mismatch between # of actual dims %d and synthetic dims %d" % (
-                n_actual_dims, n_synth_dims)
-        print("-- Using %d actual samples and %d synthetic samples for %s" % (
-            n_actual_samples, n_synth_samples, allele))
-        X = np.vstack([X_actual, X_synth])
-        print("-- X.shape = %s, dtype = %s" % (X.shape, X.dtype))
-        n_samples = n_actual_samples + n_synth_samples
-        assert X.shape[0] == n_samples, \
-            "Expected %d samples but got data array with shape %s" % (
-                n_actual_samples + n_synth_samples, X.shape)
-        Y = np.concatenate([Y_actual, Y_synth])
-        print("-- Y.shape = %s, dtype = %s" % (Y.shape, Y.dtype))
-        assert Y.min() >= 0, \
-            "Y should not contain negative numbers! Y.min() = %f" % (Y.min(),)
-        assert Y.max() <= 1, \
-            "Y should have max value 1.0, got Y.max() = %f" % (Y.max(),)
-        weights_synth = 1.0 / C_synth
-        weights = np.concatenate([weights_actual, weights_synth])
-        assert len(weights) == n_samples
-        print("-- weights.shape = %s, dtype = %s" % (weights.shape, weights.dtype))
+        synthetic_allele_dict = synthetic_affinities[allele]
+        (_, _, Counts_synth, X_synth, _, Y_synth) = \
+            encode_peptide_to_affinity_dict(synthetic_allele_dict)
+        synthetic_sample_weights = 1.0 / Counts_synth
         scores = {}
         for dropout in args.dropouts:
             for embedding_dim_size in args.embedding_dim_sizes:
                 for hidden_layer_size in args.hidden_layer_sizes:
                     params = (
-                        ("dropout", dropout),
+                        ("dropout_probability", dropout),
                         ("embedding_dim_size", embedding_dim_size),
                         ("hidden_layer_size", hidden_layer_size),
                     )
-                    tau, auc, f1 = evaluate_model(**dict(params))
+                    tau, auc, f1 = create_and_evaluate_model_with_synthetic_data(
+                        X_original=allele_datasets[allele].X_index,
+                        Y_original=allele_datasets[allele].Y,
+                        X_synth=X_synth,
+                        Y_synth=Y_synth,
+                        original_sample_weights=allele_datasets[allele].weights,
+                        synthetic_sample_weights=synthetic_sample_weights,
+                        n_training_epochs=150,
+                        max_ic50=args.max_ic50,
+                        **dict(params))
                     scores[params] = (tau, auc, f1)
                     print("%s => tau=%f, AUC=%f, F1=%f" % (
                         params,
...
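The new rescale_ic50 and load_synthetic_data helpers above map raw IC50 values onto the [0, 1] regression target via 1 - log(ic50)/log(max_ic50), clipped to [0, 1]. Below is a standalone sketch of that transform with a small worked example; note that the sketch uses its max_ic50 argument directly, whereas the committed rescale_ic50 reads the module-level args.max_ic50 inside the helper.

    import numpy as np

    def rescale_ic50(ic50, max_ic50):
        # 1 - log(ic50) / log(max_ic50), clipped into [0, 1]:
        # strong binders (small IC50) map near 1, weak binders near 0.
        log_ic50 = np.log(ic50) / np.log(max_ic50)
        return max(0.0, min(1.0, 1.0 - log_ic50))

    # Worked example with max_ic50 = 50000 nM, the default used elsewhere in this commit:
    for ic50 in (1.0, 500.0, 50000.0, 100000.0):
        print(ic50, "->", round(rescale_ic50(ic50, max_ic50=50000.0), 3))
    # 1.0 -> 1.0, 500.0 -> ~0.426, 50000.0 -> 0.0, 100000.0 -> 0.0 (clipped)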
experiments/training_helpers.py  (103 additions, 32 deletions)

@@ -128,29 +128,51 @@ def train_model_and_return_scores(
     return (accuracy, auc, f1_score)


-def create_and_evaluate_model_with_synthetic_data(
-        X,
-        Y,
-        weights_synth,
-        weights_actual,
-        dropout,
-        embedding_dim_size,
-        hidden_layer_size):
-    model = mhcflurry.feedforward.make_embedding_network(
-        peptide_length=9,
-        embedding_input_dim=20,
-        embedding_output_dim=4,
-        layer_sizes=[4],
-        activation="tanh",
-        init="lecun_uniform",
-        loss="mse",
-        output_activation="sigmoid",
-        dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001)
-    total_synth_weights = weights_synth.sum()
-    total_actual_weights = weights_actual.sum()
+def train_model_with_synthetic_data(
+        model,
+        n_training_epochs,
+        max_ic50,
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights,
+        synthetic_sample_weights):
+    total_synth_weights = synthetic_sample_weights.sum()
+    total_original_weights = original_sample_weights.sum()
+    print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % (
+        np.mean(Y_original),
+        np.mean(Y_synth),
+        np.mean(original_sample_weights),
+        np.mean(synthetic_sample_weights)))
+    combined_weights = np.concatenate([
+        original_sample_weights,
+        synthetic_sample_weights])
+    n_actual_samples, n_actual_dims = X_original.shape
+    n_synth_samples, n_synth_dims = X_synth.shape
+    assert n_actual_dims == n_synth_dims, \
+        "Mismatch between # of actual dims %d and synthetic dims %d" % (
+            n_actual_dims, n_synth_dims)
+    X_combined = np.vstack([X_original, X_synth])
+    n_combined_samples = n_actual_samples + n_synth_samples
+    assert X_combined.shape[0] == n_combined_samples, \
+        "Expected %d samples but got data array with shape %s" % (
+            n_actual_samples + n_synth_samples, X_combined.shape)
+    Y_combined = np.concatenate([Y_original, Y_synth])
+    assert Y_combined.min() >= 0, \
+        "Y should not contain negative numbers! Y.min() = %f" % (Y_combined.min(),)
+    assert Y_combined.max() <= 1, \
+        "Y should have max value 1.0, got Y.max() = %f" % (Y_combined.max(),)
+    combined_weights = np.concatenate([
+        original_sample_weights,
+        synthetic_sample_weights])
+    assert len(combined_weights) == n_combined_samples
     for epoch in range(n_training_epochs):
         # weights for synthetic points can be shrunk as:
         # ~ 1 / (1+epoch)**2
...
@@ -160,20 +182,69 @@ def create_and_evaluate_model_with_synthetic_data(
         # if the contribution of synthetic samples is less than a
         # thousandth of the actual data, then stop using it
         synth_contribution = total_synth_weights * decay_factor
-        if synth_contribution < total_actual_weights / 1000:
+        if synth_contribution < total_original_weights / 1000:
             print("Epoch %d, using only actual data" % (epoch + 1,))
-            model.fit(X_actual, Y_actual, sample_weight=weights_actual, nb_epoch=1)
+            model.fit(
+                X_original,
+                Y_original,
+                sample_weight=original_sample_weights,
+                nb_epoch=1,
+                verbose=0)
         else:
             print("Epoch %d, synth decay factor = %f" % (epoch + 1, decay_factor))
-            weights[n_actual_samples:] = weights_synth * decay_factor
-            model.fit(X, Y, sample_weight=weights, nb_epoch=1)
-        Y_pred = model.predict(X_actual)
-        print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean())
+            combined_weights[n_actual_samples:] = (
+                synthetic_sample_weights * decay_factor)
+            model.fit(
+                X_combined,
+                Y_combined,
+                sample_weight=combined_weights,
+                nb_epoch=1,
+                verbose=0)
+        Y_pred = model.predict(X_original)
+        training_mse = ((Y_original - Y_pred) ** 2).mean()
+        print("-- Epoch %d/%d Training MSE %0.4f" % (
+            epoch + 1,
+            n_training_epochs,
+            training_mse))
+
+
+def create_and_evaluate_model_with_synthetic_data(
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights=None,
+        synthetic_sample_weights=None,
+        n_training_epochs=150,
+        embedding_dim_size=16,
+        hidden_layer_size=50,
+        dropout_probability=0.0,
+        max_ic50=50000.0):
+    if original_sample_weights is None:
+        original_sample_weights = np.ones(len(X_original), dtype=float)
+    if synthetic_sample_weights is None:
+        synthetic_sample_weights = np.ones(len(X_synth), dtype=float)
+    model = mhcflurry.feedforward.make_embedding_network(
+        peptide_length=9,
+        embedding_input_dim=20,
+        embedding_output_dim=embedding_dim_size,
+        layer_sizes=[hidden_layer_size],
+        activation="tanh",
+        init="lecun_uniform",
+        loss="mse",
+        output_activation="sigmoid",
+        dropout_probability=dropout_probability,
+        optimizer=None,
+        learning_rate=0.001)
+    train_model_with_synthetic_data(
+        model=model,
+        n_training_epochs=n_training_epochs,
+        max_ic50=max_ic50,
+        X_original=X_original,
+        Y_original=Y_original,
+        X_synth=X_synth,
+        Y_synth=Y_synth,
+        original_sample_weights=original_sample_weights,
+        synthetic_sample_weights=synthetic_sample_weights)
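train_model_with_synthetic_data shrinks the synthetic sample weights each epoch (the comment in the diff suggests roughly 1 / (1 + epoch)**2) and stops using the synthetic data once its total weighted contribution falls below one thousandth of the actual data's. The exact decay_factor expression sits in collapsed context, so the sketch below assumes the 1 / (1 + epoch)**2 form; the cutoff logic mirrors the committed code, while the helper name and the example totals are made up for illustration.

    def synthetic_decay_schedule(total_original_weights,
                                 total_synth_weights,
                                 n_training_epochs=150):
        """Yield, per epoch, the decay factor and whether synthetic data is still used."""
        for epoch in range(n_training_epochs):
            # Assumed decay form, per the comment in training_helpers.py.
            decay_factor = 1.0 / (1.0 + epoch) ** 2
            synth_contribution = total_synth_weights * decay_factor
            use_synth = synth_contribution >= total_original_weights / 1000
            yield epoch + 1, decay_factor, use_synth

    # Example: original weights summing to 10000 vs. synthetic weights summing to 30000.
    for epoch, decay, use_synth in synthetic_decay_schedule(10000.0, 30000.0, 60):
        if not use_synth:
            print("synthetic data dropped from epoch %d on (decay factor %.5f)"
                  % (epoch, decay))
            break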