Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
328d2c12
Commit
328d2c12
authored
8 years ago
by
Alex Rubinsteyn
Browse files
Options
Downloads
Patches
Plain Diff
lots of tweaks to dataset size sensitivity script
parent
d122c743
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
mhcflurry/package_metadata.py
+1
-1
1 addition, 1 deletion
mhcflurry/package_metadata.py
script/mhcflurry-dataset-size-sensitivity.py
+152
-54
152 additions, 54 deletions
script/mhcflurry-dataset-size-sensitivity.py
setup.py
+1
-0
1 addition, 0 deletions
setup.py
with
154 additions
and
55 deletions
mhcflurry/package_metadata.py
+
1
−
1
View file @
328d2c12
__version__
=
"
0.0.
4
"
__version__
=
"
0.0.
5
"
This diff is collapsed.
Click to expand it.
script/mhcflurry-dataset-size-sensitivity.py
+
152
−
54
View file @
328d2c12
...
...
@@ -23,9 +23,12 @@ from collections import OrderedDict
import
numpy
as
np
import
pandas
as
pd
import
scipy
import
sklearn
import
sklearn.metrics
import
seaborn
from
matplotlib
import
pyplot
from
mhcflurry.dataset
import
Dataset
from
mhcflurry.args
import
(
...
...
@@ -55,23 +58,34 @@ parser.add_argument(
parser
.
add_argument
(
"
--number-dataset-sizes
"
,
type
=
int
,
default
=
1
0
)
default
=
2
0
)
parser
.
add_argument
(
"
--min-training-samples
"
,
type
=
int
,
default
=
20
)
default
=
5
)
parser
.
add_argument
(
"
--max-training-samples
"
,
type
=
int
,
default
=
20
00
)
default
=
5
00
)
parser
.
add_argument
(
"
--sample-censored-affinities
"
,
default
=
False
,
action
=
"
store_true
"
)
parser
.
add_argument
(
"
--load-existing-data
"
,
action
=
"
store_true
"
,
default
=
False
,
help
=
"
Skip computationally intensive data generation by loading from existing CSV
"
)
parser
.
add_argument
(
"
--seaborn-lmplot
"
,
action
=
"
store_true
"
,
default
=
False
)
"""
parser.add_argument(
"
--remove-similar-peptides-from-test-data
"
,
...
...
@@ -86,6 +100,10 @@ add_imputation_argument_to_parser(parser, default="mice")
add_hyperparameter_arguments_to_parser
(
parser
)
add_training_arguments_to_parser
(
parser
)
def
stratify_by_binder_label
(
row
):
return
row
[
"
affinity
"
]
<=
500
def
subsample_performance
(
dataset
,
allele
,
...
...
@@ -111,20 +129,21 @@ def subsample_performance(
(
"
idx
"
,
[]),
(
"
auc
"
,
[]),
(
"
f1
"
,
[]),
(
"
tau
"
,
[]),
(
"
impute
"
,
[]),
])
sample_sizes
=
np
.
exp
(
np
.
linspace
(
np
.
log
(
min_training_samples
),
np
.
log
(
max_training_samples
),
num
=
n_subsample_sizes
)).
astype
(
int
)
+
1
log_min_samples
=
np
.
log
(
min_training_samples
)
log_max_samples
=
np
.
log
(
max_training_samples
)
log_sample_sizes
=
np
.
linspace
(
log_min_samples
,
log_max_samples
,
num
=
n_subsample_sizes
)
sample_sizes
=
np
.
exp
(
log_sample_sizes
).
astype
(
int
)
+
1
for
i
,
n_train
in
enumerate
(
sample_sizes
):
i
=
0
for
n_train
in
sample_sizes
:
for
_
in
range
(
n_repeats_per_size
):
i
+=
1
if
imputer
is
None
:
dataset_train
,
dataset_test
=
dataset_allele
.
random_split
(
n_train
,
stratify_fn
=
lambda
row
:
row
[
"
affinity
"
]
<=
500
)
n_train
,
stratify_fn
=
stratify_by_binder_label
)
dataset_imputed
=
None
else
:
dataset_train
,
dataset_imputed
,
dataset_test
=
\
...
...
@@ -132,12 +151,12 @@ def subsample_performance(
allele
=
allele
,
n_training_samples
=
n_train
,
imputation_method
=
imputer
,
min_observations_per_peptide
=
2
,
min_observations_per_peptide
=
3
,
min_observations_per_allele
=
1
,
stratify_fn
=
lambda
row
:
row
[
"
affinity
"
]
<=
500
)
stratify_fn
=
stratify_by_binder_label
)
print
(
"
=== #%d/%d: Training model for %s with sample_size = %d/%d
"
%
(
i
+
1
,
len
(
sample_sizes
),
len
(
sample_sizes
)
*
n_repeats_per_size
,
allele
,
n_train
,
n_total
))
...
...
@@ -165,14 +184,15 @@ def subsample_performance(
n_random_negative_samples
=
n_random_negative_samples
)
pred_ic50
=
model
.
predict
(
dataset_test
.
peptides
)
auc
=
sklearn
.
metrics
.
roc_auc_score
(
true_label
,
-
pred_ic50
)
tau
=
scipy
.
stats
.
kendalltau
(
true_ic50
,
pred_ic50
).
correlation
f1
=
sklearn
.
metrics
.
f1_score
(
true_label
,
pred_ic50
<=
500
)
print
(
"
%s (impute=none) n=%d, AUC=%0.4f, F1=%0.4f
"
%
(
allele
,
n_train
,
auc
,
f1
))
print
(
"
%s (impute=none) n=%d, AUC=%0.4f, F1=%0.4f
, tau=%0.4f
"
%
(
allele
,
n_train
,
auc
,
f1
,
tau
))
results
[
"
num_samples
"
].
append
(
n_train
)
results
[
"
idx
"
].
append
(
i
)
results
[
"
auc
"
].
append
(
auc
)
results
[
"
f1
"
].
append
(
f1
)
results
[
"
tau
"
].
append
(
tau
)
results
[
"
impute
"
].
append
(
False
)
if
dataset_imputed
is
None
:
...
...
@@ -188,13 +208,16 @@ def subsample_performance(
sample_censored_affinities
=
sample_censored_affinities
)
pred_ic50
=
model
.
predict
(
dataset_test
.
peptides
)
auc
=
sklearn
.
metrics
.
roc_auc_score
(
true_label
,
-
pred_ic50
)
tau
=
scipy
.
stats
.
kendalltau
(
true_ic50
,
pred_ic50
).
correlation
f1
=
sklearn
.
metrics
.
f1_score
(
true_label
,
pred_ic50
<=
500
)
print
(
"
%s (impute=%s) n=%d, AUC=%0.4f, F1=%0.4f
"
%
(
allele
,
imputer
,
n_train
,
auc
,
f1
))
print
(
"
%s (impute=%s) n=%d, AUC=%0.4f, F1=%0.4f
, tau=%0.4f
"
%
(
allele
,
imputer
,
n_train
,
auc
,
f1
,
tau
))
results
[
"
num_samples
"
].
append
(
n_train
)
results
[
"
idx
"
].
append
(
i
)
results
[
"
auc
"
].
append
(
auc
)
results
[
"
f1
"
].
append
(
f1
)
results
[
"
tau
"
].
append
(
tau
)
results
[
"
impute
"
].
append
(
True
)
return
pd
.
DataFrame
(
results
)
...
...
@@ -202,41 +225,116 @@ def subsample_performance(
if
__name__
==
"
__main__
"
:
args
=
parser
.
parse_args
()
dataset
=
Dataset
.
from_csv
(
args
.
training_csv
)
imputer
=
imputer_from_args
(
args
)
def
make_model
():
return
predictor_from_args
(
allele_name
=
args
.
allele
,
args
=
args
)
results_df
=
subsample_performance
(
dataset
=
dataset
,
allele
=
args
.
allele
,
imputer
=
imputer
,
model_fn
=
make_model
,
n_repeats_per_size
=
args
.
repeat
,
n_training_epochs
=
args
.
training_epochs
,
batch_size
=
args
.
batch_size
,
min_training_samples
=
args
.
min_training_samples
,
max_training_samples
=
args
.
max_training_samples
,
n_subsample_sizes
=
args
.
number_dataset_sizes
,
n_random_negative_samples
=
args
.
random_negative_samples
,
sample_censored_affinities
=
args
.
sample_censored_affinities
)
print
(
results_df
)
for
name
in
[
"
auc
"
,
"
f1
"
]:
seaborn
.
violinplot
(
data
=
results_df
,
x
=
"
num_samples
"
,
y
=
name
,
hue
=
"
impute
"
)
seaborn
.
plt
.
xlabel
(
"
# samples (subset of %s)
"
%
args
.
allele
)
seaborn
.
plt
.
ylabel
(
name
)
filename
=
"
%s-%s-vs-nsamples-hidden-%s-activation-%s-impute-%s.png
"
%
(
base_filename
=
\
"
%s-vs-nsamples-hidden-%s-activation-%s-impute-%s-epochs-%d-embedding-%d
"
%
(
args
.
allele
,
name
,
args
.
hidden_layer_size
,
args
.
activation
,
args
.
imputation_method
)
seaborn
.
plt
.
savefig
(
filename
)
args
.
imputation_method
,
args
.
training_epochs
,
args
.
embedding_size
)
csv_filename
=
base_filename
+
"
.csv
"
if
args
.
load_existing_data
:
results_df
=
pd
.
read_csv
(
csv_filename
)
else
:
dataset
=
Dataset
.
from_csv
(
args
.
training_csv
)
imputer
=
imputer_from_args
(
args
)
def
make_model
():
return
predictor_from_args
(
allele_name
=
args
.
allele
,
args
=
args
)
results_df
=
subsample_performance
(
dataset
=
dataset
,
allele
=
args
.
allele
,
imputer
=
imputer
,
model_fn
=
make_model
,
n_repeats_per_size
=
args
.
repeat
,
n_training_epochs
=
args
.
training_epochs
,
batch_size
=
args
.
batch_size
,
min_training_samples
=
args
.
min_training_samples
,
max_training_samples
=
args
.
max_training_samples
,
n_subsample_sizes
=
args
.
number_dataset_sizes
,
n_random_negative_samples
=
args
.
random_negative_samples
,
sample_censored_affinities
=
args
.
sample_censored_affinities
)
print
(
"
Writing CSV to %s
"
%
csv_filename
)
results_df
.
to_csv
(
csv_filename
)
print
(
results_df
)
metrics
=
[
"
auc
"
,
"
f1
"
,
"
tau
"
]
if
args
.
seaborn_lmplot
:
for
score_name
in
metrics
:
seaborn
.
lmplot
(
data
=
results_df
,
x
=
"
num_samples
"
,
y
=
score_name
,
hue
=
"
impute
"
,
legend
=
True
,
fit_reg
=
True
,
logx
=
True
,
truncate
=
True
,
x_jitter
=
0.5
,
y_jitter
=
0.01
)
seaborn
.
plt
.
xlim
(
max
(
-
1
,
results_df
[
"
num_samples
"
].
min
()
-
2
),
results_df
[
"
num_samples
"
].
max
()
+
50
,
)
seaborn
.
plt
.
ylim
(
0
,
1
)
seaborn
.
plt
.
xlabel
(
"
# samples (subset of %s)
"
%
args
.
allele
)
seaborn
.
plt
.
ylabel
(
score_name
)
image_filename
=
"
%s-%s.png
"
%
(
base_filename
,
score_name
)
print
(
"
Writing image to %s
"
%
image_filename
)
seaborn
.
plt
.
savefig
(
image_filename
)
else
:
titles
=
{
"
tau
"
:
"
Kendall
'
s $
\\
tau$
"
,
"
auc
"
:
"
AUC
"
,
"
f1
"
:
"
$F_1$ score
"
}
pyplot
.
figure
(
figsize
=
(
6
,
4
))
seaborn
.
set_style
(
"
whitegrid
"
)
for
(
j
,
score_name
)
in
enumerate
(
metrics
):
ax
=
pyplot
.
subplot2grid
((
1
,
3
),
(
0
,
j
))
groups
=
results_df
.
groupby
([
"
num_samples
"
,
"
impute
"
])
groups_score
=
groups
[
score_name
].
mean
().
to_frame
().
reset_index
()
groups_score
[
"
std_error
"
]
=
\
groups
[
score_name
].
std
().
to_frame
().
reset_index
()[
score_name
]
for
impute
in
[
True
,
False
]:
sub
=
groups_score
[
groups_score
.
impute
==
impute
]
color
=
seaborn
.
get_color_cycle
()[
0
]
if
impute
else
seaborn
.
get_color_cycle
()[
1
]
pyplot
.
errorbar
(
x
=
sub
.
num_samples
.
values
,
y
=
sub
[
score_name
].
values
,
yerr
=
sub
.
std_error
.
values
,
label
=
(
"
with
"
if
impute
else
"
without
"
)
+
"
imputation
"
,
color
=
color
)
pyplot
.
xlabel
(
"
Training set size
"
)
pyplot
.
xscale
(
"
log
"
)
pyplot
.
title
(
titles
[
score_name
])
if
score_name
==
"
auc
"
:
pyplot
.
ylim
(
ymin
=
0.5
,
ymax
=
1.0
)
if
score_name
==
"
f1
"
:
pyplot
.
ylim
(
ymin
=
0
,
ymax
=
1
)
if
score_name
==
"
tau
"
:
pyplot
.
ylim
(
ymin
=
0
,
ymax
=
0.6
)
pyplot
.
yticks
(
np
.
arange
(
0
,
0.61
,
0.15
))
if
j
==
0
:
pyplot
.
legend
(
loc
=
(
-
0.1
,
0.05
),
fancybox
=
True
,
frameon
=
True
,
fontsize
=
"
small
"
)
pyplot
.
tight_layout
()
# Put the legend out of the figure
# pyplot.legend(
# bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fancybox=True, frameon=True)
image_filename
=
base_filename
+
"
.png
"
print
(
"
Writing PNG to %s
"
%
image_filename
)
pyplot
.
savefig
(
image_filename
)
This diff is collapsed.
Click to expand it.
setup.py
+
1
−
0
View file @
328d2c12
...
...
@@ -80,5 +80,6 @@ if __name__ == '__main__':
"
script/mhcflurry-train-class1-allele-specific-models.py
"
,
"
script/mhcflurry-predict-class1.py
"
,
"
script/mhcflurry-class1-web-server.py
"
,
"
script/mhcflurry-dataset-size-sensitivity.py
"
,
],
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment