Commit 20e36c0a, authored 7 years ago by Tim O'Donnell
Evaluate accuracy of unselected predictors during model selection
Parent: 381b3667
Changes: 2 changed files, with 129 additions and 40 deletions

downloads-generation/models_class1/GENERATE.sh: 5 additions, 9 deletions
mhcflurry/select_allele_specific_models_command.py: 124 additions, 31 deletions
downloads-generation/models_class1/GENERATE.sh (+5, −9)
@@ -43,21 +43,17 @@ python ./write_validation_data.py \
 wc -l test.csv

-#mhcflurry-predict \
-#    test.csv \
-#    --prediction-column-prefix "mhcflurry_unselected_" \
-#    --models "$(mhcflurry-downloads path models_class1_unselected)/models" \
-#    --out test.csv
-#wc -l test.csv

 time mhcflurry-class1-select-allele-specific-models \
     --data test.csv \
     --models-dir "$(mhcflurry-downloads path models_class1_unselected)/models" \
     --out-models-dir models \
-    --scoring combined-all \
+    --scoring combined:mass-spec,mse,consensus \
     --consensus-num-peptides-per-length 10000 \
     --combined-min-models 8 \
-    --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50
+    --unselected-accuracy-scorer combined:mass-spec,mse \
+    --unselected-accuracy-percentile-threshold 95 \
+    --mass-spec-min-measurements 500 \
+    --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 1

 time mhcflurry-calibrate-percentile-ranks \
     --models-dir models \
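Note the new spec syntax: scorers are given as "combined:" followed by a comma-separated list of components, replacing the old combined-all keyword. A quick sketch of how such a spec decomposes (the actual parsing lives in select_allele_specific_models_command.py, below):

    spec = "combined:mass-spec,mse,consensus"
    if spec.startswith("combined:"):
        components = spec.split(":", 1)[1].split(",")
        print(components)  # ['mass-spec', 'mse', 'consensus']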
mhcflurry/select_allele_specific_models_command.py (+124, −31)
@@ -8,10 +8,11 @@ import sys
 import time
 import traceback
 import random
+from pprint import pprint

 import numpy
 import pandas
-from scipy.stats import kendalltau
+from scipy.stats import kendalltau, percentileofscore

 from mhcnames import normalize_allele_name
 import tqdm  # progress bar
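For reference, scipy.stats.percentileofscore, newly imported here, returns the percentage of values in a sample that fall at or below a given score. A tiny illustration with made-up numbers:

    from scipy.stats import percentileofscore

    scrambled = [0.10, 0.12, 0.15, 0.11, 0.13]  # scores from shuffled predictions
    real = 0.35                                 # score of the actual predictor
    print(percentileofscore(scrambled, real))   # 100.0: better than every scrambled score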
@@ -63,6 +64,19 @@ parser.add_argument(
     metavar="FILE.csv",
     help="Write predictions for validation data using unselected predictor to "
     "FILE.csv")
+parser.add_argument(
+    "--unselected-accuracy-scorer",
+    metavar="SCORER",
+    default="combined:mass-spec,mse")
+parser.add_argument(
+    "--unselected-accuracy-scorer-num-samples",
+    type=int,
+    default=1000)
+parser.add_argument(
+    "--unselected-accuracy-percentile-threshold",
+    type=float,
+    metavar="X",
+    default=95)
 parser.add_argument(
     "--allele",
     default=None,
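A rough sketch of how these new options surface on the parsed args object; the standalone parser below is an illustration, not the module's actual parser:

    import argparse

    # Hypothetical stand-in for the command's parser, showing only the new options.
    parser = argparse.ArgumentParser()
    parser.add_argument("--unselected-accuracy-scorer", metavar="SCORER",
                        default="combined:mass-spec,mse")
    parser.add_argument("--unselected-accuracy-scorer-num-samples", type=int, default=1000)
    parser.add_argument("--unselected-accuracy-percentile-threshold", type=float,
                        metavar="X", default=95)

    args = parser.parse_args([])  # no flags given: defaults apply
    print(args.unselected_accuracy_scorer)                # combined:mass-spec,mse
    print(args.unselected_accuracy_scorer_num_samples)    # 1000
    print(args.unselected_accuracy_percentile_threshold)  # 95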
@@ -81,17 +95,11 @@ parser.add_argument(
     default=1000,
     metavar="N",
     help="Max number of models to select per allele when using combined selector")
-parser.add_argument(
-    "--combined-weights",
-    type=int,
-    nargs=3,
-    default=[1, 1, 1],
-    help="Weights for combined predictor in order: mass-spec MSE consensus")
 parser.add_argument(
     "--mass-spec-min-measurements",
     type=int,
     metavar="N",
-    default=50,
+    default=1,
     help="Min number of measurements required for an allele to use mass-spec model "
     "selection")
 parser.add_argument(
@@ -110,7 +118,7 @@ parser.add_argument(
     "--mse-min-measurements",
     type=int,
     metavar="N",
-    default=50,
+    default=1,
     help="Min number of measurements required for an allele to use MSE model "
     "selection")
 parser.add_argument(
@@ -128,7 +136,6 @@ parser.add_argument(
 parser.add_argument(
     "--scoring",
     nargs="+",
-    choices=("combined-all", "mse", "mass-spec", "consensus"),
     default=["mse", "consensus"],
     help="Scoring procedures to use in order")
 parser.add_argument(
@@ -235,36 +242,42 @@ def run(argv=sys.argv[1:]):
     selector_to_model_selection_kwargs = {}

     def make_selector(scoring):
        if scoring in selectors:
            return selectors[scoring]

        start = time.time()
-        if scoring == "combined-all":
+        if scoring.startswith("combined:"):
            model_selection_kwargs = {
                'min_models': args.combined_min_models,
                'max_models': args.combined_max_models,
            }
-            selector = CombinedModelSelector([
-                make_selector("mass-spec")[0],
-                make_selector("mse")[0],
-                make_selector("consensus")[0],
-            ], weights=args.combined_weights)
+            component_selectors = []
+            for component_selector in scoring.split(":", 1)[1].split(","):
+                component_selectors.append(
+                    make_selector(component_selector)[0])
+            selector = CombinedModelSelector(component_selectors)
        elif scoring == "mse":
            model_selection_kwargs = {
                'min_models': args.mse_min_models,
                'max_models': args.mse_max_models,
            }
+            min_measurements = args.mse_min_measurements
            selector = MSEModelSelector(
                df=df.loc[~df.mass_spec],
                predictor=input_predictor,
-                min_measurements=args.mse_min_measurements)
+                min_measurements=min_measurements)
        elif scoring == "mass-spec":
            mass_spec_df = df.loc[df.mass_spec]
            model_selection_kwargs = {
                'min_models': args.mass_spec_min_models,
                'max_models': args.mass_spec_max_models,
            }
+            min_measurements = args.mass_spec_min_measurements
            selector = MassSpecModelSelector(
                df=mass_spec_df,
                predictor=input_predictor,
-                min_measurements=args.mass_spec_min_measurements)
+                min_measurements=min_measurements)
        elif scoring == "consensus":
            model_selection_kwargs = {
                'min_models': args.consensus_min_models,
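The combined selector is now built by parsing the spec string and recursing, with results cached in the selectors dict so each component is constructed once. A simplified sketch of that pattern, using placeholder tuples instead of the real selector classes (and dropping the (selector, kwargs) pairing the real make_selector returns):

    # A stripped-down sketch of the make_selector caching/recursion pattern.
    selectors = {}

    def make_selector(scoring):
        if scoring in selectors:
            return selectors[scoring]
        if scoring.startswith("combined:"):
            # "combined:mass-spec,mse" -> components ["mass-spec", "mse"]
            components = [
                make_selector(name) for name in scoring.split(":", 1)[1].split(",")]
            selector = ("combined", components)
        else:
            selector = ("single", scoring)
        selectors[scoring] = selector
        return selector

    print(make_selector("combined:mass-spec,mse,consensus"))
    # ('combined', [('single', 'mass-spec'), ('single', 'mse'), ('single', 'consensus')])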
@@ -281,10 +294,15 @@ def run(argv=sys.argv[1:]):
     for scoring in args.scoring:
         (selector, model_selection_kwargs) = make_selector(scoring)
         selectors[scoring] = selector
         selector_to_model_selection_kwargs[scoring] = model_selection_kwargs

+    unselected_accuracy_scorer = None
+    if args.unselected_accuracy_scorer:
+        unselected_accuracy_scorer = make_selector(args.unselected_accuracy_scorer)[0]
+        print("Using unselected accuracy scorer: %s" % unselected_accuracy_scorer)
+    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer

     print("Selectors for alleles:")
     allele_to_selector = {}
     allele_to_model_selection_kwargs = {}
@@ -301,7 +319,9 @@ def run(argv=sys.argv[1:]):
         allele_to_model_selection_kwargs[allele] = (
             selector_to_model_selection_kwargs[possible_selector])

+    GLOBAL_DATA["args"] = args
     GLOBAL_DATA["input_predictor"] = input_predictor
+    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer
     GLOBAL_DATA["allele_to_selector"] = allele_to_selector
     GLOBAL_DATA["allele_to_model_selection_kwargs"] = allele_to_model_selection_kwargs
@@ -329,16 +349,31 @@ def run(argv=sys.argv[1:]):
         alleles,
         chunksize=1)

+    unselected_summary = []
     model_selection_dfs = []
     for result in tqdm.tqdm(results, total=len(alleles)):
-        model_selection_dfs.append(result.metadata_dataframes['model_selection'])
-        result_predictor.merge_in_place([result])
-
-    model_selection_df = pandas.concat(model_selection_dfs, ignore_index=True)
-    model_selection_df["selector"] = model_selection_df.allele.map(allele_to_selector)
-    result_predictor.metadata_dataframes["model_selection"] = model_selection_df
+        pprint(result)
+
+        summary_dict = dict(result)
+        summary_dict["retained"] = result["selected"] is not None
+        del summary_dict["selected"]
+        unselected_summary.append(summary_dict)
+
+        if result['selected'] is not None:
+            model_selection_dfs.append(
+                result['selected'].metadata_dataframes['model_selection'])
+            result_predictor.merge_in_place([result['selected']])
+
+    if model_selection_dfs:
+        model_selection_df = pandas.concat(model_selection_dfs, ignore_index=True)
+        model_selection_df["selector"] = model_selection_df.allele.map(allele_to_selector)
+        result_predictor.metadata_dataframes["model_selection"] = (
+            model_selection_df)
+
+    result_predictor.metadata_dataframes["unselected_summary"] = (
+        pandas.DataFrame(unselected_summary))

     print("Done model selecting for %d alleles." % len(alleles))
     result_predictor.save(args.out_models_dir)
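Each worker result is flattened into a plain dict, with the selected predictor object swapped for a boolean retained flag before the rows become the unselected_summary metadata DataFrame. A small sketch of that bookkeeping, with made-up values:

    import pandas

    # Illustrative per-allele result dicts, shaped like the ones model_select returns
    # (the values here are made up).
    results = [
        {"allele": "HLA-A*02:01", "unselected_score": 0.61,
         "unselected_accuracy_score_percentile": 100.0, "selected": object()},
        {"allele": "HLA-B*07:02", "unselected_score": 0.12,
         "unselected_accuracy_score_percentile": 40.0, "selected": None},
    ]

    unselected_summary = []
    for result in results:
        summary_dict = dict(result)
        summary_dict["retained"] = result["selected"] is not None
        del summary_dict["selected"]  # drop the predictor object itself
        unselected_summary.append(summary_dict)

    print(pandas.DataFrame(unselected_summary))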
@@ -353,17 +388,75 @@ def run(argv=sys.argv[1:]):
     print("Predictor written to: %s" % args.out_models_dir)


+class ScrambledPredictor(object):
+    def __init__(self, predictor):
+        self.predictor = predictor
+        self._predictions = {}
+        self._allele = None
+
+    def predict(self, peptides, allele):
+        if peptides not in self._predictions:
+            self._predictions[peptides] = pandas.Series(
+                self.predictor.predict(peptides=peptides, allele=allele))
+            self._allele = allele
+        assert allele == self._allele
+        return self._predictions[peptides].sample(frac=1.0).values
+
+
 def model_select(allele):
     global GLOBAL_DATA
-    predictor = GLOBAL_DATA["input_predictor"]
+    unselected_accuracy_scorer = GLOBAL_DATA["unselected_accuracy_scorer"]
     selector = GLOBAL_DATA["allele_to_selector"][allele]
     model_selection_kwargs = GLOBAL_DATA["allele_to_model_selection_kwargs"][allele]
-    return predictor.model_select(
-        score_function=selector.score_function(allele=allele),
-        alleles=[allele],
-        **model_selection_kwargs)
+    predictor = GLOBAL_DATA["input_predictor"]
+    args = GLOBAL_DATA["args"]
+    unselected_accuracy_scorer_samples = GLOBAL_DATA[
+        "args"].unselected_accuracy_scorer_num_samples
+
+    result_dict = {"allele": allele}
+
+    unselected_score = None
+    unselected_score_percentile = None
+    unselected_score_scrambled_mean = None
+    if unselected_accuracy_scorer:
+        unselected_score_function = (
+            unselected_accuracy_scorer.score_function(allele))
+        unselected_score = unselected_score_function(predictor)
+
+        scrambled_predictor = ScrambledPredictor(predictor)
+        scrambled_scores = numpy.array([
+            unselected_score_function(scrambled_predictor)
+            for _ in range(unselected_accuracy_scorer_samples)
+        ])
+        unselected_score_scrambled_mean = scrambled_scores.mean()
+        unselected_score_percentile = percentileofscore(
+            scrambled_scores, unselected_score)
+        print(
+            "Unselected score and percentile",
+            allele,
+            unselected_score,
+            unselected_score_percentile)
+
+    selected = None
+    threshold = args.unselected_accuracy_percentile_threshold
+    if unselected_score_percentile is None or unselected_score_percentile >= threshold:
+        selected = predictor.model_select(
+            score_function=selector.score_function(allele=allele),
+            alleles=[allele],
+            **model_selection_kwargs)
+
+    result_dict["unselected_score_plan"] = (
+        unselected_accuracy_scorer.plan_summary(allele)
+        if unselected_accuracy_scorer else None)
+    result_dict["selector_score_plan"] = selector.plan_summary(allele)
+    result_dict["unselected_accuracy_score_percentile"] = unselected_score_percentile
+    result_dict["unselected_score"] = unselected_score
+    result_dict["unselected_score_scrambled_mean"] = unselected_score_scrambled_mean
+    result_dict["selected"] = selected
+
+    return result_dict


 def cache_encoding(predictor, peptides):
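Taken together, this is a permutation-style gate: the unselected ensemble is scored once on its real predictions, then repeatedly on shuffled copies of those predictions (via ScrambledPredictor), and per-allele model selection only runs when the real score reaches the --unselected-accuracy-percentile-threshold of the scrambled distribution. A condensed sketch of that gate, with a toy scorer standing in for mhcflurry's scoring machinery:

    import numpy
    from scipy.stats import percentileofscore

    # Toy stand-ins: "truth" and a predictor's outputs for one allele (made-up numbers).
    truth = numpy.array([0.9, 0.1, 0.8, 0.2, 0.7, 0.3])
    predictions = numpy.array([0.85, 0.15, 0.7, 0.3, 0.6, 0.4])

    def score(preds):
        # Toy accuracy score: negative mean squared error (higher is better).
        return -numpy.mean((preds - truth) ** 2)

    rng = numpy.random.RandomState(0)
    real_score = score(predictions)
    scrambled_scores = numpy.array([
        score(rng.permutation(predictions))  # shuffle predictions, as ScrambledPredictor does
        for _ in range(1000)
    ])

    percentile = percentileofscore(scrambled_scores, real_score)
    threshold = 95  # mirrors --unselected-accuracy-percentile-threshold
    print(percentile, "-> run model selection" if percentile >= threshold else "-> skip allele")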