Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
4b337b22
Commit
4b337b22
authored
8 years ago
by
Alex Rubinsteyn
Browse files
Options
Downloads
Patches
Plain Diff
added cross_validation iterator to Dataset
parent
37e0372b
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
mhcflurry/dataset.py
+64
-33
64 additions, 33 deletions
mhcflurry/dataset.py
test/test_dataset.py
+19
-0
19 additions, 0 deletions
test/test_dataset.py
with
83 additions
and
33 deletions
mhcflurry/dataset.py
+
64
−
33
View file @
4b337b22
...
...
@@ -25,6 +25,7 @@ from six import string_types
import
pandas
as
pd
import
numpy
as
np
from
typechecks
import
require_iterable_of
from
sklearn.cross_validation
import
KFold
from
.common
import
geometric_mean
from
.dataset_helpers
import
(
...
...
@@ -603,6 +604,69 @@ class Dataset(object):
right
=
self
.
slice
(
all_indices
[
n
:])
return
left
,
right
def cross_validation_iterator(
        self,
        test_allele=None,
        n_folds=3,
        shuffle=True):
    """
    Generate (training, test) Dataset pairs for k-fold cross-validation.

    When `test_allele` is None every entry of this Dataset may be held
    out. Otherwise only entries of `test_allele` are ever held out, so the
    entries of every other allele appear in each training Dataset.

    Parameters
    ----------
    test_allele : str, optional
        Allele whose entries are divided into folds. If None, all entries
        of the Dataset are divided into folds.

    n_folds : int
        Forwarded to KFold as `n_folds`.

    shuffle : bool
        Forwarded to KFold as `shuffle`.

    Yields one (train_data, test_data) pair per fold, each produced by
    `self.slice`.

    Raises ValueError if `test_allele` is given but is not among
    `self.unique_alleles()`. Because this is a generator, the error is
    raised when iteration starts rather than when the method is called.
    """
    if test_allele is None:
        # No allele restriction: any position may end up in a test fold.
        eligible_positions = np.arange(len(self))
    else:
        if test_allele not in self.unique_alleles():
            raise ValueError("Allele '%s' not in Dataset" % test_allele)
        # Positions (into the full Dataset) of the requested allele's entries.
        eligible_positions = np.where(self.alleles == test_allele)[0]

    n_eligible = len(eligible_positions)
    dataset_size = len(self)

    # NOTE(review): KFold comes from the legacy `sklearn.cross_validation`
    # module (constructor takes `n` / `n_folds` and is directly iterable);
    # newer scikit-learn releases only provide `sklearn.model_selection`.
    fold_splitter = KFold(n=n_eligible, n_folds=n_folds, shuffle=shuffle)

    # KFold's own training indices are ignored: they only range over the
    # eligible entries, whereas training must cover the whole Dataset
    # minus the held-out entries.
    for _unused_train, fold_positions in fold_splitter:
        # Translate fold-local positions back to positions in the Dataset.
        held_out = eligible_positions[fold_positions]
        keep_mask = np.ones(dataset_size, dtype=bool)
        keep_mask[held_out] = False
        yield self.slice(keep_mask), self.slice(held_out)
def split_allele_randomly_and_impute_training_set(
        self,
        allele,
        n_training_samples=None,
        **kwargs):
    """
    Randomly split one allele's data into training and test sets, then
    build an imputed training set for that allele from the rest of this
    Dataset (with the test entries removed before imputation).

    Parameters
    ----------
    allele : str
        Name of allele

    n_training_samples : int, optional
        Size of the training set to return for this allele; passed to
        `random_split` as `n`.

    **kwargs : dict
        Extra keyword arguments passed to Dataset.impute_missing_values

    Returns three Dataset objects:
        - training set with original pMHC affinities for given allele
        - larger imputed training set for given allele
        - test set
    """
    observed_train, held_out_test = self.get_allele(allele).random_split(
        n=n_training_samples)

    # Remove the held-out entries from the full Dataset *before* imputing,
    # then keep only the requested allele from the imputed result.
    imputed_train = (
        self.difference(held_out_test)
        .impute_missing_values(**kwargs)
        .get_allele(allele))

    return observed_train, imputed_train, held_out_test
def
drop_allele_peptide_lists
(
self
,
alleles
,
peptides
):
"""
Drop all allele-peptide pairs in the given lists.
...
...
@@ -657,39 +721,6 @@ class Dataset(object):
alleles
=
other_dataset
.
alleles
,
peptides
=
other_dataset
.
peptides
)
def split_allele_randomly_and_impute_training_set(
        self,
        allele,
        n_training_samples=None,
        **kwargs):
    """
    Randomly split one allele's data into training and test sets, then
    build an imputed training set for that allele from the rest of this
    Dataset (with the test entries removed before imputation).

    Parameters
    ----------
    allele : str
        Name of allele

    n_training_samples : int, optional
        Size of the training set to return for this allele; passed to
        `random_split` as `n`.

    **kwargs : dict
        Extra keyword arguments passed to Dataset.impute_missing_values

    Returns three Dataset objects:
        - training set with original pMHC affinities for given allele
        - larger imputed training set for given allele
        - test set
    """
    observed_train, held_out_test = self.get_allele(allele).random_split(
        n=n_training_samples)

    # Remove the held-out entries from the full Dataset *before* imputing,
    # then keep only the requested allele from the imputed result.
    imputed_train = (
        self.difference(held_out_test)
        .impute_missing_values(**kwargs)
        .get_allele(allele))

    return observed_train, imputed_train, held_out_test
def
impute_missing_values
(
self
,
imputation_method
,
...
...
This diff is collapsed.
Click to expand it.
test/test_dataset.py
+
19
−
0
View file @
4b337b22
...
...
@@ -48,6 +48,25 @@ def test_dataset_difference():
}})
eq_
(
dataset_diff
,
expected_result
)
def test_dataset_cross_validation():
    """
    Cross-validating over a single allele with two entries and two folds
    should hold out exactly one entry of that allele per fold, while both
    alleles remain present in every training set.
    """
    mouse_affinities = {
        "SIINFEKL": 10.0,
        "FEKLSIIN": 20000.0,
        "SIFEKLIN": 50000.0,
    }
    human_affinities = {"ASASAS": 1.0, "CCC": 0.0}
    dataset = Dataset.from_nested_dictionary({
        "H-2-Kb": mouse_affinities,
        "HLA-A*02:01": human_affinities,
    })

    splits = dataset.cross_validation_iterator(
        test_allele="HLA-A*02:01",
        n_folds=2)

    n_folds_seen = 0
    for train_dataset, test_dataset in splits:
        # The untouched allele plus the remaining entry of the test allele.
        assert train_dataset.unique_alleles() == {"H-2-Kb", "HLA-A*02:01"}
        # Only the requested allele may ever be held out.
        assert test_dataset.unique_alleles() == {"HLA-A*02:01"}
        assert len(test_dataset) == 1
        n_folds_seen += 1
    assert n_folds_seen == 2
if
__name__
==
"
__main__
"
:
test_create_allele_data_from_single_allele_dict
()
test_dataset_random_split
()
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment