Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
4e838dde
Commit
4e838dde
authored
5 years ago
by
Tim O'Donnell
Browse files
Options
Downloads
Patches
Plain Diff
work
parent
3211efca
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
mhcflurry/class1_ligandome_predictor.py
+281
-7
281 additions, 7 deletions
mhcflurry/class1_ligandome_predictor.py
with
281 additions
and
7 deletions
mhcflurry/class1_ligandome_predictor.py
+
281
−
7
View file @
4e838dde
from
__future__
import
print_function
from
os.path
import
join
,
exists
,
abspath
from
os
import
mkdir
,
environ
from
socket
import
gethostname
from
getpass
import
getuser
import
time
import
collections
import
json
import
hashlib
import
logging
from
six
import
string_types
import
numpy
import
pandas
import
mhcnames
import
hashlib
from
.hyperparameters
import
HyperparameterDefaults
from
.version
import
__version__
from
.class1_neural_network
import
Class1NeuralNetwork
,
DEFAULT_PREDICT_BATCH_SIZE
from
.encodable_sequences
import
EncodableSequences
from
.regression_target
import
from_ic50
,
to_ic50
...
...
@@ -24,9 +34,39 @@ from .custom_loss import (
class
Class1LigandomePredictor
(
object
):
def
__init__
(
self
,
class1_ligandome_neural_networks
,
allele_to_sequence
):
def
__init__
(
self
,
class1_ligandome_neural_networks
,
allele_to_sequence
,
manifest_df
=
None
,
metadata_dataframes
=
None
):
self
.
networks
=
class1_ligandome_neural_networks
self
.
allele_to_sequence
=
allele_to_sequence
self
.
_manifest_df
=
manifest_df
self
.
metadata_dataframes
=
(
dict
(
metadata_dataframes
)
if
metadata_dataframes
else
{})
@property
def
manifest_df
(
self
):
"""
A pandas.DataFrame describing the models included in this predictor.
Returns
-------
pandas.DataFrame
"""
if
self
.
_manifest_df
is
None
:
rows
=
[]
for
(
i
,
model
)
in
enumerate
(
self
.
networks
):
rows
.
append
((
self
.
model_name
(
i
),
json
.
dumps
(
model
.
get_config
()),
model
))
self
.
_manifest_df
=
pandas
.
DataFrame
(
rows
,
columns
=
[
"
model_name
"
,
"
config_json
"
,
"
model
"
])
return
self
.
_manifest_df
@property
def
max_alleles
(
self
):
...
...
@@ -36,6 +76,38 @@ class Class1LigandomePredictor(object):
for
n
in
self
.
networks
)
return
max_alleles
@staticmethod
def
model_name
(
num
):
"""
Generate a model name
Returns
-------
string
"""
random_string
=
hashlib
.
sha1
(
str
(
time
.
time
()).
encode
()).
hexdigest
()[:
16
]
return
"
LIGANDOME-CLASSI-%d-%s
"
%
(
num
,
random_string
)
@staticmethod
def
weights_path
(
models_dir
,
model_name
):
"""
Generate the path to the weights file for a model
Parameters
----------
models_dir : string
model_name : string
Returns
-------
string
"""
return
join
(
models_dir
,
"
weights_%s.npz
"
%
model_name
)
def
predict
(
self
,
peptides
,
alleles
,
batch_size
=
DEFAULT_PREDICT_BATCH_SIZE
):
return
self
.
predict_to_dataframe
(
peptides
=
peptides
,
...
...
@@ -71,7 +143,9 @@ class Class1LigandomePredictor(object):
alleles
=
MultipleAlleleEncoding
(
experiment_names
=
numpy
.
tile
(
"
experiment
"
,
len
(
peptides
)),
experiment_to_allele_list
=
{
"
experiment
"
:
alleles
,
"
experiment
"
:
[
mhcnames
.
normalize_allele_name
(
a
)
for
a
in
alleles
],
},
allele_to_sequence
=
self
.
allele_to_sequence
,
max_alleles_per_experiment
=
self
.
max_alleles
)
...
...
@@ -111,11 +185,211 @@ class Class1LigandomePredictor(object):
score_array
[:,
:,
i
],
95.0
,
axis
=
0
)
result_df
[
"
allele%d affinity
"
%
(
i
+
1
)]
=
to_ic50
(
ensemble_affinity
[:,
i
])
result_df
[
"
allele%d affinity low
"
%
(
i
+
1
)]
=
numpy
.
percentile
(
affinity_array
[:,
:,
i
],
5.0
,
axis
=
0
)
result_df
[
"
allele%d affinity high
"
%
(
i
+
1
)]
=
numpy
.
percentile
(
affinity_array
[:,
:,
i
],
9
5.0
,
axis
=
0
)
result_df
[
"
allele%d affinity low
"
%
(
i
+
1
)]
=
to_ic50
(
numpy
.
percentile
(
affinity_array
[:,
:,
i
],
9
5.0
,
axis
=
0
)
)
result_df
[
"
allele%d affinity high
"
%
(
i
+
1
)]
=
to_ic50
(
numpy
.
percentile
(
affinity_array
[:,
:,
i
],
5.0
,
axis
=
0
)
)
return
result_df
@staticmethod
def
save_weights
(
weights_list
,
filename
):
"""
Save the model weights to the given filename using numpy
'
s
"
.npz
"
format.
Parameters
----------
weights_list : list of array
filename : string
Should end in
"
.npz
"
.
"""
numpy
.
savez
(
filename
,
**
dict
(((
"
array_%d
"
%
i
),
w
)
for
(
i
,
w
)
in
enumerate
(
weights_list
)))
def
check_consistency
(
self
):
"""
Verify that self.manifest_df is consistent with instance variables.
Currently only checks for agreement on the total number of models.
Throws AssertionError if inconsistent.
"""
assert
len
(
self
.
manifest_df
)
==
len
(
self
.
networks
),
(
"
Manifest seems out of sync with models: %d vs %d entries:
\n
%s
"
%
(
len
(
self
.
manifest_df
),
len
(
self
.
networks
),
str
(
self
.
manifest_df
)))
    def save(self, models_dir, model_names_to_write=None, write_metadata=True):
        """
        Serialize the predictor to a directory on disk. If the directory does
        not exist it will be created.

        The serialization format consists of a file called "manifest.csv" with
        the configurations of each Class1LigandomeNeuralNetwork, along with
        per-network files giving the model weights.

        Parameters
        ----------
        models_dir : string
            Path to directory. It will be created if it doesn't exist.
        model_names_to_write : list of string, optional
            Only write weights files for these model names. By default all
            models' weights are written. The manifest is always written in
            full.
        write_metadata : boolean
            Whether to also write "info.txt" and the metadata dataframes.
        """
        self.check_consistency()
        if model_names_to_write is None:
            # Write all models
            model_names_to_write = self.manifest_df.model_name.values
        if not exists(models_dir):
            mkdir(models_dir)
        # Restrict to the selected models; .copy() so column assignment
        # below doesn't warn about writing into a slice.
        sub_manifest_df = self.manifest_df.loc[
            self.manifest_df.model_name.isin(model_names_to_write)
        ].copy()
        # Network JSON configs may have changed since the models were added,
        # for example due to changes to the allele representation layer.
        # So we update the JSON configs here also.
        updated_network_config_jsons = []
        for (_, row) in sub_manifest_df.iterrows():
            updated_network_config_jsons.append(
                json.dumps(row.model.get_config()))
            weights_path = self.weights_path(models_dir, row.model_name)
            self.save_weights(row.model.get_weights(), weights_path)
            logging.info("Wrote: %s", weights_path)
        sub_manifest_df["config_json"] = updated_network_config_jsons
        # Propagate the refreshed configs back into the full manifest too.
        self.manifest_df.loc[
            sub_manifest_df.index, "config_json"
        ] = updated_network_config_jsons
        # The in-memory manifest carries live model objects; drop that
        # column before writing to CSV.
        write_manifest_df = self.manifest_df[[
            c for c in self.manifest_df.columns if c != "model"
        ]]
        manifest_path = join(models_dir, "manifest.csv")
        write_manifest_df.to_csv(manifest_path, index=False)
        logging.info("Wrote: %s", manifest_path)
        if write_metadata:
            # Write "info.txt"
            info_path = join(models_dir, "info.txt")
            rows = [
                ("trained on", time.asctime()),
                ("package", "mhcflurry %s" % __version__),
                ("hostname", gethostname()),
                ("user", getuser()),
            ]
            pandas.DataFrame(rows).to_csv(
                info_path, sep="\t", header=False, index=False)
            if self.metadata_dataframes:
                # One bz2-compressed CSV per metadata dataframe.
                for (name, df) in self.metadata_dataframes.items():
                    metadata_df_path = join(models_dir, "%s.csv.bz2" % name)
                    df.to_csv(metadata_df_path, index=False, compression="bz2")
        # Save allele sequences
        if self.allele_to_sequence is not None:
            allele_to_sequence_df = pandas.DataFrame(
                list(self.allele_to_sequence.items()),
                columns=['allele', 'sequence']
            )
            allele_to_sequence_df.to_csv(
                join(models_dir, "allele_sequences.csv"), index=False)
            logging.info(
                "Wrote: %s", join(models_dir, "allele_sequences.csv"))
@staticmethod
def
load
(
models_dir
=
None
,
max_models
=
None
):
"""
Deserialize a predictor from a directory on disk.
Parameters
----------
models_dir : string
Path to directory. If unspecified the default downloaded models are
used.
max_models : int, optional
Maximum number of models to load
Returns
-------
`Class1AffinityPredictor` instance
"""
if
models_dir
is
None
:
models_dir
=
get_default_class1_models_dir
()
if
optimization_level
is
None
:
optimization_level
=
OPTIMIZATION_LEVEL
manifest_path
=
join
(
models_dir
,
"
manifest.csv
"
)
manifest_df
=
pandas
.
read_csv
(
manifest_path
,
nrows
=
max_models
)
allele_to_allele_specific_models
=
collections
.
defaultdict
(
list
)
class1_pan_allele_models
=
[]
all_models
=
[]
for
(
_
,
row
)
in
manifest_df
.
iterrows
():
weights_filename
=
Class1AffinityPredictor
.
weights_path
(
models_dir
,
row
.
model_name
)
config
=
json
.
loads
(
row
.
config_json
)
# We will lazy-load weights when the network is used.
model
=
Class1NeuralNetwork
.
from_config
(
config
,
weights_loader
=
partial
(
Class1AffinityPredictor
.
load_weights
,
abspath
(
weights_filename
)))
if
row
.
allele
==
"
pan-class1
"
:
class1_pan_allele_models
.
append
(
model
)
else
:
allele_to_allele_specific_models
[
row
.
allele
].
append
(
model
)
all_models
.
append
(
model
)
manifest_df
[
"
model
"
]
=
all_models
# Load allele sequences
allele_to_sequence
=
None
if
exists
(
join
(
models_dir
,
"
allele_sequences.csv
"
)):
allele_to_sequence
=
pandas
.
read_csv
(
join
(
models_dir
,
"
allele_sequences.csv
"
),
index_col
=
0
).
iloc
[:,
0
].
to_dict
()
allele_to_percent_rank_transform
=
{}
percent_ranks_path
=
join
(
models_dir
,
"
percent_ranks.csv
"
)
if
exists
(
percent_ranks_path
):
percent_ranks_df
=
pandas
.
read_csv
(
percent_ranks_path
,
index_col
=
0
)
for
allele
in
percent_ranks_df
.
columns
:
allele_to_percent_rank_transform
[
allele
]
=
(
PercentRankTransform
.
from_series
(
percent_ranks_df
[
allele
]))
logging
.
info
(
"
Loaded %d class1 pan allele predictors, %d allele sequences,
"
"
%d percent rank distributions, and %d allele specific models: %s
"
,
len
(
class1_pan_allele_models
),
len
(
allele_to_sequence
)
if
allele_to_sequence
else
0
,
len
(
allele_to_percent_rank_transform
),
sum
(
len
(
v
)
for
v
in
allele_to_allele_specific_models
.
values
()),
"
,
"
.
join
(
"
%s (%d)
"
%
(
allele
,
len
(
v
))
for
(
allele
,
v
)
in
sorted
(
allele_to_allele_specific_models
.
items
())))
result
=
Class1AffinityPredictor
(
allele_to_allele_specific_models
=
allele_to_allele_specific_models
,
class1_pan_allele_models
=
class1_pan_allele_models
,
allele_to_sequence
=
allele_to_sequence
,
manifest_df
=
manifest_df
,
allele_to_percent_rank_transform
=
allele_to_percent_rank_transform
,
)
if
optimization_level
>=
1
:
optimized
=
result
.
optimize
()
logging
.
info
(
"
Model optimization %s
"
,
"
succeeded
"
if
optimized
else
"
not supported for these models
"
)
return
result
# TODO: implement saving and loading
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment