Commit 7f1f671d authored by Tim O'Donnell

pylint nits

parent 18d451d2

Showing 89 additions and 77 deletions
"""
Class I MHC ligand prediction package
"""
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_neural_network import Class1NeuralNetwork
from .version import __version__
@@ -138,4 +138,3 @@ class AlleleEncoding(object):
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
@@ -212,7 +212,7 @@ def run(argv=sys.argv[1:]):
worker_pool.join()
print("Percent rank calibration time: %0.2f min." % (
percent_rank_calibration_time / 60.0))
print("Predictor written to: %s" % args.models_dir)
@@ -9,12 +9,13 @@ from os import mkdir, environ
from socket import gethostname
from getpass import getuser
from functools import partial
-from six import string_types
-import mhcnames
import numpy
-import pandas
from numpy.testing import assert_equal
+from six import string_types
+import pandas
+import mhcnames
from .class1_neural_network import Class1NeuralNetwork
from .common import random_peptides, positional_frequency_matrix
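The reshuffled imports above reflect pylint's wrong-import-order convention (C0411): standard-library modules first, third-party packages next, local package imports last. A minimal sketch of the expected grouping (the module names are illustrative, not this file's full import list):

    # Standard library imports come first.
    import logging
    from functools import partial

    # Third-party packages come next.
    import numpy
    import pandas
    from six import string_types

    # Local (first-party) imports come last, e.g.:
    # from .class1_neural_network import Class1NeuralNetwork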
@@ -57,11 +58,11 @@ class Class1AffinityPredictor(object):
Parameters
----------
allele_to_allele_specific_models : dict of string -> list of `Class1NeuralNetwork`
Ensemble of single-allele models to use for each allele.
class1_pan_allele_models : list of `Class1NeuralNetwork`
Ensemble of pan-allele models.
allele_to_sequence : dict of string -> string
MHC allele name to fixed-length amino acid sequence (sometimes
referred to as the pseudosequence). Required only if
@@ -106,7 +107,7 @@ class Class1AffinityPredictor(object):
self._cache = {}
self.optimization_info = {}
-assert isinstance( self.allele_to_allele_specific_models, dict)
+assert isinstance(self.allele_to_allele_specific_models, dict)
assert isinstance(self.class1_pan_allele_models, list)
@property
@@ -365,14 +366,14 @@ class Class1AffinityPredictor(object):
weights_path = self.weights_path(models_dir, row.model_name)
Class1AffinityPredictor.save_weights(
row.model.get_weights(), weights_path)
logging.info("Wrote: %s" % weights_path)
logging.info("Wrote: %s", weights_path)
write_manifest_df = self.manifest_df[[
c for c in self.manifest_df.columns if c != "model"
]]
manifest_path = join(models_dir, "manifest.csv")
write_manifest_df.to_csv(manifest_path, index=False)
logging.info("Wrote: %s" % manifest_path)
logging.info("Wrote: %s", manifest_path)
if write_metadata:
# Write "info.txt"
@@ -399,7 +400,7 @@ class Class1AffinityPredictor(object):
)
allele_to_sequence_df.to_csv(
join(models_dir, "allele_sequences.csv"), index=False)
logging.info("Wrote: %s" % join(models_dir, "allele_sequences.csv"))
logging.info("Wrote: %s", join(models_dir, "allele_sequences.csv"))
if self.allele_to_percent_rank_transform:
percent_ranks_df = None
@@ -414,7 +415,7 @@ class Class1AffinityPredictor(object):
percent_ranks_path,
index=True,
index_label="bin")
logging.info("Wrote: %s" % percent_ranks_path)
logging.info("Wrote: %s", percent_ranks_path)
@staticmethod
def load(models_dir=None, max_models=None):
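The logging changes in the hunks above ("Wrote: %s" % path becoming "Wrote: %s", path) address pylint's logging-not-lazy warning (W1201). Handing the arguments to the logging call defers string interpolation until a handler actually emits the record, so the formatting work is skipped entirely when the log level is disabled. A minimal sketch of the two forms (the path value is illustrative):

    import logging

    path = "/tmp/manifest.csv"  # illustrative value, not from the diff

    # Eager: the message string is built even if INFO is not enabled.
    logging.info("Wrote: %s" % path)

    # Lazy: logging interpolates only when the record is emitted.
    logging.info("Wrote: %s", path)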
@@ -467,7 +468,7 @@ class Class1AffinityPredictor(object):
if exists(join(models_dir, "allele_sequences.csv")):
allele_to_sequence = pandas.read_csv(
join(models_dir, "allele_sequences.csv"),
-index_col=0).iloc[:,0].to_dict()
+index_col=0).iloc[:, 0].to_dict()
allele_to_percent_rank_transform = {}
percent_ranks_path = join(models_dir, "percent_ranks.csv")
@@ -479,15 +480,15 @@ class Class1AffinityPredictor(object):
logging.info(
"Loaded %d class1 pan allele predictors, %d allele sequences, "
"%d percent rank distributions, and %d allele specific models: %s" % (
len(class1_pan_allele_models),
len(allele_to_sequence) if allele_to_sequence else 0,
len(allele_to_percent_rank_transform),
sum(len(v) for v in allele_to_allele_specific_models.values()),
", ".join(
"%s (%d)" % (allele, len(v))
for (allele, v)
in sorted(allele_to_allele_specific_models.items()))))
"%d percent rank distributions, and %d allele specific models: %s",
len(class1_pan_allele_models),
len(allele_to_sequence) if allele_to_sequence else 0,
len(allele_to_percent_rank_transform),
sum(len(v) for v in allele_to_allele_specific_models.values()),
", ".join(
"%s (%d)" % (allele, len(v))
for (allele, v)
in sorted(allele_to_allele_specific_models.items())))
result = Class1AffinityPredictor(
allele_to_allele_specific_models=allele_to_allele_specific_models,
@@ -500,7 +501,7 @@ class Class1AffinityPredictor(object):
logging.info("Optimizing models")
optimized = result.optimize()
logging.info(
"Optimization " + ("succeeded" if optimized else "failed"))
"Optimization %s", ("succeeded" if optimized else "failed"))
return result
def optimize(self):
@@ -527,7 +528,7 @@ class Class1AffinityPredictor(object):
merge_method="concatenate")
]
except NotImplementedError as e:
logging.warning("Optimization failed: %s" % str(e))
logging.warning("Optimization failed: %s", str(e))
return False
self._manifest_df = None
self.clear_cache()
@@ -584,8 +585,8 @@ class Class1AffinityPredictor(object):
AlleleEncoding
"""
if (self._master_allele_encoding is None or
self._master_allele_encoding.allele_to_sequence !=
self.allele_to_sequence):
self._master_allele_encoding = AlleleEncoding(
allele_to_sequence=self.allele_to_sequence)
return self._master_allele_encoding
@@ -793,7 +794,7 @@ class Class1AffinityPredictor(object):
encodable_peptides = EncodableSequences.create(peptides)
models = []
for i in range(n_models):
logging.info("Training model %d / %d" % (i + 1, n_models))
logging.info("Training model %d / %d", i + 1, n_models)
model = Class1NeuralNetwork(**architecture_hyperparameters)
model.fit(
encodable_peptides,
@@ -879,10 +880,8 @@ class Class1AffinityPredictor(object):
msg = "Allele %s has no percentile rank information" % allele
if throw:
raise ValueError(msg)
-else:
-warnings.warn(msg)
-# Return NaNs
-return numpy.ones(len(affinities)) * numpy.nan
+warnings.warn(msg)
+return numpy.ones(len(affinities)) * numpy.nan # Return NaNs
if alleles is None:
raise ValueError("Specify allele or alleles")
@@ -1294,10 +1293,10 @@ class Class1AffinityPredictor(object):
Returns
----------
-None if motif_summary is False
dict of string -> pandas.DataFrame
-Otherwise: dict of string -> pandas.DataFrame where keys are
-"frequency_matrices" and "length_distributions".
+If motif_summary is True, this will have keys "frequency_matrices" and
+"length_distributions". Otherwise it will be empty.
"""
if bins is None:
@@ -1323,7 +1322,7 @@ class Class1AffinityPredictor(object):
else:
frequency_matrices = None
length_distributions = None
-for (i, allele) in enumerate(alleles):
+for allele in alleles:
start = time.time()
predictions = self.predict(
encoded_peptides, allele=allele, model_kwargs=model_kwargs)
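Rewriting for (i, allele) in enumerate(alleles) as for allele in alleles removes a loop index that was never read, which pylint reports as unused-variable (W0612). A minimal sketch (the allele names are illustrative values):

    alleles = ["HLA-A*02:01", "HLA-B*07:02"]  # illustrative values

    # Before: i is bound on every iteration but never used.
    for i, allele in enumerate(alleles):
        print(allele)

    # After: iterate over the list directly.
    for allele in alleles:
        print(allele)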
@@ -1400,6 +1399,7 @@ class Class1AffinityPredictor(object):
'frequency_matrices': frequency_matrices,
'length_distributions': length_distributions,
}
+return {}
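The added return {} matches the updated docstring above: pylint's inconsistent-return-statements check (R1710) flags functions that return a value on some paths but fall off the end (an implicit return None) on others. A minimal sketch of the shape this enforces, with hypothetical names:

    def calibrate(motif_summary=False):
        if motif_summary:
            return {"frequency_matrices": None, "length_distributions": None}
        return {}  # explicit empty result instead of an implicit None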
def model_select(
self,
@@ -1490,4 +1490,3 @@ class Class1AffinityPredictor(object):
"model_selection": df,
})
return new_predictor
@@ -349,4 +349,3 @@ def worker_entry_point(argv=sys.argv[1:]):
if args.complete_dir:
os.mkdir(args.complete_dir)
print("Created: ", args.complete_dir)
@@ -173,4 +173,4 @@ def positional_frequency_matrix(peptides):
counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts()
result = (counts / len(peptides)).fillna(0.0).T
result.index.name = 'position'
-return result
\ No newline at end of file
+return result
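The repeated return result line, with \ No newline at end of file between the two copies, is standard diff notation rather than a real code change: the old file ended without a trailing newline (pylint's missing-final-newline, C0304) and the new file adds one, leaving the text of the line itself untouched. The same pattern appears twice more below.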
@@ -251,4 +251,4 @@ def check_shape(name, arr, expected_shape):
# Register custom losses.
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
-CUSTOM_LOSSES[cls.name] = cls()
\ No newline at end of file
+CUSTOM_LOSSES[cls.name] = cls()
@@ -9,9 +9,9 @@ from __future__ import (
)
import logging
import yaml
-from os.path import join, exists, relpath
-from pipes import quote
+from os.path import join, exists
from os import environ
+from pipes import quote
from collections import OrderedDict
from appdirs import user_data_dir
from pkg_resources import resource_string
@@ -81,8 +81,7 @@ def get_default_class1_models_dir(test_exists=True):
if test_exists and not exists(result):
raise IOError("No such directory: %s" % result)
return result
-else:
-return get_path("models_class1", "models", test_exists=test_exists)
+return get_path("models_class1", "models", test_exists=test_exists)
def get_current_release_downloads():
@@ -160,13 +159,13 @@ def configure():
metadata["releases"][_CURRENT_RELEASE]["compatibility-version"])
current_compatability = metadata["current-compatibility-version"]
if current_release_compatability != current_compatability:
-logging.warn(
+logging.warning(
"The specified downloads are not compatible with this version "
"of the MHCflurry codebase. Downloads: release %s, "
"compatability version: %d. Code compatability version: %d" % (
_CURRENT_RELEASE,
current_release_compatability,
current_compatability))
"compatability version: %d. Code compatability version: %d",
_CURRENT_RELEASE,
current_release_compatability,
current_compatability)
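logging.warn is an undocumented, deprecated alias for logging.warning, which pylint reports via its deprecated-method check (W1505); the rename here, together with moving the %s/%d arguments out of the format expression, brings this call in line with the lazy-logging fixes elsewhere in the commit.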
data_dir = environ.get("MHCFLURRY_DATA_DIR")
if not data_dir:
@@ -176,6 +175,7 @@ def configure():
data_dir = user_data_dir("mhcflurry", version="4")
_DOWNLOADS_DIR = join(data_dir, _CURRENT_RELEASE)
logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s" % _DOWNLOADS_DIR)
logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s", _DOWNLOADS_DIR)
configure()
@@ -168,8 +168,7 @@ def fetch_subcommand(args):
"\nThe requested download '%s' has already been downloaded. "
"To re-download this data, first run: \n\t%s\nin a shell "
"and then re-run this command.\n" +
"*" * 40)
% (name, 'rm -rf ' + quote(get_path(name))))
"*" * 40) % (name, 'rm -rf ' + quote(get_path(name))))
if not info['downloaded'] and (name in args.download_name or default):
items_to_fetch.add(name)
"""
Class for encoding variable-length peptides to fixed-size numerical matrices
"""
from __future__ import (
print_function,
division,
@@ -26,9 +29,12 @@ class EncodingError(ValueError):
class EncodableSequences(object):
"""
Sequences of amino acids.
Class for encoding variable-length peptides to fixed-size numerical matrices
This class caches various encodings of a list of sequences.
+In practice this is used only for peptides. To encode MHC allele sequences,
+see AlleleEncoding.
"""
unknown_character = "X"
@@ -299,8 +305,10 @@ class EncodableSequences(object):
min_length = 5
# Result array is int32, filled with X (null amino acid) value.
-result = numpy.full(fill_value=amino_acid.AMINO_ACID_INDEX['X'],
-shape=(len(sequences), max_length * 2), dtype="int32")
+result = numpy.full(
+fill_value=amino_acid.AMINO_ACID_INDEX['X'],
+shape=(len(sequences), max_length * 2),
+dtype="int32")
df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
@@ -319,9 +327,9 @@ class EncodableSequences(object):
# Array of shape (num peptides, length) giving fixed-length
# amino acid encoding each peptide of the current length.
fixed_length_sequences = numpy.stack(sub_df.peptide.map(
-lambda s: numpy.array(
-[amino_acid.AMINO_ACID_INDEX[char] for char in
-s])).values)
+lambda s: numpy.array([
+amino_acid.AMINO_ACID_INDEX[char] for char in s
+])).values)
# Set left edge
result[sub_df.index, :length] = fixed_length_sequences
@@ -334,8 +342,10 @@ class EncodableSequences(object):
min_length = 5
# Result array is int32, filled with X (null amino acid) value.
-result = numpy.full(fill_value=amino_acid.AMINO_ACID_INDEX['X'],
-shape=(len(sequences), max_length * 3), dtype="int32")
+result = numpy.full(
+fill_value=amino_acid.AMINO_ACID_INDEX['X'],
+shape=(len(sequences), max_length * 3),
+dtype="int32")
df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
@@ -354,9 +364,9 @@ class EncodableSequences(object):
# Array of shape (num peptides, length) giving fixed-length
# amino acid encoding each peptide of the current length.
fixed_length_sequences = numpy.stack(sub_df.peptide.map(
-lambda s: numpy.array(
-[amino_acid.AMINO_ACID_INDEX[char] for char in
-s])).values)
+lambda s: numpy.array([
+amino_acid.AMINO_ACID_INDEX[char] for char in s
+])).values)
# Set left edge
result[sub_df.index, :length] = fixed_length_sequences
@@ -37,4 +37,4 @@ CENTRALITY_MEASURES = {
"mean": partial(numpy.nanmean, axis=1),
"median": partial(numpy.nanmedian, axis=1),
"robust_mean": robust_mean,
-}
\ No newline at end of file
+}
"""
Hyperparameter (neural network options) management
"""
from __future__ import (
print_function,
division,
@@ -70,8 +73,7 @@ class HyperparameterDefaults(object):
if invalid_keys:
raise ValueError(
"No such model parameters: %s. Valid parameters are: %s"
% (" ".join(invalid_keys),
" ".join(self.defaults)))
% (" ".join(invalid_keys), " ".join(self.defaults)))
def models_grid(self, **kwargs):
'''
"""
Class for transforming arbitrary values into percent ranks given a distribution.
"""
import numpy
import pandas
@@ -77,8 +80,3 @@ class PercentRankTransform(object):
result.cdf = series.values
result.bin_edges = series.index.values[1:-1]
return result
@@ -219,7 +219,7 @@ def run(argv=sys.argv[1:]):
})
logging.info(
"Predicting for %d alleles and %d peptides = %d predictions" % (
len(args.alleles), len(args.peptides), len(df)))
predictions = predictor.predict_to_dataframe(
peptides=df[args.peptide_column].values,
"""
Measures of prediction accuracy
"""
from __future__ import (
print_function,
division,
@@ -24,7 +24,7 @@ tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .encodable_sequences import EncodableSequences
from .allele_encoding import AlleleEncoding
-from .common import configure_logging, random_peptides
+from .common import configure_logging
from .local_parallelism import (
worker_pool_with_gpu_assignments_from_args,
add_local_parallelism_args)
@@ -10,7 +10,6 @@ import traceback
import random
from functools import partial
import numpy
-import pandas
import yaml
from sklearn.metrics.pairwise import cosine_similarity
@@ -337,7 +336,7 @@ def alleles_by_similarity(allele):
allele_similarity.columns.to_series().sample(frac=1.0))
return (
allele_similarity[allele] + (
-allele_similarity.index == allele) # force that we return specified allele first
+allele_similarity.index == allele) # force specified allele first
).sort_values(ascending=False).index.tolist()
@@ -270,7 +270,8 @@ def run(argv=sys.argv[1:]):
return main(args)
except Exception as e:
print(e)
-import ipdb ; ipdb.set_trace()
+import ipdb
+ipdb.set_trace()
raise
else:
return main(args)
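Splitting import ipdb ; ipdb.set_trace() across two lines clears pylint's multiple-statements check (C0321), which flags more than one statement on a single line. A minimal sketch with harmless statements in place of the debugger hook:

    x = 1; y = 2  # flagged: two statements share one line (C0321)

    x = 1  # preferred: one statement per line
    y = 2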
@@ -697,4 +698,3 @@ def train_model(
if __name__ == '__main__':
run()