From 7f1f671df4e11bba7938934d749a9ab6b2d90e56 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Thu, 5 Sep 2019 18:15:56 -0400 Subject: [PATCH] pylint nits --- mhcflurry/__init__.py | 4 ++ mhcflurry/allele_encoding.py | 1 - .../calibrate_percentile_ranks_command.py | 2 +- mhcflurry/class1_affinity_predictor.py | 69 +++++++++---------- mhcflurry/cluster_parallelism.py | 1 - mhcflurry/common.py | 2 +- mhcflurry/custom_loss.py | 2 +- mhcflurry/downloads.py | 20 +++--- mhcflurry/downloads_command.py | 3 +- mhcflurry/encodable_sequences.py | 32 ++++++--- mhcflurry/ensemble_centrality.py | 2 +- mhcflurry/hyperparameters.py | 6 +- mhcflurry/percent_rank_transform.py | 8 +-- mhcflurry/predict_command.py | 2 +- mhcflurry/scoring.py | 3 + mhcflurry/select_pan_allele_models_command.py | 2 +- .../train_allele_specific_models_command.py | 3 +- mhcflurry/train_pan_allele_models_command.py | 4 +- 18 files changed, 89 insertions(+), 77 deletions(-) diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index 8538c4d3..5d7ceb0f 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -1,3 +1,7 @@ +""" +Class I MHC ligand prediction package +""" + from .class1_affinity_predictor import Class1AffinityPredictor from .class1_neural_network import Class1NeuralNetwork from .version import __version__ diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py index 2e89bfaf..06355361 100644 --- a/mhcflurry/allele_encoding.py +++ b/mhcflurry/allele_encoding.py @@ -138,4 +138,3 @@ class AlleleEncoding(object): result = vector_encoded[self.indices] self.encoding_cache[cache_key] = result return self.encoding_cache[cache_key] - diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py index 31aeccc1..9ced4161 100644 --- a/mhcflurry/calibrate_percentile_ranks_command.py +++ b/mhcflurry/calibrate_percentile_ranks_command.py @@ -212,7 +212,7 @@ def run(argv=sys.argv[1:]): worker_pool.join() print("Percent rank calibration time: %0.2f min." % ( - percent_rank_calibration_time / 60.0)) + percent_rank_calibration_time / 60.0)) print("Predictor written to: %s" % args.models_dir) diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index eca34889..10ae6365 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -9,12 +9,13 @@ from os import mkdir, environ from socket import gethostname from getpass import getuser from functools import partial +from six import string_types -import mhcnames import numpy -import pandas from numpy.testing import assert_equal -from six import string_types +import pandas + +import mhcnames from .class1_neural_network import Class1NeuralNetwork from .common import random_peptides, positional_frequency_matrix @@ -57,11 +58,11 @@ class Class1AffinityPredictor(object): Parameters ---------- allele_to_allele_specific_models : dict of string -> list of `Class1NeuralNetwork` - Ensemble of single-allele models to use for each allele. - + Ensemble of single-allele models to use for each allele. + class1_pan_allele_models : list of `Class1NeuralNetwork` Ensemble of pan-allele models. - + allele_to_sequence : dict of string -> string MHC allele name to fixed-length amino acid sequence (sometimes referred to as the pseudosequence). Required only if @@ -106,7 +107,7 @@ class Class1AffinityPredictor(object): self._cache = {} self.optimization_info = {} - assert isinstance( self.allele_to_allele_specific_models, dict) + assert isinstance(self.allele_to_allele_specific_models, dict) assert isinstance(self.class1_pan_allele_models, list) @property @@ -365,14 +366,14 @@ class Class1AffinityPredictor(object): weights_path = self.weights_path(models_dir, row.model_name) Class1AffinityPredictor.save_weights( row.model.get_weights(), weights_path) - logging.info("Wrote: %s" % weights_path) + logging.info("Wrote: %s", weights_path) write_manifest_df = self.manifest_df[[ c for c in self.manifest_df.columns if c != "model" ]] manifest_path = join(models_dir, "manifest.csv") write_manifest_df.to_csv(manifest_path, index=False) - logging.info("Wrote: %s" % manifest_path) + logging.info("Wrote: %s", manifest_path) if write_metadata: # Write "info.txt" @@ -399,7 +400,7 @@ class Class1AffinityPredictor(object): ) allele_to_sequence_df.to_csv( join(models_dir, "allele_sequences.csv"), index=False) - logging.info("Wrote: %s" % join(models_dir, "allele_sequences.csv")) + logging.info("Wrote: %s", join(models_dir, "allele_sequences.csv")) if self.allele_to_percent_rank_transform: percent_ranks_df = None @@ -414,7 +415,7 @@ class Class1AffinityPredictor(object): percent_ranks_path, index=True, index_label="bin") - logging.info("Wrote: %s" % percent_ranks_path) + logging.info("Wrote: %s", percent_ranks_path) @staticmethod def load(models_dir=None, max_models=None): @@ -467,7 +468,7 @@ class Class1AffinityPredictor(object): if exists(join(models_dir, "allele_sequences.csv")): allele_to_sequence = pandas.read_csv( join(models_dir, "allele_sequences.csv"), - index_col=0).iloc[:,0].to_dict() + index_col=0).iloc[:, 0].to_dict() allele_to_percent_rank_transform = {} percent_ranks_path = join(models_dir, "percent_ranks.csv") @@ -479,15 +480,15 @@ class Class1AffinityPredictor(object): logging.info( "Loaded %d class1 pan allele predictors, %d allele sequences, " - "%d percent rank distributions, and %d allele specific models: %s" % ( - len(class1_pan_allele_models), - len(allele_to_sequence) if allele_to_sequence else 0, - len(allele_to_percent_rank_transform), - sum(len(v) for v in allele_to_allele_specific_models.values()), - ", ".join( - "%s (%d)" % (allele, len(v)) - for (allele, v) - in sorted(allele_to_allele_specific_models.items())))) + "%d percent rank distributions, and %d allele specific models: %s", + len(class1_pan_allele_models), + len(allele_to_sequence) if allele_to_sequence else 0, + len(allele_to_percent_rank_transform), + sum(len(v) for v in allele_to_allele_specific_models.values()), + ", ".join( + "%s (%d)" % (allele, len(v)) + for (allele, v) + in sorted(allele_to_allele_specific_models.items()))) result = Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, @@ -500,7 +501,7 @@ class Class1AffinityPredictor(object): logging.info("Optimizing models") optimized = result.optimize() logging.info( - "Optimization " + ("succeeded" if optimized else "failed")) + "Optimization %s", ("succeeded" if optimized else "failed")) return result def optimize(self): @@ -527,7 +528,7 @@ class Class1AffinityPredictor(object): merge_method="concatenate") ] except NotImplementedError as e: - logging.warning("Optimization failed: %s" % str(e)) + logging.warning("Optimization failed: %s", str(e)) return False self._manifest_df = None self.clear_cache() @@ -584,8 +585,8 @@ class Class1AffinityPredictor(object): AlleleEncoding """ if (self._master_allele_encoding is None or - self._master_allele_encoding.allele_to_sequence != - self.allele_to_sequence): + self._master_allele_encoding.allele_to_sequence != + self.allele_to_sequence): self._master_allele_encoding = AlleleEncoding( allele_to_sequence=self.allele_to_sequence) return self._master_allele_encoding @@ -793,7 +794,7 @@ class Class1AffinityPredictor(object): encodable_peptides = EncodableSequences.create(peptides) models = [] for i in range(n_models): - logging.info("Training model %d / %d" % (i + 1, n_models)) + logging.info("Training model %d / %d", i + 1, n_models) model = Class1NeuralNetwork(**architecture_hyperparameters) model.fit( encodable_peptides, @@ -879,10 +880,8 @@ class Class1AffinityPredictor(object): msg = "Allele %s has no percentile rank information" % allele if throw: raise ValueError(msg) - else: - warnings.warn(msg) - # Return NaNs - return numpy.ones(len(affinities)) * numpy.nan + warnings.warn(msg) + return numpy.ones(len(affinities)) * numpy.nan # Return NaNs if alleles is None: raise ValueError("Specify allele or alleles") @@ -1294,10 +1293,10 @@ class Class1AffinityPredictor(object): Returns ---------- - None if motif_summary is False + dict of string -> pandas.DataFrame - Otherwise: dict of string -> pandas.DataFrame where keys are - "frequency_matrices" and "length_distributions". + If motif_summary is True, this will have keys "frequency_matrices" and + "length_distributions". Otherwise it will be empty. """ if bins is None: @@ -1323,7 +1322,7 @@ class Class1AffinityPredictor(object): else: frequency_matrices = None length_distributions = None - for (i, allele) in enumerate(alleles): + for allele in alleles: start = time.time() predictions = self.predict( encoded_peptides, allele=allele, model_kwargs=model_kwargs) @@ -1400,6 +1399,7 @@ class Class1AffinityPredictor(object): 'frequency_matrices': frequency_matrices, 'length_distributions': length_distributions, } + return {} def model_select( self, @@ -1490,4 +1490,3 @@ class Class1AffinityPredictor(object): "model_selection": df, }) return new_predictor - diff --git a/mhcflurry/cluster_parallelism.py b/mhcflurry/cluster_parallelism.py index 976f53a2..9ec07de9 100644 --- a/mhcflurry/cluster_parallelism.py +++ b/mhcflurry/cluster_parallelism.py @@ -349,4 +349,3 @@ def worker_entry_point(argv=sys.argv[1:]): if args.complete_dir: os.mkdir(args.complete_dir) print("Created: ", args.complete_dir) - diff --git a/mhcflurry/common.py b/mhcflurry/common.py index 7c8e1628..8885637b 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -173,4 +173,4 @@ def positional_frequency_matrix(peptides): counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts() result = (counts / len(peptides)).fillna(0.0).T result.index.name = 'position' - return result \ No newline at end of file + return result diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py index 47d51cb4..8c523c0a 100644 --- a/mhcflurry/custom_loss.py +++ b/mhcflurry/custom_loss.py @@ -251,4 +251,4 @@ def check_shape(name, arr, expected_shape): # Register custom losses. for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]: - CUSTOM_LOSSES[cls.name] = cls() \ No newline at end of file + CUSTOM_LOSSES[cls.name] = cls() diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py index ec776f9e..7f29ea52 100644 --- a/mhcflurry/downloads.py +++ b/mhcflurry/downloads.py @@ -9,9 +9,9 @@ from __future__ import ( ) import logging import yaml -from os.path import join, exists, relpath -from pipes import quote +from os.path import join, exists from os import environ +from pipes import quote from collections import OrderedDict from appdirs import user_data_dir from pkg_resources import resource_string @@ -81,8 +81,7 @@ def get_default_class1_models_dir(test_exists=True): if test_exists and not exists(result): raise IOError("No such directory: %s" % result) return result - else: - return get_path("models_class1", "models", test_exists=test_exists) + return get_path("models_class1", "models", test_exists=test_exists) def get_current_release_downloads(): @@ -160,13 +159,13 @@ def configure(): metadata["releases"][_CURRENT_RELEASE]["compatibility-version"]) current_compatability = metadata["current-compatibility-version"] if current_release_compatability != current_compatability: - logging.warn( + logging.warning( "The specified downloads are not compatible with this version " "of the MHCflurry codebase. Downloads: release %s, " - "compatability version: %d. Code compatability version: %d" % ( - _CURRENT_RELEASE, - current_release_compatability, - current_compatability)) + "compatability version: %d. Code compatability version: %d", + _CURRENT_RELEASE, + current_release_compatability, + current_compatability) data_dir = environ.get("MHCFLURRY_DATA_DIR") if not data_dir: @@ -176,6 +175,7 @@ def configure(): data_dir = user_data_dir("mhcflurry", version="4") _DOWNLOADS_DIR = join(data_dir, _CURRENT_RELEASE) - logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s" % _DOWNLOADS_DIR) + logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s", _DOWNLOADS_DIR) + configure() diff --git a/mhcflurry/downloads_command.py b/mhcflurry/downloads_command.py index 8bf1d213..249269ae 100644 --- a/mhcflurry/downloads_command.py +++ b/mhcflurry/downloads_command.py @@ -168,8 +168,7 @@ def fetch_subcommand(args): "\nThe requested download '%s' has already been downloaded. " "To re-download this data, first run: \n\t%s\nin a shell " "and then re-run this command.\n" + - "*" * 40) - % (name, 'rm -rf ' + quote(get_path(name)))) + "*" * 40) % (name, 'rm -rf ' + quote(get_path(name)))) if not info['downloaded'] and (name in args.download_name or default): items_to_fetch.add(name) diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index f6322835..19696e23 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -1,3 +1,6 @@ +""" +Class for encoding variable-length peptides to fixed-size numerical matrices +""" from __future__ import ( print_function, division, @@ -26,9 +29,12 @@ class EncodingError(ValueError): class EncodableSequences(object): """ - Sequences of amino acids. + Class for encoding variable-length peptides to fixed-size numerical matrices This class caches various encodings of a list of sequences. + + In practice this is used only for peptides. To encode MHC allele sequences, + see AlleleEncoding. """ unknown_character = "X" @@ -299,8 +305,10 @@ class EncodableSequences(object): min_length = 5 # Result array is int32, filled with X (null amino acid) value. - result = numpy.full(fill_value=amino_acid.AMINO_ACID_INDEX['X'], - shape=(len(sequences), max_length * 2), dtype="int32") + result = numpy.full( + fill_value=amino_acid.AMINO_ACID_INDEX['X'], + shape=(len(sequences), max_length * 2), + dtype="int32") df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_) @@ -319,9 +327,9 @@ class EncodableSequences(object): # Array of shape (num peptides, length) giving fixed-length # amino acid encoding each peptide of the current length. fixed_length_sequences = numpy.stack(sub_df.peptide.map( - lambda s: numpy.array( - [amino_acid.AMINO_ACID_INDEX[char] for char in - s])).values) + lambda s: numpy.array([ + amino_acid.AMINO_ACID_INDEX[char] for char in s + ])).values) # Set left edge result[sub_df.index, :length] = fixed_length_sequences @@ -334,8 +342,10 @@ class EncodableSequences(object): min_length = 5 # Result array is int32, filled with X (null amino acid) value. - result = numpy.full(fill_value=amino_acid.AMINO_ACID_INDEX['X'], - shape=(len(sequences), max_length * 3), dtype="int32") + result = numpy.full( + fill_value=amino_acid.AMINO_ACID_INDEX['X'], + shape=(len(sequences), max_length * 3), + dtype="int32") df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_) @@ -354,9 +364,9 @@ class EncodableSequences(object): # Array of shape (num peptides, length) giving fixed-length # amino acid encoding each peptide of the current length. fixed_length_sequences = numpy.stack(sub_df.peptide.map( - lambda s: numpy.array( - [amino_acid.AMINO_ACID_INDEX[char] for char in - s])).values) + lambda s: numpy.array([ + amino_acid.AMINO_ACID_INDEX[char] for char in s + ])).values) # Set left edge result[sub_df.index, :length] = fixed_length_sequences diff --git a/mhcflurry/ensemble_centrality.py b/mhcflurry/ensemble_centrality.py index e370a39d..07251bf0 100644 --- a/mhcflurry/ensemble_centrality.py +++ b/mhcflurry/ensemble_centrality.py @@ -37,4 +37,4 @@ CENTRALITY_MEASURES = { "mean": partial(numpy.nanmean, axis=1), "median": partial(numpy.nanmedian, axis=1), "robust_mean": robust_mean, -} \ No newline at end of file +} diff --git a/mhcflurry/hyperparameters.py b/mhcflurry/hyperparameters.py index cc5950d5..1241fa46 100644 --- a/mhcflurry/hyperparameters.py +++ b/mhcflurry/hyperparameters.py @@ -1,3 +1,6 @@ +""" +Hyperparameter (neural network options) management +""" from __future__ import ( print_function, division, @@ -70,8 +73,7 @@ class HyperparameterDefaults(object): if invalid_keys: raise ValueError( "No such model parameters: %s. Valid parameters are: %s" - % (" ".join(invalid_keys), - " ".join(self.defaults))) + % (" ".join(invalid_keys), " ".join(self.defaults))) def models_grid(self, **kwargs): ''' diff --git a/mhcflurry/percent_rank_transform.py b/mhcflurry/percent_rank_transform.py index a9098bc2..a4597686 100644 --- a/mhcflurry/percent_rank_transform.py +++ b/mhcflurry/percent_rank_transform.py @@ -1,3 +1,6 @@ +""" +Class for transforming arbitrary values into percent ranks given a distribution. +""" import numpy import pandas @@ -77,8 +80,3 @@ class PercentRankTransform(object): result.cdf = series.values result.bin_edges = series.index.values[1:-1] return result - - - - - diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py index dea1ddf4..17a687f5 100644 --- a/mhcflurry/predict_command.py +++ b/mhcflurry/predict_command.py @@ -219,7 +219,7 @@ def run(argv=sys.argv[1:]): }) logging.info( "Predicting for %d alleles and %d peptides = %d predictions" % ( - len(args.alleles), len(args.peptides), len(df))) + len(args.alleles), len(args.peptides), len(df))) predictions = predictor.predict_to_dataframe( peptides=df[args.peptide_column].values, diff --git a/mhcflurry/scoring.py b/mhcflurry/scoring.py index f6a256ba..d0d41d4e 100644 --- a/mhcflurry/scoring.py +++ b/mhcflurry/scoring.py @@ -1,3 +1,6 @@ +""" +Measures of prediction accuracy +""" from __future__ import ( print_function, division, diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py index 0032f8fc..6e6f45fa 100644 --- a/mhcflurry/select_pan_allele_models_command.py +++ b/mhcflurry/select_pan_allele_models_command.py @@ -24,7 +24,7 @@ tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 from .class1_affinity_predictor import Class1AffinityPredictor from .encodable_sequences import EncodableSequences from .allele_encoding import AlleleEncoding -from .common import configure_logging, random_peptides +from .common import configure_logging from .local_parallelism import ( worker_pool_with_gpu_assignments_from_args, add_local_parallelism_args) diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index fe295b67..ec1b8c8a 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -10,7 +10,6 @@ import traceback import random from functools import partial -import numpy import pandas import yaml from sklearn.metrics.pairwise import cosine_similarity @@ -337,7 +336,7 @@ def alleles_by_similarity(allele): allele_similarity.columns.to_series().sample(frac=1.0)) return ( allele_similarity[allele] + ( - allele_similarity.index == allele) # force that we return specified allele first + allele_similarity.index == allele) # force specified allele first ).sort_values(ascending=False).index.tolist() diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index d925f991..a676bb38 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -270,7 +270,8 @@ def run(argv=sys.argv[1:]): return main(args) except Exception as e: print(e) - import ipdb ; ipdb.set_trace() + import ipdb + ipdb.set_trace() raise else: return main(args) @@ -697,4 +698,3 @@ def train_model( if __name__ == '__main__': run() - -- GitLab