From 44d3bbaf74d85936d7f015559de15e1b98d186cc Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Mon, 16 Mar 2020 12:16:11 -0400 Subject: [PATCH] Working on docs --- mhcflurry/class1_presentation_predictor.py | 127 ++++++++++++------ mhcflurry/predict_command.py | 6 +- .../train_presentation_models_command.py | 2 +- test/test_class1_presentation_predictor.py | 8 +- test/test_doctest.py | 20 +++ test/test_predict_scan_command.py | 3 +- 6 files changed, 113 insertions(+), 53 deletions(-) create mode 100644 test/test_doctest.py diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py index 812ffae6..a1f67ee8 100644 --- a/mhcflurry/class1_presentation_predictor.py +++ b/mhcflurry/class1_presentation_predictor.py @@ -45,7 +45,8 @@ class Class1PresentationPredictor(object): These predictions are combined using a logistic regression model to give a "presentation score" prediction. - See load() and predict() methods for basic usage. + Most users will call the `load` static method to get an instance of this + class, then call the `predict_to_dataframe` method to generate predictions. """ model_inputs = ["affinity_score", "processing_score"] @@ -83,23 +84,62 @@ class Class1PresentationPredictor(object): self, peptides, alleles, - experiment_names=None, + sample_names=None, include_affinity_percentile=False, verbose=1, throw=True): """ Predict binding affinities. + Two modes are supported: each peptide can be evaluated for binding to + any of the alleles in any sample (this is what happens when sample_names + is None), or the i'th peptide can be evaluated for binding the alleles + of the sample given by the i'th entry in sample_names. + + For example, if we don't specify sample_names, then predictions + are taken for all combinations of samples and peptides: + + >>> predictor = Class1PresentationPredictor.load() + >>> predictor.predict_affinity( + ... peptides=["SIINFEKL", "PEPTIDE"], + ... alleles={ + ... "sample1": ["A0201", "A0301", "B0702"], + ... "sample2": ["A0101", "C0202"], + ... }, + ... verbose=0) + peptide peptide_num sample_name affinity best_allele + 0 SIINFEKL 0 sample1 12906.787792 A0201 + 1 PEPTIDE 1 sample1 36827.681130 B0702 + 2 SIINFEKL 0 sample2 3588.413748 C0202 + 3 PEPTIDE 1 sample2 34362.109211 C0202 + + In contrast, here we specify sample_names, so peptide is evaluated for + binding the alleles in the corresponding sample: + + >>> predictor.predict_affinity( + ... peptides=["SIINFEKL", "PEPTIDE"], + ... alleles={ + ... "sample1": ["A0201", "A0301", "B0702"], + ... "sample2": ["A0101", "C0202"], + ... }, + ... sample_names=["sample2", "sample1"], + ... verbose=0) + peptide peptide_num sample_name affinity best_allele + 0 SIINFEKL 0 sample2 3588.412141 C0202 + 1 PEPTIDE 1 sample1 36827.682779 B0702 + + Parameters ---------- peptides : list of string alleles : dict of string -> list of string - Keys are experiment names, values are the alleles (genotype) for + Keys are sample names, values are the alleles (genotype) for that sample - experiment_names : list of string [same length as peptides] + sample_names : list of string [same length as peptides] Sample names corresponding to each peptide. These are used to lookup the alleles for each peptide in the alleles dict. If not - specified, then all combinations of experiment names + specified, then predictions are generated for all sample genotypes + across all peptides. include_affinity_percentile : bool Whether to include affinity percentile ranks verbose : int @@ -116,7 +156,7 @@ class Class1PresentationPredictor(object): "peptide": numpy.array(peptides, copy=False), }) df["peptide_num"] = df.index - if experiment_names is None: + if sample_names is None: peptides = EncodableSequences.create(peptides) all_alleles = set() for lst in alleles.values(): @@ -138,37 +178,37 @@ class Class1PresentationPredictor(object): throw=throw) dfs = [] - for (experiment_name, experiment_alleles) in alleles.items(): + for (sample_name, sample_alleles) in alleles.items(): new_df = df.copy() - new_df["experiment_name"] = experiment_name + new_df["sample_name"] = sample_name new_df["affinity"] = predictions_df[ - experiment_alleles + sample_alleles ].min(1).values if len(df) == 0: new_df["best_allele"] = [] else: new_df["best_allele"] = predictions_df[ - experiment_alleles + sample_alleles ].idxmin(1).values dfs.append(new_df) result_df = pandas.concat(dfs, ignore_index=True) else: - df["experiment_name"] = numpy.array(experiment_names, copy=False) + df["sample_name"] = numpy.array(sample_names, copy=False) - iterator = df.groupby("experiment_name") + iterator = df.groupby("sample_name") if verbose > 0: print("Predicting affinities.") if tqdm is not None: iterator = tqdm.tqdm( - iterator, total=df.experiment_name.nunique()) + iterator, total=df.sample_name.nunique()) - for (experiment, sub_df) in iterator: + for (sample, sub_df) in iterator: predictions_df = pandas.DataFrame(index=sub_df.index) - experiment_peptides = EncodableSequences.create(sub_df.peptide.values) - for allele in alleles[experiment]: + sample_peptides = EncodableSequences.create(sub_df.peptide.values) + for allele in alleles[sample]: predictions_df[allele] = self.affinity_predictor.predict( - peptides=experiment_peptides, + peptides=sample_peptides, allele=allele, model_kwargs={'batch_size': PREDICT_BATCH_SIZE}, throw=throw) @@ -250,7 +290,7 @@ class Class1PresentationPredictor(object): self, targets, peptides, - experiment_names, + sample_names, alleles, n_flanks=None, c_flanks=None, @@ -263,9 +303,9 @@ class Class1PresentationPredictor(object): targets : list of int/float 1 indicates hit, 0 indicates decoy peptides : list of string [same length as targets] - experiment_names : list of string [same length as targets] + sample_names : list of string [same length as targets] alleles : dict of string -> list of string - Keys are experiment names, values are the alleles for that sample + Keys are sample names, values are the alleles for that sample n_flanks : list of string [same length as targets] c_flanks : list of string [same length as targets] verbose : int @@ -274,7 +314,7 @@ class Class1PresentationPredictor(object): df = self.predict_affinity( peptides=peptides, alleles=alleles, - experiment_names=experiment_names, + sample_names=sample_names, verbose=verbose) df["affinity_score"] = from_ic50(df.affinity) df["target"] = numpy.array(targets, copy=False) @@ -351,7 +391,7 @@ class Class1PresentationPredictor(object): self, peptides, alleles, - experiment_names=None, + sample_names=None, n_flanks=None, c_flanks=None, verbose=1): @@ -371,11 +411,11 @@ class Class1PresentationPredictor(object): If you are predicting for a single sample, pass a list of strings (up to 6) indicating the genotype. If you are predicting across multiple samples, pass a dict where the keys are (arbitrary) - experiment names and the values are the alleles to predict for that + sample names and the values are the alleles to predict for that sample. - experiment_names : list of string [same length as peptides] + sample_names : list of string [same length as peptides] If you are passing a dict for 'alleles', use this argument to - specify which peptides go with which experiments. + specify which peptides go with which sample. n_flanks : list of string [same length as peptides] Upstream sequences before the peptide. Sequences of any length can be given and a suffix of the size supported by the model will be @@ -395,16 +435,16 @@ class Class1PresentationPredictor(object): higher values indicating more favorable presentation likelihood. """ if isinstance(alleles, dict): - if experiment_names is None: + if sample_names is None: raise ValueError( - "experiment_names must be supplied when alleles is a dict. " + "sample_names must be supplied when alleles is a dict. " "Alternatively, call predict_to_dataframe to predict over " - "all experiments") + "all samples") return self.predict_to_dataframe( peptides=peptides, alleles=alleles, - experiment_names=experiment_names, + sample_names=sample_names, n_flanks=n_flanks, c_flanks=c_flanks, verbose=verbose).presentation_score.values @@ -413,7 +453,7 @@ class Class1PresentationPredictor(object): self, peptides, alleles, - experiment_names=None, + sample_names=None, n_flanks=None, c_flanks=None, include_affinity_percentile=False, @@ -437,13 +477,14 @@ class Class1PresentationPredictor(object): If you are predicting for a single sample, pass a list of strings (up to 6) indicating the genotype. If you are predicting across multiple samples, pass a dict where the keys are (arbitrary) - experiment names and the values are the alleles to predict for that + sample names and the values are the alleles to predict for that sample. - experiment_names : list of string [same length as peptides] - If you are passing a dict for 'alleles', you can use this argument to - specify which peptides go with which experiments. If it is None, + sample_names : list of string [same length as peptides] + If you are passing a dict for 'alleles', you can use this + argument to + specify which peptides go with which samples. If it is None, then predictions will be performed for each peptide across all - experiments. + samples. n_flanks : list of string [same length as peptides] Upstream sequences before the peptide. Sequences of any length can be given and a suffix of the size supported by the model will be @@ -474,9 +515,9 @@ class Class1PresentationPredictor(object): if not isinstance(alleles, dict): # Make alleles into a dict. - if experiment_names is not None: + if sample_names is not None: raise ValueError( - "alleles must be a dict when experiment_names is specified") + "alleles must be a dict when sample_names is specified") alleles = numpy.array(alleles, copy=False) if len(alleles) > MAX_ALLELES_PER_SAMPLE: @@ -490,7 +531,7 @@ class Class1PresentationPredictor(object): % MAX_ALLELES_PER_SAMPLE) alleles = { - "experiment1": alleles, + "sample1": alleles, } if (n_flanks is None) != (c_flanks is None): @@ -505,7 +546,7 @@ class Class1PresentationPredictor(object): df = self.predict_affinity( peptides=peptides, alleles=alleles, - experiment_names=experiment_names, # might be None + sample_names=sample_names, # might be None include_affinity_percentile=include_affinity_percentile, verbose=verbose, throw=throw) @@ -643,7 +684,7 @@ class Class1PresentationPredictor(object): if not isinstance(alleles, dict): raise ValueError("Invalid type for alleles: ", type(alleles)) - experiment_names = None if cross_product else [] + sample_names = None if cross_product else [] genotype_names = list(alleles) position_in_sequence = [] for (i, (name, sequence)) in enumerate(sequences.items()): @@ -665,7 +706,7 @@ class Class1PresentationPredictor(object): sequence_names.append(name) position_in_sequence.append(peptide_start) if not cross_product: - experiment_names.append(genotype_name) + sample_names.append(genotype_name) peptides.append(peptide) if use_flanks: n_flanks.append( @@ -678,7 +719,7 @@ class Class1PresentationPredictor(object): alleles=alleles, n_flanks=n_flanks, c_flanks=c_flanks, - experiment_names=experiment_names, + sample_names=sample_names, include_affinity_percentile=include_affinity_percentile, verbose=verbose, throw=throw) @@ -701,7 +742,7 @@ class Class1PresentationPredictor(object): if result == "best": result_df = result_df.drop_duplicates( - ["sequence_name", "experiment_name"], keep="first" + ["sequence_name", "sample_name"], keep="first" ).sort_values("sequence_name") elif result == "filtered": if comparison_is_score: diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py index 7a84c4ad..ecb35872 100644 --- a/mhcflurry/predict_command.py +++ b/mhcflurry/predict_command.py @@ -260,7 +260,7 @@ def run(argv=sys.argv[1:]): predictions = predictor.predict_affinity( peptides=df[args.peptide_column].values, alleles=allele_string_to_alleles, - experiment_names=df[args.allele_column], + sample_names=df[args.allele_column], throw=not args.no_throw, include_affinity_percentile=not args.no_affinity_percentile) else: @@ -280,7 +280,7 @@ def run(argv=sys.argv[1:]): n_flanks=n_flanks, c_flanks=c_flanks, alleles=allele_string_to_alleles, - experiment_names=df[args.allele_column], + sample_names=df[args.allele_column], throw=not args.no_throw, include_affinity_percentile=not args.no_affinity_percentile) @@ -291,7 +291,7 @@ def run(argv=sys.argv[1:]): del predictions["best_allele"] for col in predictions.columns: - if col not in ("allele", "peptide", "experiment_name", "peptide_num"): + if col not in ("allele", "peptide", "sample_name", "peptide_num"): df[args.prediction_column_prefix + col] = predictions[col] if args.out: diff --git a/mhcflurry/train_presentation_models_command.py b/mhcflurry/train_presentation_models_command.py index b50795c3..cc6c4ca8 100644 --- a/mhcflurry/train_presentation_models_command.py +++ b/mhcflurry/train_presentation_models_command.py @@ -124,7 +124,7 @@ def main(args): targets=df[args.target_column].values, peptides=df.peptide.values, alleles=experiment_to_alleles, - experiment_names=df.experiment_id, + sample_names=df.experiment_id, n_flanks=df.n_flank.values, c_flanks=df.c_flank.values, verbose=args.verbosity) diff --git a/test/test_class1_presentation_predictor.py b/test/test_class1_presentation_predictor.py index b0ab1612..35a9c6dc 100644 --- a/test/test_class1_presentation_predictor.py +++ b/test/test_class1_presentation_predictor.py @@ -15,9 +15,7 @@ from sklearn.metrics import roc_auc_score from mhcflurry import Class1AffinityPredictor, Class1ProcessingPredictor from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor from mhcflurry.downloads import get_path -from mhcflurry.common import random_peptides from mhcflurry.testing_utils import cleanup, startup -from mhcflurry.regression_target import to_ic50 import mhcflurry.class1_presentation_predictor mhcflurry.class1_presentation_predictor.PREDICT_CHUNK_SIZE = 15 @@ -80,7 +78,7 @@ def test_basic(): predictor.fit( targets=train_df.hit.values, peptides=train_df.peptide.values, - experiment_names=train_df.sample_id.values, + sample_names=train_df.sample_id.values, alleles=experiment_to_alleles, n_flanks=train_df.n_flank.values, c_flanks=train_df.c_flank.values, @@ -89,7 +87,7 @@ def test_basic(): def add_prediction_cols(test_df, predictor): test_df["prediction1"] = predictor.predict( peptides=test_df.peptide.values, - experiment_names=test_df.sample_id.values, + sample_names=test_df.sample_id.values, alleles=experiment_to_alleles, n_flanks=test_df.n_flank.values, c_flanks=test_df.c_flank.values, @@ -97,7 +95,7 @@ def test_basic(): test_df["prediction2"] = predictor.predict( peptides=test_df.peptide.values, - experiment_names=test_df.sample_id.values, + sample_names=test_df.sample_id.values, alleles=experiment_to_alleles, verbose=2) diff --git a/test/test_doctest.py b/test/test_doctest.py new file mode 100644 index 00000000..9b9205a6 --- /dev/null +++ b/test/test_doctest.py @@ -0,0 +1,20 @@ +""" +Run doctests. +""" + +import os +import doctest + +import mhcflurry +import mhcflurry.class1_presentation_predictor + +os.environ["CUDA_VISIBLE_DEVICES"] = "" + +from mhcflurry.testing_utils import cleanup, startup +teardown = cleanup +setup = startup + + +def test_doctests(): + doctest.testmod(mhcflurry) + doctest.testmod(mhcflurry.class1_presentation_predictor) diff --git a/test/test_predict_scan_command.py b/test/test_predict_scan_command.py index ff69b818..0aa87f34 100644 --- a/test/test_predict_scan_command.py +++ b/test/test_predict_scan_command.py @@ -17,6 +17,7 @@ setup = startup from . import data_path + def read_output_csv(filename): return pandas.read_csv( filename, @@ -125,6 +126,6 @@ def test_commandline_sequences(): assert_equal(result.sequence_name.nunique(), 2) assert_equal(result.best_allele.nunique(), 3) - assert_equal(result.experiment_name.nunique(), 2) + assert_equal(result.sample_name.nunique(), 2) assert_equal((result.peptide == "ASDFGHKL").sum(), 2) assert_equal((result.peptide != "ASDFGHKL").sum(), 10) \ No newline at end of file -- GitLab