From 527a608d34e6361b100271a3390cf867a0a94812 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 6 Mar 2020 19:28:10 -0500 Subject: [PATCH] Working on docs --- mhcflurry/class1_presentation_predictor.py | 92 +++++++++++++++++++--- test/test_class1_presentation_predictor.py | 4 +- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py index f2d8094e..ef3f5e48 100644 --- a/mhcflurry/class1_presentation_predictor.py +++ b/mhcflurry/class1_presentation_predictor.py @@ -1,14 +1,12 @@ from __future__ import print_function -from os.path import join, exists, abspath -from os import mkdir, environ +from os.path import join, exists +from os import mkdir from socket import gethostname from getpass import getuser import time import collections -import json -import hashlib import logging from six import string_types @@ -17,7 +15,6 @@ import pandas import sklearn import sklearn.linear_model -import mhcnames try: import tqdm @@ -30,9 +27,7 @@ from .class1_processing_predictor import Class1ProcessingPredictor from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE from .encodable_sequences import EncodableSequences from .regression_target import from_ic50, to_ic50 -from .multiple_allele_encoding import MultipleAlleleEncoding from .downloads import get_default_class1_presentation_models_dir -from .common import load_weights MAX_ALLELES_PER_SAMPLE = 6 @@ -41,6 +36,12 @@ PREDICT_CHUNK_SIZE = 100000 # currently used only for cleavage prediction class Class1PresentationPredictor(object): + """ + A logistic regression model over predicted binding affinity (BA) and antigen + processing (AP) score. + + See load() and predict() methods for basic usage. 
+ """ model_inputs = ["affinity_score", "processing_score"] def __init__( @@ -61,10 +62,16 @@ class Class1PresentationPredictor(object): @property def supported_alleles(self): + """ + List of alleles supported by the underlying Class1AffinityPredictor + """ return self.affinity_predictor.supported_alleles @property def supported_peptide_lengths(self): + """ + (min, max) of supported peptide lengths, inclusive. + """ return self.affinity_predictor.supported_peptide_lengths def predict_affinity( @@ -75,6 +82,21 @@ class Class1PresentationPredictor(object): include_affinity_percentile=False, verbose=1, throw=True): + """ + + Parameters + ---------- + peptides + experiment_names + alleles + include_affinity_percentile + verbose + throw + + Returns + ------- + + """ df = pandas.DataFrame({ "peptide": numpy.array(peptides, copy=False), "experiment_name": numpy.array(experiment_names, copy=False), @@ -242,14 +264,62 @@ class Class1PresentationPredictor(object): self, sequences, alleles, - result="best", # or "all" or "filtered" + result="best", comparison_quantity="presentation_score", - comparison_value=None, + filter_value=None, peptide_lengths=[8, 9, 10, 11], use_flanks=True, include_affinity_percentile=False, verbose=1, throw=True): + """ + Predict across protein sequences. + + Parameters + ---------- + sequences : str, list of string, or string -> string dict + Protein sequences. If a dict is given, the keys are arbitrary ( + e.g. protein names), and the values are the amino acid sequences. + alleles : str, list of string, list of list of string, or string -> string dict + MHC I alleles. Can be: (1) a string (a single allele), (2) a list of + strings (a single genotype), (3) a list of list of strings + (multiple genotypes, where the total number of genotypes must equal + the number of sequences), or (4) a dict (in which case the keys must + match the sequences dict keys). 
+ result : string One of: - "best": return the strongest peptide for each sequence - "all": return predictions for all peptides - "filtered": return predictions where comparison_quantity is stronger than filter_value. comparison_quantity : string One of "presentation_score", "processing_score", or "affinity". Quantity to use to rank (if result is "best") or filter (if result is "filtered") results. filter_value : float Threshold value to use, only relevant when result is "filtered". If comparison_quantity is "affinity", then all results less than or equal to (i.e. at least as tight as) the specified nM affinity are retained. If it's "presentation_score" or "processing_score" then results greater than or equal to the indicated filter_value are retained. peptide_lengths : list of int Peptide lengths to predict for. use_flanks : bool Whether to include flanking sequences when running the AP predictor (for better cleavage prediction). include_affinity_percentile : bool Whether to include affinity percentile ranks in output. verbose : int Set to 0 for quiet mode. throw : boolean Whether to throw exceptions (vs. log warnings) on invalid inputs. 
+ + Returns + ------- + pandas.DataFrame with columns: + peptide, n_flank, c_flank, sequence_name, affinity, best_allele, + processing_score, presentation_score + """ processing_predictor = self.processing_predictor_with_flanks if not use_flanks or processing_predictor is None: @@ -339,11 +409,11 @@ class Class1PresentationPredictor(object): elif result == "filtered": if comparison_is_score: result_df = result_df.loc[ - result_df[comparison_quantity] >= comparison_value + result_df[comparison_quantity] >= filter_value ] else: result_df = result_df.loc[ - result_df[comparison_quantity] <= comparison_value + result_df[comparison_quantity] <= filter_value ] elif result == "all": pass diff --git a/test/test_class1_presentation_predictor.py b/test/test_class1_presentation_predictor.py index f811ab6b..242f6087 100644 --- a/test/test_class1_presentation_predictor.py +++ b/test/test_class1_presentation_predictor.py @@ -158,7 +158,7 @@ def test_downloaded_predictor(): scan_results2 = PRESENTATION_PREDICTOR.predict_sequences( result="filtered", - comparison_value=500, + filter_value=500, comparison_quantity="affinity", sequences={ "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE", @@ -180,7 +180,7 @@ def test_downloaded_predictor(): scan_results3 = PRESENTATION_PREDICTOR.predict_sequences( result="filtered", - comparison_value=0.9, + filter_value=0.9, comparison_quantity="presentation_score", sequences={ "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE", -- GitLab