From 44d3bbaf74d85936d7f015559de15e1b98d186cc Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 16 Mar 2020 12:16:11 -0400
Subject: [PATCH] Working on docs

---
 mhcflurry/class1_presentation_predictor.py    | 127 ++++++++++++------
 mhcflurry/predict_command.py                  |   6 +-
 .../train_presentation_models_command.py      |   2 +-
 test/test_class1_presentation_predictor.py    |   8 +-
 test/test_doctest.py                          |  20 +++
 test/test_predict_scan_command.py             |   3 +-
 6 files changed, 113 insertions(+), 53 deletions(-)
 create mode 100644 test/test_doctest.py

diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index 812ffae6..a1f67ee8 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -45,7 +45,8 @@ class Class1PresentationPredictor(object):
     These predictions are combined using a logistic regression model to give
     a "presentation score" prediction.
 
-    See load() and predict() methods for basic usage.
+    Most users will call the `load` static method to get an instance of this
+    class, then call the `predict_to_dataframe` method to generate predictions.
     """
     model_inputs = ["affinity_score", "processing_score"]
 
@@ -83,23 +84,62 @@ class Class1PresentationPredictor(object):
             self,
             peptides,
             alleles,
-            experiment_names=None,
+            sample_names=None,
             include_affinity_percentile=False,
             verbose=1,
             throw=True):
         """
         Predict binding affinities.
 
+        Two modes are supported: each peptide can be evaluated for binding to
+        any of the alleles in any sample (this is what happens when sample_names
+        is None), or the i'th peptide can be evaluated for binding the alleles
+        of the sample given by the i'th entry in sample_names.
+
+        For example, if we don't specify sample_names, then predictions
+        are taken for all combinations of samples and peptides:
+
+        >>> predictor = Class1PresentationPredictor.load()
+        >>> predictor.predict_affinity(
+        ...    peptides=["SIINFEKL", "PEPTIDE"],
+        ...    alleles={
+        ...        "sample1": ["A0201", "A0301", "B0702"],
+        ...        "sample2": ["A0101", "C0202"],
+        ...    },
+        ...    verbose=0)
+            peptide  peptide_num sample_name      affinity best_allele
+        0  SIINFEKL            0     sample1  12906.787792       A0201
+        1   PEPTIDE            1     sample1  36827.681130       B0702
+        2  SIINFEKL            0     sample2   3588.413748       C0202
+        3   PEPTIDE            1     sample2  34362.109211       C0202
+
+        In contrast, here we specify sample_names, so each peptide is evaluated
+        for binding the alleles in the corresponding sample:
+
+        >>> predictor.predict_affinity(
+        ...    peptides=["SIINFEKL", "PEPTIDE"],
+        ...    alleles={
+        ...        "sample1": ["A0201", "A0301", "B0702"],
+        ...        "sample2": ["A0101", "C0202"],
+        ...    },
+        ...    sample_names=["sample2", "sample1"],
+        ...    verbose=0)
+            peptide  peptide_num sample_name      affinity best_allele
+        0  SIINFEKL            0     sample2   3588.412141       C0202
+        1   PEPTIDE            1     sample1  36827.682779       B0702
+
+
         Parameters
         ----------
         peptides : list of string
         alleles : dict of string -> list of string
-            Keys are experiment names, values are the alleles (genotype) for
+            Keys are sample names, values are the alleles (genotype) for
             that sample
-        experiment_names : list of string [same length as peptides]
+        sample_names : list of string [same length as peptides]
             Sample names corresponding to each peptide. These are used to
             lookup the alleles for each peptide in the alleles dict. If not
-            specified, then all combinations of experiment names
+            specified, then predictions are generated for all sample genotypes
+            across all peptides.
         include_affinity_percentile : bool
             Whether to include affinity percentile ranks
         verbose : int
@@ -116,7 +156,7 @@ class Class1PresentationPredictor(object):
             "peptide": numpy.array(peptides, copy=False),
         })
         df["peptide_num"] = df.index
-        if experiment_names is None:
+        if sample_names is None:
             peptides = EncodableSequences.create(peptides)
             all_alleles = set()
             for lst in alleles.values():
@@ -138,37 +178,37 @@ class Class1PresentationPredictor(object):
                     throw=throw)
 
             dfs = []
-            for (experiment_name, experiment_alleles) in alleles.items():
+            for (sample_name, sample_alleles) in alleles.items():
                 new_df = df.copy()
-                new_df["experiment_name"] = experiment_name
+                new_df["sample_name"] = sample_name
                 new_df["affinity"] = predictions_df[
-                    experiment_alleles
+                    sample_alleles
                 ].min(1).values
                 if len(df) == 0:
                     new_df["best_allele"] = []
                 else:
                     new_df["best_allele"] = predictions_df[
-                        experiment_alleles
+                        sample_alleles
                     ].idxmin(1).values
                 dfs.append(new_df)
 
             result_df = pandas.concat(dfs, ignore_index=True)
         else:
-            df["experiment_name"] = numpy.array(experiment_names, copy=False)
+            df["sample_name"] = numpy.array(sample_names, copy=False)
 
-            iterator = df.groupby("experiment_name")
+            iterator = df.groupby("sample_name")
             if verbose > 0:
                 print("Predicting affinities.")
                 if tqdm is not None:
                     iterator = tqdm.tqdm(
-                        iterator, total=df.experiment_name.nunique())
+                        iterator, total=df.sample_name.nunique())
 
-            for (experiment, sub_df) in iterator:
+            for (sample, sub_df) in iterator:
                 predictions_df = pandas.DataFrame(index=sub_df.index)
-                experiment_peptides = EncodableSequences.create(sub_df.peptide.values)
-                for allele in alleles[experiment]:
+                sample_peptides = EncodableSequences.create(sub_df.peptide.values)
+                for allele in alleles[sample]:
                     predictions_df[allele] = self.affinity_predictor.predict(
-                        peptides=experiment_peptides,
+                        peptides=sample_peptides,
                         allele=allele,
                         model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
                         throw=throw)
@@ -250,7 +290,7 @@ class Class1PresentationPredictor(object):
             self,
             targets,
             peptides,
-            experiment_names,
+            sample_names,
             alleles,
             n_flanks=None,
             c_flanks=None,
@@ -263,9 +303,9 @@ class Class1PresentationPredictor(object):
         targets : list of int/float
             1 indicates hit, 0 indicates decoy
         peptides : list of string [same length as targets]
-        experiment_names : list of string [same length as targets]
+        sample_names : list of string [same length as targets]
         alleles : dict of string -> list of string
-            Keys are experiment names, values are the alleles for that sample
+            Keys are sample names, values are the alleles for that sample
         n_flanks : list of string [same length as targets]
         c_flanks : list of string [same length as targets]
         verbose : int
@@ -274,7 +314,7 @@ class Class1PresentationPredictor(object):
         df = self.predict_affinity(
             peptides=peptides,
             alleles=alleles,
-            experiment_names=experiment_names,
+            sample_names=sample_names,
             verbose=verbose)
         df["affinity_score"] = from_ic50(df.affinity)
         df["target"] = numpy.array(targets, copy=False)
@@ -351,7 +391,7 @@ class Class1PresentationPredictor(object):
             self,
             peptides,
             alleles,
-            experiment_names=None,
+            sample_names=None,
             n_flanks=None,
             c_flanks=None,
             verbose=1):
@@ -371,11 +411,11 @@ class Class1PresentationPredictor(object):
             If you are predicting for a single sample, pass a list of strings
             (up to 6) indicating the genotype. If you are predicting across
             multiple samples, pass a dict where the keys are (arbitrary)
-            experiment names and the values are the alleles to predict for that
+            sample names and the values are the alleles to predict for that
             sample.
-        experiment_names : list of string [same length as peptides]
+        sample_names : list of string [same length as peptides]
             If you are passing a dict for 'alleles', use this argument to
-            specify which peptides go with which experiments.
+            specify which peptides go with which samples.
         n_flanks : list of string [same length as peptides]
             Upstream sequences before the peptide. Sequences of any length can
             be given and a suffix of the size supported by the model will be
@@ -395,16 +435,16 @@ class Class1PresentationPredictor(object):
         higher values indicating more favorable presentation likelihood.
         """
         if isinstance(alleles, dict):
-            if experiment_names is None:
+            if sample_names is None:
                 raise ValueError(
-                    "experiment_names must be supplied when alleles is a dict. "
+                    "sample_names must be supplied when alleles is a dict. "
                     "Alternatively, call predict_to_dataframe to predict over "
-                    "all experiments")
+                    "all samples")
 
         return self.predict_to_dataframe(
             peptides=peptides,
             alleles=alleles,
-            experiment_names=experiment_names,
+            sample_names=sample_names,
             n_flanks=n_flanks,
             c_flanks=c_flanks,
             verbose=verbose).presentation_score.values
@@ -413,7 +453,7 @@ class Class1PresentationPredictor(object):
             self,
             peptides,
             alleles,
-            experiment_names=None,
+            sample_names=None,
             n_flanks=None,
             c_flanks=None,
             include_affinity_percentile=False,
@@ -437,13 +477,14 @@ class Class1PresentationPredictor(object):
             If you are predicting for a single sample, pass a list of strings
             (up to 6) indicating the genotype. If you are predicting across
             multiple samples, pass a dict where the keys are (arbitrary)
-            experiment names and the values are the alleles to predict for that
+            sample names and the values are the alleles to predict for that
             sample.
-        experiment_names : list of string [same length as peptides]
-            If you are passing a dict for 'alleles', you can use this argument to
-            specify which peptides go with which experiments. If it is None,
+        sample_names : list of string [same length as peptides]
+            If you are passing a dict for 'alleles', you can use this
+            argument to specify which peptides go with which samples.
+            If it is None,
             then predictions will be performed for each peptide across all
-            experiments.
+            samples.
         n_flanks : list of string [same length as peptides]
             Upstream sequences before the peptide. Sequences of any length can
             be given and a suffix of the size supported by the model will be
@@ -474,9 +515,9 @@ class Class1PresentationPredictor(object):
 
         if not isinstance(alleles, dict):
             # Make alleles into a dict.
-            if experiment_names is not None:
+            if sample_names is not None:
                 raise ValueError(
-                    "alleles must be a dict when experiment_names is specified")
+                    "alleles must be a dict when sample_names is specified")
 
             alleles = numpy.array(alleles, copy=False)
             if len(alleles) > MAX_ALLELES_PER_SAMPLE:
@@ -490,7 +531,7 @@ class Class1PresentationPredictor(object):
                     % MAX_ALLELES_PER_SAMPLE)
 
             alleles = {
-                "experiment1": alleles,
+                "sample1": alleles,
             }
 
         if (n_flanks is None) != (c_flanks is None):
@@ -505,7 +546,7 @@ class Class1PresentationPredictor(object):
         df = self.predict_affinity(
             peptides=peptides,
             alleles=alleles,
-            experiment_names=experiment_names,  # might be None
+            sample_names=sample_names,  # might be None
             include_affinity_percentile=include_affinity_percentile,
             verbose=verbose,
             throw=throw)
@@ -643,7 +684,7 @@ class Class1PresentationPredictor(object):
         if not isinstance(alleles, dict):
             raise ValueError("Invalid type for alleles: ", type(alleles))
 
-        experiment_names = None if cross_product else []
+        sample_names = None if cross_product else []
         genotype_names = list(alleles)
         position_in_sequence = []
         for (i, (name, sequence)) in enumerate(sequences.items()):
@@ -665,7 +706,7 @@ class Class1PresentationPredictor(object):
                     sequence_names.append(name)
                     position_in_sequence.append(peptide_start)
                     if not cross_product:
-                        experiment_names.append(genotype_name)
+                        sample_names.append(genotype_name)
                     peptides.append(peptide)
                     if use_flanks:
                         n_flanks.append(
@@ -678,7 +719,7 @@ class Class1PresentationPredictor(object):
             alleles=alleles,
             n_flanks=n_flanks,
             c_flanks=c_flanks,
-            experiment_names=experiment_names,
+            sample_names=sample_names,
             include_affinity_percentile=include_affinity_percentile,
             verbose=verbose,
             throw=throw)
@@ -701,7 +742,7 @@ class Class1PresentationPredictor(object):
 
         if result == "best":
             result_df = result_df.drop_duplicates(
-                ["sequence_name", "experiment_name"], keep="first"
+                ["sequence_name", "sample_name"], keep="first"
             ).sort_values("sequence_name")
         elif result == "filtered":
             if comparison_is_score:
diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py
index 7a84c4ad..ecb35872 100644
--- a/mhcflurry/predict_command.py
+++ b/mhcflurry/predict_command.py
@@ -260,7 +260,7 @@ def run(argv=sys.argv[1:]):
         predictions = predictor.predict_affinity(
             peptides=df[args.peptide_column].values,
             alleles=allele_string_to_alleles,
-            experiment_names=df[args.allele_column],
+            sample_names=df[args.allele_column],
             throw=not args.no_throw,
             include_affinity_percentile=not args.no_affinity_percentile)
     else:
@@ -280,7 +280,7 @@ def run(argv=sys.argv[1:]):
             n_flanks=n_flanks,
             c_flanks=c_flanks,
             alleles=allele_string_to_alleles,
-            experiment_names=df[args.allele_column],
+            sample_names=df[args.allele_column],
             throw=not args.no_throw,
             include_affinity_percentile=not args.no_affinity_percentile)
 
@@ -291,7 +291,7 @@ def run(argv=sys.argv[1:]):
             del predictions["best_allele"]
 
     for col in predictions.columns:
-        if col not in ("allele", "peptide", "experiment_name", "peptide_num"):
+        if col not in ("allele", "peptide", "sample_name", "peptide_num"):
             df[args.prediction_column_prefix + col] = predictions[col]
 
     if args.out:
diff --git a/mhcflurry/train_presentation_models_command.py b/mhcflurry/train_presentation_models_command.py
index b50795c3..cc6c4ca8 100644
--- a/mhcflurry/train_presentation_models_command.py
+++ b/mhcflurry/train_presentation_models_command.py
@@ -124,7 +124,7 @@ def main(args):
         targets=df[args.target_column].values,
         peptides=df.peptide.values,
         alleles=experiment_to_alleles,
-        experiment_names=df.experiment_id,
+        sample_names=df.experiment_id,
         n_flanks=df.n_flank.values,
         c_flanks=df.c_flank.values,
         verbose=args.verbosity)
diff --git a/test/test_class1_presentation_predictor.py b/test/test_class1_presentation_predictor.py
index b0ab1612..35a9c6dc 100644
--- a/test/test_class1_presentation_predictor.py
+++ b/test/test_class1_presentation_predictor.py
@@ -15,9 +15,7 @@ from sklearn.metrics import roc_auc_score
 from mhcflurry import Class1AffinityPredictor, Class1ProcessingPredictor
 from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
 from mhcflurry.downloads import get_path
-from mhcflurry.common import random_peptides
 from mhcflurry.testing_utils import cleanup, startup
-from mhcflurry.regression_target import to_ic50
 import mhcflurry.class1_presentation_predictor
 mhcflurry.class1_presentation_predictor.PREDICT_CHUNK_SIZE = 15
 
@@ -80,7 +78,7 @@ def test_basic():
     predictor.fit(
         targets=train_df.hit.values,
         peptides=train_df.peptide.values,
-        experiment_names=train_df.sample_id.values,
+        sample_names=train_df.sample_id.values,
         alleles=experiment_to_alleles,
         n_flanks=train_df.n_flank.values,
         c_flanks=train_df.c_flank.values,
@@ -89,7 +87,7 @@ def test_basic():
     def add_prediction_cols(test_df, predictor):
         test_df["prediction1"] = predictor.predict(
             peptides=test_df.peptide.values,
-            experiment_names=test_df.sample_id.values,
+            sample_names=test_df.sample_id.values,
             alleles=experiment_to_alleles,
             n_flanks=test_df.n_flank.values,
             c_flanks=test_df.c_flank.values,
@@ -97,7 +95,7 @@ def test_basic():
 
         test_df["prediction2"] = predictor.predict(
             peptides=test_df.peptide.values,
-            experiment_names=test_df.sample_id.values,
+            sample_names=test_df.sample_id.values,
             alleles=experiment_to_alleles,
             verbose=2)
 
diff --git a/test/test_doctest.py b/test/test_doctest.py
new file mode 100644
index 00000000..9b9205a6
--- /dev/null
+++ b/test/test_doctest.py
@@ -0,0 +1,20 @@
+"""
+Run doctests.
+"""
+
+import os
+import doctest
+
+import mhcflurry
+import mhcflurry.class1_presentation_predictor
+
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+from mhcflurry.testing_utils import cleanup, startup
+teardown = cleanup
+setup = startup
+
+
+def test_doctests():  # testmod() only returns a failure count, so assert on it
+    for module in (mhcflurry, mhcflurry.class1_presentation_predictor):
+        assert doctest.testmod(module).failed == 0, module.__name__
diff --git a/test/test_predict_scan_command.py b/test/test_predict_scan_command.py
index ff69b818..0aa87f34 100644
--- a/test/test_predict_scan_command.py
+++ b/test/test_predict_scan_command.py
@@ -17,6 +17,7 @@ setup = startup
 
 from . import data_path
 
+
 def read_output_csv(filename):
     return pandas.read_csv(
         filename,
@@ -125,6 +126,6 @@ def test_commandline_sequences():
 
     assert_equal(result.sequence_name.nunique(), 2)
     assert_equal(result.best_allele.nunique(), 3)
-    assert_equal(result.experiment_name.nunique(), 2)
+    assert_equal(result.sample_name.nunique(), 2)
     assert_equal((result.peptide == "ASDFGHKL").sum(), 2)
     assert_equal((result.peptide != "ASDFGHKL").sum(), 10)
\ No newline at end of file
-- 
GitLab