From ced183f7ca06f649864ac5d248d85d74b83880d9 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 31 Jul 2019 10:58:11 -0400
Subject: [PATCH] Add motif summaries and distribution matching to rank calibration

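Extend percentile rank calibration:

- Add --match-amino-acid-distribution-data: sample calibration peptides
  from the amino acid distribution of the peptides in a CSV file instead
  of a uniform distribution.
- Add --motif-summary: for each allele, compute positional frequency
  matrices and length distributions over the top
  --summary-top-peptide-fraction of predictions and store them as
  predictor metadata dataframes.
- Add --length-range to control the calibrated peptide lengths (default
  8-15, inclusive).
- Add a positional_frequency_matrix helper to mhcflurry.common.
- Have Class1AffinityPredictor.calibrate_percentile_ranks return summary
  results (or None) instead of the encoded peptides.
- Update models_class1_pan GENERATE.sh to use the new options with
  100,000 peptides per length.
- Change the default --num-jobs from 1 to 0 in local_parallelism.
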
---
 .../models_class1_pan/GENERATE.sh             |   5 +-
 .../calibrate_percentile_ranks_command.py     |  99 +++++++++++++++---
 mhcflurry/class1_affinity_predictor.py        |  75 ++++++++++++-
 mhcflurry/common.py                           |  19 ++++
 mhcflurry/local_parallelism.py                |   2 +-
 5 files changed, 180 insertions(+), 20 deletions(-)
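
The updated GENERATE.sh exercises the new flags roughly as follows; the
models directory and training data path here are illustrative:

    mhcflurry-calibrate-percentile-ranks \
        --models-dir models.combined \
        --match-amino-acid-distribution-data train_data.csv.bz2 \
        --motif-summary \
        --num-peptides-per-length 100000 \
        --verbosity 1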

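After calibration, the summary dataframes are attached to the predictor
as metadata_dataframes and persisted by predictor.save(). A minimal
sketch of inspecting them afterwards, assuming each dataframe is written
as <name>.csv.bz2 in the models directory (the filename convention,
paths, and allele below are assumptions):

    # Read back the summary dataframes produced by --motif-summary.
    import pandas

    matrices = pandas.read_csv("models.combined/frequency_matrices.csv.bz2")
    lengths = pandas.read_csv("models.combined/length_distributions.csv.bz2")

    # One row per (allele, length, position); amino acid columns give the
    # fraction of top-scoring peptides with that residue at that position.
    print(matrices[(matrices["allele"] == "HLA-A*02:01") &
                   (matrices["length"] == 9)])

    # Fraction of top-scoring peptides at each length, per allele.
    print(lengths[lengths["allele"] == "HLA-A*02:01"])
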
diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh
index c3aa7fad..2c2dd104 100755
--- a/downloads-generation/models_class1_pan/GENERATE.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.sh
@@ -56,7 +56,10 @@ do
 
     time mhcflurry-calibrate-percentile-ranks \
         --models-dir models.${kind} \
-        --num-peptides-per-length 10000 \
+        --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
+        --motif-summary \
+        --num-peptides-per-length 100000 \
+        --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
 done
 
diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py
index 43cc9d86..5cd1334e 100644
--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -7,14 +7,18 @@ import signal
 import sys
 import time
 import traceback
+import collections
 from functools import partial
 
+import pandas
+
 from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
 
 from .class1_affinity_predictor import Class1AffinityPredictor
-from .common import configure_logging
+from .encodable_sequences import EncodableSequences
+from .common import configure_logging, random_peptides, amino_acid_distribution
 from .local_parallelism import (
     add_local_parallelism_args,
     worker_pool_with_gpu_assignments_from_args,
@@ -41,6 +47,11 @@ parser.add_argument(
     nargs="+",
     help="Alleles to calibrate percentile ranks for. If not specified all "
     "alleles are used")
+parser.add_argument(
+    "--match-amino-acid-distribution-data",
+    help="Sample random peptides from the amino acid distribution of the "
+    "peptides listed in the supplied CSV file, which must have a 'peptide' "
+    "column. If not specified a uniform distribution is used.")
 parser.add_argument(
     "--num-peptides-per-length",
     type=int,
@@ -48,6 +59,25 @@ parser.add_argument(
     default=int(1e5),
     help="Number of peptides per length to use to calibrate percent ranks. "
     "Default: %(default)s.")
+parser.add_argument(
+    "--motif-summary",
+    default=False,
+    action="store_true",
+    help="Calculate motifs and length preferences for each allele")
+parser.add_argument(
+    "--summary-top-peptide-fraction",
+    default=0.001,
+    type=float,
+    metavar="X",
+    help="The top X fraction of predictions (i.e. tightest binders) to use to "
+    "generate motifs and length preferences. Default: %(default)s")
+parser.add_argument(
+    "--length-range",
+    default=(8, 15),
+    type=int,
+    nargs=2,
+    help="Min and max peptide length to calibrate, inclusive. "
+    "Default: %(default)s")
 parser.add_argument(
     "--verbosity",
     type=int,
@@ -77,13 +107,26 @@ def run(argv=sys.argv[1:]):
     else:
         alleles = predictor.supported_alleles
 
+    distribution = None
+    if args.match_amino_acid_distribution_data:
+        distribution_peptides = pandas.read_csv(
+            args.match_amino_acid_distribution_data).peptide.unique()
+        distribution = amino_acid_distribution(distribution_peptides)
+        print("Using amino acid distribution:")
+        print(distribution)
+
     start = time.time()
 
-    print("Performing percent rank calibration for %d alleles: encoding peptides" % (
+    print("Percent rank calibration for %d alleles. Encoding peptides." % (
         len(alleles)))
-    encoded_peptides = predictor.calibrate_percentile_ranks(
-        alleles=[],  # don't actually do any calibration, just return peptides
-        num_peptides_per_length=args.num_peptides_per_length)
+
+    peptides = []
+    lengths = range(args.length_range[0], args.length_range[1] + 1)
+    for length in lengths:
+        peptides.extend(
+            random_peptides(
+                args.num_peptides_per_length, length, distribution=distribution))
+    encoded_peptides = EncodableSequences.create(peptides)
 
     # Now we encode the peptides for each neural network, so the encoding
     # becomes cached.
@@ -100,6 +143,12 @@ def run(argv=sys.argv[1:]):
 
     worker_pool = worker_pool_with_gpu_assignments_from_args(args)
 
+    constant_kwargs = {
+        'motif_summary': args.motif_summary,
+        'summary_top_peptide_fraction': args.summary_top_peptide_fraction,
+        'verbose': args.verbosity > 0,
+    }
+
     if worker_pool is None:
         # Serial run
         print("Running in serial.")
@@ -107,20 +156,34 @@ def run(argv=sys.argv[1:]):
             calibrate_percentile_ranks(
                 allele=allele,
                 predictor=predictor,
-                peptides=encoded_peptides)
+                peptides=encoded_peptides,
+                **constant_kwargs,
+            )
             for allele in alleles)
     else:
         # Parallel run
         results = worker_pool.imap_unordered(
             partial(
                 partial(call_wrapped, calibrate_percentile_ranks),
-                predictor=predictor),
+                predictor=predictor,
+                **constant_kwargs),
             alleles,
             chunksize=1)
 
-    for result in tqdm.tqdm(results, total=len(alleles)):
-        predictor.allele_to_percent_rank_transform.update(result)
-    print("Done calibrating %d additional alleles." % len(alleles))
+    summary_results_lists = collections.defaultdict(list)
+    for (transforms, summary_results) in tqdm.tqdm(results, total=len(alleles)):
+        predictor.allele_to_percent_rank_transform.update(transforms)
+        if summary_results is not None:
+            for (item, value) in summary_results.items():
+                summary_results_lists[item].append(value)
+    print("Done calibrating %d alleles." % len(alleles))
+    if summary_results_lists:
+        for (name, lst) in summary_results_lists.items():
+            df = pandas.concat(lst, ignore_index=True)
+            predictor.metadata_dataframes[name] = df
+            print("Including summary result: %s" % name)
+            print(df)
+
     predictor.save(args.models_dir, model_names_to_write=[])
 
     percent_rank_calibration_time = time.time() - start
@@ -134,19 +197,29 @@ def run(argv=sys.argv[1:]):
     print("Predictor written to: %s" % args.models_dir)
 
 
-def calibrate_percentile_ranks(allele, predictor, peptides=None):
+def calibrate_percentile_ranks(
+        allele,
+        predictor,
+        peptides=None,
+        motif_summary=False,
+        summary_top_peptide_fraction=0.001,
+        verbose=False):
     """
     Private helper function.
     """
     global GLOBAL_DATA
     if peptides is None:
         peptides = GLOBAL_DATA["calibration_peptides"]
-    predictor.calibrate_percentile_ranks(
+    summary_results = predictor.calibrate_percentile_ranks(
         peptides=peptides,
-        alleles=[allele])
-    return {
+        alleles=[allele],
+        motif_summary=motif_summary,
+        summary_top_peptide_fraction=summary_top_peptide_fraction,
+        verbose=verbose)
+    transforms = {
         allele: predictor.allele_to_percent_rank_transform[allele],
     }
+    return (transforms, summary_results)
 
 
 if __name__ == '__main__':
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 0c383739..1f2507e8 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -17,7 +17,7 @@ from numpy.testing import assert_equal
 from six import string_types
 
 from .class1_neural_network import Class1NeuralNetwork
-from .common import random_peptides
+from .common import random_peptides, positional_frequency_matrix
 from .downloads import get_default_class1_models_dir
 from .encodable_sequences import EncodableSequences
 from .percent_rank_transform import PercentRankTransform
@@ -95,7 +95,8 @@ class Class1AffinityPredictor(object):
         if not allele_to_percent_rank_transform:
             allele_to_percent_rank_transform = {}
         self.allele_to_percent_rank_transform = allele_to_percent_rank_transform
-        self.metadata_dataframes = metadata_dataframes
+        self.metadata_dataframes = (
+            dict(metadata_dataframes) if metadata_dataframes else {})
         self._cache = {}
 
         assert isinstance( self.allele_to_allele_specific_models, dict)
@@ -1153,7 +1154,10 @@ class Class1AffinityPredictor(object):
             peptides=None,
             num_peptides_per_length=int(1e5),
             alleles=None,
-            bins=None):
+            bins=None,
+            motif_summary=False,
+            summary_top_peptide_fraction=0.001,
+            verbose=False):
         """
         Compute the cumulative distribution of ic50 values for a set of alleles
         over a large universe of random peptides, to enable computing quantiles in
@@ -1196,13 +1200,76 @@
 
         encoded_peptides = EncodableSequences.create(peptides)
 
+        if motif_summary:
+            frequency_matrices = []
+            length_distributions = []
+        else:
+            frequency_matrices = None
+            length_distributions = None
         for (i, allele) in enumerate(alleles):
+            start = time.time()
             predictions = self.predict(encoded_peptides, allele=allele)
+            if verbose:
+                elapsed = time.time() - start
+                print(
+                    "Generated %d predictions for allele %s in %0.2f sec: "
+                    "%0.2f predictions / sec" % (
+                        len(encoded_peptides.sequences),
+                        allele,
+                        elapsed,
+                        len(encoded_peptides.sequences) / elapsed))
             transform = PercentRankTransform()
             transform.fit(predictions, bins=bins)
             self.allele_to_percent_rank_transform[allele] = transform
 
-        return encoded_peptides
+            if frequency_matrices is not None:
+                predictions_df = pandas.DataFrame({
+                    'peptide': encoded_peptides.sequences,
+                    'prediction': predictions
+                }).drop_duplicates('peptide').set_index("peptide")
+                predictions_df["length"] = predictions_df.index.str.len()
+                for (length, sub_df) in predictions_df.groupby("length"):
+                    selected = sub_df.prediction.nsmallest(
+                        max(
+                            int(len(sub_df) * summary_top_peptide_fraction),
+                            1)).index.values
+                    matrix = positional_frequency_matrix(selected).reset_index()
+                    original_columns = list(matrix.columns)
+                    matrix["length"] = length
+                    matrix["allele"] = allele
+                    matrix = matrix[["allele", "length"] + original_columns]
+                    frequency_matrices.append(matrix)
+
+                # Length distribution
+                length_distribution = predictions_df.prediction.nsmallest(
+                    max(
+                        int(len(predictions_df) * summary_top_peptide_fraction),
+                        1)).index.str.len().value_counts()
+                length_distribution.index.name = "length"
+                length_distribution /= length_distribution.sum()
+                length_distribution = length_distribution.to_frame()
+                length_distribution.columns = ["fraction"]
+                length_distribution = length_distribution.reset_index()
+                length_distribution["allele"] = allele
+                length_distribution = length_distribution[
+                    ["allele", "length", "fraction"]
+                ].sort_values("length")
+                length_distributions.append(length_distribution)
+
+        if frequency_matrices is not None:
+            frequency_matrices = pandas.concat(
+                frequency_matrices, ignore_index=True)
+
+        if length_distributions is not None:
+            length_distributions = pandas.concat(
+                length_distributions, ignore_index=True)
+
+        if motif_summary:
+            return {
+                'frequency_matrices': frequency_matrices,
+                'length_distributions': length_distributions,
+            }
 
     def filter_networks(self, predicate):
         """
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index b5120b52..68a51084 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -147,3 +147,22 @@ def random_peptides(num, length=9, distribution=None):
             p=distribution.values,
             size=(int(num), int(length)))
     ]
+
+
+def positional_frequency_matrix(peptides):
+    """
+    Given a list of peptides of equal length, return a DataFrame of shape
+    (length, amino acids) giving the fraction of peptides with each amino
+    acid at each (1-based) position.
+    """
+    length = len(peptides[0])
+    assert all(len(peptide) == length for peptide in peptides)
+    counts = pandas.DataFrame(
+        index=[a for a in amino_acid.BLOSUM62_MATRIX.index if a != 'X'],
+        columns=numpy.arange(1, length + 1),
+    )
+    for i in range(length):
+        counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts()
+    result = (counts / len(peptides)).fillna(0.0).T
+    result.index.name = 'position'
+    return result
diff --git a/mhcflurry/local_parallelism.py b/mhcflurry/local_parallelism.py
index 5a3d311b..85806b71 100644
--- a/mhcflurry/local_parallelism.py
+++ b/mhcflurry/local_parallelism.py
@@ -18,7 +18,7 @@ def add_local_parallelism_args(parser):
 
     group.add_argument(
         "--num-jobs",
-        default=1,
+        default=0,
         type=int,
         metavar="N",
         help="Number of processes to parallelize training over. Experimental. "
-- 
GitLab