fixes

ced183f7 · Tim O'Donnell · b7763e97 · ced183f7 · ced183f7 · ced183f7
Commit ced183f7 authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/models_class1_pan/GENERATE.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.sh
@@ -56,7 +56,10 @@ do

    time mhcflurry-calibrate-percentile-ranks \
        --models-dir models.${kind} \
-        --num-peptides-per-length 10000 \
+        --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
+        --motif-summary \
+        --num-peptides-per-length 100000 \
+        --verbosity 1 \
        --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
 done


--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -7,14 +7,20 @@ import signal
 import sys
 import time
 import traceback
+import collections
 from functools import partial

+import pandas
+import numpy
+
 from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481

 from .class1_affinity_predictor import Class1AffinityPredictor
-from .common import configure_logging
+from .encodable_sequences import EncodableSequences
+from .common import configure_logging, random_peptides, amino_acid_distribution
+from .amino_acid import BLOSUM62_MATRIX
 from .local_parallelism import (
    add_local_parallelism_args,
    worker_pool_with_gpu_assignments_from_args,
@@ -41,6 +47,11 @@ parser.add_argument(
    nargs="+",
    help="Alleles to calibrate percentile ranks for. If not specified all "
    "alleles are used")
+parser.add_argument(
+    "--match-amino-acid-distribution-data",
+    help="Sample random peptides from the amino acid distribution of the "
+    "peptides listed in the supplied CSV file, which must have a 'peptide' "
+    "column. If not specified a uniform distribution is used.")
 parser.add_argument(
    "--num-peptides-per-length",
    type=int,
@@ -48,6 +59,25 @@ parser.add_argument(
    default=int(1e5),
    help="Number of peptides per length to use to calibrate percent ranks. "
    "Default: %(default)s.")
+parser.add_argument(
+    "--motif-summary",
+    default=False,
+    action="store_true",
+    help="Calculate motifs and length preferences for each allele")
+parser.add_argument(
+    "--summary-top-peptide-fraction",
+    default=0.001,
+    type=float,
+    metavar="X",
+    help="The top X fraction of predictions (i.e. tightest binders) to use to "
+    "generate motifs and length preferences. Default: %(default)s")
+parser.add_argument(
+    "--length-range",
+    default=(8, 15),
+    type=int,
+    nargs=2,
+    help="Min and max peptide length to calibrate, inclusive. "
+    "Default: %(default)s")
 parser.add_argument(
    "--verbosity",
    type=int,
@@ -77,13 +107,26 @@ def run(argv=sys.argv[1:]):
    else:
        alleles = predictor.supported_alleles

+    distribution = None
+    if args.match_amino_acid_distribution_data:
+        distribution_peptides = pandas.read_csv(
+            args.match_amino_acid_distribution_data).peptide.unique()
+        distribution = amino_acid_distribution(distribution_peptides)
+        print("Using amino acid distribution:")
+        print(distribution)
+
    start = time.time()

-    print("Performing percent rank calibration for %d alleles: encoding peptides" % (
+    print("Percent rank calibration for %d alleles. Encoding peptides." % (
        len(alleles)))
-    encoded_peptides = predictor.calibrate_percentile_ranks(
-        alleles=[],  # don't actually do any calibration, just return peptides
-        num_peptides_per_length=args.num_peptides_per_length)
+
+    peptides = []
+    lengths = range(args.length_range[0], args.length_range[1] + 1)
+    for length in lengths:
+        peptides.extend(
+            random_peptides(
+                args.num_peptides_per_length, length, distribution=distribution))
+    encoded_peptides = EncodableSequences.create(peptides)

    # Now we encode the peptides for each neural network, so the encoding
    # becomes cached.
@@ -100,6 +143,12 @@ def run(argv=sys.argv[1:]):

    worker_pool = worker_pool_with_gpu_assignments_from_args(args)

+    constant_kwargs = {
+        'motif_summary': args.motif_summary,
+        'summary_top_peptide_fraction': args.summary_top_peptide_fraction,
+        'verbose': args.verbosity > 0,
+    }
+
    if worker_pool is None:
        # Serial run
        print("Running in serial.")
@@ -107,20 +156,34 @@ def run(argv=sys.argv[1:]):
            calibrate_percentile_ranks(
                allele=allele,
                predictor=predictor,
-                peptides=encoded_peptides)
+                peptides=encoded_peptides,
+                **constant_kwargs,
+            )
            for allele in alleles)
    else:
        # Parallel run
        results = worker_pool.imap_unordered(
            partial(
                partial(call_wrapped, calibrate_percentile_ranks),
-                predictor=predictor),
+                predictor=predictor,
+                **constant_kwargs),
            alleles,
            chunksize=1)

-    for result in tqdm.tqdm(results, total=len(alleles)):
-        predictor.allele_to_percent_rank_transform.update(result)
-    print("Done calibrating %d additional alleles." % len(alleles))
+    summary_results_lists = collections.defaultdict(list)
+    for (transforms, summary_results) in tqdm.tqdm(results, total=len(alleles)):
+        predictor.allele_to_percent_rank_transform.update(transforms)
+        if summary_results is not None:
+            for (item, value) in summary_results.items():
+                summary_results_lists[item].append(value)
+    print("Done calibrating %d alleles." % len(alleles))
+    if summary_results_lists:
+        for (name, lst) in summary_results_lists.items():
+            df = pandas.concat(lst, ignore_index=True)
+            predictor.metadata_dataframes[name] = df
+            print("Including summary result: %s" % name)
+            print(df)
+
    predictor.save(args.models_dir, model_names_to_write=[])

    percent_rank_calibration_time = time.time() - start
@@ -134,19 +197,29 @@ def run(argv=sys.argv[1:]):
    print("Predictor written to: %s" % args.models_dir)


-def calibrate_percentile_ranks(allele, predictor, peptides=None):
+def calibrate_percentile_ranks(
+        allele,
+        predictor,
+        peptides=None,
+        motif_summary=False,
+        summary_top_peptide_fraction=0.001,
+        verbose=False):
    """
    Private helper function.
    """
    global GLOBAL_DATA
    if peptides is None:
        peptides = GLOBAL_DATA["calibration_peptides"]
-    predictor.calibrate_percentile_ranks(
+    summary_results = predictor.calibrate_percentile_ranks(
        peptides=peptides,
-        alleles=[allele])
-    return {
+        alleles=[allele],
+        motif_summary=motif_summary,
+        summary_top_peptide_fraction=summary_top_peptide_fraction,
+        verbose=verbose)
+    transforms = {
        allele: predictor.allele_to_percent_rank_transform[allele],
    }
+    return (transforms, summary_results)


 if __name__ == '__main__':

--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -17,7 +17,7 @@ from numpy.testing import assert_equal
 from six import string_types

 from .class1_neural_network import Class1NeuralNetwork
-from .common import random_peptides
+from .common import random_peptides, positional_frequency_matrix
 from .downloads import get_default_class1_models_dir
 from .encodable_sequences import EncodableSequences
 from .percent_rank_transform import PercentRankTransform
@@ -95,7 +95,8 @@ class Class1AffinityPredictor(object):
        if not allele_to_percent_rank_transform:
            allele_to_percent_rank_transform = {}
        self.allele_to_percent_rank_transform = allele_to_percent_rank_transform
-        self.metadata_dataframes = metadata_dataframes
+        self.metadata_dataframes = (
+            dict(metadata_dataframes) if metadata_dataframes else {})
        self._cache = {}

        assert isinstance( self.allele_to_allele_specific_models, dict)
@@ -1153,7 +1154,10 @@ class Class1AffinityPredictor(object):
            peptides=None,
            num_peptides_per_length=int(1e5),
            alleles=None,
-            bins=None):
+            bins=None,
+            motif_summary=False,
+            summary_top_peptide_fraction=0.001,
+            verbose=False):
        """
        Compute the cumulative distribution of ic50 values for a set of alleles
        over a large universe of random peptides, to enable computing quantiles in
@@ -1196,13 +1200,75 @@ class Class1AffinityPredictor(object):

        encoded_peptides = EncodableSequences.create(peptides)

+        if motif_summary:
+            frequency_matrices = []
+            length_distributions = []
+        else:
+            frequency_matrices = None
+            length_distributions = None
        for (i, allele) in enumerate(alleles):
+            start = time.time()
            predictions = self.predict(encoded_peptides, allele=allele)
+            if verbose:
+                elapsed = time.time() - start
+                print(
+                    "Generated %d predictions for allele %s in %0.2f sec: "
+                    "%0.2f predictions / sec" % (
+                        len(encoded_peptides.sequences),
+                        allele,
+                        elapsed,
+                        len(encoded_peptides.sequences) / elapsed))
            transform = PercentRankTransform()
            transform.fit(predictions, bins=bins)
            self.allele_to_percent_rank_transform[allele] = transform

-        return encoded_peptides
+            if frequency_matrices is not None:
+                predictions_df = pandas.DataFrame({
+                    'peptide': encoded_peptides.sequences,
+                    'prediction': predictions
+                }).drop_duplicates('peptide').set_index("peptide")
+                predictions_df["length"] = predictions_df.index.str.len()
+                for (length, sub_df) in predictions_df.groupby("length"):
+                    selected = sub_df.prediction.nsmallest(
+                        max(
+                            int(len(sub_df) * summary_top_peptide_fraction),
+                            1)).index.values
+                    matrix = positional_frequency_matrix(selected).reset_index()
+                    original_columns = list(matrix.columns)
+                    matrix["length"] = length
+                    matrix["allele"] = allele
+                    matrix = matrix[["allele", "length"] + original_columns]
+                    frequency_matrices.append(matrix)
+
+                # Length distribution
+                length_distribution = predictions_df.prediction.nsmallest(
+                    max(
+                        int(len(predictions_df) * summary_top_peptide_fraction),
+                        1)).index.str.len().value_counts()
+                length_distribution.index.name = "length"
+                length_distribution /= length_distribution.sum()
+                length_distribution = length_distribution.to_frame()
+                length_distribution.columns = ["fraction"]
+                length_distribution = length_distribution.reset_index()
+                length_distribution["allele"] = allele
+                length_distribution = length_distribution[
+                    ["allele", "length", "fraction"]
+                ].sort_values("length")
+                length_distributions.append(length_distribution)
+
+        if frequency_matrices is not None:
+            frequency_matrices = pandas.concat(
+                frequency_matrices, ignore_index=True)
+
+        if length_distributions is not None:
+            length_distributions = pandas.concat(
+                length_distributions, ignore_index=True)
+
+        if motif_summary:
+            return {
+                'frequency_matrices': frequency_matrices,
+                'length_distributions': length_distributions,
+            }

    def filter_networks(self, predicate):
        """

--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -147,3 +147,17 @@ def random_peptides(num, length=9, distribution=None):
            p=distribution.values,
            size=(int(num), int(length)))
    ]
+
+
+def positional_frequency_matrix(peptides):
+    length = len(peptides[0])
+    assert all(len(peptide) == length for peptide in peptides)
+    counts = pandas.DataFrame(
+        index=[a for a in amino_acid.BLOSUM62_MATRIX.index if a != 'X'],
+        columns=numpy.arange(1, length + 1),
+    )
+    for i in range(length):
+        counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts()
+    result = (counts / len(peptides)).fillna(0.0).T
+    result.index.name = 'position'
+    return result
\ No newline at end of file
--- a/mhcflurry/local_parallelism.py
+++ b/mhcflurry/local_parallelism.py
@@ -18,7 +18,7 @@ def add_local_parallelism_args(parser):

    group.add_argument(
        "--num-jobs",
-        default=1,
+        default=0,
        type=int,
        metavar="N",
        help="Number of processes to parallelize training over. Experimental. "