From 99e74eeb107664e7c784c72399f9fb305da35614 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 15 Nov 2017 18:47:30 -0500
Subject: [PATCH] Support percentile ranks

* First cut of supporting quantiles ("percent ranks") per allele. Closes #87.
* Some refactoring of amino_acid.py

This builds on #114
---
 mhcflurry/amino_acid.py                       | 107 ++++++++++++
 .../class1_affinity_predictor.py              | 153 ++++++++++++++++--
 .../class1_neural_network.py                  |   6 +-
 .../train_allele_specific_models_command.py   |  17 ++
 mhcflurry/encodable_sequences.py              | 110 +------------
 mhcflurry/percent_rank_transform.py           |  77 +++++++++
 ...odable_sequences.py => test_amino_acid.py} |   6 +-
 test/test_percent_rank_transform.py           |  24 +++
 ...st_train_allele_specific_models_command.py |  47 +++---
 9 files changed, 403 insertions(+), 144 deletions(-)
 create mode 100644 mhcflurry/percent_rank_transform.py
 rename test/{test_encodable_sequences.py => test_amino_acid.py} (84%)
 create mode 100644 test/test_percent_rank_transform.py

diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py
index e5e59cf1..eba9de5f 100644
--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -20,6 +20,11 @@ from __future__ import (
 import collections
 from copy import copy
 
+import pandas
+import numpy
+from six import StringIO
+
+
 COMMON_AMINO_ACIDS = collections.OrderedDict(sorted({
     "A": "Alanine",
     "R": "Arginine",
@@ -47,3 +52,105 @@ COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown"
 
 AMINO_ACID_INDEX = dict(
     (letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN))
+
+AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
+
+BLOSUM62_MATRIX = pandas.read_table(StringIO("""
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  X
+A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0  0
+R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3  0
+N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  0
+D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  0
+C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1  0
+Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0
+E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  0
+G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3  0
+H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0
+I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3  0
+L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1  0
+K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0
+M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1  0
+F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1  0
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2  0
+S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0  0 
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3  0
+Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1  0
+V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4  0
+X  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
+"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
+assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
+
+ENCODING_DFS = {
+    "BLOSUM62": BLOSUM62_MATRIX,
+    "one-hot": pandas.DataFrame([
+        [1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
+        for j in range(len(AMINO_ACIDS))
+    ], index=AMINO_ACIDS, columns=AMINO_ACIDS)
+}
+
+
+def available_vector_encodings():
+    """
+    Return list of supported amino acid vector encodings.
+
+    Returns
+    -------
+    list of string
+
+    """
+    return list(ENCODING_DFS)
+
+
+def vector_encoding_length(name):
+    """
+    Return the length of the given vector encoding.
+
+    Parameters
+    ----------
+    name : string
+
+    Returns
+    -------
+    int
+    """
+    return ENCODING_DFS[name].shape[1]
+
+
+def index_encoding(sequences, letter_to_index_dict):
+    """
+    Given a sequence of n strings all of length k, return a k * n array where
+    the (i, j)th element is letter_to_index_dict[sequence[i][j]].
+
+    Parameters
+    ----------
+    sequences : list of length n of strings of length k
+    letter_to_index_dict : dict : string -> int
+
+    Returns
+    -------
+    numpy.array of integers with shape (k, n)
+    """
+    df = pandas.DataFrame(iter(s) for s in sequences)
+    result = df.replace(letter_to_index_dict)
+    return result.values
+
+
+def fixed_vectors_encoding(sequences, letter_to_vector_function):
+    """
+    Given a sequence of n strings all of length k, return a n * k * m array where
+    the (i, j)th element is letter_to_vector_function(sequence[i][j]).
+
+    Parameters
+    ----------
+    sequences : list of length n of strings of length k
+    letter_to_vector_function : function : string -> vector of length m
+
+    Returns
+    -------
+    numpy.array of integers with shape (n, k, m)
+    """
+    arr = numpy.array([list(s) for s in sequences])
+    result = numpy.vectorize(
+        letter_to_vector_function, signature='()->(n)')(arr)
+    return result
\ No newline at end of file
diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
index bfba10cb..21a92e72 100644
--- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
@@ -5,14 +5,19 @@ import json
 from os.path import join, exists
 from six import string_types
 import logging
+import warnings
 
 import numpy
 import pandas
+from numpy.testing import assert_equal
 
 import mhcnames
 
 from ..encodable_sequences import EncodableSequences
 from ..downloads import get_path
+from ..common import random_peptides
+from ..percent_rank_transform import PercentRankTransform
+from ..regression_target import to_ic50
 
 from .class1_neural_network import Class1NeuralNetwork
 
@@ -32,7 +37,8 @@ class Class1AffinityPredictor(object):
             allele_to_allele_specific_models=None,
             class1_pan_allele_models=None,
             allele_to_pseudosequence=None,
-            manifest_df=None):
+            manifest_df=None,
+            allele_to_percent_rank_transform=None):
         """
         Parameters
         ----------
@@ -50,6 +56,9 @@ class Class1AffinityPredictor(object):
             Only required if you want to update an existing serialization of a
             Class1AffinityPredictor. Otherwise this dataframe will be generated
             automatically based on the supplied models.
+
+        allele_to_percent_rank_transform : dict of string -> PercentRankTransform, optional
+            PercentRankTransform instances to use for each allele
         """
 
         if allele_to_allele_specific_models is None:
@@ -86,6 +95,11 @@ class Class1AffinityPredictor(object):
                 columns=["model_name", "allele", "config_json", "model"])
         self.manifest_df = manifest_df
 
+        if allele_to_percent_rank_transform is None:
+            allele_to_percent_rank_transform = {}
+        self.allele_to_percent_rank_transform = allele_to_percent_rank_transform
+
+
     @property
     def supported_alleles(self):
         """
@@ -165,6 +179,21 @@ class Class1AffinityPredictor(object):
         write_manifest_df.to_csv(manifest_path, index=False)
         logging.info("Wrote: %s" % manifest_path)
 
+        if self.allele_to_percent_rank_transform:
+            percent_ranks_df = None
+            for (allele, transform) in self.allele_to_percent_rank_transform.items():
+                series = transform.to_series()
+                if percent_ranks_df is None:
+                    percent_ranks_df = pandas.DataFrame(index=series.index)
+                assert_equal(series.index.values, percent_ranks_df.index.values)
+                percent_ranks_df[allele] = series
+            percent_ranks_path = join(models_dir, "percent_ranks.csv")
+            percent_ranks_df.to_csv(
+                percent_ranks_path,
+                index=True,
+                index_label="bin")
+            logging.info("Wrote: %s" % percent_ranks_path)
+
     @staticmethod
     def load(models_dir=None, max_models=None):
         """
@@ -211,11 +240,20 @@ class Class1AffinityPredictor(object):
                 join(models_dir, "pseudosequences.csv"),
                 index_col="allele").to_dict()
 
+        allele_to_percent_rank_transform = {}
+        percent_ranks_path = join(models_dir, "percent_ranks.csv")
+        if exists(percent_ranks_path):
+            percent_ranks_df = pandas.read_csv(percent_ranks_path, index_col=0)
+            for allele in percent_ranks_df.columns:
+                allele_to_percent_rank_transform[allele] = (
+                    PercentRankTransform.from_series(percent_ranks_df[allele]))
+
         logging.info(
-            "Loaded %d class1 pan allele predictors, %d pseudosequences, and "
-            "%d allele specific models: %s" % (
+            "Loaded %d class1 pan allele predictors, %d pseudosequences, "
+            "%d percent rank distributions, and %d allele specific models: %s" % (
                 len(class1_pan_allele_models),
                 len(pseudosequences) if pseudosequences else 0,
+                len(allele_to_percent_rank_transform),
                 sum(len(v) for v in allele_to_allele_specific_models.values()),
                 ", ".join(
                     "%s (%d)" % (allele, len(v))
@@ -226,7 +264,9 @@ class Class1AffinityPredictor(object):
             allele_to_allele_specific_models=allele_to_allele_specific_models,
             class1_pan_allele_models=class1_pan_allele_models,
             allele_to_pseudosequence=pseudosequences,
-            manifest_df=manifest_df)
+            manifest_df=manifest_df,
+            allele_to_percent_rank_transform=allele_to_percent_rank_transform,
+        )
         return result
 
     @staticmethod
@@ -441,6 +481,88 @@ class Class1AffinityPredictor(object):
                 verbose=verbose)
             yield model
 
+    def calibrate_percentile_ranks(
+            self,
+            peptides=None,
+            num_peptides_per_length=int(1e6),
+            alleles=None,
+            bins=None):
+        """
+        Compute the cumulative distribution of ic50 values for a set of alleles
+        over a large universe of random peptides, to enable computing quantiles in
+        this distribution later.
+
+        Parameters
+        ----------
+        peptides : sequence of string, optional
+            Peptides to use
+        num_peptides_per_length : int, optional
+            If peptides argument is not specified, then num_peptides_per_length
+            peptides are randomly sampled from a uniform distribution for each
+            supported length
+        alleles : sequence of string, optional
+            Alleles to perform calibration for. If not specified all supported
+            alleles will be calibrated.
+        """
+        if bins is None:
+            bins = to_ic50(numpy.linspace(1, 0, 1000))
+
+        if alleles is None:
+            alleles = self.supported_alleles
+
+        if peptides is None:
+            peptides = []
+            lengths = range(
+                self.supported_peptide_lengths[0],
+                self.supported_peptide_lengths[1] + 1)
+            for length in lengths:
+                peptides.extend(
+                    random_peptides(num_peptides_per_length, length))
+
+        for allele in alleles:
+            predictions = self.predict(peptides, allele=allele)
+            transform = PercentRankTransform()
+            transform.fit(predictions, bins=bins)
+            self.allele_to_percent_rank_transform[allele] = transform
+
+    def percentile_ranks(self, affinities, allele=None, alleles=None):
+        """
+        Return percentile ranks for the given ic50 affinities and alleles.
+
+        The 'allele' and 'alleles' argument are as in the predict() method.
+        Specify one of these.
+
+        Parameters
+        ----------
+        affinities : sequence of float
+            nM affinities
+        allele : string
+        alleles : sequence of string
+
+        Returns
+        -------
+        numpy.array of float
+        """
+        if allele is not None:
+            try:
+                transform = self.allele_to_percent_rank_transform[allele]
+                return transform.transform(affinities)
+            except KeyError:
+                raise ValueError(
+                    "Allele %s has no percentile rank information" % allele)
+
+        if alleles is None:
+            raise ValueError("Specify allele or alleles")
+
+        df = pandas.DataFrame({"affinity": affinities})
+        df["allele"] = alleles
+        df["result"] = numpy.nan
+        for (allele, sub_df) in df.groupby("allele"):
+            df.loc[sub_df.index, "result"] = self.percentile_ranks\
+                (sub_df.affinity, allele=allele)
+        assert not df.result.isnull().any()
+        return df.result.values
+
     def predict(self, peptides, alleles=None, allele=None, throw=True):
         """
         Predict nM binding affinities.
@@ -472,6 +594,7 @@ class Class1AffinityPredictor(object):
             alleles=alleles,
             allele=allele,
             throw=throw,
+            include_percentile_ranks=False,
         )
         return df.prediction.values
 
@@ -481,7 +604,8 @@ class Class1AffinityPredictor(object):
             alleles=None,
             allele=None,
             throw=True,
-            include_individual_model_predictions=False):
+            include_individual_model_predictions=False,
+            include_percentile_ranks=True):
         """
         Predict nM binding affinities. Gives more detailed output than `predict`
         method, including 5-95% prediction intervals.
@@ -499,13 +623,17 @@ class Class1AffinityPredictor(object):
         peptides : EncodableSequences or list of string
         alleles : list of string
         allele : string
-        include_individual_model_predictions : boolean
-            If True, the predictions of each individual model are incldued as
-            columns in the result dataframe.
         throw : boolean
             If True, a ValueError will be raised in the case of unsupported
             alleles or peptide lengths. If False, a warning will be logged and
             the predictions for the unsupported alleles or peptides will be NaN.
+        include_individual_model_predictions : boolean
+            If True, the predictions of each individual model are included as
+            columns in the result dataframe.
+        include_percentile_ranks : boolean, default True
+            If True, a "prediction_percentile" column will be included giving the
+            percentile ranks. If no percentile rank information is available,
+            this will be ignored with a warning.
 
         Returns
         -------
@@ -619,8 +747,15 @@ class Class1AffinityPredictor(object):
             columns = [
                 c for c in df.columns if c not in df_predictions.columns
             ]
-        return df[columns]
 
+        result = df[columns].copy()
+        if include_percentile_ranks:
+            if self.allele_to_percent_rank_transform:
+                result["prediction_percentile"] = self.percentile_ranks(
+                    df.prediction, alleles=df.allele.values)
+            else:
+                warnings.warn("No percentile rank information available.")
+        return result
 
     @staticmethod
     def save_weights(weights_list, filename):
diff --git a/mhcflurry/class1_affinity_prediction/class1_neural_network.py b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
index 13a7f55c..d685669c 100644
--- a/mhcflurry/class1_affinity_prediction/class1_neural_network.py
+++ b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
@@ -16,10 +16,8 @@ from keras.layers.normalization import BatchNormalization
 
 from mhcflurry.hyperparameters import HyperparameterDefaults
 
-from ..encodable_sequences import (
-    EncodableSequences,
-    available_vector_encodings,
-    vector_encoding_length)
+from ..encodable_sequences import EncodableSequences
+from ..amino_acid import available_vector_encodings, vector_encoding_length
 from ..regression_target import to_ic50, from_ic50
 from ..common import random_peptides, amino_acid_distribution
 
diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
index 1128ca99..aa12add4 100644
--- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
+++ b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
@@ -6,6 +6,7 @@ import os
 import sys
 import argparse
 import yaml
+import time
 
 import pandas
 
@@ -49,6 +50,13 @@ parser.add_argument(
     action="store_true",
     default=False,
     help="Use only quantitative training data")
+parser.add_argument(
+    "--percent-rank-calibration-num-peptides-per-length",
+    type=int,
+    default=int(1e5),
+    help="Number of peptides per length to use to calibrate percent ranks. "
+    "Set to 0 to disable percent rank calibration. The resulting models will "
+    "not support percent ranks")
 parser.add_argument(
     "--verbosity",
     type=int,
@@ -126,6 +134,15 @@ def run(argv=sys.argv[1:]):
                     affinities=train_data.measurement_value.values,
                     models_dir_for_save=args.out_models_dir)
 
+    if args.percent_rank_calibration_num_peptides_per_length > 0:
+        start = time.time()
+        print("Performing percent rank calibration.")
+        predictor.calibrate_percentile_ranks(
+            num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length)
+        print("Finished calibrating percent ranks in %0.2f sec." % (
+            time.time() - start))
+        predictor.save(args.out_models_dir, model_names_to_write=[])
+
 
 if __name__ == '__main__':
     run()
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index 4abc94a1..67553d0f 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -20,116 +20,12 @@ from __future__ import (
 
 import math
 
-import pandas
 import numpy
-from six import StringIO
 
 import typechecks
 
 from . import amino_acid
 
-AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
-
-BLOSUM62_MATRIX = pandas.read_table(StringIO("""
-   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  X
-A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0  0
-R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3  0
-N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  0
-D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  0
-C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1  0
-Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0
-E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  0
-G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3  0
-H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0
-I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3  0
-L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1  0
-K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0
-M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1  0
-F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1  0
-P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2  0
-S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0
-T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0  0 
-W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3  0
-Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1  0
-V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4  0
-X  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
-"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
-assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
-
-ENCODING_DFS = {
-    "BLOSUM62": BLOSUM62_MATRIX,
-    "one-hot": pandas.DataFrame([
-        [1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
-        for j in range(len(AMINO_ACIDS))
-    ], index=AMINO_ACIDS, columns=AMINO_ACIDS)
-}
-
-
-def available_vector_encodings():
-    """
-    Return list of supported amino acid vector encodings.
-
-    Returns
-    -------
-    list of string
-
-    """
-    return list(ENCODING_DFS)
-
-
-def vector_encoding_length(name):
-    """
-    Return the length of the given vector encoding.
-
-    Parameters
-    ----------
-    name : string
-
-    Returns
-    -------
-    int
-    """
-    return ENCODING_DFS[name].shape[1]
-
-
-def index_encoding(sequences, letter_to_index_dict):
-    """
-    Given a sequence of n strings all of length k, return a k * n array where
-    the (i, j)th element is letter_to_index_dict[sequence[i][j]].
-    
-    Parameters
-    ----------
-    sequences : list of length n of strings of length k
-    letter_to_index_dict : dict : string -> int
-
-    Returns
-    -------
-    numpy.array of integers with shape (k, n)
-    """
-    df = pandas.DataFrame(iter(s) for s in sequences)
-    result = df.replace(letter_to_index_dict)
-    return result.values
-
-
-def fixed_vectors_encoding(sequences, letter_to_vector_function):
-    """
-    Given a sequence of n strings all of length k, return a n * k * m array where
-    the (i, j)th element is letter_to_vector_function(sequence[i][j]).
-
-    Parameters
-    ----------
-    sequences : list of length n of strings of length k
-    letter_to_vector_function : function : string -> vector of length m
-
-    Returns
-    -------
-    numpy.array of integers with shape (n, k, m)
-    """
-    arr = numpy.array([list(s) for s in sequences])
-    result = numpy.vectorize(
-        letter_to_vector_function, signature='()->(n)')(arr)
-    return result
-
 
 class EncodableSequences(object):
     """
@@ -198,7 +94,7 @@ class EncodableSequences(object):
                     max_length=max_length)
                 for sequence in self.sequences
             ]
-            self.encoding_cache[cache_key] = index_encoding(
+            self.encoding_cache[cache_key] = amino_acid.index_encoding(
                 fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
         return self.encoding_cache[cache_key]
 
@@ -242,9 +138,9 @@ class EncodableSequences(object):
                     max_length=max_length)
                 for sequence in self.sequences
             ]
-            result = fixed_vectors_encoding(
+            result = amino_acid.fixed_vectors_encoding(
                 fixed_length_sequences,
-                ENCODING_DFS[vector_encoding_name].loc.__getitem__)
+                amino_acid.ENCODING_DFS[vector_encoding_name].loc.__getitem__)
             assert result.shape[0] == len(self.sequences)
             self.encoding_cache[cache_key] = result
         return self.encoding_cache[cache_key]
diff --git a/mhcflurry/percent_rank_transform.py b/mhcflurry/percent_rank_transform.py
new file mode 100644
index 00000000..80c32409
--- /dev/null
+++ b/mhcflurry/percent_rank_transform.py
@@ -0,0 +1,77 @@
+import numpy
+import pandas
+
+class PercentRankTransform(object):
+    """
+    Transform arbitrary values into percent ranks.
+    """
+
+    def __init__(self):
+        self.cdf = None
+        self.bin_edges = None
+
+    def fit(self, values, bins):
+        """
+        Fit the transform using the given values, which are used to
+        establish percentiles.
+        """
+        assert self.cdf is None
+        assert self.bin_edges is None
+        assert len(values) > 0
+        (hist, self.bin_edges) = numpy.histogram(values, bins=bins)
+        self.cdf = numpy.ones(len(hist) + 3) * numpy.nan
+        self.cdf[0] = 0.0
+        self.cdf[1] = 0.0
+        self.cdf[-1] = 100.0
+        numpy.cumsum(hist * 100.0 / numpy.sum(hist), out=self.cdf[2:-1])
+        assert not numpy.isnan(self.cdf).any()
+
+    def transform(self, values):
+        """
+        Return percent ranks (range [0, 100]) for the given values.
+        """
+        assert self.cdf is not None
+        assert self.bin_edges is not None
+        indices = numpy.searchsorted(self.bin_edges, values)
+        result = self.cdf[indices]
+        assert len(result) == len(values)
+        return result
+
+    def to_series(self):
+        """
+        Serialize the fit to a pandas.Series.
+
+        The index on the series gives the bin edges and the valeus give the CDF.
+
+        Returns
+        -------
+        pandas.Series
+
+        """
+        return pandas.Series(
+            self.cdf, index=[numpy.nan] + list(self.bin_edges) + [numpy.nan])
+
+    @staticmethod
+    def from_series(series):
+        """
+        Deseralize a PercentRankTransform the given pandas.Series, as returned
+        by `to_series()`.
+
+        Parameters
+        ----------
+        series : pandas.Series
+
+        Returns
+        -------
+        PercentRankTransform
+
+        """
+        result = PercentRankTransform()
+        result.cdf = series.values
+        result.bin_edges = series.index.values[1:-1]
+        return result
+
+
+
+
+
diff --git a/test/test_encodable_sequences.py b/test/test_amino_acid.py
similarity index 84%
rename from test/test_encodable_sequences.py
rename to test/test_amino_acid.py
index cc47290a..26c50b7b 100644
--- a/test/test_encodable_sequences.py
+++ b/test/test_amino_acid.py
@@ -1,4 +1,4 @@
-from mhcflurry import encodable_sequences
+from mhcflurry import amino_acid
 from nose.tools import eq_
 from numpy.testing import assert_equal
 import numpy
@@ -11,7 +11,7 @@ letter_to_index_dict = {
 
 
 def test_index_and_one_hot_encoding():
-    index_encoding = encodable_sequences.index_encoding(
+    index_encoding = amino_acid.index_encoding(
         ["AAAA", "ABCA"], letter_to_index_dict)
     assert_equal(
         index_encoding,
@@ -19,7 +19,7 @@ def test_index_and_one_hot_encoding():
             [0, 0, 0, 0],
             [0, 1, 2, 0],
         ])
-    one_hot = encodable_sequences.fixed_vectors_encoding(
+    one_hot = amino_acid.fixed_vectors_encoding(
         index_encoding,
         {
             0: numpy.array([1, 0, 0]),
diff --git a/test/test_percent_rank_transform.py b/test/test_percent_rank_transform.py
new file mode 100644
index 00000000..e30aa7bb
--- /dev/null
+++ b/test/test_percent_rank_transform.py
@@ -0,0 +1,24 @@
+import numpy
+
+from mhcflurry.percent_rank_transform import PercentRankTransform
+
+from numpy.testing import assert_allclose, assert_equal
+
+
+def test_percent_rank_transform():
+    model = PercentRankTransform()
+    model.fit(numpy.arange(1000), bins=100)
+    assert_allclose(
+        model.transform([-2, 0, 50, 100, 2000]),
+        [0.0, 0.0, 5.0, 10.0, 100.0],
+        err_msg=str(model.__dict__))
+
+    model2 = PercentRankTransform.from_series(model.to_series())
+    assert_allclose(
+        model2.transform([-2, 0, 50, 100, 2000]),
+        [0.0, 0.0, 5.0, 10.0, 100.0],
+        err_msg=str(model.__dict__))
+
+    assert_equal(model.cdf, model2.cdf)
+    assert_equal(model.bin_edges, model2.bin_edges)
+
diff --git a/test/test_train_allele_specific_models_command.py b/test/test_train_allele_specific_models_command.py
index 5c3fbcfc..f08963a1 100644
--- a/test/test_train_allele_specific_models_command.py
+++ b/test/test_train_allele_specific_models_command.py
@@ -21,6 +21,7 @@ HYPERPARAMETERS = [
         "random_negative_rate": 0.0,
         "random_negative_constant": 25,
 
+        "peptide_amino_acid_encoding": "BLOSUM62",
         "use_embedding": False,
         "kmer_size": 15,
         "batch_normalization": False,
@@ -50,29 +51,33 @@ HYPERPARAMETERS = [
 
 
 def test_run():
-    try:
-        models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
-        hyperparameters_filename = os.path.join(
-            models_dir, "hyperparameters.yaml")
-        with open(hyperparameters_filename, "w") as fd:
-            json.dump(HYPERPARAMETERS, fd)
+    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
+    hyperparameters_filename = os.path.join(
+        models_dir, "hyperparameters.yaml")
+    with open(hyperparameters_filename, "w") as fd:
+        json.dump(HYPERPARAMETERS, fd)
 
-        args = [
-            "--data", get_path("data_curated", "curated_training_data.csv.bz2"),
-            "--hyperparameters", hyperparameters_filename,
-            "--min-measurements-per-allele", "9000",
-            "--out-models-dir", models_dir,
-        ]
-        print("Running with args: %s" % args)
-        train_allele_specific_models_command.run(args)
+    args = [
+        "--data", get_path("data_curated", "curated_training_data.csv.bz2"),
+        "--hyperparameters", hyperparameters_filename,
+        "--min-measurements-per-allele", "9000",
+        "--out-models-dir", models_dir,
+        "--percent-rank-calibration-num-peptides-per-length", "1000",
+    ]
+    print("Running with args: %s" % args)
+    train_allele_specific_models_command.run(args)
 
-        result = Class1AffinityPredictor.load(models_dir)
-        predictions = result.predict(
+    result = Class1AffinityPredictor.load(models_dir)
+    predictions = result.predict(
+        peptides=["SLYNTVATL"],
+        alleles=["HLA-A*02:01"])
+    assert_equal(predictions.shape, (1,))
+    assert_array_less(predictions, 500)
+    df = result.predict_to_dataframe(
             peptides=["SLYNTVATL"],
             alleles=["HLA-A*02:01"])
-        assert_equal(predictions.shape, (1,))
-        assert_array_less(predictions, 500)
+    print(df)
+    assert "prediction_percentile" in df.columns
 
-    finally:
-        print("Deleting: %s" % models_dir)
-        shutil.rmtree(models_dir)
+    print("Deleting: %s" % models_dir)
+    shutil.rmtree(models_dir)
-- 
GitLab