From 3ec5dbe1b46d3e854bbc0a63d52af6f1da290208 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 23 May 2017 19:03:56 -0400
Subject: [PATCH] updates

---
 downloads-generation/models_class1/README.md  |  28 +---
 .../class1_affinity_predictor.py              |   9 +-
 .../train_allele_specific_models_command.py   |  17 ++-
 mhcflurry/common.py                           | 123 ++----------------
 mhcflurry/encodable_sequences.py              |  11 +-
 mhcflurry/predict_command.py                  |  27 +---
 mhcflurry/scoring.py                          |   4 +-
 test/test_predict_command.py                  |   6 +-
 8 files changed, 54 insertions(+), 171 deletions(-)

diff --git a/downloads-generation/models_class1/README.md b/downloads-generation/models_class1/README.md
index 604852a6..ce784b40 100644
--- a/downloads-generation/models_class1/README.md
+++ b/downloads-generation/models_class1/README.md
@@ -1,29 +1,9 @@
 # Class I allele-specific models (ensemble)
 
-This download contains trained MHC Class I allele-specific MHCflurry models. For each allele, an ensemble of predictors is trained on random halves of the training data. Model architectures are selected based on performance on the other half of the dataset, so in general each ensemble contains predictors of different architectures. At prediction time the geometric mean IC50 is taken over the trained models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download.
+This download contains trained MHC Class I MHCflurry models.
 
-The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library.
+To generate this download, run:
 
-To use kubeface, you should make a google storage bucket and pass it below with the --storage-prefix argument. 
-
-To generate this download we run:
-
-```
-./GENERATE.sh \
-    --parallel-backend kubeface \
-    --target-tasks 200 \
-    --kubeface-backend kubernetes \
-    --kubeface-storage gs://kubeface-tim \
-    --kubeface-worker-image hammerlab/mhcflurry-misc:latest \
-    --kubeface-kubernetes-task-resources-memory-mb 10000 \
-    --kubeface-worker-path-prefix venv-py3/bin \
-    --kubeface-max-simultaneous-tasks 200 \
-    --kubeface-speculation-max-reruns 3 \
-```
-
-To debug locally:
-```
-./GENERATE.sh \
-    --parallel-backend local-threads \
-    --target-tasks 1
 ```
+./GENERATE.sh
+```
\ No newline at end of file
diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
index 63c846d2..fdc4bc2d 100644
--- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
@@ -47,7 +47,8 @@ class Class1AffinityPredictor(object):
         manifest_df : pandas.DataFrame, optional
             Must have columns: model_name, allele, config_json, model.
             Only required if you want to update an existing serialization of a
-            Class1AffinityPredictor.
+            Class1AffinityPredictor. Otherwise, this dataframe is generated
+            automatically from the supplied models.
         """
 
         if allele_to_allele_specific_models is None:
@@ -91,6 +92,12 @@ class Class1AffinityPredictor(object):
         """
         Serialize the predictor to a directory on disk.
         
+        The serialization format consists of a file called "manifest.csv" with
+        the configuration of each Class1NeuralNetwork, along with per-network
+        files giving the model weights. If there are pan-allele predictors in
+        the ensemble, the allele pseudosequences are also stored in the
+        directory.
+        
         Parameters
         ----------
         models_dir : string
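
As a usage note on the serialization format documented above, a save/load round
trip might look like the following sketch. `Class1AffinityPredictor.load` appears
later in this patch (in predict_command.py); the name `save` for the serialization
method is an assumption, since the method's name is not visible in this hunk.

```
# Hedged sketch of a serialization round trip. "load" is taken from
# predict_command.py in this same patch; "save" is assumed to be the
# method whose docstring is edited above.
from mhcflurry.class1_affinity_prediction import Class1AffinityPredictor

predictor = Class1AffinityPredictor.load("/path/to/models")
predictor.save("/tmp/models_copy")  # writes manifest.csv plus per-network weight files
reloaded = Class1AffinityPredictor.load("/tmp/models_copy")
```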
diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
index fd62529f..212e8918 100644
--- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
+++ b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
@@ -1,5 +1,5 @@
 """
-Train single allele models
+Train Class1 single-allele models.
 
 """
 import sys
@@ -8,7 +8,6 @@ import json
 
 import pandas
 
-
 from .class1_affinity_predictor import Class1AffinityPredictor
 from ..common import configure_logging
 
@@ -17,14 +16,19 @@ parser = argparse.ArgumentParser(usage=__doc__)
 
 parser.add_argument(
     "--data",
+    metavar="FILE.csv",
     required=True,
-    help="Training data")
+    help=(
+        "Training data CSV. Expected columns: "
+        "allele, peptide, measurement_value"))
 parser.add_argument(
     "--out-models-dir",
+    metavar="DIR",
     required=True,
     help="Directory to write models and manifest")
 parser.add_argument(
     "--hyperparameters",
+    metavar="FILE.json",
     required=True,
     help="JSON of hyperparameters")
 parser.add_argument(
@@ -39,15 +43,10 @@ parser.add_argument(
     metavar="N",
     default=50,
     help="Train models for alleles with >=N measurements.")
-parser.add_argument(
-    "--only-quantitative",
-    action="store_true",
-    help="Exclude qualitative measurements",
-    default=False)
 parser.add_argument(
     "--verbosity",
     type=int,
-    help="Default: %(default)s",
+    help="Keras verbosity. Default: %(default)s",
     default=1)
 
 
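To illustrate the `--data` format described by the new help text, a minimal
training CSV could be built like this. The column names come from the help
string above; the alleles, peptides, and values are made-up examples, and
treating `measurement_value` as an IC50 in nM is an assumption.

```
# Build a toy training CSV with the columns named in the --data help
# text. All rows here are invented for illustration only.
import pandas

train_df = pandas.DataFrame({
    "allele": ["HLA-A0201", "HLA-A0201", "H-2-KB"],
    "peptide": ["ALDKATVLL", "LLDVTAAVL", "SIINFEKL"],
    "measurement_value": [25.0, 400.0, 120.0],
})
train_df.to_csv("train_data.csv", index=False)
```
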
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index 42aced01..c34f1969 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 from __future__ import print_function, division, absolute_import
-from math import exp, log
 import itertools
 import collections
 import logging
@@ -28,73 +27,6 @@ import pandas
 from . import amino_acid
 
 
-class UnsupportedAllele(Exception):
-    pass
-
-
-def parse_int_list(s):
-    return [int(part.strip()) for part in s.split(",")]
-
-
-def split_uppercase_sequences(s):
-    return [part.strip().upper() for part in s.split(",")]
-
-
-MHC_PREFIXES = [
-    "HLA",
-    "H-2",
-    "Mamu",
-    "Patr",
-    "Gogo",
-    "ELA",
-]
-
-
-def normalize_allele_name(allele_name, default_prefix="HLA"):
-    """
-    Only works for a small number of species.
-
-    TODO: use the same logic as mhctools for MHC name parsing.
-    Possibly even worth its own small repo called something like "mhcnames"
-    """
-    allele_name = allele_name.upper()
-    # old school HLA-C serotypes look like "Cw"
-    allele_name = allele_name.replace("CW", "C")
-
-    prefix = default_prefix
-    for candidate in MHC_PREFIXES:
-        if (allele_name.startswith(candidate.upper()) or
-                allele_name.startswith(candidate.replace("-", "").upper())):
-            prefix = candidate
-            allele_name = allele_name[len(prefix):]
-            break
-    for pattern in MHC_PREFIXES + ["-", "*", ":"]:
-        allele_name = allele_name.replace(pattern, "")
-    return "%s%s" % (prefix + "-" if prefix else "", allele_name)
-
-
-def split_allele_names(s):
-    return [
-        normalize_allele_name(part.strip())
-        for part
-        in s.split(",")
-    ]
-
-
-def geometric_mean(xs, weights=None):
-    """
-    Geometric mean of a collection of affinity measurements, with optional
-    sample weights.
-    """
-    if len(xs) == 1:
-        return xs[0]
-    elif weights is None:
-        return exp(sum(log(xi) for xi in xs) / len(xs))
-    sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
-    denom = sum(weights)
-    return exp(sum_weighted_log / denom)
-
-
 def all_combinations(**dict_of_lists):
     """
     Iterator that generates all combinations of parameters given in the
@@ -107,36 +39,6 @@ def all_combinations(**dict_of_lists):
         yield dict(zip(arg_names, combination_of_values))
 
 
-def groupby_indices(iterable, key_fn=lambda x: x):
-    """
-    Returns dictionary mapping unique values to list of indices that
-    had those values.
-    """
-    index_groups = collections.defaultdict(list)
-    for i, x in enumerate(key_fn(x) for x in iterable):
-        index_groups[x].append(i)
-    return index_groups
-
-
-def shuffle_split_list(indices, fraction=0.5):
-    """
-    Split a list of indices into two sub-lists, with an optional parameter
-    controlling what fraction of the indices go to the left list.
-    """
-    indices = np.asarray(indices)
-    np.random.shuffle(indices)
-    n = len(indices)
-
-    left_count = int(np.ceil(fraction * n))
-
-    if n > 1 and left_count == 0:
-        left_count = 1
-    elif n > 1 and left_count == n:
-        left_count = n - 1
-
-    return indices[:left_count], indices[left_count:]
-
-
 def dataframe_cryptographic_hash(df):
     """
     Return a cryptographic (i.e. collisions extremely unlikely) hash
@@ -244,18 +146,21 @@ def drop_nulls_and_warn(df, related_df_with_same_index_to_describe=None):
     return new_df
 
 
-def from_ic50(ic50):
-    x = 1.0 - (numpy.log(ic50) / numpy.log(50000))
-    return numpy.minimum(
-        1.0,
-        numpy.maximum(0.0, x))
-
-
-def to_ic50(x):
-    return 50000.0 ** (1.0 - x)
-
-
 def amino_acid_distribution(peptides, smoothing=0.0):
+    """
+    Compute the fraction of each amino acid across a collection of peptides.
+    
+    Parameters
+    ----------
+    peptides : list of string
+    smoothing : float, optional
+        Small number (e.g. 0.01) to add to all amino acid fractions. The
+        higher the number, the more uniform the distribution.
+
+    Returns
+    -------
+    pandas.Series indexed by amino acids
+    """
     peptides = pandas.Series(peptides)
     aa_counts = pandas.Series(peptides.map(collections.Counter).sum())
     normalized = aa_counts / aa_counts.sum()
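
For concreteness, here is a standalone sketch of the computation this docstring
describes. The counting lines mirror the code in this hunk; the smoothing step
falls outside the visible context, so its exact form here is an assumption.

```
# Standalone sketch of amino_acid_distribution. The first three lines
# of the body mirror the hunk above; the smoothing/renormalization
# step is assumed.
import collections
import pandas

def amino_acid_distribution_sketch(peptides, smoothing=0.0):
    peptides = pandas.Series(peptides)
    aa_counts = pandas.Series(peptides.map(collections.Counter).sum())
    normalized = aa_counts / aa_counts.sum()
    smoothed = normalized + smoothing
    return smoothed / smoothed.sum()

print(amino_acid_distribution_sketch(["SIINFEKL"]))
# I accounts for 2 of 8 residues, so its fraction is 0.25.
```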
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index 86e38a1a..4ffaeb0f 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -252,9 +252,14 @@ class EncodableSequences(object):
         string of length max_length
 
         """
-        assert len(klass.unknown_character) == 1
-        assert len(sequence) >= left_edge + right_edge, sequence
-        assert len(sequence) <= max_length, sequence
+        if len(sequence) < left_edge + right_edge:
+            raise ValueError(
+                "Sequence '%s' (length %d) unsupported: length must be at "
+                "least %d" % (sequence, len(sequence), left_edge + right_edge))
+        if len(sequence) > max_length:
+            raise ValueError(
+                "Sequence '%s' (length %d) unsupported: length must be at "
+                "most %d" % (sequence, len(sequence), max_length))
 
         middle_length = max_length - left_edge - right_edge
 
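To make the two length checks concrete, here is a small sketch of the
fixed-length transformation they guard. Only the validation logic is taken from
this diff; the default edge sizes, the pad character, and placing the padding
immediately after the middle residues are all assumptions, since the rest of
the function is not shown here.

```
# Sketch of the fixed-length transform guarded by the checks above.
# Defaults and pad placement are assumptions; only the two ValueError
# checks mirror the diff.
def to_fixed_length(sequence, left_edge=4, right_edge=4, max_length=15,
                    unknown_character="X"):
    if len(sequence) < left_edge + right_edge:
        raise ValueError(
            "Sequence '%s' (length %d) unsupported: length must be at "
            "least %d" % (sequence, len(sequence), left_edge + right_edge))
    if len(sequence) > max_length:
        raise ValueError(
            "Sequence '%s' (length %d) unsupported: length must be at "
            "most %d" % (sequence, len(sequence), max_length))
    middle = sequence[left_edge:len(sequence) - right_edge]
    padding = unknown_character * (max_length - len(sequence))
    return sequence[:left_edge] + middle + padding + sequence[-right_edge:]

print(to_fixed_length("SIINFEKL"))  # SIINXXXXXXXFEKL (length 15)
```
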
diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py
index 58c37c0c..a4603ad5 100644
--- a/mhcflurry/predict_command.py
+++ b/mhcflurry/predict_command.py
@@ -41,10 +41,10 @@ from __future__ import (
 )
 import sys
 import argparse
-import logging
-import pandas
 import itertools
 
+import pandas
+
 from .downloads import get_path
 from .class1_affinity_prediction import Class1AffinityPredictor
 
@@ -99,7 +99,7 @@ parser.add_argument(
     metavar="DIR",
     default=None,
     help="Directory containing models. "
-    "Default: %s" % get_path("models_class1", test_exists=False))
+    "Default: %s" % get_path("models_class1", "models", test_exists=False))
 
 
 def run(argv=sys.argv[1:]):
@@ -148,24 +148,11 @@ def run(argv=sys.argv[1:]):
         # The reason we set the default here instead of in the argument parser is that
         # we want to test_exists at this point, so the user gets a message instructing
         # them to download the models if needed.
-        models_dir = get_path("models_class1")
+        models_dir = get_path("models_class1", "models")
     predictor = Class1AffinityPredictor.load(models_dir)
-
-    predictions = {}  # allele -> peptide -> value
-    for (allele, sub_df) in df.groupby(args.allele_column):
-        logging.info("Running %d predictions for allele %s" % (
-            len(sub_df), allele))
-        peptides = sub_df[args.peptide_column].values
-        predictions[allele] = dict(
-            (peptide, prediction)
-            for (peptide, prediction)
-            in zip(peptides, predictor.predict_for_allele(allele, peptides)))
-
-    logging.info("Collecting result")
-    df[args.prediction_column] = [
-        predictions[row[args.allele_column]][row[args.peptide_column]]
-        for (_, row) in df.iterrows()
-    ]
+    df[args.prediction_column] = predictor.predict(
+        peptides=df[args.peptide_column].values,
+        alleles=df[args.allele_column].values)
 
     if args.out:
         df.to_csv(args.out, index=False)
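
The refactored command now delegates to a single vectorized `predict` call. The
equivalent direct API usage looks roughly like this: the imports and the
peptides/alleles keyword arguments are taken from this diff, while interpreting
the output as IC50 affinities in nM follows the project's usual convention.

```
# Direct use of the vectorized call introduced above. One allele is
# supplied per peptide, mirroring how the command passes the CSV columns.
from mhcflurry.class1_affinity_prediction import Class1AffinityPredictor
from mhcflurry.downloads import get_path

predictor = Class1AffinityPredictor.load(
    get_path("models_class1", "models"))
predictions = predictor.predict(
    peptides=["SIINFEKL", "DENDREKLLL"],
    alleles=["H-2-Kb", "HLA-A0201"])
print(predictions)  # one predicted affinity per peptide
```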
diff --git a/mhcflurry/scoring.py b/mhcflurry/scoring.py
index ad904689..fc2a9263 100644
--- a/mhcflurry/scoring.py
+++ b/mhcflurry/scoring.py
@@ -8,7 +8,7 @@ import sklearn
 import numpy
 import scipy
 
-from mhcflurry.regression_target import ic50_to_regression_target
+from .regression_target import from_ic50
 
 
 def make_scores(
@@ -39,7 +39,7 @@ def make_scores(
     dict with entries "auc", "f1", "tau"
     """
 
-    y_pred = ic50_to_regression_target(ic50_y_pred, max_ic50)
+    y_pred = from_ic50(ic50_y_pred, max_ic50)
     try:
         auc = sklearn.metrics.roc_auc_score(
             ic50_y <= threshold_nm,
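
For context on the renamed import: the transform removed from common.py in this
patch mapped IC50 values onto [0, 1] with a hard-coded 50000 nM ceiling. A
sketch of the two-argument form called here, generalizing that removed code,
might be as follows; the actual `regression_target.from_ic50` body is not shown
in this diff, so this is an assumption.

```
# Generalization of the from_ic50 removed from common.py above; the
# real regression_target.from_ic50 is assumed to match its
# two-argument call site in make_scores.
import numpy

def from_ic50(ic50, max_ic50=50000.0):
    x = 1.0 - numpy.log(ic50) / numpy.log(max_ic50)
    return numpy.minimum(1.0, numpy.maximum(0.0, x))

print(from_ic50(numpy.array([1.0, 500.0, 50000.0])))
# [1.0, ~0.43, 0.0] -- stronger binders (lower IC50) map closer to 1.
```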
diff --git a/test/test_predict_command.py b/test/test_predict_command.py
index 7cb64d27..7229b814 100644
--- a/test/test_predict_command.py
+++ b/test/test_predict_command.py
@@ -37,8 +37,8 @@ def test_csv():
 
 def test_no_csv():
     args = [
-        "--alleles", "HLA-A0201", "H-2Kb",
-        "--peptides", "SIINFEKL", "DENDREKLLL", "PICKLE",
+        "--alleles", "HLA-A0201", "H-2-Kb",
+        "--peptides", "SIINFEKL", "DENDREKLLL", "PICKLEEE",
         "--prediction-column", "prediction",
     ]
 
@@ -58,5 +58,5 @@ def test_no_csv():
     assert_equal(result.shape, (6, 3))
     sub_result1 = result.ix[result.peptide == "SIINFEKL"].set_index("allele")
     assert (
-        sub_result1.ix["H-2Kb"].prediction <
+        sub_result1.ix["H-2-Kb"].prediction <
         sub_result1.ix["HLA-A0201"].prediction)
-- 
GitLab