From 0f9394e2571bec4da61efd16f6b8bd71583cc8b6 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 4 Dec 2017 13:39:32 -0500
Subject: [PATCH] Address code review comments

---
 README.md                                           |  1 +
 .../cross_validation_class1/GENERATE.sh             |  7 ++++++-
 .../cross_validation_class1/score.py                |  2 ++
 .../cross_validation_class1/split_folds.py          | 12 ++++++++++--
 downloads-generation/data_curated/GENERATE.sh       |  6 +++++-
 downloads-generation/data_iedb/GENERATE.sh          |  4 +++-
 downloads-generation/data_kim2014/GENERATE.sh       |  5 ++++-
 downloads-generation/models_class1/GENERATE.sh      |  6 +++++-
 .../models_class1_experiments1/GENERATE.sh          |  6 +++++-
 mhcflurry/amino_acid.py                             |  6 +++---
 .../class1_affinity_predictor.py                    | 13 +++++++++++++
 mhcflurry/encodable_sequences.py                    |  2 +-
 12 files changed, 58 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index a4373e27..e47650dd 100644
--- a/README.md
+++ b/README.md
@@ -138,6 +138,7 @@ mhctools \
     --mhc-predictor mhcflurry \
     --input-fasta-file INPUT.fasta \
     --mhc-alleles A02:01,A03:01 \
+    --mhc-peptide-lengths 8,9,10,11 \
     --extract-subsequences \
     --out RESULT.csv
 ```
diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh
index 658460f6..c35e0862 100755
--- a/downloads-generation/cross_validation_class1/GENERATE.sh
+++ b/downloads-generation/cross_validation_class1/GENERATE.sh
@@ -1,5 +1,10 @@
 #!/bin/bash
-
+#
+# Cross validation using the standard class I models.
+# Splits training data into 5 folds (stratifying on allele), trains and tests a
+# predictor on each (train, test) fold, and writes a summary CSV giving
+# performance for each allele on each fold.
+#
 set -e
 set -x
 
diff --git a/downloads-generation/cross_validation_class1/score.py b/downloads-generation/cross_validation_class1/score.py
index dcf366de..7af791c4 100644
--- a/downloads-generation/cross_validation_class1/score.py
+++ b/downloads-generation/cross_validation_class1/score.py
@@ -75,6 +75,8 @@ def run(argv):
             scores['kind'] = "single" if "single" in prediction_col else "ensemble"
             scores['train_size'] = allele_df[prediction_col].isnull().sum()
             scores['test_size'] = len(sub_df)
+
+            # make_scores returns a dict with entries "auc", "f1", "tau"
             scores.update(
                 make_scores(
                     sub_df.measurement_value, sub_df[prediction_col]))
diff --git a/downloads-generation/cross_validation_class1/split_folds.py b/downloads-generation/cross_validation_class1/split_folds.py
index 3f95337f..dd49085f 100644
--- a/downloads-generation/cross_validation_class1/split_folds.py
+++ b/downloads-generation/cross_validation_class1/split_folds.py
@@ -6,6 +6,7 @@ import sys
 from os.path import abspath
 
 import pandas
+import numpy
 from sklearn.model_selection import StratifiedKFold
 
 parser = argparse.ArgumentParser(usage = __doc__)
@@ -68,18 +69,25 @@ def run(argv):
     else:
         alleles = list(
             allele_counts.ix[
-                           allele_counts > args.min_measurements_per_allele
+                allele_counts > args.min_measurements_per_allele
         ].index)
 
-    df = df.ix[df.allele.isin(alleles)]
+    df = df.loc[df.allele.isin(alleles)].copy()
     print("Potentially subselected by allele to: %s" % str(df.shape))
 
     print("Data has %d alleles: %s" % (
         df.allele.nunique(), " ".join(df.allele.unique())))
 
+    print(df.head())
+
+    # Take log before taking median (in case of even number of samples).
+    df["measurement_value"] = numpy.log1p(df.measurement_value)
     df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index()
+    df["measurement_value"] = numpy.expm1(df.measurement_value)
     print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape))
 
+    print(df.head())
+
     if args.subsample:
         df = df.head(args.subsample)
         print("Subsampled to: %s" % str(df.shape))
diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh
index 89982e6a..f0a3d54a 100755
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-
+#
+# Create "curated" training data, which combines an IEDB download with additional
+# published data, removes unusable entries, normalizes allele name, and performs
+# other filtering and standardization.
+#
 set -e
 set -x
 
diff --git a/downloads-generation/data_iedb/GENERATE.sh b/downloads-generation/data_iedb/GENERATE.sh
index 55156647..a6067a36 100755
--- a/downloads-generation/data_iedb/GENERATE.sh
+++ b/downloads-generation/data_iedb/GENERATE.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-
+#
+# Download latest MHC I ligand data from IEDB.
+#
 set -e
 set -x
 
diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_kim2014/GENERATE.sh
index baf8a3a4..dbda0fe8 100755
--- a/downloads-generation/data_kim2014/GENERATE.sh
+++ b/downloads-generation/data_kim2014/GENERATE.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
-
+#
+# Download some published MHC I ligand data from a location on Dropbox.
+#
+#
 set -e
 set -x
 
diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh
index 2e509d23..b72334b5 100755
--- a/downloads-generation/models_class1/GENERATE.sh
+++ b/downloads-generation/models_class1/GENERATE.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-
+#
+# Train standard MHCflurry Class I models.
+# Calls mhcflurry-class1-train-allele-specific-models on curated training data
+# using the hyperparameters in "hyperparameters.yaml".
+#
 set -e
 set -x
 
diff --git a/downloads-generation/models_class1_experiments1/GENERATE.sh b/downloads-generation/models_class1_experiments1/GENERATE.sh
index 6edd4348..89921d92 100755
--- a/downloads-generation/models_class1_experiments1/GENERATE.sh
+++ b/downloads-generation/models_class1_experiments1/GENERATE.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-
+#
+# Train "experimental" models using various hyperparameter combinations.
+# This trains models only for a small number of alleles for which we have good
+# mass-spec validation data.
+#
 set -e
 set -x
 
diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py
index 78b3a854..ffa5cffc 100644
--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -80,7 +80,7 @@ X  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
 """), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
 assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
 
-ENCODING_DFS = {
+ENCODING_DATA_FRAMES = {
     "BLOSUM62": BLOSUM62_MATRIX,
     "one-hot": pandas.DataFrame([
         [1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
@@ -98,7 +98,7 @@ def available_vector_encodings():
     list of string
 
     """
-    return list(ENCODING_DFS)
+    return list(ENCODING_DATA_FRAMES)
 
 
 def vector_encoding_length(name):
@@ -113,7 +113,7 @@ def vector_encoding_length(name):
     -------
     int
     """
-    return ENCODING_DFS[name].shape[1]
+    return ENCODING_DATA_FRAMES[name].shape[1]
 
 
 def index_encoding(sequences, letter_to_index_dict):
diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
index ada2e9e5..af94cfa0 100644
--- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
@@ -760,6 +760,19 @@ class Class1AffinityPredictor(object):
             'peptide': peptides.sequences,
             'allele': alleles,
         })
+        if len(df) == 0:
+            # No predictions.
+            logging.warning("Predicting for 0 peptides.")
+            empty_result = pandas.DataFrame(
+                columns=[
+                    'peptide',
+                    'allele',
+                    'prediction',
+                    'prediction_low',
+                    'prediction_high'
+                ])
+            return empty_result
+
         df["normalized_allele"] = df.allele.map(
             mhcnames.normalize_allele_name)
 
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index b98fb862..8477938b 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -135,7 +135,7 @@ class EncodableSequences(object):
                     max_length=max_length))
             result = amino_acid.fixed_vectors_encoding(
                 fixed_length_sequences,
-                amino_acid.ENCODING_DFS[vector_encoding_name])
+                amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
             assert result.shape[0] == len(self.sequences)
             self.encoding_cache[cache_key] = result
         return self.encoding_cache[cache_key]
-- 
GitLab