From 55e3cfc12f49b5526a6abdbe2b38e9d9abe495f8 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 25 Nov 2017 17:58:25 -0500
Subject: [PATCH] Add cross-validation GENERATE.sh and related scripts

---
 .../cross_validation_class1/GENERATE.sh       |  83 +++++++++++++
 .../cross_validation_class1/README.md         |   7 ++
 .../hyperparameters.yaml                      |   1 +
 .../cross_validation_class1/score.py          | 101 ++++++++++++++++
 .../cross_validation_class1/split_folds.py    | 113 ++++++++++++++++++
 .../train_allele_specific_models_command.py   |  24 +++-
 mhcflurry/scoring.py                          |   2 +-
 7 files changed, 329 insertions(+), 2 deletions(-)
 create mode 100755 downloads-generation/cross_validation_class1/GENERATE.sh
 create mode 100644 downloads-generation/cross_validation_class1/README.md
 create mode 120000 downloads-generation/cross_validation_class1/hyperparameters.yaml
 create mode 100644 downloads-generation/cross_validation_class1/score.py
 create mode 100644 downloads-generation/cross_validation_class1/split_folds.py

diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh
new file mode 100755
index 00000000..ebbe0b8e
--- /dev/null
+++ b/downloads-generation/cross_validation_class1/GENERATE.sh
@@ -0,0 +1,83 @@
#!/bin/bash
#
# Generate the cross_validation_class1 download: split the curated training
# data into NFOLDS train/test splits, train allele-specific models on each
# training split in parallel, predict each fold's held-out test split, score
# the predictions, and package everything (plus logs) into a tar.bz2 archive.

set -e
set -x

DOWNLOAD_NAME=cross_validation_class1
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")

NFOLDS=5

mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Log some environment info
date
pip freeze
git status

# Quote all path expansions: TMPDIR (and therefore SCRATCH_DIR) may contain
# spaces.
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"

cp "$SCRIPT_DIR/hyperparameters.yaml" .
cp "$SCRIPT_DIR/split_folds.py" .
cp "$SCRIPT_DIR/score.py" .

time python split_folds.py \
    "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
    --min-measurements-per-allele 100 \
    --folds "$NFOLDS" \
    --random-state 1 \
    --output-pattern-test "./test.fold_{}.csv" \
    --output-pattern-train "./train.fold_{}.csv"

# Kill child processes if parent exits:
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT

# Train each fold's models concurrently; each fold logs to its own file.
# Use shell arithmetic instead of the archaic external `expr`.
for fold in $(seq 0 $((NFOLDS - 1)))
do
    mhcflurry-class1-train-allele-specific-models \
        --data "train.fold_${fold}.csv" \
        --hyperparameters hyperparameters.yaml \
        --out-models-dir "models.fold_${fold}" \
        --min-measurements-per-allele 0 \
        --percent-rank-calibration-num-peptides-per-length 0 \
    2>&1 | tee -a "LOG.train.fold_${fold}.txt" &
done
wait

echo "DONE TRAINING. NOW PREDICTING."

# Predict each fold's held-out test set with that fold's models, in parallel.
for fold in $(seq 0 $((NFOLDS - 1)))
do
    mhcflurry-predict \
        "test.fold_${fold}.csv" \
        --models "models.fold_${fold}" \
        --no-throw \
        --include-individual-model-predictions \
        --out "predictions.fold_${fold}.csv" &
done
wait

time python score.py \
    predictions.fold_*.csv \
    --out-combined predictions.combined.csv \
    --out-scores scores.csv \
    --out-summary summary.all.csv

# Ensemble-only summary: drop the "single" (first-model) rows.
grep -v single summary.all.csv > summary.ensemble.csv

cp "$SCRIPT_ABSOLUTE_PATH" .
# Iterate the glob directly rather than parsing `ls` output, which breaks on
# unusual filenames (ShellCheck SC2045).
for i in *.txt
do
    bzip2 "$i"
done
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *

echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
diff --git a/downloads-generation/cross_validation_class1/README.md b/downloads-generation/cross_validation_class1/README.md
new file mode 100644
index 00000000..f7584e32
--- /dev/null
+++ b/downloads-generation/cross_validation_class1/README.md
@@ -0,0 +1,7 @@
+# Cross validation of standard Class I models
+
+This download contains cross validation results and intermediate data for
+class I allele-specific MHCflurry models.
+
+This exists to track the exact steps used to generate cross-validation results.
+Users will probably not interact with this directly.
\ No newline at end of file
diff --git a/downloads-generation/cross_validation_class1/hyperparameters.yaml b/downloads-generation/cross_validation_class1/hyperparameters.yaml
new file mode 120000
index 00000000..f32feef1
--- /dev/null
+++ b/downloads-generation/cross_validation_class1/hyperparameters.yaml
@@ -0,0 +1 @@
+../models_class1/hyperparameters.yaml
\ No newline at end of file
diff --git a/downloads-generation/cross_validation_class1/score.py b/downloads-generation/cross_validation_class1/score.py
new file mode 100644
index 00000000..dcf366de
--- /dev/null
+++ b/downloads-generation/cross_validation_class1/score.py
@@ -0,0 +1,101 @@
"""
Scoring script for cross-validation.

Merges per-fold prediction CSVs (as written by mhcflurry-predict) on
(allele, peptide) and computes per-allele scores for the ensemble
prediction and, when present, the first single-model prediction.
"""
import argparse
import sys
import collections

import pandas
from mhcflurry.scoring import make_scores


parser = argparse.ArgumentParser(usage = __doc__)

# One prediction CSV per cross-validation fold.
parser.add_argument(
    "input", metavar="INPUT.csv", help="Input CSV", nargs="+")

# Optional output: per-allele, per-fold scores.
parser.add_argument(
    "--out-scores",
    metavar="RESULT.csv")

# Optional output: all folds' predictions merged into a single table.
parser.add_argument(
    "--out-combined",
    metavar="COMBINED.csv")

# Optional output: scores averaged over folds per (allele, kind).
parser.add_argument(
    "--out-summary",
    metavar="RESULT.csv")
+
def run(argv):
    """Merge the per-fold prediction CSVs and write the requested
    combined / per-fold-score / summary CSV outputs."""
    args = parser.parse_args(argv)

    df = None
    for (fold_num, filename) in enumerate(args.input):
        fold_df = pandas.read_csv(filename)
        assert not fold_df.mhcflurry_prediction.isnull().any()

        # Give this fold's predictions fold-specific column names so they
        # survive the merge across folds.
        ensemble_col = "prediction_%d" % fold_num
        fold_df[ensemble_col] = fold_df.mhcflurry_prediction
        cols_to_merge = [ensemble_col]
        if 'mhcflurry_model_single_0' in fold_df.columns:
            single_col = "prediction_single_%d" % fold_num
            fold_df[single_col] = fold_df.mhcflurry_model_single_0
            cols_to_merge.append(single_col)

        if df is None:
            df = fold_df[
                ["allele", "peptide", "measurement_value"] + cols_to_merge
            ].copy()
        else:
            # Outer merge: a fold's prediction columns are null for rows
            # that came from the other folds' files.
            df = pandas.merge(
                df,
                fold_df[['allele', 'peptide'] + cols_to_merge],
                on=['allele', 'peptide'],
                how='outer')

    print("Loaded data:")
    print(df.head(5))

    if args.out_combined:
        df.to_csv(args.out_combined, index=False)
        print("Wrote: %s" % args.out_combined)

    prediction_cols = [
        col for col in df.columns if col.startswith("prediction_")
    ]

    scores_rows = []
    for (allele, allele_df) in df.groupby("allele"):
        for col in prediction_cols:
            held_out = allele_df.loc[~allele_df[col].isnull()]
            row = collections.OrderedDict()
            row['allele'] = allele
            row['fold'] = col.replace("prediction_", "").replace("single_", "")
            row['kind'] = "single" if "single" in col else "ensemble"
            # Rows with a null prediction in this column were not in this
            # fold's file (i.e. they were available for training).
            row['train_size'] = allele_df[col].isnull().sum()
            row['test_size'] = len(held_out)
            row.update(
                make_scores(held_out.measurement_value, held_out[col]))
            scores_rows.append(row)
    scores_df = pandas.DataFrame(scores_rows)
    print(scores_df)

    if args.out_scores:
        scores_df.to_csv(args.out_scores, index=False)
        print("Wrote: %s" % args.out_scores)

    # Average metrics over folds for each (allele, kind).
    summary_df = scores_df.groupby(["allele", "kind"])[
        ["train_size", "test_size", "auc", "f1", "tau"]
    ].mean().reset_index()
    print("Summary:")
    print(summary_df)

    if args.out_summary:
        summary_df.to_csv(args.out_summary, index=False)
        print("Wrote: %s" % args.out_summary)


if __name__ == '__main__':
    run(sys.argv[1:])
+
diff --git a/downloads-generation/cross_validation_class1/split_folds.py b/downloads-generation/cross_validation_class1/split_folds.py
new file mode 100644
index 00000000..3f95337f
--- /dev/null
+++ b/downloads-generation/cross_validation_class1/split_folds.py
@@ -0,0 +1,113 @@
"""
Split training data into CV folds.

Writes one train CSV and one test CSV per fold, stratified by allele and by
binder (< 500 nM) vs. non-binder status.
"""
import argparse
import sys
from os.path import abspath

import pandas
from sklearn.model_selection import StratifiedKFold

parser = argparse.ArgumentParser(usage = __doc__)

parser.add_argument(
    "input", metavar="INPUT.csv", help="Input CSV")

# Number of cross-validation folds to generate.
parser.add_argument(
    "--folds", metavar="N", type=int, default=5)

parser.add_argument(
    "--allele",
    nargs="+",
    help="Include only the specified allele(s)")

# Ignored when --allele is given: an explicit allele list takes precedence.
parser.add_argument(
    "--min-measurements-per-allele",
    type=int,
    metavar="N",
    help="Use only alleles with >=N measurements.")

parser.add_argument(
    "--subsample",
    type=int,
    metavar="N",
    help="Subsample to first N rows")

parser.add_argument(
    "--random-state",
    metavar="N",
    type=int,
    help="Specify an int for deterministic splitting")

# "{}" in the patterns is replaced with the fold number via str.format.
parser.add_argument(
    "--output-pattern-train",
    default="./train.fold_{}.csv",
    help="Pattern to use to generate output filename. Default: %(default)s")

parser.add_argument(
    "--output-pattern-test",
    default="./test.fold_{}.csv",
    help="Pattern to use to generate output filename. Default: %(default)s")
+
+
def run(argv):
    """Split curated training data into stratified cross-validation folds.

    Reads the input CSV, keeps only 8-15mer peptides, optionally restricts
    alleles (an explicit list, or a minimum measurement count), collapses
    duplicate (allele, peptide) pairs to their median measurement, and
    writes one train/test CSV pair per fold. Splits are stratified on
    (allele, binder-vs-nonbinder) so each fold has similar composition.
    """
    args = parser.parse_args(argv)

    df = pandas.read_csv(args.input)
    print("Loaded data with shape: %s" % str(df.shape))

    # Use .loc, not the long-deprecated .ix indexer (removed in pandas 1.0).
    df = df.loc[
        (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
    ]
    print("Subselected to 8-15mers: %s" % (str(df.shape)))

    allele_counts = df.allele.value_counts()

    if args.allele:
        alleles = args.allele
    else:
        # ">=" matches the documented semantics of
        # --min-measurements-per-allele ("alleles with >=N measurements";
        # the original strict ">" excluded alleles with exactly N). When the
        # option is omitted, keep all alleles instead of crashing on a
        # comparison against None.
        min_measurements = args.min_measurements_per_allele or 0
        alleles = list(
            allele_counts.loc[
                allele_counts >= min_measurements
            ].index)

    df = df.loc[df.allele.isin(alleles)]
    print("Potentially subselected by allele to: %s" % str(df.shape))

    print("Data has %d alleles: %s" % (
        df.allele.nunique(), " ".join(df.allele.unique())))

    df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index()
    print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape))

    if args.subsample:
        df = df.head(args.subsample)
        print("Subsampled to: %s" % str(df.shape))

    kf = StratifiedKFold(
        n_splits=args.folds,
        shuffle=True,
        random_state=args.random_state)

    # Stratify by both allele and binder (affinity < 500 nM) vs. nonbinder.
    # Vectorized equivalent of a per-row iterrows() loop; much faster.
    df["key"] = (
        df.allele
        + "_"
        + (df.measurement_value < 500).map(
            {True: "binder", False: "nonbinder"})
    )

    for i, (train, test) in enumerate(kf.split(df, df.key)):
        train_filename = args.output_pattern_train.format(i)
        test_filename = args.output_pattern_test.format(i)

        df.iloc[train].to_csv(train_filename, index=False)
        print("Wrote: %s" % abspath(train_filename))

        df.iloc[test].to_csv(test_filename, index=False)
        print("Wrote: %s" % abspath(test_filename))


if __name__ == '__main__':
    run(sys.argv[1:])
+
diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
index bdc90dd8..1936e6d7 100644
--- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
+++ b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
@@ -59,6 +59,19 @@ parser.add_argument(
     help="Number of peptides per length to use to calibrate percent ranks. "
     "Set to 0 to disable percent rank calibration. The resulting models will "
     "not support percent ranks")
+parser.add_argument(
+    "--n-models",
+    type=int,
+    metavar="N",
+    help="Ensemble size, i.e. how many models to train for each architecture. "
+    "If specified here it overrides any 'n_models' specified in the "
+    "hyperparameters.")
+parser.add_argument(
+    "--max-epochs",
+    type=int,
+    metavar="N",
+    help="Max training epochs. If specified here it overrides any 'max_epochs' "
+    "specified in the hyperparameters.")
 parser.add_argument(
     "--verbosity",
     type=int,
@@ -114,7 +127,16 @@ def run(argv=sys.argv[1:]):
         print("Done.")
 
     for (h, hyperparameters) in enumerate(hyperparameters_lst):
-        n_models = hyperparameters.pop("n_models")
+        n_models = None
+        if 'n_models' in hyperparameters:
+            n_models = hyperparameters.pop("n_models")
+        if args.n_models:
+            n_models = args.n_models
+        if not n_models:
+            raise ValueError("Specify --ensemble-size or n_models hyperparameter")
+
+        if args.max_epochs:
+            hyperparameters['max_epochs'] = args.max_epochs
 
         for model_group in range(n_models):
             for (i, allele) in enumerate(alleles):
diff --git a/mhcflurry/scoring.py b/mhcflurry/scoring.py
index fc2a9263..f6a256ba 100644
--- a/mhcflurry/scoring.py
+++ b/mhcflurry/scoring.py
@@ -4,7 +4,7 @@ from __future__ import (
     absolute_import,
 )
 import logging
-import sklearn
+import sklearn.metrics
 import numpy
 import scipy
 
-- 
GitLab