From b0b025967371af43fd67c175ec7502285f57b61c Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Wed, 21 Feb 2018 17:00:28 -0500 Subject: [PATCH] Add models_class1_selected_no_mass_spec download --- .../models_class1/GENERATE.sh | 22 ++++ .../models_class1/write_validation_data.py | 105 ++++++++++++++++++ .../GENERATE.sh | 77 +++++++++++++ .../write_validation_data.py | 1 + mhcflurry/class1_affinity_predictor.py | 2 - .../select_allele_specific_models_command.py | 64 ++++++++--- 6 files changed, 252 insertions(+), 19 deletions(-) create mode 100644 downloads-generation/models_class1/write_validation_data.py create mode 100755 downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh create mode 120000 downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh index 642c5d96..b8b002c5 100755 --- a/downloads-generation/models_class1/GENERATE.sh +++ b/downloads-generation/models_class1/GENERATE.sh @@ -33,9 +33,30 @@ echo "Detected GPUS: $GPUS" PROCESSORS=$(getconf _NPROCESSORS_ONLN) echo "Detected processors: $PROCESSORS" +python ./write_validation_data.py \ + --include "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ + --exclude "$(mhcflurry-downloads path models_class1_unselected)/models/train_data.csv.bz2" \ + --only-alleles-present-in-exclude \ + --out-data test.csv \ + --out-summary test.summary.csv + +wc -l test.csv + +mhcflurry-predict \ + test.csv \ + --prediction-column-prefix "mhcflurry_unselected_" \ + --models "$(mhcflurry-downloads path models_class1_unselected)/models" \ + --out test.csv + +wc -l test.csv + + time mhcflurry-class1-select-allele-specific-models \ + --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ + --exclude-data "$(mhcflurry-downloads path 
models_class1_unselected)/models/train_data.csv.bz2" \ --models-dir "$(mhcflurry-downloads path models_class1_unselected)/models" \ --out-models-dir models \ + --out-unselected-predictions unselected_predictions.csv \ --scoring mass-spec consensus \ --consensus-num-peptides-per-length 10000 \ --min-models 8 \ @@ -48,6 +69,7 @@ time mhcflurry-calibrate-percentile-ranks \ cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt +bzip2 unselected_predictions.csv tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/models_class1/write_validation_data.py b/downloads-generation/models_class1/write_validation_data.py new file mode 100644 index 00000000..28ad4a38 --- /dev/null +++ b/downloads-generation/models_class1/write_validation_data.py @@ -0,0 +1,105 @@ +""" +Write and summarize model validation data, which is obtained by taking a full +dataset and removing the data used for training. + +""" +import argparse +import sys +from os.path import abspath + +import pandas +import numpy +from sklearn.model_selection import StratifiedKFold + +parser = argparse.ArgumentParser(usage = __doc__) + +parser.add_argument( + "--include", + metavar="INPUT.csv", + nargs="+", + help="Input CSV to include") +parser.add_argument( + "--exclude", + metavar="INPUT.csv", + nargs="+", + help="Input CSV to exclude") +parser.add_argument( + "--out-data", + metavar="RESULT.csv", + help="Output data CSV") +parser.add_argument( + "--out-summary", + metavar="RESULT.csv", + help="Output summary CSV") +parser.add_argument( + "--mass-spec-regex", + metavar="REGEX", + default="mass[- ]spec", + help="Regular expression for mass-spec data. Runs on measurement_source col. " + "Default: %(default)s.") +parser.add_argument( + "--only-alleles-present-in-exclude", + action="store_true", + default=False, + help="Filter to only alleles that are present in files given by --exclude. 
" + "Useful for filtering to only alleles supported by a predictor, where the " + "training data for the predictor is given by --exclude.") + + +def run(argv): + args = parser.parse_args(argv) + + dfs = [] + for input in args.include: + df = pandas.read_csv(input) + dfs.append(df) + df = pandas.concat(dfs, ignore_index=True) + print("Loaded data with shape: %s" % str(df.shape)) + del dfs + + df = df.ix[ + (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) + ] + print("Subselected to 8-15mers: %s" % (str(df.shape))) + + if args.exclude: + exclude_dfs = [] + for exclude in args.exclude: + exclude_df = pandas.read_csv(exclude) + exclude_dfs.append(exclude_df) + exclude_df = pandas.concat(exclude_dfs, ignore_index=True) + del exclude_dfs + + df["_key"] = df.allele + "__" + df.peptide + exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide + df["_excluded"] = df._key.isin(exclude_df._key.unique()) + print("Excluding measurements per allele (counts): ") + print(df.groupby("allele")._excluded.sum()) + + print("Excluding measurements per allele (fractions): ") + print(df.groupby("allele")._excluded.mean()) + + df = df.loc[~df._excluded] + del df["_excluded"] + del df["_key"] + + if args.only_alleles_present_in_exclude: + df = df.loc[df.allele.isin(exclude_df.allele.unique())] + + df["mass_spec"] = df.measurement_source.str.contains(args.mass_spec_regex) + df.loc[df.mass_spec , "measurement_inequality"] = "mass_spec" + + if args.out_summary: + summary_df = df.groupby( + ["allele", "measurement_inequality"] + )["measurement_value"].count().unstack().fillna(0).astype(int) + summary_df["total"] = summary_df.sum(1) + summary_df.to_csv(args.out_summary) + print("Wrote: %s" % args.out_summary) + + if args.out_data: + df.to_csv(args.out_data, index=False) + print("Wrote: %s" % args.out_data) + +if __name__ == '__main__': + run(sys.argv[1:]) diff --git a/downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh 
b/downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh new file mode 100755 index 00000000..9d9f603e --- /dev/null +++ b/downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Model select standard MHCflurry Class I models. +# +set -e +set -x + +DOWNLOAD_NAME=models_class1_selected_no_mass_spec +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +mkdir models + +GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 +echo "Detected GPUS: $GPUS" + +PROCESSORS=$(getconf _NPROCESSORS_ONLN) +echo "Detected processors: $PROCESSORS" + +time python ./write_validation_data.py \ + --include "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \ + --exclude "$(mhcflurry-downloads path models_class1_unselected)/models/train_data.csv.bz2" \ + --only-alleles-present-in-exclude \ + --out-data test.csv \ + --out-summary test.summary.csv + +wc -l test.csv + +time mhcflurry-predict \ + test.csv \ + --prediction-column-prefix "mhcflurry_unselected_" \ + --models "$(mhcflurry-downloads path models_class1_unselected)/models" \ + --out test.csv + +wc -l test.csv + + +time mhcflurry-class1-select-allele-specific-models \ + --data test.csv \ + --models-dir "$(mhcflurry-downloads path models_class1_unselected)/models" \ + --out-models-dir models \ + --scoring mse consensus \ + --consensus-num-peptides-per-length 10000 \ + --consensus-min-models 8 \ + 
--consensus-max-models 8\ + --mse-min-measurements 20 \ + --mse-min-models 8 \ + --mse-max-models 10000 \ + --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50 + +time mhcflurry-calibrate-percentile-ranks \ + --models-dir models \ + --num-peptides-per-length 100000 \ + --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50 + +cp $SCRIPT_ABSOLUTE_PATH . +bzip2 LOG.txt +bzip2 unselected_predictions.csv +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * + +echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py b/downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py new file mode 120000 index 00000000..0875257f --- /dev/null +++ b/downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py @@ -0,0 +1 @@ +../models_class1/write_validation_data.py \ No newline at end of file diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index 50c13d6d..1638075e 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -1223,8 +1223,6 @@ class Class1AffinityPredictor(object): round_num += 1 dfs.append(df) - print("Selected %d models for allele %s" % ( - df.selected.sum(), allele)) allele_to_allele_specific_models[allele] = list( df.loc[df.selected].model) diff --git a/mhcflurry/select_allele_specific_models_command.py b/mhcflurry/select_allele_specific_models_command.py index 577c1d41..3d605ea8 100644 --- a/mhcflurry/select_allele_specific_models_command.py +++ b/mhcflurry/select_allele_specific_models_command.py @@ -58,6 +58,11 @@ parser.add_argument( metavar="DIR", required=True, help="Directory to write selected models") +parser.add_argument( + "--out-unselected-predictions", + metavar="FILE.csv", + help="Write predictions for validation data using unselected predictor 
to " + "FILE.csv") parser.add_argument( "--allele", default=None, @@ -65,34 +70,47 @@ parser.add_argument( help="Alleles to select models for. If not specified, all alleles with " "enough measurements will be used.") parser.add_argument( - "--min-measurements-per-allele", - type=int, - metavar="N", - default=50, - help="Min number of data points required for data-driven model selection") -parser.add_argument( - "--min-models", + "--mse-min-models", type=int, default=8, metavar="N", - help="Min number of models to select per allele") + help="Min number of models to select per allele when using MSE selector") parser.add_argument( - "--max-models", + "--mse-max-models", type=int, default=15, metavar="N", - help="Max number of models to select per allele") + help="Max number of models to select per allele when using MSE selector") parser.add_argument( "--scoring", nargs="+", choices=("mse", "mass-spec", "consensus"), default=["mse", "consensus"], help="Scoring procedures to use in order") +parser.add_argument( + "--consensus-min-models", + type=int, + default=8, + metavar="N", + help="Min number of models to select per allele when using consensus selector") +parser.add_argument( + "--consensus-max-models", + type=int, + default=15, + metavar="N", + help="Max number of models to select per allele when using consensus selector") parser.add_argument( "--consensus-num-peptides-per-length", type=int, default=100000, help="Num peptides per length to use for consensus scoring") +parser.add_argument( + "--mse-min-measurements", + type=int, + metavar="N", + default=50, + help="Min number of measurements required for an allele to use MSE model " + "selection") parser.add_argument( "--verbosity", type=int, @@ -152,16 +170,20 @@ def run(argv=sys.argv[1:]): print(df.groupby("allele")._excluded.mean()) df = df.loc[~df._excluded] + del df["_excluded"] + del df["_key"] print("Reduced data to: %s" % (str(df.shape))) metadata_dfs["model_selection_data"] = df else: df = None - 
model_selection_kwargs = { - 'min_models': args.min_models, - 'max_models': args.max_models, - } + if args.out_unselected_predictions: + df["unselected_prediction"] = input_predictor.predict( + alleles=df.allele.values, + peptides=df.peptide.values) + df.to_csv(args.out_unselected_predictions) + print("Wrote: %s" % args.out_unselected_predictions) selectors = {} for scoring in args.scoring: @@ -169,13 +191,21 @@ def run(argv=sys.argv[1:]): selector = MSEModelSelector( df=df, predictor=input_predictor, - min_measurements=args.min_measurements_per_allele, - model_selection_kwargs=model_selection_kwargs) + min_measurements=args.mse_min_measurements, + model_selection_kwargs={ + 'min_models': args.mse_min_models, + 'max_models': args.mse_max_models, + }) elif scoring == "consensus": selector = ConsensusModelSelector( predictor=input_predictor, num_peptides_per_length=args.consensus_num_peptides_per_length, - model_selection_kwargs=model_selection_kwargs) + model_selection_kwargs={ + 'min_models': args.consensus_min_models, + 'max_models': args.consensus_max_models, + }) + else: + raise ValueError("Unsupported scoring method: %s" % scoring) selectors[scoring] = selector print("Selectors for alleles:") -- GitLab