From 55e3cfc12f49b5526a6abdbe2b38e9d9abe495f8 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sat, 25 Nov 2017 17:58:25 -0500 Subject: [PATCH] Add cross-validation GENERATE.sh and related scripts --- .../cross_validation_class1/GENERATE.sh | 83 +++++++++++++ .../cross_validation_class1/README.md | 7 ++ .../hyperparameters.yaml | 1 + .../cross_validation_class1/score.py | 101 ++++++++++++++++ .../cross_validation_class1/split_folds.py | 113 ++++++++++++++++++ .../train_allele_specific_models_command.py | 24 +++- mhcflurry/scoring.py | 2 +- 7 files changed, 329 insertions(+), 2 deletions(-) create mode 100755 downloads-generation/cross_validation_class1/GENERATE.sh create mode 100644 downloads-generation/cross_validation_class1/README.md create mode 120000 downloads-generation/cross_validation_class1/hyperparameters.yaml create mode 100644 downloads-generation/cross_validation_class1/score.py create mode 100644 downloads-generation/cross_validation_class1/split_folds.py diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh new file mode 100755 index 00000000..ebbe0b8e --- /dev/null +++ b/downloads-generation/cross_validation_class1/GENERATE.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +set -e +set -x + +DOWNLOAD_NAME=cross_validation_class1 +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") + +NFOLDS=5 + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +cp $SCRIPT_DIR/hyperparameters.yaml . 
+cp $SCRIPT_DIR/split_folds.py . +cp $SCRIPT_DIR/score.py . + +time python split_folds.py \ + "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ + --min-measurements-per-allele 100 \ + --folds $NFOLDS \ + --random-state 1 \ + --output-pattern-test "./test.fold_{}.csv" \ + --output-pattern-train "./train.fold_{}.csv" + +# Kill child processes if parent exits: +trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT + +for fold in $(seq 0 $(expr $NFOLDS - 1)) +do + mhcflurry-class1-train-allele-specific-models \ + --data train.fold_${fold}.csv \ + --hyperparameters hyperparameters.yaml \ + --out-models-dir models.fold_${fold} \ + --min-measurements-per-allele 0 \ + --percent-rank-calibration-num-peptides-per-length 0 \ + 2>&1 | tee -a LOG.train.fold_${fold}.txt & +done +wait + +echo "DONE TRAINING. NOW PREDICTING." + +for fold in $(seq 0 $(expr $NFOLDS - 1)) +do + mhcflurry-predict \ + test.fold_${fold}.csv \ + --models models.fold_${fold} \ + --no-throw \ + --include-individual-model-predictions \ + --out predictions.fold_${fold}.csv & +done +wait + +time python score.py \ + predictions.fold_*.csv \ + --out-combined predictions.combined.csv \ + --out-scores scores.csv \ + --out-summary summary.all.csv + +grep -v single summary.all.csv > summary.ensemble.csv + +cp $SCRIPT_ABSOLUTE_PATH . +for i in $(ls *.txt) +do + bzip2 $i +done +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * + +echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/cross_validation_class1/README.md b/downloads-generation/cross_validation_class1/README.md new file mode 100644 index 00000000..f7584e32 --- /dev/null +++ b/downloads-generation/cross_validation_class1/README.md @@ -0,0 +1,7 @@ +# Cross validation of standard Class I models + +This download contains cross validation results and intermediate data for +class I allele-specific MHCflurry models. + +This exists to track the exact steps used to generate cross-validation results. 
+Users will probably not interact with this directly. \ No newline at end of file diff --git a/downloads-generation/cross_validation_class1/hyperparameters.yaml b/downloads-generation/cross_validation_class1/hyperparameters.yaml new file mode 120000 index 00000000..f32feef1 --- /dev/null +++ b/downloads-generation/cross_validation_class1/hyperparameters.yaml @@ -0,0 +1 @@ +../models_class1/hyperparameters.yaml \ No newline at end of file diff --git a/downloads-generation/cross_validation_class1/score.py b/downloads-generation/cross_validation_class1/score.py new file mode 100644 index 00000000..dcf366de --- /dev/null +++ b/downloads-generation/cross_validation_class1/score.py @@ -0,0 +1,101 @@ +""" +Scoring script for cross-validation. +""" +import argparse +import sys +import collections + +import pandas +from mhcflurry.scoring import make_scores + + +parser = argparse.ArgumentParser(usage = __doc__) + +parser.add_argument( + "input", metavar="INPUT.csv", help="Input CSV", nargs="+") + +parser.add_argument( + "--out-scores", + metavar="RESULT.csv") + +parser.add_argument( + "--out-combined", + metavar="COMBINED.csv") + +parser.add_argument( + "--out-summary", + metavar="RESULT.csv") + +def run(argv): + args = parser.parse_args(argv) + + df = None + for (i, filename) in enumerate(args.input): + input_df = pandas.read_csv(filename) + assert not input_df.mhcflurry_prediction.isnull().any() + + cols_to_merge = [] + input_df["prediction_%d" % i] = input_df.mhcflurry_prediction + cols_to_merge.append(input_df.columns[-1]) + if 'mhcflurry_model_single_0' in input_df.columns: + input_df["prediction_single_%d" % i] = input_df.mhcflurry_model_single_0 + cols_to_merge.append(input_df.columns[-1]) + + if df is None: + df = input_df[ + ["allele", "peptide", "measurement_value"] + cols_to_merge + ].copy() + else: + df = pandas.merge( + df, + input_df[['allele', 'peptide'] + cols_to_merge], + on=['allele', 'peptide'], + how='outer') + + print("Loaded data:") + print(df.head(5)) 
+ + if args.out_combined: + df.to_csv(args.out_combined, index=False) + print("Wrote: %s" % args.out_combined) + + prediction_cols = [ + c + for c in df.columns + if c.startswith("prediction_") + ] + + scores_rows = [] + for (allele, allele_df) in df.groupby("allele"): + for prediction_col in prediction_cols: + sub_df = allele_df.loc[~allele_df[prediction_col].isnull()] + scores = collections.OrderedDict() + scores['allele'] = allele + scores['fold'] = prediction_col.replace("prediction_", "").replace("single_", "") + scores['kind'] = "single" if "single" in prediction_col else "ensemble" + scores['train_size'] = allele_df[prediction_col].isnull().sum() + scores['test_size'] = len(sub_df) + scores.update( + make_scores( + sub_df.measurement_value, sub_df[prediction_col])) + scores_rows.append(scores) + scores_df = pandas.DataFrame(scores_rows) + print(scores_df) + + if args.out_scores: + scores_df.to_csv(args.out_scores, index=False) + print("Wrote: %s" % args.out_scores) + + summary_df = scores_df.groupby(["allele", "kind"])[ + ["train_size", "test_size", "auc", "f1", "tau"] + ].mean().reset_index() + print("Summary:") + print(summary_df) + + if args.out_summary: + summary_df.to_csv(args.out_summary, index=False) + print("Wrote: %s" % args.out_summary) + +if __name__ == '__main__': + run(sys.argv[1:]) + diff --git a/downloads-generation/cross_validation_class1/split_folds.py b/downloads-generation/cross_validation_class1/split_folds.py new file mode 100644 index 00000000..3f95337f --- /dev/null +++ b/downloads-generation/cross_validation_class1/split_folds.py @@ -0,0 +1,113 @@ +""" +Split training data into CV folds. 
+""" +import argparse +import sys +from os.path import abspath + +import pandas +from sklearn.model_selection import StratifiedKFold + +parser = argparse.ArgumentParser(usage = __doc__) + +parser.add_argument( + "input", metavar="INPUT.csv", help="Input CSV") + +parser.add_argument( + "--folds", metavar="N", type=int, default=5) + +parser.add_argument( + "--allele", + nargs="+", + help="Include only the specified allele(s)") + +parser.add_argument( + "--min-measurements-per-allele", + type=int, + metavar="N", + help="Use only alleles with >=N measurements.") + +parser.add_argument( + "--subsample", + type=int, + metavar="N", + help="Subsample to first N rows") + +parser.add_argument( + "--random-state", + metavar="N", + type=int, + help="Specify an int for deterministic splitting") + +parser.add_argument( + "--output-pattern-train", + default="./train.fold_{}.csv", + help="Pattern to use to generate output filename. Default: %(default)s") + +parser.add_argument( + "--output-pattern-test", + default="./test.fold_{}.csv", + help="Pattern to use to generate output filename. 
Default: %(default)s") + + +def run(argv): + args = parser.parse_args(argv) + + df = pandas.read_csv(args.input) + print("Loaded data with shape: %s" % str(df.shape)) + + df = df.ix[ + (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) + ] + print("Subselected to 8-15mers: %s" % (str(df.shape))) + + allele_counts = df.allele.value_counts() + + if args.allele: + alleles = args.allele + else: + alleles = list( + allele_counts.ix[ + allele_counts > args.min_measurements_per_allele + ].index) + + df = df.ix[df.allele.isin(alleles)] + print("Potentially subselected by allele to: %s" % str(df.shape)) + + print("Data has %d alleles: %s" % ( + df.allele.nunique(), " ".join(df.allele.unique()))) + + df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index() + print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape)) + + if args.subsample: + df = df.head(args.subsample) + print("Subsampled to: %s" % str(df.shape)) + + kf = StratifiedKFold( + n_splits=args.folds, + shuffle=True, + random_state=args.random_state) + + # Stratify by both allele and binder vs. nonbinder. 
+ df["key"] = [ + "%s_%s" % ( + row.allele, + "binder" if row.measurement_value < 500 else "nonbinder") + for (_, row) in df.iterrows() + ] + + for i, (train, test) in enumerate(kf.split(df, df.key)): + train_filename = args.output_pattern_train.format(i) + test_filename = args.output_pattern_test.format(i) + + df.iloc[train].to_csv(train_filename, index=False) + print("Wrote: %s" % abspath(train_filename)) + + df.iloc[test].to_csv(test_filename, index=False) + print("Wrote: %s" % abspath(test_filename)) + + +if __name__ == '__main__': + run(sys.argv[1:]) + diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py index bdc90dd8..1936e6d7 100644 --- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py +++ b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py @@ -59,6 +59,19 @@ parser.add_argument( help="Number of peptides per length to use to calibrate percent ranks. " "Set to 0 to disable percent rank calibration. The resulting models will " "not support percent ranks") +parser.add_argument( + "--n-models", + type=int, + metavar="N", + help="Ensemble size, i.e. how many models to train for each architecture. " + "If specified here it overrides any 'n_models' specified in the " + "hyperparameters.") +parser.add_argument( + "--max-epochs", + type=int, + metavar="N", + help="Max training epochs. 
If specified here it overrides any 'max_epochs' " + "specified in the hyperparameters.") parser.add_argument( "--verbosity", type=int, @@ -114,7 +127,16 @@ def run(argv=sys.argv[1:]): print("Done.") for (h, hyperparameters) in enumerate(hyperparameters_lst): - n_models = hyperparameters.pop("n_models") + n_models = None + if 'n_models' in hyperparameters: + n_models = hyperparameters.pop("n_models") + if args.n_models: + n_models = args.n_models + if not n_models: + raise ValueError("Specify --n-models or n_models hyperparameter") + + if args.max_epochs: + hyperparameters['max_epochs'] = args.max_epochs for model_group in range(n_models): for (i, allele) in enumerate(alleles): diff --git a/mhcflurry/scoring.py b/mhcflurry/scoring.py index fc2a9263..f6a256ba 100644 --- a/mhcflurry/scoring.py +++ b/mhcflurry/scoring.py @@ -4,7 +4,7 @@ from __future__ import ( absolute_import, ) import logging -import sklearn +import sklearn.metrics import numpy import scipy -- GitLab