Skip to content
Snippets Groups Projects
Commit 55e3cfc1 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Add cross-validation GENERATE.sh and related scripts

parent 700535fa
No related branches found
No related tags found
No related merge requests found
#!/bin/bash
set -e
set -x
DOWNLOAD_NAME=cross_validation_class1
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
NFOLDS=5
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/hyperparameters.yaml .
cp $SCRIPT_DIR/split_folds.py .
cp $SCRIPT_DIR/score.py .
time python split_folds.py \
"$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
--min-measurements-per-allele 100 \
--folds $NFOLDS \
--random-state 1 \
--output-pattern-test "./test.fold_{}.csv" \
--output-pattern-train "./train.fold_{}.csv"
# Kill child processes if parent exits:
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT
for fold in $(seq 0 $(expr $NFOLDS - 1))
do
mhcflurry-class1-train-allele-specific-models \
--data train.fold_${fold}.csv \
--hyperparameters hyperparameters.yaml \
--out-models-dir models.fold_${fold} \
--min-measurements-per-allele 0 \
--percent-rank-calibration-num-peptides-per-length 0 \
2>&1 | tee -a LOG.train.fold_${fold}.txt &
done
wait
echo "DONE TRAINING. NOW PREDICTING."
for fold in $(seq 0 $(expr $NFOLDS - 1))
do
mhcflurry-predict \
test.fold_${fold}.csv \
--models models.fold_${fold} \
--no-throw \
--include-individual-model-predictions \
--out predictions.fold_${fold}.csv &
done
wait
time python score.py \
predictions.fold_*.csv \
--out-combined predictions.combined.csv \
--out-scores scores.csv \
--out-summary summary.all.csv
grep -v single summary.all.csv > summary.ensemble.csv
cp $SCRIPT_ABSOLUTE_PATH .
for i in $(ls *.txt)
do
bzip2 $i
done
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Cross validation of standard Class I models
This download contains cross validation results and intermediate data for
class I allele-specific MHCflurry models.
This exists to track the exact steps used to generate cross-validation results.
Users will probably not interact with this directly.
\ No newline at end of file
../models_class1/hyperparameters.yaml
\ No newline at end of file
"""
Scoring script for cross-validation.
"""
import argparse
import sys
import collections
import pandas
from mhcflurry.scoring import make_scores
parser = argparse.ArgumentParser(usage = __doc__)
parser.add_argument(
"input", metavar="INPUT.csv", help="Input CSV", nargs="+")
parser.add_argument(
"--out-scores",
metavar="RESULT.csv")
parser.add_argument(
"--out-combined",
metavar="COMBINED.csv")
parser.add_argument(
"--out-summary",
metavar="RESULT.csv")
def run(argv):
args = parser.parse_args(argv)
df = None
for (i, filename) in enumerate(args.input):
input_df = pandas.read_csv(filename)
assert not input_df.mhcflurry_prediction.isnull().any()
cols_to_merge = []
input_df["prediction_%d" % i] = input_df.mhcflurry_prediction
cols_to_merge.append(input_df.columns[-1])
if 'mhcflurry_model_single_0' in input_df.columns:
input_df["prediction_single_%d" % i] = input_df.mhcflurry_model_single_0
cols_to_merge.append(input_df.columns[-1])
if df is None:
df = input_df[
["allele", "peptide", "measurement_value"] + cols_to_merge
].copy()
else:
df = pandas.merge(
df,
input_df[['allele', 'peptide'] + cols_to_merge],
on=['allele', 'peptide'],
how='outer')
print("Loaded data:")
print(df.head(5))
if args.out_combined:
df.to_csv(args.out_combined, index=False)
print("Wrote: %s" % args.out_combined)
prediction_cols = [
c
for c in df.columns
if c.startswith("prediction_")
]
scores_rows = []
for (allele, allele_df) in df.groupby("allele"):
for prediction_col in prediction_cols:
sub_df = allele_df.loc[~allele_df[prediction_col].isnull()]
scores = collections.OrderedDict()
scores['allele'] = allele
scores['fold'] = prediction_col.replace("prediction_", "").replace("single_", "")
scores['kind'] = "single" if "single" in prediction_col else "ensemble"
scores['train_size'] = allele_df[prediction_col].isnull().sum()
scores['test_size'] = len(sub_df)
scores.update(
make_scores(
sub_df.measurement_value, sub_df[prediction_col]))
scores_rows.append(scores)
scores_df = pandas.DataFrame(scores_rows)
print(scores_df)
if args.out_scores:
scores_df.to_csv(args.out_scores, index=False)
print("Wrote: %s" % args.out_scores)
summary_df = scores_df.groupby(["allele", "kind"])[
["train_size", "test_size", "auc", "f1", "tau"]
].mean().reset_index()
print("Summary:")
print(summary_df)
if args.out_summary:
summary_df.to_csv(args.out_summary, index=False)
print("Wrote: %s" % args.out_summary)
if __name__ == '__main__':
run(sys.argv[1:])
"""
Split training data into CV folds.
"""
import argparse
import sys
from os.path import abspath
import pandas
from sklearn.model_selection import StratifiedKFold
parser = argparse.ArgumentParser(usage = __doc__)
parser.add_argument(
"input", metavar="INPUT.csv", help="Input CSV")
parser.add_argument(
"--folds", metavar="N", type=int, default=5)
parser.add_argument(
"--allele",
nargs="+",
help="Include only the specified allele(s)")
parser.add_argument(
"--min-measurements-per-allele",
type=int,
metavar="N",
help="Use only alleles with >=N measurements.")
parser.add_argument(
"--subsample",
type=int,
metavar="N",
help="Subsample to first N rows")
parser.add_argument(
"--random-state",
metavar="N",
type=int,
help="Specify an int for deterministic splitting")
parser.add_argument(
"--output-pattern-train",
default="./train.fold_{}.csv",
help="Pattern to use to generate output filename. Default: %(default)s")
parser.add_argument(
"--output-pattern-test",
default="./test.fold_{}.csv",
help="Pattern to use to generate output filename. Default: %(default)s")
def run(argv):
args = parser.parse_args(argv)
df = pandas.read_csv(args.input)
print("Loaded data with shape: %s" % str(df.shape))
df = df.ix[
(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
]
print("Subselected to 8-15mers: %s" % (str(df.shape)))
allele_counts = df.allele.value_counts()
if args.allele:
alleles = args.allele
else:
alleles = list(
allele_counts.ix[
allele_counts > args.min_measurements_per_allele
].index)
df = df.ix[df.allele.isin(alleles)]
print("Potentially subselected by allele to: %s" % str(df.shape))
print("Data has %d alleles: %s" % (
df.allele.nunique(), " ".join(df.allele.unique())))
df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index()
print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape))
if args.subsample:
df = df.head(args.subsample)
print("Subsampled to: %s" % str(df.shape))
kf = StratifiedKFold(
n_splits=args.folds,
shuffle=True,
random_state=args.random_state)
# Stratify by both allele and binder vs. nonbinder.
df["key"] = [
"%s_%s" % (
row.allele,
"binder" if row.measurement_value < 500 else "nonbinder")
for (_, row) in df.iterrows()
]
for i, (train, test) in enumerate(kf.split(df, df.key)):
train_filename = args.output_pattern_train.format(i)
test_filename = args.output_pattern_test.format(i)
df.iloc[train].to_csv(train_filename, index=False)
print("Wrote: %s" % abspath(train_filename))
df.iloc[test].to_csv(test_filename, index=False)
print("Wrote: %s" % abspath(test_filename))
if __name__ == '__main__':
run(sys.argv[1:])
......@@ -59,6 +59,19 @@ parser.add_argument(
help="Number of peptides per length to use to calibrate percent ranks. "
"Set to 0 to disable percent rank calibration. The resulting models will "
"not support percent ranks")
parser.add_argument(
"--n-models",
type=int,
metavar="N",
help="Ensemble size, i.e. how many models to train for each architecture. "
"If specified here it overrides any 'n_models' specified in the "
"hyperparameters.")
parser.add_argument(
"--max-epochs",
type=int,
metavar="N",
help="Max training epochs. If specified here it overrides any 'max_epochs' "
"specified in the hyperparameters.")
parser.add_argument(
"--verbosity",
type=int,
......@@ -114,7 +127,16 @@ def run(argv=sys.argv[1:]):
print("Done.")
for (h, hyperparameters) in enumerate(hyperparameters_lst):
n_models = hyperparameters.pop("n_models")
n_models = None
if 'n_models' in hyperparameters:
n_models = hyperparameters.pop("n_models")
if args.n_models:
n_models = args.n_models
if not n_models:
raise ValueError("Specify --ensemble-size or n_models hyperparameter")
if args.max_epochs:
hyperparameters['max_epochs'] = args.max_epochs
for model_group in range(n_models):
for (i, allele) in enumerate(alleles):
......
......@@ -4,7 +4,7 @@ from __future__ import (
absolute_import,
)
import logging
import sklearn
import sklearn.metrics
import numpy
import scipy
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment