From b0b025967371af43fd67c175ec7502285f57b61c Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Wed, 21 Feb 2018 17:00:28 -0500 Subject: [PATCH] Add models_class1_selected_no_mass_spec download --- .../models_class1/GENERATE.sh | 22 ++++ .../models_class1/write_validation_data.py | 105 ++++++++++++++++++ .../GENERATE.sh | 77 +++++++++++++ .../write_validation_data.py | 1 + mhcflurry/class1_affinity_predictor.py | 2 - .../select_allele_specific_models_command.py | 64 ++++++++--- 6 files changed, 252 insertions(+), 19 deletions(-) create mode 100644 downloads-generation/models_class1/write_validation_data.py create mode 100755 downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh create mode 120000 downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh index 642c5d96..b8b002c5 100755 --- a/downloads-generation/models_class1/GENERATE.sh +++ b/downloads-generation/models_class1/GENERATE.sh @@ -33,9 +33,30 @@ echo "Detected GPUS: $GPUS" PROCESSORS=$(getconf _NPROCESSORS_ONLN) echo "Detected processors: $PROCESSORS" +python ./write_validation_data.py \ + --include "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ + --exclude "$(mhcflurry-downloads path models_class1_unselected)/models/train_data.csv.bz2" \ + --only-alleles-present-in-exclude \ + --out-data test.csv \ + --out-summary test.summary.csv + +wc -l test.csv + +mhcflurry-predict \ + test.csv \ + --prediction-column-prefix "mhcflurry_unselected_" \ + --models "$(mhcflurry-downloads path models_class1_unselected)/models" \ + --out test.csv + +wc -l test.csv + + time mhcflurry-class1-select-allele-specific-models \ + --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ + --exclude-data "$(mhcflurry-downloads path 
models_class1_unselected)/models/train_data.csv.bz2" \ --models-dir "$(mhcflurry-downloads path models_class1_unselected)/models" \ --out-models-dir models \ + --out-unselected-predictions unselected_predictions.csv \ --scoring mass-spec consensus \ --consensus-num-peptides-per-length 10000 \ --min-models 8 \ @@ -48,6 +69,7 @@ time mhcflurry-calibrate-percentile-ranks \ cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt +bzip2 unselected_predictions.csv tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/models_class1/write_validation_data.py b/downloads-generation/models_class1/write_validation_data.py new file mode 100644 index 00000000..28ad4a38 --- /dev/null +++ b/downloads-generation/models_class1/write_validation_data.py @@ -0,0 +1,105 @@ +""" +Write and summarize model validation data, which is obtained by taking a full +dataset and removing the data used for training. + +""" +import argparse +import sys +from os.path import abspath + +import pandas +import numpy +from sklearn.model_selection import StratifiedKFold + +parser = argparse.ArgumentParser(usage = __doc__) + +parser.add_argument( + "--include", + metavar="INPUT.csv", + nargs="+", + help="Input CSV to include") +parser.add_argument( + "--exclude", + metavar="INPUT.csv", + nargs="+", + help="Input CSV to exclude") +parser.add_argument( + "--out-data", + metavar="RESULT.csv", + help="Output data CSV") +parser.add_argument( + "--out-summary", + metavar="RESULT.csv", + help="Output summary CSV") +parser.add_argument( + "--mass-spec-regex", + metavar="REGEX", + default="mass[- ]spec", + help="Regular expression for mass-spec data. Runs on measurement_source col. " + "Default: %(default)s.") +parser.add_argument( + "--only-alleles-present-in-exclude", + action="store_true", + default=False, + help="Filter to only alleles that are present in files given by --exclude. 
" + "Useful for filtering to only alleles supported by a predictor, where the " + "training data for the predictor is given by --exclude.") + + +def run(argv): + args = parser.parse_args(argv) + + dfs = [] + for input in args.include: + df = pandas.read_csv(input) + dfs.append(df) + df = pandas.concat(dfs, ignore_index=True) + print("Loaded data with shape: %s" % str(df.shape)) + del dfs + + df = df.ix[ + (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) + ] + print("Subselected to 8-15mers: %s" % (str(df.shape))) + + if args.exclude: + exclude_dfs = [] + for exclude in args.exclude: + exclude_df = pandas.read_csv(exclude) + exclude_dfs.append(exclude_df) + exclude_df = pandas.concat(exclude_dfs, ignore_index=True) + del exclude_dfs + + df["_key"] = df.allele + "__" + df.peptide + exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide + df["_excluded"] = df._key.isin(exclude_df._key.unique()) + print("Excluding measurements per allele (counts): ") + print(df.groupby("allele")._excluded.sum()) + + print("Excluding measurements per allele (fractions): ") + print(df.groupby("allele")._excluded.mean()) + + df = df.loc[~df._excluded] + del df["_excluded"] + del df["_key"] + + if args.only_alleles_present_in_exclude: + df = df.loc[df.allele.isin(exclude_df.allele.unique())] + + df["mass_spec"] = df.measurement_source.str.contains(args.mass_spec_regex) + df.loc[df.mass_spec , "measurement_inequality"] = "mass_spec" + + if args.out_summary: + summary_df = df.groupby( + ["allele", "measurement_inequality"] + )["measurement_value"].count().unstack().fillna(0).astype(int) + summary_df["total"] = summary_df.sum(1) + summary_df.to_csv(args.out_summary) + print("Wrote: %s" % args.out_summary) + + if args.out_data: + df.to_csv(args.out_data, index=False) + print("Wrote: %s" % args.out_data) + +if __name__ == '__main__': + run(sys.argv[1:]) diff --git a/downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh 
b/downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh new file mode 100755 index 00000000..9d9f603e --- /dev/null +++ b/downloads-generation/models_class1_selected_no_mass_spec/GENERATE.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Model select standard MHCflurry Class I models. +# +set -e +set -x + +DOWNLOAD_NAME=models_class1_selected_no_mass_spec +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +mkdir models + +GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 +echo "Detected GPUS: $GPUS" + +PROCESSORS=$(getconf _NPROCESSORS_ONLN) +echo "Detected processors: $PROCESSORS" + +time python ./write_validation_data.py \ + --include "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \ + --exclude "$(mhcflurry-downloads path models_class1_unselected)/models/train_data.csv.bz2" \ + --only-alleles-present-in-exclude \ + --out-data test.csv \ + --out-summary test.summary.csv + +wc -l test.csv + +time mhcflurry-predict \ + test.csv \ + --prediction-column-prefix "mhcflurry_unselected_" \ + --models "$(mhcflurry-downloads path models_class1_unselected)/models" \ + --out test.csv + +wc -l test.csv + + +time mhcflurry-class1-select-allele-specific-models \ + --data test.csv \ + --models-dir "$(mhcflurry-downloads path models_class1_unselected)/models" \ + --out-models-dir models \ + --scoring mse consensus \ + --consensus-num-peptides-per-length 10000 \ + --consensus-min-models 8 \ + 
--consensus-max-models 8\ + --mse-min-measurements 20 \ + --mse-min-models 8 \ + --mse-max-models 10000 \ + --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50 + +time mhcflurry-calibrate-percentile-ranks \ + --models-dir models \ + --num-peptides-per-length 100000 \ + --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50 + +cp $SCRIPT_ABSOLUTE_PATH . +bzip2 LOG.txt +bzip2 unselected_predictions.csv +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * + +echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py b/downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py new file mode 120000 index 00000000..0875257f --- /dev/null +++ b/downloads-generation/models_class1_selected_no_mass_spec/write_validation_data.py @@ -0,0 +1 @@ +../models_class1/write_validation_data.py \ No newline at end of file diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index 50c13d6d..1638075e 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -1223,8 +1223,6 @@ class Class1AffinityPredictor(object): round_num += 1 dfs.append(df) - print("Selected %d models for allele %s" % ( - df.selected.sum(), allele)) allele_to_allele_specific_models[allele] = list( df.loc[df.selected].model) diff --git a/mhcflurry/select_allele_specific_models_command.py b/mhcflurry/select_allele_specific_models_command.py index 577c1d41..3d605ea8 100644 --- a/mhcflurry/select_allele_specific_models_command.py +++ b/mhcflurry/select_allele_specific_models_command.py @@ -58,6 +58,11 @@ parser.add_argument( metavar="DIR", required=True, help="Directory to write selected models") +parser.add_argument( + "--out-unselected-predictions", + metavar="FILE.csv", + help="Write predictions for validation data using unselected predictor 
to " + "FILE.csv") parser.add_argument( "--allele", default=None, @@ -65,34 +70,47 @@ parser.add_argument( help="Alleles to select models for. If not specified, all alleles with " "enough measurements will be used.") parser.add_argument( - "--min-measurements-per-allele", - type=int, - metavar="N", - default=50, - help="Min number of data points required for data-driven model selection") -parser.add_argument( - "--min-models", + "--mse-min-models", type=int, default=8, metavar="N", - help="Min number of models to select per allele") + help="Min number of models to select per allele when using MSE selector") parser.add_argument( - "--max-models", + "--mse-max-models", type=int, default=15, metavar="N", - help="Max number of models to select per allele") + help="Max number of models to select per allele when using MSE selector") parser.add_argument( "--scoring", nargs="+", choices=("mse", "mass-spec", "consensus"), default=["mse", "consensus"], help="Scoring procedures to use in order") +parser.add_argument( + "--consensus-min-models", + type=int, + default=8, + metavar="N", + help="Min number of models to select per allele when using consensus selector") +parser.add_argument( + "--consensus-max-models", + type=int, + default=15, + metavar="N", + help="Max number of models to select per allele when using consensus selector") parser.add_argument( "--consensus-num-peptides-per-length", type=int, default=100000, help="Num peptides per length to use for consensus scoring") +parser.add_argument( + "--mse-min-measurements", + type=int, + metavar="N", + default=50, + help="Min number of measurements required for an allele to use MSE model " + "selection") parser.add_argument( "--verbosity", type=int, @@ -152,16 +170,20 @@ def run(argv=sys.argv[1:]): print(df.groupby("allele")._excluded.mean()) df = df.loc[~df._excluded] + del df["_excluded"] + del df["_key"] print("Reduced data to: %s" % (str(df.shape))) metadata_dfs["model_selection_data"] = df else: df = None - 
model_selection_kwargs = { - 'min_models': args.min_models, - 'max_models': args.max_models, - } + if args.out_unselected_predictions: + df["unselected_prediction"] = input_predictor.predict( + alleles=df.allele.values, + peptides=df.peptide.values) + df.to_csv(args.out_unselected_predictions) + print("Wrote: %s" % args.out_unselected_predictions) selectors = {} for scoring in args.scoring: @@ -169,13 +191,21 @@ def run(argv=sys.argv[1:]): selector = MSEModelSelector( df=df, predictor=input_predictor, - min_measurements=args.min_measurements_per_allele, - model_selection_kwargs=model_selection_kwargs) + min_measurements=args.mse_min_measurements, + model_selection_kwargs={ + 'min_models': args.mse_min_models, + 'max_models': args.mse_max_models, + }) elif scoring == "consensus": selector = ConsensusModelSelector( predictor=input_predictor, num_peptides_per_length=args.consensus_num_peptides_per_length, - model_selection_kwargs=model_selection_kwargs) + model_selection_kwargs={ + 'min_models': args.consensus_min_models, + 'max_models': args.consensus_max_models, + }) + else: + raise ValueError("Unsupported scoring method: %s" % scoring) selectors[scoring] = selector print("Selectors for alleles:") -- GitLab