From 25c387bb33ca4f5111a86ca566c129e8c94316ee Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 7 Dec 2019 15:44:24 -0500
Subject: [PATCH] working on multiallelic refinement download

---
 .../models_class1_pan_refined/GENERATE.sh     | 110 ++++---------
 .../make_multiallelic_training_data.py        | 148 ++++++++++++++++++
 setup.py                                      |   2 +
 3 files changed, 179 insertions(+), 81 deletions(-)
 create mode 100644 downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py

diff --git a/downloads-generation/models_class1_pan_refined/GENERATE.sh b/downloads-generation/models_class1_pan_refined/GENERATE.sh
index 4fe89de6..6591e853 100755
--- a/downloads-generation/models_class1_pan_refined/GENERATE.sh
+++ b/downloads-generation/models_class1_pan_refined/GENERATE.sh
@@ -3,12 +3,12 @@
 # Uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job
 # scheduler). This would need to be modified for other sites.
 #
-# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete>
+# Usage: GENERATE.sh <local|cluster>
 #
 set -e
 set -x

-DOWNLOAD_NAME=models_class1_pan_variants
+DOWNLOAD_NAME=models_class1_pan_refined
 SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
 SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
 SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
@@ -32,86 +32,46 @@ else
     PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf"
 fi

-mkdir -p "$SCRATCH_DIR"
-if [ "$2" != "continue-incomplete" ]
-then
-    echo "Fresh run"
-    rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
-    mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
-else
-    echo "Continuing incomplete run"
-fi
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"

 # Send stdout and stderr to a logfile included with the archive.
-LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
-exec > >(tee -ia "$LOG")
-exec 2> >(tee -ia "$LOG" >&2)
+#LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
+#exec > >(tee -ia "$LOG")
+#exec 2> >(tee -ia "$LOG" >&2)

 # Log some environment info
-echo "Invocation: $0 $@"
-date
-pip freeze
-git status
+#echo "Invocation: $0 $@"
+#date
+#pip freeze
+#git status

 cd $SCRATCH_DIR/$DOWNLOAD_NAME

 export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1

-if [ "$2" != "continue-incomplete" ]
-then
-    cp $SCRIPT_DIR/generate_hyperparameters.production.py .
-    cp $SCRIPT_DIR/generate_hyperparameters.py .
-    cp $SCRIPT_DIR/normalize_allele_names.py .
-    python generate_hyperparameters.production.py > hyperparameters.production.yaml
-    python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
-    python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
-    python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" --out allele_sequences.34mer.csv
-fi
+# TODO: change this ln -s to cp
+ln -s $SCRIPT_DIR/make_multiallelic_training_data.py .
-for kind in 34mer_sequence single_hidden_no_pretrain no_pretrain
-do
-    CONTINUE_INCOMPLETE_ARGS=""
-    if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ]
-    then
-        echo "Will continue existing run: $kind"
-        CONTINUE_INCOMPLETE_ARGS="--continue-incomplete"
-    fi
+time python make_multiallelic_training_data.py \
+    --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
+    --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
+    --out train.multiallelic.csv

-    ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv"
-    HYPERPARAMETERS=hyperparameters.$kind.yaml
-    if [ "$kind" == "34mer_sequence" ]
-    then
-        ALLELE_SEQUENCES=allele_sequences.34mer.csv
-        HYPERPARAMETERS=hyperparameters.production.yaml
-    fi
+time mhcflurry-multiallelic-refinement \
+    --monoallelic-data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
+    --multiallelic-data train.multiallelic.csv \
+    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
+    --hyperparameters hyperparameters.yaml \
+    --out-affinity-predictor-dir $(pwd)/models.affinity \
+    --out-presentation-predictor-dir $(pwd)/models.presentation \
+    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+    $PARALLELISM_ARGS
+
+echo "Done training."

-    mhcflurry-class1-train-pan-allele-models \
-        --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
-        --allele-sequences "$ALLELE_SEQUENCES" \
-        --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
-        --held-out-measurements-per-allele-fraction-and-max 0.25 100 \
-        --num-folds 4 \
-        --hyperparameters "$HYPERPARAMETERS" \
-        --out-models-dir $(pwd)/models.unselected.${kind} \
-        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-        $PARALLELISM_ARGS $CONTINUE_INCOMPLETE_ARGS
-done
-
-echo "Done training. Beginning model selection."
-
-for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
-do
-    MODELS_DIR="models.unselected.${kind}"
-    mhcflurry-class1-select-pan-allele-models \
-        --data "$MODELS_DIR/train_data.csv.bz2" \
-        --models-dir "$MODELS_DIR" \
-        --out-models-dir models.${kind} \
-        --min-models 2 \
-        --max-models 8 \
-        $PARALLELISM_ARGS
-    ln -s "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/train_data.csv.bz2"
-done
+bzip2 train.multiallelic.csv

 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 -f "$LOG"
@@ -120,15 +80,3 @@ RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
 tar -cjf "$RESULT" *
 echo "Created archive: $RESULT"

-# Split into <2GB chunks for GitHub
-PARTS="${RESULT}.part."
-# Check for pre-existing part files and rename them.
-for i in $(ls "${PARTS}"* )
-do
-    DEST="${i}.OLD.$(date +%s)"
-    echo "WARNING: already exists: $i . Moving to $DEST"
-    mv $i $DEST
-done
-split -b 2000M "$RESULT" "$PARTS"
-echo "Split into parts:"
-ls -lh "${PARTS}"*
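A quick way to sanity-check the intermediate train.multiallelic.csv before it
is compressed is a short pandas session. This is a minimal sketch, assuming
the column layout written by make_multiallelic_training_data.py below
(sample_id, peptide, hla, hit):

    # Sketch: sanity-check the generated multiallelic training data.
    # Assumes columns sample_id, peptide, hla, hit as written by
    # make_multiallelic_training_data.py; run before the bzip2 step.
    import pandas

    df = pandas.read_csv("train.multiallelic.csv")

    # Every row should be labeled hit (1) or decoy (0).
    assert df.hit.isin([0, 1]).all()

    # Peptides were filtered to 7-15mers upstream.
    assert df.peptide.str.len().between(7, 15).all()

    # Per-sample hit rate; low values mean many decoys per hit.
    print(df.groupby("sample_id").hit.mean().sort_values())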
+""" +import sys +import argparse +import os +import json +import collections +from six.moves import StringIO + +import pandas +import tqdm + +import mhcnames + + +parser = argparse.ArgumentParser(usage=__doc__) + +parser.add_argument( + "--hits", + metavar="CSV", + required=True, + help="Multiallelic mass spec") +parser.add_argument( + "--expression", + metavar="CSV", + required=True, + help="Expression data") +parser.add_argument( + "--decoys-per-hit", + type=int, + default=None, + help="If not specified will use all possible decoys") +parser.add_argument( + "--out", + metavar="CSV", + required=True, + help="File to write") + +def run(): + args = parser.parse_args(sys.argv[1:]) + hit_df = pandas.read_csv(args.hits) + expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0) + hit_df = hit_df.loc[ + (hit_df.mhc_class == "I") & + (hit_df.peptide.str.len() <= 15) & + (hit_df.peptide.str.len() >= 7) & + (~hit_df.protein_ensembl.isnull()) + ] + hit_df["alleles"] = hit_df.hla.str.split() + + sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id") + grouped = hit_df.groupby("sample_id").nunique() + for col in sample_table.columns: + if (grouped[col] > 1).any(): + del sample_table[col] + sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique() + + experiment_to_decoys = {} + for sample_id, row in sample_table.iterrows(): + decoys = sample_table.loc[ + sample_table.alleles.map( + lambda a: not set(a).intersection(set(row.alleles))) + ].index.tolist() + experiment_to_decoys[sample_id] = decoys + experiment_to_decoys = pandas.Series(experiment_to_decoys) + sample_table["decoys"] = experiment_to_decoys + + print("Samples:") + print(sample_table) + + # Add a column to hit_df giving expression value for that sample and that gene + print("Annotating expression.") + hit_df["tpm"] = [ + expression_df.reindex( + row.protein_ensembl.split())[row.expression_dataset].sum() + for _, row in tqdm.tqdm( + hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000) + ] + + print("Selecting max-expression transcripts for each hit.") + + # Discard hits except those that have max expression for each hit_id + max_gene_hit_df = hit_df.loc[ + hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max()) + ].sample(frac=1.0).drop_duplicates("hit_id") + + # Columns are accession, peptide, sample_id, hit + result_df = [] + + columns_to_keep = [ + "hit_id", + "protein_accession", + "protein_primary_ensembl_contig", + "peptide", + "sample_id" + ] + + print("Selecting decoys.") + for sample_id, sub_df in tqdm.tqdm( + max_gene_hit_df.groupby("sample_id"), + total=max_gene_hit_df.sample_id.nunique()): + result_df.append(sub_df[columns_to_keep].copy()) + result_df[-1]["hit"] = 1 + + possible_decoys = hit_df.loc[ + (hit_df.sample_id.isin(sample_table.loc[sample_id].decoys)) & + (~hit_df.peptide.isin(sub_df.peptide.unique())) & + (hit_df.protein_accession.isin(sub_df.protein_accession.unique())) + ].drop_duplicates("peptide") + if not args.decoys_per_hit: + selected_decoys = possible_decoys[columns_to_keep].copy() + elif len(possible_decoys) < len(sub_df) * args.decoys_per_hit: + print( + "Insufficient decoys", + len(possible_decoys), + len(sub_df), + args.decoys_per_hit) + selected_decoys = possible_decoys[columns_to_keep].copy() + else: + selected_decoys = possible_decoys.sample( + n=len(sub_df) * args.decoys_per_hit)[columns_to_keep].copy() + result_df.append(selected_decoys) + result_df[-1]["hit"] = 0 + result_df[-1]["sample_id"] = sample_id + + result_df = 
diff --git a/setup.py b/setup.py
index 9275775e..8696bf70 100644
--- a/setup.py
+++ b/setup.py
@@ -83,6 +83,8 @@ if __name__ == '__main__':
             'mhcflurry.select_pan_allele_models_command:run',
             'mhcflurry-calibrate-percentile-ranks = '
             'mhcflurry.calibrate_percentile_ranks_command:run',
+            'mhcflurry-multiallelic-refinement = '
+            'mhcflurry.multiallelic_refinement_command:run',
             '_mhcflurry-cluster-worker-entry-point = '
             'mhcflurry.cluster_parallelism:worker_entry_point',
         ]
--
GitLab
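For reference, the expression-annotation step in
make_multiallelic_training_data.py computes each hit's TPM by reindexing the
expression matrix on the hit's space-separated Ensembl IDs and summing the
sample's expression column. A toy sketch with invented gene IDs, dataset name,
and values:

    # Toy sketch of the TPM annotation: reindex the expression matrix by
    # the hit's Ensembl IDs and sum within the sample's dataset column.
    # Gene IDs, dataset name, and values are invented for illustration.
    import pandas

    expression_df = pandas.DataFrame(
        {"dataset1": [5.0, 2.0, 0.1]},
        index=["ENSG01", "ENSG02", "ENSG03"])

    protein_ensembl = "ENSG01 ENSG03 ENSG99"  # ENSG99 is absent

    # reindex tolerates missing IDs (NaN rows); sum skips NaN by default.
    tpm = expression_df.reindex(protein_ensembl.split())["dataset1"].sum()
    print(tpm)  # 5.1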