diff --git a/downloads-generation/models_class1_pan_refined/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_refined/GENERATE.WITH_HPC_CLUSTER.sh
deleted file mode 100755
index 53125eb7bec329ecbbd0d230b8afe809c1064204..0000000000000000000000000000000000000000
--- a/downloads-generation/models_class1_pan_refined/GENERATE.WITH_HPC_CLUSTER.sh
+++ /dev/null
@@ -1 +0,0 @@
-bash GENERATE.sh cluster
diff --git a/downloads-generation/models_class1_pan_refined/GENERATE.sh b/downloads-generation/models_class1_pan_refined/GENERATE.sh
deleted file mode 100755
index b662bf630d7a928828eebf165062a1f0201527e4..0000000000000000000000000000000000000000
--- a/downloads-generation/models_class1_pan_refined/GENERATE.sh
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/bin/bash
-#
-# Uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job
-# scheduler). This would need to be modified for other sites.
-#
-# Usage: GENERATE.sh <local|cluster>
-#
-set -e
-set -x
-
-DOWNLOAD_NAME=models_class1_pan_refined
-SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
-SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
-SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
-
-if [ "$1" != "cluster" ]
-then
-    GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
-    echo "Detected GPUS: $GPUS"
-
-    PROCESSORS=$(getconf _NPROCESSORS_ONLN)
-    echo "Detected processors: $PROCESSORS"
-
-    if [ "$GPUS" -eq "0" ]; then
-        NUM_JOBS=${NUM_JOBS-1}
-    else
-        NUM_JOBS=${NUM_JOBS-$GPUS}
-    fi
-    echo "Num jobs: $NUM_JOBS"
-    PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
-else
-    PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf"
-fi
-
-rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
-mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
-
-# Send stdout and stderr to a logfile included with the archive.
-LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
-exec > >(tee -ia "$LOG")
-exec 2> >(tee -ia "$LOG" >&2)
-
-# Log some environment info
-echo "Invocation: $0 $@"
-date
-pip freeze
-git status
-
-cd $SCRATCH_DIR/$DOWNLOAD_NAME
-
-export OMP_NUM_THREADS=1
-export PYTHONUNBUFFERED=1
-
-cp $SCRIPT_DIR/make_multiallelic_training_data.py .
-cp $SCRIPT_DIR/hyperparameters.yaml .
-
-MONOALLELIC_TRAIN="$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec/train_data.csv.bz2"
-
-# ********************************************************
-# First we refine a single model excluding chromosome 1.
-if false; then
-    echo "Beginning testing run."
-    time python make_multiallelic_training_data.py \
-        --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
-        --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
-        --exclude-contig "1" \
-        --decoys-per-hit 1 \
-        --out train.multiallelic.no_chr1.csv
-
-    time mhcflurry-multiallelic-refinement \
-        --monoallelic-data "$MONOALLELIC_TRAIN" \
-        --multiallelic-data train.multiallelic.no_chr1.csv \
-        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
-        --hyperparameters hyperparameters.yaml \
-        --out-affinity-predictor-dir $(pwd)/test_models.no_chr1.affinity \
-        --out-presentation-predictor-dir $(pwd)/test_models.no_chr1.presentation \
-        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-        $PARALLELISM_ARGS
-
-    time mhcflurry-calibrate-percentile-ranks \
-        --models-dir $(pwd)/test_models.no_chr1.affinity \
-        --match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
-        --motif-summary \
-        --num-peptides-per-length 100000 \
-        --allele "HLA-A*02:01" "HLA-A*02:20" "HLA-C*02:10" \
-        --verbosity 1 \
-        $PARALLELISM_ARGS
-fi
-
-# ********************************************************
-echo "Beginning production run"
-if [ -f "$SCRIPT_DIR/train.multiallelic.csv" ]; then
-    echo "Using existing multiallelic train data."
-    cp "$SCRIPT_DIR/train.multiallelic.csv" .
-else
-    time python make_multiallelic_training_data.py \
-        --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
-        --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
-        --decoys-per-hit 1 \
-        --out train.multiallelic.csv \
-        --alleles "HLA-A*02:20" "HLA-A*02:05"
-fi
-
-# Build the allele list: monoallelic training alleles, alleles seen in the
-# multiallelic data, and any additional alleles (separated by whitespace).
-ALLELE_LIST=$(bzcat "$MONOALLELIC_TRAIN" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
-ALLELE_LIST+=" "$(cat train.multiallelic.csv | cut -f 7 -d , | grep -v hla | uniq | tr ' ' '\n' | sort | uniq)
-ALLELE_LIST+=$(echo " " $(cat $SCRIPT_DIR/additional_alleles.txt | grep -v '#') )
-
-time mhcflurry-multiallelic-refinement \
-    --monoallelic-data "$MONOALLELIC_TRAIN" \
-    --multiallelic-data train.multiallelic.csv \
-    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
-    --hyperparameters hyperparameters.yaml \
-    --out-affinity-predictor-dir $(pwd)/models.affinity \
-    --out-presentation-predictor-dir $(pwd)/models.presentation \
-    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-    --only-alleles-with-mass-spec \
-    $PARALLELISM_ARGS
-
-time mhcflurry-calibrate-percentile-ranks \
-    --models-dir $(pwd)/models.affinity \
-    --match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
-    --motif-summary \
-    --num-peptides-per-length 100000 \
-    --allele $ALLELE_LIST \
-    --verbosity 1 \
-    $PARALLELISM_ARGS
-
-echo "Done training."
-
-#rm train.multiallelic.*
-
-cp $SCRIPT_ABSOLUTE_PATH .
-bzip2 -f "$LOG"
-for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
-RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
-tar -cjf "$RESULT" *
-echo "Created archive: $RESULT"
diff --git a/downloads-generation/models_class1_pan_refined/additional_alleles.txt b/downloads-generation/models_class1_pan_refined/additional_alleles.txt
deleted file mode 120000
index 1ad4ba7ac3f34cc1d9acad10c4f0aba0b0cf2ae1..0000000000000000000000000000000000000000
--- a/downloads-generation/models_class1_pan_refined/additional_alleles.txt
+++ /dev/null
@@ -1 +0,0 @@
-../models_class1_pan/additional_alleles.txt
\ No newline at end of file
diff --git a/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf
deleted file mode 120000
index 1860a77b2cb5f01b1aa6df7c7acecd142e69343b..0000000000000000000000000000000000000000
--- a/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf
+++ /dev/null
@@ -1 +0,0 @@
-../models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf
\ No newline at end of file
diff --git a/downloads-generation/models_class1_pan_refined/hyperparameters.yaml b/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
deleted file mode 100644
index 0501a4e176efc82555c3e6589c42eaaba16b2b9f..0000000000000000000000000000000000000000
--- a/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-#########################
-# Batch generation
-#########################
-batch_generator: multiallelic_mass_spec
-batch_generator_validation_split: 0.1
-batch_generator_batch_size: 1024
-batch_generator_affinity_fraction: 0.5
-max_epochs: 500
-random_negative_rate: 1.0
-random_negative_constant: 25
-learning_rate: 0.001
-patience: 5
-min_delta: 0.0
-loss_multiallelic_mass_spec_multiplier: 10
diff --git a/downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py b/downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py
deleted file mode 100644
index 4360a215017c1fa6b7a190435529bedcb36bde7c..0000000000000000000000000000000000000000
--- a/downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py
+++ /dev/null
@@ -1,184 +0,0 @@
-"""
-Make multiallelic training data by selecting decoys, etc.
-""" -import sys -import argparse -import os -import json -import collections -from six.moves import StringIO - -import pandas -import tqdm - -import mhcnames - - -parser = argparse.ArgumentParser(usage=__doc__) - -parser.add_argument( - "--hits", - metavar="CSV", - required=True, - help="Multiallelic mass spec") -parser.add_argument( - "--expression", - metavar="CSV", - required=True, - help="Expression data") -parser.add_argument( - "--decoys-per-hit", - type=int, - default=None, - help="If not specified will use all possible decoys") -parser.add_argument( - "--exclude-contig", - help="Exclude entries annotated to the given contig") -parser.add_argument( - "--out", - metavar="CSV", - required=True, - help="File to write") -parser.add_argument( - "--alleles", - nargs="+", - help="Include only the specified alleles") - - -def run(): - args = parser.parse_args(sys.argv[1:]) - hit_df = pandas.read_csv(args.hits) - expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0) - hit_df["alleles"] = hit_df.hla.str.split() - - hit_df = hit_df.loc[ - (hit_df.mhc_class == "I") & - (hit_df.peptide.str.len() <= 15) & - (hit_df.peptide.str.len() >= 7) & - (~hit_df.protein_ensembl.isnull()) - ] - if args.exclude_contig: - new_hit_df = hit_df.loc[ - hit_df.protein_primary_ensembl_contig.astype(str) != - args.exclude_contig - ] - print( - "Excluding contig", - args.exclude_contig, - "reduced dataset from", - len(hit_df), - "to", - len(new_hit_df)) - hit_df = new_hit_df.copy() - if args.alleles: - filter_alleles = set(args.alleles) - new_hit_df = hit_df.loc[ - hit_df.alleles.map( - lambda a: len(set(a).intersection(filter_alleles)) > 0) - ] - print( - "Selecting alleles", - args.alleles, - "reduced dataset from", - len(hit_df), - "to", - len(new_hit_df)) - hit_df = new_hit_df.copy() - - sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id") - grouped = hit_df.groupby("sample_id").nunique() - for col in sample_table.columns: - if (grouped[col] > 1).any(): - del sample_table[col] - sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique() - - experiment_to_decoys = {} - for sample_id, row in sample_table.iterrows(): - decoys = sample_table.loc[ - sample_table.alleles.map( - lambda a: not set(a).intersection(set(row.alleles))) - ].index.tolist() - experiment_to_decoys[sample_id] = decoys - experiment_to_decoys = pandas.Series(experiment_to_decoys) - sample_table["decoys"] = experiment_to_decoys - - print("Samples:") - print(sample_table) - - # Add a column to hit_df giving expression value for that sample and that gene - print("Annotating expression.") - hit_df["tpm"] = [ - expression_df.reindex( - row.protein_ensembl.split())[row.expression_dataset].sum() - for _, row in tqdm.tqdm( - hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000) - ] - - print("Selecting max-expression transcripts for each hit.") - - # Discard hits except those that have max expression for each hit_id - max_gene_hit_df = hit_df.loc[ - hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max()) - ].sample(frac=1.0).drop_duplicates("hit_id") - - # Columns are accession, peptide, sample_id, hit - result_df = [] - - columns_to_keep = [ - "hit_id", - "protein_accession", - "protein_primary_ensembl_contig", - "peptide", - "sample_id" - ] - - print("Selecting decoys.") - for sample_id, sub_df in tqdm.tqdm( - max_gene_hit_df.groupby("sample_id"), - total=max_gene_hit_df.sample_id.nunique()): - result_df.append(sub_df[columns_to_keep].copy()) - result_df[-1]["hit"] = 1 - - 
-        possible_decoys = hit_df.loc[
-            (hit_df.sample_id.isin(sample_table.loc[sample_id].decoys)) &
-            (~hit_df.peptide.isin(sub_df.peptide.unique())) &
-            (hit_df.protein_accession.isin(sub_df.protein_accession.unique()))
-        ].drop_duplicates("peptide")
-        if not args.decoys_per_hit:
-            selected_decoys = possible_decoys[columns_to_keep].copy()
-        elif len(possible_decoys) < len(sub_df) * args.decoys_per_hit:
-            print(
-                "Insufficient decoys",
-                len(possible_decoys),
-                len(sub_df),
-                args.decoys_per_hit)
-            selected_decoys = possible_decoys[columns_to_keep].copy()
-        else:
-            selected_decoys = possible_decoys.sample(
-                n=len(sub_df) * args.decoys_per_hit)[columns_to_keep].copy()
-        result_df.append(selected_decoys)
-        result_df[-1]["hit"] = 0
-        result_df[-1]["sample_id"] = sample_id
-
-    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
-    result_df["hla"] = result_df.sample_id.map(sample_table.hla)
-
-    print(result_df)
-    print("Counts:")
-    print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())
-
-    print("Hit counts:")
-    print(
-        result_df.loc[
-            result_df.hit == 1
-        ].groupby("sample_id").hit.count().sort_values())
-
-    print("Hit rates:")
-    print(result_df.groupby("sample_id").hit.mean().sort_values())
-
-    result_df.to_csv(args.out, index=False)
-    print("Wrote: ", args.out)
-
-
-if __name__ == '__main__':
-    run()
diff --git a/mhcflurry/train_presentation_models_command.py b/mhcflurry/train_presentation_models_command.py
index ca1ac3490b202a3d4df3d4fc1c2e8a0054e20f25..884350aebda4c81d7a5aa4a5efee03d07c4e69e6 100644
--- a/mhcflurry/train_presentation_models_command.py
+++ b/mhcflurry/train_presentation_models_command.py
@@ -38,12 +38,12 @@ parser.add_argument(
     "--cleavage-predictor-with-flanks",
     metavar="DIR",
     required=True,
-    help="Cleavage predictor with flanking")
+    help="Cleavage predictor with flanks")
 parser.add_argument(
     "--cleavage-predictor-without-flanks",
     metavar="DIR",
     required=True,
-    help="Cleavage predictor without flanking")
+    help="Cleavage predictor without flanks")
 parser.add_argument(
     "--verbosity",
     type=int,
@@ -107,9 +107,9 @@ def main(args):
     affinity_predictor = Class1AffinityPredictor.load(args.affinity_predictor)

     cleavage_predictor_with_flanks = Class1CleavagePredictor.load(
-        args.cleavage_predictor_with_flanking)
+        args.cleavage_predictor_with_flanks)
     cleavage_predictor_without_flanks = Class1CleavagePredictor.load(
-        args.cleavage_predictor_without_flanking)
+        args.cleavage_predictor_without_flanks)

     predictor = Class1PresentationPredictor(
         affinity_predictor=affinity_predictor,
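
Note on the train_presentation_models_command.py hunks: these are more than a help-string cleanup. argparse derives each option's attribute name from its option string (dashes become underscores), so "--cleavage-predictor-with-flanks" is stored as args.cleavage_predictor_with_flanks, and the old reads of args.cleavage_predictor_with_flanking would have raised AttributeError at runtime. A minimal standalone sketch of this argparse behavior (hypothetical path, not mhcflurry code):

    import argparse

    # argparse turns "--cleavage-predictor-with-flanks" into the attribute
    # name cleavage_predictor_with_flanks (dashes -> underscores).
    parser = argparse.ArgumentParser()
    parser.add_argument("--cleavage-predictor-with-flanks", metavar="DIR")

    args = parser.parse_args(["--cleavage-predictor-with-flanks", "/tmp/models"])
    print(args.cleavage_predictor_with_flanks)   # prints: /tmp/models
    # args.cleavage_predictor_with_flanking      # would raise AttributeError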