Commit 1793fd87 authored by Tim O'Donnell

fix

parent e55afc59
GENERATE.sh (invoked as: bash GENERATE.sh cluster)
#!/bin/bash
#
# Uses an HPC cluster (the Mount Sinai chimera cluster, which uses the LSF job
# scheduler). This would need to be modified for other sites.
#
# Usage: GENERATE.sh <local|cluster>
#
set -e
set -x
DOWNLOAD_NAME=models_class1_pan_refined
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
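# All work happens in a per-download scratch directory (under $TMPDIR if set,
# otherwise /tmp); its contents become the final archive at the end.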
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
if [ "$1" != "cluster" ]
then
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
PROCESSORS=$(getconf _NPROCESSORS_ONLN)
echo "Detected processors: $PROCESSORS"
if [ "$GPUS" -eq "0" ]; then
NUM_JOBS=${NUM_JOBS-1}
else
NUM_JOBS=${NUM_JOBS-$GPUS}
fi
echo "Num jobs: $NUM_JOBS"
PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
else
PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf"
fi
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
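# (tee via process substitution copies each stream to both the terminal and
# the log file.)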
LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
exec > >(tee -ia "$LOG")
exec 2> >(tee -ia "$LOG" >&2)
# Log some environment info
echo "Invocation: $0 $@"
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
cp $SCRIPT_DIR/make_multiallelic_training_data.py .
cp $SCRIPT_DIR/hyperparameters.yaml .
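# Monoallelic training data shipped with the published pan-allele models; used
# for refinement below and to match amino-acid distributions during calibration.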
MONOALLELIC_TRAIN="$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec/train_data.csv.bz2"
# ********************************************************
# Testing run: refine a single model excluding chromosome 1, presumably so
# chromosome 1 hits can serve as held-out evaluation data.
# (Disabled: change "if false" to "if true" to run it.)
if false; then
echo "Beginning testing run."
time python make_multiallelic_training_data.py \
--hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
--expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
--exclude-contig "1" \
--decoys-per-hit 1 \
--out train.multiallelic.no_chr1.csv
time mhcflurry-multiallelic-refinement \
--monoallelic-data "$MONOALLELIC_TRAIN" \
--multiallelic-data train.multiallelic.no_chr1.csv \
--models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
--hyperparameters hyperparameters.yaml \
--out-affinity-predictor-dir $(pwd)/test_models.no_chr1.affinity \
--out-presentation-predictor-dir $(pwd)/test_models.no_chr1.presentation \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
$PARALLELISM_ARGS
time mhcflurry-calibrate-percentile-ranks \
--models-dir $(pwd)/test_models.no_chr1.affinity \
--match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
--motif-summary \
--num-peptides-per-length 100000 \
--allele "HLA-A*02:01" "HLA-A*02:20" "HLA-C*02:10" \
--verbosity 1 \
$PARALLELISM_ARGS
fi
# ********************************************************
echo "Beginning production run"
if [ -f "$SCRIPT_DIR/train.multiallelic.csv" ]; then
    echo "Using existing multiallelic train data."
    cp "$SCRIPT_DIR/train.multiallelic.csv" .
else
    time python make_multiallelic_training_data.py \
        --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
        --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
        --decoys-per-hit 1 \
        --out train.multiallelic.csv \
        --alleles "HLA-A*02:20" "HLA-A*02:05"
fi
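# Calibrate percentile ranks for the union of: alleles in the monoallelic
# training data (column 1), alleles appearing in the multiallelic samples
# (space-separated hla column 7), and any extras in additional_alleles.txt.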
ALLELE_LIST=$(bzcat "$MONOALLELIC_TRAIN" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
ALLELE_LIST+=" $(cut -f 7 -d , train.multiallelic.csv | grep -v hla | uniq | tr ' ' '\n' | sort | uniq)"
ALLELE_LIST+=" $(grep -v '#' "$SCRIPT_DIR/additional_alleles.txt")"
time mhcflurry-multiallelic-refinement \
    --monoallelic-data "$MONOALLELIC_TRAIN" \
    --multiallelic-data train.multiallelic.csv \
    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
    --hyperparameters hyperparameters.yaml \
    --out-affinity-predictor-dir "$(pwd)/models.affinity" \
    --out-presentation-predictor-dir "$(pwd)/models.presentation" \
    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
    --only-alleles-with-mass-spec \
    $PARALLELISM_ARGS

# $ALLELE_LIST is intentionally unquoted so it expands to one argument per allele.
time mhcflurry-calibrate-percentile-ranks \
    --models-dir "$(pwd)/models.affinity" \
    --match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
    --motif-summary \
    --num-peptides-per-length 100000 \
    --allele $ALLELE_LIST \
    --verbosity 1 \
    $PARALLELISM_ARGS
echo "Done training."
#rm train.multiallelic.*
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 -f "$LOG"
for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
additional_alleles.txt -> ../models_class1_pan/additional_alleles.txt (symlink)
cluster_submit_script_header.mssm_hpc.lsf -> ../models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf (symlink)
hyperparameters.yaml

#########################
# Batch generation
#########################
batch_generator: multiallelic_mass_spec
batch_generator_validation_split: 0.1    # fraction of data held out for early stopping
batch_generator_batch_size: 1024
batch_generator_affinity_fraction: 0.5   # target fraction of each batch drawn from affinity data

#########################
# Training
#########################
max_epochs: 500
random_negative_rate: 1.0      # random negative peptides, proportional term
random_negative_constant: 25   # random negative peptides, constant term
learning_rate: 0.001
patience: 5                    # early stopping: epochs without improvement
min_delta: 0.0                 # minimum improvement that resets patience
loss_multiallelic_mass_spec_multiplier: 10   # weight on the multiallelic mass spec loss term
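These settings are consumed by mhcflurry-multiallelic-refinement via
--hyperparameters. As a quick sanity check the file can be loaded with PyYAML
(a minimal sketch; mhcflurry's own loader may differ):

    import yaml  # pip install pyyaml

    with open("hyperparameters.yaml") as f:
        hyperparameters = yaml.safe_load(f)

    # Prints: multiallelic_mass_spec 500
    print(hyperparameters["batch_generator"], hyperparameters["max_epochs"])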
"""
Make multiallelic training data by selecting decoys, etc.
"""
import sys
import argparse

import pandas
import tqdm
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--hits",
    metavar="CSV",
    required=True,
    help="Multiallelic mass spec hits")
parser.add_argument(
    "--expression",
    metavar="CSV",
    required=True,
    help="Expression data")
parser.add_argument(
    "--decoys-per-hit",
    type=int,
    default=None,
    help="Decoys per hit. If not specified, all possible decoys are used.")
parser.add_argument(
    "--exclude-contig",
    help="Exclude entries annotated to the given contig")
parser.add_argument(
    "--out",
    metavar="CSV",
    required=True,
    help="File to write")
parser.add_argument(
    "--alleles",
    nargs="+",
    help="Include only samples containing at least one of these alleles")
def run():
    args = parser.parse_args(sys.argv[1:])

    hit_df = pandas.read_csv(args.hits)
    expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0)

    # Each sample's HLA type is a space-separated list of alleles.
    hit_df["alleles"] = hit_df.hla.str.split()

    # Keep class I hits of plausible length with an Ensembl protein annotation.
    hit_df = hit_df.loc[
        (hit_df.mhc_class == "I") &
        (hit_df.peptide.str.len() <= 15) &
        (hit_df.peptide.str.len() >= 7) &
        (~hit_df.protein_ensembl.isnull())
    ]
    if args.exclude_contig:
        new_hit_df = hit_df.loc[
            hit_df.protein_primary_ensembl_contig.astype(str) !=
            args.exclude_contig
        ]
        print(
            "Excluding contig",
            args.exclude_contig,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()

    if args.alleles:
        filter_alleles = set(args.alleles)
        new_hit_df = hit_df.loc[
            hit_df.alleles.map(
                lambda a: len(set(a).intersection(filter_alleles)) > 0)
        ]
        print(
            "Selecting alleles",
            args.alleles,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()
    # One row per sample, keeping only the columns that are constant per sample.
    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
    grouped = hit_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique()

    # Decoys for a sample are drawn from samples sharing none of its alleles.
    experiment_to_decoys = {}
    for sample_id, row in sample_table.iterrows():
        decoys = sample_table.loc[
            sample_table.alleles.map(
                lambda a: not set(a).intersection(set(row.alleles)))
        ].index.tolist()
        experiment_to_decoys[sample_id] = decoys
    experiment_to_decoys = pandas.Series(experiment_to_decoys)
    sample_table["decoys"] = experiment_to_decoys

    print("Samples:")
    print(sample_table)
    # Add a column to hit_df giving the expression (TPM) of the hit's gene(s)
    # in that sample's expression dataset.
    print("Annotating expression.")
    hit_df["tpm"] = [
        expression_df.reindex(
            row.protein_ensembl.split())[row.expression_dataset].sum()
        for _, row in tqdm.tqdm(
            hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000)
    ]

    print("Selecting max-expression transcripts for each hit.")
    # For each hit_id, keep only the annotation with maximal expression. The
    # shuffle (sample(frac=1.0)) breaks ties randomly before drop_duplicates
    # keeps the first row per hit_id.
    max_gene_hit_df = hit_df.loc[
        hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max())
    ].sample(frac=1.0).drop_duplicates("hit_id")
    # Output rows keep these columns, plus the hit (1/0) and hla columns
    # added below.
    result_df = []
    columns_to_keep = [
        "hit_id",
        "protein_accession",
        "protein_primary_ensembl_contig",
        "peptide",
        "sample_id"
    ]
print("Selecting decoys.")
for sample_id, sub_df in tqdm.tqdm(
max_gene_hit_df.groupby("sample_id"),
total=max_gene_hit_df.sample_id.nunique()):
result_df.append(sub_df[columns_to_keep].copy())
result_df[-1]["hit"] = 1
possible_decoys = hit_df.loc[
(hit_df.sample_id.isin(sample_table.loc[sample_id].decoys)) &
(~hit_df.peptide.isin(sub_df.peptide.unique())) &
(hit_df.protein_accession.isin(sub_df.protein_accession.unique()))
].drop_duplicates("peptide")
if not args.decoys_per_hit:
selected_decoys = possible_decoys[columns_to_keep].copy()
elif len(possible_decoys) < len(sub_df) * args.decoys_per_hit:
print(
"Insufficient decoys",
len(possible_decoys),
len(sub_df),
args.decoys_per_hit)
selected_decoys = possible_decoys[columns_to_keep].copy()
else:
selected_decoys = possible_decoys.sample(
n=len(sub_df) * args.decoys_per_hit)[columns_to_keep].copy()
result_df.append(selected_decoys)
result_df[-1]["hit"] = 0
result_df[-1]["sample_id"] = sample_id
result_df = pandas.concat(result_df, ignore_index=True, sort=False)
result_df["hla"] = result_df.sample_id.map(sample_table.hla)
print(result_df)
print("Counts:")
print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())
print("Hit counts:")
print(
result_df.loc[
result_df.hit == 1
].groupby("sample_id").hit.count().sort_values())
print("Hit rates:")
print(result_df.groupby("sample_id").hit.mean().sort_values())
result_df.to_csv(args.out, index=False)
print("Wrote: ", args.out)
if __name__ == '__main__':
run()
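The output CSV has one row per hit (hit=1) or decoy (hit=0) peptide, with the
sample's space-separated allele list in the hla column. A quick look at the
result (a minimal sketch, not part of the pipeline):

    import pandas

    df = pandas.read_csv("train.multiallelic.csv")
    print(df.hit.value_counts())                        # decoys (0) vs. hits (1)
    print(df.hla.str.split().str.len().value_counts())  # alleles per sample row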
@@ -38,12 +38,12 @@ parser.add_argument(
     "--cleavage-predictor-with-flanks",
     metavar="DIR",
     required=True,
-    help="Cleavage predictor with flanking")
+    help="Cleavage predictor with flanks")
 parser.add_argument(
     "--cleavage-predictor-without-flanks",
     metavar="DIR",
     required=True,
-    help="Cleavage predictor without flanking")
+    help="Cleavage predictor without flanks")
 parser.add_argument(
     "--verbosity",
     type=int,
@@ -107,9 +107,9 @@ def main(args):
     affinity_predictor = Class1AffinityPredictor.load(args.affinity_predictor)
     cleavage_predictor_with_flanks = Class1CleavagePredictor.load(
-        args.cleavage_predictor_with_flanking)
+        args.cleavage_predictor_with_flanks)
     cleavage_predictor_without_flanks = Class1CleavagePredictor.load(
-        args.cleavage_predictor_without_flanking)
+        args.cleavage_predictor_without_flanks)
     predictor = Class1PresentationPredictor(
         affinity_predictor=affinity_predictor,