Commit 25c387bb authored by Tim O'Donnell

working on multiallelic refinement download

parent 34458035
GENERATE.sh
@@ -3,12 +3,12 @@
 # Uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job
 # scheduler). This would need to be modified for other sites.
 #
-# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete>
+# Usage: GENERATE.sh <local|cluster>
 #
 set -e
 set -x

-DOWNLOAD_NAME=models_class1_pan_variants
+DOWNLOAD_NAME=models_class1_pan_refined
 SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
 SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
 SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
@@ -32,86 +32,46 @@ else
     PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf"
 fi

-mkdir -p "$SCRATCH_DIR"
-if [ "$2" != "continue-incomplete" ]
-then
-    echo "Fresh run"
-    rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
-    mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
-else
-    echo "Continuing incomplete run"
-fi
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
 # Send stdout and stderr to a logfile included with the archive.
-LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
-exec > >(tee -ia "$LOG")
-exec 2> >(tee -ia "$LOG" >&2)
+#LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
+#exec > >(tee -ia "$LOG")
+#exec 2> >(tee -ia "$LOG" >&2)

 # Log some environment info
-echo "Invocation: $0 $@"
-date
-pip freeze
-git status
+#echo "Invocation: $0 $@"
+#date
+#pip freeze
+#git status

 cd $SCRATCH_DIR/$DOWNLOAD_NAME

 export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
if [ "$2" != "continue-incomplete" ] # CAHNGE TO CP
then ln -s $SCRIPT_DIR/make_multiallelic_training_data.py .
cp $SCRIPT_DIR/generate_hyperparameters.production.py .
cp $SCRIPT_DIR/generate_hyperparameters.py .
cp $SCRIPT_DIR/normalize_allele_names.py .
python generate_hyperparameters.production.py > hyperparameters.production.yaml
python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" --out allele_sequences.34mer.csv
fi
for kind in 34mer_sequence single_hidden_no_pretrain no_pretrain time python make_multiallelic_training_data.py \
do --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
CONTINUE_INCOMPLETE_ARGS="" --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ] --out train.multiallelic.csv
then
echo "Will continue existing run: $kind"
CONTINUE_INCOMPLETE_ARGS="--continue-incomplete"
fi
ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" time mhcflurry-multiallelic-refinement \
HYPERPARAMETERS=hyperparameters.$kind.yaml --monoallelic-data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
if [ "$kind" == "34mer_sequence" ] --multiallelic-data train.multiallelic.csv \
then --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec"
ALLELE_SEQUENCES=allele_sequences.34mer.csv --hyperparameters hyperparameters.yaml \
HYPERPARAMETERS=hyperparameters.production.yaml --out-affinity-predictor-dir $(pwd)/models.affinity \
fi --out-presentation-predictor-dir $(pwd)/models.presentation \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
$PARALLELISM_ARGS
echo "Done training."
mhcflurry-class1-train-pan-allele-models \ bzip2 train.multiallelic.csv
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
--allele-sequences "$ALLELE_SEQUENCES" \
--pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
--held-out-measurements-per-allele-fraction-and-max 0.25 100 \
--num-folds 4 \
--hyperparameters "$HYPERPARAMETERS" \
--out-models-dir $(pwd)/models.unselected.${kind} \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
$PARALLELISM_ARGS $CONTINUE_INCOMPLETE_ARGS
done
echo "Done training. Beginning model selection."
for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
do
MODELS_DIR="models.unselected.${kind}"
mhcflurry-class1-select-pan-allele-models \
--data "$MODELS_DIR/train_data.csv.bz2" \
--models-dir "$MODELS_DIR" \
--out-models-dir models.${kind} \
--min-models 2 \
--max-models 8 \
$PARALLELISM_ARGS
ln -s "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/train_data.csv.bz2"
done
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
bzip2 -f "$LOG" bzip2 -f "$LOG"
@@ -120,15 +80,3 @@
 RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
 tar -cjf "$RESULT" *
 echo "Created archive: $RESULT"
-# Split into <2GB chunks for GitHub
-PARTS="${RESULT}.part."
-# Check for pre-existing part files and rename them.
-for i in $(ls "${PARTS}"* )
-do
-    DEST="${i}.OLD.$(date +%s)"
-    echo "WARNING: already exists: $i . Moving to $DEST"
-    mv $i $DEST
-done
-split -b 2000M "$RESULT" "$PARTS"
-echo "Split into parts:"
-ls -lh "${PARTS}"*
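
For a quick sanity check of what the new pipeline writes, a minimal sketch in Python (assuming a completed run, with train.multiallelic.csv still in the working directory; the columns are those produced by make_multiallelic_training_data.py, shown next):

import pandas

# train.multiallelic.csv is written into $SCRATCH_DIR/$DOWNLOAD_NAME by the
# script above (before the bzip2 step).
df = pandas.read_csv("train.multiallelic.csv")

# One row per peptide per sample: hit == 1 for mass spec hits, 0 for decoys.
print(df.groupby(["sample_id", "hit"]).peptide.nunique())
print(df.groupby("sample_id").hit.mean().sort_values())  # per-sample hit rate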
"""
Make multiallelic training data by selecting decoys, etc.
"""
import sys
import argparse
import os
import json
import collections
from six.moves import StringIO
import pandas
import tqdm
import mhcnames
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--hits",
    metavar="CSV",
    required=True,
    help="Multiallelic mass spec hits")
parser.add_argument(
    "--expression",
    metavar="CSV",
    required=True,
    help="Expression data")
parser.add_argument(
    "--decoys-per-hit",
    type=int,
    default=None,
    help="If not specified, will use all possible decoys")
parser.add_argument(
    "--out",
    metavar="CSV",
    required=True,
    help="File to write")
def run():
    args = parser.parse_args(sys.argv[1:])

    hit_df = pandas.read_csv(args.hits)
    expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0)

    # Restrict to MHC class I hits of plausible length (7-15 residues) that
    # have an Ensembl protein annotation.
    hit_df = hit_df.loc[
        (hit_df.mhc_class == "I") &
        (hit_df.peptide.str.len() <= 15) &
        (hit_df.peptide.str.len() >= 7) &
        (~hit_df.protein_ensembl.isnull())
    ]
    hit_df["alleles"] = hit_df.hla.str.split()

    # One row per sample, keeping only the columns that are constant within
    # each sample.
    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
    grouped = hit_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique()
    experiment_to_decoys = {}
    for sample_id, row in sample_table.iterrows():
        decoys = sample_table.loc[
            sample_table.alleles.map(
                lambda a: not set(a).intersection(set(row.alleles)))
        ].index.tolist()
        experiment_to_decoys[sample_id] = decoys
    experiment_to_decoys = pandas.Series(experiment_to_decoys)
    sample_table["decoys"] = experiment_to_decoys

    print("Samples:")
    print(sample_table)
    # Add a column to hit_df giving the expression value for each hit's gene
    # in that sample's expression dataset.
    print("Annotating expression.")
    hit_df["tpm"] = [
        expression_df.reindex(
            row.protein_ensembl.split())[row.expression_dataset].sum()
        for _, row in tqdm.tqdm(
            hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000)
    ]

    print("Selecting max-expression transcripts for each hit.")
    # For each hit_id, keep only the rows with maximal expression; shuffling
    # first (sample(frac=1.0)) makes drop_duplicates break ties at random.
    max_gene_hit_df = hit_df.loc[
        hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max())
    ].sample(frac=1.0).drop_duplicates("hit_id")
    # Accumulate per-sample hit and decoy DataFrames, keeping these columns
    # (the "hit" label and, for decoys, the new sample_id are added below).
    result_df = []
    columns_to_keep = [
        "hit_id",
        "protein_accession",
        "protein_primary_ensembl_contig",
        "peptide",
        "sample_id"
    ]
    print("Selecting decoys.")
    for sample_id, sub_df in tqdm.tqdm(
            max_gene_hit_df.groupby("sample_id"),
            total=max_gene_hit_df.sample_id.nunique()):
        result_df.append(sub_df[columns_to_keep].copy())
        result_df[-1]["hit"] = 1

        # Candidate decoys: hits from this sample's allele-disjoint samples,
        # excluding peptides this sample itself detected, and restricted to
        # proteins this sample detected.
        possible_decoys = hit_df.loc[
            (hit_df.sample_id.isin(sample_table.loc[sample_id].decoys)) &
            (~hit_df.peptide.isin(sub_df.peptide.unique())) &
            (hit_df.protein_accession.isin(sub_df.protein_accession.unique()))
        ].drop_duplicates("peptide")

        if not args.decoys_per_hit:
            selected_decoys = possible_decoys[columns_to_keep].copy()
        elif len(possible_decoys) < len(sub_df) * args.decoys_per_hit:
            print(
                "Insufficient decoys",
                len(possible_decoys),
                len(sub_df),
                args.decoys_per_hit)
            selected_decoys = possible_decoys[columns_to_keep].copy()
        else:
            selected_decoys = possible_decoys.sample(
                n=len(sub_df) * args.decoys_per_hit)[columns_to_keep].copy()
        result_df.append(selected_decoys)
        result_df[-1]["hit"] = 0
        result_df[-1]["sample_id"] = sample_id

    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
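
    # Attach each sample's space-separated HLA allele string; the refinement
    # step uses this to know which alleles each row's sample expresses.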
result_df["hla"] = result_df.sample_id.map(sample_table.hla)
print(result_df)
print("Counts:")
print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())
print("Hit counts:")
print(
result_df.loc[
result_df.hit == 1
].groupby("sample_id").hit.count().sort_values())
print("Hit rates:")
print(result_df.groupby("sample_id").hit.mean().sort_values())
result_df.to_csv(args.out, index=False)
print("Wrote: ", args.out)
if __name__ == '__main__':
run()
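
A small validation sketch for the file this script writes (illustrative, not part of the commit): the hla column carries each sample's space-separated allele string.

import pandas

df = pandas.read_csv("train.multiallelic.csv")
# Every row of a sample shares one HLA string; split it into alleles.
per_sample_alleles = df.groupby("sample_id").hla.first().str.split()
print(per_sample_alleles.map(len).describe())  # class I samples carry up to 6 alleles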
setup.py
@@ -83,6 +83,8 @@ if __name__ == '__main__':
         'mhcflurry.select_pan_allele_models_command:run',
         'mhcflurry-calibrate-percentile-ranks = '
         'mhcflurry.calibrate_percentile_ranks_command:run',
+        'mhcflurry-multiallelic-refinement = '
+        'mhcflurry.multiallelic_refinement_command:run',
         '_mhcflurry-cluster-worker-entry-point = '
         'mhcflurry.cluster_parallelism:worker_entry_point',
     ]
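
For reference, the console script that setuptools generates from the new entry point behaves roughly like this (illustrative; the multiallelic_refinement_command module is added separately in this branch):

import sys

from mhcflurry.multiallelic_refinement_command import run

if __name__ == '__main__':
    sys.exit(run())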