Commit 25c387bb authored by Tim O'Donnell

working on multiallelic refinement download

parent 34458035
@@ -3,12 +3,12 @@
# Uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job
# scheduler). This would need to be modified for other sites.
#
# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete>
# Usage: GENERATE.sh <local|cluster>
#
set -e
set -x
DOWNLOAD_NAME=models_class1_pan_variants
DOWNLOAD_NAME=models_class1_pan_refined
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
@@ -32,86 +32,46 @@ else
PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf"
fi
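# In cluster mode, the commands below submit each worker as an LSF job via
# bsub (using the GPU submit-script header above), stage worker results under
# $HOME/mhcflurry-scratch, and retry failed jobs up to 3 times.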
mkdir -p "$SCRATCH_DIR"
if [ "$2" != "continue-incomplete" ]
then
echo "Fresh run"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
else
echo "Continuing incomplete run"
fi
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
exec > >(tee -ia "$LOG")
exec 2> >(tee -ia "$LOG" >&2)
#LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
#exec > >(tee -ia "$LOG")
#exec 2> >(tee -ia "$LOG" >&2)
# Log some environment info
echo "Invocation: $0 $@"
date
pip freeze
git status
#echo "Invocation: $0 $@"
#date
#pip freeze
#git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
if [ "$2" != "continue-incomplete" ]
then
cp $SCRIPT_DIR/generate_hyperparameters.production.py .
cp $SCRIPT_DIR/generate_hyperparameters.py .
cp $SCRIPT_DIR/normalize_allele_names.py .
python generate_hyperparameters.production.py > hyperparameters.production.yaml
python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" --out allele_sequences.34mer.csv
fi
# CHANGE TO CP
ln -s $SCRIPT_DIR/make_multiallelic_training_data.py .
for kind in 34mer_sequence single_hidden_no_pretrain no_pretrain
do
CONTINUE_INCOMPLETE_ARGS=""
if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ]
then
echo "Will continue existing run: $kind"
CONTINUE_INCOMPLETE_ARGS="--continue-incomplete"
fi
time python make_multiallelic_training_data.py \
--hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
--expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
--out train.multiallelic.csv
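# train.multiallelic.csv pairs the multiallelic mass spec hits with sampled
# decoy peptides (hit=1/0), each labeled with the originating sample's HLA
# alleles; see make_multiallelic_training_data.py below for details.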
ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv"
HYPERPARAMETERS=hyperparameters.$kind.yaml
if [ "$kind" == "34mer_sequence" ]
then
ALLELE_SEQUENCES=allele_sequences.34mer.csv
HYPERPARAMETERS=hyperparameters.production.yaml
fi
time mhcflurry-multiallelic-refinement \
--monoallelic-data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
--multiallelic-data train.multiallelic.csv \
--models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
--hyperparameters hyperparameters.yaml \
--out-affinity-predictor-dir $(pwd)/models.affinity \
--out-presentation-predictor-dir $(pwd)/models.presentation \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
$PARALLELISM_ARGS
echo "Done training."
mhcflurry-class1-train-pan-allele-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
--allele-sequences "$ALLELE_SEQUENCES" \
--pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
--held-out-measurements-per-allele-fraction-and-max 0.25 100 \
--num-folds 4 \
--hyperparameters "$HYPERPARAMETERS" \
--out-models-dir $(pwd)/models.unselected.${kind} \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
$PARALLELISM_ARGS $CONTINUE_INCOMPLETE_ARGS
done
echo "Done training. Beginning model selection."
for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
do
MODELS_DIR="models.unselected.${kind}"
mhcflurry-class1-select-pan-allele-models \
--data "$MODELS_DIR/train_data.csv.bz2" \
--models-dir "$MODELS_DIR" \
--out-models-dir models.${kind} \
--min-models 2 \
--max-models 8 \
$PARALLELISM_ARGS
ln -s "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/train_data.csv.bz2"
done
bzip2 train.multiallelic.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 -f "$LOG"
@@ -120,15 +80,3 @@ RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
# Split into <2GB chunks for GitHub
PARTS="${RESULT}.part."
# Check for pre-existing part files and rename them.
for i in $(ls "${PARTS}"* )
do
DEST="${i}.OLD.$(date +%s)"
echo "WARNING: already exists: $i . Moving to $DEST"
mv $i $DEST
done
split -b 2000M "$RESULT" "$PARTS"
echo "Split into parts:"
ls -lh "${PARTS}"*
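# A downloaded archive split this way can be recombined with standard tools,
# e.g. (illustrative file name; the date suffix is a placeholder):
#   cat models_class1_pan_refined.YYYYMMDD.tar.bz2.part.* > models_class1_pan_refined.YYYYMMDD.tar.bz2
#   tar -xjf models_class1_pan_refined.YYYYMMDD.tar.bz2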
"""
Make multiallelic training data by selecting decoys, etc.
"""
import sys
import argparse
import os
import json
import collections
from six.moves import StringIO
import pandas
import tqdm
import mhcnames
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--hits",
    metavar="CSV",
    required=True,
    help="Multiallelic mass spec")
parser.add_argument(
    "--expression",
    metavar="CSV",
    required=True,
    help="Expression data")
parser.add_argument(
    "--decoys-per-hit",
    type=int,
    default=None,
    help="If not specified will use all possible decoys")
parser.add_argument(
    "--out",
    metavar="CSV",
    required=True,
    help="File to write")


def run():
    args = parser.parse_args(sys.argv[1:])

    hit_df = pandas.read_csv(args.hits)
    expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0)

    hit_df = hit_df.loc[
        (hit_df.mhc_class == "I") &
        (hit_df.peptide.str.len() <= 15) &
        (hit_df.peptide.str.len() >= 7) &
        (~hit_df.protein_ensembl.isnull())
    ]
    hit_df["alleles"] = hit_df.hla.str.split()

    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
    grouped = hit_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique()

    experiment_to_decoys = {}
    for sample_id, row in sample_table.iterrows():
        decoys = sample_table.loc[
            sample_table.alleles.map(
                lambda a: not set(a).intersection(set(row.alleles)))
        ].index.tolist()
        experiment_to_decoys[sample_id] = decoys
    experiment_to_decoys = pandas.Series(experiment_to_decoys)
    sample_table["decoys"] = experiment_to_decoys
print("Samples:")
print(sample_table)
# Add a column to hit_df giving expression value for that sample and that gene
print("Annotating expression.")
hit_df["tpm"] = [
expression_df.reindex(
row.protein_ensembl.split())[row.expression_dataset].sum()
for _, row in tqdm.tqdm(
hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000)
]
print("Selecting max-expression transcripts for each hit.")
# Discard hits except those that have max expression for each hit_id
max_gene_hit_df = hit_df.loc[
hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max())
].sample(frac=1.0).drop_duplicates("hit_id")
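    # max_gene_hit_df keeps exactly one row per hit_id: only rows whose tpm
    # equals the per-hit maximum survive, and the shuffle via sample(frac=1.0)
    # followed by drop_duplicates breaks any remaining ties at random.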
    # Columns are accession, peptide, sample_id, hit
    result_df = []
    columns_to_keep = [
        "hit_id",
        "protein_accession",
        "protein_primary_ensembl_contig",
        "peptide",
        "sample_id"
    ]

    print("Selecting decoys.")
    for sample_id, sub_df in tqdm.tqdm(
            max_gene_hit_df.groupby("sample_id"),
            total=max_gene_hit_df.sample_id.nunique()):
        result_df.append(sub_df[columns_to_keep].copy())
        result_df[-1]["hit"] = 1

        possible_decoys = hit_df.loc[
            (hit_df.sample_id.isin(sample_table.loc[sample_id].decoys)) &
            (~hit_df.peptide.isin(sub_df.peptide.unique())) &
            (hit_df.protein_accession.isin(sub_df.protein_accession.unique()))
        ].drop_duplicates("peptide")
        if not args.decoys_per_hit:
            selected_decoys = possible_decoys[columns_to_keep].copy()
        elif len(possible_decoys) < len(sub_df) * args.decoys_per_hit:
            print(
                "Insufficient decoys",
                len(possible_decoys),
                len(sub_df),
                args.decoys_per_hit)
            selected_decoys = possible_decoys[columns_to_keep].copy()
        else:
            selected_decoys = possible_decoys.sample(
                n=len(sub_df) * args.decoys_per_hit)[columns_to_keep].copy()
        result_df.append(selected_decoys)
        result_df[-1]["hit"] = 0
        result_df[-1]["sample_id"] = sample_id

    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
    result_df["hla"] = result_df.sample_id.map(sample_table.hla)
    print(result_df)

    print("Counts:")
    print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())

    print("Hit counts:")
    print(
        result_df.loc[
            result_df.hit == 1
        ].groupby("sample_id").hit.count().sort_values())

    print("Hit rates:")
    print(result_df.groupby("sample_id").hit.mean().sort_values())

    result_df.to_csv(args.out, index=False)
    print("Wrote: ", args.out)


if __name__ == '__main__':
    run()
@@ -83,6 +83,8 @@ if __name__ == '__main__':
'mhcflurry.select_pan_allele_models_command:run',
'mhcflurry-calibrate-percentile-ranks = '
'mhcflurry.calibrate_percentile_ranks_command:run',
'mhcflurry-multiallelic-refinement = '
'mhcflurry.multiallelic_refinement_command:run',
'_mhcflurry-cluster-worker-entry-point = '
'mhcflurry.cluster_parallelism:worker_entry_point',
]
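These are setuptools console_scripts entries: once the package is installed (for example with pip install -e .), the new mhcflurry-multiallelic-refinement command becomes available on the PATH and dispatches to mhcflurry.multiallelic_refinement_command:run, matching the invocation in GENERATE.sh above.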