From 25c387bb33ca4f5111a86ca566c129e8c94316ee Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 7 Dec 2019 15:44:24 -0500
Subject: [PATCH] working on multiallelic refinement download

---
 .../models_class1_pan_refined/GENERATE.sh     | 110 ++++---------
 .../make_multiallelic_training_data.py        | 148 ++++++++++++++++++
 setup.py                                      |   2 +
 3 files changed, 179 insertions(+), 81 deletions(-)
 create mode 100644 downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py

diff --git a/downloads-generation/models_class1_pan_refined/GENERATE.sh b/downloads-generation/models_class1_pan_refined/GENERATE.sh
index 4fe89de6..6591e853 100755
--- a/downloads-generation/models_class1_pan_refined/GENERATE.sh
+++ b/downloads-generation/models_class1_pan_refined/GENERATE.sh
@@ -3,12 +3,12 @@
 # Uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job
 # scheduler). This would need to be modified for other sites.
 #
-# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete>
+# Usage: GENERATE.sh <local|cluster>
 #
 set -e
 set -x
 
-DOWNLOAD_NAME=models_class1_pan_variants
+DOWNLOAD_NAME=models_class1_pan_refined
 SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
 SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
 SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
@@ -32,86 +32,46 @@ else
     PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf"
 fi
 
-mkdir -p "$SCRATCH_DIR"
-if [ "$2" != "continue-incomplete" ]
-then
-    echo "Fresh run"
-    rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
-    mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
-else
-    echo "Continuing incomplete run"
-fi
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
-exec >  >(tee -ia "$LOG")
-exec 2> >(tee -ia "$LOG" >&2)
+#LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
+#exec >  >(tee -ia "$LOG")
+#exec 2> >(tee -ia "$LOG" >&2)
 
 # Log some environment info
-echo "Invocation: $0 $@"
-date
-pip freeze
-git status
+#echo "Invocation: $0 $@"
+#date
+#pip freeze
+#git status
 
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
 
-if [ "$2" != "continue-incomplete" ]
-then
-    cp $SCRIPT_DIR/generate_hyperparameters.production.py .
-    cp $SCRIPT_DIR/generate_hyperparameters.py .
-    cp $SCRIPT_DIR/normalize_allele_names.py .
-    python generate_hyperparameters.production.py > hyperparameters.production.yaml
-    python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
-    python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
-    python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv"  --out allele_sequences.34mer.csv
-fi
+# Copy (not symlink) the script so its contents are included in the archive.
+cp $SCRIPT_DIR/make_multiallelic_training_data.py .
 
-for kind in 34mer_sequence single_hidden_no_pretrain no_pretrain 
-do
-    CONTINUE_INCOMPLETE_ARGS=""
-    if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ]
-    then
-        echo "Will continue existing run: $kind"
-        CONTINUE_INCOMPLETE_ARGS="--continue-incomplete"
-    fi
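+# Build the multiallelic training set: mass spec hits plus selected decoys.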
+time python make_multiallelic_training_data.py \
+    --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
+    --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
+    --out train.multiallelic.csv
 
-    ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv"
-    HYPERPARAMETERS=hyperparameters.$kind.yaml
-    if [ "$kind" == "34mer_sequence" ]
-    then
-        ALLELE_SEQUENCES=allele_sequences.34mer.csv
-        HYPERPARAMETERS=hyperparameters.production.yaml
-    fi
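+# Refine the pan-allele affinity models on the multiallelic data and write
+# out affinity and presentation predictors.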
+time mhcflurry-multiallelic-refinement \
+    --monoallelic-data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
+    --multiallelic-data train.multiallelic.csv \
+    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
+    --hyperparameters hyperparameters.yaml \
+    --out-affinity-predictor-dir $(pwd)/models.affinity \
+    --out-presentation-predictor-dir $(pwd)/models.presentation \
+    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+    $PARALLELISM_ARGS
+
+echo "Done training."
 
-    mhcflurry-class1-train-pan-allele-models \
-        --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
-        --allele-sequences "$ALLELE_SEQUENCES" \
-        --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
-        --held-out-measurements-per-allele-fraction-and-max 0.25 100 \
-        --num-folds 4 \
-        --hyperparameters "$HYPERPARAMETERS" \
-        --out-models-dir $(pwd)/models.unselected.${kind} \
-        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-        $PARALLELISM_ARGS $CONTINUE_INCOMPLETE_ARGS
-done
-
-echo "Done training. Beginning model selection."
-
-for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
-do
-    MODELS_DIR="models.unselected.${kind}"
-    mhcflurry-class1-select-pan-allele-models \
-        --data "$MODELS_DIR/train_data.csv.bz2" \
-        --models-dir "$MODELS_DIR" \
-        --out-models-dir models.${kind} \
-        --min-models 2 \
-        --max-models 8 \
-        $PARALLELISM_ARGS
-    ln -s "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/train_data.csv.bz2"
-done
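+# Compress the generated training data before archiving.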
+bzip2 train.multiallelic.csv
 
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 -f "$LOG"
@@ -120,15 +80,3 @@ RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
 tar -cjf "$RESULT" *
 echo "Created archive: $RESULT"
 
-# Split into <2GB chunks for GitHub
-PARTS="${RESULT}.part."
-# Check for pre-existing part files and rename them.
-for i in $(ls "${PARTS}"* )
-do
-    DEST="${i}.OLD.$(date +%s)"
-    echo "WARNING: already exists: $i . Moving to $DEST"
-    mv $i $DEST
-done
-split -b 2000M "$RESULT" "$PARTS"
-echo "Split into parts:"
-ls -lh "${PARTS}"*
diff --git a/downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py b/downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py
new file mode 100644
index 00000000..635fbbae
--- /dev/null
+++ b/downloads-generation/models_class1_pan_refined/make_multiallelic_training_data.py
@@ -0,0 +1,148 @@
+"""
+Make multiallelic training data by selecting decoys, etc.
+"""
+import sys
+import argparse
+
+import pandas
+import tqdm
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "--hits",
+    metavar="CSV",
+    required=True,
+    help="Multiallelic mass spec")
+parser.add_argument(
+    "--expression",
+    metavar="CSV",
+    required=True,
+    help="Expression data")
+parser.add_argument(
+    "--decoys-per-hit",
+    type=int,
+    default=None,
+    help="If not specified will use all possible decoys")
+parser.add_argument(
+    "--out",
+    metavar="CSV",
+    required=True,
+    help="File to write")
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+    hit_df = pandas.read_csv(args.hits)
+    expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0)
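+    # Restrict to MHC class I hits of length 7-15 with a known source protein.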
+    hit_df = hit_df.loc[
+        (hit_df.mhc_class == "I") &
+        (hit_df.peptide.str.len() <= 15) &
+        (hit_df.peptide.str.len() >= 7) &
+        (~hit_df.protein_ensembl.isnull())
+    ]
+    hit_df["alleles"] = hit_df.hla.str.split()
+
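+    # One row per sample: drop any column that varies within a sample.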
+    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
+    grouped = hit_df.groupby("sample_id").nunique()
+    for col in sample_table.columns:
+        if (grouped[col] > 1).any():
+            del sample_table[col]
+    sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique()
+
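+    # A sample's decoy pool is the set of samples sharing none of its alleles.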
+    experiment_to_decoys = {}
+    for sample_id, row in sample_table.iterrows():
+        decoys = sample_table.loc[
+            sample_table.alleles.map(
+                lambda a: not set(a).intersection(set(row.alleles)))
+        ].index.tolist()
+        experiment_to_decoys[sample_id] = decoys
+    experiment_to_decoys = pandas.Series(experiment_to_decoys)
+    sample_table["decoys"] = experiment_to_decoys
+
+    print("Samples:")
+    print(sample_table)
+
+    # Add a column to hit_df giving expression value for that sample and that gene
+    print("Annotating expression.")
+    hit_df["tpm"] = [
+        expression_df.reindex(
+            row.protein_ensembl.split())[row.expression_dataset].sum()
+        for _, row in tqdm.tqdm(
+            hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000)
+    ]
+
+    print("Selecting max-expression transcripts for each hit.")
+
+    # Discard hits except those that have max expression for each hit_id
+    max_gene_hit_df = hit_df.loc[
+        hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max())
+    ].sample(frac=1.0).drop_duplicates("hit_id")
+
+    # Per-sample frames of hits and decoys; concatenated into one frame below.
+    result_df = []
+
+    columns_to_keep = [
+        "hit_id",
+        "protein_accession",
+        "protein_primary_ensembl_contig",
+        "peptide",
+        "sample_id"
+    ]
+
+    print("Selecting decoys.")
+    for sample_id, sub_df in tqdm.tqdm(
+            max_gene_hit_df.groupby("sample_id"),
+            total=max_gene_hit_df.sample_id.nunique()):
+        result_df.append(sub_df[columns_to_keep].copy())
+        result_df[-1]["hit"] = 1
+
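+        # Candidate decoys: peptides from this sample's decoy samples that are
+        # not hits here but come from the same proteins as this sample's hits.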
+        possible_decoys = hit_df.loc[
+            (hit_df.sample_id.isin(sample_table.loc[sample_id].decoys)) &
+            (~hit_df.peptide.isin(sub_df.peptide.unique())) &
+            (hit_df.protein_accession.isin(sub_df.protein_accession.unique()))
+        ].drop_duplicates("peptide")
+        if not args.decoys_per_hit:
+            selected_decoys = possible_decoys[columns_to_keep].copy()
+        elif len(possible_decoys) < len(sub_df) * args.decoys_per_hit:
+            print(
+                "Insufficient decoys",
+                len(possible_decoys),
+                len(sub_df),
+                args.decoys_per_hit)
+            selected_decoys = possible_decoys[columns_to_keep].copy()
+        else:
+            selected_decoys = possible_decoys.sample(
+                n=len(sub_df) * args.decoys_per_hit)[columns_to_keep].copy()
+        result_df.append(selected_decoys)
+        result_df[-1]["hit"] = 0
+        result_df[-1]["sample_id"] = sample_id
+
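+    # Concatenate per-sample frames and attach each sample's allele list.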
+    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
+    result_df["hla"] = result_df.sample_id.map(sample_table.hla)
+
+    print(result_df)
+    print("Counts:")
+    print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())
+
+    print("Hit counts:")
+    print(
+        result_df.loc[
+            result_df.hit == 1
+        ].groupby("sample_id").hit.count().sort_values())
+
+    print("Hit rates:")
+    print(result_df.groupby("sample_id").hit.mean().sort_values())
+
+    result_df.to_csv(args.out, index=False)
+    print("Wrote: ", args.out)
+
+
+if __name__ == '__main__':
+    run()
diff --git a/setup.py b/setup.py
index 9275775e..8696bf70 100644
--- a/setup.py
+++ b/setup.py
@@ -83,6 +83,8 @@ if __name__ == '__main__':
                     'mhcflurry.select_pan_allele_models_command:run',
                 'mhcflurry-calibrate-percentile-ranks = '
                     'mhcflurry.calibrate_percentile_ranks_command:run',
+                'mhcflurry-multiallelic-refinement = '
+                    'mhcflurry.multiallelic_refinement_command:run',
                 '_mhcflurry-cluster-worker-entry-point = '
                     'mhcflurry.cluster_parallelism:worker_entry_point',
             ]
-- 
GitLab