Commit 1793fd87 authored by Tim O'Donnell

fix

parent e55afc59
GENERATE.sh (invoked as: bash GENERATE.sh cluster)
#!/bin/bash
#
# Uses an HPC cluster (the Mount Sinai chimera cluster, which uses the LSF job
# scheduler). This would need to be modified for other sites.
#
# Usage: GENERATE.sh <local|cluster>
#
set -e
set -x
DOWNLOAD_NAME=models_class1_pan_refined
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
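# All work happens in a per-download scratch directory (under $TMPDIR if set,
# otherwise /tmp); its contents become the final archive at the end.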
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
if [ "$1" != "cluster" ]
then
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
PROCESSORS=$(getconf _NPROCESSORS_ONLN)
echo "Detected processors: $PROCESSORS"
if [ "$GPUS" -eq "0" ]; then
NUM_JOBS=${NUM_JOBS-1}
else
NUM_JOBS=${NUM_JOBS-$GPUS}
fi
echo "Num jobs: $NUM_JOBS"
PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
else
PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf"
fi
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
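# (tee via process substitution copies each stream to both the terminal and
# the log file.)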
LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
exec > >(tee -ia "$LOG")
exec 2> >(tee -ia "$LOG" >&2)
# Log some environment info
echo "Invocation: $0 $@"
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
cp $SCRIPT_DIR/make_multiallelic_training_data.py .
cp $SCRIPT_DIR/hyperparameters.yaml .
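# Monoallelic training data shipped with the published pan-allele models; used
# for refinement below and to match amino-acid distributions during calibration.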
MONOALLELIC_TRAIN="$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec/train_data.csv.bz2"
# ********************************************************
# Testing run: refine a single model excluding chromosome 1, presumably so
# chromosome 1 hits can serve as held-out evaluation data.
# (Disabled: change "if false" to "if true" to run it.)
if false; then
echo "Beginning testing run."
time python make_multiallelic_training_data.py \
--hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
--expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
--exclude-contig "1" \
--decoys-per-hit 1 \
--out train.multiallelic.no_chr1.csv
time mhcflurry-multiallelic-refinement \
--monoallelic-data "$MONOALLELIC_TRAIN" \
--multiallelic-data train.multiallelic.no_chr1.csv \
--models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
--hyperparameters hyperparameters.yaml \
--out-affinity-predictor-dir $(pwd)/test_models.no_chr1.affinity \
--out-presentation-predictor-dir $(pwd)/test_models.no_chr1.presentation \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
$PARALLELISM_ARGS
time mhcflurry-calibrate-percentile-ranks \
--models-dir $(pwd)/test_models.no_chr1.affinity \
--match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
--motif-summary \
--num-peptides-per-length 100000 \
--allele "HLA-A*02:01" "HLA-A*02:20" "HLA-C*02:10" \
--verbosity 1 \
$PARALLELISM_ARGS
fi
# ********************************************************
echo "Beginning production run"
if [ -f "$SCRIPT_DIR/train.multiallelic.csv" ]; then
    echo "Using existing multiallelic train data."
    cp "$SCRIPT_DIR/train.multiallelic.csv" .
else
    time python make_multiallelic_training_data.py \
        --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
        --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
        --decoys-per-hit 1 \
        --out train.multiallelic.csv \
        --alleles "HLA-A*02:20" "HLA-A*02:05"
fi
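# Calibrate percentile ranks for the union of: alleles in the monoallelic
# training data (column 1), alleles appearing in the multiallelic samples
# (space-separated hla column 7), and any extras in additional_alleles.txt.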
ALLELE_LIST=$(bzcat "$MONOALLELIC_TRAIN" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
ALLELE_LIST+=" $(cut -f 7 -d , train.multiallelic.csv | grep -v hla | uniq | tr ' ' '\n' | sort | uniq)"
ALLELE_LIST+=" $(grep -v '#' "$SCRIPT_DIR/additional_alleles.txt")"
time mhcflurry-multiallelic-refinement \
    --monoallelic-data "$MONOALLELIC_TRAIN" \
    --multiallelic-data train.multiallelic.csv \
    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
    --hyperparameters hyperparameters.yaml \
    --out-affinity-predictor-dir "$(pwd)/models.affinity" \
    --out-presentation-predictor-dir "$(pwd)/models.presentation" \
    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
    --only-alleles-with-mass-spec \
    $PARALLELISM_ARGS

# $ALLELE_LIST is intentionally unquoted so it expands to one argument per allele.
time mhcflurry-calibrate-percentile-ranks \
    --models-dir "$(pwd)/models.affinity" \
    --match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
    --motif-summary \
    --num-peptides-per-length 100000 \
    --allele $ALLELE_LIST \
    --verbosity 1 \
    $PARALLELISM_ARGS
echo "Done training."
#rm train.multiallelic.*
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 -f "$LOG"
for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
additional_alleles.txt -> ../models_class1_pan/additional_alleles.txt (symlink)
cluster_submit_script_header.mssm_hpc.lsf -> ../models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf (symlink)
hyperparameters.yaml

#########################
# Batch generation
#########################
batch_generator: multiallelic_mass_spec
batch_generator_validation_split: 0.1    # fraction of data held out for early stopping
batch_generator_batch_size: 1024
batch_generator_affinity_fraction: 0.5   # target fraction of each batch drawn from affinity data

#########################
# Training
#########################
max_epochs: 500
random_negative_rate: 1.0      # random negative peptides, proportional term
random_negative_constant: 25   # random negative peptides, constant term
learning_rate: 0.001
patience: 5                    # early stopping: epochs without improvement
min_delta: 0.0                 # minimum improvement that resets patience
loss_multiallelic_mass_spec_multiplier: 10   # weight on the multiallelic mass spec loss term
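These settings are consumed by mhcflurry-multiallelic-refinement via
--hyperparameters. As a quick sanity check the file can be loaded with PyYAML
(a minimal sketch; mhcflurry's own loader may differ):

    import yaml  # pip install pyyaml

    with open("hyperparameters.yaml") as f:
        hyperparameters = yaml.safe_load(f)

    # Prints: multiallelic_mass_spec 500
    print(hyperparameters["batch_generator"], hyperparameters["max_epochs"])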
"""
Make multiallelic training data by selecting decoys, etc.
"""
import sys
import argparse

import pandas
import tqdm
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--hits",
    metavar="CSV",
    required=True,
    help="Multiallelic mass spec hits")
parser.add_argument(
    "--expression",
    metavar="CSV",
    required=True,
    help="Expression data")
parser.add_argument(
    "--decoys-per-hit",
    type=int,
    default=None,
    help="Decoys per hit. If not specified, all possible decoys are used.")
parser.add_argument(
    "--exclude-contig",
    help="Exclude entries annotated to the given contig")
parser.add_argument(
    "--out",
    metavar="CSV",
    required=True,
    help="File to write")
parser.add_argument(
    "--alleles",
    nargs="+",
    help="Include only samples containing at least one of these alleles")
def run():
    args = parser.parse_args(sys.argv[1:])

    hit_df = pandas.read_csv(args.hits)
    expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0)

    # Each sample's HLA type is a space-separated list of alleles.
    hit_df["alleles"] = hit_df.hla.str.split()

    # Keep class I hits of plausible length with an Ensembl protein annotation.
    hit_df = hit_df.loc[
        (hit_df.mhc_class == "I") &
        (hit_df.peptide.str.len() <= 15) &
        (hit_df.peptide.str.len() >= 7) &
        (~hit_df.protein_ensembl.isnull())
    ]
    if args.exclude_contig:
        new_hit_df = hit_df.loc[
            hit_df.protein_primary_ensembl_contig.astype(str) !=
            args.exclude_contig
        ]
        print(
            "Excluding contig",
            args.exclude_contig,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()

    if args.alleles:
        filter_alleles = set(args.alleles)
        new_hit_df = hit_df.loc[
            hit_df.alleles.map(
                lambda a: len(set(a).intersection(filter_alleles)) > 0)
        ]
        print(
            "Selecting alleles",
            args.alleles,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()
    # One row per sample, keeping only the columns that are constant per sample.
    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
    grouped = hit_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique()

    # Decoys for a sample are drawn from samples sharing none of its alleles.
    experiment_to_decoys = {}
    for sample_id, row in sample_table.iterrows():
        decoys = sample_table.loc[
            sample_table.alleles.map(
                lambda a: not set(a).intersection(set(row.alleles)))
        ].index.tolist()
        experiment_to_decoys[sample_id] = decoys
    experiment_to_decoys = pandas.Series(experiment_to_decoys)
    sample_table["decoys"] = experiment_to_decoys

    print("Samples:")
    print(sample_table)
    # Add a column to hit_df giving the expression (TPM) of the hit's gene(s)
    # in that sample's expression dataset.
    print("Annotating expression.")
    hit_df["tpm"] = [
        expression_df.reindex(
            row.protein_ensembl.split())[row.expression_dataset].sum()
        for _, row in tqdm.tqdm(
            hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000)
    ]

    print("Selecting max-expression transcripts for each hit.")
    # For each hit_id, keep only the annotation with maximal expression. The
    # shuffle (sample(frac=1.0)) breaks ties randomly before drop_duplicates
    # keeps the first row per hit_id.
    max_gene_hit_df = hit_df.loc[
        hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max())
    ].sample(frac=1.0).drop_duplicates("hit_id")
    # Output rows keep these columns, plus the hit (1/0) and hla columns
    # added below.
    result_df = []
    columns_to_keep = [
        "hit_id",
        "protein_accession",
        "protein_primary_ensembl_contig",
        "peptide",
        "sample_id"
    ]
print("Selecting decoys.")
for sample_id, sub_df in tqdm.tqdm(
max_gene_hit_df.groupby("sample_id"),
total=max_gene_hit_df.sample_id.nunique()):
result_df.append(sub_df[columns_to_keep].copy())
result_df[-1]["hit"] = 1
possible_decoys = hit_df.loc[
(hit_df.sample_id.isin(sample_table.loc[sample_id].decoys)) &
(~hit_df.peptide.isin(sub_df.peptide.unique())) &
(hit_df.protein_accession.isin(sub_df.protein_accession.unique()))
].drop_duplicates("peptide")
if not args.decoys_per_hit:
selected_decoys = possible_decoys[columns_to_keep].copy()
elif len(possible_decoys) < len(sub_df) * args.decoys_per_hit:
print(
"Insufficient decoys",
len(possible_decoys),
len(sub_df),
args.decoys_per_hit)
selected_decoys = possible_decoys[columns_to_keep].copy()
else:
selected_decoys = possible_decoys.sample(
n=len(sub_df) * args.decoys_per_hit)[columns_to_keep].copy()
result_df.append(selected_decoys)
result_df[-1]["hit"] = 0
result_df[-1]["sample_id"] = sample_id
result_df = pandas.concat(result_df, ignore_index=True, sort=False)
result_df["hla"] = result_df.sample_id.map(sample_table.hla)
print(result_df)
print("Counts:")
print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())
print("Hit counts:")
print(
result_df.loc[
result_df.hit == 1
].groupby("sample_id").hit.count().sort_values())
print("Hit rates:")
print(result_df.groupby("sample_id").hit.mean().sort_values())
result_df.to_csv(args.out, index=False)
print("Wrote: ", args.out)
if __name__ == '__main__':
run()
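The output CSV has one row per hit (hit=1) or decoy (hit=0) peptide, with the
sample's space-separated allele list in the hla column. A quick look at the
result (a minimal sketch, not part of the pipeline):

    import pandas

    df = pandas.read_csv("train.multiallelic.csv")
    print(df.hit.value_counts())                        # decoys (0) vs. hits (1)
    print(df.hla.str.split().str.len().value_counts())  # alleles per sample row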
@@ -38,12 +38,12 @@ parser.add_argument(
     "--cleavage-predictor-with-flanks",
     metavar="DIR",
     required=True,
-    help="Cleavage predictor with flanking")
+    help="Cleavage predictor with flanks")
 parser.add_argument(
     "--cleavage-predictor-without-flanks",
     metavar="DIR",
     required=True,
-    help="Cleavage predictor without flanking")
+    help="Cleavage predictor without flanks")
 parser.add_argument(
     "--verbosity",
     type=int,
@@ -107,9 +107,9 @@ def main(args):
     affinity_predictor = Class1AffinityPredictor.load(args.affinity_predictor)
     cleavage_predictor_with_flanks = Class1CleavagePredictor.load(
-        args.cleavage_predictor_with_flanking)
+        args.cleavage_predictor_with_flanks)
     cleavage_predictor_without_flanks = Class1CleavagePredictor.load(
-        args.cleavage_predictor_without_flanking)
+        args.cleavage_predictor_without_flanks)
     predictor = Class1PresentationPredictor(
         affinity_predictor=affinity_predictor,