Skip to content
Snippets Groups Projects
Commit 52909241 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Rename data_mass_spec_benchmark to data_predictions

parent e14c82ac
No related branches found
No related tags found
No related merge requests found
Showing
with 207 additions and 19 deletions
......@@ -70,7 +70,7 @@ do
cp $SCRIPT_DIR/make_benchmark.py .
time python make_benchmark.py \
--hits "$(pwd)/hits_with_tpm.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_predictions)/proteome_peptides.all.csv.bz2" \
--decoys-per-hit 110 \
--exclude-train-data "$EXCLUDE_TRAIN_DATA" \
--only-format MONOALLELIC \
......@@ -95,7 +95,7 @@ do
cp $SCRIPT_DIR/make_benchmark.py .
time python make_benchmark.py \
--hits "$(pwd)/hits_with_tpm.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_predictions)/proteome_peptides.all.csv.bz2" \
--decoys-per-hit 110 \
--exclude-train-data "$EXCLUDE_TRAIN_DATA" \
--only-format MULTIALLELIC \
......
......@@ -77,7 +77,7 @@ def run():
if 'netmhcpan4.ba' in args.predictors:
precomputed_dfs['netmhcpan4.ba'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.ba"),
get_path("data_predictions", "predictions/all.netmhcpan4.ba"),
result_df=pandas.DataFrame(
dtype=numpy.float32,
index=peptides,
......@@ -87,7 +87,7 @@ def run():
if 'netmhcpan4.el' in args.predictors:
precomputed_dfs['netmhcpan4.el'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.el"),
get_path("data_predictions", "predictions/all.netmhcpan4.el"),
result_df=pandas.DataFrame(
dtype=numpy.float32,
index=peptides,
......@@ -96,7 +96,7 @@ def run():
if 'mixmhcpred' in args.predictors:
precomputed_dfs['mixmhcpred'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
get_path("data_predictions", "predictions/all.mixmhcpred"),
result_df=pandas.DataFrame(
dtype=numpy.float32,
index=peptides,
......
......@@ -16,7 +16,7 @@
#
# SECOND ARGUMENT: whether to reuse predictions from existing downloaded data
# reuse-all - reuse predictions and peptide / allele lists from existing
# downloaded data_mass_spec_benchmark.
# downloaded data_predictions.
# reuse-none - fully self-contained run; do not reuse anything.
# reuse-predictions - reuse predictions but not peptide or allele lists. Any
# new peptides not already included will be run.
......@@ -26,7 +26,7 @@
set -e
set -x
DOWNLOAD_NAME=data_mass_spec_benchmark
DOWNLOAD_NAME=data_predictions
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
......
......@@ -79,7 +79,7 @@ else
cp $SCRIPT_DIR/make_benchmark.py .
time python make_benchmark.py \
--hits "$(pwd)/hits_with_tpm.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_predictions)/proteome_peptides.all.csv.bz2" \
--decoys-per-hit 2 \
--exclude-pmid 31844290 31495665 31154438 \
--only-format MULTIALLELIC \
......@@ -94,14 +94,14 @@ else
mhcflurry-class1-train-presentation-models \
--data "$(pwd)/train_data.csv.bz2" \
--affinity-predictor "$(mhcflurry-downloads path models_class1_pan)/models.combined" \
--processing-predictor-with-flanks "$(mhcflurry-downloads path models_class1_processing)/models" \
--processing-predictor-without-flanks "$(mhcflurry-downloads path models_class1_processing_variants)/models.selected.no_flank" \
--processing-predictor-with-flanks "$(mhcflurry-downloads path models_class1_processing)/models.selected.with_flanks" \
--processing-predictor-without-flanks "$(mhcflurry-downloads path models_class1_processing)/models.selected.no_flank" \
--out-models-dir "$(pwd)/models"
fi
cp "$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2" models/affinity_predictor_train_data.csv.bz2
cp "$(mhcflurry-downloads path models_class1_processing)/models/train_data.csv.bz2" models/processing_predictor_train_data.csv.bz2
cp "$(mhcflurry-downloads path models_class1_processing_variants)/models.selected.no_flank/train_data.csv.bz2" models/processing_predictor_no_flank_train_data.csv.bz2
cp "$(mhcflurry-downloads path models_class1_processing)/models.selected.with_flanks/train_data.csv.bz2" models/processing_predictor_train_data.csv.bz2
cp "$(mhcflurry-downloads path models_class1_processing)/models.selected.no_flank/train_data.csv.bz2" models/processing_predictor_no_flank_train_data.csv.bz2
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 -f "$LOG"
......
......@@ -91,8 +91,8 @@ else
cp $SCRIPT_DIR/make_train_data.py .
time python make_train_data.py \
--hits "$(pwd)/hits_with_tpm.csv.bz2" \
--predictions "$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.mhcflurry.combined" \
--proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
--predictions "$(mhcflurry-downloads path data_predictions)/predictions/all.mhcflurry.combined" \
--proteome-peptides "$(mhcflurry-downloads path data_predictions)/proteome_peptides.all.csv.bz2" \
--ppv-multiplier 100 \
--hit-multiplier-to-take 2 \
--out "$(pwd)/train_data.csv"
......
../models_class1_processing/cluster_submit_script_header.mssm_hpc.lsf
\ No newline at end of file
#!/bin/bash
# LSF (bsub) job submission header template for an MSSM HPC cluster.
# The brace placeholders ({work_item_num}, {work_dir}) are presumably
# substituted by the submitting code before this header is passed to bsub
# — TODO confirm against the script that consumes this template.
#BSUB -J MHCf-{work_item_num} # Job name
#BSUB -P acc_nkcancer # allocation account or Unix group
#BSUB -q gpu # queue
#BSUB -R rusage[ngpus_excl_p=1] # 1 exclusive GPU
#BSUB -R span[hosts=1] # one node
#BSUB -n 1 # number of compute cores
#BSUB -W 10:00 # walltime in HH:MM
#BSUB -R rusage[mem=20000] # mb memory requested
#BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
#BSUB -eo {work_dir}/STDERR # error log
#BSUB -L /bin/bash # Initialize the execution environment
#
# Fail fast on any error and echo each command, so failed jobs are easy
# to debug from the log.
set -e
set -x
echo "Subsequent stderr output redirected to stdout" >&2
# Merge stderr into stdout so the entire job log lands in a single file.
exec 2>&1
# Node-local scratch space, unique per work item.
export TMPDIR=/local/JOBS/mhcflurry-{work_item_num}
export PATH=$HOME/.conda/envs/py36b/bin/:$PATH
# Unbuffered Python output so log lines appear promptly; KMP_SETTINGS=1
# makes the OpenMP runtime print its configuration at startup.
export PYTHONUNBUFFERED=1
export KMP_SETTINGS=1
free -m
module add cuda/10.0.130
module list
# Point compiler and loader paths at a user-local cuDNN install (the
# cuda module apparently does not provide it — NOTE(review): confirm).
export CUDNN_HOME=/hpc/users/odonnt02/oss/cudnn/cuda
export LD_LIBRARY_PATH=$CUDNN_HOME/lib64:$LD_LIBRARY_PATH
export CMAKE_LIBRARY_PATH=$CUDNN_HOME/lib64:$CMAKE_LIBRARY_PATH
export INCLUDE_PATH=$CUDNN_HOME/include:$INCLUDE_PATH
export C_INCLUDE_PATH=$CUDNN_HOME/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=$CUDNN_HOME/include:$CPLUS_INCLUDE_PATH
export CMAKE_INCLUDE_PATH=$CUDNN_HOME/include:$CMAKE_INCLUDE_PATH
# Sanity check: confirm TensorFlow can actually see the GPU before any
# real work starts.
python -c 'import tensorflow as tf ; print("GPU AVAILABLE" if tf.test.is_gpu_available() else "GPU NOT AVAILABLE")'
env
cd {work_dir}
../models_class1_processing/generate_hyperparameters.py
\ No newline at end of file
"""
Generate grid of hyperparameters.

Writes the grid to stdout as YAML and prints its size to stderr.
"""
from __future__ import print_function

from copy import deepcopy
from itertools import product
from sys import stdout, stderr

from yaml import dump

# Fixed settings shared by every grid point; swept fields are overridden
# per point in hyperparameters_grid() below.
base_hyperparameters = dict(
    convolutional_filters=64,
    convolutional_kernel_size=8,
    convolutional_kernel_l1_l2=(0.00, 0.0),
    flanking_averages=True,
    n_flank_length=15,
    c_flank_length=15,
    post_convolutional_dense_layer_sizes=[],
    minibatch_size=512,
    dropout_rate=0.5,
    convolutional_activation="relu",
    patience=20,
    learning_rate=0.001)

grid = []


def hyperparameters_grid():
    """Yield one hyperparameter dict per point of the search grid.

    Each yielded dict is a deep copy of ``base_hyperparameters`` with the
    swept fields overridden, so mutating one grid point cannot affect
    another (or the base dict's mutable values).
    """
    # itertools.product iterates in exactly the same order as the
    # equivalent nested for-loops, so grid ordering is unchanged.
    sweep = product(
        [0.001],                      # learning_rate
        ["tanh", "relu"],             # convolutional_activation
        [256, 512],                   # convolutional_filters
        [True],                       # flanking_averages
        [11, 13, 15, 17],             # convolutional_kernel_size
        [0.0, 1e-6],                  # l1 penalty (l2 fixed at 0.0)
        [[8], [16]],                  # post-convolutional dense layer sizes
        [0.3, 0.5])                   # dropout_rate
    for (learning_rate, convolutional_activation, convolutional_filters,
            flanking_averages, convolutional_kernel_size, l1,
            dense_layer_sizes, dropout_rate) in sweep:
        new = deepcopy(base_hyperparameters)
        new["learning_rate"] = learning_rate
        new["convolutional_activation"] = convolutional_activation
        new["convolutional_filters"] = convolutional_filters
        new["flanking_averages"] = flanking_averages
        new["convolutional_kernel_size"] = convolutional_kernel_size
        new["convolutional_kernel_l1_l2"] = (l1, 0.0)
        new["post_convolutional_dense_layer_sizes"] = dense_layer_sizes
        new["dropout_rate"] = dropout_rate
        yield new


# Deduplicate while preserving order. Dicts are unhashable so a linear
# membership test is used; the grid is small enough that this is fine.
for new in hyperparameters_grid():
    if new not in grid:
        grid.append(new)

print("Hyperparameters grid size: %d" % len(grid), file=stderr)
dump(grid, stdout)
......@@ -48,7 +48,7 @@ releases:
url: https://github.com/openvax/mhcflurry/releases/download/1.6.0/data_evaluation.20200209.tar.bz2
default: false
- name: data_mass_spec_benchmark
- name: data_predictions
part_urls:
- https://github.com/openvax/mhcflurry/releases/download/pre-1.7.0/data_mass_spec_benchmark.20200428.tar.bz2.part.aa
- https://github.com/openvax/mhcflurry/releases/download/pre-1.7.0/data_mass_spec_benchmark.20200428.tar.bz2.part.ab
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment