Commit 5ec93422 authored by Tim O'Donnell

add downloads generation for ensemble

parent 420d8b89
#!/bin/bash
# GENERATE.sh: build the models_class1_allele_specific_ensemble download.
if [[ $# -eq 0 ]] ; then
    echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-ensemble-train'
    echo 'See README.md'
fi
set -e
set -x
DOWNLOAD_NAME=models_class1_allele_specific_ensemble
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git rev-parse HEAD
git status
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir models

# Write the hyperparameter grid (generated by models.py, below) as JSON for the trainer.
cp "$SCRIPT_DIR/models.py" .
python models.py > models.json
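# Train an ensemble of 8 models per allele, considering only alleles with at
# least 100 training measurements; any additional command-line arguments are
# passed through to the trainer via "$@".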
time mhcflurry-class1-allele-specific-ensemble-train \
    --ensemble-size 8 \
    --model-architectures models.json \
    --train-data "$(mhcflurry-downloads path data_combined_iedb_kim2014)/combined_human_class1_dataset.csv" \
    --min-samples-per-allele 100 \
    --out-manifest models.csv \
    --out-models models \
    --verbose \
    "$@"
cp "$SCRIPT_ABSOLUTE_PATH" .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Class I allele-specific models (ensemble)
This download contains trained MHC Class I allele-specific MHCflurry models. For each allele, an ensemble of predictors is trained on random halves of the training data. Model architectures are selected based on performance on the other half of the dataset, so in general each ensemble contains predictors of different architectures. At prediction time the geometric mean IC50 is taken over the trained models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download.
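For intuition, the combining step is small enough to sketch. The snippet below is illustrative only, not mhcflurry's implementation; the helper name and array layout are assumptions:

```
import numpy as np

def ensemble_ic50(per_model_ic50s):
    """Combine per-model IC50 predictions (nM) by geometric mean.

    per_model_ic50s: array of shape (n_models, n_peptides).
    Hypothetical helper, not part of the mhcflurry API.
    """
    # Geometric mean = exp(mean(log x)): averaging in log space is natural
    # for IC50s, which span several orders of magnitude.
    return np.exp(np.log(per_model_ic50s).mean(axis=0))

# Three models predicting two peptides:
print(ensemble_ic50(np.array([
    [50.0, 4000.0],
    [200.0, 8000.0],
    [100.0, 2000.0],
])))  # -> [100.0, 4000.0]
```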
The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library. To use kubeface, create a Google Storage bucket and pass it with the --storage-prefix argument, as in the example below.
To generate this download we run:
```
./GENERATE.sh \
    --parallel-backend kubeface \
    --backend kubernetes \
    --storage-prefix gs://kubeface \
    --worker-image hammerlab/mhcflurry:latest \
    --kubernetes-task-resources-memory-mb 10000 \
    --worker-path-prefix venv-py3/bin \
    --max-simultaneous-tasks 200
```
# models.py: emit the hyperparameter grid for the ensemble trainer as JSON.
import json
import sys

from mhcflurry.class1_allele_specific.train import HYPERPARAMETER_DEFAULTS

models = HYPERPARAMETER_DEFAULTS.models_grid(
    impute=[False, True],
    activation=["tanh"],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64],
    dropout_probability=[0, .1, .25],
    fraction_negative=[0, .1, .2],
    n_training_epochs=[250],

    # Imputation arguments
    impute_method=["mice"],
    imputer_args=[
        # Arguments specific to imputation method (mice)
        {"n_burn_in": 5, "n_imputations": 50, "n_nearest_columns": 25}
    ],
    impute_min_observations_per_peptide=[5],
    impute_min_observations_per_allele=[100])

sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
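models_grid expands these per-parameter value lists into one model specification per combination. A minimal sketch of that expansion, assuming a plain Cartesian product (the real models_grid is anchored to HYPERPARAMETER_DEFAULTS and may also validate parameter names):

```
import itertools

def models_grid(**param_lists):
    # Simplified stand-in for HYPERPARAMETER_DEFAULTS.models_grid,
    # for illustration only: every combination of the given values.
    keys = sorted(param_lists)
    return [
        dict(zip(keys, values))
        for values in itertools.product(*(param_lists[k] for k in keys))
    ]

# 2 impute settings x 3 layer sizes x 3 embedding dims = 18 specifications.
specs = models_grid(
    impute=[False, True],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64])
print(len(specs))  # 18
```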