diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh deleted file mode 100755 index 23cc24f783b43fb476f36dac168a39d80c31348f..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_pan/GENERATE.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# -# Train pan-allele MHCflurry Class I models. -# -set -e -set -x - -DOWNLOAD_NAME=models_class1_pan -SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation -SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" -SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") - -mkdir -p "$SCRATCH_DIR" -rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" -mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" - -# Send stdout and stderr to a logfile included with the archive. -exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") -exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) - -# Log some environment info -date -pip freeze -git status - -cd $SCRATCH_DIR/$DOWNLOAD_NAME - -mkdir models - -cp $SCRIPT_DIR/generate_hyperparameters.py . -python generate_hyperparameters.py > hyperparameters.yaml - -cp $SCRIPT_DIR/write_validation_data.py . - -GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 -echo "Detected GPUS: $GPUS" - -PROCESSORS=$(getconf _NPROCESSORS_ONLN) -echo "Detected processors: $PROCESSORS" - -export PYTHONUNBUFFERED=1 -VERBOSITY=1 - -mhcflurry-class1-train-pan-allele-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ - --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ - --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ - --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ - --ensemble-size 4 \ - --hyperparameters hyperparameters.yaml \ - --out-models-dir models-unselected.with_mass_spec \ - --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ - --verbosity $VERBOSITY \ - --num-jobs $GPUS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 - -mhcflurry-class1-train-pan-allele-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \ - --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ - --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ - --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ - --ensemble-size 4 \ - --hyperparameters hyperparameters.yaml \ - --out-models-dir models-unselected.no_mass_spec \ - --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ - --verbosity $VERBOSITY \ - --num-jobs $GPUS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 - -cp $SCRIPT_ABSOLUTE_PATH . -bzip2 LOG.txt -for i in $(ls LOG-worker.*.txt) ; do bzip2 $i ; done -tar -cjf "../${DOWNLOAD_NAME}.with_unselected.tar.bz2" * -echo "Created archive: $SCRATCH_DIR/${DOWNLOAD_NAME}.with_unselected.tar.bz2" - -ls -d * | grep -v models-unselected | xargs -I {} tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" {} -echo "Created archive: $SCRATCH_DIR/${DOWNLOAD_NAME}.tar.bz2" diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.sh new file mode 100755 index 0000000000000000000000000000000000000000..a805de9b58a2cc699623197f9bde152548c2c576 --- /dev/null +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# Train pan-allele MHCflurry Class I models. +# +set -e +set -x + +DOWNLOAD_NAME=models_class1_pan_unselected +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +mkdir models + +cp $SCRIPT_DIR/generate_hyperparameters.py . +python generate_hyperparameters.py > hyperparameters.yaml + +GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 +echo "Detected GPUS: $GPUS" + +PROCESSORS=$(getconf _NPROCESSORS_ONLN) +echo "Detected processors: $PROCESSORS" + +for kind in with_mass_spec no_mass_spec +do + mhcflurry-class1-train-pan-allele-models \ + --data "$(mhcflurry-downloads path data_curated)/curated_training_data.${kind}.csv.bz2" \ + --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ + --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ + --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ + --ensemble-size 4 \ + --hyperparameters hyperparameters.yaml \ + --out-models-dir models.${kind} \ + --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ + --verbosity 0 \ + --num-jobs $GPUS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 +done + +cp $SCRIPT_ABSOLUTE_PATH . +bzip2 LOG.txt +for i in $(ls LOG-worker.*.txt) ; do bzip2 $i ; done +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * +echo "Created archive: $SCRATCH_DIR/${DOWNLOAD_NAME}.tar.bz2" diff --git a/downloads-generation/models_class1_pan/README.md b/downloads-generation/models_class1_pan_unselected/README.md similarity index 100% rename from downloads-generation/models_class1_pan/README.md rename to downloads-generation/models_class1_pan_unselected/README.md diff --git a/downloads-generation/models_class1_pan/generate_hyperparameters.py b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py similarity index 100% rename from downloads-generation/models_class1_pan/generate_hyperparameters.py rename to downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py diff --git a/downloads-generation/models_class1_pan/write_validation_data.py b/downloads-generation/models_class1_pan_unselected/write_validation_data.py similarity index 100% rename from downloads-generation/models_class1_pan/write_validation_data.py rename to downloads-generation/models_class1_pan_unselected/write_validation_data.py diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py index b7d2e5e32af896110f44b21d66f65c2037b6b996..eb3bca3df75e5cb18ab4a6d9528fd7d02647ce83 100644 --- a/mhcflurry/select_pan_allele_models_command.py +++ b/mhcflurry/select_pan_allele_models_command.py @@ -152,21 +152,21 @@ def run(argv=sys.argv[1:]): if num_folds <= 1: raise ValueError("Too few folds: ", num_folds) - #df = df.loc[ - # (df.peptide.str.len() >= min_peptide_length) & - # (df.peptide.str.len() <= max_peptide_length) - #] - #print("Subselected to %d-%dmers: %s" % ( - # min_peptide_length, max_peptide_length, str(df.shape))) + df = df.loc[ + (df.peptide.str.len() >= min_peptide_length) & + (df.peptide.str.len() <= max_peptide_length) + ] + print("Subselected to %d-%dmers: %s" % ( + min_peptide_length, max_peptide_length, str(df.shape))) print("Num folds: ", num_folds, "fraction included:") print(df[fold_cols].mean()) # Allele names in data are assumed to be already normalized. - #df = df.loc[df.allele.isin(alleles)].dropna() - #print("Subselected to supported alleles: %s" % str(df.shape)) + df = df.loc[df.allele.isin(alleles)].dropna() + print("Subselected to supported alleles: %s" % str(df.shape)) - #print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) + print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) metadata_dfs["model_selection_data"] = df