Skip to content
Snippets Groups Projects
Commit e65d7a79 authored by Tim O'Donnell
Browse files

move

parent 7a688ba5
No related branches found
No related tags found
No related merge requests found
......@@ -5,7 +5,7 @@
set -e
set -x
DOWNLOAD_NAME=models_class1_pan
DOWNLOAD_NAME=models_class1_pan_unselected
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
......@@ -30,46 +30,29 @@ mkdir models
cp $SCRIPT_DIR/generate_hyperparameters.py .
python generate_hyperparameters.py > hyperparameters.yaml
cp $SCRIPT_DIR/write_validation_data.py .
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
PROCESSORS=$(getconf _NPROCESSORS_ONLN)
echo "Detected processors: $PROCESSORS"
export PYTHONUNBUFFERED=1
VERBOSITY=1
mhcflurry-class1-train-pan-allele-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
--allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \
--pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
--held-out-measurements-per-allele-fraction-and-max 0.25 100 \
--ensemble-size 4 \
--hyperparameters hyperparameters.yaml \
--out-models-dir models-unselected.with_mass_spec \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--verbosity $VERBOSITY \
--num-jobs $GPUS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
mhcflurry-class1-train-pan-allele-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \
--allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \
--pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
--held-out-measurements-per-allele-fraction-and-max 0.25 100 \
--ensemble-size 4 \
--hyperparameters hyperparameters.yaml \
--out-models-dir models-unselected.no_mass_spec \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--verbosity $VERBOSITY \
--num-jobs $GPUS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
for kind in with_mass_spec no_mass_spec
do
mhcflurry-class1-train-pan-allele-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.${kind}.csv.bz2" \
--allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \
--pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
--held-out-measurements-per-allele-fraction-and-max 0.25 100 \
--ensemble-size 4 \
--hyperparameters hyperparameters.yaml \
--out-models-dir models.${kind} \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--verbosity 0 \
--num-jobs $GPUS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
done
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
for i in $(ls LOG-worker.*.txt) ; do bzip2 $i ; done
tar -cjf "../${DOWNLOAD_NAME}.with_unselected.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/${DOWNLOAD_NAME}.with_unselected.tar.bz2"
ls -d * | grep -v models-unselected | xargs -I {} tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" {}
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/${DOWNLOAD_NAME}.tar.bz2"
......@@ -152,21 +152,21 @@ def run(argv=sys.argv[1:]):
if num_folds <= 1:
raise ValueError("Too few folds: ", num_folds)
#df = df.loc[
# (df.peptide.str.len() >= min_peptide_length) &
# (df.peptide.str.len() <= max_peptide_length)
#]
#print("Subselected to %d-%dmers: %s" % (
# min_peptide_length, max_peptide_length, str(df.shape)))
df = df.loc[
(df.peptide.str.len() >= min_peptide_length) &
(df.peptide.str.len() <= max_peptide_length)
]
print("Subselected to %d-%dmers: %s" % (
min_peptide_length, max_peptide_length, str(df.shape)))
print("Num folds: ", num_folds, "fraction included:")
print(df[fold_cols].mean())
# Allele names in data are assumed to be already normalized.
#df = df.loc[df.allele.isin(alleles)].dropna()
#print("Subselected to supported alleles: %s" % str(df.shape))
df = df.loc[df.allele.isin(alleles)].dropna()
print("Subselected to supported alleles: %s" % str(df.shape))
#print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
metadata_dfs["model_selection_data"] = df
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment