Skip to content
Snippets Groups Projects
Commit 5ddf981a authored by Tim O'Donnell
Browse files

tweak model selection alleles

parent 931e7304
No related branches found
No related tags found
No related merge requests found
......@@ -33,6 +33,7 @@ export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
cp $SCRIPT_ABSOLUTE_PATH .
cp $SCRIPT_DIR/additional_alleles.txt .
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
......@@ -49,6 +50,12 @@ echo "Num local jobs for model selection: $NUM_JOBS"
UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
# For now we calibrate percentile ranks only for alleles for which there
# is training data, plus any alleles listed in additional_alleles.txt.
# Calibrating all alleles would be too slow. This could be improved though.
ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
for kind in with_mass_spec no_mass_spec
do
# Model selection is run on the cluster, although for any reasonable
......@@ -72,15 +79,13 @@ do
cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/"
# Percentile rank calibration is run on the cluster.
# For now we calibrate percentile ranks only for alleles for which there
# is training data. Calibrating all alleles would be too slow.
# This could be improved though.
time mhcflurry-calibrate-percentile-ranks \
--models-dir models.${kind} \
--match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
--motif-summary \
--num-peptides-per-length 100000 \
--allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
--allele $ALLELE_LIST \
--verbosity 1 \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--prediction-batch-size 524288 \
......
......@@ -25,6 +25,7 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_ABSOLUTE_PATH .
cp $SCRIPT_DIR/additional_alleles.txt .
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
......@@ -43,6 +44,12 @@ export PYTHONUNBUFFERED=1
UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
# For now we calibrate percentile ranks only for alleles for which there
# is training data, plus any alleles listed in additional_alleles.txt.
# Calibrating all alleles would be too slow. This could be improved though.
ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
for kind in with_mass_spec no_mass_spec
do
MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
......@@ -65,7 +72,7 @@ do
--match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
--motif-summary \
--num-peptides-per-length 100000 \
--allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
--allele $ALLELE_LIST \
--verbosity 1 \
--num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
done
......
......@@ -115,6 +115,8 @@ def run(argv=sys.argv[1:]):
else:
alleles = predictor.supported_alleles
alleles = sorted(set(alleles))
distribution = None
if args.match_amino_acid_distribution_data:
distribution_peptides = pandas.read_csv(
......
......@@ -288,7 +288,9 @@ def run(argv=sys.argv[1:]):
worker_pool.join()
print("Model selection time %0.2f min." % (model_selection_time / 60.0))
print("Predictor written to: %s" % args.out_models_dir)
print("Predictor [%d models] written to: %s" % (
len(result_predictor.neural_networks),
args.out_models_dir))
def do_model_select_task(item, constant_data=GLOBAL_DATA):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment