Commit ea1b52f1 authored by Tim O'Donnell

fixes

parent 1fcf39fc
@@ -76,8 +76,8 @@ REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 if [ "${2:-reuse-none}" != "reuse-none" ]
 then
-    EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME)
-    echo "Will reuse data from $REFERENCES_DIR"
+    EXISTING_DATA="$(mhcflurry-downloads path $DOWNLOAD_NAME)"
+    echo "Will reuse data from $EXISTING_DATA"
 else
     EXISTING_DATA=""
     echo "Will NOT reuse any data"
@@ -121,14 +121,15 @@ do
     for kind in with_mass_spec no_mass_spec
     do
         OUT_DIR=predictions/${subset}.mhcflurry.${kind}
-        REUSE_ARG=""
+        REUSE1=""
+        REUSE2=""
         if [ "$subset" == "all" ]
         then
-            REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
+            REUSE1="predictions/chr1.mhcflurry.${kind}"
         fi
         if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
         then
-            REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+            REUSE2="$EXISTING_DATA"/$OUT_DIR
         fi
         python run_predictors.py \
@@ -141,19 +142,20 @@ do
             --out "$OUT_DIR" \
             --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
             --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \
-            $REUSE_ARG $EXTRA_ARGS
+            --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
     done

     # Run netmhcpan4
     OUT_DIR=predictions/${subset}.netmhcpan4
-    REUSE_ARG=""
+    REUSE1=""
+    REUSE2=""
     if [ "$subset" == "all" ]
     then
-        REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
+        REUSE1="predictions/chr1.netmhcpan4"
     fi
     if [ "${2:-reuse-none}" != "reuse-none" ]
     then
-        REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+        REUSE2="$EXISTING_DATA"/$OUT_DIR
     fi
     python run_predictors.py \
@@ -164,7 +166,7 @@ do
         --out "$OUT_DIR" \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
         --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
-        $REUSE_ARG $EXTRA_ARGS
+        --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
 done

 cp $SCRIPT_ABSOLUTE_PATH .
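Note on the REUSE1/REUSE2 rewrite above: the old single-variable form had two shell pitfalls, sketched below in Python (a hedged illustration, not project code; the paths are invented and a system bash is assumed).

    import shlex
    import subprocess

    # Pitfall 1: bash parses "assignment word" as "run word as a command,
    # with the assignment only in its environment". The old line
    #   REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
    # therefore tried to *execute* the predictions directory and never
    # actually extended REUSE_ARG.
    proc = subprocess.run(
        ["bash", "-c", 'X+="--reuse-predictions" "/fake/dir"; echo "X=[$X]"'],
        capture_output=True, text=True)
    print(proc.stdout, end="")  # X=[]  (the assignment did not persist)
    print(proc.stderr, end="")  # bash: /fake/dir: No such file or directory

    # Pitfall 2: expanding an unquoted $REUSE_ARG relied on word splitting,
    # which breaks paths containing spaces. shlex follows the same POSIX
    # splitting rules:
    print(shlex.split("--reuse-predictions /scratch/my data/predictions"))
    # ['--reuse-predictions', '/scratch/my', 'data/predictions']

The new form always passes `--reuse-predictions "$REUSE1" "$REUSE2"` with both values quoted, so empty or space-containing directories survive intact; the Python side (below) is updated to accept and skip empty strings.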
@@ -182,6 +184,6 @@ do
     echo "WARNING: already exists: $i . Moving to $DEST"
     mv $i $DEST
 done
-split -b 2000M "$RESULT" "$PARTS"
+split -b 2000m "$RESULT" "$PARTS"
 echo "Split into parts:"
 ls -lh "${PARTS}"*
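The split size changes only in case, from 2000M to 2000m. This reads like a portability tweak: older BSD split documents only the lowercase k/m suffixes, while GNU split accepts both, and either spelling means 2000 * 1024^2 bytes per part. A quick sanity check of the resulting part count (the archive size here is invented):

    import math

    part_size = 2000 * 1024 ** 2       # bytes per part, i.e. "2000m"
    result_size = 17 * 1024 ** 3       # a hypothetical 17 GiB result archive
    print(math.ceil(result_size / part_size))  # -> 9 parts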
@@ -5,7 +5,7 @@
 #BSUB -R span[hosts=1] # one node
 #BSUB -n 1 # number of compute cores
 #BSUB -W 12:00 # walltime in HH:MM
-#BSUB -R rusage[mem=4000] # mb memory requested
+#BSUB -R rusage[mem=10000] # mb memory requested
 #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
 #BSUB -eo {work_dir}/STDERR # error log
 #BSUB -L /bin/bash # Initialize the execution environment
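This hunk raises the per-job memory request in the LSF submit-script header from 4000 MB to 10000 MB. The `{work_dir}` placeholders suggest the header is a template rendered with Python's str.format before submission (`%J` is LSF's own job-ID substitution and passes through untouched). A hedged sketch of that rendering idea; `mem_mb` is an invented parameter, the real file hard-codes mem=10000:

    template = (
        "#BSUB -R rusage[mem={mem_mb}]\n"
        "#BSUB -o {work_dir}/%J.stdout\n")
    print(template.format(mem_mb=10000, work_dir="/scratch/run1"))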
@@ -75,7 +75,7 @@ parser.add_argument(
 parser.add_argument(
     "--reuse-predictions",
     metavar="DIR",
-    action="append",
+    nargs="*",
     help="Take predictions from indicated DIR instead of re-running them")

 add_local_parallelism_args(parser)
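The switch from action="append" to nargs="*" matches the shell side: GENERATE.sh now always passes two (possibly empty) values after a single --reuse-predictions flag. A minimal sketch of the difference (the directory name is invented):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--reuse-predictions", metavar="DIR", nargs="*")

    # One flag now consumes every following value, empty strings included,
    # which is what GENERATE.sh emits when a REUSE variable is unset:
    args = parser.parse_args(
        ["--reuse-predictions", "", "existing/predictions/all.netmhcpan4"])
    print(args.reuse_predictions)
    # ['', 'existing/predictions/all.netmhcpan4']

Under the old action="append", each occurrence of the flag contributed exactly one DIR, so passing two values after one flag would have been rejected; the empty strings are filtered out in run() below.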
@@ -99,21 +99,31 @@ def load_results(dirname, result_df=None):
         len(manifest_df),
         "columns")

+    # Make adjustments for old style data. Can be removed later.
+    if "kind" not in manifest_df.columns:
+        manifest_df["kind"] = "affinity"
+    if "col" not in manifest_df.columns:
+        manifest_df["col"] = manifest_df.allele + " " + manifest_df.kind
+
     if result_df is None:
         result_df = pandas.DataFrame(
             index=peptides, columns=manifest_df.col.values, dtype="float32")
         result_df[:] = numpy.nan
+        peptides_to_assign = peptides
+        mask = None
     else:
         manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
-        peptides = peptides[peptides.isin(result_df.index)]
+        mask = (peptides.isin(result_df.index)).values
+        peptides_to_assign = peptides[mask]

     print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")

-    for _, row in manifest_df.iterrows():
+    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
         with open(os.path.join(dirname, row.path), "rb") as fd:
-            result_df.loc[
-                peptides.values, row.col
-            ] = numpy.load(fd)['arr_0']
+            value = numpy.load(fd)['arr_0']
+            if mask is not None:
+                value = value[mask]
+            result_df.loc[peptides_to_assign, row.col] = value
     return result_df
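The load_results() rewrite appears to fix an alignment bug: each stored array holds one value per manifest peptide, so when filling a preexisting result_df the old code assigned an unfiltered array to a filtered peptide list. Computing a boolean mask once over the stored order and applying it to both sides keeps them aligned. A miniature of the pattern (all data invented):

    import numpy
    import pandas

    # Values on disk are in the full stored-peptide order; we only want to
    # fill the subset of rows present in an existing result_df.
    peptides = pandas.Series(["SIINFEKL", "GILGFVFTL", "NLVPMVATV"])
    stored = numpy.array([0.1, 0.2, 0.3], dtype="float32")

    result_df = pandas.DataFrame(
        index=["SIINFEKL", "NLVPMVATV"],
        columns=["HLA-A*02:01 affinity"], dtype="float32")

    # The mask is computed over the *stored* order, so stored values and
    # target rows stay aligned:
    mask = peptides.isin(result_df.index).values
    result_df.loc[peptides[mask], "HLA-A*02:01 affinity"] = stored[mask]
    print(result_df)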
@@ -222,10 +232,15 @@ def run(argv=sys.argv[1:]):
     if args.reuse_predictions:
         for dirname in args.reuse_predictions:
-            print("Loading predictions", dirname)
-            result_df = load_results(dirname, result_df)
-            print("Existing data filled %f%% entries" % (
-                result_df.notnull().values.mean()))
+            if not dirname:
+                continue  # ignore empty strings
+            if os.path.exists(dirname):
+                print("Loading predictions", dirname)
+                result_df = load_results(dirname, result_df)
+                print("Existing data filled %f%% entries" % (
+                    result_df.notnull().values.mean()))
+            else:
+                print("WARNING: skipping because does not exist", dirname)

     # We rerun any alleles have nulls for any kind of values
     # (e.g. affinity, percentile rank, elution score).
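A hedged sketch of the selection rule the comment above describes: a column (allele plus value kind) is recomputed whenever any of its entries is null. This mirrors the idea only; the project's actual selection code is not part of this diff.

    import numpy
    import pandas

    result_df = pandas.DataFrame({
        "HLA-A*02:01 affinity": [120.0, numpy.nan],
        "HLA-A*02:01 percentile": [0.5, 1.2],
    })
    needs_rerun = [c for c in result_df.columns if result_df[c].isnull().any()]
    print(needs_rerun)  # ['HLA-A*02:01 affinity']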
@@ -306,6 +321,17 @@ def run(argv=sys.argv[1:]):
     for allele in alleles:
         allele_to_chunk_index_to_predictions[allele] = {}

+    last_write_time_per_column = dict((col, 0.0) for col in result_df.columns)
+
+    def write_col(col):
+        out_path = os.path.join(
+            args.out, col_to_filename[col])
+        numpy.savez(out_path, result_df[col].values)
+        print(
+            "Wrote [%f%% null]:" % (
+                result_df[col].isnull().mean() * 100.0),
+            out_path)
+
     for (work_item_num, col_to_predictions) in tqdm.tqdm(
             results, total=len(work_items)):
         for (col, predictions) in col_to_predictions.items():
@@ -313,13 +339,13 @@ def run(argv=sys.argv[1:]):
                 work_items[work_item_num]['peptides'],
                 col
             ] = predictions
-            out_path = os.path.join(
-                args.out, col_to_filename[col])
-            numpy.savez(out_path, result_df[col].values)
-            print(
-                "Wrote [%f%% null]:" % (
-                    result_df[col].isnull().mean() * 100.0),
-                out_path)
+            if time.time() - last_write_time_per_column[col] > 180:
+                write_col(col)
+                last_write_time_per_column[col] = time.time()
+
+    print("Done processing. Final write for each column.")
+    for col in result_df.columns:
+        write_col(col)

     print("Overall null rate (should be 0): %f" % (
         100.0 * result_df.isnull().values.flatten().mean()))
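These two hunks replace the per-work-item rewrite of each column's .npz file with throttled checkpointing: the in-memory DataFrame is still updated on every result, but a column's file is rewritten at most once every 180 seconds, with a final pass to flush everything. A minimal sketch of the pattern (the helper names are assumptions, not the repo's code):

    import time

    WRITE_INTERVAL_SEC = 180  # matches the 180-second threshold in the diff

    def drain(results, columns, apply_result, write_col):
        last_write = {col: 0.0 for col in columns}
        for (col, predictions) in results:
            apply_result(col, predictions)      # cheap in-memory update
            if time.time() - last_write[col] > WRITE_INTERVAL_SEC:
                write_col(col)                  # expensive: serializes column
                last_write[col] = time.time()
        for col in columns:                     # final flush: nothing written
            write_col(col)                      # in the last interval is lost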