Commit 886f99db authored by Tim O'Donnell

fixes

parent ea043877
@@ -76,8 +76,8 @@ REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 if [ "${2:-reuse-none}" != "reuse-none" ]
 then
-EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME)
-echo "Will reuse data from $REFERENCES_DIR"
+EXISTING_DATA="$(mhcflurry-downloads path $DOWNLOAD_NAME)"
+echo "Will reuse data from $EXISTING_DATA"
 else
 EXISTING_DATA=""
 echo "Will NOT reuse any data"
@@ -121,14 +121,15 @@ do
 for kind in with_mass_spec no_mass_spec
 do
 OUT_DIR=predictions/${subset}.mhcflurry.${kind}
-REUSE_ARG=""
+REUSE1=""
+REUSE2=""
 if [ "$subset" == "all" ]
 then
-REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
+REUSE1="predictions/chr1.mhcflurry.${kind}"
 fi
 if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
 then
-REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+REUSE2="$EXISTING_DATA"/$OUT_DIR
 fi
 python run_predictors.py \
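Note on the removed line above: REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR" never appended anything. Bash parses it as a temporary variable assignment followed by a command word, so the shell tried to execute the directory path itself. The fix keeps the two candidate reuse directories in separate variables (REUSE1, REUSE2) and always passes both, quoted, after a single --reuse-predictions flag. A minimal Python sketch of the resulting argv, with a hypothetical path and the other required flags elided:

import subprocess
import sys

# Hypothetical invocation: with "$REUSE1" unset, the quoted expansion
# still produces an (empty) argv entry; run_predictors.py, changed below
# to nargs="*", skips empty strings.
subprocess.run([
    sys.executable, "run_predictors.py",
    "--reuse-predictions",
    "",                                                          # "$REUSE1"
    "/data/existing/predictions/all.mhcflurry.with_mass_spec",   # "$REUSE2"
])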
@@ -141,19 +142,20 @@ do
 --out "$OUT_DIR" \
 --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
 --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \
-$REUSE_ARG $EXTRA_ARGS
+--reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
 done
 # Run netmhcpan4
 OUT_DIR=predictions/${subset}.netmhcpan4
-REUSE_ARG=""
+REUSE1=""
+REUSE2=""
 if [ "$subset" == "all" ]
 then
-REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
+REUSE1="predictions/chr1.netmhcpan4"
 fi
 if [ "${2:-reuse-none}" != "reuse-none" ]
 then
-REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+REUSE2="$EXISTING_DATA"/$OUT_DIR
 fi
 python run_predictors.py \
@@ -164,7 +166,7 @@ do
 --out "$OUT_DIR" \
 --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
 --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
-$REUSE_ARG $EXTRA_ARGS
+--reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
 done
 cp $SCRIPT_ABSOLUTE_PATH .
@@ -182,6 +184,6 @@ do
 echo "WARNING: already exists: $i . Moving to $DEST"
 mv $i $DEST
 done
-split -b 2000M "$RESULT" "$PARTS"
+split -b 2000m "$RESULT" "$PARTS"
 echo "Split into parts:"
 ls -lh "${PARTS}"*
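The size suffix changed from 2000M to 2000m presumably for portability: older BSD split (e.g. on macOS) accepts only the lowercase k/m suffixes, while GNU split accepts both. Either way the call cuts $RESULT into consecutive 2000 MiB pieces named ${PARTS}aa, ${PARTS}ab, and so on. A rough, self-contained Python equivalent of that behavior (a hypothetical helper, not part of the commit):

import string

CHUNK_BYTES = 2000 * 1024 * 1024  # "2000m": 2000 MiB per piece

def split_file(result_path, parts_prefix, chunk=CHUNK_BYTES):
    # Two-letter suffixes in split(1) order: aa, ab, ..., zz.
    suffixes = (a + b
                for a in string.ascii_lowercase
                for b in string.ascii_lowercase)
    with open(result_path, "rb") as src:
        for suffix in suffixes:
            data = src.read(chunk)
            if not data:
                break
            with open(parts_prefix + suffix, "wb") as dst:
                dst.write(data)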
@@ -5,7 +5,7 @@
 #BSUB -R span[hosts=1] # one node
 #BSUB -n 1 # number of compute cores
 #BSUB -W 12:00 # walltime in HH:MM
-#BSUB -R rusage[mem=4000] # mb memory requested
+#BSUB -R rusage[mem=10000] # mb memory requested
 #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
 #BSUB -eo {work_dir}/STDERR # error log
 #BSUB -L /bin/bash # Initialize the execution environment
@@ -75,7 +75,7 @@ parser.add_argument(
 parser.add_argument(
     "--reuse-predictions",
     metavar="DIR",
-    action="append",
+    nargs="*",
     help="Take predictions from indicated DIR instead of re-running them")

 add_local_parallelism_args(parser)
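With action="append", each reuse directory needed its own --reuse-predictions flag, which is exactly what the shell script's string-building could not express safely. nargs="*" lets a single flag take zero or more directories, so the script can always pass both quoted variables even when one is empty. A minimal sketch of the parsing difference:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--reuse-predictions", metavar="DIR", nargs="*")

# One flag, several values; an empty "$REUSE1" arrives as a real entry
# and is filtered out later by the caller.
args = parser.parse_args(
    ["--reuse-predictions", "", "predictions/chr1.mhcflurry.with_mass_spec"])
print(args.reuse_predictions)
# ['', 'predictions/chr1.mhcflurry.with_mass_spec']

# The old action="append" form required repeating the flag instead:
#   --reuse-predictions DIR1 --reuse-predictions DIR2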
@@ -99,21 +99,31 @@ def load_results(dirname, result_df=None):
         len(manifest_df),
         "columns")

     # Make adjustments for old style data. Can be removed later.
     if "kind" not in manifest_df.columns:
         manifest_df["kind"] = "affinity"
     if "col" not in manifest_df.columns:
         manifest_df["col"] = manifest_df.allele + " " + manifest_df.kind

     if result_df is None:
         result_df = pandas.DataFrame(
             index=peptides, columns=manifest_df.col.values, dtype="float32")
         result_df[:] = numpy.nan
+        peptides_to_assign = peptides
+        mask = None
     else:
         manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
-        peptides = peptides[peptides.isin(result_df.index)]
+        mask = (peptides.isin(result_df.index)).values
+        peptides_to_assign = peptides[mask]

     print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")

-    for _, row in manifest_df.iterrows():
+    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
         with open(os.path.join(dirname, row.path), "rb") as fd:
-            result_df.loc[
-                peptides.values, row.col
-            ] = numpy.load(fd)['arr_0']
+            value = numpy.load(fd)['arr_0']
+            if mask is not None:
+                value = value[mask]
+            result_df.loc[peptides_to_assign, row.col] = value

     return result_df
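The saved arrays are ordered by the full peptide list of the directory being reused, while result_df may index only a subset of those peptides; the old code assigned the full-length array to the filtered rows, which raises a length mismatch whenever the two differ. The new mask subsets the loaded array the same way the rows were subset. A small self-contained illustration with made-up peptides and a made-up column name:

import numpy
import pandas

# Four peptides were saved in the reused run; result_df tracks only two.
peptides = pandas.Series(["SIINFEKL", "KLGGALQAK", "GILGFVFTL", "NLVPMVATV"])
result_df = pandas.DataFrame(
    index=["KLGGALQAK", "NLVPMVATV"],
    columns=["HLA-A*02:01 affinity"],
    dtype="float32")

value = numpy.array([10.0, 20.0, 30.0, 40.0])  # aligned with `peptides`

mask = peptides.isin(result_df.index).values   # [False, True, False, True]
peptides_to_assign = peptides[mask]
result_df.loc[peptides_to_assign, "HLA-A*02:01 affinity"] = value[mask]
print(result_df)  # KLGGALQAK -> 20.0, NLVPMVATV -> 40.0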
@@ -222,10 +232,15 @@ def run(argv=sys.argv[1:]):
     if args.reuse_predictions:
         for dirname in args.reuse_predictions:
-            print("Loading predictions", dirname)
-            result_df = load_results(dirname, result_df)
-            print("Existing data filled %f%% entries" % (
-                result_df.notnull().values.mean()))
+            if not dirname:
+                continue  # ignore empty strings
+            if os.path.exists(dirname):
+                print("Loading predictions", dirname)
+                result_df = load_results(dirname, result_df)
+                print("Existing data filled %f%% entries" % (
+                    100.0 * result_df.notnull().values.mean()))
+            else:
+                print("WARNING: skipping because does not exist", dirname)

     # We rerun any alleles that have nulls for any kind of value
     # (e.g. affinity, percentile rank, elution score).
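This guard is what makes the shell side's always-pass-both-variables scheme safe: a quoted-but-unset "$REUSE1" arrives as an empty string and is silently skipped, and a directory that does not exist yet (e.g. on a first run) produces a warning instead of crashing load_results. The same filtering in isolation, with invented directory names:

import os

reuse_predictions = ["", "predictions/chr1.netmhcpan4", "/no/such/dir"]
for dirname in reuse_predictions:
    if not dirname:
        continue  # quoted-but-empty "$REUSE1"
    if os.path.exists(dirname):
        print("Loading predictions", dirname)
    else:
        print("WARNING: skipping because does not exist", dirname)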
@@ -306,6 +321,17 @@ def run(argv=sys.argv[1:]):
     for allele in alleles:
         allele_to_chunk_index_to_predictions[allele] = {}

+    last_write_time_per_column = dict((col, 0.0) for col in result_df.columns)
+
+    def write_col(col):
+        out_path = os.path.join(
+            args.out, col_to_filename[col])
+        numpy.savez(out_path, result_df[col].values)
+        print(
+            "Wrote [%f%% null]:" % (
+                result_df[col].isnull().mean() * 100.0),
+            out_path)
+
     for (work_item_num, col_to_predictions) in tqdm.tqdm(
             results, total=len(work_items)):
         for (col, predictions) in col_to_predictions.items():
@@ -313,13 +339,13 @@ def run(argv=sys.argv[1:]):
                 work_items[work_item_num]['peptides'],
                 col
             ] = predictions
-            out_path = os.path.join(
-                args.out, col_to_filename[col])
-            numpy.savez(out_path, result_df[col].values)
-            print(
-                "Wrote [%f%% null]:" % (
-                    result_df[col].isnull().mean() * 100.0),
-                out_path)
+            if time.time() - last_write_time_per_column[col] > 180:
+                write_col(col)
+                last_write_time_per_column[col] = time.time()

+    print("Done processing. Final write for each column.")
+    for col in result_df.columns:
+        write_col(col)

     print("Overall null rate (should be 0): %f" % (
         100.0 * result_df.isnull().values.flatten().mean()))
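Previously every finished work item rewrote its column's .npz immediately, so a column fed by many chunks was saved over and over. The new write_col helper plus last_write_time_per_column throttles that: a column is checkpointed at most once every 180 seconds while results stream in, then every column gets one final write. The pattern in isolation, assuming result_df and write_col as defined in the diff (maybe_write_col is a hypothetical name):

import time

WRITE_INTERVAL_SEC = 180  # matches the threshold used above

last_write_time_per_column = {col: 0.0 for col in result_df.columns}

def maybe_write_col(col):
    # Initializing timestamps to 0.0 makes the first chunk of each column
    # write through; later chunks only rewrite once the checkpoint is stale.
    if time.time() - last_write_time_per_column[col] > WRITE_INTERVAL_SEC:
        write_col(col)
        last_write_time_per_column[col] = time.time()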