Commit 886f99db authored by Tim O'Donnell

fixes

parent ea043877
@@ -76,8 +76,8 @@ REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 if [ "${2:-reuse-none}" != "reuse-none" ]
 then
-EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME)
-echo "Will reuse data from $REFERENCES_DIR"
+EXISTING_DATA="$(mhcflurry-downloads path $DOWNLOAD_NAME)"
+echo "Will reuse data from $EXISTING_DATA"
 else
 EXISTING_DATA=""
 echo "Will NOT reuse any data"
@@ -121,14 +121,15 @@ do
 for kind in with_mass_spec no_mass_spec
 do
 OUT_DIR=predictions/${subset}.mhcflurry.${kind}
-REUSE_ARG=""
+REUSE1=""
+REUSE2=""
 if [ "$subset" == "all" ]
 then
-REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
+REUSE1="predictions/chr1.mhcflurry.${kind}"
 fi
 if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
 then
-REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+REUSE2="$EXISTING_DATA"/$OUT_DIR
 fi
 python run_predictors.py \
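Note on the removed line above: REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR" never appended anything. Bash parses it as a temporary variable assignment followed by a command word, so the shell tried to execute the directory path itself. The fix keeps the two candidate reuse directories in separate variables (REUSE1, REUSE2) and always passes both, quoted, after a single --reuse-predictions flag. A minimal Python sketch of the resulting argv, with a hypothetical path and the other required flags elided:

import subprocess
import sys

# Hypothetical invocation: with "$REUSE1" unset, the quoted expansion
# still produces an (empty) argv entry; run_predictors.py, changed below
# to nargs="*", skips empty strings.
subprocess.run([
    sys.executable, "run_predictors.py",
    "--reuse-predictions",
    "",                                                          # "$REUSE1"
    "/data/existing/predictions/all.mhcflurry.with_mass_spec",   # "$REUSE2"
])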
@@ -141,19 +142,20 @@ do
 --out "$OUT_DIR" \
 --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
 --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \
-$REUSE_ARG $EXTRA_ARGS
+--reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
 done
 # Run netmhcpan4
 OUT_DIR=predictions/${subset}.netmhcpan4
-REUSE_ARG=""
+REUSE1=""
+REUSE2=""
 if [ "$subset" == "all" ]
 then
-REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
+REUSE1="predictions/chr1.netmhcpan4"
 fi
 if [ "${2:-reuse-none}" != "reuse-none" ]
 then
-REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+REUSE2="$EXISTING_DATA"/$OUT_DIR
 fi
 python run_predictors.py \
@@ -164,7 +166,7 @@ do
 --out "$OUT_DIR" \
 --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
 --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
-$REUSE_ARG $EXTRA_ARGS
+--reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
 done
 cp $SCRIPT_ABSOLUTE_PATH .
@@ -182,6 +184,6 @@ do
 echo "WARNING: already exists: $i . Moving to $DEST"
 mv $i $DEST
 done
-split -b 2000M "$RESULT" "$PARTS"
+split -b 2000m "$RESULT" "$PARTS"
 echo "Split into parts:"
 ls -lh "${PARTS}"*
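The size suffix changed from 2000M to 2000m presumably for portability: older BSD split (e.g. on macOS) accepts only the lowercase k/m suffixes, while GNU split accepts both. Either way the call cuts $RESULT into consecutive 2000 MiB pieces named ${PARTS}aa, ${PARTS}ab, and so on. A rough, self-contained Python equivalent of that behavior (a hypothetical helper, not part of the commit):

import string

CHUNK_BYTES = 2000 * 1024 * 1024  # "2000m": 2000 MiB per piece

def split_file(result_path, parts_prefix, chunk=CHUNK_BYTES):
    # Two-letter suffixes in split(1) order: aa, ab, ..., zz.
    suffixes = (a + b
                for a in string.ascii_lowercase
                for b in string.ascii_lowercase)
    with open(result_path, "rb") as src:
        for suffix in suffixes:
            data = src.read(chunk)
            if not data:
                break
            with open(parts_prefix + suffix, "wb") as dst:
                dst.write(data)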
@@ -5,7 +5,7 @@
 #BSUB -R span[hosts=1] # one node
 #BSUB -n 1 # number of compute cores
 #BSUB -W 12:00 # walltime in HH:MM
-#BSUB -R rusage[mem=4000] # mb memory requested
+#BSUB -R rusage[mem=10000] # mb memory requested
 #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
 #BSUB -eo {work_dir}/STDERR # error log
 #BSUB -L /bin/bash # Initialize the execution environment
@@ -75,7 +75,7 @@ parser.add_argument(
 parser.add_argument(
     "--reuse-predictions",
     metavar="DIR",
-    action="append",
+    nargs="*",
     help="Take predictions from indicated DIR instead of re-running them")

 add_local_parallelism_args(parser)
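With action="append", each reuse directory needed its own --reuse-predictions flag, which is exactly what the shell script's string-building could not express safely. nargs="*" lets a single flag take zero or more directories, so the script can always pass both quoted variables even when one is empty. A minimal sketch of the parsing difference:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--reuse-predictions", metavar="DIR", nargs="*")

# One flag, several values; an empty "$REUSE1" arrives as a real entry
# and is filtered out later by the caller.
args = parser.parse_args(
    ["--reuse-predictions", "", "predictions/chr1.mhcflurry.with_mass_spec"])
print(args.reuse_predictions)
# ['', 'predictions/chr1.mhcflurry.with_mass_spec']

# The old action="append" form required repeating the flag instead:
#   --reuse-predictions DIR1 --reuse-predictions DIR2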
@@ -99,21 +99,31 @@ def load_results(dirname, result_df=None):
         len(manifest_df),
         "columns")

     # Make adjustments for old style data. Can be removed later.
     if "kind" not in manifest_df.columns:
         manifest_df["kind"] = "affinity"
     if "col" not in manifest_df.columns:
         manifest_df["col"] = manifest_df.allele + " " + manifest_df.kind

     if result_df is None:
         result_df = pandas.DataFrame(
             index=peptides, columns=manifest_df.col.values, dtype="float32")
         result_df[:] = numpy.nan
+        peptides_to_assign = peptides
+        mask = None
     else:
         manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
-        peptides = peptides[peptides.isin(result_df.index)]
+        mask = (peptides.isin(result_df.index)).values
+        peptides_to_assign = peptides[mask]

     print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")

-    for _, row in manifest_df.iterrows():
+    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
         with open(os.path.join(dirname, row.path), "rb") as fd:
-            result_df.loc[
-                peptides.values, row.col
-            ] = numpy.load(fd)['arr_0']
+            value = numpy.load(fd)['arr_0']
+            if mask is not None:
+                value = value[mask]
+            result_df.loc[peptides_to_assign, row.col] = value

     return result_df
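The saved arrays are ordered by the full peptide list of the directory being reused, while result_df may index only a subset of those peptides; the old code assigned the full-length array to the filtered rows, which raises a length mismatch whenever the two differ. The new mask subsets the loaded array the same way the rows were subset. A small self-contained illustration with made-up peptides and a made-up column name:

import numpy
import pandas

# Four peptides were saved in the reused run; result_df tracks only two.
peptides = pandas.Series(["SIINFEKL", "KLGGALQAK", "GILGFVFTL", "NLVPMVATV"])
result_df = pandas.DataFrame(
    index=["KLGGALQAK", "NLVPMVATV"],
    columns=["HLA-A*02:01 affinity"],
    dtype="float32")

value = numpy.array([10.0, 20.0, 30.0, 40.0])  # aligned with `peptides`

mask = peptides.isin(result_df.index).values   # [False, True, False, True]
peptides_to_assign = peptides[mask]
result_df.loc[peptides_to_assign, "HLA-A*02:01 affinity"] = value[mask]
print(result_df)  # KLGGALQAK -> 20.0, NLVPMVATV -> 40.0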
@@ -222,10 +232,15 @@ def run(argv=sys.argv[1:]):
     if args.reuse_predictions:
         for dirname in args.reuse_predictions:
-            print("Loading predictions", dirname)
-            result_df = load_results(dirname, result_df)
-            print("Existing data filled %f%% entries" % (
-                result_df.notnull().values.mean()))
+            if not dirname:
+                continue  # ignore empty strings
+            if os.path.exists(dirname):
+                print("Loading predictions", dirname)
+                result_df = load_results(dirname, result_df)
+                print("Existing data filled %f%% entries" % (
+                    100.0 * result_df.notnull().values.mean()))
+            else:
+                print("WARNING: skipping because does not exist", dirname)

     # We rerun any alleles that have nulls for any kind of value
     # (e.g. affinity, percentile rank, elution score).
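This guard is what makes the shell side's always-pass-both-variables scheme safe: a quoted-but-unset "$REUSE1" arrives as an empty string and is silently skipped, and a directory that does not exist yet (e.g. on a first run) produces a warning instead of crashing load_results. The same filtering in isolation, with invented directory names:

import os

reuse_predictions = ["", "predictions/chr1.netmhcpan4", "/no/such/dir"]
for dirname in reuse_predictions:
    if not dirname:
        continue  # quoted-but-empty "$REUSE1"
    if os.path.exists(dirname):
        print("Loading predictions", dirname)
    else:
        print("WARNING: skipping because does not exist", dirname)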
@@ -306,6 +321,17 @@ def run(argv=sys.argv[1:]):
     for allele in alleles:
         allele_to_chunk_index_to_predictions[allele] = {}

+    last_write_time_per_column = dict((col, 0.0) for col in result_df.columns)
+
+    def write_col(col):
+        out_path = os.path.join(
+            args.out, col_to_filename[col])
+        numpy.savez(out_path, result_df[col].values)
+        print(
+            "Wrote [%f%% null]:" % (
+                result_df[col].isnull().mean() * 100.0),
+            out_path)
+
     for (work_item_num, col_to_predictions) in tqdm.tqdm(
             results, total=len(work_items)):
         for (col, predictions) in col_to_predictions.items():
@@ -313,13 +339,13 @@ def run(argv=sys.argv[1:]):
                 work_items[work_item_num]['peptides'],
                 col
             ] = predictions
-            out_path = os.path.join(
-                args.out, col_to_filename[col])
-            numpy.savez(out_path, result_df[col].values)
-            print(
-                "Wrote [%f%% null]:" % (
-                    result_df[col].isnull().mean() * 100.0),
-                out_path)
+            if time.time() - last_write_time_per_column[col] > 180:
+                write_col(col)
+                last_write_time_per_column[col] = time.time()

+    print("Done processing. Final write for each column.")
+    for col in result_df.columns:
+        write_col(col)

     print("Overall null rate (should be 0): %f" % (
         100.0 * result_df.isnull().values.flatten().mean()))
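Previously every finished work item rewrote its column's .npz immediately, so a column fed by many chunks was saved over and over. The new write_col helper plus last_write_time_per_column throttles that: a column is checkpointed at most once every 180 seconds while results stream in, then every column gets one final write. The pattern in isolation, assuming result_df and write_col as defined in the diff (maybe_write_col is a hypothetical name):

import time

WRITE_INTERVAL_SEC = 180  # matches the threshold used above

last_write_time_per_column = {col: 0.0 for col in result_df.columns}

def maybe_write_col(col):
    # Initializing timestamps to 0.0 makes the first chunk of each column
    # write through; later chunks only rewrite once the checkpoint is stale.
    if time.time() - last_write_time_per_column[col] > WRITE_INTERVAL_SEC:
        write_col(col)
        last_write_time_per_column[col] = time.time()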