From 16858d1446424d33d3012e02206390cf794dc2dd Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 4 Oct 2019 14:01:51 -0400
Subject: [PATCH] Group peptides to rerun by missing alleles instead of blocks

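Replace the rectangular-block decomposition of the null matrix
(blocks_of_ones) with a direct grouping: each peptide is keyed by the
tuple of allele indices that still have null results, and one set of
work items is generated per group. The alleles list becomes a numpy
array so a group's indices can be used for fancy indexing, and each
group's own peptides (rather than the full peptide list) are split
into chunks.

A minimal sketch of the grouping idea (the matrix, peptides, and
variable names below are illustrative, not taken from the script):

    import collections
    import numpy

    # Rows are peptides, columns are alleles; True marks a missing
    # (null) prediction that needs to be rerun.
    is_null = numpy.array([
        [True, False, True],
        [True, False, True],
        [False, True, False],
    ])
    peptides = ["AAA", "BBB", "CCC"]

    groups = collections.defaultdict(list)
    for i, peptide in enumerate(peptides):
        (allele_indices,) = numpy.where(is_null[i])
        if len(allele_indices) > 0:
            groups[tuple(allele_indices)].append(peptide)

    # groups: {(0, 2): ["AAA", "BBB"], (1,): ["CCC"]}
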
---
 .../run_predictors.py                         | 81 +++++--------------
 1 file changed, 20 insertions(+), 61 deletions(-)

diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
index d74d2a80..d4324737 100644
--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -7,6 +7,7 @@ import sys
 import time
 import traceback
 import math
+import collections
 from functools import partial
 
 import numpy
@@ -134,48 +135,6 @@ def load_results(dirname, result_df=None, dtype="float32"):
     return result_df
 
 
-def blocks_of_ones(arr):
-    """
-    Given a binary matrix, return indices of rectangular blocks of 1s.
-
-    Parameters
-    ----------
-    arr : binary matrix
-
-    Returns
-    -------
-    List of (x1, y1, x2, y2) where all indices are INCLUSIVE. Each block spans
-    from (x1, y1) on its upper left corner to (x2, y2) on its lower right corner.
-
-    """
-    arr = arr.copy()
-    blocks = []
-    while arr.sum() > 0:
-        (x1, y1) = numpy.unravel_index(arr.argmax(), arr.shape)
-        block = [x1, y1, x1, y1]
-
-        # Extend in first dimension as far as possible
-        down_stop = numpy.argmax(arr[x1:, y1] == 0) - 1
-        if down_stop == -1:
-            block[2] = arr.shape[0] - 1
-        else:
-            assert down_stop >= 0
-            block[2] = x1 + down_stop
-
-        # Extend in second dimension as far as possible
-        for i in range(y1, arr.shape[1]):
-            if (arr[block[0] : block[2] + 1, i] == 1).all():
-                block[3] = i
-
-        # Zero out block:
-        assert (
-            arr[block[0]: block[2] + 1, block[1] : block[3] + 1] == 1).all(), (arr, block)
-        arr[block[0] : block[2] + 1, block[1] : block[3] + 1] = 0
-
-        blocks.append(block)
-    return blocks
-
-
 def run(argv=sys.argv[1:]):
     global GLOBAL_DATA
 
@@ -190,7 +149,7 @@ def run(argv=sys.argv[1:]):
     serial_run = not args.cluster_parallelism and args.num_jobs == 0
 
     alleles = [normalize_allele_name(a) for a in args.allele]
-    alleles = sorted(set(alleles))
+    alleles = numpy.array(sorted(set(alleles)))
 
     peptides = pandas.read_csv(
         args.input_peptides, nrows=args.max_peptides).peptide.drop_duplicates()
@@ -251,35 +210,35 @@
             else:
                 print("WARNING: skipping because does not exist", dirname)
 
-        # We rerun any alleles have nulls for any kind of values
+        # We rerun any alleles that have nulls for any kind of values
         # (e.g. affinity, percentile rank, elution score).
         for (i, allele) in enumerate(alleles):
             sub_df = manifest_df.loc[manifest_df.allele == allele]
             is_null_matrix[:, i] = result_df[sub_df.col.values].isnull().any(1)
         print("Fraction null", is_null_matrix.mean())
 
-        print("Computing blocks.")
-        start = time.time()
-        blocks = blocks_of_ones(is_null_matrix)
-        print("Found %d blocks in %f sec." % (
-            len(blocks), (time.time() - start)))
+        print("Grouping peptides by their null alleles.")
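+        # Key: tuple of allele (column) indices with a null result for a
+        # peptide; value: the peptides that need exactly those alleles rerun.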
+        allele_indices_to_peptides = collections.defaultdict(list)
+        for (i, peptide) in tqdm.tqdm(enumerate(peptides), total=len(peptides)):
+            (allele_indices,) = numpy.where(is_null_matrix[i])
+            if len(allele_indices) > 0:
+                allele_indices_to_peptides[tuple(allele_indices)].append(peptide)
 
-        work_items = []
-        for (row_index1, col_index1, row_index2, col_index2) in blocks:
-            block_alleles = alleles[col_index1 : col_index2 + 1]
-            block_peptides = result_df.index[row_index1 : row_index2 + 1]
+        del is_null_matrix
 
-            print("Block: ", row_index1, col_index1, row_index2, col_index2)
+        work_items = []
+        print("Assigning peptides to work items.")
+        for (indices, block_peptides) in allele_indices_to_peptides.items():
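+            # Split the group into chunks of at most args.chunk_size peptides.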
             num_chunks = int(math.ceil(len(block_peptides) / args.chunk_size))
-            print("Splitting peptides into %d chunks" % num_chunks)
-            peptide_chunks = numpy.array_split(peptides, num_chunks)
+            peptide_chunks = numpy.array_split(block_peptides, num_chunks)
-
             for chunk_peptides in peptide_chunks:
-                work_item = {
-                    'alleles': block_alleles,
+                work_items.append({
+                    'alleles': alleles[list(indices)],
                     'peptides': chunk_peptides,
-                }
-                work_items.append(work_item)
+                })
     else:
         # Same number of chunks for all alleles
         num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
-- 
GitLab