Commit 16858d14 authored by Tim O'Donnell

fix

parent 18d9460d
@@ -7,6 +7,7 @@ import sys
 import time
 import traceback
 import math
+import collections
 from functools import partial
 import numpy
@@ -134,48 +135,6 @@ def load_results(dirname, result_df=None, dtype="float32"):
     return result_df
 
 
-def blocks_of_ones(arr):
-    """
-    Given a binary matrix, return indices of rectangular blocks of 1s.
-
-    Parameters
-    ----------
-    arr : binary matrix
-
-    Returns
-    -------
-    List of (x1, y1, x2, y2) where all indices are INCLUSIVE. Each block spans
-    from (x1, y1) on its upper left corner to (x2, y2) on its lower right corner.
-    """
-    arr = arr.copy()
-    blocks = []
-    while arr.sum() > 0:
-        (x1, y1) = numpy.unravel_index(arr.argmax(), arr.shape)
-        block = [x1, y1, x1, y1]
-
-        # Extend in first dimension as far as possible
-        down_stop = numpy.argmax(arr[x1:, y1] == 0) - 1
-        if down_stop == -1:
-            block[2] = arr.shape[0] - 1
-        else:
-            assert down_stop >= 0
-            block[2] = x1 + down_stop
-
-        # Extend in second dimension as far as possible
-        for i in range(y1, arr.shape[1]):
-            if (arr[block[0] : block[2] + 1, i] == 1).all():
-                block[3] = i
-
-        # Zero out block:
-        assert (
-            arr[block[0]: block[2] + 1, block[1] : block[3] + 1] == 1).all(), (arr, block)
-        arr[block[0] : block[2] + 1, block[1] : block[3] + 1] = 0
-        blocks.append(block)
-    return blocks
-
-
 def run(argv=sys.argv[1:]):
     global GLOBAL_DATA
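
For context on the helper removed above, here is a small illustration (not part of the commit; the input matrix is hypothetical) of what blocks_of_ones returned for a toy binary matrix, using the inclusive (x1, y1, x2, y2) convention from its docstring:

import numpy

# Toy input: a 2x2 block of 1s in the upper-left corner plus an isolated 1.
arr = numpy.array([
    [1, 1, 0],
    [1, 1, 0],
    [0, 0, 1],
])

# blocks_of_ones(arr) grows a rectangle of 1s downward and then rightward
# from the first remaining 1, zeroes that rectangle out, and repeats until
# no 1s are left. Assuming the removed definition above is in scope, the
# call below yields two blocks: (0, 0, 1, 1), the 2x2 block covering rows
# 0-1 and columns 0-1, and (2, 2, 2, 2), the single 1 at row 2, column 2.
print(blocks_of_ones(arr))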
@@ -190,7 +149,7 @@ def run(argv=sys.argv[1:]):
     serial_run = not args.cluster_parallelism and args.num_jobs == 0
 
     alleles = [normalize_allele_name(a) for a in args.allele]
-    alleles = sorted(set(alleles))
+    alleles = numpy.array(sorted(set(alleles)))
 
     peptides = pandas.read_csv(
         args.input_peptides, nrows=args.max_peptides).peptide.drop_duplicates()
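
A short aside (my reading, not stated in the commit): converting the sorted allele list to a numpy array allows it to be indexed with a list of integer positions, which a plain Python list would not accept and which the new work-item code below relies on. A toy illustration with hypothetical allele names:

import numpy

alleles = numpy.array(sorted({"HLA-A*01:01", "HLA-A*02:01", "HLA-B*07:02"}))

# Fancy indexing with a list of positions selects a subset of alleles;
# the same expression on a plain list would raise a TypeError.
print(alleles[[0, 2]])   # ['HLA-A*01:01' 'HLA-B*07:02']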
@@ -251,35 +210,32 @@ def run(argv=sys.argv[1:]):
             else:
                 print("WARNING: skipping because does not exist", dirname)
 
-        # We rerun any alleles have nulls for any kind of values
+        # We rerun any alleles that have nulls for any kind of values
         # (e.g. affinity, percentile rank, elution score).
         for (i, allele) in enumerate(alleles):
             sub_df = manifest_df.loc[manifest_df.allele == allele]
             is_null_matrix[:, i] = result_df[sub_df.col.values].isnull().any(1)
         print("Fraction null", is_null_matrix.mean())
 
-        print("Computing blocks.")
-        start = time.time()
-        blocks = blocks_of_ones(is_null_matrix)
-        print("Found %d blocks in %f sec." % (
-            len(blocks), (time.time() - start)))
+        print("Grouping peptides by alleles")
+        allele_indices_to_peptides = collections.defaultdict(list)
+        for (i, peptide) in tqdm.tqdm(enumerate(peptides), total=len(peptides)):
+            (allele_indices,) = numpy.where(is_null_matrix[i])
+            if len(allele_indices) > 0:
+                allele_indices_to_peptides[tuple(allele_indices)].append(peptide)
 
-        work_items = []
-        for (row_index1, col_index1, row_index2, col_index2) in blocks:
-            block_alleles = alleles[col_index1 : col_index2 + 1]
-            block_peptides = result_df.index[row_index1 : row_index2 + 1]
+        del is_null_matrix
 
-            print("Block: ", row_index1, col_index1, row_index2, col_index2)
+        work_items = []
+        print("Assigning peptides to work items.")
+        for (indices, block_peptides) in allele_indices_to_peptides.items():
             num_chunks = int(math.ceil(len(block_peptides) / args.chunk_size))
             print("Splitting peptides into %d chunks" % num_chunks)
             peptide_chunks = numpy.array_split(peptides, num_chunks)
 
             for chunk_peptides in peptide_chunks:
-                work_item = {
-                    'alleles': block_alleles,
+                work_items.append({
+                    'alleles': alleles[list(allele_indices_to_peptides)],
                     'peptides': chunk_peptides,
-                }
-                work_items.append(work_item)
+                })
     else:
         # Same number of chunks for all alleles
        num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
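
A minimal, self-contained sketch (the peptides, null matrix, and chunk size are hypothetical) of the grouping strategy the new code switches to: peptides are bucketed by the exact set of alleles for which they still have null results, and each bucket is split into chunks to form work items:

import collections
import math

import numpy

peptides = ["SIINFEKL", "GILGFVFTL", "NLVPMVATV"]

# Rows are peptides, columns are alleles; True marks a missing prediction.
is_null_matrix = numpy.array([
    [True,  False, True],
    [True,  False, True],
    [False, True,  False],
])

# Bucket peptides by the tuple of allele indices that are still null.
allele_indices_to_peptides = collections.defaultdict(list)
for (i, peptide) in enumerate(peptides):
    (allele_indices,) = numpy.where(is_null_matrix[i])
    if len(allele_indices) > 0:
        allele_indices_to_peptides[tuple(allele_indices)].append(peptide)

# Buckets: (0, 2) -> ['SIINFEKL', 'GILGFVFTL'] and (1,) -> ['NLVPMVATV']
print(dict(allele_indices_to_peptides))

# Split each bucket into fixed-size chunks, one work item per chunk.
chunk_size = 2
for (indices, bucket) in allele_indices_to_peptides.items():
    num_chunks = int(math.ceil(len(bucket) / chunk_size))
    for chunk in numpy.array_split(bucket, num_chunks):
        print(indices, list(chunk))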