Commit cdcb7d81 authored by Tim O'Donnell

fix

parent 61ab5bfd
```diff
@@ -62,8 +62,9 @@ for kind in with_mass_spec no_mass_spec
 do
     python run_mhcflurry.py \
         proteome_peptides.csv.bz2 \
-        --chunk-size 10000000 \
+        --chunk-size 100000 \
         --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
+        --batch-size 65536 \
         --allele $(cat alleles.txt) \
         --out "predictions/mhcflurry.$kind" \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
```
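This hunk tightens the driver's chunking: `--chunk-size` drops from 10M to 100k peptides per job, and an explicit `--batch-size 65536` is passed through to prediction. A rough sketch of why the old chunk size starved the worker pool (the peptide count here is illustrative, not from the source):

```python
import math

# Illustrative proteome-scale peptide count; the real number depends on
# proteome_peptides.csv.bz2.
num_peptides = 50_000_000

# With 10M-peptide chunks there are only a handful of work items, so most
# workers/GPUs sit idle; 100k-peptide chunks yield hundreds of items.
print(math.ceil(num_peptides / 10_000_000))  # 5
print(math.ceil(num_peptides / 100_000))     # 500
```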
```diff
@@ -56,7 +56,7 @@ parser.add_argument(
 parser.add_argument(
     "--chunk-size",
     type=int,
-    default=100000000,
+    default=100000,
     help="Num peptides per job. Default: %(default)s")
 parser.add_argument(
     "--batch-size",
@@ -76,6 +76,12 @@ parser.add_argument(
     type=int,
     help="Keras verbosity. Default: %(default)s",
     default=0)
+parser.add_argument(
+    "--max-peptides",
+    type=int,
+    help="Max peptides to process. For debugging.",
+    default=None)
+
 add_local_parallelism_args(parser)
 add_cluster_parallelism_args(parser)
```
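The new `--max-peptides` flag feeds straight into `pandas.read_csv(..., nrows=...)` in the next hunk, so a debugging run touches only a prefix of the input. A minimal sketch of that behavior, with a hypothetical in-memory CSV standing in for proteome_peptides.csv.bz2:

```python
import io
import pandas

csv = io.StringIO("peptide\nSIINFEKL\nGILGFVFTL\nNLVPMVATV\nSIINFEKL\n")

# nrows=None (the default) reads everything; nrows=2 mimics --max-peptides 2.
peptides = pandas.read_csv(csv, nrows=2).peptide.drop_duplicates()
print(list(peptides))  # ['SIINFEKL', 'GILGFVFTL']
```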
```diff
@@ -97,18 +103,17 @@ def run(argv=sys.argv[1:]):
     serial_run = not args.cluster_parallelism and args.num_jobs == 0
 
     # It's important that we don't trigger a Keras import here since that breaks
-    # local parallelism (tensorflow backend). So we set optimization_level=0 if
-    # using local parallelism.
+    # local parallelism (tensorflow backend). So we set optimization_level=0.
     predictor = Class1AffinityPredictor.load(
         args.models_dir,
-        #optimization_level=None if serial_run or args.cluster_parallelism else 0,
         optimization_level=0,
     )
 
     alleles = [normalize_allele_name(a) for a in args.allele]
     alleles = sorted(set(alleles))
 
-    peptides = pandas.read_csv(args.input_peptides).peptide.drop_duplicates()
+    peptides = pandas.read_csv(
+        args.input_peptides, nrows=args.max_peptides).peptide.drop_duplicates()
     print("Filtering to valid peptides. Starting at: ", len(peptides))
     peptides = peptides[peptides.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")]
     print("Filtered to: ", len(peptides))
```
```diff
@@ -136,11 +141,8 @@ def run(argv=sys.argv[1:]):
     print("Wrote: ", out_alleles)
 
     num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
-    print("Split peptides into %d chunks" % num_chunks)
-    peptide_chunks = [
-        EncodableSequences.create(chunk)
-        for chunk in numpy.array_split(peptides, num_chunks)
-    ]
+    print("Splitting peptides into %d chunks" % num_chunks)
+    peptide_chunks = numpy.array_split(peptides, num_chunks)
 
     GLOBAL_DATA["predictor"] = predictor
     GLOBAL_DATA["args"] = {
```
```diff
@@ -151,14 +153,13 @@ def run(argv=sys.argv[1:]):
     }
 
     work_items = []
-    for allele in alleles:
-        for (chunk_index, chunk_peptides) in enumerate(peptide_chunks):
-            work_item = {
-                'allele': allele,
-                'chunk_index': chunk_index,
-                'chunk_peptides': chunk_peptides,
-            }
-            work_items.append(work_item)
+    for (chunk_index, chunk_peptides) in enumerate(peptide_chunks):
+        work_item = {
+            'alleles': alleles,
+            'chunk_index': chunk_index,
+            'peptides': chunk_peptides,
+        }
+        work_items.append(work_item)
     print("Work items: ", len(work_items))
 
     worker_pool = None
```
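Restructuring work items from one per (allele, chunk) pair to one per chunk, each carrying the full allele list, shrinks the task count by a factor of the number of alleles. An illustrative count (the allele list and chunk count here are made up):

```python
alleles = ["HLA-A*02:01", "HLA-B*07:02", "HLA-C*07:02"]
num_chunks = 500

# Old scheme: one task per (allele, chunk); new scheme: one task per chunk.
print(len(alleles) * num_chunks)  # 1500
print(num_chunks)                 # 500
```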
```diff
@@ -191,27 +192,30 @@ def run(argv=sys.argv[1:]):
     for allele in alleles:
         allele_to_chunk_index_to_predictions[allele] = {}
 
-    for (allele, chunk_index, predictions) in tqdm.tqdm(
+    for (chunk_index, allele_to_predictions) in tqdm.tqdm(
             results, total=len(work_items)):
-        chunk_index_to_predictions = allele_to_chunk_index_to_predictions[allele]
-
-        assert chunk_index not in chunk_index_to_predictions
-        chunk_index_to_predictions[chunk_index] = predictions
-
-        if len(allele_to_chunk_index_to_predictions[allele]) == num_chunks:
-            chunk_predictions = sorted(chunk_index_to_predictions.items())
-            assert [i for (i, _) in chunk_predictions] == list(range(num_chunks))
-            predictions = numpy.concatenate([
-                predictions for (_, predictions) in chunk_predictions
-            ])
-            assert len(predictions) == num_peptides
-            out_path = os.path.join(args.out, allele.replace("*", "")) + ".npz"
-            out_path = os.path.abspath(out_path)
-            numpy.savez(out_path, predictions)
-            print("Wrote:", out_path)
-
-            del allele_to_chunk_index_to_predictions[allele]
+        for (allele, predictions) in allele_to_predictions.items():
+            chunk_index_to_predictions = allele_to_chunk_index_to_predictions[
+                allele
+            ]
+            assert chunk_index not in chunk_index_to_predictions
+            chunk_index_to_predictions[chunk_index] = predictions
+
+            if len(allele_to_chunk_index_to_predictions[allele]) == num_chunks:
+                chunk_predictions = sorted(chunk_index_to_predictions.items())
+                assert [i for (i, _) in chunk_predictions] == list(
+                    range(num_chunks))
+                predictions = numpy.concatenate([
+                    predictions for (_, predictions) in chunk_predictions
+                ])
+                assert len(predictions) == num_peptides
+                out_path = os.path.join(
+                    args.out, allele.replace("*", "")) + ".npz"
+                out_path = os.path.abspath(out_path)
+                numpy.savez(out_path, predictions)
+                print("Wrote:", out_path)
+
+                del allele_to_chunk_index_to_predictions[allele]
 
     assert not allele_to_chunk_index_to_predictions, (
         "Not all results written: ", allele_to_chunk_index_to_predictions)
```
```diff
@@ -225,35 +229,35 @@ def run(argv=sys.argv[1:]):
         prediction_time / 60.0))
 
 
-def do_predictions(allele, chunk_index, chunk_peptides, constant_data=GLOBAL_DATA):
+def do_predictions(chunk_index, peptides, alleles, constant_data=GLOBAL_DATA):
     return predict_for_allele(
-        allele,
         chunk_index,
-        chunk_peptides,
-        constant_data['predictor'],
+        peptides,
+        alleles,
+        predictor=constant_data['predictor'],
         **constant_data["args"])
 
 
 def predict_for_allele(
-        allele,
         chunk_index,
-        chunk_peptides,
+        peptides,
+        alleles,
         predictor,
         verbose=False,
         model_kwargs={}):
-    if verbose:
-        print("Predicting", allele)
-    predictor.optimize()  # since we may have loaded with optimization_level=0
+    predictor.optimize(warn=False)  # since we loaded with optimization_level=0
     start = time.time()
-    predictions = predictor.predict(
-        peptides=chunk_peptides,
-        allele=allele,
-        throw=False,
-        model_kwargs=model_kwargs).astype('float32')
+    results = {}
+    peptides = EncodableSequences.create(peptides)
+    for allele in alleles:
+        results[allele] = predictor.predict(
+            peptides=peptides,
+            allele=allele,
+            throw=False,
+            model_kwargs=model_kwargs).astype('float32')
     if verbose:
-        print("Done predicting", allele, "in", time.time() - start, "sec")
-
-    return (allele, chunk_index, predictions)
+        print("Done predicting in", time.time() - start, "sec")
+    return (chunk_index, results)
 
 
 if __name__ == '__main__':
```
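The payoff of the per-chunk work items is visible here: the worker calls `EncodableSequences.create` once per chunk and reuses the encoded peptides across every allele, instead of re-encoding for each (allele, chunk) task. A hedged sketch of that pattern (the alleles are illustrative, and `load()` with no argument assumes the default downloaded models):

```python
from mhcflurry import Class1AffinityPredictor
from mhcflurry.encodable_sequences import EncodableSequences

predictor = Class1AffinityPredictor.load()  # assumes models are downloaded

# Encode once; the expensive peptide encoding is shared across alleles.
encoded = EncodableSequences.create(["SIINFEKL", "GILGFVFTL"])

results = {}
for allele in ["HLA-A*02:01", "HLA-B*07:02"]:
    results[allele] = predictor.predict(
        peptides=encoded, allele=allele, throw=False).astype('float32')
```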
```diff
@@ -521,7 +521,7 @@ class Class1AffinityPredictor(object):
                 "succeeded" if optimized else "not supported for these models")
         return result
 
-    def optimize(self):
+    def optimize(self, warn=True):
         """
         EXPERIMENTAL: Optimize the predictor for faster predictions.
@@ -545,7 +545,8 @@ class Class1AffinityPredictor(object):
                     merge_method="concatenate")
             ]
         except NotImplementedError as e:
-            logging.warning("Optimization failed: %s", str(e))
+            if warn:
+                logging.warning("Optimization failed: %s", str(e))
             return False
         self._manifest_df = None
         self.clear_cache()
```
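`optimize()` gains a `warn` flag so the per-worker call in `predict_for_allele` can treat a failed optimization as routine rather than logging a warning on every task. A generic sketch of the flag's effect (the `NotImplementedError` here is a stand-in for the real model-merge attempt, not mhcflurry code):

```python
import logging

def optimize(warn=True):
    try:
        # Stand-in for the model-merge attempt, which raises when the loaded
        # models cannot be merged.
        raise NotImplementedError("merging not supported for these models")
    except NotImplementedError as e:
        if warn:
            logging.warning("Optimization failed: %s", str(e))
        return False

optimize(warn=False)  # silent, as used by the worker's routine optimize() call
```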