Commit cdcb7d81 authored by Tim O'Donnell

fix

parent 61ab5bfd
```diff
@@ -62,8 +62,9 @@ for kind in with_mass_spec no_mass_spec
 do
     python run_mhcflurry.py \
         proteome_peptides.csv.bz2 \
-        --chunk-size 10000000 \
+        --chunk-size 100000 \
         --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
+        --batch-size 65536 \
         --allele $(cat alleles.txt) \
         --out "predictions/mhcflurry.$kind" \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
```
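This hunk tightens the driver's chunking: `--chunk-size` drops from 10M to 100k peptides per job, and an explicit `--batch-size 65536` is passed through to prediction. A rough sketch of why the old chunk size starved the worker pool (the peptide count here is illustrative, not from the source):

```python
import math

# Illustrative proteome-scale peptide count; the real number depends on
# proteome_peptides.csv.bz2.
num_peptides = 50_000_000

# With 10M-peptide chunks there are only a handful of work items, so most
# workers/GPUs sit idle; 100k-peptide chunks yield hundreds of items.
print(math.ceil(num_peptides / 10_000_000))  # 5
print(math.ceil(num_peptides / 100_000))     # 500
```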
```diff
@@ -56,7 +56,7 @@ parser.add_argument(
 parser.add_argument(
     "--chunk-size",
     type=int,
-    default=100000000,
+    default=100000,
     help="Num peptides per job. Default: %(default)s")
 parser.add_argument(
     "--batch-size",
@@ -76,6 +76,12 @@ parser.add_argument(
     type=int,
     help="Keras verbosity. Default: %(default)s",
     default=0)
+parser.add_argument(
+    "--max-peptides",
+    type=int,
+    help="Max peptides to process. For debugging.",
+    default=None)
+
 add_local_parallelism_args(parser)
 add_cluster_parallelism_args(parser)
```
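The new `--max-peptides` flag feeds straight into `pandas.read_csv(..., nrows=...)` in the next hunk, so a debugging run touches only a prefix of the input. A minimal sketch of that behavior, with a hypothetical in-memory CSV standing in for proteome_peptides.csv.bz2:

```python
import io
import pandas

csv = io.StringIO("peptide\nSIINFEKL\nGILGFVFTL\nNLVPMVATV\nSIINFEKL\n")

# nrows=None (the default) reads everything; nrows=2 mimics --max-peptides 2.
peptides = pandas.read_csv(csv, nrows=2).peptide.drop_duplicates()
print(list(peptides))  # ['SIINFEKL', 'GILGFVFTL']
```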
```diff
@@ -97,18 +103,17 @@ def run(argv=sys.argv[1:]):
     serial_run = not args.cluster_parallelism and args.num_jobs == 0
 
     # It's important that we don't trigger a Keras import here since that breaks
-    # local parallelism (tensorflow backend). So we set optimization_level=0 if
-    # using local parallelism.
+    # local parallelism (tensorflow backend). So we set optimization_level=0.
     predictor = Class1AffinityPredictor.load(
         args.models_dir,
-        #optimization_level=None if serial_run or args.cluster_parallelism else 0,
         optimization_level=0,
     )
 
     alleles = [normalize_allele_name(a) for a in args.allele]
     alleles = sorted(set(alleles))
 
-    peptides = pandas.read_csv(args.input_peptides).peptide.drop_duplicates()
+    peptides = pandas.read_csv(
+        args.input_peptides, nrows=args.max_peptides).peptide.drop_duplicates()
     print("Filtering to valid peptides. Starting at: ", len(peptides))
     peptides = peptides[peptides.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")]
     print("Filtered to: ", len(peptides))
```
```diff
@@ -136,11 +141,8 @@ def run(argv=sys.argv[1:]):
     print("Wrote: ", out_alleles)
 
     num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
-    print("Split peptides into %d chunks" % num_chunks)
-    peptide_chunks = [
-        EncodableSequences.create(chunk)
-        for chunk in numpy.array_split(peptides, num_chunks)
-    ]
+    print("Splitting peptides into %d chunks" % num_chunks)
+    peptide_chunks = numpy.array_split(peptides, num_chunks)
 
     GLOBAL_DATA["predictor"] = predictor
     GLOBAL_DATA["args"] = {
```
```diff
@@ -151,14 +153,13 @@ def run(argv=sys.argv[1:]):
     }
 
     work_items = []
-    for allele in alleles:
-        for (chunk_index, chunk_peptides) in enumerate(peptide_chunks):
-            work_item = {
-                'allele': allele,
-                'chunk_index': chunk_index,
-                'chunk_peptides': chunk_peptides,
-            }
-            work_items.append(work_item)
+    for (chunk_index, chunk_peptides) in enumerate(peptide_chunks):
+        work_item = {
+            'alleles': alleles,
+            'chunk_index': chunk_index,
+            'peptides': chunk_peptides,
+        }
+        work_items.append(work_item)
     print("Work items: ", len(work_items))
 
     worker_pool = None
```
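Restructuring work items from one per (allele, chunk) pair to one per chunk, each carrying the full allele list, shrinks the task count by a factor of the number of alleles. An illustrative count (the allele list and chunk count here are made up):

```python
alleles = ["HLA-A*02:01", "HLA-B*07:02", "HLA-C*07:02"]
num_chunks = 500

# Old scheme: one task per (allele, chunk); new scheme: one task per chunk.
print(len(alleles) * num_chunks)  # 1500
print(num_chunks)                 # 500
```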
```diff
@@ -191,27 +192,30 @@ def run(argv=sys.argv[1:]):
     for allele in alleles:
         allele_to_chunk_index_to_predictions[allele] = {}
 
-    for (allele, chunk_index, predictions) in tqdm.tqdm(
+    for (chunk_index, allele_to_predictions) in tqdm.tqdm(
             results, total=len(work_items)):
-        chunk_index_to_predictions = allele_to_chunk_index_to_predictions[allele]
-
-        assert chunk_index not in chunk_index_to_predictions
-        chunk_index_to_predictions[chunk_index] = predictions
-
-        if len(allele_to_chunk_index_to_predictions[allele]) == num_chunks:
-            chunk_predictions = sorted(chunk_index_to_predictions.items())
-            assert [i for (i, _) in chunk_predictions] == list(range(num_chunks))
-            predictions = numpy.concatenate([
-                predictions for (_, predictions) in chunk_predictions
-            ])
-            assert len(predictions) == num_peptides
-            out_path = os.path.join(args.out, allele.replace("*", "")) + ".npz"
-            out_path = os.path.abspath(out_path)
-            numpy.savez(out_path, predictions)
-            print("Wrote:", out_path)
-
-            del allele_to_chunk_index_to_predictions[allele]
+        for (allele, predictions) in allele_to_predictions.items():
+            chunk_index_to_predictions = allele_to_chunk_index_to_predictions[
+                allele
+            ]
+            assert chunk_index not in chunk_index_to_predictions
+            chunk_index_to_predictions[chunk_index] = predictions
+
+            if len(allele_to_chunk_index_to_predictions[allele]) == num_chunks:
+                chunk_predictions = sorted(chunk_index_to_predictions.items())
+                assert [i for (i, _) in chunk_predictions] == list(
+                    range(num_chunks))
+                predictions = numpy.concatenate([
+                    predictions for (_, predictions) in chunk_predictions
+                ])
+                assert len(predictions) == num_peptides
+                out_path = os.path.join(
+                    args.out, allele.replace("*", "")) + ".npz"
+                out_path = os.path.abspath(out_path)
+                numpy.savez(out_path, predictions)
+                print("Wrote:", out_path)
+
+                del allele_to_chunk_index_to_predictions[allele]
 
     assert not allele_to_chunk_index_to_predictions, (
         "Not all results written: ", allele_to_chunk_index_to_predictions)
```
```diff
@@ -225,35 +229,35 @@ def run(argv=sys.argv[1:]):
         prediction_time / 60.0))
 
 
-def do_predictions(allele, chunk_index, chunk_peptides, constant_data=GLOBAL_DATA):
+def do_predictions(chunk_index, peptides, alleles, constant_data=GLOBAL_DATA):
     return predict_for_allele(
-        allele,
         chunk_index,
-        chunk_peptides,
-        constant_data['predictor'],
+        peptides,
+        alleles,
+        predictor=constant_data['predictor'],
         **constant_data["args"])
 
 
 def predict_for_allele(
-        allele,
         chunk_index,
-        chunk_peptides,
+        peptides,
+        alleles,
         predictor,
         verbose=False,
         model_kwargs={}):
-    if verbose:
-        print("Predicting", allele)
-    predictor.optimize()  # since we may have loaded with optimization_level=0
+    predictor.optimize(warn=False)  # since we loaded with optimization_level=0
     start = time.time()
-    predictions = predictor.predict(
-        peptides=chunk_peptides,
-        allele=allele,
-        throw=False,
-        model_kwargs=model_kwargs).astype('float32')
+    results = {}
+    peptides = EncodableSequences.create(peptides)
+    for allele in alleles:
+        results[allele] = predictor.predict(
+            peptides=peptides,
+            allele=allele,
+            throw=False,
+            model_kwargs=model_kwargs).astype('float32')
     if verbose:
-        print("Done predicting", allele, "in", time.time() - start, "sec")
-
-    return (allele, chunk_index, predictions)
+        print("Done predicting in", time.time() - start, "sec")
+    return (chunk_index, results)
 
 
 if __name__ == '__main__':
```
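The payoff of the per-chunk work items is visible here: the worker calls `EncodableSequences.create` once per chunk and reuses the encoded peptides across every allele, instead of re-encoding for each (allele, chunk) task. A hedged sketch of that pattern (the alleles are illustrative, and `load()` with no argument assumes the default downloaded models):

```python
from mhcflurry import Class1AffinityPredictor
from mhcflurry.encodable_sequences import EncodableSequences

predictor = Class1AffinityPredictor.load()  # assumes models are downloaded

# Encode once; the expensive peptide encoding is shared across alleles.
encoded = EncodableSequences.create(["SIINFEKL", "GILGFVFTL"])

results = {}
for allele in ["HLA-A*02:01", "HLA-B*07:02"]:
    results[allele] = predictor.predict(
        peptides=encoded, allele=allele, throw=False).astype('float32')
```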
```diff
@@ -521,7 +521,7 @@ class Class1AffinityPredictor(object):
                 "succeeded" if optimized else "not supported for these models")
         return result
 
-    def optimize(self):
+    def optimize(self, warn=True):
         """
         EXPERIMENTAL: Optimize the predictor for faster predictions.
@@ -545,7 +545,8 @@ class Class1AffinityPredictor(object):
                     merge_method="concatenate")
             ]
         except NotImplementedError as e:
-            logging.warning("Optimization failed: %s", str(e))
+            if warn:
+                logging.warning("Optimization failed: %s", str(e))
             return False
         self._manifest_df = None
         self.clear_cache()
```
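`optimize()` gains a `warn` flag so the per-worker call in `predict_for_allele` can treat a failed optimization as routine rather than logging a warning on every task. A generic sketch of the flag's effect (the `NotImplementedError` here is a stand-in for the real model-merge attempt, not mhcflurry code):

```python
import logging

def optimize(warn=True):
    try:
        # Stand-in for the model-merge attempt, which raises when the loaded
        # models cannot be merged.
        raise NotImplementedError("merging not supported for these models")
    except NotImplementedError as e:
        if warn:
            logging.warning("Optimization failed: %s", str(e))
        return False

optimize(warn=False)  # silent, as used by the worker's routine optimize() call
```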