Commit 65030dcb authored by Tim O'Donnell

fixes

parent 688f4038
@@ -97,7 +97,9 @@ fi

 # Write out and process peptides.
 # First just chr1 peptides, then all peptides.
-for subset in chr1 all
+# TODO: switch this back
+#for subset in chr1 all
+for subset in all chr1
 do
     if [ "$2" == "reuse-all" ]
     then
@@ -123,7 +125,9 @@ do
     REUSE2=""
     if [ "$subset" == "all" ]
     then
-        REUSE1="predictions/chr1.netmhcpan4"
+        #REUSE1="predictions/chr1.netmhcpan4"
+        # TODO: switch this back
+        REUSE1="$EXISTING_DATA"/predictions/chr1.netmhcpan4
     fi
     if [ "${2:-reuse-none}" != "reuse-none" ]
     then
@@ -132,6 +136,7 @@ do
     python run_predictors.py \
         proteome_peptides.$subset.csv.bz2 \
+        --result-dtype "float16" \
         --predictor netmhcpan4 \
         --chunk-size 10000 \
         --allele $(cat alleles.txt) \
@@ -148,7 +153,9 @@ do
     REUSE2=""
     if [ "$subset" == "all" ]
     then
-        REUSE1="predictions/chr1.mhcflurry.${kind}"
+        #REUSE1="predictions/chr1.mhcflurry.${kind}"
+        # TODO: switch this back
+        REUSE1="$EXISTING_DATA"/predictions/chr1.mhcflurry.${kind}
     fi
     if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
     then
@@ -157,6 +164,7 @@ do
     python run_predictors.py \
         proteome_peptides.${subset}.csv.bz2 \
+        --result-dtype "float16" \
        --predictor mhcflurry \
        --chunk-size 500000 \
        --mhcflurry-batch-size 65536 \
...
@@ -77,6 +77,10 @@ parser.add_argument(
     metavar="DIR",
     nargs="*",
     help="Take predictions from indicated DIR instead of re-running them")
+parser.add_argument(
+    "--result-dtype",
+    default="float32",
+    help="Numpy dtype of result. Default: %(default)s.")

 add_local_parallelism_args(parser)
 add_cluster_parallelism_args(parser)
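The new --result-dtype option controls the dtype of the peptide x (allele, score-type) result matrix, which the driver script above now sets to "float16". A minimal sketch of the memory arithmetic behind that choice; the peptide and column counts are invented for illustration (the real counts come from the proteome peptide list and alleles.txt):

    # Back-of-envelope memory for the result matrix. The counts below are
    # assumptions for illustration only.
    import numpy

    n_peptides = 10_000_000   # assumed
    n_cols = 200              # assumed: e.g. 100 alleles x 2 score columns

    for dtype in ["float32", "float16"]:
        gigabytes = n_peptides * n_cols * numpy.dtype(dtype).itemsize / 1e9
        print(dtype, "->", round(gigabytes, 1), "GB")
    # float32 -> 8.0 GB
    # float16 -> 4.0 GB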
@@ -87,7 +91,7 @@ PREDICTOR_TO_COLS = {
 }


-def load_results(dirname, result_df=None):
+def load_results(dirname, result_df=None, dtype="float32"):
     peptides = pandas.read_csv(
         os.path.join(dirname, "peptides.csv")).peptide
     manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
@@ -107,7 +111,9 @@ def load_results(dirname, result_df=None):
     if result_df is None:
         result_df = pandas.DataFrame(
-            index=peptides, columns=manifest_df.col.values, dtype="float32")
+            index=peptides,
+            columns=manifest_df.col.values,
+            dtype=dtype)
         result_df[:] = numpy.nan
         peptides_to_assign = peptides
         mask = None
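Since load_results can now allocate at float16, note that NaN (used throughout to mean "not yet predicted") is representable at that width, so the isnull-based bookkeeping is unaffected. A quick standalone check, with made-up peptide and column names:

    # NaN survives in a float16 frame, so null tracking still works.
    # Peptide and column names are illustrative.
    import numpy
    import pandas

    df = pandas.DataFrame(
        index=["SIINFEKL", "KLGGALQAK"],
        columns=["HLA-A*02:01 affinity", "HLA-A*02:01 percentile_rank"],
        dtype="float16")
    df[:] = numpy.nan
    df.loc["SIINFEKL", "HLA-A*02:01 affinity"] = 25.5

    print(df.dtypes.unique())            # all columns remain float16
    print(int(df.isnull().sum().sum()))  # 3 of the 4 cells are still null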
@@ -227,23 +233,28 @@ def run(argv=sys.argv[1:]):
     print("Wrote: ", out_manifest)

     result_df = pandas.DataFrame(
-        index=peptides, columns=manifest_df.col.values, dtype="float32")
+        index=peptides, columns=manifest_df.col.values, dtype=args.result_dtype)
     result_df[:] = numpy.nan

     if args.reuse_predictions:
+        # Allocating this here to hit any memory errors as early as possible.
+        is_null_matrix = pandas.DataFrame(
+            columns=alleles,
+            index=result_df.index,
+            dtype="int8")
+
         for dirname in args.reuse_predictions:
             if not dirname:
                 continue  # ignore empty strings
             if os.path.exists(dirname):
                 print("Loading predictions", dirname)
-                result_df = load_results(dirname, result_df)
+                result_df = load_results(
+                    dirname, result_df, dtype=args.result_dtype)
             else:
                 print("WARNING: skipping because does not exist", dirname)

         # We rerun any alleles have nulls for any kind of values
         # (e.g. affinity, percentile rank, elution score).
-        is_null_matrix = pandas.DataFrame(
-            columns=alleles, index=result_df.index, dtype="int8")
         for (allele, sub_df) in manifest_df.groupby("allele"):
             is_null_matrix[allele] = result_df[sub_df.col.values].isnull().any(1)
         print("Fraction null", is_null_matrix.values.mean())
@@ -424,7 +435,8 @@ def do_predictions_mhctools(work_item_dicts, constant_data=None):
         for (allele, sub_df) in df.groupby("allele"):
             for col in cols:
                 result["%s %s" % (allele, col)] = (
-                    sub_df[col].values.astype('float32'))
+                    sub_df[col].values.astype(
+                        constant_data['args'].result_dtype))

     return results
...@@ -471,7 +483,7 @@ def do_predictions_mhcflurry(work_item_dicts, constant_data=None): ...@@ -471,7 +483,7 @@ def do_predictions_mhcflurry(work_item_dicts, constant_data=None):
throw=False, throw=False,
model_kwargs={ model_kwargs={
'batch_size': args.mhcflurry_batch_size, 'batch_size': args.mhcflurry_batch_size,
}).astype('float32') }).astype(constant_data['args'].result_dtype)
print("Done predicting in", time.time() - start, "sec") print("Done predicting in", time.time() - start, "sec")
return results return results
......
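Both prediction workers (mhctools/netmhcpan4 and mhcflurry) now cast their outputs to the configured dtype instead of hard-coded float32. One property worth keeping in mind when running with float16; the values below are made up:

    # Downcasting predictions to the configured result dtype.
    import numpy

    result_dtype = "float16"  # in the script this comes from --result-dtype

    predictions = numpy.array([25.5, 3000.0, 50000.0])  # e.g. affinities in nM
    cast = predictions.astype(result_dtype)

    print(cast.dtype)                      # float16
    print(numpy.finfo(numpy.float16).max)  # 65504.0; larger values overflow to inf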