Skip to content
Snippets Groups Projects
Commit 65030dcb authored by Tim O'Donnell
Browse files

fixes

parent 688f4038
No related branches found
No related tags found
No related merge requests found
......@@ -97,7 +97,9 @@ fi
# Write out and process peptides.
# First just chr1 peptides, then all peptides.
for subset in chr1 all
# TODO: switch this back
#for subset in chr1 all
for subset in all chr1
do
if [ "$2" == "reuse-all" ]
then
......@@ -123,7 +125,9 @@ do
REUSE2=""
if [ "$subset" == "all" ]
then
REUSE1="predictions/chr1.netmhcpan4"
#REUSE1="predictions/chr1.netmhcpan4"
# TODO: switch this back
REUSE1="$EXISTING_DATA"/predictions/chr1.netmhcpan4
fi
if [ "${2:-reuse-none}" != "reuse-none" ]
then
......@@ -132,6 +136,7 @@ do
python run_predictors.py \
proteome_peptides.$subset.csv.bz2 \
--result-dtype "float16" \
--predictor netmhcpan4 \
--chunk-size 10000 \
--allele $(cat alleles.txt) \
......@@ -148,7 +153,9 @@ do
REUSE2=""
if [ "$subset" == "all" ]
then
REUSE1="predictions/chr1.mhcflurry.${kind}"
#REUSE1="predictions/chr1.mhcflurry.${kind}"
# TODO: switch this back
REUSE1="$EXISTING_DATA"/predictions/chr1.mhcflurry.${kind}
fi
if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
then
......@@ -157,6 +164,7 @@ do
python run_predictors.py \
proteome_peptides.${subset}.csv.bz2 \
--result-dtype "float16" \
--predictor mhcflurry \
--chunk-size 500000 \
--mhcflurry-batch-size 65536 \
......
......@@ -77,6 +77,10 @@ parser.add_argument(
metavar="DIR",
nargs="*",
help="Take predictions from indicated DIR instead of re-running them")
parser.add_argument(
"--result-dtype",
default="float32",
help="Numpy dtype of result. Default: %(default)s.")
add_local_parallelism_args(parser)
add_cluster_parallelism_args(parser)
......@@ -87,7 +91,7 @@ PREDICTOR_TO_COLS = {
}
def load_results(dirname, result_df=None):
def load_results(dirname, result_df=None, dtype="float32"):
peptides = pandas.read_csv(
os.path.join(dirname, "peptides.csv")).peptide
manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
......@@ -107,7 +111,9 @@ def load_results(dirname, result_df=None):
if result_df is None:
result_df = pandas.DataFrame(
index=peptides, columns=manifest_df.col.values, dtype="float32")
index=peptides,
columns=manifest_df.col.values,
dtype=dtype)
result_df[:] = numpy.nan
peptides_to_assign = peptides
mask = None
......@@ -227,23 +233,28 @@ def run(argv=sys.argv[1:]):
print("Wrote: ", out_manifest)
result_df = pandas.DataFrame(
index=peptides, columns=manifest_df.col.values, dtype="float32")
index=peptides, columns=manifest_df.col.values, dtype=args.result_dtype)
result_df[:] = numpy.nan
if args.reuse_predictions:
# Allocating this here to hit any memory errors as early as possible.
is_null_matrix = pandas.DataFrame(
columns=alleles,
index=result_df.index,
dtype="int8")
for dirname in args.reuse_predictions:
if not dirname:
continue # ignore empty strings
if os.path.exists(dirname):
print("Loading predictions", dirname)
result_df = load_results(dirname, result_df)
result_df = load_results(
dirname, result_df, dtype=args.result_dtype)
else:
print("WARNING: skipping because does not exist", dirname)
# We rerun any alleles that have nulls for any kind of value
# (e.g. affinity, percentile rank, elution score).
is_null_matrix = pandas.DataFrame(
columns=alleles, index=result_df.index, dtype="int8")
for (allele, sub_df) in manifest_df.groupby("allele"):
is_null_matrix[allele] = result_df[sub_df.col.values].isnull().any(1)
print("Fraction null", is_null_matrix.values.mean())
......@@ -424,7 +435,8 @@ def do_predictions_mhctools(work_item_dicts, constant_data=None):
for (allele, sub_df) in df.groupby("allele"):
for col in cols:
result["%s %s" % (allele, col)] = (
sub_df[col].values.astype('float32'))
sub_df[col].values.astype(
constant_data['args'].result_dtype))
return results
......@@ -471,7 +483,7 @@ def do_predictions_mhcflurry(work_item_dicts, constant_data=None):
throw=False,
model_kwargs={
'batch_size': args.mhcflurry_batch_size,
}).astype('float32')
}).astype(constant_data['args'].result_dtype)
print("Done predicting in", time.time() - start, "sec")
return results
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment