Skip to content
Snippets Groups Projects
Commit 339e7caf authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent 66c3c43b
No related branches found
No related tags found
No related merge requests found
......@@ -40,6 +40,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
export MHCFLURRY_DEFAULT_PREDICT_BATCH_SIZE=16384
if [ "$2" == "continue-incomplete" ] && [ -f "hits_with_tpm.csv.bz2" ]
then
......@@ -67,7 +68,6 @@ else
--only-format MONOALLELIC \
--out "$(pwd)/benchmark.monoallelic.csv"
bzip2 -f benchmark.monoallelic.csv
rm -f benchmark.monoallelic.predictions.csv.bz2
fi
### GENERATE BENCHMARK: MULTIALLELIC
......@@ -174,6 +174,22 @@ else
echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_without_flanks.csv" >> commands/multiallelic.presentation_without_flanks.sh
fi
### PRECOMPUTED ####
for variant in netmhcpan4.ba netmhcpan4.el mixmhcpred
do
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.csv.bz2" ]
then
echo "Reusing existing multiallelic ${variant}"
else
echo time python join_with_precomputed.py \
\""$(pwd)/benchmark.multiallelic.csv.bz2"\" \
\""$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.${variant}"\" \
${variant} \
--out "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
fi
done
ls -lh commands
if [ "$1" != "cluster" ]
......
"""
Join benchmark with precomputed predictions.
"""
import sys
import argparse
import os
import numpy
import collections
import pandas
import tqdm
import mhcflurry
from mhcflurry.downloads import get_path
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"benchmark")
parser.add_argument(
"precomputed_data")
parser.add_argument(
"predictors",
nargs="+",
choices=("netmhcpan4.ba", "netmhcpan4.el", "mixmhcpred"))
parser.add_argument(
"--out",
metavar="CSV",
required=True,
help="File to write")
def load_results(dirname, result_df=None, columns=None):
peptides = pandas.read_csv(os.path.join(dirname, "peptides.csv")).peptide
manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
print("Loading results. Existing data has", len(peptides), "peptides and",
len(manifest_df), "columns")
if columns is None:
columns = manifest_df.col.values
if result_df is None:
result_df = pandas.DataFrame(index=peptides, columns=columns,
dtype="float32")
result_df[:] = numpy.nan
peptides_to_assign = peptides
mask = None
else:
mask = (peptides.isin(result_df.index)).values
peptides_to_assign = peptides[mask]
manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")
for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
with open(os.path.join(dirname, row.path), "rb") as fd:
value = numpy.load(fd)['arr_0']
if mask is not None:
value = value[mask]
result_df.loc[peptides_to_assign, row.col] = value
return result_df
def run():
args = parser.parse_args(sys.argv[1:])
df = pandas.read_csv(args.hits)
df["alleles"] = df.hla.str.split()
peptides = df.peptide.unique()
alleles = set()
for some in df.hla.unique():
alleles.update(some.split())
predictions_dfs = {}
if 'netmhcpan4.ba' in args.predictor:
predictions_dfs['netmhcpan4.ba'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.ba"),
result_df=pandas.DataFrame(
index=peptides,
columns=["%s affinity" % a for a in alleles])).rename(
columns=lambda s: s.replace("affinity", "").strip())
predictions_dfs['netmhcpan4.ba'] *= -1
if 'netmhcpan4.el' in args.predictor:
predictions_dfs['netmhcpan4.el'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.el"),
result_df=pandas.DataFrame(
index=peptides,
columns=["%s score" % a for a in alleles])).rename(
columns=lambda s: s.replace("score", "").strip())
if 'mixmhcpred' in args.predictor:
predictions_dfs['mixmhcpred'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
result_df=pandas.DataFrame(
index=peptides,
columns=["%s score" % a for a in alleles])).rename(
columns=lambda s: s.replace("score", "").strip())
skip_experiments = set()
for hla_text, sub_df in tqdm.tqdm(df.groupby("hla"), total=df.hla.nunique()):
hla = hla_text.split()
for (name, precomputed_df) in predictions_dfs.items():
df.loc[sub_df.index, name] = numpy.nan
prediction_df = pandas.DataFrame(index=sub_df.peptide)
for allele in hla:
if allele not in precomputed_df.columns or precomputed_df[allele].isnull().all():
print(sub_df.sample_id.unique(), hla)
skip_experiments.update(sub_df.sample_id.unique())
prediction_df[allele] = precomputed_df.loc[df.index, allele]
df.loc[sub_df.index, name] = prediction_df.max(1, skipna=False).values
df.loc[sub_df.index, name + " allele"] = prediction_df.idxmax(1, skipna=False).values
print("Skip experiments", skip_experiments)
print("results")
print(df)
df.to_csv(args.out)
print("Wrote", args.out)
if __name__ == '__main__':
run()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment