Skip to content
Snippets Groups Projects
Commit 339e7caf authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent 66c3c43b
No related merge requests found
......@@ -40,6 +40,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
export MHCFLURRY_DEFAULT_PREDICT_BATCH_SIZE=16384
if [ "$2" == "continue-incomplete" ] && [ -f "hits_with_tpm.csv.bz2" ]
then
......@@ -67,7 +68,6 @@ else
--only-format MONOALLELIC \
--out "$(pwd)/benchmark.monoallelic.csv"
bzip2 -f benchmark.monoallelic.csv
rm -f benchmark.monoallelic.predictions.csv.bz2
fi
### GENERATE BENCHMARK: MULTIALLELIC
......@@ -174,6 +174,22 @@ else
echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_without_flanks.csv" >> commands/multiallelic.presentation_without_flanks.sh
fi
### PRECOMPUTED ####
for variant in netmhcpan4.ba netmhcpan4.el mixmhcpred
do
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.csv.bz2" ]
then
echo "Reusing existing multiallelic ${variant}"
else
echo time python join_with_precomputed.py \
\""$(pwd)/benchmark.multiallelic.csv.bz2"\" \
\""$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.${variant}"\" \
${variant} \
--out "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
fi
done
ls -lh commands
if [ "$1" != "cluster" ]
......
"""
Join benchmark with precomputed predictions.
"""
import sys
import argparse
import os
import numpy
import collections
import pandas
import tqdm
import mhcflurry
from mhcflurry.downloads import get_path
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"benchmark")
parser.add_argument(
"precomputed_data")
parser.add_argument(
"predictors",
nargs="+",
choices=("netmhcpan4.ba", "netmhcpan4.el", "mixmhcpred"))
parser.add_argument(
"--out",
metavar="CSV",
required=True,
help="File to write")
def load_results(dirname, result_df=None, columns=None):
peptides = pandas.read_csv(os.path.join(dirname, "peptides.csv")).peptide
manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
print("Loading results. Existing data has", len(peptides), "peptides and",
len(manifest_df), "columns")
if columns is None:
columns = manifest_df.col.values
if result_df is None:
result_df = pandas.DataFrame(index=peptides, columns=columns,
dtype="float32")
result_df[:] = numpy.nan
peptides_to_assign = peptides
mask = None
else:
mask = (peptides.isin(result_df.index)).values
peptides_to_assign = peptides[mask]
manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")
for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
with open(os.path.join(dirname, row.path), "rb") as fd:
value = numpy.load(fd)['arr_0']
if mask is not None:
value = value[mask]
result_df.loc[peptides_to_assign, row.col] = value
return result_df
def run():
args = parser.parse_args(sys.argv[1:])
df = pandas.read_csv(args.hits)
df["alleles"] = df.hla.str.split()
peptides = df.peptide.unique()
alleles = set()
for some in df.hla.unique():
alleles.update(some.split())
predictions_dfs = {}
if 'netmhcpan4.ba' in args.predictor:
predictions_dfs['netmhcpan4.ba'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.ba"),
result_df=pandas.DataFrame(
index=peptides,
columns=["%s affinity" % a for a in alleles])).rename(
columns=lambda s: s.replace("affinity", "").strip())
predictions_dfs['netmhcpan4.ba'] *= -1
if 'netmhcpan4.el' in args.predictor:
predictions_dfs['netmhcpan4.el'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.el"),
result_df=pandas.DataFrame(
index=peptides,
columns=["%s score" % a for a in alleles])).rename(
columns=lambda s: s.replace("score", "").strip())
if 'mixmhcpred' in args.predictor:
predictions_dfs['mixmhcpred'] = load_results(
get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
result_df=pandas.DataFrame(
index=peptides,
columns=["%s score" % a for a in alleles])).rename(
columns=lambda s: s.replace("score", "").strip())
skip_experiments = set()
for hla_text, sub_df in tqdm.tqdm(df.groupby("hla"), total=df.hla.nunique()):
hla = hla_text.split()
for (name, precomputed_df) in predictions_dfs.items():
df.loc[sub_df.index, name] = numpy.nan
prediction_df = pandas.DataFrame(index=sub_df.peptide)
for allele in hla:
if allele not in precomputed_df.columns or precomputed_df[allele].isnull().all():
print(sub_df.sample_id.unique(), hla)
skip_experiments.update(sub_df.sample_id.unique())
prediction_df[allele] = precomputed_df.loc[df.index, allele]
df.loc[sub_df.index, name] = prediction_df.max(1, skipna=False).values
df.loc[sub_df.index, name + " allele"] = prediction_df.idxmax(1, skipna=False).values
print("Skip experiments", skip_experiments)
print("results")
print(df)
df.to_csv(args.out)
print("Wrote", args.out)
if __name__ == '__main__':
run()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment