Skip to content
Snippets Groups Projects
join_with_precomputed.py 4.11 KiB
Newer Older
Tim O'Donnell's avatar
Tim O'Donnell committed
"""
Join benchmark with precomputed predictions.
"""
import sys
import argparse
import os
import numpy
import collections

import pandas
import tqdm

import mhcflurry
from mhcflurry.downloads import get_path

parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "benchmark")
parser.add_argument(
    "predictors",
    nargs="+",
    choices=("netmhcpan4.ba", "netmhcpan4.el", "mixmhcpred"))
parser.add_argument(
    "--out",
    metavar="CSV",
    required=True,
    help="File to write")


def load_results(dirname, result_df=None, columns=None):
    peptides = pandas.read_csv(os.path.join(dirname, "peptides.csv")).peptide
    manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))

    print("Loading results. Existing data has", len(peptides), "peptides and",
        len(manifest_df), "columns")

    if columns is None:
        columns = manifest_df.col.values

    if result_df is None:
        result_df = pandas.DataFrame(index=peptides, columns=columns,
            dtype="float32")
        result_df[:] = numpy.nan
        peptides_to_assign = peptides
        mask = None
    else:
        mask = (peptides.isin(result_df.index)).values
        peptides_to_assign = peptides[mask]

    manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]

    print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")

    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
        with open(os.path.join(dirname, row.path), "rb") as fd:
            value = numpy.load(fd)['arr_0']
            if mask is not None:
                value = value[mask]
            result_df.loc[peptides_to_assign, row.col] = value

    return result_df


def run():
    args = parser.parse_args(sys.argv[1:])
Tim O'Donnell's avatar
Tim O'Donnell committed
    df = pandas.read_csv(args.benchmark)
Tim O'Donnell's avatar
Tim O'Donnell committed

    df["alleles"] = df.hla.str.split()

    peptides = df.peptide.unique()
    alleles = set()
    for some in df.hla.unique():
        alleles.update(some.split())

    predictions_dfs = {}

    if 'netmhcpan4.ba' in args.predictor:
        predictions_dfs['netmhcpan4.ba'] = load_results(
            get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.ba"),
            result_df=pandas.DataFrame(
                index=peptides,
                columns=["%s affinity" % a for a in alleles])).rename(
            columns=lambda s: s.replace("affinity", "").strip())
        predictions_dfs['netmhcpan4.ba'] *= -1

    if 'netmhcpan4.el' in args.predictor:
        predictions_dfs['netmhcpan4.el'] = load_results(
            get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.el"),
            result_df=pandas.DataFrame(
                index=peptides,
                columns=["%s score" % a for a in alleles])).rename(
            columns=lambda s: s.replace("score", "").strip())

    if 'mixmhcpred' in args.predictor:
        predictions_dfs['mixmhcpred'] = load_results(
        get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
        result_df=pandas.DataFrame(
            index=peptides,
            columns=["%s score" % a for a in alleles])).rename(
        columns=lambda s: s.replace("score", "").strip())

    skip_experiments = set()

    for hla_text, sub_df in tqdm.tqdm(df.groupby("hla"), total=df.hla.nunique()):
        hla = hla_text.split()
        for (name, precomputed_df) in predictions_dfs.items():
            df.loc[sub_df.index, name] = numpy.nan
            prediction_df = pandas.DataFrame(index=sub_df.peptide)
            for allele in hla:
                if allele not in precomputed_df.columns or precomputed_df[allele].isnull().all():
                    print(sub_df.sample_id.unique(), hla)
                    skip_experiments.update(sub_df.sample_id.unique())
                prediction_df[allele] = precomputed_df.loc[df.index, allele]
            df.loc[sub_df.index, name] = prediction_df.max(1, skipna=False).values
            df.loc[sub_df.index, name + " allele"] = prediction_df.idxmax(1, skipna=False).values

    print("Skip experiments", skip_experiments)
    print("results")
    print(df)

    df.to_csv(args.out)
    print("Wrote", args.out)

if __name__ == '__main__':
    run()