"""
Filter and combine various peptide/MHC datasets to derive a composite training set,
optionally including eluted peptides identified by mass-spec.
"""
import argparse
import os
import sys

import pandas

import mhcnames


def normalize_allele_name(s):
    """Return the normalized form of allele name `s`, or "UNKNOWN".

    Any exception raised by ``mhcnames.normalize_allele_name`` is
    deliberately mapped to the sentinel string "UNKNOWN" rather than
    propagated to the caller.
    """
    try:
        return mhcnames.normalize_allele_name(s)
    except Exception:
        return "UNKNOWN"


parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--item",
    nargs="+",
    action="append",
    metavar="PMID FILE, ... FILE",
    default=[],
    help="Item to curate: PMID and list of files")
parser.add_argument(
    "--out",
    metavar="OUT.csv",
    help="Out file path")
parser.add_argument(
    "--debug",
    action="store_true",
    default=False,
    help="Leave user in pdb if PMID is unsupported")

# Maps a PMID string to the function that curates that publication's files.
HANDLERS = {}


def load(filenames, **kwargs):
    """Load each file in `filenames` into a dict keyed by filename.

    ``.csv`` files are read with ``pandas.read_csv`` and ``.xlsx``/``.xls``
    files with ``pandas.read_excel``; `kwargs` are forwarded to whichever
    reader is used. Files with any other extension are not read: their
    value in the result is simply the filename itself.
    """
    result = {}
    for filename in filenames:
        if filename.endswith(".csv"):
            result[filename] = pandas.read_csv(filename, **kwargs)
        elif filename.endswith((".xlsx", ".xls")):
            result[filename] = pandas.read_excel(filename, **kwargs)
        else:
            result[filename] = filename
    return result


def debug(*filenames):
    """Load `filenames` and drop into an ipdb session for manual inspection."""
    # `loaded` is intentionally unused by the code: it exists so the files
    # are available to poke at from the debugger prompt.
    loaded = load(filenames)
    import ipdb
    ipdb.set_trace()


def handle_pmid_27600516(filename):
    """Curate the CSV for PMID 27600516 into (sample_id, peptide) rows.

    The ``peptide`` column interleaves sample headers and peptides: an
    entry starting with "#" begins a new sample (named by the text after
    the "#"), and every following entry is a peptide belonging to that
    sample. Peptides are stripped and upper-cased, then de-duplicated and
    sorted within each sample.

    Returns a DataFrame with columns ``sample_id``, ``peptide`` and a
    constant ``sample_type`` of "melanoma_cell_line".
    """
    df = pandas.read_csv(filename)

    sample_to_peptides = {}
    current_sample = None
    for peptide in df.peptide:
        if peptide.startswith("#"):
            current_sample = peptide[1:]
            sample_to_peptides[current_sample] = []
        else:
            # A peptide before any "#" header has no sample to attach to.
            assert current_sample is not None
            sample_to_peptides[current_sample].append(peptide.strip().upper())

    rows = []
    for (sample, peptides) in sample_to_peptides.items():
        for peptide in sorted(set(peptides)):
            rows.append([sample, peptide])

    result = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
    result["sample_type"] = "melanoma_cell_line"
    return result


def handle_pmid_23481700(filename):
    """Curate the Excel file for PMID 23481700.

    Peptides are taken from the first column, starting at row position 10
    of the parsed sheet. The first and last peptides are asserted against
    known values as a sanity check that the expected file was supplied.

    Returns a DataFrame with columns ``peptide``, ``sample_id`` and
    ``sample_type``.
    """
    df = pandas.read_excel(filename)
    peptides = df.iloc[10:, 0].values
    assert peptides[0] == "TPSLVKSTSQL"
    assert peptides[-1] == "LPHSVNSKL"

    result = pandas.DataFrame({
        "peptide": peptides,
    })
    result["sample_id"] = "23481700"
    result["sample_type"] = "B-LCL"
    return result


def handle_pmid_24616531(filename):
    """Curate the Excel file for PMID 24616531.

    Peptides are read from the ``Sequence`` column of the "EThcD" sheet.
    The first and last peptides are asserted against known values as a
    sanity check that the expected file was supplied.

    Returns a DataFrame with the peptides plus constant sample metadata
    columns (``sample_id``, ``sample_type``, ``cell_line``,
    ``pulldown_antibody``, ``hla``).
    """
    # `sheet_name` is the current pandas keyword; the older `sheetname`
    # spelling is rejected by current pandas releases.
    df = pandas.read_excel(filename, sheet_name="EThcD")
    peptides = df.Sequence.values
    assert peptides[0] == "APFLRIAF"
    assert peptides[-1] == "WRQAGLSYIRYSQI"

    result = pandas.DataFrame({
        "peptide": peptides,
    })
    result["sample_id"] = "24616531"
    result["sample_type"] = "B-lymphoblastoid"
    result["cell_line"] = "GR"
    result["pulldown_antibody"] = "W6/32"

    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
    # we are guessing the exact 4 digit alleles based on this.
    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
    return result


def handle_pmid_25576301(filename):
    """Work-in-progress handler for PMID 25576301.

    Peptides are read from the ``Sequence`` column of the "Peptides"
    sheet and sanity-checked against known first/last values. Execution
    then stops in an ipdb breakpoint.

    NOTE(review): everything after the breakpoint is placeholder metadata
    copied from the PMID 24616531 handler (including its ``sample_id``),
    as the original author's "THIS IS ALL JUNK" marker says. The correct
    per-sample metadata for this publication still has to be filled in
    before this handler's output can be trusted.
    """
    # `sheet_name` is the current pandas keyword; the older `sheetname`
    # spelling is rejected by current pandas releases.
    df = pandas.read_excel(filename, sheet_name="Peptides")
    peptides = df.Sequence.values
    assert peptides[0] == "AAAAAAAQSVY"
    assert peptides[-1] == "YYYNGKAVY"

    # TODO TODO
    import ipdb
    ipdb.set_trace()

    # THIS IS ALL JUNK:
    result = pandas.DataFrame({
        "peptide": peptides,
    })
    result["sample_id"] = "24616531"
    result["sample_type"] = "B-lymphoblastoid"
    result["cell_line"] = "GR"
    result["pulldown_antibody"] = "W6/32"

    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
    # we are guessing the exact 4 digit alleles based on this.
    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
    return result


# Hack to add all functions with names like handle_pmid_XXXX to HANDLERS dict.
# list() snapshots the namespace first: at module level the loop variables
# `key` and `value` are themselves added to it while the loop runs.
for (key, value) in list(locals().items()):
    if key.startswith("handle_pmid_"):
        HANDLERS[key.replace("handle_pmid_", "")] = value


def run():
    """Command-line entry point: curate every ``--item`` and write one CSV.

    Each ``--item`` is a PMID followed by that publication's files. The
    PMID selects a function from ``HANDLERS``, which is called with the
    files as positional arguments and returns a DataFrame. A ``pmid``
    column is added, per-sample summaries are printed, and all resulting
    frames are concatenated and written to ``--out``.

    Raises:
        NotImplementedError: if a PMID has no handler and ``--debug`` was
            not given. With ``--debug`` the files are opened in a debugger
            session instead and the item contributes no rows.
    """
    args = parser.parse_args(sys.argv[1:])

    dfs = []
    for item_tpl in args.item:
        (pmid, filenames) = (item_tpl[0], item_tpl[1:])
        print(
            "Processing item",
            pmid,
            *[os.path.abspath(f) for f in filenames])

        df = None
        if pmid in HANDLERS:
            df = HANDLERS[pmid](*filenames)
        elif args.debug:
            debug(*filenames)
        else:
            # Report the PMID of the current item. The parsed arguments
            # have no `pmid` attribute, so reading it from `args` would
            # raise AttributeError and hide the real problem.
            raise NotImplementedError(pmid)

        if df is not None:
            df["pmid"] = pmid
            print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
            print("Counts by sample id:")
            print(df.groupby("sample_id").peptide.nunique())
            print("")
            print("Counts by sample type:")
            print(df.groupby("sample_type").peptide.nunique())
            print("****************************")
            dfs.append(df)

    df = pandas.concat(dfs, ignore_index=True)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % args.out)


if __name__ == '__main__':
    run()