From 790ebc178cd459067ff99d5b267fb8d4bbb473bc Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 20 Sep 2019 16:24:48 -0400
Subject: [PATCH] working on curation

---
 downloads-generation/data_curated/GENERATE.sh |  20 +-
 .../data_curated/curate_by_pmid.py            | 187 ++++++++++++++++++
 .../data_curated/curate_multiallelic_ms.py    | 102 ----------
 mhcflurry/downloads.yml                       |   4 +-
 4 files changed, 207 insertions(+), 106 deletions(-)
 create mode 100755 downloads-generation/data_curated/curate_by_pmid.py
 delete mode 100755 downloads-generation/data_curated/curate_multiallelic_ms.py

diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh
index bbb24732..0e68fc90 100755
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -18,8 +18,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
@@ -29,6 +29,22 @@ git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_DIR/curate.py .
+cp $SCRIPT_DIR/curate_by_pmid.py .
+
+RAW_DIR="$(mhcflurry-downloads path data_published)/raw"
+cp -r "$RAW_DIR" .
+
+CURATE_BY_PMID_ARGS=""
+for pmid in $(ls raw)
+do
+    CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ')
+done
+
+time python curate_by_pmid.py $CURATE_BY_PMID_ARGS --out curated.by_pmid.csv --debug
+
+exit 1
+
+
 
 # No mass-spec data
 time python curate.py \
diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py
new file mode 100755
index 00000000..7f261777
--- /dev/null
+++ b/downloads-generation/data_curated/curate_by_pmid.py
@@ -0,0 +1,187 @@
+"""
+Filter and combine various peptide/MHC datasets to derive a composite training set,
+optionally including eluted peptides identified by mass-spec.
+"""
+import sys
+import argparse
+import os
+
+import pandas
+
+import mhcnames
+
+
+def normalize_allele_name(s):
+    try:
+        return mhcnames.normalize_allele_name(s)
+    except Exception:
+        return "UNKNOWN"
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "--item",
+    nargs="+",
+    action="append",
+    metavar="PMID FILE, ... FILE",
FILE", + default=[], + help="Item to curate: PMID and list of files") +parser.add_argument( + "--out", + metavar="OUT.csv", + help="Out file path") +parser.add_argument( + "--debug", + action="store_true", + default=False, + help="Leave user in pdb if PMID is unsupported") + +HANDLERS = {} + + +def load(filenames, **kwargs): + result = {} + for filename in filenames: + if filename.endswith(".csv"): + result[filename] = pandas.read_csv(filename, **kwargs) + elif filename.endswith(".xlsx") or filename.endswith(".xls"): + result[filename] = pandas.read_excel(filename, **kwargs) + else: + result[filename] = filename + + return result + + +def debug(*filenames): + loaded = load(filenames) + import ipdb + ipdb.set_trace() + + +def handle_pmid_27600516(filename): + df = pandas.read_csv(filename) + + sample_to_peptides = {} + current_sample = None + for peptide in df.peptide: + if peptide.startswith("#"): + current_sample = peptide[1:] + sample_to_peptides[current_sample] = [] + else: + assert current_sample is not None + sample_to_peptides[current_sample].append(peptide.strip().upper()) + + rows = [] + for (sample, peptides) in sample_to_peptides.items(): + for peptide in sorted(set(peptides)): + rows.append([sample, peptide]) + + result = pandas.DataFrame(rows, columns=["sample_id", "peptide"]) + result["sample_type"] = "melanoma_cell_line" + return result + + +def handle_pmid_23481700(filename): + df = pandas.read_excel(filename) + peptides = df.iloc[10:,0].values + assert peptides[0] == "TPSLVKSTSQL" + assert peptides[-1] == "LPHSVNSKL" + + result = pandas.DataFrame({ + "peptide": peptides, + }) + result["sample_id"] = "23481700" + result["sample_type"] = "B-LCL" + return result + + +def handle_pmid_24616531(filename): + df = pandas.read_excel(filename, sheetname="EThcD") + peptides = df.Sequence.values + assert peptides[0] == "APFLRIAF" + assert peptides[-1] == "WRQAGLSYIRYSQI" + + result = pandas.DataFrame({ + "peptide": peptides, + }) + result["sample_id"] = "24616531" + result["sample_type"] = "B-lymphoblastoid" + result["cell_line"] = "GR" + result["pulldown_antibody"] = "W6/32" + + # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07" + # we are guessing the exact 4 digit alleles based on this. + result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01" + return result + + +def handle_pmid_25576301(filename): + df = pandas.read_excel(filename, sheetname="Peptides") + peptides = df.Sequence.values + assert peptides[0] == "AAAAAAAQSVY" + assert peptides[-1] == "YYYNGKAVY" + + # TODO TODO + import ipdb ; ipdb.set_trace() + + # THIS IS ALL JUNK: + result = pandas.DataFrame({ + "peptide": peptides, + }) + result["sample_id"] = "24616531" + result["sample_type"] = "B-lymphoblastoid" + result["cell_line"] = "GR" + result["pulldown_antibody"] = "W6/32" + + # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07" + # we are guessing the exact 4 digit alleles based on this. + result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01" + return result + + + + + + +# Hack to add all functions with names like handle_pmid_XXXX to HANDLERS dict. 
+for (key, value) in list(locals().items()):
+    if key.startswith("handle_pmid_"):
+        HANDLERS[key.replace("handle_pmid_", "")] = value
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+
+    dfs = []
+    for item_tpl in args.item:
+        (pmid, filenames) = (item_tpl[0], item_tpl[1:])
+        print("Processing item", pmid, *[os.path.abspath(f) for f in filenames])
+
+        df = None
+        if pmid in HANDLERS:
+            df = HANDLERS[pmid](*filenames)
+        elif args.debug:
+            debug(*filenames)
+        else:
+            raise NotImplementedError(pmid)
+
+        if df is not None:
+            df["pmid"] = pmid
+            print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
+            print("Counts by sample id:")
+            print(df.groupby("sample_id").peptide.nunique())
+            print("")
+            print("Counts by sample type:")
+            print(df.groupby("sample_type").peptide.nunique())
+            print("****************************")
+
+            dfs.append(df)
+
+
+    df = pandas.concat(dfs, ignore_index=True)
+    df.to_csv(args.out, index=False)
+    print("Wrote: %s" % args.out)
+
+if __name__ == '__main__':
+    run()
diff --git a/downloads-generation/data_curated/curate_multiallelic_ms.py b/downloads-generation/data_curated/curate_multiallelic_ms.py
deleted file mode 100755
index bdbb8115..00000000
--- a/downloads-generation/data_curated/curate_multiallelic_ms.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-Filter and combine various peptide/MHC datasets to derive a composite training set,
-optionally including eluted peptides identified by mass-spec.
-"""
-import sys
-import argparse
-
-import pandas
-
-import mhcnames
-
-
-def normalize_allele_name(s):
-    try:
-        return mhcnames.normalize_allele_name(s)
-    except Exception:
-        return "UNKNOWN"
-
-
-parser = argparse.ArgumentParser(usage=__doc__)
-
-parser.add_argument(
-    "pmid",
-    metavar="PMID",
-    help="PMID of dataset to curate")
-parser.add_argument(
-    "files",
-    nargs="+",
-    metavar="FILE",
-    help="File paths of data to curate")
-parser.add_argument(
-    "--out",
-    metavar="OUT.csv",
-    help="Out file path")
-parser.add_argument(
-    "--debug",
-    action="store_true",
-    default=False,
-    help="Leave user in pdb if PMID is unsupported")
-
-HANDLERS = {}
-
-
-def load(filenames, **kwargs):
-    result = {}
-    for filename in filenames:
-        if filename.endswith(".csv"):
-            result[filename] = pandas.read_csv(filename, **kwargs)
-        elif filename.endswith(".xlsx") or filename.endswith(".xls"):
-            result[filename] = pandas.read_excel(filename, **kwargs)
-        else:
-            result[filename] = filename
-
-    return result
-
-
-def debug(*filenames):
-    loaded = load(filenames)
-    import ipdb
-    ipdb.set_trace()
-
-
-def pmid_27600516(filename):
-    df = pandas.read_csv(filename)
-
-    sample_to_peptides = {}
-    current_sample = None
-    for peptide in df.peptide:
-        if peptide.startswith("#"):
-            current_sample = peptide[1:]
-            sample_to_peptides[current_sample] = []
-        else:
-            assert current_sample is not None
-            sample_to_peptides[current_sample].append(peptide.strip().upper())
-
-    rows = []
-    for (sample, peptides) in sample_to_peptides.items():
-        for peptide in sorted(set(peptides)):
-            rows.append([sample, peptide])
-
-    result = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
-    return result
-
-
-HANDLERS["27600516"] = pmid_27600516
-
-
-def run():
-    args = parser.parse_args(sys.argv[1:])
-
-    if args.pmid in HANDLERS:
-        df = HANDLERS[args.pmid](*args.files)
-    elif args.debug:
-        debug(*args.files)
-    else:
-        raise NotImplementedError(args.pmid)
-
-    df.to_csv(args.out, index=False)
-    print("Wrote: %s" % args.out)
-
-if __name__ == '__main__':
-    run()
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 3c731928..1ccb25be 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -46,7 +46,7 @@ releases:
               default: false
 
             - name: data_published
-              url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
+              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
               default: false
 
             - name: data_curated
@@ -109,7 +109,7 @@ releases:
               default: false
 
             - name: data_published
-              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
+              url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
               default: false
 
             - name: data_curated
-- 
GitLab
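
Note (illustrative only, not part of the patch above): curate_by_pmid.py dispatches on
PMID by collecting every module-level function named handle_pmid_<PMID> into the
HANDLERS dict, so supporting a new publication only requires defining one such function
and passing its files via --item. The sketch below shows the shape such a handler might
take; the PMID 99999999, the file path, and the "Sequence" column name are hypothetical
placeholders, not data referenced by this patch.

    import pandas

    def handle_pmid_99999999(filename):
        # Hypothetical handler: read a supplementary Excel table whose peptide
        # column is assumed to be named "Sequence".
        df = pandas.read_excel(filename)
        result = pandas.DataFrame({"peptide": df.Sequence.values})
        result["sample_id"] = "99999999"     # one sample per publication in this sketch
        result["sample_type"] = "cell_line"  # assumed sample type
        return result

With such a handler defined, the --item loop in GENERATE.sh would pick the files up
automatically, equivalent to running:

    python curate_by_pmid.py \
        --item 99999999 raw/99999999/supplementary_table_1.xlsx \
        --out curated.by_pmid.csv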