diff --git a/downloads-generation/data_curated/curate_multiallelic_ms.py b/downloads-generation/data_curated/curate_multiallelic_ms.py
new file mode 100755
index 0000000000000000000000000000000000000000..bdbb8115c0ba0f6c3ad5022fa8b4b72af00d6cc6
--- /dev/null
+++ b/downloads-generation/data_curated/curate_multiallelic_ms.py
@@ -0,0 +1,151 @@
+"""
+Filter and combine various peptide/MHC datasets to derive a composite training set,
+optionally including eluted peptides identified by mass-spec.
+"""
+import sys
+import argparse
+
+import pandas
+
+import mhcnames
+
+
+def normalize_allele_name(s):
+    """
+    Return the mhcnames-normalized form of allele name `s`, or "UNKNOWN" if
+    mhcnames raises while normalizing it.
+    """
+    try:
+        return mhcnames.normalize_allele_name(s)
+    except Exception:
+        # Deliberately best-effort: any name mhcnames rejects maps to "UNKNOWN".
+        return "UNKNOWN"
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "pmid",
+    metavar="PMID",
+    help="PMID of dataset to curate")
+parser.add_argument(
+    "files",
+    nargs="+",
+    metavar="FILE",
+    help="File paths of data to curate")
+parser.add_argument(
+    "--out",
+    metavar="OUT.csv",
+    help="Out file path")
+parser.add_argument(
+    "--debug",
+    action="store_true",
+    default=False,
+    help="Leave user in pdb if PMID is unsupported")
+
+# Maps PMID (as a string) to the function that curates that dataset's files.
+HANDLERS = {}
+
+
+def load(filenames, **kwargs):
+    """
+    Load each file and return a dict mapping filename to its contents.
+
+    Files ending in .csv are read with pandas.read_csv and files ending in
+    .xlsx or .xls with pandas.read_excel; `kwargs` are forwarded to the reader.
+    Any other filename is mapped to itself (the path string) unread.
+    """
+    result = {}
+    for filename in filenames:
+        if filename.endswith(".csv"):
+            result[filename] = pandas.read_csv(filename, **kwargs)
+        elif filename.endswith((".xlsx", ".xls")):
+            result[filename] = pandas.read_excel(filename, **kwargs)
+        else:
+            result[filename] = filename
+
+    return result
+
+
+def debug(*filenames):
+    """
+    Load the given files and drop into an interactive debugger.
+
+    Uses ipdb when it is installed and falls back to the standard-library pdb
+    otherwise.
+    """
+    # Unused here on purpose: `loaded` is for inspection from the debugger.
+    loaded = load(filenames)  # noqa: F841
+    try:
+        import ipdb as debugger
+    except ImportError:
+        import pdb as debugger
+    debugger.set_trace()
+
+
+def pmid_27600516(filename):
+    """
+    Curate the peptide list for PMID 27600516.
+
+    The CSV at `filename` must have a "peptide" column in which an entry
+    starting with "#" names a sample and each following entry is a peptide
+    from that sample. Peptides are stripped, upper-cased and de-duplicated
+    within each sample.
+
+    Returns a DataFrame with columns "sample_id" and "peptide".
+
+    Raises ValueError if a peptide appears before the first sample header.
+    """
+    df = pandas.read_csv(filename)
+
+    sample_to_peptides = {}
+    current_sample = None
+    for peptide in df.peptide:
+        if peptide.startswith("#"):
+            current_sample = peptide[1:]
+            sample_to_peptides[current_sample] = []
+        elif current_sample is None:
+            # Raise instead of assert so the check survives `python -O`.
+            raise ValueError(
+                "Peptide %r appears before any '#' sample header in %s"
+                % (peptide, filename))
+        else:
+            sample_to_peptides[current_sample].append(peptide.strip().upper())
+
+    rows = [
+        [sample, peptide]
+        for (sample, peptides) in sample_to_peptides.items()
+        for peptide in sorted(set(peptides))
+    ]
+    return pandas.DataFrame(rows, columns=["sample_id", "peptide"])
+
+
+HANDLERS["27600516"] = pmid_27600516
+
+
+def run():
+    """
+    Command-line entry point: curate the dataset for the given PMID and write
+    the result as CSV to the --out path.
+
+    Raises NotImplementedError if the PMID has no handler and --debug is not
+    set.
+    """
+    args = parser.parse_args(sys.argv[1:])
+
+    if args.pmid in HANDLERS:
+        df = HANDLERS[args.pmid](*args.files)
+    elif args.debug:
+        debug(*args.files)
+        # A debug session produces no curated data; return here so we never
+        # reach df.to_csv below with `df` unassigned.
+        return
+    else:
+        raise NotImplementedError(args.pmid)
+
+    df.to_csv(args.out, index=False)
+    print("Wrote: %s" % args.out)
+
+
+if __name__ == '__main__':
+    run()
diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh
index d55f802cf36b2769258436d76e8842ae6ec66153..ea3d723912585a56cbeef0403f8d446d65956fb5 100755
--- a/downloads-generation/data_published/GENERATE.sh
+++ b/downloads-generation/data_published/GENERATE.sh
@@ -20,11 +20,7 @@ mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
 exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
-# Log some environment info
 date
-pip freeze
-# git rev-parse HEAD
-git status
 
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
@@ -65,23 +61,33 @@ wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_f
 # Data extracted from supplemental PDF table.
 PMID=27600516
 mkdir -p raw/$PMID
-wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/Gloger_Neri_CII_2016_27600516_extracted_from_pdf.csv -P raw/$PMID
+wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID
 
 # Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]
 # Supplemental zip downloaded from publication
 PMID=26992070
 mkdir -p raw/$PMID
 wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID
-unzip raw/$PMID/pmic12297-sup-0001-supinfo.zip
+cd raw/$PMID
+unzip pmic12297-sup-0001-supinfo.zip
+cd ../..
 
 # Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690]
 PMID=27412690
 mkdir -p raw/$PMID
 wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID
 
-# Pearson, ..., Perreault 2016 J Clin Invest [PMID 27841757]
-# Note: we do not use the original data from this publicaton, we use 28832583's reanalysis of it.
+# Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
+# Note: we do not use the original data from this publication, we use 28832583's reanalysis of it.
 #
+
+# Hassan, ..., van Veelen Mol Cell Proteomics 2015 [PMID 23481700]
+PMID=23481700
+mkdir -p raw/$PMID
+wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P raw/$PMID
+
+
+
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
 RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"