curating some multiallelic ms

a9931e96 · Tim O'Donnell · 53084003 · a9931e96 · a9931e96
Commit a9931e96 authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/data_curated/curate_multiallelic_ms.py
+++ b/downloads-generation/data_curated/curate_multiallelic_ms.py
+"""
+Filter and combine various peptide/MHC datasets to derive a composite training set,
+optionally including eluted peptides identified by mass-spec.
+"""
+import sys
+import argparse
+import pandas
+import mhcnames
+def normalize_allele_name(s):
+    """
+    Normalize an allele name using mhcnames.
+
+    Returns the result of mhcnames.normalize_allele_name(s), or the string
+    "UNKNOWN" if that call raises any exception.
+    """
+    try:
+        normalized = mhcnames.normalize_allele_name(s)
+    except Exception:
+        normalized = "UNKNOWN"
+    return normalized
+# Command-line interface; the module docstring doubles as the usage text.
+parser = argparse.ArgumentParser(usage=__doc__)
+# Positional: the PMID used to look up a handler in HANDLERS.
+parser.add_argument("pmid", metavar="PMID", help="PMID of dataset to curate")
+# Positional: one or more input paths, passed on to the handler.
+parser.add_argument(
+    "files", nargs="+", metavar="FILE", help="File paths of data to curate")
+# Option: where the curated table is written.
+parser.add_argument("--out", metavar="OUT.csv", help="Out file path")
+# Option: open a debugger instead of failing when no handler exists.
+parser.add_argument(
+    "--debug",
+    default=False,
+    action="store_true",
+    help="Leave user in pdb if PMID is unsupported")
+# Registry mapping a PMID string to the function that curates that dataset.
+HANDLERS = {}
+def load(filenames, **kwargs):
+    """
+    Read a collection of files, dispatching on file extension.
+
+    Paths ending in ".csv" are read with pandas.read_csv and paths ending in
+    ".xlsx" or ".xls" with pandas.read_excel; `kwargs` are forwarded to
+    whichever reader is used. Any other path is not opened and maps to
+    itself.
+
+    Returns a dict from each filename to what was loaded for it.
+    """
+    def read_one(path):
+        if path.endswith(".csv"):
+            return pandas.read_csv(path, **kwargs)
+        if path.endswith((".xlsx", ".xls")):
+            return pandas.read_excel(path, **kwargs)
+        # Unrecognized extension: hand the path back unchanged.
+        return path
+    return {path: read_one(path) for path in filenames}
+def debug(*filenames):
+    """
+    Load the given files with load() and stop in an ipdb session.
+    """
+    # `loaded` is not referenced again in this function; presumably it is
+    # kept as a local so it can be inspected from the debugger prompt.
+    loaded = load(filenames)
+    # Imported inside the function, so ipdb is only required when debug()
+    # is actually called.
+    import ipdb
+    ipdb.set_trace()
+def pmid_27600516(filename):
+    """
+    Curate the peptide list for PMID 27600516.
+
+    The input CSV must have a `peptide` column. A value starting with "#"
+    is a sample header (the text after the "#" is the sample id); every
+    other value is a peptide belonging to the most recent sample header.
+
+    Parameters
+    ----------
+    filename : string
+        Path to the CSV file.
+
+    Returns
+    -------
+    pandas.DataFrame with columns "sample_id" and "peptide", holding one row
+    per distinct peptide of each sample. Peptides are whitespace-stripped,
+    upper-cased and sorted within a sample.
+
+    Raises
+    ------
+    ValueError if a peptide occurs before any sample header.
+    """
+    df = pandas.read_csv(filename)
+    sample_to_peptides = {}
+    current_sample = None
+    for peptide in df.peptide:
+        if peptide.startswith("#"):
+            current_sample = peptide[1:]
+            # setdefault rather than assignment: a header that shows up a
+            # second time must extend that sample, not discard the peptides
+            # already collected for it.
+            sample_to_peptides.setdefault(current_sample, [])
+        elif current_sample is None:
+            # Explicit exception instead of `assert`, which is skipped when
+            # Python runs with -O.
+            raise ValueError(
+                "Peptide %r precedes the first '#' sample header in %s"
+                % (peptide, filename))
+        else:
+            sample_to_peptides[current_sample].append(peptide.strip().upper())
+    rows = [
+        [sample, peptide]
+        for (sample, peptides) in sample_to_peptides.items()
+        for peptide in sorted(set(peptides))
+    ]
+    return pandas.DataFrame(rows, columns=["sample_id", "peptide"])
+# Register the handler under its PMID string; run() selects a handler by
+# looking up args.pmid in HANDLERS.
+HANDLERS["27600516"] = pmid_27600516
+def run():
+    """
+    Command-line entry point.
+
+    Parses sys.argv, runs the handler registered in HANDLERS for the given
+    PMID on the input files, and writes the resulting data frame to the
+    --out path as CSV (without the index).
+
+    If no handler is registered for the PMID: with --debug the files are
+    passed to debug() and nothing is written; otherwise NotImplementedError
+    is raised.
+    """
+    args = parser.parse_args(sys.argv[1:])
+    handler = HANDLERS.get(args.pmid)
+    if handler is None:
+        if not args.debug:
+            raise NotImplementedError(args.pmid)
+        # debug() produces no data frame, so stop here instead of falling
+        # through to the write below with `df` unbound.
+        debug(*args.files)
+        return
+    if args.out is None:
+        # to_csv(None) returns the CSV text instead of writing a file; fail
+        # loudly rather than discard the result and report "Wrote: None".
+        parser.error("--out is required when curating a supported PMID")
+    df = handler(*args.files)
+    df.to_csv(args.out, index=False)
+    print("Wrote: %s" % args.out)
+if __name__ == '__main__':
+    run()
--- a/downloads-generation/data_published/GENERATE.sh
+++ b/downloads-generation/data_published/GENERATE.sh
@@ -20,11 +20,7 @@ mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
 exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
-# Log some environment info
 date
-pip freeze
-# git rev-parse HEAD
-git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
@@ -65,23 +61,33 @@ wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_f
 # Data extracted from supplemental PDF table.
 PMID=27600516
 mkdir -p raw/$PMID
-wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/Gloger_Neri_CII_2016_27600516_extracted_from_pdf.csv -P raw/$PMID
+wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID
 # Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]
 # Supplemental zip downloaded from publication
 PMID=26992070
 mkdir -p raw/$PMID
 wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID
-unzip raw/$PMID/pmic12297-sup-0001-supinfo.zip
+cd raw/$PMID
+unzip pmic12297-sup-0001-supinfo.zip
+cd ../..
 # Shraibman, ..., Admon Mol Cell Proteomics	2016 [PMID 27412690]
 PMID=27412690
 mkdir -p raw/$PMID
 wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID
-# Pearson, ..., Perreault 2016 J Clin Invest [PMID 27841757]
+# Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
 # Note: we do not use the original data from this publicaton, we use 28832583's reanalysis of it.
 #
+# Hassan, ..., van Veelen Mol Cell Proteomics 2013 [PMID 23481700]
+PMID=23481700
+mkdir -p raw/$PMID
+wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls  -P raw/$PMID
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
 RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"