From 790ebc178cd459067ff99d5b267fb8d4bbb473bc Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 20 Sep 2019 16:24:48 -0400
Subject: [PATCH] working on curation

---
 downloads-generation/data_curated/GENERATE.sh |  20 +-
 .../data_curated/curate_by_pmid.py            | 187 ++++++++++++++++++
 .../data_curated/curate_multiallelic_ms.py    | 102 ----------
 mhcflurry/downloads.yml                       |   4 +-
 4 files changed, 207 insertions(+), 106 deletions(-)
 create mode 100755 downloads-generation/data_curated/curate_by_pmid.py
 delete mode 100755 downloads-generation/data_curated/curate_multiallelic_ms.py

diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh
index bbb24732..0e68fc90 100755
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -18,8 +18,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
@@ -29,6 +29,22 @@ git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_DIR/curate.py .
+cp $SCRIPT_DIR/curate_by_pmid.py .
+
+RAW_DIR="$(mhcflurry-downloads path data_published)/raw"
+cp -r "$RAW_DIR" .
+
+CURATE_BY_PMID_ARGS=""
+for pmid in $(ls raw)
+do
+    CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ')
+done
+
+time python curate_by_pmid.py $CURATE_BY_PMID_ARGS --out curated.by_pmid.csv --debug
+
+exit 1
+
+
 
 # No mass-spec data
 time python curate.py \
diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py
new file mode 100755
index 00000000..7f261777
--- /dev/null
+++ b/downloads-generation/data_curated/curate_by_pmid.py
@@ -0,0 +1,187 @@
+"""
+Filter and combine various peptide/MHC datasets to derive a composite training set,
+optionally including eluted peptides identified by mass-spec.
+"""
+import sys
+import argparse
+import os
+
+import pandas
+
+import mhcnames
+
+
+def normalize_allele_name(s):
+    try:
+        return mhcnames.normalize_allele_name(s)
+    except Exception:
+        return "UNKNOWN"
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "--item",
+    nargs="+",
+    action="append",
+    metavar="PMID FILE, ... FILE",
FILE", + default=[], + help="Item to curate: PMID and list of files") +parser.add_argument( + "--out", + metavar="OUT.csv", + help="Out file path") +parser.add_argument( + "--debug", + action="store_true", + default=False, + help="Leave user in pdb if PMID is unsupported") + +HANDLERS = {} + + +def load(filenames, **kwargs): + result = {} + for filename in filenames: + if filename.endswith(".csv"): + result[filename] = pandas.read_csv(filename, **kwargs) + elif filename.endswith(".xlsx") or filename.endswith(".xls"): + result[filename] = pandas.read_excel(filename, **kwargs) + else: + result[filename] = filename + + return result + + +def debug(*filenames): + loaded = load(filenames) + import ipdb + ipdb.set_trace() + + +def handle_pmid_27600516(filename): + df = pandas.read_csv(filename) + + sample_to_peptides = {} + current_sample = None + for peptide in df.peptide: + if peptide.startswith("#"): + current_sample = peptide[1:] + sample_to_peptides[current_sample] = [] + else: + assert current_sample is not None + sample_to_peptides[current_sample].append(peptide.strip().upper()) + + rows = [] + for (sample, peptides) in sample_to_peptides.items(): + for peptide in sorted(set(peptides)): + rows.append([sample, peptide]) + + result = pandas.DataFrame(rows, columns=["sample_id", "peptide"]) + result["sample_type"] = "melanoma_cell_line" + return result + + +def handle_pmid_23481700(filename): + df = pandas.read_excel(filename) + peptides = df.iloc[10:,0].values + assert peptides[0] == "TPSLVKSTSQL" + assert peptides[-1] == "LPHSVNSKL" + + result = pandas.DataFrame({ + "peptide": peptides, + }) + result["sample_id"] = "23481700" + result["sample_type"] = "B-LCL" + return result + + +def handle_pmid_24616531(filename): + df = pandas.read_excel(filename, sheetname="EThcD") + peptides = df.Sequence.values + assert peptides[0] == "APFLRIAF" + assert peptides[-1] == "WRQAGLSYIRYSQI" + + result = pandas.DataFrame({ + "peptide": peptides, + }) + result["sample_id"] = "24616531" + result["sample_type"] = "B-lymphoblastoid" + result["cell_line"] = "GR" + result["pulldown_antibody"] = "W6/32" + + # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07" + # we are guessing the exact 4 digit alleles based on this. + result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01" + return result + + +def handle_pmid_25576301(filename): + df = pandas.read_excel(filename, sheetname="Peptides") + peptides = df.Sequence.values + assert peptides[0] == "AAAAAAAQSVY" + assert peptides[-1] == "YYYNGKAVY" + + # TODO TODO + import ipdb ; ipdb.set_trace() + + # THIS IS ALL JUNK: + result = pandas.DataFrame({ + "peptide": peptides, + }) + result["sample_id"] = "24616531" + result["sample_type"] = "B-lymphoblastoid" + result["cell_line"] = "GR" + result["pulldown_antibody"] = "W6/32" + + # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07" + # we are guessing the exact 4 digit alleles based on this. + result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01" + return result + + + + + + +# Hack to add all functions with names like handle_pmid_XXXX to HANDLERS dict. 
+for (key, value) in list(locals().items()):
+    if key.startswith("handle_pmid_"):
+        HANDLERS[key.replace("handle_pmid_", "")] = value
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+
+    dfs = []
+    for item_tpl in args.item:
+        (pmid, filenames) = (item_tpl[0], item_tpl[1:])
+        print("Processing item", pmid, *[os.path.abspath(f) for f in filenames])
+
+        df = None
+        if pmid in HANDLERS:
+            df = HANDLERS[pmid](*filenames)
+        elif args.debug:
+            debug(*filenames)
+        else:
+            raise NotImplementedError(pmid)
+
+        if df is not None:
+            df["pmid"] = pmid
+            print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
+            print("Counts by sample id:")
+            print(df.groupby("sample_id").peptide.nunique())
+            print("")
+            print("Counts by sample type:")
+            print(df.groupby("sample_type").peptide.nunique())
+            print("****************************")
+
+            dfs.append(df)
+
+
+    df = pandas.concat(dfs, ignore_index=True)
+    df.to_csv(args.out, index=False)
+    print("Wrote: %s" % args.out)
+
+if __name__ == '__main__':
+    run()
diff --git a/downloads-generation/data_curated/curate_multiallelic_ms.py b/downloads-generation/data_curated/curate_multiallelic_ms.py
deleted file mode 100755
index bdbb8115..00000000
--- a/downloads-generation/data_curated/curate_multiallelic_ms.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-Filter and combine various peptide/MHC datasets to derive a composite training set,
-optionally including eluted peptides identified by mass-spec.
-"""
-import sys
-import argparse
-
-import pandas
-
-import mhcnames
-
-
-def normalize_allele_name(s):
-    try:
-        return mhcnames.normalize_allele_name(s)
-    except Exception:
-        return "UNKNOWN"
-
-
-parser = argparse.ArgumentParser(usage=__doc__)
-
-parser.add_argument(
-    "pmid",
-    metavar="PMID",
-    help="PMID of dataset to curate")
-parser.add_argument(
-    "files",
-    nargs="+",
-    metavar="FILE",
-    help="File paths of data to curate")
-parser.add_argument(
-    "--out",
-    metavar="OUT.csv",
-    help="Out file path")
-parser.add_argument(
-    "--debug",
-    action="store_true",
-    default=False,
-    help="Leave user in pdb if PMID is unsupported")
-
-HANDLERS = {}
-
-
-def load(filenames, **kwargs):
-    result = {}
-    for filename in filenames:
-        if filename.endswith(".csv"):
-            result[filename] = pandas.read_csv(filename, **kwargs)
-        elif filename.endswith(".xlsx") or filename.endswith(".xls"):
-            result[filename] = pandas.read_excel(filename, **kwargs)
-        else:
-            result[filename] = filename
-
-    return result
-
-
-def debug(*filenames):
-    loaded = load(filenames)
-    import ipdb
-    ipdb.set_trace()
-
-
-def pmid_27600516(filename):
-    df = pandas.read_csv(filename)
-
-    sample_to_peptides = {}
-    current_sample = None
-    for peptide in df.peptide:
-        if peptide.startswith("#"):
-            current_sample = peptide[1:]
-            sample_to_peptides[current_sample] = []
-        else:
-            assert current_sample is not None
-            sample_to_peptides[current_sample].append(peptide.strip().upper())
-
-    rows = []
-    for (sample, peptides) in sample_to_peptides.items():
-        for peptide in sorted(set(peptides)):
-            rows.append([sample, peptide])
-
-    result = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
-    return result
-
-
-HANDLERS["27600516"] = pmid_27600516
-
-
-def run():
-    args = parser.parse_args(sys.argv[1:])
-
-    if args.pmid in HANDLERS:
-        df = HANDLERS[args.pmid](*args.files)
-    elif args.debug:
-        debug(*args.files)
-    else:
-        raise NotImplementedError(args.pmid)
-
-    df.to_csv(args.out, index=False)
-    print("Wrote: %s" % args.out)
-
-if __name__ == '__main__':
-    run()
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 3c731928..1ccb25be 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -46,7 +46,7 @@ releases:
               default: false
 
             - name: data_published
-              url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
+              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
               default: false
 
             - name: data_curated
@@ -109,7 +109,7 @@ releases:
               default: false
 
             - name: data_published
-              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
+              url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
               default: false
 
             - name: data_curated
-- 
GitLab
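
Note (illustrative only, not part of the patch above): curate_by_pmid.py dispatches on
PMID by collecting every module-level function named handle_pmid_<PMID> into the
HANDLERS dict, so supporting a new publication only requires defining one such function
and passing its files via --item. The sketch below shows the shape such a handler might
take; the PMID 99999999, the file path, and the "Sequence" column name are hypothetical
placeholders, not data referenced by this patch.

    import pandas

    def handle_pmid_99999999(filename):
        # Hypothetical handler: read a supplementary Excel table whose peptide
        # column is assumed to be named "Sequence".
        df = pandas.read_excel(filename)
        result = pandas.DataFrame({"peptide": df.Sequence.values})
        result["sample_id"] = "99999999"     # one sample per publication in this sketch
        result["sample_type"] = "cell_line"  # assumed sample type
        return result

With such a handler defined, the --item loop in GENERATE.sh would pick the files up
automatically, equivalent to running:

    python curate_by_pmid.py \
        --item 99999999 raw/99999999/supplementary_table_1.xlsx \
        --out curated.by_pmid.csv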