working on curation

38c44b56 · Tim O'Donnell · ecae1b31 · 38c44b56 · 38c44b56 · 38c44b56
Commit 38c44b56 authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -18,8 +18,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

 # Send stdout and stderr to a logfile included with the archive.
-exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+#exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

 # Log some environment info
 date
@@ -29,6 +29,22 @@ git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME

 cp $SCRIPT_DIR/curate.py .
+cp $SCRIPT_DIR/curate_by_pmid.py .
+
+RAW_DIR="$(mhcflurry-downloads path data_published)/raw"
+cp -r "$RAW_DIR" .
+
+CURATE_BY_PMID_ARGS=()  # array, so file names containing spaces stay intact
+for pmid_dir in raw/*
+do
+    CURATE_BY_PMID_ARGS+=(--item "$(basename "$pmid_dir")" "$pmid_dir"/*)
+done
+
+time python curate_by_pmid.py "${CURATE_BY_PMID_ARGS[@]}" --out curated.by_pmid.csv --debug
+
+exit 1
+
+

 # No mass-spec data
 time python curate.py \

--- a/downloads-generation/data_curated/curate_multiallelic_ms.py
+++ b/downloads-generation/data_curated/curate_multiallelic_ms.py
@@ -4,6 +4,7 @@ optionally including eluted peptides identified by mass-spec.
 """
 import sys
 import argparse
+import os

 import pandas

@@ -20,14 +21,12 @@ def normalize_allele_name(s):
 parser = argparse.ArgumentParser(usage=__doc__)

 parser.add_argument(
-    "pmid",
-    metavar="PMID",
-    help="PMID of dataset to curate")
-parser.add_argument(
-    "files",
+    "--item",
    nargs="+",
-    metavar="FILE",
-    help="File paths of data to curate")
+    action="append",
+    metavar="PMID FILE, ... FILE",
+    default=[],
+    help="Item to curate: PMID and list of files")
 parser.add_argument(
    "--out",
    metavar="OUT.csv",
@@ -60,7 +59,7 @@ def debug(*filenames):
    ipdb.set_trace()


-def pmid_27600516(filename):
+def handle_pmid_27600516(filename):
    df = pandas.read_csv(filename)

    sample_to_peptides = {}
@@ -79,22 +78,108 @@ def pmid_27600516(filename):
            rows.append([sample, peptide])

    result = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
+    result["sample_type"] = "melanoma_cell_line"
    return result


-HANDLERS["27600516"] = pmid_27600516
+def handle_pmid_23481700(filename):
+    """Build the curated peptide table for PMID 23481700 from its Excel file."""
+    sheet = pandas.read_excel(filename)
+    # Peptides are read from the first column, starting at row offset 10.
+    peptides = sheet.iloc[10:, 0].values
+    assert peptides[0] == "TPSLVKSTSQL"
+    assert peptides[-1] == "LPHSVNSKL"
+
+    result = pandas.DataFrame({"peptide": peptides})
+    result["sample_id"] = "23481700"
+    result["sample_type"] = "B-LCL"
+    return result
+
+
+def handle_pmid_24616531(filename):
+    """Curate the "EThcD" sheet of the PMID 24616531 spreadsheet, one row per peptide."""
+    # `sheet_name` replaces the `sheetname` keyword, which pandas deprecated in 0.21.
+    df = pandas.read_excel(filename, sheet_name="EThcD")
+    peptides = df.Sequence.values
+    assert peptides[0] == "APFLRIAF"
+    assert peptides[-1] == "WRQAGLSYIRYSQI"
+
+    result = pandas.DataFrame({"peptide": peptides})
+    result["sample_id"] = "24616531"
+    result["sample_type"] = "B-lymphoblastoid"
+    result["cell_line"] = "GR"
+    result["pulldown_antibody"] = "W6/32"
+
+    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
+    # we are guessing the exact 4 digit alleles based on this.
+    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
+    return result
+
+
+def handle_pmid_25576301(filename):
+    """Work-in-progress handler for PMID 25576301 (reads the "Peptides" sheet)."""
+    # `sheet_name` replaces the `sheetname` keyword, which pandas deprecated in 0.21.
+    df = pandas.read_excel(filename, sheet_name="Peptides")
+    peptides = df.Sequence.values
+    assert peptides[0] == "AAAAAAAQSVY"
+    assert peptides[-1] == "YYYNGKAVY"
+
+    # TODO TODO
+    import ipdb ; ipdb.set_trace()
+
+    # THIS IS ALL JUNK:
+    result = pandas.DataFrame({"peptide": peptides})
+    result["sample_id"] = "24616531"
+    result["sample_type"] = "B-lymphoblastoid"
+    result["cell_line"] = "GR"
+    result["pulldown_antibody"] = "W6/32"
+
+    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
+    # we are guessing the exact 4 digit alleles based on this.
+    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
+    return result
+
+
+
+
+
+
+# Hack: register every module-level name starting with handle_pmid_ in HANDLERS, keyed by the PMID suffix.
+for (key, value) in list(locals().items()):
+    if key.startswith("handle_pmid_"):
+        HANDLERS[key.replace("handle_pmid_", "")] = value


 def run():
    args = parser.parse_args(sys.argv[1:])

-    if args.pmid in HANDLERS:
-        df = HANDLERS[args.pmid](*args.files)
-    elif args.debug:
-        debug(*args.files)
-    else:
-        raise NotImplementedError(args.pmid)
+    dfs = []
+    for item_tpl in args.item:
+        (pmid, filenames) = (item_tpl[0], item_tpl[1:])
+        print("Processing item", pmid, *[os.path.abspath(f) for f in filenames])
+
+        df = None
+        if pmid in HANDLERS:
+            df = HANDLERS[pmid](*filenames)
+        elif args.debug:
+            debug(*filenames)
+        else:
+            raise NotImplementedError(pmid)  # no args.pmid: PMIDs now come from --item
+
+        if df is not None:
+            df["pmid"] = pmid
+            print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
+            print("Counts by sample id:")
+            print(df.groupby("sample_id").peptide.nunique())
+            print("")
+            print("Counts by sample type:")
+            print(df.groupby("sample_type").peptide.nunique())
+            print("****************************")
+
+            dfs.append(df)
+

+    df = pandas.concat(dfs, ignore_index=True)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % args.out)


--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -46,7 +46,7 @@ releases:
              default: false

            - name: data_published
-              url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
+              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
              default: false

            - name: data_curated
@@ -109,7 +109,7 @@ releases:
              default: false

            - name: data_published
-              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
+              url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
              default: false

            - name: data_curated