update

2bdae397 · Tim O'Donnell · dd582f48 · 2bdae397
Commit 2bdae397 authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/data_curated/curate_by_pmid.py
+++ b/downloads-generation/data_curated/curate_by_pmid.py
@@ -5,6 +5,7 @@ optionally including eluted peptides identified by mass-spec.
 import sys
 import argparse
 import os
+import collections
 import pandas
@@ -122,28 +123,58 @@ def handle_pmid_25576301(filename):
    assert peptides[0] == "AAAAAAAQSVY"
    assert peptides[-1] == "YYYNGKAVY"
-    # TODO TODO
+    column_to_sample = {}
-    import ipdb ; ipdb.set_trace()
+    for s in [c for c in df if c.startswith("Intensity ")]:
+        assert s[-2] == "-"
+        column_to_sample[s] = s.replace("Intensity ", "")[:-2].strip()
-    # THIS IS ALL JUNK:
+    intensity_columns = list(column_to_sample)
-    result = pandas.DataFrame({
-        "peptide": peptides,
+    rows = []
-    })
+    for _, row in df.iterrows():
-    result["sample_id"] = "24616531"
+        x1 = row[intensity_columns]
-    result["sample_type"] = "B-lymphoblastoid"
+        x2 = x1[x1 > 0].index.map(column_to_sample).value_counts()
-    result["cell_line"] = "GR"
+        x3 = x2[x2 >= 2]  # keep only samples where this peptide has intensity > 0 in at least two replicate columns
+        for sample in x3.index:
+            rows.append((row.Sequence, sample))
+    result = pandas.DataFrame(rows, columns=["peptide", "sample_id"])
+    result["cell_line"] = ""
    result["pulldown_antibody"] = "W6/32"
-    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
+    allele_map = {
-    # we are guessing the exact 4 digit alleles based on this.
+        'Fib': "HLA-A*03:01	HLA-A*23:01	HLA-B*08:01	HLA-B*15:18	HLA-C*07:02	HLA-C*07:04",
-    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
+        'HCC1937': "HLA-A*23:01 HLA-A*24:02 HLA-B*07:02 HLA-B*40:01 HLA-C*03:04 HLA-C*07:02",
+        'SupB15WT': None,  # four digit alleles unknown, will drop sample
+        'SupB15RT': None,
+        'HCT116': "HLA-A*01:01 HLA-A*02:01 HLA-B*45:01 HLA-B*18:01 HLA-C*05:01 HLA-C*07:01",
+        # Homozygous at HLA-A:
+        'HCC1143': "HLA-A*31:01 HLA-A*31:01 HLA-B*35:08 HLA-B*37:01 HLA-C*04:01 HLA-C*06:02",
+        # Homozygous everywhere:
+        'JY': "HLA-A*02:01 HLA-A*02:01 HLA-B*07:02 HLA-B*07:02 HLA-C*07:02 HLA-C*07:02",
+    }
+    sample_type = {
+        'Fib': "fibroblast",
+        'HCC1937': "basal like breast cancer",
+        'SupB15WT': None,
+        'SupB15RT': None,
+        'HCT116': "colon carcinoma",
+        'HCC1143': "basal like breast cancer",
+        'JY': "B-cell",
+    }
+    result["hla"] = result.sample_id.map(allele_map)
+    print("Entries before dropping samples with unknown alleles", len(result))
+    result = result.loc[~result.hla.isnull()]
+    print("Entries after dropping samples with unknown alleles", len(result))
+    result["sample_type"] = result.sample_id.map(sample_type)
+    print(result.head(3))
    return result
 # Hack to add all functions with names like handle_pmid_XXXX to HANDLERS dict.
 for (key, value) in list(locals().items()):
    if key.startswith("handle_pmid_"):