From 6e21104d5d146cd9b250a2131cf405f601f1030b Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 24 Sep 2019 22:38:34 -0400
Subject: [PATCH] Curate more MS datasets by PMID and add sample metadata columns

---
 downloads-generation/data_curated/GENERATE.sh |   8 +-
 .../data_curated/curate_by_pmid.py            | 531 ++++++++++++++++--
 .../data_published/GENERATE.sh                |   9 +
 mhcflurry/downloads.yml                       |   2 +-
 4 files changed, 505 insertions(+), 45 deletions(-)

diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh
index 0e68fc90..ac51875b 100755
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -18,8 +18,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-#exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
@@ -42,10 +42,6 @@ done
 
 time python curate_by_pmid.py $CURATE_BY_PMID_ARGS --out curated.by_pmid.csv --debug
 
-exit 1
-
-
-
 # No mass-spec data
 time python curate.py \
     --data-iedb \
diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py
index a105de05..4d478b9d 100755
--- a/downloads-generation/data_curated/curate_by_pmid.py
+++ b/downloads-generation/data_curated/curate_by_pmid.py
@@ -6,6 +6,7 @@ import sys
 import argparse
 import os
 import collections
+from six.moves import StringIO
 
 import pandas
 
@@ -61,6 +62,7 @@ def debug(*filenames):
 
 
 def handle_pmid_27600516(filename):
+    """Gloger, ..., Neri Cancer Immunol Immunother 2016 [PMID 27600516]"""
     df = pandas.read_csv(filename)
 
     sample_to_peptides = {}
@@ -78,47 +80,81 @@ def handle_pmid_27600516(filename):
         for peptide in sorted(set(peptides)):
             rows.append([sample, peptide])
 
-    result = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
-    result["sample_type"] = "melanoma_cell_line"
-    return result
+    result_df = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
+    result_df["sample_type"] = "melanoma_cell_line"
+    result_df["cell_line"] = result_df.sample_id
+    result_df["mhc_class"] = "I"
+    result_df["pulldown_antibody"] = "W6/32"
+    result_df["format"] = "multiallelic"
+    result_df["hla"] = result_df.sample_id.map({
+        "FM-82": "HLA-A*02:01 HLA-A*01:01 HLA-B*08:01 HLA-B*15:01 HLA-C*03:04 HLA-C*07:01",
+        "FM-93/2": "HLA-A*02:01 HLA-A*26:01 HLA-B*40:01 HLA-B*44:02 HLA-C*03:04 HLA-C*05:01",
+        "Mel-624": "HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-B*14:01 HLA-C*07:02 HLA-C*08:02",
+        "MeWo": "HLA-A*02:01 HLA-A*26:01 HLA-B*14:02 HLA-B*38:01 HLA-C*08:02 HLA-C*12:03",
+        "SK-Mel-5": "HLA-A*02:01 HLA-A*11:01 HLA-B*40:01 HLA-C*03:03",
+    })
+    return result_df
 
 
 def handle_pmid_23481700(filename):
-    df = pandas.read_excel(filename)
-    peptides = df.iloc[10:,0].values
-    assert peptides[0] == "TPSLVKSTSQL"
-    assert peptides[-1] == "LPHSVNSKL"
+    """Hassan, ..., van Veelen Mol Cell Proteomics 2013 [PMID 23481700]"""
+    df = pandas.read_excel(filename, skiprows=10)
+    assert df["Peptide sequence"].iloc[0] == "TPSLVKSTSQL"
+    assert df["Peptide sequence"].iloc[-1] == "LPHSVNSKL"
+
+    hla = {
+        "JY": "HLA-A*02:01 HLA-B*07:02 HLA-C*07:02",
+        "HHC": "HLA-A*02:01 HLA-B*07:02 HLA-B*44:02 HLA-C*05:01 HLA-C*07:02",
+    }
 
-    result = pandas.DataFrame({
-        "peptide": peptides,
-    })
-    result["sample_id"] = "23481700"
-    result["sample_type"] = "B-LCL"
-    return result
+    results = []
+    for sample_id in ["JY", "HHC"]:
+        # A row is a hit when "Int <sample>" is > 0; "n.q."/"n.q" count as 0.
+        hits_df = df.loc[
+            df["Int %s" % sample_id].map(
+                lambda x: {"n.q.": 0, "n.q": 0}.get(x, x)).astype(float) > 0
+        ]
+        result_df = pandas.DataFrame({
+            "peptide": hits_df["Peptide sequence"].dropna().values,
+        })
+        result_df["sample_id"] = sample_id
+        result_df["cell_line"] = "B-LCL-" + sample_id
+        result_df["hla"] = hla[sample_id]
+        result_df["sample_type"] = "B-LCL"
+        result_df["mhc_class"] = "I"
+        result_df["format"] = "multiallelic"
+        result_df["pulldown_antibody"] = "W6/32"
+        results.append(result_df)
+
+    return pandas.concat(results, ignore_index=True)
 
 
 def handle_pmid_24616531(filename):
-    df = pandas.read_excel(filename, sheetname="EThcD")
+    """Mommen, ..., Heck PNAS 2014 [PMID 24616531]"""
+    df = pandas.read_excel(filename, sheet_name="EThcD")  # only the "EThcD" sheet is read
     peptides = df.Sequence.values
     assert peptides[0] == "APFLRIAF"
     assert peptides[-1] == "WRQAGLSYIRYSQI"
 
-    result = pandas.DataFrame({
+    result_df = pandas.DataFrame({
         "peptide": peptides,
     })
-    result["sample_id"] = "24616531"
-    result["sample_type"] = "B-lymphoblastoid"
-    result["cell_line"] = "GR"
-    result["pulldown_antibody"] = "W6/32"
+    result_df["sample_id"] = "24616531"  # single sample; the PMID serves as its id
+    result_df["sample_type"] = "B-lymphoblastoid"
+    result_df["cell_line"] = "GR"
+    result_df["pulldown_antibody"] = "W6/32"
 
     # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
     # we are guessing the exact 4 digit alleles based on this.
-    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
-    return result
+    result_df["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
+    result_df["mhc_class"] = "I"
+    result_df["format"] = "multiallelic"
+    return result_df
 
 
 def handle_pmid_25576301(filename):
-    df = pandas.read_excel(filename, sheetname="Peptides")
+    """Bassani-Sternberg, ..., Mann Mol Cell Proteomics 2015 [PMID 25576301]"""
+    df = pandas.read_excel(filename, sheet_name="Peptides")
     peptides = df.Sequence.values   
     assert peptides[0] == "AAAAAAAQSVY"
     assert peptides[-1] == "YYYNGKAVY"
@@ -138,9 +174,11 @@ def handle_pmid_25576301(filename):
         for sample in x3.index:
             rows.append((row.Sequence, sample))
 
-    result = pandas.DataFrame(rows, columns=["peptide", "sample_id"])
-    result["cell_line"] = ""
-    result["pulldown_antibody"] = "W6/32"
+    result_df = pandas.DataFrame(rows, columns=["peptide", "sample_id"])
+    result_df["cell_line"] = ""
+    result_df["pulldown_antibody"] = "W6/32"
+    result_df["mhc_class"] = "I"
+    result_df["format"] = "multiallelic"
 
     allele_map = {
         'Fib': "HLA-A*03:01	HLA-A*23:01	HLA-B*08:01	HLA-B*15:18	HLA-C*07:02	HLA-C*07:04",
@@ -165,17 +203,411 @@ def handle_pmid_25576301(filename):
         'HCC1143': "basal like breast cancer",
         'JY': "B-cell",
     }
-    result["hla"] = result.sample_id.map(allele_map)
-    print("Entries before dropping samples with unknown alleles", len(result))
-    result = result.loc[~result.hla.isnull()]
-    print("Entries after dropping samples with unknown alleles", len(result))
-    result["sample_type"] = result.sample_id.map(sample_type)
-    print(result.head(3))
-    return result
+    result_df["hla"] = result_df.sample_id.map(allele_map)
+    print("Entries before dropping samples with unknown alleles", len(result_df))
+    result_df = result_df.loc[~result_df.hla.isnull()]
+    print("Entries after dropping samples with unknown alleles", len(result_df))
+    result_df["sample_type"] = result_df.sample_id.map(sample_type)
+    print(result_df.head(3))
+    return result_df
+
+
+def handle_pmid_26992070(*filenames):
+    """Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]"""
+    # filenames: supplementary files of the publication; only paths ending in
+    # ".xlsx" are read. Returns a DataFrame with one row per (sample, peptide).
+    allele_text = """
+        Cell line	HLA-A 1	HLA-A 2	HLA-B 1	HLA-B 2	HLA-C 1	HLA-C 2
+        HEK293	03:01	03:01	07:02	07:02	07:02	07:02
+        HL-60	01:01	01:01	57:01	57:01	06:02	06:02
+        RPMI8226	30:01	68:02	15:03	15:10	02:10	03:04
+        MAVER-1	24:02	26:01	38:01	44:02	05:01	12:03
+        THP-1	02:01	24:02	15:11	35:01	03:03	03:03
+    """
+    allele_info = pandas.read_csv(StringIO(allele_text), sep="\t", index_col=0)
+    allele_info.index = allele_info.index.str.strip()
+    for gene in ["A", "B", "C"]:
+        for num in ["1", "2"]:
+            column = "HLA-%s %s" % (gene, num)
+            # Build standard names such as "HLA-A*03:01", as other handlers do.
+            allele_info[column] = "HLA-" + gene + "*" + allele_info[column]
+    cell_line_to_allele = allele_info.apply(" ".join, axis=1)
+
+    sheets = {}
+    for f in filenames:
+        if f.endswith(".xlsx"):
+            sheets.update(pandas.read_excel(f, sheet_name=None, skiprows=1))
 
+    dfs = []
+    for cell_line in cell_line_to_allele.index:
+        # Using data from DeepQuanTR, which appears to be a consensus between
+        # two other methods used.
+        sheet = sheets[cell_line + "_DeepQuanTR"]
+        # Keep rows whose "Sample" columns sum to more than 1 (missing = 0).
+        # NOTE(review): presumably per-replicate detection flags - confirm.
+        replicated = sheet.loc[
+            sheet[[c for c in sheet if "Sample" in c]].fillna(0).sum(1) > 1
+        ]
+        df = pandas.DataFrame({"peptide": replicated.Sequence.values})
+        df["sample_id"] = cell_line
+        df["hla"] = cell_line_to_allele.get(cell_line)
+        dfs.append(df)
+
+    result_df = pandas.concat(dfs, ignore_index=True)
+    result_df["pulldown_antibody"] = "W6/32"
+    result_df["cell_line"] = result_df["sample_id"]
+    result_df["sample_type"] = result_df.sample_id.map({
+        "HEK293": "hek",
+        "HL-60": "neutrophil",
+        "RPMI8226": "b-cell",
+        "MAVER-1": "b-lymphoblast",
+        "THP-1": "monocyte",
+    })
+    result_df["mhc_class"] = "I"
+    result_df["format"] = "multiallelic"
+    return result_df
 
 
-# Hack to add all functions with names like handle_pmid_XXXX to HANDLERS dict.
+def handle_pmid_27412690(filename):
+    """Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690]"""
+    # Sample ids and the cell line each one belongs to.
+    sample_id_to_cell_line = {
+        "U-87": "U-87",
+        "T98G": "T98G",
+        "LNT-229": "LNT-229",
+        "U-87+DAC": "U-87",
+        "T98G+DAC": "T98G",
+        "LNT-229+DAC": "LNT-229",
+    }
+    # Alleles assigned to each cell line.
+    hla_types = {
+        "U-87": "HLA-A*02:01 HLA-B*44:02 HLA-C*05:01",
+        "T98G": "HLA-A*02:01 HLA-B*39:06 HLA-C*07:02",
+        "LNT-229": "HLA-A*03:01 HLA-B*35:01 HLA-C*04:01",
+    }
+
+    df = pandas.read_excel(filename)
+    assert df.Sequence.iloc[0] == "AAAAAAGSGTPR"
+
+    # Bucket the "Intensity <sample_id> ..." columns by sample, keeping the
+    # spreadsheet's column order within each bucket.
+    columns_by_sample = dict((key, []) for key in sample_id_to_cell_line)
+    for col in df:
+        if not col.startswith("Intensity "):
+            continue
+        sample_id = col.split()[1]
+        assert sample_id in sample_id_to_cell_line, (col, sample_id)
+        columns_by_sample[sample_id].append(col)
+
+    dfs = []
+    for (sample_id, cell_line) in sample_id_to_cell_line.items():
+        # A peptide counts as a hit when more than one of the sample's
+        # intensity columns is positive.
+        num_positive = (df[columns_by_sample[sample_id]] > 0).sum(1)
+        hits_df = df.loc[num_positive > 1]
+        result_df = pandas.DataFrame({"peptide": hits_df.Sequence.values})
+        result_df["sample_id"] = sample_id
+        result_df["cell_line"] = cell_line
+        result_df["hla"] = hla_types[cell_line]
+
+        dfs.append(result_df)
+
+    result_df = pandas.concat(dfs, ignore_index=True)
+    result_df["sample_type"] = "glioblastoma"
+    result_df["pulldown_antibody"] = "W6/32"
+    result_df["mhc_class"] = "I"
+    result_df["format"] = "multiallelic"
+    return result_df
+
+
+def handle_pmid_28832583(*filenames):
+    """Bassani-Sternberg, ..., Gfeller PLOS Comp. Bio. 2017 [PMID 28832583]"""
+    # This work also reanalyzes data from
+    # Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
+    # Expects exactly two tab-separated files; they are read in sorted order.
+    (filename_dataset1, filename_dataset2) = sorted(filenames)
+
+    dataset1 = pandas.read_csv(filename_dataset1, sep="\t")
+    dataset2 = pandas.read_csv(filename_dataset2, sep="\t")
+    df = pandas.concat([dataset1, dataset2], ignore_index=True, sort=False)
+
+    info_text = """
+    cell_line	origin	original_pmid	allele1	allele2	allele3	allele4	allele5	allele6
+    CD165	B-cell	28832583	HLA-A*02:05	HLA-A*24:02	HLA-B*15:01	HLA-B*50:01	HLA-C*03:03	HLA-C*06:02
+    CM467	B-cell	28832583	HLA-A*01:01	HLA-A*24:02	HLA-B*13:02	HLA-B*39:06	HLA-C*06:02	HLA-C*12:03
+    GD149	B-cell	28832583	HLA-A*01:01	HLA-A*24:02	HLA-B*38:01	HLA-B*44:03	HLA-C*06:02	HLA-C*12:03
+    MD155	B-cell	28832583	HLA-A*02:01	HLA-A*24:02	HLA-B*15:01	HLA-B*18:01	HLA-C*03:03	HLA-C*07:01
+    PD42	B cell	28832583	HLA-A*02:06	HLA-A*24:02	HLA-B*07:02	HLA-B*55:01	HLA-C*01:02	HLA-C*07:02
+    RA957	B cell	28832583	HLA-A*02:20	HLA-A*68:01	HLA-B*35:03	HLA-B*39:01	HLA-C*04:01	HLA-C*07:02
+    TIL1	TIL	28832583	HLA-A*02:01	HLA-A*02:01	HLA-B*18:01	HLA-B*38:01	HLA-C*05:01	
+    TIL3	TIL	28832583	HLA-A*01:01	HLA-A*23:01	HLA-B*07:02	HLA-B*15:01	HLA-C*12:03	HLA-C*14:02
+    Apher1	Leukapheresis	28832583	HLA-A*03:01	HLA-A*29:02	HLA-B*44:02	HLA-B*44:03	HLA-C*12:03	HLA-C*16:01
+    Apher6	Leukapheresis	28832583	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
+    pat_AC2	B lymphoblast	27841757	HLA-A*03:01	HLA-A*32:01	HLA-B*27:05	HLA-B*45:01		
+    pat_C	B lymphoblast	27841757	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
+    pat_CELG	B lymphoblast	27841757	HLA-A*02:01	HLA-A*24:02	HLA-B*15:01	HLA-B*73:01	HLA-C*03:03	HLA-C*15:05
+    pat_CP2	B lymphoblast	27841757	HLA-A*11:01		HLA-B*14:02	HLA-B*44:02		
+    pat_FL	B lymphoblast	27841757	HLA-A*03:01	HLA-A*11:01	HLA-B*44:03	HLA-B*50:01		
+    pat_J	B lymphoblast	27841757	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
+    pat_JPB3	B lymphoblast	27841757	HLA-A*02:01	HLA-A*11:01	HLA-B*27:05	HLA-B*56:01		
+    pat_JT2	B lymphoblast	27841757	HLA-A*11:01		HLA-B*18:03	HLA-B*35:01		
+    pat_M	B lymphoblast	27841757	HLA-A*03:01	HLA-A*29:02	HLA-B*08:01	HLA-B*44:03	HLA-C*07:01	HLA-C*16:01
+    pat_MA	B lymphoblast	27841757	HLA-A*02:01	HLA-A*29:02	HLA-B*44:03	HLA-B*57:01	HLA-C*07:01	HLA-C*16:01
+    pat_ML	B lymphoblast	27841757	HLA-A*02:01	HLA-A*11:01	HLA-B*40:01	HLA-B*44:03		
+    pat_NS2	B lymphoblast	27841757	HLA-A*02:01		HLA-B*13:02	HLA-B*41:01		
+    pat_NT	B lymphoblast	27841757	HLA-A*01:01	HLA-A*32:01	HLA-B*08:01			
+    pat_PF1	B lymphoblast	27841757	HLA-A*01:01	HLA-A*02:01	HLA-B*07:02	HLA-B*44:03	HLA-C*07:02	HLA-C*16:01
+    pat_R	B lymphoblast	27841757	HLA-A*03:01	HLA-A*29:02	HLA-B*08:01	HLA-B*44:03	HLA-C*07:01	HLA-C*16:01
+    pat_RT	B lymphoblast	27841757	HLA-A*01:01	HLA-A*02:01	HLA-B*18:01	HLA-B*39:24	HLA-C*05:01	HLA-C*07:01
+    pat_SR	B lymphoblast	27841757	HLA-A*02:01	HLA-A*23:01	HLA-B*18:01	HLA-B*44:03		
+    pat_ST	B lymphoblast	27841757	HLA-A*03:01	HLA-A*24:02	HLA-B*07:02	HLA-B*27:05
+    """
+    info_df = pandas.read_csv(StringIO(info_text), sep="\t", index_col=0)
+    info_df.index = info_df.index.str.strip()
+
+    # Join only the alleles present, avoiding doubled/trailing spaces in "hla".
+    allele_cols = [c for c in info_df if c.startswith("allele")]
+    info_df["hla"] = info_df[allele_cols].apply(
+        lambda row: " ".join(row.dropna()), axis=1)
+
+    results = []
+    for col in df.columns:
+        if col.startswith("Intensity "):
+            sample_id = col.replace("Intensity ", "")
+            assert sample_id in info_df.index, sample_id
+            peptides = df.loc[df[col].fillna(0) > 0].Sequence.unique()
+            result_df = pandas.DataFrame({"peptide": peptides})
+            result_df["sample_id"] = sample_id
+            result_df["hla"] = info_df.loc[sample_id].hla
+            result_df["sample_type"] = info_df.loc[sample_id].origin
+            result_df["original_pmid"] = str(info_df.loc[sample_id].original_pmid)
+            results.append(result_df)
+
+    result_df = pandas.concat(results, ignore_index=True)
+    samples = result_df.sample_id.unique()
+    for sample_id in info_df.index:
+        assert sample_id in samples, (sample_id, samples)
+
+    result_df["mhc_class"] = "I"
+    result_df["format"] = "multiallelic"
+    result_df["cell_line"] = ""
+    result_df["pulldown_antibody"] = "W6/32"
+    return result_df
+
+
+def handle_pmid_31495665(filename):
+    """Abelin, ..., Rooney Immunity 2019 [PMID 31495665]"""
+    # The tables below hold per-sample metadata keyed by DataS1B column name.
+    # Samples whose hla_type is None are intentionally skipped.
+    hla_type = {
+        "HLA-DR_A375": None,
+        "HLA-DR_Lung": "DRB1*01:01 DRB1*03:01 DRB3*01:01",
+        "HLA-DR_PBMC_HDSC": "DRB1*03:01 DRB1*11:01 DRB3*01:01 DRB3*02:02",
+        "HLA-DR_PBMC_RG1095": "HLA-DRA1*01:01-DRB1*03:01 HLA-DRA1*01:01-DRB1*11:01 HLA-DRA1*01:01-DRB3*01:01 HLA-DRA1*01:01-DRB3*02:02",
+        "HLA-DR_PBMC_RG1104": "DRB1*01:01 DRB1*11:01 DRB3*02:02",
+        "HLA-DR_PBMC_RG1248": "DRB1*03:01 DRB1*03:01 DRB3*01:01 DRB3*01:01",
+        "HLA-DR_SILAC_Donor1_10minLysate": None,
+        "HLA-DR_SILAC_Donor1_5hrLysate": None,
+        "HLA-DR_SILAC_Donor1_DConly": None,
+        "HLA-DR_SILAC_Donor1_UVovernight": None,
+        "HLA-DR_SILAC_Donor2_DC_UV_16hr": None,
+        "HLA-DR_SILAC_Donor2_DC_UV_24hr": None,
+        "HLA-DR_Spleen": "DRB1*04:01 DRB4*01:03 DRB1*15:03 DRB5*01:01",
+        "MAPTAC_A*02:01": "HLA-A*02:01",
+        "MAPTAC_A*11:01": "HLA-A*11:01",
+        "MAPTAC_A*32:01": "HLA-A*32:01",
+        "MAPTAC_B*07:02": "HLA-B*07:02",
+        "MAPTAC_B*45:01": "HLA-B*45:01",
+        "MAPTAC_B*52:01": "HLA-B*52:01",
+        "MAPTAC_C*03:03": "HLA-C*03:03",
+        "MAPTAC_C*06:02": "HLA-C*06:02",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "HLA-DPB1*06:01-DPA1*01:03",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "HLA-DPB1*06:01-DPA1*01:03",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "HLA-DQB1*06:04-DQA1*01:02",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "HLA-DQB1*06:04-DQA1*01:02",
+        "MAPTAC_DRB1*01:01": "HLA-DRA1*01:01-DRB1*01:01",
+        "MAPTAC_DRB1*03:01": "HLA-DRA1*01:01-DRB1*03:01",
+        "MAPTAC_DRB1*04:01": "HLA-DRA1*01:01-DRB1*04:01",
+        "MAPTAC_DRB1*07:01": "HLA-DRA1*01:01-DRB1*07:01",
+        "MAPTAC_DRB1*11:01": "HLA-DRA1*01:01-DRB1*11:01",
+        "MAPTAC_DRB1*12:01_dm+": "HLA-DRA1*01:01-DRB1*12:01",
+        "MAPTAC_DRB1*12:01_dm-": "HLA-DRA1*01:01-DRB1*12:01",
+        "MAPTAC_DRB1*15:01": "HLA-DRA1*01:01-DRB1*15:01",
+        "MAPTAC_DRB3*01:01_dm+": "HLA-DRA1*01:01-DRB3*01:01",
+        "MAPTAC_DRB3*01:01_dm-": "HLA-DRA1*01:01-DRB3*01:01",
+    }
+    pulldown_antibody = {
+        "HLA-DR_Lung": "L243 (HLA-DR)",
+        "HLA-DR_PBMC_HDSC": "tal1b5 (HLA-DR)",
+        "HLA-DR_PBMC_RG1095": "tal1b5 (HLA-DR)",
+        "HLA-DR_PBMC_RG1104": "tal1b5 (HLA-DR)",
+        "HLA-DR_PBMC_RG1248": "tal1b5 (HLA-DR)",
+        "HLA-DR_Spleen": "L243 (HLA-DR)",
+        "MAPTAC_A*02:01": "MAPTAC",
+        "MAPTAC_A*11:01": "MAPTAC",
+        "MAPTAC_A*32:01": "MAPTAC",
+        "MAPTAC_B*07:02": "MAPTAC",
+        "MAPTAC_B*45:01": "MAPTAC",
+        "MAPTAC_B*52:01": "MAPTAC",
+        "MAPTAC_C*03:03": "MAPTAC",
+        "MAPTAC_C*06:02": "MAPTAC",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "MAPTAC",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "MAPTAC",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "MAPTAC",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "MAPTAC",
+        "MAPTAC_DRB1*01:01": "MAPTAC",
+        "MAPTAC_DRB1*03:01": "MAPTAC",
+        "MAPTAC_DRB1*04:01": "MAPTAC",
+        "MAPTAC_DRB1*07:01": "MAPTAC",
+        "MAPTAC_DRB1*11:01": "MAPTAC",
+        "MAPTAC_DRB1*12:01_dm+": "MAPTAC",
+        "MAPTAC_DRB1*12:01_dm-": "MAPTAC",
+        "MAPTAC_DRB1*15:01": "MAPTAC",
+        "MAPTAC_DRB3*01:01_dm+": "MAPTAC",
+        "MAPTAC_DRB3*01:01_dm-": "MAPTAC",
+    }
+    assay_format = {
+        "HLA-DR_Lung": "DR-specific",
+        "HLA-DR_PBMC_HDSC": "DR-specific",
+        "HLA-DR_PBMC_RG1095": "DR-specific",
+        "HLA-DR_PBMC_RG1104": "DR-specific",
+        "HLA-DR_PBMC_RG1248": "DR-specific",
+        "HLA-DR_Spleen": "DR-specific",
+        "MAPTAC_A*02:01": "monoallelic",
+        "MAPTAC_A*11:01": "monoallelic",
+        "MAPTAC_A*32:01": "monoallelic",
+        "MAPTAC_B*07:02": "monoallelic",
+        "MAPTAC_B*45:01": "monoallelic",
+        "MAPTAC_B*52:01": "monoallelic",
+        "MAPTAC_C*03:03": "monoallelic",
+        "MAPTAC_C*06:02": "monoallelic",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "monoallelic",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "monoallelic",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "monoallelic",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "monoallelic",
+        "MAPTAC_DRB1*01:01": "monoallelic",
+        "MAPTAC_DRB1*03:01": "monoallelic",
+        "MAPTAC_DRB1*04:01": "monoallelic",
+        "MAPTAC_DRB1*07:01": "monoallelic",
+        "MAPTAC_DRB1*11:01": "monoallelic",
+        "MAPTAC_DRB1*12:01_dm+": "monoallelic",
+        "MAPTAC_DRB1*12:01_dm-": "monoallelic",
+        "MAPTAC_DRB1*15:01": "monoallelic",
+        "MAPTAC_DRB3*01:01_dm+": "monoallelic",
+        "MAPTAC_DRB3*01:01_dm-": "monoallelic",
+    }
+    mhc_class = {
+        "HLA-DR_Lung": "II",
+        "HLA-DR_PBMC_HDSC": "II",
+        "HLA-DR_PBMC_RG1095": "II",
+        "HLA-DR_PBMC_RG1104": "II",
+        "HLA-DR_PBMC_RG1248": "II",
+        "HLA-DR_Spleen": "II",
+        "MAPTAC_A*02:01": "I",
+        "MAPTAC_A*11:01": "I",
+        "MAPTAC_A*32:01": "I",
+        "MAPTAC_B*07:02": "I",
+        "MAPTAC_B*45:01": "I",
+        "MAPTAC_B*52:01": "I",
+        "MAPTAC_C*03:03": "I",
+        "MAPTAC_C*06:02": "I",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "II",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "II",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "II",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "II",
+        "MAPTAC_DRB1*01:01": "II",
+        "MAPTAC_DRB1*03:01": "II",
+        "MAPTAC_DRB1*04:01": "II",
+        "MAPTAC_DRB1*07:01": "II",
+        "MAPTAC_DRB1*11:01": "II",
+        "MAPTAC_DRB1*12:01_dm+": "II",
+        "MAPTAC_DRB1*12:01_dm-": "II",
+        "MAPTAC_DRB1*15:01": "II",
+        "MAPTAC_DRB3*01:01_dm+": "II",
+        "MAPTAC_DRB3*01:01_dm-": "II",
+    }
+    cell_line = {
+        "HLA-DR_Lung": "",
+        "HLA-DR_PBMC_HDSC": "",
+        "HLA-DR_PBMC_RG1095": "",
+        "HLA-DR_PBMC_RG1104": "",
+        "HLA-DR_PBMC_RG1248": "",
+        "HLA-DR_Spleen": "",
+        "MAPTAC_A*02:01": "",
+        "MAPTAC_A*11:01": "",
+        "MAPTAC_A*32:01": "",
+        "MAPTAC_B*07:02": "",
+        "MAPTAC_B*45:01": "",
+        "MAPTAC_B*52:01": "",
+        "MAPTAC_C*03:03": "",
+        "MAPTAC_C*06:02": "",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293",  # don't actually see this in DataS1A!
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
+        "MAPTAC_DRB1*01:01": "",
+        "MAPTAC_DRB1*03:01": "",
+        "MAPTAC_DRB1*04:01": "",
+        "MAPTAC_DRB1*07:01": "",
+        "MAPTAC_DRB1*11:01": "",
+        "MAPTAC_DRB1*12:01_dm+": "",
+        "MAPTAC_DRB1*12:01_dm-": "",
+        "MAPTAC_DRB1*15:01": "",
+        "MAPTAC_DRB3*01:01_dm+": "",
+        "MAPTAC_DRB3*01:01_dm-": "",
+    }
+    sample_type = {
+        "HLA-DR_Lung": "lung",
+        "HLA-DR_PBMC_HDSC": "lung",  # NOTE(review): PBMC ids typed "lung" - confirm
+        "HLA-DR_PBMC_RG1095": "lung",
+        "HLA-DR_PBMC_RG1104": "lung",
+        "HLA-DR_PBMC_RG1248": "lung",
+        "HLA-DR_Spleen": "spleen",
+        "MAPTAC_A*02:01": "mixed",
+        "MAPTAC_A*11:01": "mixed",
+        "MAPTAC_A*32:01": "mixed",
+        "MAPTAC_B*07:02": "mixed",
+        "MAPTAC_B*45:01": "mixed",
+        "MAPTAC_B*52:01": "mixed",
+        "MAPTAC_C*03:03": "mixed",
+        "MAPTAC_C*06:02": "mixed",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "mixed",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "mixed",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "mixed",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "mixed",
+        "MAPTAC_DRB1*01:01": "mixed",
+        "MAPTAC_DRB1*03:01": "mixed",
+        "MAPTAC_DRB1*04:01": "mixed",
+        "MAPTAC_DRB1*07:01": "mixed",
+        "MAPTAC_DRB1*11:01": "mixed",
+        "MAPTAC_DRB1*12:01_dm+": "mixed",
+        "MAPTAC_DRB1*12:01_dm-": "mixed",
+        "MAPTAC_DRB1*15:01": "mixed",
+        "MAPTAC_DRB3*01:01_dm+": "mixed",
+        "MAPTAC_DRB3*01:01_dm-": "mixed",
+    }
+
+    # Each sheet column is one sample; its non-null cells are the peptides.
+    df = pandas.read_excel(filename, sheet_name="DataS1B")
+    results = []
+    for sample_id in df.columns:
+        if hla_type[sample_id] is None:
+            print("Intentionally skipping", sample_id)
+            continue
+
+        result_df = pandas.DataFrame({"peptide": df[sample_id].dropna().values})
+        result_df["sample_id"] = sample_id
+        result_df["hla"] = hla_type[sample_id]
+        result_df["pulldown_antibody"] = pulldown_antibody[sample_id]
+        result_df["format"] = assay_format[sample_id]
+        result_df["mhc_class"] = mhc_class[sample_id]
+        result_df["sample_type"] = sample_type[sample_id]
+        result_df["cell_line"] = cell_line[sample_id]
+        results.append(result_df)
+    return pandas.concat(results, ignore_index=True)
+
+
+# Add all functions with names like handle_pmid_XXXX to HANDLERS dict.
 for (key, value) in list(locals().items()):
     if key.startswith("handle_pmid_"):
         HANDLERS[key.replace("handle_pmid_", "")] = value
@@ -185,13 +617,18 @@ def run():
     args = parser.parse_args(sys.argv[1:])
 
     dfs = []
-    for item_tpl in args.item:
+    for (i, item_tpl) in enumerate(args.item):
         (pmid, filenames) = (item_tpl[0], item_tpl[1:])
-        print("Processing item", pmid, *[os.path.abspath(f) for f in filenames])
+        print(
+            "Processing item %d / %d" % (i + 1, len(args.item)),
+            pmid,
+            *[os.path.abspath(f) for f in filenames])
 
         df = None
+        handler = None
         if pmid in HANDLERS:
-            df = HANDLERS[pmid](*filenames)
+            handler = HANDLERS[pmid]
+            df = handler(*filenames)
         elif args.debug:
             debug(*filenames)
         else:
@@ -199,7 +636,12 @@ def run():
 
         if df is not None:
             df["pmid"] = pmid
+            if "original_pmid" not in df.columns:
+                df["original_pmid"] = pmid
+            df = df.applymap(str).applymap(str.upper)
             print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
+            if handler is not None:
+                print(handler.__doc__)
             print("Counts by sample id:")
             print(df.groupby("sample_id").peptide.nunique())
             print("")
@@ -209,10 +651,23 @@ def run():
 
             dfs.append(df)
 
+    df = pandas.concat(dfs, ignore_index=True, sort=False)
+
+    df["cell_line"] = df["cell_line"].fillna("")
+
+    cols = ["pmid", "sample_id", "peptide", "format", "mhc_class", "hla", ]
+    cols += [c for c in sorted(df.columns) if c not in cols]
+    df = df[cols]
+
+    null_df = df.loc[df.isnull().any(1)]
+    if len(null_df) > 0:
+        print("Nulls:")
+        print(null_df)
+    else:
+        print("No nulls.")
 
-    df = pandas.concat(dfs, ignore_index=True)
     df.to_csv(args.out, index=False)
-    print("Wrote: %s" % args.out)
+    print("Wrote: %s" % os.path.abspath(args.out))
 
 if __name__ == '__main__':
     run()
diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh
index 33cc84c5..e566526a 100755
--- a/downloads-generation/data_published/GENERATE.sh
+++ b/downloads-generation/data_published/GENERATE.sh
@@ -46,6 +46,15 @@ PMID=28832583
 mkdir -p raw/$PMID
 wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P raw/$PMID # data generated in this work
 wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P raw/$PMID # data reanalyzed in this work
+cd raw/$PMID
+unzip *.s002
+unzip *.s003
+mkdir saved
+mv Dataset*/Dataset*.txt saved
+rm -rf Dataset* *.s002 *.s003 _*
+mv saved/* .
+rmdir saved
+cd ../..
 
 # Bassani-Sternberg, ..., Mann Mol Cell Proteomics 2015 [PMID 25576301]
 PMID=25576301
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 1ccb25be..57d17f13 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -46,7 +46,7 @@ releases:
               default: false
 
             - name: data_published
-              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
+              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190924.tar.bz2
               default: false
 
             - name: data_curated
-- 
GitLab