From 48d6eed286ba6d5a8b901c6c3e8405235a9bbd7e Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 11 Oct 2019 10:51:53 -0400
Subject: [PATCH] fixes

---
 downloads-generation/data_curated/GENERATE.sh |  28 +-
 ...curate_by_pmid.py => curate_ms_by_pmid.py} | 465 +++++++++++++++---
 .../data_published/GENERATE.sh                |  82 ++-
 .../models_class1_pan_variants/GENERATE.sh    |   6 +-
 mhcflurry/downloads.yml                       |   6 +-
 5 files changed, 491 insertions(+), 96 deletions(-)
 rename downloads-generation/data_curated/{curate_by_pmid.py => curate_ms_by_pmid.py} (59%)

diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh
index fd041f93..b35d01fc 100755
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -29,23 +29,33 @@ git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_DIR/curate.py .
-cp $SCRIPT_DIR/curate_by_pmid.py .
+cp $SCRIPT_DIR/curate_ms_by_pmid.py .
 
-RAW_DIR="$(mhcflurry-downloads path data_published)/raw"
-cp -r "$RAW_DIR" .
+MS_DIR="$(mhcflurry-downloads path data_published)/ms"
+cp -r "$MS_DIR" .
+
+EXPRESSION_DIR="$(mhcflurry-downloads path data_published)/expression"
+cp -r "$EXPRESSION_DIR" .
 
 CURATE_BY_PMID_ARGS=""
-for pmid in $(ls raw)
+for pmid in $(ls ms)
+do
+    CURATE_BY_PMID_ARGS+=$(echo --ms-item $pmid ms/$pmid/* ' ')
+done
+for item in $(ls expression)
 do
-    CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ')
+    CURATE_BY_PMID_ARGS+=$(echo --expression-item $item expression/$item/* ' ')
 done
 
-time python curate_by_pmid.py $CURATE_BY_PMID_ARGS \
-    --out nontraining_curated.by_pmid.csv
+time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \
+    --ms-out ms.nontraining_curated.by_pmid.csv \
+    --expression-out rna_expression.csv
+
+bzip2 ms.nontraining_curated.by_pmid.csv
+bzip2 rna_expression.csv
 
-bzip2 nontraining_curated.by_pmid.csv
 
-rm -rf raw
+rm -rf ms
 
 # No mass-spec data
 time python curate.py \
diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_ms_by_pmid.py
similarity index 59%
rename from downloads-generation/data_curated/curate_by_pmid.py
rename to downloads-generation/data_curated/curate_ms_by_pmid.py
index 4f1950e5..5406e238 100755
--- a/downloads-generation/data_curated/curate_by_pmid.py
+++ b/downloads-generation/data_curated/curate_ms_by_pmid.py
@@ -23,24 +23,35 @@ def normalize_allele_name(s):
 parser = argparse.ArgumentParser(usage=__doc__)
 
 parser.add_argument(
-    "--item",
+    "--ms-item",
     nargs="+",
     action="append",
     metavar="PMID FILE, ... FILE",
     default=[],
-    help="Item to curate: PMID and list of files")
+    help="Mass spec item to curate: PMID and list of files")
 parser.add_argument(
-    "--out",
+    "--expression-item",
+    nargs="+",
+    action="append",
+    metavar="LABEL FILE, ... FILE",
+    default=[],
+    help="Expression data to curate: dataset label and list of files")
+parser.add_argument(
+    "--ms-out",
+    metavar="OUT.csv",
+    help="Out file path (MS data)")
+parser.add_argument(
+    "--expression-out",
     metavar="OUT.csv",
-    help="Out file path")
+    help="Out file path (RNA-seq expression)")
 parser.add_argument(
     "--debug",
     action="store_true",
     default=False,
     help="Leave user in pdb if PMID is unsupported")
 
-HANDLERS = {}
-
+PMID_HANDLERS = {}
+EXPRESSION_HANDLERS = {}
 
 def load(filenames, **kwargs):
     result = {}
@@ -146,7 +157,7 @@ def handle_pmid_24616531(filename):
         "peptide": peptides,
     })
     result_df["sample_id"] = "24616531"
-    result_df["sample_type"] = "B-lymphoblastoid"
+    result_df["sample_type"] = "B-LCL"
     result_df["cell_line"] = "GR"
     result_df["pulldown_antibody"] = "W6/32"
 
@@ -181,7 +192,6 @@ def handle_pmid_25576301(filename):
             rows.append((row.Sequence, sample))
 
     result_df = pandas.DataFrame(rows, columns=["peptide", "sample_id"])
-    result_df["cell_line"] = ""
     result_df["pulldown_antibody"] = "W6/32"
     result_df["mhc_class"] = "I"
     result_df["format"] = "multiallelic"
@@ -209,11 +219,21 @@ def handle_pmid_25576301(filename):
         'HCC1143': "basal like breast cancer",
         'JY': "B-cell",
     }
+    cell_line = {
+        'Fib': None,
+        'HCC1937': "HCC1937",
+        'SupB15WT': None,
+        'SupB15RT': None,
+        'HCT116': "HCT116",
+        'HCC1143': "HCC1143",
+        'JY': "JY",
+    }
     result_df["hla"] = result_df.sample_id.map(allele_map)
     print("Entries before dropping samples with unknown alleles", len(result_df))
     result_df = result_df.loc[~result_df.hla.isnull()]
     print("Entries after dropping samples with unknown alleles", len(result_df))
     result_df["sample_type"] = result_df.sample_id.map(sample_type)
+    result_df["cell_line"] = result_df.sample_id.map(cell_line)
     print(result_df.head(3))
     return result_df
 
@@ -266,7 +286,7 @@ def handle_pmid_26992070(*filenames):
         "HEK293": "hek",
         "HL-60": "neutrophil",
         "RPMI8226": "b-cell",
-        "MAVER-1": "b-lymphoblast",
+        "MAVER-1": "b-LCL",
         "THP-1": "monocyte",
     })
     result_df["mhc_class"] = "I"
@@ -343,30 +363,30 @@ def handle_pmid_28832583(*filenames):
     CM467	B-cell	28832583	HLA-A*01:01	HLA-A*24:02	HLA-B*13:02	HLA-B*39:06	HLA-C*06:02	HLA-C*12:03
     GD149	B-cell	28832583	HLA-A*01:01	HLA-A*24:02	HLA-B*38:01	HLA-B*44:03	HLA-C*06:02	HLA-C*12:03
     MD155	B-cell	28832583	HLA-A*02:01	HLA-A*24:02	HLA-B*15:01	HLA-B*18:01	HLA-C*03:03	HLA-C*07:01
-    PD42	B cell	28832583	HLA-A*02:06	HLA-A*24:02	HLA-B*07:02	HLA-B*55:01	HLA-C*01:02	HLA-C*07:02
-    RA957	B cell	28832583	HLA-A*02:20	HLA-A*68:01	HLA-B*35:03	HLA-B*39:01	HLA-C*04:01	HLA-C*07:02
+    PD42	B-cell	28832583	HLA-A*02:06	HLA-A*24:02	HLA-B*07:02	HLA-B*55:01	HLA-C*01:02	HLA-C*07:02
+    RA957	B-cell	28832583	HLA-A*02:20	HLA-A*68:01	HLA-B*35:03	HLA-B*39:01	HLA-C*04:01	HLA-C*07:02
     TIL1	TIL	28832583	HLA-A*02:01	HLA-A*02:01	HLA-B*18:01	HLA-B*38:01	HLA-C*05:01	
     TIL3	TIL	28832583	HLA-A*01:01	HLA-A*23:01	HLA-B*07:02	HLA-B*15:01	HLA-C*12:03	HLA-C*14:02
     Apher1	Leukapheresis	28832583	HLA-A*03:01	HLA-A*29:02	HLA-B*44:02	HLA-B*44:03	HLA-C*12:03	HLA-C*16:01
     Apher6	Leukapheresis	28832583	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
-    pat_AC2	B lymphoblast	27841757	HLA-A*03:01	HLA-A*32:01	HLA-B*27:05	HLA-B*45:01		
-    pat_C	B lymphoblast	27841757	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
-    pat_CELG	B lymphoblast	27841757	HLA-A*02:01	HLA-A*24:02	HLA-B*15:01	HLA-B*73:01	HLA-C*03:03	HLA-C*15:05
-    pat_CP2	B lymphoblast	27841757	HLA-A*11:01		HLA-B*14:02	HLA-B*44:02		
-    pat_FL	B lymphoblast	27841757	HLA-A*03:01	HLA-A*11:01	HLA-B*44:03	HLA-B*50:01		
-    pat_J	B lymphoblast	27841757	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
-    pat_JPB3	B lymphoblast	27841757	HLA-A*02:01	HLA-A*11:01	HLA-B*27:05	HLA-B*56:01		
-    pat_JT2	B lymphoblast	27841757	HLA-A*11:01		HLA-B*18:03	HLA-B*35:01		
-    pat_M	B lymphoblast	27841757	HLA-A*03:01	HLA-A*29:02	HLA-B*08:01	HLA-B*44:03	HLA-C*07:01	HLA-C*16:01
-    pat_MA	B lymphoblast	27841757	HLA-A*02:01	HLA-A*29:02	HLA-B*44:03	HLA-B*57:01	HLA-C*07:01	HLA-C*16:01
-    pat_ML	B lymphoblast	27841757	HLA-A*02:01	HLA-A*11:01	HLA-B*40:01	HLA-B*44:03		
-    pat_NS2	B lymphoblast	27841757	HLA-A*02:01		HLA-B*13:02	HLA-B*41:01		
-    pat_NT	B lymphoblast	27841757	HLA-A*01:01	HLA-A*32:01	HLA-B*08:01			
-    pat_PF1	B lymphoblast	27841757	HLA-A*01:01	HLA-A*02:01	HLA-B*07:02	HLA-B*44:03	HLA-C*07:02	HLA-C*16:01
-    pat_R	B lymphoblast	27841757	HLA-A*03:01	HLA-A*29:02	HLA-B*08:01	HLA-B*44:03	HLA-C*07:01	HLA-C*16:01
-    pat_RT	B lymphoblast	27841757	HLA-A*01:01	HLA-A*02:01	HLA-B*18:01	HLA-B*39:24	HLA-C*05:01	HLA-C*07:01
-    pat_SR	B lymphoblast	27841757	HLA-A*02:01	HLA-A*23:01	HLA-B*18:01	HLA-B*44:03		
-    pat_ST	B lymphoblast	27841757	HLA-A*03:01	HLA-A*24:02	HLA-B*07:02	HLA-B*27:05
+    pat_AC2	B-LCL	27841757	HLA-A*03:01	HLA-A*32:01	HLA-B*27:05	HLA-B*45:01		
+    pat_C	B-LCL	27841757	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
+    pat_CELG	B-LCL	27841757	HLA-A*02:01	HLA-A*24:02	HLA-B*15:01	HLA-B*73:01	HLA-C*03:03	HLA-C*15:05
+    pat_CP2	B-LCL	27841757	HLA-A*11:01		HLA-B*14:02	HLA-B*44:02		
+    pat_FL	B-LCL	27841757	HLA-A*03:01	HLA-A*11:01	HLA-B*44:03	HLA-B*50:01		
+    pat_J	B-LCL	27841757	HLA-A*02:01	HLA-A*03:01	HLA-B*07:02		HLA-C*07:02	
+    pat_JPB3	B-LCL	27841757	HLA-A*02:01	HLA-A*11:01	HLA-B*27:05	HLA-B*56:01		
+    pat_JT2	B-LCL	27841757	HLA-A*11:01		HLA-B*18:03	HLA-B*35:01		
+    pat_M	B-LCL	27841757	HLA-A*03:01	HLA-A*29:02	HLA-B*08:01	HLA-B*44:03	HLA-C*07:01	HLA-C*16:01
+    pat_MA	B-LCL	27841757	HLA-A*02:01	HLA-A*29:02	HLA-B*44:03	HLA-B*57:01	HLA-C*07:01	HLA-C*16:01
+    pat_ML	B-LCL	27841757	HLA-A*02:01	HLA-A*11:01	HLA-B*40:01	HLA-B*44:03		
+    pat_NS2	B-LCL	27841757	HLA-A*02:01		HLA-B*13:02	HLA-B*41:01		
+    pat_NT	B-LCL	27841757	HLA-A*01:01	HLA-A*32:01	HLA-B*08:01			
+    pat_PF1	B-LCL	27841757	HLA-A*01:01	HLA-A*02:01	HLA-B*07:02	HLA-B*44:03	HLA-C*07:02	HLA-C*16:01
+    pat_R	B-LCL	27841757	HLA-A*03:01	HLA-A*29:02	HLA-B*08:01	HLA-B*44:03	HLA-C*07:01	HLA-C*16:01
+    pat_RT	B-LCL	27841757	HLA-A*01:01	HLA-A*02:01	HLA-B*18:01	HLA-B*39:24	HLA-C*05:01	HLA-C*07:01
+    pat_SR	B-LCL	27841757	HLA-A*02:01	HLA-A*23:01	HLA-B*18:01	HLA-B*44:03		
+    pat_ST	B-LCL	27841757	HLA-A*03:01	HLA-A*24:02	HLA-B*07:02	HLA-B*27:05
     """
     info_df = pandas.read_csv(StringIO(info_text), sep="\t", index_col=0)
     info_df.index = info_df.index.str.strip()
@@ -613,59 +633,383 @@ def handle_pmid_31495665(filename):
     return result_df
 
 
-# Add all functions with names like handle_pmid_XXXX to HANDLERS dict.
+def handle_pmid_27869121(filename):
+    """Bassani-Sternberg, ..., Krackhardt Nature Comm. 2016 [PMID 27869121]"""
+    # Although this dataset has class II data also, we are only extracting
+    # class I for now.
+    df = pandas.read_excel(filename, skiprows=1)
+
+    # Taking these from:
+    # Supplementary Table 2: Information of patients selected for neoepitope
+    # identification
+    # For the Mel5 sample, only two-digit alleles are shown (A*01, A*25,
+    # B*08, B*18) so we are skipping that sample for now.
+    hla_df = pandas.DataFrame([
+        ("Mel-8", "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*08:01 HLA-C*07:01 HLA-C*07:02"),
+        ("Mel-12", "HLA-A*01:01 HLA-B*08:01 HLA-C*07:01"),
+        ("Mel-15", "HLA-A*03:01 HLA-A*68:01 HLA-B*27:05 HLA-B*35:03 HLA-C*02:02 HLA-C*04:01"),
+        ("Mel-16", "HLA-A*01:01 HLA-A*24:02 HLA-B*07:02 HLA-B*08:01 HLA-C*07:01 HLA-C*07:02"),
+    ], columns=["sample_id", "hla"]).set_index("sample_id")
+
+    # We assert below that none of the class I hit peptides were found in any
+    # of the class II pull downs.
+    class_ii_cols = [
+        c for c in df.columns if c.endswith("HLA-II (arbitrary units)")
+    ]
+    class_ii_hits = set(df.loc[
+        (df[class_ii_cols].fillna(0.0).sum(1) > 0)
+    ].Sequence.unique())
+
+    results = []
+    for (sample_id, hla) in hla_df.hla.items():
+        intensity_col = "Intensity %s_HLA-I (arbitrary units)" % sample_id
+        sub_df = df.loc[
+            (df[intensity_col].fillna(0.0) > 0)
+        ]
+        filtered_sub_df = sub_df.loc[
+            (~sub_df.Sequence.isin(class_ii_hits))
+        ]
+        peptides = filtered_sub_df.Sequence.unique()
+        assert not any(p in class_ii_hits for p in peptides)
+
+        result_df = pandas.DataFrame({
+            "peptide": peptides,
+        })
+        result_df["sample_id"] = sample_id
+        result_df["hla"] = hla_df.loc[sample_id, "hla"]
+        result_df["pulldown_antibody"] = "W6/32"
+        result_df["format"] = "multiallelic"
+        result_df["mhc_class"] = "I"
+        result_df["sample_type"] = "melanoma_met"
+        result_df["cell_line"] = None
+        results.append(result_df)
+
+    result_df = pandas.concat(results, ignore_index=True)
+    return result_df
+
+
+def handle_pmid_31154438(*filenames):
+    """Shraibman, ..., Admon Mol Cell Proteomics 2019"""
+    # Note: this publication also includes analyses of the secreted HLA
+    # peptidedome (sHLA) but we are using only the data from membrane-bound
+    # HLA.
+    (xls, txt) = sorted(filenames, key=lambda s: not s.endswith(".xlsx"))
+
+    info = pandas.read_excel(xls, skiprows=1)
+    df = pandas.read_csv(txt, sep="\t", skiprows=1)
+
+    hla_df = info.loc[
+        ~info["mHLA tissue sample"].isnull()
+    ].set_index("mHLA tissue sample")[["HLA typing"]]
+
+    def fix_hla(string):
+        result = []
+        alleles = string.split(";")
+        for a in alleles:
+            a = a.strip()
+            if "/" in a:
+                (a1, a2) = a.split("/")
+                a2 = a1[:2] + a2
+                lst = [a1, a2]
+            else:
+                lst = [a]
+            for a in lst:
+                normalized = normalize_allele_name(a)
+                # Ignore class II
+                if normalized[4] in ("A", "B", "C"):
+                    result.append(normalized)
+        return " ".join(result)
+
+    hla_df["hla"] = hla_df["HLA typing"].map(fix_hla)
+
+    results = []
+    for (sample_id, hla) in hla_df.hla.items():
+        intensity_col = "Intensity %s" % sample_id
+        sub_df = df.loc[
+            (df[intensity_col].fillna(0.0) > 0)
+        ]
+        peptides = sub_df.Sequence.unique()
+
+        result_df = pandas.DataFrame({
+            "peptide": peptides,
+        })
+        result_df["sample_id"] = sample_id
+        result_df["hla"] = hla_df.loc[sample_id, "hla"]
+        result_df["pulldown_antibody"] = "W6/32"
+        result_df["format"] = "multiallelic"
+        result_df["mhc_class"] = "I"
+        result_df["sample_type"] = "glioblastoma_tissue"
+        result_df["cell_line"] = None
+        results.append(result_df)
+
+    result_df = pandas.concat(results, ignore_index=True)
+    return result_df
+
+
+def expression_groups(dataset_identifier, df, groups):
+    result_df = pandas.DataFrame(index=df.index)
+    for (label, columns) in groups.items():
+        result_df[label] = df[columns].mean(1)
+    return result_df
+
+
+def handle_expression_GSE113126(*filenames):
+    """
+    Barry, ..., Krummel Nature Medicine 2018 [PMID 29942093]
+
+    This is the melanoma met RNA-seq dataset.
+
+    """
+
+    df = pandas.read_csv(filenames[0], sep="\t", index_col=0)
+    df = df[[]]  # no columns
+
+    for filename in filenames:
+        df[os.path.basename(filename)] = pandas.read_csv(
+            filename, sep="\t", index_col=0)["TPM"]
+
+    assert len(df.columns) == len(filenames)
+
+    groups = {
+        "sample_type:MELANOMA_MET": df.columns.tolist(),
+    }
+    return expression_groups("GSE113126", df, groups)
+
+
+def handle_expression_expression_atlas_22460905(filename):
+    df = pandas.read_csv(filename, sep="\t", skiprows=4, index_col=0)
+    del df["Gene Name"]
+    df.columns = df.columns.str.lower()
+    df = df.fillna(0.0)
+
+    def matches(*strings):
+        return [c for c in df.columns if all(s in c for s in strings)]
+
+    import ipdb ; ipdb.set_trace()
+
+    groups = {
+        "sample_type:B-LCL": (
+            matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")),
+        "sample_type:B-CELL": matches("b-cell"),
+        "sample_type:MELANOMA_CELL_LINE": matches("melanoma"),
+
+        # For GBM tissue we are just using a mixture of cell lines.
+        "sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"),
+
+        "cell_line:THP-1": ["childhood acute monocytic leukemia, thp-1"],
+        "cell_line:HL-60": ["adult acute myeloid leukemia, hl-60"],
+        "cell_line:U-87": ['glioblastoma, u-87 mg'],
+        "cell_line:LNT-229": ['glioblastoma, ln-229'],
+        "cell_line:T98G": ['glioblastoma, t98g'],
+        "cell_line:SK-MEL-5": ['cutaneous melanoma, sk-mel-5'],
+        'cell_line:MEWO': ['melanoma, mewo'],
+        "cell_line:HCC1937": ['breast ductal adenocarcinoma, hcc1937'],
+        "cell_line:HCT116": ['colon carcinoma, hct 116'],
+        "cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'],
+    }
+    return expression_groups("expression_atlas_22460905", df, groups)
+
+
+
+def handle_expression_human_protein_atlas(*filenames):
+    (cell_line_filename,) = [f for f in filenames if "celline" in f]
+    (blood_filename,) = [f for f in filenames if "blood" in f]
+    (gtex_filename,) = [f for f in filenames if "gtex" in f]
+
+    cell_line_df = pandas.read_csv(cell_line_filename, sep="\t")
+    blood_df = pandas.read_csv(blood_filename, sep="\t", index_col=0)
+    gtex_df = pandas.read_csv(gtex_filename, sep="\t")
+
+    cell_line_df = cell_line_df.pivot(
+        index="Gene", columns="Cell line", values="TPM")
+
+    gtex_df = gtex_df.pivot(
+        index="Gene", columns="Tissue", values="TPM")
+
+    result_df = pandas.DataFrame(index=cell_line_df.index)
+
+    result_df["sample_type:pbmc"] = blood_df[
+        [c for c in blood_df.columns if "total PBMC" in c]
+    ].mean(1)
+
+    result_df["cell_line:HEK293"] = cell_line_df['HEK 293']
+    result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226']
+
+    # EXPI293 is based off HEK293
+    result_df["cell_line:EXPI293"] = cell_line_df['HEK 293']
+
+    # For leukapheresis we use pbmc sample
+    result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"]
+
+    for tissue in ["lung", "spleen"]:
+        result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue]
+    return result_df
+
+
+
+
+
+# Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict.
 for (key, value) in list(locals().items()):
     if key.startswith("handle_pmid_"):
-        HANDLERS[key.replace("handle_pmid_", "")] = value
+        PMID_HANDLERS[key.replace("handle_pmid_", "")] = value
+    elif key.startswith("handle_expression_"):
+        EXPRESSION_HANDLERS[key.replace("handle_expression_", "")] = value
 
 
 def run():
     args = parser.parse_args(sys.argv[1:])
 
-    dfs = []
-    for (i, item_tpl) in enumerate(args.item):
+    expression_dfs = []
+    for (i, item_tpl) in enumerate(args.expression_item):
+        (label, filenames) = (item_tpl[0], item_tpl[1:])
+        label = label.replace("-", "_")
+        print(
+            "Processing expression item %d of %d" % (i + 1, len(args.expression_item)),
+            label,
+            *[os.path.abspath(f) for f in filenames])
+
+        expression_df = None
+        handler = None
+        if label in EXPRESSION_HANDLERS:
+            handler = EXPRESSION_HANDLERS[label]
+            expression_df = handler(*filenames)
+        elif args.debug:
+            debug(*filenames)
+        else:
+            raise NotImplementedError(label)
+
+        if expression_df is not None:
+            print(
+                "Processed expression data",
+                label,
+                "with shape",
+                expression_df.shape)
+            print(*expression_df.columns)
+            expression_dfs.append(expression_df)
+
+    expression_df = expression_dfs[0]
+    for other in expression_dfs[1:]:
+        expression_df = pandas.merge(
+            expression_df, other, how='outer', left_index=True, right_index=True)
+    expression_df = expression_df.fillna(0)
+
+    print(
+        "Genes in each expression dataframe: ",
+        *[len(e) for e in expression_dfs])
+    print("Genes in merged expression dataframe", len(expression_df))
+
+    ms_dfs = []
+    for (i, item_tpl) in enumerate(args.ms_item):
         (pmid, filenames) = (item_tpl[0], item_tpl[1:])
         print(
-            "Processing item %d / %d" % (i + 1, len(args.item)),
+            "Processing MS item %d of %d" % (i + 1, len(args.ms_item)),
             pmid,
             *[os.path.abspath(f) for f in filenames])
 
-        df = None
+        ms_df = None
         handler = None
-        if pmid in HANDLERS:
-            handler = HANDLERS[pmid]
-            df = handler(*filenames)
+        if pmid in PMID_HANDLERS:
+            handler = PMID_HANDLERS[pmid]
+            ms_df = handler(*filenames)
         elif args.debug:
             debug(*filenames)
         else:
-            raise NotImplementedError(args.pmid)
-
-        if df is not None:
-            df["pmid"] = pmid
-            if "original_pmid" not in df.columns:
-                df["original_pmid"] = pmid
-            df = df.applymap(str).applymap(str.upper)
-            print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
+            raise NotImplementedError(pmid)
+
+        if ms_df is not None:
+            ms_df["pmid"] = pmid
+            if "original_pmid" not in ms_df.columns:
+                ms_df["original_pmid"] = pmid
+            if "expression_dataset" not in ms_df.columns:
+                ms_df["expression_dataset"] = ""
+            ms_df = ms_df.applymap(str).applymap(str.upper)
+            ms_df["sample_id"] = ms_df.sample_id.str.replace(" ", "")
+            print("*** PMID %s: %d peptides ***" % (pmid, len(ms_df)))
             if handler is not None:
                 print(handler.__doc__)
             print("Counts by sample id:")
-            print(df.groupby("sample_id").peptide.nunique())
+            print(ms_df.groupby("sample_id").peptide.nunique())
             print("")
             print("Counts by sample type:")
-            print(df.groupby("sample_type").peptide.nunique())
+            print(ms_df.groupby("sample_type").peptide.nunique())
             print("****************************")
 
-            dfs.append(df)
+            for value in ms_df.expression_dataset.unique():
+                if value and value not in expression_df.columns:
+                    raise ValueError("No such expression dataset", value)
 
-    df = pandas.concat(dfs, ignore_index=True, sort=False)
+            ms_dfs.append(ms_df)
 
-    df["cell_line"] = df["cell_line"].fillna("")
+    ms_df = pandas.concat(ms_dfs, ignore_index=True, sort=False)
+    ms_df["cell_line"] = ms_df["cell_line"].fillna("")
 
-    cols = ["pmid", "sample_id", "peptide", "format", "mhc_class", "hla", ]
-    cols += [c for c in sorted(df.columns) if c not in cols]
-    df = df[cols]
+    sample_table = ms_df[
+        ["sample_id", "pmid", "expression_dataset", "cell_line", "sample_type"]
+    ].drop_duplicates().set_index("sample_id")
 
-    null_df = df.loc[df.isnull().any(1)]
+    sample_id_to_expression_dataset = sample_table.expression_dataset.to_dict()
+    for (sample_id, value) in sorted(sample_id_to_expression_dataset.items()):
+        if value:
+            print("Expression dataset for sample", sample_id, "already assigned")
+            continue
+        cell_line_col = "cell_line:" + sample_table.loc[sample_id, "cell_line"]
+        sample_type_col = "sample_type:" + (
+            sample_table.loc[sample_id, "sample_type"])
+
+        expression_dataset = None
+        for col in [cell_line_col, sample_type_col]:
+            if col in expression_df.columns:
+                expression_dataset = col
+                break
+
+        if not expression_dataset:
+            print("*" * 20)
+            print("No expression dataset for sample ", sample_id)
+            print("Sample info:")
+            print(sample_table.loc[sample_id])
+            print("*" * 20)
+
+        sample_id_to_expression_dataset[sample_id] = expression_dataset
+        print(
+            "Sample", sample_id, "assigned exp. dataset", expression_dataset)
+
+    print("Expression dataset usage:")
+    print(pandas.Series(sample_id_to_expression_dataset).value_counts())
+
+    missing = [
+        key for (key, value) in
+        sample_id_to_expression_dataset.items()
+        if value is None
+    ]
+    if missing:
+        print("Missing expression data for samples", *missing)
+        print(
+            "Missing cell lines: ",
+            *sample_table.loc[missing, "cell_line"].dropna().drop_duplicates().tolist())
+        print("Missing sample types: ", *sample_table.loc[
+            missing, "sample_type"].dropna().drop_duplicates().tolist())
+        if args.debug:
+            import ipdb; ipdb.set_trace()
+        else:
+            raise ValueError("Missing expression data for samples: ", missing)
+
+    ms_df["expression_dataset"] = ms_df.sample_id.map(
+        sample_id_to_expression_dataset)
+
+    cols = [
+        "pmid",
+        "sample_id",
+        "peptide",
+        "format",
+        "mhc_class",
+        "hla",
+        "expression_dataset",
+    ]
+    cols += [c for c in sorted(ms_df.columns) if c not in cols]
+    ms_df = ms_df[cols]
+
+    null_df = ms_df.loc[ms_df.isnull().any(1)]
     if len(null_df) > 0:
         print("Nulls:")
         print(null_df)
@@ -673,11 +1017,14 @@ def run():
         print("No nulls.")
 
     # Each sample should be coming from only one experiment.
-    assert df.groupby("sample_id").pmid.nunique().max() == 1, (
-        df.groupby("sample_id").pmid.nunique().sort_values())
+    assert ms_df.groupby("sample_id").pmid.nunique().max() == 1, (
+        ms_df.groupby("sample_id").pmid.nunique().sort_values())
+
+    expression_df.to_csv(args.expression_out, index=True)
+    print("Wrote: %s" % os.path.abspath(args.expression_out))
 
-    df.to_csv(args.out, index=False)
-    print("Wrote: %s" % os.path.abspath(args.out))
+    ms_df.to_csv(args.ms_out, index=False)
+    print("Wrote: %s" % os.path.abspath(args.ms_out))
 
 if __name__ == '__main__':
     run()
diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh
index e566526a..f10b85b6 100755
--- a/downloads-generation/data_published/GENERATE.sh
+++ b/downloads-generation/data_published/GENERATE.sh
@@ -33,7 +33,7 @@ wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.200
 wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt
 wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt
 
-mkdir raw
+mkdir ms
 
 ############################################
 # MS: Multiallelic class I
@@ -43,10 +43,10 @@ mkdir raw
 #   Pearson, ..., Perreault JCI 2016 [PMID 27841757]
 # but was reanalyzed in this work, and we download the reanalyzed version here.
 PMID=28832583
-mkdir -p raw/$PMID
-wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P raw/$PMID # data generated in this work
-wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P raw/$PMID # data reanalyzed in this work
-cd raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P ms/$PMID # data generated in this work
+wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P ms/$PMID # data reanalyzed in this work
+cd ms/$PMID
 unzip *.s002
 unzip *.s003
 mkdir saved
@@ -58,33 +58,33 @@ cd ../..
 
 # Bassani-Sternberg, ..., Mann Mol Cell Proteomics 2015 [PMID 25576301]
 PMID=25576301
-mkdir -p raw/$PMID
-wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P ms/$PMID
 
 # Mommen, ..., Heck PNAS 2014 [PMID 24616531]
 PMID=24616531
-mkdir -p raw/$PMID
-wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P ms/$PMID
 
 # Gloger, ..., Neri Cancer Immunol Immunother 2016 [PMID 27600516]
 # Data extracted from supplemental PDF table.
 PMID=27600516
-mkdir -p raw/$PMID
-wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P ms/$PMID
 
 # Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]
 # Supplemental zip downloaded from publication
 PMID=26992070
-mkdir -p raw/$PMID
-wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID
-cd raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P ms/$PMID
+cd ms/$PMID
 unzip pmic12297-sup-0001-supinfo.zip
 cd ../..
 
 # Shraibman, ..., Admon Mol Cell Proteomics	2016 [PMID 27412690]
 PMID=27412690
-mkdir -p raw/$PMID
-wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P ms/$PMID
 
 # Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
 # Note: we do not use the original data from this publicaton, we use 28832583's reanalysis of it.
@@ -92,17 +92,59 @@ wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1
 
 # Hassan, ..., van Veelen Mol Cell Proteomics 2015 [PMID 23481700]
 PMID=23481700
-mkdir -p raw/$PMID
-wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P ms/$PMID
+
+# Shraibman, ..., Admon Mol Cell Proteomics 2019 [PMID 31154438]
+PMID=31154438
+mkdir -p ms/$PMID
+wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/3/zjw006195963st2.txt -P ms/$PMID
+wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/1/zjw006195963st1.xlsx -P ms/$PMID
+
+# Bassani-Sternberg, ..., Krackhardt Nature Comm. 2016 [PMID 27869121]
+PMID=27869121
+mkdir -p ms/$PMID
+wget -q "https://static-content.springer.com/esm/art%3A10.1038%2Fncomms13404/MediaObjects/41467_2016_BFncomms13404_MOESM1318_ESM.xlsx" -P ms/$PMID
 
 ############################################
 # MS: Monoallelic class II
 ############################################
 # Abelin, ..., Rooney Immunity 2019 [PMID 31495665]
 PMID=31495665
-mkdir -p raw/$PMID
-wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P raw/$PMID
+mkdir -p ms/$PMID
+wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P ms/$PMID
 
+############################################
+# RNA-seq expression data (TPMs)
+############################################
+# CCLE as processed by expression atlas
+DATASET=expression-atlas-22460905
+mkdir -p expression/$DATASET
+wget -q https://www.ebi.ac.uk/gxa/experiments-content/E-MTAB-2770/resources/ExperimentDownloadSupplier.RnaSeqBaseline/tpms.tsv -P expression/$DATASET
+
+# Human protein atlas
+DATASET=human-protein-atlas
+mkdir -p expression/$DATASET
+cd expression/$DATASET
+wget -q https://www.proteinatlas.org/download/rna_celline.tsv.zip
+wget -q https://www.proteinatlas.org/download/rna_blood_cell_sample_tpm_m.tsv.zip
+wget -q https://www.proteinatlas.org/download/rna_tissue_gtex.tsv.zip
+for i in $(ls *.zip)
+do
+    unzip $i
+    rm $i
+done
+cd ../..
+
+# Melanoma. Original publication
+# Barry, ..., Krummel Nature Medicine 2018 [PMID 29942093].
+DATASET=GSE113126
+mkdir -p expression/$DATASET 
+cd expression/$DATASET
+wget -q "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE113126&format=file" -O GSE113126_RAW.tar
+tar -xvf GSE113126_RAW.tar
+rm GSE113126_RAW.tar
+cd ../..
 
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
diff --git a/downloads-generation/models_class1_pan_variants/GENERATE.sh b/downloads-generation/models_class1_pan_variants/GENERATE.sh
index bda713e2..48da9927 100755
--- a/downloads-generation/models_class1_pan_variants/GENERATE.sh
+++ b/downloads-generation/models_class1_pan_variants/GENERATE.sh
@@ -62,15 +62,15 @@ if [ "$2" != "continue-incomplete" ]
 then
     cp $SCRIPT_DIR/generate_hyperparameters.production.py .
     cp $SCRIPT_DIR/generate_hyperparameters.py .
-    python generate_hyperparameters.production.py > hyperparameters.production.json
-    python generate_hyperparameters.py hyperparameters.production.json no_pretrain > hyperparameters.no_pretrain.yaml
+    python generate_hyperparameters.production.py > hyperparameters.production.yaml
+    python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
     python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
 fi
 
 for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
 do
     CONTINUE_INCOMPLETE_ARGS=""
-    if [ "$2" == "continue-incomplete" ] && [ -d "models.${kind}" ]
+    if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ]
     then
         echo "Will continue existing run: $kind"
         CONTINUE_INCOMPLETE_ARGS="--continue-incomplete"
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 4089fb78..36eff633 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -37,10 +37,6 @@ releases:
               url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2
               default: false
 
-            - name: data_expression
-              url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_expression.20191009.tar.bz2
-              default: false
-
             - name: data_references
               url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
               default: false
@@ -62,7 +58,7 @@ releases:
               default: false
 
             - name: data_published
-              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190924.tar.bz2
+              url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_published.20191011.tar.bz2
               default: false
 
             - name: data_curated
-- 
GitLab