From 9152bada5236aa9451af96179f937562273ffe87 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 11 Oct 2019 15:53:53 -0400
Subject: [PATCH] fixes

---
 downloads-generation/data_curated/GENERATE.sh |   3 +-
 .../data_curated/curate_ms_by_pmid.py         | 230 ++++++++++++------
 .../data_mass_spec_annotated/GENERATE.sh      |   2 +-
 mhcflurry/downloads.yml                       |   2 +-
 4 files changed, 159 insertions(+), 78 deletions(-)

diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh
index b35d01fc..eb7b5a73 100755
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -49,7 +49,8 @@ done
 
 time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \
     --ms-out ms.nontraining_curated.by_pmid.csv \
-    --expression-out rna_expression.csv
+    --expression-out rna_expression.csv \
+    --expression-metadata-out rna_expression.metadata.csv
 
 bzip2 ms.nontraining_curated.by_pmid.csv
 bzip2 rna_expression.csv
diff --git a/downloads-generation/data_curated/curate_ms_by_pmid.py b/downloads-generation/data_curated/curate_ms_by_pmid.py
index 5406e238..faf68811 100755
--- a/downloads-generation/data_curated/curate_ms_by_pmid.py
+++ b/downloads-generation/data_curated/curate_ms_by_pmid.py
@@ -5,6 +5,7 @@ optionally including eluted peptides identified by mass-spec.
 import sys
 import argparse
 import os
+import json
 import collections
 from six.moves import StringIO
 
@@ -44,6 +45,10 @@ parser.add_argument(
     "--expression-out",
     metavar="OUT.csv",
     help="Out file path (RNA-seq expression)")
+parser.add_argument(
+    "--expression-metadata-out",
+    metavar="OUT.csv",
+    help="Out file path for expression metadata, i.e. which samples used")
 parser.add_argument(
     "--debug",
     action="store_true",
@@ -421,6 +426,42 @@ def handle_pmid_28832583(*filenames):
     return result_df
 
 
+PMID_31495665_SAMPLE_TYPES = {
+        "HLA-DR_Lung": "lung",
+        "HLA-DR_PBMC_HDSC": "pbmc",
+        "HLA-DR_PBMC_RG1095": "pbmc",
+        "HLA-DR_PBMC_RG1104": "pbmc",
+        "HLA-DR_PBMC_RG1248": "pbmc",
+        "HLA-DR_Spleen": "spleen",
+        "MAPTAC_A*02:01": "mix:a375,expi293,hek293,hela",
+        "MAPTAC_A*11:01": "mix:expi293,hela",
+        "MAPTAC_A*32:01": "mix:a375,expi293,hela",
+        "MAPTAC_B*07:02": "mix:a375,expi293,hela",
+        "MAPTAC_B*45:01": "expi293",
+        "MAPTAC_B*52:01": "mix:a375,expi293",
+        "MAPTAC_C*03:03": "expi293",
+        "MAPTAC_C*06:02": "mix:a375,expi293",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
+        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293",
+        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
+        "MAPTAC_DRB1*01:01": "mix:a375,b721,expi293,kg1,k562",
+        "MAPTAC_DRB1*03:01": "expi293",
+        "MAPTAC_DRB1*04:01": "expi293",
+        "MAPTAC_DRB1*07:01": "mix:expi293,hek293",
+        "MAPTAC_DRB1*11:01": "mix:expi293,k562,kg1",
+        "MAPTAC_DRB1*12:01_dm+": "expi293",
+        "MAPTAC_DRB1*12:01_dm-": "expi293",
+        "MAPTAC_DRB1*15:01": "expi293",
+        "MAPTAC_DRB3*01:01_dm+": "expi293",
+        "MAPTAC_DRB3*01:01_dm-": "expi293",
+}
+CELL_LINE_MIXTURES = sorted(
+    set(
+        x for x in PMID_31495665_SAMPLE_TYPES.values()
+        if x.startswith("mix:")))
+
+
 def handle_pmid_31495665(filename):
     """Abelin, ..., Rooney Immunity 2019 [PMID 31495665]"""
     hla_type = {
@@ -561,56 +602,27 @@ def handle_pmid_31495665(filename):
         "MAPTAC_A*11:01": "",
         "MAPTAC_A*32:01": "",
         "MAPTAC_B*07:02": "",
-        "MAPTAC_B*45:01": "",
+        "MAPTAC_B*45:01": "expi293",
         "MAPTAC_B*52:01": "",
-        "MAPTAC_C*03:03": "",
+        "MAPTAC_C*03:03": "expi293",
         "MAPTAC_C*06:02": "",
         "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
         "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
         "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293",  # don't actually see this in DataS1A!
         "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
         "MAPTAC_DRB1*01:01": "",
-        "MAPTAC_DRB1*03:01": "",
-        "MAPTAC_DRB1*04:01": "",
+        "MAPTAC_DRB1*03:01": "expi293",
+        "MAPTAC_DRB1*04:01": "expi293",
         "MAPTAC_DRB1*07:01": "",
         "MAPTAC_DRB1*11:01": "",
-        "MAPTAC_DRB1*12:01_dm+": "",
-        "MAPTAC_DRB1*12:01_dm-": "",
-        "MAPTAC_DRB1*15:01": "",
-        "MAPTAC_DRB3*01:01_dm+": "",
-        "MAPTAC_DRB3*01:01_dm-": "",
-    }
-    sample_type = {
-        "HLA-DR_Lung": "lung",
-        "HLA-DR_PBMC_HDSC": "lung",
-        "HLA-DR_PBMC_RG1095": "lung",
-        "HLA-DR_PBMC_RG1104": "lung",
-        "HLA-DR_PBMC_RG1248": "lung",
-        "HLA-DR_Spleen": "spleen",
-        "MAPTAC_A*02:01": "mixed",
-        "MAPTAC_A*11:01": "mixed",
-        "MAPTAC_A*32:01": "mixed",
-        "MAPTAC_B*07:02": "mixed",
-        "MAPTAC_B*45:01": "mixed",
-        "MAPTAC_B*52:01": "mixed",
-        "MAPTAC_C*03:03": "mixed",
-        "MAPTAC_C*06:02": "mixed",
-        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "mixed",
-        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "mixed",
-        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "mixed",
-        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "mixed",
-        "MAPTAC_DRB1*01:01": "mixed",
-        "MAPTAC_DRB1*03:01": "mixed",
-        "MAPTAC_DRB1*04:01": "mixed",
-        "MAPTAC_DRB1*07:01": "mixed",
-        "MAPTAC_DRB1*11:01": "mixed",
-        "MAPTAC_DRB1*12:01_dm+": "mixed",
-        "MAPTAC_DRB1*12:01_dm-": "mixed",
-        "MAPTAC_DRB1*15:01": "mixed",
-        "MAPTAC_DRB3*01:01_dm+": "mixed",
-        "MAPTAC_DRB3*01:01_dm-": "mixed",
+        "MAPTAC_DRB1*12:01_dm+": "expi293",
+        "MAPTAC_DRB1*12:01_dm-": "expi293",
+        "MAPTAC_DRB1*15:01": "expi293",
+        "MAPTAC_DRB3*01:01_dm+": "expi293",
+        "MAPTAC_DRB3*01:01_dm-": "expi293",
     }
 
+
     df = pandas.read_excel(filename, sheet_name="DataS1B")
     results = []
     for sample_id in df.columns:
@@ -626,7 +638,7 @@ def handle_pmid_31495665(filename):
         result_df["pulldown_antibody"] = pulldown_antibody[sample_id]
         result_df["format"] = format[sample_id]
         result_df["mhc_class"] = mhc_class[sample_id]
-        result_df["sample_type"] = sample_type[sample_id]
+        result_df["sample_type"] = PMID_31495665_SAMPLE_TYPES[sample_id]
         result_df["cell_line"] = cell_line[sample_id]
         results.append(result_df)
     result_df = pandas.concat(results, ignore_index=True)
@@ -745,11 +757,18 @@ def handle_pmid_31154438(*filenames):
     result_df = pandas.concat(results, ignore_index=True)
     return result_df
 
+EXPRESSION_GROUPS_ROWS = []
 
-def expression_groups(dataset_identifier, df, groups):
+
+def make_expression_groups(dataset_identifier, df, groups):
     result_df = pandas.DataFrame(index=df.index)
     for (label, columns) in groups.items():
+        for col in columns:
+            if col not in df.columns:
+                raise ValueError(
+                    "Missing: %s. Available: %s" % (col, df.columns.tolist()))
         result_df[label] = df[columns].mean(1)
+        EXPRESSION_GROUPS_ROWS.append((dataset_identifier, label, columns))
     return result_df
 
 
@@ -773,7 +792,7 @@ def handle_expression_GSE113126(*filenames):
     groups = {
         "sample_type:MELANOMA_MET": df.columns.tolist(),
     }
-    return expression_groups("GSE113126", df, groups)
+    return [make_expression_groups("GSE113126", df, groups)]
 
 
 def handle_expression_expression_atlas_22460905(filename):
@@ -785,13 +804,17 @@ def handle_expression_expression_atlas_22460905(filename):
     def matches(*strings):
         return [c for c in df.columns if all(s in c for s in strings)]
 
-    import ipdb ; ipdb.set_trace()
-
     groups = {
         "sample_type:B-LCL": (
             matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")),
         "sample_type:B-CELL": matches("b-cell"),
+        "sample_type:B721-LIKE": matches("b-cell"),
         "sample_type:MELANOMA_CELL_LINE": matches("melanoma"),
+        "sample_type:A375-LIKE": matches("melanoma"),
+        "sample_type:KG1-LIKE": matches("myeloid leukemia"),
+
+        # Using a fibrosarcoma cell line for our fibroblast sample.
+        "sample_type:FIBROBLAST": ['fibrosarcoma, ht-1080'],
 
         # For GBM tissue we are just using a mixture of cell lines.
         "sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"),
@@ -807,8 +830,7 @@ def handle_expression_expression_atlas_22460905(filename):
         "cell_line:HCT116": ['colon carcinoma, hct 116'],
         "cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'],
     }
-    return expression_groups("expression_atlas_22460905", df, groups)
-
+    return [make_expression_groups("expression_atlas_22460905", df, groups)]
 
 
 def handle_expression_human_protein_atlas(*filenames):
@@ -826,27 +848,67 @@ def handle_expression_human_protein_atlas(*filenames):
     gtex_df = gtex_df.pivot(
         index="Gene", columns="Tissue", values="TPM")
 
-    result_df = pandas.DataFrame(index=cell_line_df.index)
-
-    result_df["sample_type:pbmc"] = blood_df[
-        [c for c in blood_df.columns if "total PBMC" in c]
-    ].mean(1)
-
-    result_df["cell_line:HEK293"] = cell_line_df['HEK 293']
-    result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226']
-
-    # EXPI293 is based off HEK293
-    result_df["cell_line:EXPI293"] = cell_line_df['HEK 293']
-
-    # For leukapheresis we use pbmc sample
-    result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"]
-
-    for tissue in ["lung", "spleen"]:
-        result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue]
-    return result_df
-
+    return [
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(blood_filename),
+            blood_df,
+            groups={
+                "sample_type:PBMC": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+
+                # for samples labeled leukapheresis we also use PBMC
+                "sample_type:LEUKAPHERESIS": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+
+                # for samples labeled TIL we are also using PBMC
+                "sample_type:TIL": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+            }),
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(cell_line_filename),
+            cell_line_df,
+            groups={
+                "cell_line:HELA": ['HeLa'],
+                "cell_line:K562": ["K-562"],
+                "cell_line:HEK293": ['HEK 293'],
+                "cell_line:RPMI8226": ['RPMI-8226'],
+                "cell_line:EXPI293": ['HEK 293'],  # EXPI293 derived from HEK293
+            }),
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(gtex_filename),
+            gtex_df,
+            groups={
+                "sample_type:LUNG": ["lung"],
+                "sample_type:SPLEEN": ["spleen"],
+            }),
+    ]
 
 
+def make_expression_mixtures(expression_df):
+    global CELL_LINE_MIXTURES
+    groups = {}
+    for mix in CELL_LINE_MIXTURES:
+        components = []
+        for item in mix.replace("mix:", "").upper().split(","):
+            if "cell_line:%s" % item in expression_df.columns:
+                components.append("cell_line:%s" % item)
+            else:
+                print("No cell line, falling back on similar: ", item)
+                components.append("sample_type:%s-LIKE" % item)
+        groups["sample_type:" + mix.upper()] = components
+    missing = set()
+    for some in groups.values():
+        for item in some:
+            if item not in expression_df.columns:
+                missing.add(item)
+    if missing:
+        raise ValueError(
+            "Missing [%d]: %s. Available: %s" % (
+                len(missing), missing, expression_df.columns.tolist()))
+    return make_expression_groups("mixtures", expression_df, groups)
 
 
 # Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict.
@@ -869,36 +931,44 @@ def run():
             label,
             *[os.path.abspath(f) for f in filenames])
 
-        expression_df = None
+        expression_dfs_for_item = []
         handler = None
         if label in EXPRESSION_HANDLERS:
             handler = EXPRESSION_HANDLERS[label]
-            expression_df = handler(*filenames)
+            expression_dfs_for_item = handler(*filenames)
         elif args.debug:
             debug(*filenames)
         else:
             raise NotImplementedError(label)
 
-        if expression_df is not None:
+        if expression_dfs_for_item:
             print(
                 "Processed expression data",
                 label,
-                "with shape",
-                expression_df.shape)
-            print(*expression_df.columns)
-            expression_dfs.append(expression_df)
+                "result dataframes",
+                len(expression_dfs_for_item))
+            print(*[e.columns for e in expression_dfs_for_item])
+            expression_dfs.extend(expression_dfs_for_item)
 
     expression_df = expression_dfs[0]
     for other in expression_dfs[1:]:
         expression_df = pandas.merge(
             expression_df, other, how='outer', left_index=True, right_index=True)
-    expression_df = expression_df.fillna(0)
 
-    print(
-        "Genes in each expression dataframe: ",
+    print("Genes in each expression dataframe: ",
         *[len(e) for e in expression_dfs])
     print("Genes in merged expression dataframe", len(expression_df))
 
+    if CELL_LINE_MIXTURES:
+        print("Generating cell line mixtures.")
+        expression_mixture_df = make_expression_mixtures(expression_df)
+        expression_df = pandas.merge(
+            expression_df,
+            expression_mixture_df,
+            how='outer',
+            left_index=True,
+            right_index=True)
+
     ms_dfs = []
     for (i, item_tpl) in enumerate(args.ms_item):
         (pmid, filenames) = (item_tpl[0], item_tpl[1:])
@@ -1026,5 +1096,15 @@ def run():
     ms_df.to_csv(args.ms_out, index=False)
     print("Wrote: %s" % os.path.abspath(args.ms_out))
 
+    if args.expression_metadata_out is not None:
+        expression_metadata_df = pandas.DataFrame(
+            EXPRESSION_GROUPS_ROWS,
+            columns=["expression_dataset", "label", "samples"])
+        expression_metadata_df["samples"] = expression_metadata_df[
+            "samples"
+        ].map(json.dumps)
+        expression_metadata_df.to_csv(args.expression_metadata_out, index=False)
+        print("Wrote: %s" % os.path.abspath(args.expression_metadata_out))
+
 if __name__ == '__main__':
     run()
diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
index 6fd95390..e9c52381 100755
--- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
@@ -27,7 +27,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_DIR/annotate.py .
 
-PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+PEPTIDES=$(mhcflurry-downloads path data_curated)/ms.nontraining_curated.by_pmid.csv.bz2
 REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 
 python annotate.py \
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 36eff633..7c268d3c 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -62,7 +62,7 @@ releases:
               default: false
 
             - name: data_curated
-              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190927.tar.bz2
+              url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2
               default: true
 
             # Older downloads
-- 
GitLab