Commit 9152bada authored by Tim O'Donnell

fixes

parent b7087aa8
@@ -49,7 +49,8 @@ done
 time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \
     --ms-out ms.nontraining_curated.by_pmid.csv \
-    --expression-out rna_expression.csv
+    --expression-out rna_expression.csv \
+    --expression-metadata-out rna_expression.metadata.csv
 bzip2 ms.nontraining_curated.by_pmid.csv
 bzip2 rna_expression.csv
...
@@ -5,6 +5,7 @@ optionally including eluted peptides identified by mass-spec.
 import sys
 import argparse
 import os
+import json
 import collections
 from six.moves import StringIO
@@ -44,6 +45,10 @@ parser.add_argument(
     "--expression-out",
     metavar="OUT.csv",
     help="Out file path (RNA-seq expression)")
+parser.add_argument(
+    "--expression-metadata-out",
+    metavar="OUT.csv",
+    help="Out file path for expression metadata, i.e. which samples used")
 parser.add_argument(
     "--debug",
     action="store_true",
@@ -421,6 +426,42 @@ def handle_pmid_28832583(*filenames):
     return result_df


+PMID_31495665_SAMPLE_TYPES = {
+    "HLA-DR_Lung": "lung",
+    "HLA-DR_PBMC_HDSC": "pbmc",
+    "HLA-DR_PBMC_RG1095": "pbmc",
+    "HLA-DR_PBMC_RG1104": "pbmc",
+    "HLA-DR_PBMC_RG1248": "pbmc",
+    "HLA-DR_Spleen": "spleen",
+    "MAPTAC_A*02:01": "mix:a375,expi293,hek293,hela",
+    "MAPTAC_A*11:01": "mix:expi293,hela",
+    "MAPTAC_A*32:01": "mix:a375,expi293,hela",
+    "MAPTAC_B*07:02": "mix:a375,expi293,hela",
+    "MAPTAC_B*45:01": "expi293",
+    "MAPTAC_B*52:01": "mix:a375,expi293",
+    "MAPTAC_C*03:03": "expi293",
+    "MAPTAC_C*06:02": "mix:a375,expi293",
+    "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
+    "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
+    "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293",
+    "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
+    "MAPTAC_DRB1*01:01": "mix:a375,b721,expi293,kg1,k562",
+    "MAPTAC_DRB1*03:01": "expi293",
+    "MAPTAC_DRB1*04:01": "expi293",
+    "MAPTAC_DRB1*07:01": "mix:expi293,hek293",
+    "MAPTAC_DRB1*11:01": "mix:expi293,k562,kg1",
+    "MAPTAC_DRB1*12:01_dm+": "expi293",
+    "MAPTAC_DRB1*12:01_dm-": "expi293",
+    "MAPTAC_DRB1*15:01": "expi293",
+    "MAPTAC_DRB3*01:01_dm+": "expi293",
+    "MAPTAC_DRB3*01:01_dm-": "expi293",
+}
+
+CELL_LINE_MIXTURES = sorted(
+    set(
+        x for x in PMID_31495665_SAMPLE_TYPES.values()
+        if x.startswith("mix:")))
+
+
 def handle_pmid_31495665(filename):
     """Abelin, ..., Rooney Immunity 2019 [PMID 31495665]"""
     hla_type = {
@@ -561,56 +602,27 @@ def handle_pmid_31495665(filename):
         "MAPTAC_A*11:01": "",
         "MAPTAC_A*32:01": "",
         "MAPTAC_B*07:02": "",
-        "MAPTAC_B*45:01": "",
+        "MAPTAC_B*45:01": "expi293",
         "MAPTAC_B*52:01": "",
-        "MAPTAC_C*03:03": "",
+        "MAPTAC_C*03:03": "expi293",
         "MAPTAC_C*06:02": "",
         "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
         "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
         "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293", # don't actually see this in DataS1A!
         "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
         "MAPTAC_DRB1*01:01": "",
-        "MAPTAC_DRB1*03:01": "",
-        "MAPTAC_DRB1*04:01": "",
+        "MAPTAC_DRB1*03:01": "expi293",
+        "MAPTAC_DRB1*04:01": "expi293",
         "MAPTAC_DRB1*07:01": "",
         "MAPTAC_DRB1*11:01": "",
-        "MAPTAC_DRB1*12:01_dm+": "",
-        "MAPTAC_DRB1*12:01_dm-": "",
-        "MAPTAC_DRB1*15:01": "",
-        "MAPTAC_DRB3*01:01_dm+": "",
-        "MAPTAC_DRB3*01:01_dm-": "",
+        "MAPTAC_DRB1*12:01_dm+": "expi293",
+        "MAPTAC_DRB1*12:01_dm-": "expi293",
+        "MAPTAC_DRB1*15:01": "expi293",
+        "MAPTAC_DRB3*01:01_dm+": "expi293",
+        "MAPTAC_DRB3*01:01_dm-": "expi293",
     }
-    sample_type = {
-        "HLA-DR_Lung": "lung",
-        "HLA-DR_PBMC_HDSC": "lung",
-        "HLA-DR_PBMC_RG1095": "lung",
-        "HLA-DR_PBMC_RG1104": "lung",
-        "HLA-DR_PBMC_RG1248": "lung",
-        "HLA-DR_Spleen": "spleen",
-        "MAPTAC_A*02:01": "mixed",
-        "MAPTAC_A*11:01": "mixed",
-        "MAPTAC_A*32:01": "mixed",
-        "MAPTAC_B*07:02": "mixed",
-        "MAPTAC_B*45:01": "mixed",
-        "MAPTAC_B*52:01": "mixed",
-        "MAPTAC_C*03:03": "mixed",
-        "MAPTAC_C*06:02": "mixed",
-        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "mixed",
-        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "mixed",
-        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "mixed",
-        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "mixed",
-        "MAPTAC_DRB1*01:01": "mixed",
-        "MAPTAC_DRB1*03:01": "mixed",
-        "MAPTAC_DRB1*04:01": "mixed",
-        "MAPTAC_DRB1*07:01": "mixed",
-        "MAPTAC_DRB1*11:01": "mixed",
-        "MAPTAC_DRB1*12:01_dm+": "mixed",
-        "MAPTAC_DRB1*12:01_dm-": "mixed",
-        "MAPTAC_DRB1*15:01": "mixed",
-        "MAPTAC_DRB3*01:01_dm+": "mixed",
-        "MAPTAC_DRB3*01:01_dm-": "mixed",
-    }
     df = pandas.read_excel(filename, sheet_name="DataS1B")
     results = []
     for sample_id in df.columns:
@@ -626,7 +638,7 @@ def handle_pmid_31495665(filename):
         result_df["pulldown_antibody"] = pulldown_antibody[sample_id]
         result_df["format"] = format[sample_id]
         result_df["mhc_class"] = mhc_class[sample_id]
-        result_df["sample_type"] = sample_type[sample_id]
+        result_df["sample_type"] = PMID_31495665_SAMPLE_TYPES[sample_id]
         result_df["cell_line"] = cell_line[sample_id]
         results.append(result_df)
     result_df = pandas.concat(results, ignore_index=True)
@@ -745,11 +757,18 @@ def handle_pmid_31154438(*filenames):
     result_df = pandas.concat(results, ignore_index=True)
     return result_df


-def expression_groups(dataset_identifier, df, groups):
+EXPRESSION_GROUPS_ROWS = []
+
+
+def make_expression_groups(dataset_identifier, df, groups):
     result_df = pandas.DataFrame(index=df.index)
     for (label, columns) in groups.items():
+        for col in columns:
+            if col not in df.columns:
+                raise ValueError(
+                    "Missing: %s. Available: %s" % (col, df.columns.tolist()))
         result_df[label] = df[columns].mean(1)
+        EXPRESSION_GROUPS_ROWS.append((dataset_identifier, label, columns))
     return result_df
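For reference, the new make_expression_groups helper just averages the named columns of a genes-by-samples matrix for each label, after checking that every requested column exists. A minimal standalone sketch of that behavior (toy gene and sample names invented here; the EXPRESSION_GROUPS_ROWS bookkeeping is left out):

import pandas

# Toy expression matrix: rows are genes, columns are samples.
df = pandas.DataFrame(
    {"HeLa_rep1": [10.0, 0.0], "HeLa_rep2": [14.0, 2.0], "K-562": [5.0, 7.0]},
    index=["GENE_A", "GENE_B"])

groups = {
    "cell_line:HELA": ["HeLa_rep1", "HeLa_rep2"],
    "cell_line:K562": ["K-562"],
}

result_df = pandas.DataFrame(index=df.index)
for (label, columns) in groups.items():
    missing = [c for c in columns if c not in df.columns]
    if missing:
        raise ValueError("Missing: %s" % missing)
    result_df[label] = df[columns].mean(1)  # row-wise mean across the group's samples

print(result_df)
#         cell_line:HELA  cell_line:K562
# GENE_A            12.0             5.0
# GENE_B             1.0             7.0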
@@ -773,7 +792,7 @@ def handle_expression_GSE113126(*filenames):
     groups = {
         "sample_type:MELANOMA_MET": df.columns.tolist(),
     }
-    return expression_groups("GSE113126", df, groups)
+    return [make_expression_groups("GSE113126", df, groups)]


 def handle_expression_expression_atlas_22460905(filename):
@@ -785,13 +804,17 @@ def handle_expression_expression_atlas_22460905(filename):
     def matches(*strings):
         return [c for c in df.columns if all(s in c for s in strings)]

-    import ipdb ; ipdb.set_trace()
     groups = {
         "sample_type:B-LCL": (
             matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")),
         "sample_type:B-CELL": matches("b-cell"),
+        "sample_type:B721-LIKE": matches("b-cell"),
         "sample_type:MELANOMA_CELL_LINE": matches("melanoma"),
+        "sample_type:A375-LIKE": matches("melanoma"),
+        "sample_type:KG1-LIKE": matches("myeloid leukemia"),
+        # Using a fibrosarcoma cell line for our fibroblast sample.
+        "sample_type:FIBROBLAST": ['fibrosarcoma, ht-1080'],
         # For GBM tissue we are just using a mixture of cell lines.
         "sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"),
@@ -807,8 +830,7 @@ def handle_expression_expression_atlas_22460905(filename):
         "cell_line:HCT116": ['colon carcinoma, hct 116'],
         "cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'],
     }
-    return expression_groups("expression_atlas_22460905", df, groups)
+    return [make_expression_groups("expression_atlas_22460905", df, groups)]


 def handle_expression_human_protein_atlas(*filenames):
@@ -826,27 +848,67 @@ def handle_expression_human_protein_atlas(*filenames):
     gtex_df = gtex_df.pivot(
         index="Gene", columns="Tissue", values="TPM")

-    result_df = pandas.DataFrame(index=cell_line_df.index)
-
-    result_df["sample_type:pbmc"] = blood_df[
-        [c for c in blood_df.columns if "total PBMC" in c]
-    ].mean(1)
-
-    result_df["cell_line:HEK293"] = cell_line_df['HEK 293']
-    result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226']
-    # EXPI293 is based off HEK293
-    result_df["cell_line:EXPI293"] = cell_line_df['HEK 293']
-
-    # For leukapheresis we use pbmc sample
-    result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"]
-
-    for tissue in ["lung", "spleen"]:
-        result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue]
-    return result_df
+    return [
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(blood_filename),
+            blood_df,
+            groups={
+                "sample_type:PBMC": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+                # for samples labeled leukapheresis we also use PBMC
+                "sample_type:LEUKAPHERESIS": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+                # for samples labeled TIL we are also using PBMC
+                "sample_type:TIL": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+            }),
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(cell_line_filename),
+            cell_line_df,
+            groups={
+                "cell_line:HELA": ['HeLa'],
+                "cell_line:K562": ["K-562"],
+                "cell_line:HEK293": ['HEK 293'],
+                "cell_line:RPMI8226": ['RPMI-8226'],
+                "cell_line:EXPI293": ['HEK 293'], # EXPI293 derived from HEK293
+            }),
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(gtex_filename),
+            gtex_df,
+            groups={
+                "sample_type:LUNG": ["lung"],
+                "sample_type:SPLEEN": ["spleen"],
+            }),
+    ]
+
+
+def make_expression_mixtures(expression_df):
+    global CELL_LINE_MIXTURES
+    groups = {}
+    for mix in CELL_LINE_MIXTURES:
+        components = []
+        for item in mix.replace("mix:", "").upper().split(","):
+            if "cell_line:%s" % item in expression_df.columns:
+                components.append("cell_line:%s" % item)
+            else:
+                print("No cell line, falling back on similar: ", item)
+                components.append("sample_type:%s-LIKE" % item)
+        groups["sample_type:" + mix.upper()] = components
+
+    missing = set()
+    for some in groups.values():
+        for item in some:
+            if item not in expression_df.columns:
+                missing.add(item)
+    if missing:
+        raise ValueError(
+            "Missing [%d]: %s. Available: %s" % (
+                len(missing), missing, expression_df.columns.tolist()))
+    return make_expression_groups("mixtures", expression_df, groups)


 # Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict.
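The mixture handling ties back to the "mix:" sample types defined near the top of the diff: for each cell line named in a mixture, make_expression_mixtures uses the matching cell_line column of the merged expression matrix when one exists, and otherwise falls back on one of the *-LIKE pseudo sample types added in the expression atlas handler. A rough standalone sketch for one mixture (the available_columns list below is assembled by hand from the handlers in this diff, purely for illustration):

mix = "mix:a375,expi293,hek293,hela"  # e.g. the MAPTAC_A*02:01 sample
available_columns = [
    "cell_line:EXPI293", "cell_line:HEK293", "cell_line:HELA",
    "sample_type:A375-LIKE",  # no A375 RNA-seq, so the melanoma cell line proxy is used
]

components = []
for item in mix.replace("mix:", "").upper().split(","):
    name = "cell_line:%s" % item
    # Fall back on the *-LIKE pseudo sample type when no cell line column exists.
    components.append(
        name if name in available_columns else "sample_type:%s-LIKE" % item)

print(components)
# ['sample_type:A375-LIKE', 'cell_line:EXPI293', 'cell_line:HEK293', 'cell_line:HELA']
# These columns are then averaged under the label "sample_type:MIX:A375,EXPI293,HEK293,HELA".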
@@ -869,36 +931,44 @@ def run():
             label,
             *[os.path.abspath(f) for f in filenames])

-        expression_df = None
+        expression_dfs_for_item = []
         handler = None
         if label in EXPRESSION_HANDLERS:
             handler = EXPRESSION_HANDLERS[label]
-            expression_df = handler(*filenames)
+            expression_dfs_for_item = handler(*filenames)
         elif args.debug:
             debug(*filenames)
         else:
             raise NotImplementedError(label)

-        if expression_df is not None:
+        if expression_dfs_for_item:
             print(
                 "Processed expression data",
                 label,
-                "with shape",
-                expression_df.shape)
-            print(*expression_df.columns)
-            expression_dfs.append(expression_df)
+                "result dataframes",
+                len(expression_dfs_for_item))
+            print(*[e.columns for e in expression_dfs_for_item])
+            expression_dfs.extend(expression_dfs_for_item)

     expression_df = expression_dfs[0]
     for other in expression_dfs[1:]:
         expression_df = pandas.merge(
             expression_df, other, how='outer', left_index=True, right_index=True)
-    expression_df = expression_df.fillna(0)
-    print(
-        "Genes in each expression dataframe: ",
+    print("Genes in each expression dataframe: ",
         *[len(e) for e in expression_dfs])
     print("Genes in merged expression dataframe", len(expression_df))

+    if CELL_LINE_MIXTURES:
+        print("Generating cell line mixtures.")
+        expression_mixture_df = make_expression_mixtures(expression_df)
+        expression_df = pandas.merge(
+            expression_df,
+            expression_mixture_df,
+            how='outer',
+            left_index=True,
+            right_index=True)
+
     ms_dfs = []
     for (i, item_tpl) in enumerate(args.ms_item):
         (pmid, filenames) = (item_tpl[0], item_tpl[1:])
@@ -1026,5 +1096,15 @@ def run():
     ms_df.to_csv(args.ms_out, index=False)
     print("Wrote: %s" % os.path.abspath(args.ms_out))

+    if args.expression_metadata_out is not None:
+        expression_metadata_df = pandas.DataFrame(
+            EXPRESSION_GROUPS_ROWS,
+            columns=["expression_dataset", "label", "samples"])
+        expression_metadata_df["samples"] = expression_metadata_df[
+            "samples"
+        ].map(json.dumps)
+        expression_metadata_df.to_csv(args.expression_metadata_out, index=False)
+        print("Wrote: %s" % os.path.abspath(args.expression_metadata_out))
+

 if __name__ == '__main__':
     run()
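The new --expression-metadata-out file is a small CSV recording, for every expression column that was built, which source samples went into it; the samples column is JSON-encoded so the list survives the CSV round trip. A toy illustration of that encoding (row contents invented for this example):

import json
import pandas

# Hypothetical rows of the kind collected in EXPRESSION_GROUPS_ROWS.
rows = [
    ("toy_dataset", "cell_line:HELA", ["HeLa_rep1", "HeLa_rep2"]),
    ("toy_dataset", "sample_type:PBMC", ["donor 1 total PBMC"]),
]
metadata_df = pandas.DataFrame(
    rows, columns=["expression_dataset", "label", "samples"])
# Python lists don't round-trip through CSV cleanly, hence json.dumps.
metadata_df["samples"] = metadata_df["samples"].map(json.dumps)
print(metadata_df.to_csv(index=False))
# expression_dataset,label,samples
# toy_dataset,cell_line:HELA,"[""HeLa_rep1"", ""HeLa_rep2""]"
# toy_dataset,sample_type:PBMC,"[""donor 1 total PBMC""]"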
@@ -27,7 +27,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 cp $SCRIPT_DIR/annotate.py .

-PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+PEPTIDES=$(mhcflurry-downloads path data_curated)/ms.nontraining_curated.by_pmid.csv.bz2
 REFERENCES_DIR=$(mhcflurry-downloads path data_references)

 python annotate.py \
...
@@ -62,7 +62,7 @@ releases:
         default: false
       - name: data_curated
-        url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190927.tar.bz2
+        url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2
         default: true

 # Older downloads
...