From 9152bada5236aa9451af96179f937562273ffe87 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 11 Oct 2019 15:53:53 -0400 Subject: [PATCH] fixes --- downloads-generation/data_curated/GENERATE.sh | 3 +- .../data_curated/curate_ms_by_pmid.py | 230 ++++++++++++------ .../data_mass_spec_annotated/GENERATE.sh | 2 +- mhcflurry/downloads.yml | 2 +- 4 files changed, 159 insertions(+), 78 deletions(-) diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index b35d01fc..eb7b5a73 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -49,7 +49,8 @@ done time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \ --ms-out ms.nontraining_curated.by_pmid.csv \ - --expression-out rna_expression.csv + --expression-out rna_expression.csv \ + --expression-metadata-out rna_expression.metadata.csv bzip2 ms.nontraining_curated.by_pmid.csv bzip2 rna_expression.csv diff --git a/downloads-generation/data_curated/curate_ms_by_pmid.py b/downloads-generation/data_curated/curate_ms_by_pmid.py index 5406e238..faf68811 100755 --- a/downloads-generation/data_curated/curate_ms_by_pmid.py +++ b/downloads-generation/data_curated/curate_ms_by_pmid.py @@ -5,6 +5,7 @@ optionally including eluted peptides identified by mass-spec. import sys import argparse import os +import json import collections from six.moves import StringIO @@ -44,6 +45,10 @@ parser.add_argument( "--expression-out", metavar="OUT.csv", help="Out file path (RNA-seq expression)") +parser.add_argument( + "--expression-metadata-out", + metavar="OUT.csv", + help="Out file path for expression metadata, i.e. which samples used") parser.add_argument( "--debug", action="store_true", @@ -421,6 +426,42 @@ def handle_pmid_28832583(*filenames): return result_df +PMID_31495665_SAMPLE_TYPES = { + "HLA-DR_Lung": "lung", + "HLA-DR_PBMC_HDSC": "pbmc", + "HLA-DR_PBMC_RG1095": "pbmc", + "HLA-DR_PBMC_RG1104": "pbmc", + "HLA-DR_PBMC_RG1248": "pbmc", + "HLA-DR_Spleen": "spleen", + "MAPTAC_A*02:01": "mix:a375,expi293,hek293,hela", + "MAPTAC_A*11:01": "mix:expi293,hela", + "MAPTAC_A*32:01": "mix:a375,expi293,hela", + "MAPTAC_B*07:02": "mix:a375,expi293,hela", + "MAPTAC_B*45:01": "expi293", + "MAPTAC_B*52:01": "mix:a375,expi293", + "MAPTAC_C*03:03": "expi293", + "MAPTAC_C*06:02": "mix:a375,expi293", + "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293", + "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293", + "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293", + "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293", + "MAPTAC_DRB1*01:01": "mix:a375,b721,expi293,kg1,k562", + "MAPTAC_DRB1*03:01": "expi293", + "MAPTAC_DRB1*04:01": "expi293", + "MAPTAC_DRB1*07:01": "mix:expi293,hek293", + "MAPTAC_DRB1*11:01": "mix:expi293,k562,kg1", + "MAPTAC_DRB1*12:01_dm+": "expi293", + "MAPTAC_DRB1*12:01_dm-": "expi293", + "MAPTAC_DRB1*15:01": "expi293", + "MAPTAC_DRB3*01:01_dm+": "expi293", + "MAPTAC_DRB3*01:01_dm-": "expi293", +} +CELL_LINE_MIXTURES = sorted( + set( + x for x in PMID_31495665_SAMPLE_TYPES.values() + if x.startswith("mix:"))) + + def handle_pmid_31495665(filename): """Abelin, ..., Rooney Immunity 2019 [PMID 31495665]""" hla_type = { @@ -561,56 +602,27 @@ def handle_pmid_31495665(filename): "MAPTAC_A*11:01": "", "MAPTAC_A*32:01": "", "MAPTAC_B*07:02": "", - "MAPTAC_B*45:01": "", + "MAPTAC_B*45:01": "expi293", "MAPTAC_B*52:01": "", - "MAPTAC_C*03:03": "", + "MAPTAC_C*03:03": "expi293", "MAPTAC_C*06:02": "", "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293", "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293", "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293", # don't actually see this in DataS1A! "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293", "MAPTAC_DRB1*01:01": "", - "MAPTAC_DRB1*03:01": "", - "MAPTAC_DRB1*04:01": "", + "MAPTAC_DRB1*03:01": "expi293", + "MAPTAC_DRB1*04:01": "expi293", "MAPTAC_DRB1*07:01": "", "MAPTAC_DRB1*11:01": "", - "MAPTAC_DRB1*12:01_dm+": "", - "MAPTAC_DRB1*12:01_dm-": "", - "MAPTAC_DRB1*15:01": "", - "MAPTAC_DRB3*01:01_dm+": "", - "MAPTAC_DRB3*01:01_dm-": "", - } - sample_type = { - "HLA-DR_Lung": "lung", - "HLA-DR_PBMC_HDSC": "lung", - "HLA-DR_PBMC_RG1095": "lung", - "HLA-DR_PBMC_RG1104": "lung", - "HLA-DR_PBMC_RG1248": "lung", - "HLA-DR_Spleen": "spleen", - "MAPTAC_A*02:01": "mixed", - "MAPTAC_A*11:01": "mixed", - "MAPTAC_A*32:01": "mixed", - "MAPTAC_B*07:02": "mixed", - "MAPTAC_B*45:01": "mixed", - "MAPTAC_B*52:01": "mixed", - "MAPTAC_C*03:03": "mixed", - "MAPTAC_C*06:02": "mixed", - "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "mixed", - "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "mixed", - "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "mixed", - "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "mixed", - "MAPTAC_DRB1*01:01": "mixed", - "MAPTAC_DRB1*03:01": "mixed", - "MAPTAC_DRB1*04:01": "mixed", - "MAPTAC_DRB1*07:01": "mixed", - "MAPTAC_DRB1*11:01": "mixed", - "MAPTAC_DRB1*12:01_dm+": "mixed", - "MAPTAC_DRB1*12:01_dm-": "mixed", - "MAPTAC_DRB1*15:01": "mixed", - "MAPTAC_DRB3*01:01_dm+": "mixed", - "MAPTAC_DRB3*01:01_dm-": "mixed", + "MAPTAC_DRB1*12:01_dm+": "expi293", + "MAPTAC_DRB1*12:01_dm-": "expi293", + "MAPTAC_DRB1*15:01": "expi293", + "MAPTAC_DRB3*01:01_dm+": "expi293", + "MAPTAC_DRB3*01:01_dm-": "expi293", } + df = pandas.read_excel(filename, sheet_name="DataS1B") results = [] for sample_id in df.columns: @@ -626,7 +638,7 @@ def handle_pmid_31495665(filename): result_df["pulldown_antibody"] = pulldown_antibody[sample_id] result_df["format"] = format[sample_id] result_df["mhc_class"] = mhc_class[sample_id] - result_df["sample_type"] = sample_type[sample_id] + result_df["sample_type"] = PMID_31495665_SAMPLE_TYPES[sample_id] result_df["cell_line"] = cell_line[sample_id] results.append(result_df) result_df = pandas.concat(results, ignore_index=True) @@ -745,11 +757,18 @@ def handle_pmid_31154438(*filenames): result_df = pandas.concat(results, ignore_index=True) return result_df +EXPRESSION_GROUPS_ROWS = [] -def expression_groups(dataset_identifier, df, groups): + +def make_expression_groups(dataset_identifier, df, groups): result_df = pandas.DataFrame(index=df.index) for (label, columns) in groups.items(): + for col in columns: + if col not in df.columns: + raise ValueError( + "Missing: %s. Available: %s" % (col, df.columns.tolist())) result_df[label] = df[columns].mean(1) + EXPRESSION_GROUPS_ROWS.append((dataset_identifier, label, columns)) return result_df @@ -773,7 +792,7 @@ def handle_expression_GSE113126(*filenames): groups = { "sample_type:MELANOMA_MET": df.columns.tolist(), } - return expression_groups("GSE113126", df, groups) + return [make_expression_groups("GSE113126", df, groups)] def handle_expression_expression_atlas_22460905(filename): @@ -785,13 +804,17 @@ def handle_expression_expression_atlas_22460905(filename): def matches(*strings): return [c for c in df.columns if all(s in c for s in strings)] - import ipdb ; ipdb.set_trace() - groups = { "sample_type:B-LCL": ( matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")), "sample_type:B-CELL": matches("b-cell"), + "sample_type:B721-LIKE": matches("b-cell"), "sample_type:MELANOMA_CELL_LINE": matches("melanoma"), + "sample_type:A375-LIKE": matches("melanoma"), + "sample_type:KG1-LIKE": matches("myeloid leukemia"), + + # Using a fibrosarcoma cell line for our fibroblast sample. + "sample_type:FIBROBLAST": ['fibrosarcoma, ht-1080'], # For GBM tissue we are just using a mixture of cell lines. "sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"), @@ -807,8 +830,7 @@ def handle_expression_expression_atlas_22460905(filename): "cell_line:HCT116": ['colon carcinoma, hct 116'], "cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'], } - return expression_groups("expression_atlas_22460905", df, groups) - + return [make_expression_groups("expression_atlas_22460905", df, groups)] def handle_expression_human_protein_atlas(*filenames): @@ -826,27 +848,67 @@ def handle_expression_human_protein_atlas(*filenames): gtex_df = gtex_df.pivot( index="Gene", columns="Tissue", values="TPM") - result_df = pandas.DataFrame(index=cell_line_df.index) - - result_df["sample_type:pbmc"] = blood_df[ - [c for c in blood_df.columns if "total PBMC" in c] - ].mean(1) - - result_df["cell_line:HEK293"] = cell_line_df['HEK 293'] - result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226'] - - # EXPI293 is based off HEK293 - result_df["cell_line:EXPI293"] = cell_line_df['HEK 293'] - - # For leukapheresis we use pbmc sample - result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"] - - for tissue in ["lung", "spleen"]: - result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue] - return result_df - + return [ + make_expression_groups( + "human_protein_atlas:%s" % os.path.basename(blood_filename), + blood_df, + groups={ + "sample_type:PBMC": [ + c for c in blood_df.columns if "total PBMC" in c + ], + + # for samples labeled leukapheresis we also use PBMC + "sample_type:LEUKAPHERESIS": [ + c for c in blood_df.columns if "total PBMC" in c + ], + + # for samples labeled TIL we are also using PBMC + "sample_type:TIL": [ + c for c in blood_df.columns if "total PBMC" in c + ], + }), + make_expression_groups( + "human_protein_atlas:%s" % os.path.basename(cell_line_filename), + cell_line_df, + groups={ + "cell_line:HELA": ['HeLa'], + "cell_line:K562": ["K-562"], + "cell_line:HEK293": ['HEK 293'], + "cell_line:RPMI8226": ['RPMI-8226'], + "cell_line:EXPI293": ['HEK 293'], # EXPI293 derived from HEK293 + }), + make_expression_groups( + "human_protein_atlas:%s" % os.path.basename(gtex_filename), + gtex_df, + groups={ + "sample_type:LUNG": ["lung"], + "sample_type:SPLEEN": ["spleen"], + }), + ] +def make_expression_mixtures(expression_df): + global CELL_LINE_MIXTURES + groups = {} + for mix in CELL_LINE_MIXTURES: + components = [] + for item in mix.replace("mix:", "").upper().split(","): + if "cell_line:%s" % item in expression_df.columns: + components.append("cell_line:%s" % item) + else: + print("No cell line, falling back on similar: ", item) + components.append("sample_type:%s-LIKE" % item) + groups["sample_type:" + mix.upper()] = components + missing = set() + for some in groups.values(): + for item in some: + if item not in expression_df.columns: + missing.add(item) + if missing: + raise ValueError( + "Missing [%d]: %s. Available: %s" % ( + len(missing), missing, expression_df.columns.tolist())) + return make_expression_groups("mixtures", expression_df, groups) # Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict. @@ -869,36 +931,44 @@ def run(): label, *[os.path.abspath(f) for f in filenames]) - expression_df = None + expression_dfs_for_item = [] handler = None if label in EXPRESSION_HANDLERS: handler = EXPRESSION_HANDLERS[label] - expression_df = handler(*filenames) + expression_dfs_for_item = handler(*filenames) elif args.debug: debug(*filenames) else: raise NotImplementedError(label) - if expression_df is not None: + if expression_dfs_for_item: print( "Processed expression data", label, - "with shape", - expression_df.shape) - print(*expression_df.columns) - expression_dfs.append(expression_df) + "result dataframes", + len(expression_dfs_for_item)) + print(*[e.columns for e in expression_dfs_for_item]) + expression_dfs.extend(expression_dfs_for_item) expression_df = expression_dfs[0] for other in expression_dfs[1:]: expression_df = pandas.merge( expression_df, other, how='outer', left_index=True, right_index=True) - expression_df = expression_df.fillna(0) - print( - "Genes in each expression dataframe: ", + print("Genes in each expression dataframe: ", *[len(e) for e in expression_dfs]) print("Genes in merged expression dataframe", len(expression_df)) + if CELL_LINE_MIXTURES: + print("Generating cell line mixtures.") + expression_mixture_df = make_expression_mixtures(expression_df) + expression_df = pandas.merge( + expression_df, + expression_mixture_df, + how='outer', + left_index=True, + right_index=True) + ms_dfs = [] for (i, item_tpl) in enumerate(args.ms_item): (pmid, filenames) = (item_tpl[0], item_tpl[1:]) @@ -1026,5 +1096,15 @@ def run(): ms_df.to_csv(args.ms_out, index=False) print("Wrote: %s" % os.path.abspath(args.ms_out)) + if args.expression_metadata_out is not None: + expression_metadata_df = pandas.DataFrame( + EXPRESSION_GROUPS_ROWS, + columns=["expression_dataset", "label", "samples"]) + expression_metadata_df["samples"] = expression_metadata_df[ + "samples" + ].map(json.dumps) + expression_metadata_df.to_csv(args.expression_metadata_out, index=False) + print("Wrote: %s" % os.path.abspath(args.expression_metadata_out)) + if __name__ == '__main__': run() diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh index 6fd95390..e9c52381 100755 --- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh +++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh @@ -27,7 +27,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME cp $SCRIPT_DIR/annotate.py . -PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2 +PEPTIDES=$(mhcflurry-downloads path data_curated)/ms.nontraining_curated.by_pmid.csv.bz2 REFERENCES_DIR=$(mhcflurry-downloads path data_references) python annotate.py \ diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index 36eff633..7c268d3c 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -62,7 +62,7 @@ releases: default: false - name: data_curated - url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190927.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2 default: true # Older downloads -- GitLab