diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index fd041f93db8468eac835da4110f2f359ad90e7fa..b35d01fcda006b707c02b9a2260f577594c5c2a0 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -29,23 +29,33 @@ git status cd $SCRATCH_DIR/$DOWNLOAD_NAME cp $SCRIPT_DIR/curate.py . -cp $SCRIPT_DIR/curate_by_pmid.py . +cp $SCRIPT_DIR/curate_ms_by_pmid.py . -RAW_DIR="$(mhcflurry-downloads path data_published)/raw" -cp -r "$RAW_DIR" . +MS_DIR="$(mhcflurry-downloads path data_published)/ms" +cp -r "$MS_DIR" . + +EXPRESSION_DIR="$(mhcflurry-downloads path data_published)/expression" +cp -r "$EXPRESSION_DIR" . CURATE_BY_PMID_ARGS="" -for pmid in $(ls raw) +for pmid in $(ls ms) +do + CURATE_BY_PMID_ARGS+=$(echo --ms-item $pmid ms/$pmid/* ' ') +done +for item in $(ls expression) do - CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ') + CURATE_BY_PMID_ARGS+=$(echo --expression-item $item expression/$item/* ' ') done -time python curate_by_pmid.py $CURATE_BY_PMID_ARGS \ - --out nontraining_curated.by_pmid.csv +time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \ + --ms-out ms.nontraining_curated.by_pmid.csv \ + --expression-out rna_expression.csv + +bzip2 ms.nontraining_curated.by_pmid.csv +bzip2 rna_expression.csv -bzip2 nontraining_curated.by_pmid.csv -rm -rf raw +rm -rf ms # No mass-spec data time python curate.py \ diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_ms_by_pmid.py similarity index 59% rename from downloads-generation/data_curated/curate_by_pmid.py rename to downloads-generation/data_curated/curate_ms_by_pmid.py index 4f1950e5e15e2ef40ce1d7b8cf7199c08a4c6405..5406e2381e42756a4a66efc63096af1e7ac4c967 100755 --- a/downloads-generation/data_curated/curate_by_pmid.py +++ b/downloads-generation/data_curated/curate_ms_by_pmid.py @@ -23,24 +23,35 @@ def normalize_allele_name(s): parser = argparse.ArgumentParser(usage=__doc__) parser.add_argument( - "--item", + "--ms-item", nargs="+", action="append", metavar="PMID FILE, ... FILE", default=[], - help="Item to curate: PMID and list of files") + help="Mass spec item to curate: PMID and list of files") parser.add_argument( - "--out", + "--expression-item", + nargs="+", + action="append", + metavar="LABEL FILE, ... FILE", + default=[], + help="Expression data to curate: dataset label and list of files") +parser.add_argument( + "--ms-out", + metavar="OUT.csv", + help="Out file path (MS data)") +parser.add_argument( + "--expression-out", metavar="OUT.csv", - help="Out file path") + help="Out file path (RNA-seq expression)") parser.add_argument( "--debug", action="store_true", default=False, help="Leave user in pdb if PMID is unsupported") -HANDLERS = {} - +PMID_HANDLERS = {} +EXPRESSION_HANDLERS = {} def load(filenames, **kwargs): result = {} @@ -146,7 +157,7 @@ def handle_pmid_24616531(filename): "peptide": peptides, }) result_df["sample_id"] = "24616531" - result_df["sample_type"] = "B-lymphoblastoid" + result_df["sample_type"] = "B-LCL" result_df["cell_line"] = "GR" result_df["pulldown_antibody"] = "W6/32" @@ -181,7 +192,6 @@ def handle_pmid_25576301(filename): rows.append((row.Sequence, sample)) result_df = pandas.DataFrame(rows, columns=["peptide", "sample_id"]) - result_df["cell_line"] = "" result_df["pulldown_antibody"] = "W6/32" result_df["mhc_class"] = "I" result_df["format"] = "multiallelic" @@ -209,11 +219,21 @@ def handle_pmid_25576301(filename): 'HCC1143': "basal like breast cancer", 'JY': "B-cell", } + cell_line = { + 'Fib': None, + 'HCC1937': "HCC1937", + 'SupB15WT': None, + 'SupB15RT': None, + 'HCT116': "HCT116", + 'HCC1143': "HCC1143", + 'JY': "JY", + } result_df["hla"] = result_df.sample_id.map(allele_map) print("Entries before dropping samples with unknown alleles", len(result_df)) result_df = result_df.loc[~result_df.hla.isnull()] print("Entries after dropping samples with unknown alleles", len(result_df)) result_df["sample_type"] = result_df.sample_id.map(sample_type) + result_df["cell_line"] = result_df.sample_id.map(cell_line) print(result_df.head(3)) return result_df @@ -266,7 +286,7 @@ def handle_pmid_26992070(*filenames): "HEK293": "hek", "HL-60": "neutrophil", "RPMI8226": "b-cell", - "MAVER-1": "b-lymphoblast", + "MAVER-1": "b-LCL", "THP-1": "monocyte", }) result_df["mhc_class"] = "I" @@ -343,30 +363,30 @@ def handle_pmid_28832583(*filenames): CM467 B-cell 28832583 HLA-A*01:01 HLA-A*24:02 HLA-B*13:02 HLA-B*39:06 HLA-C*06:02 HLA-C*12:03 GD149 B-cell 28832583 HLA-A*01:01 HLA-A*24:02 HLA-B*38:01 HLA-B*44:03 HLA-C*06:02 HLA-C*12:03 MD155 B-cell 28832583 HLA-A*02:01 HLA-A*24:02 HLA-B*15:01 HLA-B*18:01 HLA-C*03:03 HLA-C*07:01 - PD42 B cell 28832583 HLA-A*02:06 HLA-A*24:02 HLA-B*07:02 HLA-B*55:01 HLA-C*01:02 HLA-C*07:02 - RA957 B cell 28832583 HLA-A*02:20 HLA-A*68:01 HLA-B*35:03 HLA-B*39:01 HLA-C*04:01 HLA-C*07:02 + PD42 B-cell 28832583 HLA-A*02:06 HLA-A*24:02 HLA-B*07:02 HLA-B*55:01 HLA-C*01:02 HLA-C*07:02 + RA957 B-cell 28832583 HLA-A*02:20 HLA-A*68:01 HLA-B*35:03 HLA-B*39:01 HLA-C*04:01 HLA-C*07:02 TIL1 TIL 28832583 HLA-A*02:01 HLA-A*02:01 HLA-B*18:01 HLA-B*38:01 HLA-C*05:01 TIL3 TIL 28832583 HLA-A*01:01 HLA-A*23:01 HLA-B*07:02 HLA-B*15:01 HLA-C*12:03 HLA-C*14:02 Apher1 Leukapheresis 28832583 HLA-A*03:01 HLA-A*29:02 HLA-B*44:02 HLA-B*44:03 HLA-C*12:03 HLA-C*16:01 Apher6 Leukapheresis 28832583 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02 - pat_AC2 B lymphoblast 27841757 HLA-A*03:01 HLA-A*32:01 HLA-B*27:05 HLA-B*45:01 - pat_C B lymphoblast 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02 - pat_CELG B lymphoblast 27841757 HLA-A*02:01 HLA-A*24:02 HLA-B*15:01 HLA-B*73:01 HLA-C*03:03 HLA-C*15:05 - pat_CP2 B lymphoblast 27841757 HLA-A*11:01 HLA-B*14:02 HLA-B*44:02 - pat_FL B lymphoblast 27841757 HLA-A*03:01 HLA-A*11:01 HLA-B*44:03 HLA-B*50:01 - pat_J B lymphoblast 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02 - pat_JPB3 B lymphoblast 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*27:05 HLA-B*56:01 - pat_JT2 B lymphoblast 27841757 HLA-A*11:01 HLA-B*18:03 HLA-B*35:01 - pat_M B lymphoblast 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01 - pat_MA B lymphoblast 27841757 HLA-A*02:01 HLA-A*29:02 HLA-B*44:03 HLA-B*57:01 HLA-C*07:01 HLA-C*16:01 - pat_ML B lymphoblast 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*40:01 HLA-B*44:03 - pat_NS2 B lymphoblast 27841757 HLA-A*02:01 HLA-B*13:02 HLA-B*41:01 - pat_NT B lymphoblast 27841757 HLA-A*01:01 HLA-A*32:01 HLA-B*08:01 - pat_PF1 B lymphoblast 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*07:02 HLA-B*44:03 HLA-C*07:02 HLA-C*16:01 - pat_R B lymphoblast 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01 - pat_RT B lymphoblast 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*18:01 HLA-B*39:24 HLA-C*05:01 HLA-C*07:01 - pat_SR B lymphoblast 27841757 HLA-A*02:01 HLA-A*23:01 HLA-B*18:01 HLA-B*44:03 - pat_ST B lymphoblast 27841757 HLA-A*03:01 HLA-A*24:02 HLA-B*07:02 HLA-B*27:05 + pat_AC2 B-LCL 27841757 HLA-A*03:01 HLA-A*32:01 HLA-B*27:05 HLA-B*45:01 + pat_C B-LCL 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02 + pat_CELG B-LCL 27841757 HLA-A*02:01 HLA-A*24:02 HLA-B*15:01 HLA-B*73:01 HLA-C*03:03 HLA-C*15:05 + pat_CP2 B-LCL 27841757 HLA-A*11:01 HLA-B*14:02 HLA-B*44:02 + pat_FL B-LCL 27841757 HLA-A*03:01 HLA-A*11:01 HLA-B*44:03 HLA-B*50:01 + pat_J B-LCL 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02 + pat_JPB3 B-LCL 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*27:05 HLA-B*56:01 + pat_JT2 B-LCL 27841757 HLA-A*11:01 HLA-B*18:03 HLA-B*35:01 + pat_M B-LCL 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01 + pat_MA B-LCL 27841757 HLA-A*02:01 HLA-A*29:02 HLA-B*44:03 HLA-B*57:01 HLA-C*07:01 HLA-C*16:01 + pat_ML B-LCL 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*40:01 HLA-B*44:03 + pat_NS2 B-LCL 27841757 HLA-A*02:01 HLA-B*13:02 HLA-B*41:01 + pat_NT B-LCL 27841757 HLA-A*01:01 HLA-A*32:01 HLA-B*08:01 + pat_PF1 B-LCL 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*07:02 HLA-B*44:03 HLA-C*07:02 HLA-C*16:01 + pat_R B-LCL 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01 + pat_RT B-LCL 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*18:01 HLA-B*39:24 HLA-C*05:01 HLA-C*07:01 + pat_SR B-LCL 27841757 HLA-A*02:01 HLA-A*23:01 HLA-B*18:01 HLA-B*44:03 + pat_ST B-LCL 27841757 HLA-A*03:01 HLA-A*24:02 HLA-B*07:02 HLA-B*27:05 """ info_df = pandas.read_csv(StringIO(info_text), sep="\t", index_col=0) info_df.index = info_df.index.str.strip() @@ -613,59 +633,383 @@ def handle_pmid_31495665(filename): return result_df -# Add all functions with names like handle_pmid_XXXX to HANDLERS dict. +def handle_pmid_27869121(filename): + """Bassani-Sternberg, ..., Krackhardt Nature Comm. 2016 [PMID 27869121]""" + # Although this dataset has class II data also, we are only extracting + # class I for now. + df = pandas.read_excel(filename, skiprows=1) + + # Taking these from: + # Supplementary Table 2: Information of patients selected for neoepitope + # identification + # For the Mel5 sample, only two-digit alleles are shown (A*01, A*25, + # B*08, B*18) so we are skipping that sample for now. + hla_df = pandas.DataFrame([ + ("Mel-8", "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*08:01 HLA-C*07:01 HLA-C*07:02"), + ("Mel-12", "HLA-A*01:01 HLA-B*08:01 HLA-C*07:01"), + ("Mel-15", "HLA-A*03:01 HLA-A*68:01 HLA-B*27:05 HLA-B*35:03 HLA-C*02:02 HLA-C*04:01"), + ("Mel-16", "HLA-A*01:01 HLA-A*24:02 HLA-B*07:02 HLA-B*08:01 HLA-C*07:01 HLA-C*07:02"), + ], columns=["sample_id", "hla"]).set_index("sample_id") + + # We assert below that none of the class I hit peptides were found in any + # of the class II pull downs. + class_ii_cols = [ + c for c in df.columns if c.endswith("HLA-II (arbitrary units)") + ] + class_ii_hits = set(df.loc[ + (df[class_ii_cols].fillna(0.0).sum(1) > 0) + ].Sequence.unique()) + + results = [] + for (sample_id, hla) in hla_df.hla.items(): + intensity_col = "Intensity %s_HLA-I (arbitrary units)" % sample_id + sub_df = df.loc[ + (df[intensity_col].fillna(0.0) > 0) + ] + filtered_sub_df = sub_df.loc[ + (~sub_df.Sequence.isin(class_ii_hits)) + ] + peptides = filtered_sub_df.Sequence.unique() + assert not any(p in class_ii_hits for p in peptides) + + result_df = pandas.DataFrame({ + "peptide": peptides, + }) + result_df["sample_id"] = sample_id + result_df["hla"] = hla_df.loc[sample_id, "hla"] + result_df["pulldown_antibody"] = "W6/32" + result_df["format"] = "multiallelic" + result_df["mhc_class"] = "I" + result_df["sample_type"] = "melanoma_met" + result_df["cell_line"] = None + results.append(result_df) + + result_df = pandas.concat(results, ignore_index=True) + return result_df + + +def handle_pmid_31154438(*filenames): + """Shraibman, ..., Admon Mol Cell Proteomics 2019""" + # Note: this publication also includes analyses of the secreted HLA + # peptidedome (sHLA) but we are using only the data from membrane-bound + # HLA. + (xls, txt) = sorted(filenames, key=lambda s: not s.endswith(".xlsx")) + + info = pandas.read_excel(xls, skiprows=1) + df = pandas.read_csv(txt, sep="\t", skiprows=1) + + hla_df = info.loc[ + ~info["mHLA tissue sample"].isnull() + ].set_index("mHLA tissue sample")[["HLA typing"]] + + def fix_hla(string): + result = [] + alleles = string.split(";") + for a in alleles: + a = a.strip() + if "/" in a: + (a1, a2) = a.split("/") + a2 = a1[:2] + a2 + lst = [a1, a2] + else: + lst = [a] + for a in lst: + normalized = normalize_allele_name(a) + # Ignore class II + if normalized[4] in ("A", "B", "C"): + result.append(normalized) + return " ".join(result) + + hla_df["hla"] = hla_df["HLA typing"].map(fix_hla) + + results = [] + for (sample_id, hla) in hla_df.hla.items(): + intensity_col = "Intensity %s" % sample_id + sub_df = df.loc[ + (df[intensity_col].fillna(0.0) > 0) + ] + peptides = sub_df.Sequence.unique() + + result_df = pandas.DataFrame({ + "peptide": peptides, + }) + result_df["sample_id"] = sample_id + result_df["hla"] = hla_df.loc[sample_id, "hla"] + result_df["pulldown_antibody"] = "W6/32" + result_df["format"] = "multiallelic" + result_df["mhc_class"] = "I" + result_df["sample_type"] = "glioblastoma_tissue" + result_df["cell_line"] = None + results.append(result_df) + + result_df = pandas.concat(results, ignore_index=True) + return result_df + + +def expression_groups(dataset_identifier, df, groups): + result_df = pandas.DataFrame(index=df.index) + for (label, columns) in groups.items(): + result_df[label] = df[columns].mean(1) + return result_df + + +def handle_expression_GSE113126(*filenames): + """ + Barry, ..., Krummel Nature Medicine 2018 [PMID 29942093] + + This is the melanoma met RNA-seq dataset. + + """ + + df = pandas.read_csv(filenames[0], sep="\t", index_col=0) + df = df[[]] # no columns + + for filename in filenames: + df[os.path.basename(filename)] = pandas.read_csv( + filename, sep="\t", index_col=0)["TPM"] + + assert len(df.columns) == len(filenames) + + groups = { + "sample_type:MELANOMA_MET": df.columns.tolist(), + } + return expression_groups("GSE113126", df, groups) + + +def handle_expression_expression_atlas_22460905(filename): + df = pandas.read_csv(filename, sep="\t", skiprows=4, index_col=0) + del df["Gene Name"] + df.columns = df.columns.str.lower() + df = df.fillna(0.0) + + def matches(*strings): + return [c for c in df.columns if all(s in c for s in strings)] + + import ipdb ; ipdb.set_trace() + + groups = { + "sample_type:B-LCL": ( + matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")), + "sample_type:B-CELL": matches("b-cell"), + "sample_type:MELANOMA_CELL_LINE": matches("melanoma"), + + # For GBM tissue we are just using a mixture of cell lines. + "sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"), + + "cell_line:THP-1": ["childhood acute monocytic leukemia, thp-1"], + "cell_line:HL-60": ["adult acute myeloid leukemia, hl-60"], + "cell_line:U-87": ['glioblastoma, u-87 mg'], + "cell_line:LNT-229": ['glioblastoma, ln-229'], + "cell_line:T98G": ['glioblastoma, t98g'], + "cell_line:SK-MEL-5": ['cutaneous melanoma, sk-mel-5'], + 'cell_line:MEWO': ['melanoma, mewo'], + "cell_line:HCC1937": ['breast ductal adenocarcinoma, hcc1937'], + "cell_line:HCT116": ['colon carcinoma, hct 116'], + "cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'], + } + return expression_groups("expression_atlas_22460905", df, groups) + + + +def handle_expression_human_protein_atlas(*filenames): + (cell_line_filename,) = [f for f in filenames if "celline" in f] + (blood_filename,) = [f for f in filenames if "blood" in f] + (gtex_filename,) = [f for f in filenames if "gtex" in f] + + cell_line_df = pandas.read_csv(cell_line_filename, sep="\t") + blood_df = pandas.read_csv(blood_filename, sep="\t", index_col=0) + gtex_df = pandas.read_csv(gtex_filename, sep="\t") + + cell_line_df = cell_line_df.pivot( + index="Gene", columns="Cell line", values="TPM") + + gtex_df = gtex_df.pivot( + index="Gene", columns="Tissue", values="TPM") + + result_df = pandas.DataFrame(index=cell_line_df.index) + + result_df["sample_type:pbmc"] = blood_df[ + [c for c in blood_df.columns if "total PBMC" in c] + ].mean(1) + + result_df["cell_line:HEK293"] = cell_line_df['HEK 293'] + result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226'] + + # EXPI293 is based off HEK293 + result_df["cell_line:EXPI293"] = cell_line_df['HEK 293'] + + # For leukapheresis we use pbmc sample + result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"] + + for tissue in ["lung", "spleen"]: + result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue] + return result_df + + + + + +# Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict. for (key, value) in list(locals().items()): if key.startswith("handle_pmid_"): - HANDLERS[key.replace("handle_pmid_", "")] = value + PMID_HANDLERS[key.replace("handle_pmid_", "")] = value + elif key.startswith("handle_expression_"): + EXPRESSION_HANDLERS[key.replace("handle_expression_", "")] = value def run(): args = parser.parse_args(sys.argv[1:]) - dfs = [] - for (i, item_tpl) in enumerate(args.item): + expression_dfs = [] + for (i, item_tpl) in enumerate(args.expression_item): + (label, filenames) = (item_tpl[0], item_tpl[1:]) + label = label.replace("-", "_") + print( + "Processing expression item %d of %d" % (i + 1, len(args.expression_item)), + label, + *[os.path.abspath(f) for f in filenames]) + + expression_df = None + handler = None + if label in EXPRESSION_HANDLERS: + handler = EXPRESSION_HANDLERS[label] + expression_df = handler(*filenames) + elif args.debug: + debug(*filenames) + else: + raise NotImplementedError(label) + + if expression_df is not None: + print( + "Processed expression data", + label, + "with shape", + expression_df.shape) + print(*expression_df.columns) + expression_dfs.append(expression_df) + + expression_df = expression_dfs[0] + for other in expression_dfs[1:]: + expression_df = pandas.merge( + expression_df, other, how='outer', left_index=True, right_index=True) + expression_df = expression_df.fillna(0) + + print( + "Genes in each expression dataframe: ", + *[len(e) for e in expression_dfs]) + print("Genes in merged expression dataframe", len(expression_df)) + + ms_dfs = [] + for (i, item_tpl) in enumerate(args.ms_item): (pmid, filenames) = (item_tpl[0], item_tpl[1:]) print( - "Processing item %d / %d" % (i + 1, len(args.item)), + "Processing MS item %d of %d" % (i + 1, len(args.ms_item)), pmid, *[os.path.abspath(f) for f in filenames]) - df = None + ms_df = None handler = None - if pmid in HANDLERS: - handler = HANDLERS[pmid] - df = handler(*filenames) + if pmid in PMID_HANDLERS: + handler = PMID_HANDLERS[pmid] + ms_df = handler(*filenames) elif args.debug: debug(*filenames) else: - raise NotImplementedError(args.pmid) - - if df is not None: - df["pmid"] = pmid - if "original_pmid" not in df.columns: - df["original_pmid"] = pmid - df = df.applymap(str).applymap(str.upper) - print("*** PMID %s: %d peptides ***" % (pmid, len(df))) + raise NotImplementedError(pmid) + + if ms_df is not None: + ms_df["pmid"] = pmid + if "original_pmid" not in ms_df.columns: + ms_df["original_pmid"] = pmid + if "expression_dataset" not in ms_df.columns: + ms_df["expression_dataset"] = "" + ms_df = ms_df.applymap(str).applymap(str.upper) + ms_df["sample_id"] = ms_df.sample_id.str.replace(" ", "") + print("*** PMID %s: %d peptides ***" % (pmid, len(ms_df))) if handler is not None: print(handler.__doc__) print("Counts by sample id:") - print(df.groupby("sample_id").peptide.nunique()) + print(ms_df.groupby("sample_id").peptide.nunique()) print("") print("Counts by sample type:") - print(df.groupby("sample_type").peptide.nunique()) + print(ms_df.groupby("sample_type").peptide.nunique()) print("****************************") - dfs.append(df) + for value in ms_df.expression_dataset.unique(): + if value and value not in expression_df.columns: + raise ValueError("No such expression dataset", value) - df = pandas.concat(dfs, ignore_index=True, sort=False) + ms_dfs.append(ms_df) - df["cell_line"] = df["cell_line"].fillna("") + ms_df = pandas.concat(ms_dfs, ignore_index=True, sort=False) + ms_df["cell_line"] = ms_df["cell_line"].fillna("") - cols = ["pmid", "sample_id", "peptide", "format", "mhc_class", "hla", ] - cols += [c for c in sorted(df.columns) if c not in cols] - df = df[cols] + sample_table = ms_df[ + ["sample_id", "pmid", "expression_dataset", "cell_line", "sample_type"] + ].drop_duplicates().set_index("sample_id") - null_df = df.loc[df.isnull().any(1)] + sample_id_to_expression_dataset = sample_table.expression_dataset.to_dict() + for (sample_id, value) in sorted(sample_id_to_expression_dataset.items()): + if value: + print("Expression dataset for sample", sample_id, "already assigned") + continue + cell_line_col = "cell_line:" + sample_table.loc[sample_id, "cell_line"] + sample_type_col = "sample_type:" + ( + sample_table.loc[sample_id, "sample_type"]) + + expression_dataset = None + for col in [cell_line_col, sample_type_col]: + if col in expression_df.columns: + expression_dataset = col + break + + if not expression_dataset: + print("*" * 20) + print("No expression dataset for sample ", sample_id) + print("Sample info:") + print(sample_table.loc[sample_id]) + print("*" * 20) + + sample_id_to_expression_dataset[sample_id] = expression_dataset + print( + "Sample", sample_id, "assigned exp. dataset", expression_dataset) + + print("Expression dataset usage:") + print(pandas.Series(sample_id_to_expression_dataset).value_counts()) + + missing = [ + key for (key, value) in + sample_id_to_expression_dataset.items() + if value is None + ] + if missing: + print("Missing expression data for samples", *missing) + print( + "Missing cell lines: ", + *sample_table.loc[missing, "cell_line"].dropna().drop_duplicates().tolist()) + print("Missing sample types: ", *sample_table.loc[ + missing, "sample_type"].dropna().drop_duplicates().tolist()) + if args.debug: + import ipdb; ipdb.set_trace() + else: + raise ValueError("Missing expression data for samples: ", missing) + + ms_df["expression_dataset"] = ms_df.sample_id.map( + sample_id_to_expression_dataset) + + cols = [ + "pmid", + "sample_id", + "peptide", + "format", + "mhc_class", + "hla", + "expression_dataset", + ] + cols += [c for c in sorted(ms_df.columns) if c not in cols] + ms_df = ms_df[cols] + + null_df = ms_df.loc[ms_df.isnull().any(1)] if len(null_df) > 0: print("Nulls:") print(null_df) @@ -673,11 +1017,14 @@ def run(): print("No nulls.") # Each sample should be coming from only one experiment. - assert df.groupby("sample_id").pmid.nunique().max() == 1, ( - df.groupby("sample_id").pmid.nunique().sort_values()) + assert ms_df.groupby("sample_id").pmid.nunique().max() == 1, ( + ms_df.groupby("sample_id").pmid.nunique().sort_values()) + + expression_df.to_csv(args.expression_out, index=True) + print("Wrote: %s" % os.path.abspath(args.expression_out)) - df.to_csv(args.out, index=False) - print("Wrote: %s" % os.path.abspath(args.out)) + ms_df.to_csv(args.ms_out, index=False) + print("Wrote: %s" % os.path.abspath(args.ms_out)) if __name__ == '__main__': run() diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh index e566526a095f4b70d97597290fe8413ec4382cf5..f10b85b6563d2000e06bd87139a62947d1e15f46 100755 --- a/downloads-generation/data_published/GENERATE.sh +++ b/downloads-generation/data_published/GENERATE.sh @@ -33,7 +33,7 @@ wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.200 wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt -mkdir raw +mkdir ms ############################################ # MS: Multiallelic class I @@ -43,10 +43,10 @@ mkdir raw # Pearson, ..., Perreault JCI 2016 [PMID 27841757] # but was reanalyzed in this work, and we download the reanalyzed version here. PMID=28832583 -mkdir -p raw/$PMID -wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P raw/$PMID # data generated in this work -wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P raw/$PMID # data reanalyzed in this work -cd raw/$PMID +mkdir -p ms/$PMID +wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P ms/$PMID # data generated in this work +wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P ms/$PMID # data reanalyzed in this work +cd ms/$PMID unzip *.s002 unzip *.s003 mkdir saved @@ -58,33 +58,33 @@ cd ../.. # Bassani-Sternberg, ..., Mann Mol Cell Proteomics 2015 [PMID 25576301] PMID=25576301 -mkdir -p raw/$PMID -wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P raw/$PMID +mkdir -p ms/$PMID +wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P ms/$PMID # Mommen, ..., Heck PNAS 2014 [PMID 24616531] PMID=24616531 -mkdir -p raw/$PMID -wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P raw/$PMID +mkdir -p ms/$PMID +wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P ms/$PMID # Gloger, ..., Neri Cancer Immunol Immunother 2016 [PMID 27600516] # Data extracted from supplemental PDF table. PMID=27600516 -mkdir -p raw/$PMID -wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID +mkdir -p ms/$PMID +wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P ms/$PMID # Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070] # Supplemental zip downloaded from publication PMID=26992070 -mkdir -p raw/$PMID -wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID -cd raw/$PMID +mkdir -p ms/$PMID +wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P ms/$PMID +cd ms/$PMID unzip pmic12297-sup-0001-supinfo.zip cd ../.. # Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690] PMID=27412690 -mkdir -p raw/$PMID -wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID +mkdir -p ms/$PMID +wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P ms/$PMID # Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757] # Note: we do not use the original data from this publicaton, we use 28832583's reanalysis of it. @@ -92,17 +92,59 @@ wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1 # Hassan, ..., van Veelen Mol Cell Proteomics 2015 [PMID 23481700] PMID=23481700 -mkdir -p raw/$PMID -wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P raw/$PMID +mkdir -p ms/$PMID +wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P ms/$PMID + +# Shraibman, ..., Admon Mol Cell Proteomics 2019 [PMID 31154438] +PMID=31154438 +mkdir -p ms/$PMID +wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/3/zjw006195963st2.txt -P ms/$PMID +wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/1/zjw006195963st1.xlsx -P ms/$PMID + +# Bassani-Sternberg, ..., Krackhardt Nature Comm. 2016 [PMID 27869121] +PMID=27869121 +mkdir -p ms/$PMID +wget -q "https://static-content.springer.com/esm/art%3A10.1038%2Fncomms13404/MediaObjects/41467_2016_BFncomms13404_MOESM1318_ESM.xlsx" -P ms/$PMID ############################################ # MS: Monoallelic class II ############################################ # Abelin, ..., Rooney Immunity 2019 [PMID 31495665] PMID=31495665 -mkdir -p raw/$PMID -wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P raw/$PMID +mkdir -p ms/$PMID +wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P ms/$PMID +############################################ +# RNA-seq expression data (TPMs) +############################################ +# CCLE as processed by expression atlas +DATASET=expression-atlas-22460905 +mkdir -p expression/$DATASET +wget -q https://www.ebi.ac.uk/gxa/experiments-content/E-MTAB-2770/resources/ExperimentDownloadSupplier.RnaSeqBaseline/tpms.tsv -P expression/$DATASET + +# Human protein atlas +DATASET=human-protein-atlas +mkdir -p expression/$DATASET +cd expression/$DATASET +wget -q https://www.proteinatlas.org/download/rna_celline.tsv.zip +wget -q https://www.proteinatlas.org/download/rna_blood_cell_sample_tpm_m.tsv.zip +wget -q https://www.proteinatlas.org/download/rna_tissue_gtex.tsv.zip +for i in $(ls *.zip) +do + unzip $i + rm $i +done +cd ../.. + +# Melanoma. Original publication +# Barry, ..., Krummel Nature Medicine 2018 [PMID 29942093]. +DATASET=GSE113126 +mkdir -p expression/$DATASET +cd expression/$DATASET +wget -q "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE113126&format=file" -O GSE113126_RAW.tar +tar -xvf GSE113126_RAW.tar +rm GSE113126_RAW.tar +cd ../.. cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt diff --git a/downloads-generation/models_class1_pan_variants/GENERATE.sh b/downloads-generation/models_class1_pan_variants/GENERATE.sh index bda713e2a02bc4a994d62c67dfbc823027f1d792..48da9927a6e8c9666c25d6004a3e77dfd899585b 100755 --- a/downloads-generation/models_class1_pan_variants/GENERATE.sh +++ b/downloads-generation/models_class1_pan_variants/GENERATE.sh @@ -62,15 +62,15 @@ if [ "$2" != "continue-incomplete" ] then cp $SCRIPT_DIR/generate_hyperparameters.production.py . cp $SCRIPT_DIR/generate_hyperparameters.py . - python generate_hyperparameters.production.py > hyperparameters.production.json - python generate_hyperparameters.py hyperparameters.production.json no_pretrain > hyperparameters.no_pretrain.yaml + python generate_hyperparameters.production.py > hyperparameters.production.yaml + python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml fi for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence do CONTINUE_INCOMPLETE_ARGS="" - if [ "$2" == "continue-incomplete" ] && [ -d "models.${kind}" ] + if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ] then echo "Will continue existing run: $kind" CONTINUE_INCOMPLETE_ARGS="--continue-incomplete" diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index 4089fb7882f29c257546807aa68a4057a348e609..36eff63397f7aee1dd53600a08ccad0b0f27d7e6 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -37,10 +37,6 @@ releases: url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2 default: false - - name: data_expression - url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_expression.20191009.tar.bz2 - default: false - - name: data_references url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2 default: false @@ -62,7 +58,7 @@ releases: default: false - name: data_published - url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190924.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_published.20191011.tar.bz2 default: false - name: data_curated