Commit 9152bada authored by Tim O'Donnell

fixes

parent b7087aa8
@@ -49,7 +49,8 @@ done
 time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \
     --ms-out ms.nontraining_curated.by_pmid.csv \
-    --expression-out rna_expression.csv
+    --expression-out rna_expression.csv \
+    --expression-metadata-out rna_expression.metadata.csv
 bzip2 ms.nontraining_curated.by_pmid.csv
 bzip2 rna_expression.csv
...
@@ -5,6 +5,7 @@ optionally including eluted peptides identified by mass-spec.
 import sys
 import argparse
 import os
+import json
 import collections
 from six.moves import StringIO
@@ -44,6 +45,10 @@ parser.add_argument(
     "--expression-out",
     metavar="OUT.csv",
     help="Out file path (RNA-seq expression)")
+parser.add_argument(
+    "--expression-metadata-out",
+    metavar="OUT.csv",
+    help="Out file path for expression metadata, i.e. which samples used")
 parser.add_argument(
     "--debug",
     action="store_true",
@@ -421,6 +426,42 @@ def handle_pmid_28832583(*filenames):
     return result_df


+PMID_31495665_SAMPLE_TYPES = {
+    "HLA-DR_Lung": "lung",
+    "HLA-DR_PBMC_HDSC": "pbmc",
+    "HLA-DR_PBMC_RG1095": "pbmc",
+    "HLA-DR_PBMC_RG1104": "pbmc",
+    "HLA-DR_PBMC_RG1248": "pbmc",
+    "HLA-DR_Spleen": "spleen",
+    "MAPTAC_A*02:01": "mix:a375,expi293,hek293,hela",
+    "MAPTAC_A*11:01": "mix:expi293,hela",
+    "MAPTAC_A*32:01": "mix:a375,expi293,hela",
+    "MAPTAC_B*07:02": "mix:a375,expi293,hela",
+    "MAPTAC_B*45:01": "expi293",
+    "MAPTAC_B*52:01": "mix:a375,expi293",
+    "MAPTAC_C*03:03": "expi293",
+    "MAPTAC_C*06:02": "mix:a375,expi293",
+    "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
+    "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
+    "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293",
+    "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
+    "MAPTAC_DRB1*01:01": "mix:a375,b721,expi293,kg1,k562",
+    "MAPTAC_DRB1*03:01": "expi293",
+    "MAPTAC_DRB1*04:01": "expi293",
+    "MAPTAC_DRB1*07:01": "mix:expi293,hek293",
+    "MAPTAC_DRB1*11:01": "mix:expi293,k562,kg1",
+    "MAPTAC_DRB1*12:01_dm+": "expi293",
+    "MAPTAC_DRB1*12:01_dm-": "expi293",
+    "MAPTAC_DRB1*15:01": "expi293",
+    "MAPTAC_DRB3*01:01_dm+": "expi293",
+    "MAPTAC_DRB3*01:01_dm-": "expi293",
+}
+
+CELL_LINE_MIXTURES = sorted(
+    set(
+        x for x in PMID_31495665_SAMPLE_TYPES.values()
+        if x.startswith("mix:")))
+
+
 def handle_pmid_31495665(filename):
     """Abelin, ..., Rooney Immunity 2019 [PMID 31495665]"""
     hla_type = {
@@ -561,56 +602,27 @@ def handle_pmid_31495665(filename):
         "MAPTAC_A*11:01": "",
         "MAPTAC_A*32:01": "",
         "MAPTAC_B*07:02": "",
-        "MAPTAC_B*45:01": "",
+        "MAPTAC_B*45:01": "expi293",
         "MAPTAC_B*52:01": "",
-        "MAPTAC_C*03:03": "",
+        "MAPTAC_C*03:03": "expi293",
         "MAPTAC_C*06:02": "",
         "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
         "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
         "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293", # don't actually see this in DataS1A!
         "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
         "MAPTAC_DRB1*01:01": "",
-        "MAPTAC_DRB1*03:01": "",
-        "MAPTAC_DRB1*04:01": "",
+        "MAPTAC_DRB1*03:01": "expi293",
+        "MAPTAC_DRB1*04:01": "expi293",
         "MAPTAC_DRB1*07:01": "",
         "MAPTAC_DRB1*11:01": "",
-        "MAPTAC_DRB1*12:01_dm+": "",
-        "MAPTAC_DRB1*12:01_dm-": "",
-        "MAPTAC_DRB1*15:01": "",
-        "MAPTAC_DRB3*01:01_dm+": "",
-        "MAPTAC_DRB3*01:01_dm-": "",
+        "MAPTAC_DRB1*12:01_dm+": "expi293",
+        "MAPTAC_DRB1*12:01_dm-": "expi293",
+        "MAPTAC_DRB1*15:01": "expi293",
+        "MAPTAC_DRB3*01:01_dm+": "expi293",
+        "MAPTAC_DRB3*01:01_dm-": "expi293",
     }
-    sample_type = {
-        "HLA-DR_Lung": "lung",
-        "HLA-DR_PBMC_HDSC": "lung",
-        "HLA-DR_PBMC_RG1095": "lung",
-        "HLA-DR_PBMC_RG1104": "lung",
-        "HLA-DR_PBMC_RG1248": "lung",
-        "HLA-DR_Spleen": "spleen",
-        "MAPTAC_A*02:01": "mixed",
-        "MAPTAC_A*11:01": "mixed",
-        "MAPTAC_A*32:01": "mixed",
-        "MAPTAC_B*07:02": "mixed",
-        "MAPTAC_B*45:01": "mixed",
-        "MAPTAC_B*52:01": "mixed",
-        "MAPTAC_C*03:03": "mixed",
-        "MAPTAC_C*06:02": "mixed",
-        "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "mixed",
-        "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "mixed",
-        "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "mixed",
-        "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "mixed",
-        "MAPTAC_DRB1*01:01": "mixed",
-        "MAPTAC_DRB1*03:01": "mixed",
-        "MAPTAC_DRB1*04:01": "mixed",
-        "MAPTAC_DRB1*07:01": "mixed",
-        "MAPTAC_DRB1*11:01": "mixed",
-        "MAPTAC_DRB1*12:01_dm+": "mixed",
-        "MAPTAC_DRB1*12:01_dm-": "mixed",
-        "MAPTAC_DRB1*15:01": "mixed",
-        "MAPTAC_DRB3*01:01_dm+": "mixed",
-        "MAPTAC_DRB3*01:01_dm-": "mixed",
-    }
     df = pandas.read_excel(filename, sheet_name="DataS1B")
     results = []
     for sample_id in df.columns:
@@ -626,7 +638,7 @@ def handle_pmid_31495665(filename):
         result_df["pulldown_antibody"] = pulldown_antibody[sample_id]
         result_df["format"] = format[sample_id]
         result_df["mhc_class"] = mhc_class[sample_id]
-        result_df["sample_type"] = sample_type[sample_id]
+        result_df["sample_type"] = PMID_31495665_SAMPLE_TYPES[sample_id]
         result_df["cell_line"] = cell_line[sample_id]
         results.append(result_df)
     result_df = pandas.concat(results, ignore_index=True)
@@ -745,11 +757,18 @@ def handle_pmid_31154438(*filenames):
     result_df = pandas.concat(results, ignore_index=True)
     return result_df


-def expression_groups(dataset_identifier, df, groups):
+EXPRESSION_GROUPS_ROWS = []
+
+
+def make_expression_groups(dataset_identifier, df, groups):
     result_df = pandas.DataFrame(index=df.index)
     for (label, columns) in groups.items():
+        for col in columns:
+            if col not in df.columns:
+                raise ValueError(
+                    "Missing: %s. Available: %s" % (col, df.columns.tolist()))
         result_df[label] = df[columns].mean(1)
+        EXPRESSION_GROUPS_ROWS.append((dataset_identifier, label, columns))
     return result_df
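For reference, the new make_expression_groups helper just averages the named columns of a genes-by-samples matrix for each label, after checking that every requested column exists. A minimal standalone sketch of that behavior (toy gene and sample names invented here; the EXPRESSION_GROUPS_ROWS bookkeeping is left out):

import pandas

# Toy expression matrix: rows are genes, columns are samples.
df = pandas.DataFrame(
    {"HeLa_rep1": [10.0, 0.0], "HeLa_rep2": [14.0, 2.0], "K-562": [5.0, 7.0]},
    index=["GENE_A", "GENE_B"])

groups = {
    "cell_line:HELA": ["HeLa_rep1", "HeLa_rep2"],
    "cell_line:K562": ["K-562"],
}

result_df = pandas.DataFrame(index=df.index)
for (label, columns) in groups.items():
    missing = [c for c in columns if c not in df.columns]
    if missing:
        raise ValueError("Missing: %s" % missing)
    result_df[label] = df[columns].mean(1)  # row-wise mean across the group's samples

print(result_df)
#         cell_line:HELA  cell_line:K562
# GENE_A            12.0             5.0
# GENE_B             1.0             7.0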
@@ -773,7 +792,7 @@ def handle_expression_GSE113126(*filenames):
     groups = {
         "sample_type:MELANOMA_MET": df.columns.tolist(),
     }
-    return expression_groups("GSE113126", df, groups)
+    return [make_expression_groups("GSE113126", df, groups)]


 def handle_expression_expression_atlas_22460905(filename):
@@ -785,13 +804,17 @@ def handle_expression_expression_atlas_22460905(filename):
     def matches(*strings):
         return [c for c in df.columns if all(s in c for s in strings)]

-    import ipdb ; ipdb.set_trace()
     groups = {
         "sample_type:B-LCL": (
             matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")),
         "sample_type:B-CELL": matches("b-cell"),
+        "sample_type:B721-LIKE": matches("b-cell"),
         "sample_type:MELANOMA_CELL_LINE": matches("melanoma"),
+        "sample_type:A375-LIKE": matches("melanoma"),
+        "sample_type:KG1-LIKE": matches("myeloid leukemia"),
+        # Using a fibrosarcoma cell line for our fibroblast sample.
+        "sample_type:FIBROBLAST": ['fibrosarcoma, ht-1080'],
         # For GBM tissue we are just using a mixture of cell lines.
         "sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"),
@@ -807,8 +830,7 @@ def handle_expression_expression_atlas_22460905(filename):
         "cell_line:HCT116": ['colon carcinoma, hct 116'],
         "cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'],
     }
-    return expression_groups("expression_atlas_22460905", df, groups)
+    return [make_expression_groups("expression_atlas_22460905", df, groups)]


 def handle_expression_human_protein_atlas(*filenames):
@@ -826,27 +848,67 @@ def handle_expression_human_protein_atlas(*filenames):
     gtex_df = gtex_df.pivot(
         index="Gene", columns="Tissue", values="TPM")

-    result_df = pandas.DataFrame(index=cell_line_df.index)
-
-    result_df["sample_type:pbmc"] = blood_df[
-        [c for c in blood_df.columns if "total PBMC" in c]
-    ].mean(1)
-
-    result_df["cell_line:HEK293"] = cell_line_df['HEK 293']
-    result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226']
-    # EXPI293 is based off HEK293
-    result_df["cell_line:EXPI293"] = cell_line_df['HEK 293']
-
-    # For leukapheresis we use pbmc sample
-    result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"]
-
-    for tissue in ["lung", "spleen"]:
-        result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue]
-    return result_df
+    return [
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(blood_filename),
+            blood_df,
+            groups={
+                "sample_type:PBMC": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+                # for samples labeled leukapheresis we also use PBMC
+                "sample_type:LEUKAPHERESIS": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+                # for samples labeled TIL we are also using PBMC
+                "sample_type:TIL": [
+                    c for c in blood_df.columns if "total PBMC" in c
+                ],
+            }),
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(cell_line_filename),
+            cell_line_df,
+            groups={
+                "cell_line:HELA": ['HeLa'],
+                "cell_line:K562": ["K-562"],
+                "cell_line:HEK293": ['HEK 293'],
+                "cell_line:RPMI8226": ['RPMI-8226'],
+                "cell_line:EXPI293": ['HEK 293'], # EXPI293 derived from HEK293
+            }),
+        make_expression_groups(
+            "human_protein_atlas:%s" % os.path.basename(gtex_filename),
+            gtex_df,
+            groups={
+                "sample_type:LUNG": ["lung"],
+                "sample_type:SPLEEN": ["spleen"],
+            }),
+    ]
+
+
+def make_expression_mixtures(expression_df):
+    global CELL_LINE_MIXTURES
+    groups = {}
+    for mix in CELL_LINE_MIXTURES:
+        components = []
+        for item in mix.replace("mix:", "").upper().split(","):
+            if "cell_line:%s" % item in expression_df.columns:
+                components.append("cell_line:%s" % item)
+            else:
+                print("No cell line, falling back on similar: ", item)
+                components.append("sample_type:%s-LIKE" % item)
+        groups["sample_type:" + mix.upper()] = components
+
+    missing = set()
+    for some in groups.values():
+        for item in some:
+            if item not in expression_df.columns:
+                missing.add(item)
+    if missing:
+        raise ValueError(
+            "Missing [%d]: %s. Available: %s" % (
+                len(missing), missing, expression_df.columns.tolist()))
+    return make_expression_groups("mixtures", expression_df, groups)


 # Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict.
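The mixture handling ties back to the "mix:" sample types defined near the top of the diff: for each cell line named in a mixture, make_expression_mixtures uses the matching cell_line column of the merged expression matrix when one exists, and otherwise falls back on one of the *-LIKE pseudo sample types added in the expression atlas handler. A rough standalone sketch for one mixture (the available_columns list below is assembled by hand from the handlers in this diff, purely for illustration):

mix = "mix:a375,expi293,hek293,hela"  # e.g. the MAPTAC_A*02:01 sample
available_columns = [
    "cell_line:EXPI293", "cell_line:HEK293", "cell_line:HELA",
    "sample_type:A375-LIKE",  # no A375 RNA-seq, so the melanoma cell line proxy is used
]

components = []
for item in mix.replace("mix:", "").upper().split(","):
    name = "cell_line:%s" % item
    # Fall back on the *-LIKE pseudo sample type when no cell line column exists.
    components.append(
        name if name in available_columns else "sample_type:%s-LIKE" % item)

print(components)
# ['sample_type:A375-LIKE', 'cell_line:EXPI293', 'cell_line:HEK293', 'cell_line:HELA']
# These columns are then averaged under the label "sample_type:MIX:A375,EXPI293,HEK293,HELA".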
@@ -869,36 +931,44 @@ def run():
             label,
             *[os.path.abspath(f) for f in filenames])

-        expression_df = None
+        expression_dfs_for_item = []
         handler = None
         if label in EXPRESSION_HANDLERS:
             handler = EXPRESSION_HANDLERS[label]
-            expression_df = handler(*filenames)
+            expression_dfs_for_item = handler(*filenames)
         elif args.debug:
             debug(*filenames)
         else:
             raise NotImplementedError(label)

-        if expression_df is not None:
+        if expression_dfs_for_item:
             print(
                 "Processed expression data",
                 label,
-                "with shape",
-                expression_df.shape)
-            print(*expression_df.columns)
-            expression_dfs.append(expression_df)
+                "result dataframes",
+                len(expression_dfs_for_item))
+            print(*[e.columns for e in expression_dfs_for_item])
+            expression_dfs.extend(expression_dfs_for_item)

     expression_df = expression_dfs[0]
     for other in expression_dfs[1:]:
         expression_df = pandas.merge(
             expression_df, other, how='outer', left_index=True, right_index=True)
-    expression_df = expression_df.fillna(0)
-    print(
-        "Genes in each expression dataframe: ",
+    print("Genes in each expression dataframe: ",
         *[len(e) for e in expression_dfs])
     print("Genes in merged expression dataframe", len(expression_df))

+    if CELL_LINE_MIXTURES:
+        print("Generating cell line mixtures.")
+        expression_mixture_df = make_expression_mixtures(expression_df)
+        expression_df = pandas.merge(
+            expression_df,
+            expression_mixture_df,
+            how='outer',
+            left_index=True,
+            right_index=True)
+
     ms_dfs = []
     for (i, item_tpl) in enumerate(args.ms_item):
         (pmid, filenames) = (item_tpl[0], item_tpl[1:])
@@ -1026,5 +1096,15 @@ def run():
     ms_df.to_csv(args.ms_out, index=False)
     print("Wrote: %s" % os.path.abspath(args.ms_out))

+    if args.expression_metadata_out is not None:
+        expression_metadata_df = pandas.DataFrame(
+            EXPRESSION_GROUPS_ROWS,
+            columns=["expression_dataset", "label", "samples"])
+        expression_metadata_df["samples"] = expression_metadata_df[
+            "samples"
+        ].map(json.dumps)
+        expression_metadata_df.to_csv(args.expression_metadata_out, index=False)
+        print("Wrote: %s" % os.path.abspath(args.expression_metadata_out))
+

 if __name__ == '__main__':
     run()
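The new --expression-metadata-out file is a small CSV recording, for every expression column that was built, which source samples went into it; the samples column is JSON-encoded so the list survives the CSV round trip. A toy illustration of that encoding (row contents invented for this example):

import json
import pandas

# Hypothetical rows of the kind collected in EXPRESSION_GROUPS_ROWS.
rows = [
    ("toy_dataset", "cell_line:HELA", ["HeLa_rep1", "HeLa_rep2"]),
    ("toy_dataset", "sample_type:PBMC", ["donor 1 total PBMC"]),
]
metadata_df = pandas.DataFrame(
    rows, columns=["expression_dataset", "label", "samples"])
# Python lists don't round-trip through CSV cleanly, hence json.dumps.
metadata_df["samples"] = metadata_df["samples"].map(json.dumps)
print(metadata_df.to_csv(index=False))
# expression_dataset,label,samples
# toy_dataset,cell_line:HELA,"[""HeLa_rep1"", ""HeLa_rep2""]"
# toy_dataset,sample_type:PBMC,"[""donor 1 total PBMC""]"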
@@ -27,7 +27,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 cp $SCRIPT_DIR/annotate.py .

-PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+PEPTIDES=$(mhcflurry-downloads path data_curated)/ms.nontraining_curated.by_pmid.csv.bz2
 REFERENCES_DIR=$(mhcflurry-downloads path data_references)

 python annotate.py \
...
@@ -62,7 +62,7 @@ releases:
         default: false
       - name: data_curated
-        url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190927.tar.bz2
+        url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2
         default: true

 # Older downloads
...