Skip to content
Snippets Groups Projects
Commit 6e2ee8bb authored by Tim O'Donnell
Browse files

fixes

parent f6349216
No related branches found
No related tags found
No related merge requests found
......@@ -49,7 +49,8 @@ done
time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \
--ms-out ms.nontraining_curated.by_pmid.csv \
--expression-out rna_expression.csv
--expression-out rna_expression.csv \
--expression-metadata-out rna_expression.metadata.csv
bzip2 ms.nontraining_curated.by_pmid.csv
bzip2 rna_expression.csv
......
......@@ -5,6 +5,7 @@ optionally including eluted peptides identified by mass-spec.
import sys
import argparse
import os
import json
import collections
from six.moves import StringIO
......@@ -44,6 +45,10 @@ parser.add_argument(
"--expression-out",
metavar="OUT.csv",
help="Out file path (RNA-seq expression)")
# Optional output path. When given, run() writes a CSV built from
# EXPRESSION_GROUPS_ROWS with columns "expression_dataset", "label" and
# "samples" (the sample list is JSON-encoded), recording which source
# samples were used for each expression group.
parser.add_argument(
"--expression-metadata-out",
metavar="OUT.csv",
help="Out file path for expression metadata, i.e. which samples used")
parser.add_argument(
"--debug",
action="store_true",
......@@ -421,6 +426,42 @@ def handle_pmid_28832583(*filenames):
return result_df
# Sample id -> sample type for the PMID 31495665 dataset. The handler for
# that PMID copies the value into the "sample_type" column of its result.
# A value of the form "mix:<name>,<name>,..." lists several cell lines by
# name instead of a single sample type.
PMID_31495665_SAMPLE_TYPES = {
    # Sample ids prefixed "HLA-DR_".
    "HLA-DR_Lung": "lung",
    "HLA-DR_PBMC_HDSC": "pbmc",
    "HLA-DR_PBMC_RG1095": "pbmc",
    "HLA-DR_PBMC_RG1104": "pbmc",
    "HLA-DR_PBMC_RG1248": "pbmc",
    "HLA-DR_Spleen": "spleen",

    # Sample ids prefixed "MAPTAC_".
    "MAPTAC_A*02:01": "mix:a375,expi293,hek293,hela",
    "MAPTAC_A*11:01": "mix:expi293,hela",
    "MAPTAC_A*32:01": "mix:a375,expi293,hela",
    "MAPTAC_B*07:02": "mix:a375,expi293,hela",
    "MAPTAC_B*45:01": "expi293",
    "MAPTAC_B*52:01": "mix:a375,expi293",
    "MAPTAC_C*03:03": "expi293",
    "MAPTAC_C*06:02": "mix:a375,expi293",
    "MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
    "MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
    "MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293",
    "MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
    "MAPTAC_DRB1*01:01": "mix:a375,b721,expi293,kg1,k562",
    "MAPTAC_DRB1*03:01": "expi293",
    "MAPTAC_DRB1*04:01": "expi293",
    "MAPTAC_DRB1*07:01": "mix:expi293,hek293",
    "MAPTAC_DRB1*11:01": "mix:expi293,k562,kg1",
    "MAPTAC_DRB1*12:01_dm+": "expi293",
    "MAPTAC_DRB1*12:01_dm-": "expi293",
    "MAPTAC_DRB1*15:01": "expi293",
    "MAPTAC_DRB3*01:01_dm+": "expi293",
    "MAPTAC_DRB3*01:01_dm-": "expi293",
}

# Distinct "mix:..." sample types from the table above, in sorted order.
# make_expression_mixtures() builds one expression column per entry.
CELL_LINE_MIXTURES = sorted({
    sample_type
    for sample_type in PMID_31495665_SAMPLE_TYPES.values()
    if sample_type.startswith("mix:")
})
def handle_pmid_31495665(filename):
"""Abelin, ..., Rooney Immunity 2019 [PMID 31495665]"""
hla_type = {
......@@ -561,56 +602,27 @@ def handle_pmid_31495665(filename):
"MAPTAC_A*11:01": "",
"MAPTAC_A*32:01": "",
"MAPTAC_B*07:02": "",
"MAPTAC_B*45:01": "",
"MAPTAC_B*45:01": "expi293",
"MAPTAC_B*52:01": "",
"MAPTAC_C*03:03": "",
"MAPTAC_C*03:03": "expi293",
"MAPTAC_C*06:02": "",
"MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "expi293",
"MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "expi293",
"MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "expi293", # don't actually see this in DataS1A!
"MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "expi293",
"MAPTAC_DRB1*01:01": "",
"MAPTAC_DRB1*03:01": "",
"MAPTAC_DRB1*04:01": "",
"MAPTAC_DRB1*03:01": "expi293",
"MAPTAC_DRB1*04:01": "expi293",
"MAPTAC_DRB1*07:01": "",
"MAPTAC_DRB1*11:01": "",
"MAPTAC_DRB1*12:01_dm+": "",
"MAPTAC_DRB1*12:01_dm-": "",
"MAPTAC_DRB1*15:01": "",
"MAPTAC_DRB3*01:01_dm+": "",
"MAPTAC_DRB3*01:01_dm-": "",
}
sample_type = {
"HLA-DR_Lung": "lung",
"HLA-DR_PBMC_HDSC": "lung",
"HLA-DR_PBMC_RG1095": "lung",
"HLA-DR_PBMC_RG1104": "lung",
"HLA-DR_PBMC_RG1248": "lung",
"HLA-DR_Spleen": "spleen",
"MAPTAC_A*02:01": "mixed",
"MAPTAC_A*11:01": "mixed",
"MAPTAC_A*32:01": "mixed",
"MAPTAC_B*07:02": "mixed",
"MAPTAC_B*45:01": "mixed",
"MAPTAC_B*52:01": "mixed",
"MAPTAC_C*03:03": "mixed",
"MAPTAC_C*06:02": "mixed",
"MAPTAC_DPB1*06:01/DPA1*01:03_dm+": "mixed",
"MAPTAC_DPB1*06:01/DPA1*01:03_dm-": "mixed",
"MAPTAC_DQB1*06:04/DQA1*01:02_dm+": "mixed",
"MAPTAC_DQB1*06:04/DQA1*01:02_dm-": "mixed",
"MAPTAC_DRB1*01:01": "mixed",
"MAPTAC_DRB1*03:01": "mixed",
"MAPTAC_DRB1*04:01": "mixed",
"MAPTAC_DRB1*07:01": "mixed",
"MAPTAC_DRB1*11:01": "mixed",
"MAPTAC_DRB1*12:01_dm+": "mixed",
"MAPTAC_DRB1*12:01_dm-": "mixed",
"MAPTAC_DRB1*15:01": "mixed",
"MAPTAC_DRB3*01:01_dm+": "mixed",
"MAPTAC_DRB3*01:01_dm-": "mixed",
"MAPTAC_DRB1*12:01_dm+": "expi293",
"MAPTAC_DRB1*12:01_dm-": "expi293",
"MAPTAC_DRB1*15:01": "expi293",
"MAPTAC_DRB3*01:01_dm+": "expi293",
"MAPTAC_DRB3*01:01_dm-": "expi293",
}
df = pandas.read_excel(filename, sheet_name="DataS1B")
results = []
for sample_id in df.columns:
......@@ -626,7 +638,7 @@ def handle_pmid_31495665(filename):
result_df["pulldown_antibody"] = pulldown_antibody[sample_id]
result_df["format"] = format[sample_id]
result_df["mhc_class"] = mhc_class[sample_id]
result_df["sample_type"] = sample_type[sample_id]
result_df["sample_type"] = PMID_31495665_SAMPLE_TYPES[sample_id]
result_df["cell_line"] = cell_line[sample_id]
results.append(result_df)
result_df = pandas.concat(results, ignore_index=True)
......@@ -745,11 +757,18 @@ def handle_pmid_31154438(*filenames):
result_df = pandas.concat(results, ignore_index=True)
return result_df
EXPRESSION_GROUPS_ROWS = []
def expression_groups(dataset_identifier, df, groups):
def make_expression_groups(dataset_identifier, df, groups):
    """
    Average columns of an expression dataframe into labeled groups.

    Parameters
    ----------
    dataset_identifier : str
        Name recorded alongside each group in EXPRESSION_GROUPS_ROWS.
    df : pandas.DataFrame
        Source dataframe; its index is reused for the result.
    groups : dict of str -> list
        Maps each output column label to the columns of `df` to average.

    Returns
    -------
    pandas.DataFrame
        One column per label, holding the row-wise mean of that label's
        source columns.

    Raises
    ------
    ValueError
        If a group names a column that is not present in `df`.
    """
    grouped_df = pandas.DataFrame(index=df.index)
    for label, members in groups.items():
        # Validate before averaging so the error names the offending column.
        absent = [member for member in members if member not in df.columns]
        if absent:
            raise ValueError(
                "Missing: %s. Available: %s" % (
                    absent[0], df.columns.tolist()))
        grouped_df[label] = df[members].mean(axis=1)
        # Record which samples fed this group, for the metadata output.
        EXPRESSION_GROUPS_ROWS.append((dataset_identifier, label, members))
    return grouped_df
......@@ -773,7 +792,7 @@ def handle_expression_GSE113126(*filenames):
groups = {
"sample_type:MELANOMA_MET": df.columns.tolist(),
}
return expression_groups("GSE113126", df, groups)
return [make_expression_groups("GSE113126", df, groups)]
def handle_expression_expression_atlas_22460905(filename):
......@@ -785,13 +804,17 @@ def handle_expression_expression_atlas_22460905(filename):
def matches(*strings):
return [c for c in df.columns if all(s in c for s in strings)]
import ipdb ; ipdb.set_trace()
groups = {
"sample_type:B-LCL": (
matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")),
"sample_type:B-CELL": matches("b-cell"),
"sample_type:B721-LIKE": matches("b-cell"),
"sample_type:MELANOMA_CELL_LINE": matches("melanoma"),
"sample_type:A375-LIKE": matches("melanoma"),
"sample_type:KG1-LIKE": matches("myeloid leukemia"),
# Using a fibrosarcoma cell line for our fibroblast sample.
"sample_type:FIBROBLAST": ['fibrosarcoma, ht-1080'],
# For GBM tissue we are just using a mixture of cell lines.
"sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"),
......@@ -807,8 +830,7 @@ def handle_expression_expression_atlas_22460905(filename):
"cell_line:HCT116": ['colon carcinoma, hct 116'],
"cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'],
}
return expression_groups("expression_atlas_22460905", df, groups)
return [make_expression_groups("expression_atlas_22460905", df, groups)]
def handle_expression_human_protein_atlas(*filenames):
......@@ -826,27 +848,67 @@ def handle_expression_human_protein_atlas(*filenames):
gtex_df = gtex_df.pivot(
index="Gene", columns="Tissue", values="TPM")
result_df = pandas.DataFrame(index=cell_line_df.index)
result_df["sample_type:pbmc"] = blood_df[
[c for c in blood_df.columns if "total PBMC" in c]
].mean(1)
result_df["cell_line:HEK293"] = cell_line_df['HEK 293']
result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226']
# EXPI293 is based off HEK293
result_df["cell_line:EXPI293"] = cell_line_df['HEK 293']
# For leukapheresis we use pbmc sample
result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"]
for tissue in ["lung", "spleen"]:
result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue]
return result_df
return [
make_expression_groups(
"human_protein_atlas:%s" % os.path.basename(blood_filename),
blood_df,
groups={
"sample_type:PBMC": [
c for c in blood_df.columns if "total PBMC" in c
],
# for samples labeled leukapheresis we also use PBMC
"sample_type:LEUKAPHERESIS": [
c for c in blood_df.columns if "total PBMC" in c
],
# for samples labeled TIL we are also using PBMC
"sample_type:TIL": [
c for c in blood_df.columns if "total PBMC" in c
],
}),
make_expression_groups(
"human_protein_atlas:%s" % os.path.basename(cell_line_filename),
cell_line_df,
groups={
"cell_line:HELA": ['HeLa'],
"cell_line:K562": ["K-562"],
"cell_line:HEK293": ['HEK 293'],
"cell_line:RPMI8226": ['RPMI-8226'],
"cell_line:EXPI293": ['HEK 293'], # EXPI293 derived from HEK293
}),
make_expression_groups(
"human_protein_atlas:%s" % os.path.basename(gtex_filename),
gtex_df,
groups={
"sample_type:LUNG": ["lung"],
"sample_type:SPLEEN": ["spleen"],
}),
]
def make_expression_mixtures(expression_df):
    """
    Build one averaged expression column per entry of CELL_LINE_MIXTURES.

    Each mixture string ("mix:a,b,...") is split into upper-cased component
    names. A component uses the "cell_line:<NAME>" column when
    `expression_df` has one; otherwise it falls back to
    "sample_type:<NAME>-LIKE" and a notice is printed.

    Parameters
    ----------
    expression_df : pandas.DataFrame
        Merged expression dataframe whose columns are looked up by label.

    Returns
    -------
    pandas.DataFrame
        Result of make_expression_groups for the "mixtures" dataset, with
        one "sample_type:MIX:..." column per mixture.

    Raises
    ------
    ValueError
        If any resolved component column is absent from `expression_df`.
    """
    def resolve(name):
        # Prefer the exact cell line; otherwise use its "-LIKE" stand-in.
        exact = "cell_line:%s" % name
        if exact in expression_df.columns:
            return exact
        print("No cell line, falling back on similar: ", name)
        return "sample_type:%s-LIKE" % name

    groups = {
        "sample_type:" + mix.upper(): [
            resolve(name)
            for name in mix.replace("mix:", "").upper().split(",")
        ]
        for mix in CELL_LINE_MIXTURES
    }

    # Gather every unresolved column first so one error reports them all.
    missing = {
        column
        for members in groups.values()
        for column in members
        if column not in expression_df.columns
    }
    if missing:
        raise ValueError(
            "Missing [%d]: %s. Available: %s" % (
                len(missing), missing, expression_df.columns.tolist()))
    return make_expression_groups("mixtures", expression_df, groups)
# Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict.
......@@ -869,36 +931,44 @@ def run():
label,
*[os.path.abspath(f) for f in filenames])
expression_df = None
expression_dfs_for_item = []
handler = None
if label in EXPRESSION_HANDLERS:
handler = EXPRESSION_HANDLERS[label]
expression_df = handler(*filenames)
expression_dfs_for_item = handler(*filenames)
elif args.debug:
debug(*filenames)
else:
raise NotImplementedError(label)
if expression_df is not None:
if expression_dfs_for_item:
print(
"Processed expression data",
label,
"with shape",
expression_df.shape)
print(*expression_df.columns)
expression_dfs.append(expression_df)
"result dataframes",
len(expression_dfs_for_item))
print(*[e.columns for e in expression_dfs_for_item])
expression_dfs.extend(expression_dfs_for_item)
expression_df = expression_dfs[0]
for other in expression_dfs[1:]:
expression_df = pandas.merge(
expression_df, other, how='outer', left_index=True, right_index=True)
expression_df = expression_df.fillna(0)
print(
"Genes in each expression dataframe: ",
print("Genes in each expression dataframe: ",
*[len(e) for e in expression_dfs])
print("Genes in merged expression dataframe", len(expression_df))
if CELL_LINE_MIXTURES:
print("Generating cell line mixtures.")
expression_mixture_df = make_expression_mixtures(expression_df)
expression_df = pandas.merge(
expression_df,
expression_mixture_df,
how='outer',
left_index=True,
right_index=True)
ms_dfs = []
for (i, item_tpl) in enumerate(args.ms_item):
(pmid, filenames) = (item_tpl[0], item_tpl[1:])
......@@ -1026,5 +1096,15 @@ def run():
ms_df.to_csv(args.ms_out, index=False)
print("Wrote: %s" % os.path.abspath(args.ms_out))
if args.expression_metadata_out is not None:
expression_metadata_df = pandas.DataFrame(
EXPRESSION_GROUPS_ROWS,
columns=["expression_dataset", "label", "samples"])
expression_metadata_df["samples"] = expression_metadata_df[
"samples"
].map(json.dumps)
expression_metadata_df.to_csv(args.expression_metadata_out, index=False)
print("Wrote: %s" % os.path.abspath(args.expression_metadata_out))
if __name__ == '__main__':
run()
......@@ -27,7 +27,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/annotate.py .
PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
PEPTIDES=$(mhcflurry-downloads path data_curated)/ms.nontraining_curated.by_pmid.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references)
python annotate.py \
......
......@@ -62,7 +62,7 @@ releases:
default: false
- name: data_curated
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190927.tar.bz2
url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2
default: true
# Older downloads
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment