Skip to content
Snippets Groups Projects
Commit 48d6eed2 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent ab2ca5d0
No related branches found
No related tags found
No related merge requests found
......@@ -29,23 +29,33 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/curate.py .
cp $SCRIPT_DIR/curate_by_pmid.py .
cp $SCRIPT_DIR/curate_ms_by_pmid.py .
RAW_DIR="$(mhcflurry-downloads path data_published)/raw"
cp -r "$RAW_DIR" .
MS_DIR="$(mhcflurry-downloads path data_published)/ms"
cp -r "$MS_DIR" .
EXPRESSION_DIR="$(mhcflurry-downloads path data_published)/expression"
cp -r "$EXPRESSION_DIR" .
CURATE_BY_PMID_ARGS=""
for pmid in $(ls raw)
for pmid in $(ls ms)
do
CURATE_BY_PMID_ARGS+=$(echo --ms-item $pmid ms/$pmid/* ' ')
done
for item in $(ls expression)
do
CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ')
CURATE_BY_PMID_ARGS+=$(echo --expression-item $item expression/$item/* ' ')
done
time python curate_by_pmid.py $CURATE_BY_PMID_ARGS \
--out nontraining_curated.by_pmid.csv
time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \
--ms-out ms.nontraining_curated.by_pmid.csv \
--expression-out rna_expression.csv
bzip2 ms.nontraining_curated.by_pmid.csv
bzip2 rna_expression.csv
bzip2 nontraining_curated.by_pmid.csv
rm -rf raw
rm -rf ms
# No mass-spec data
time python curate.py \
......
......@@ -23,24 +23,35 @@ def normalize_allele_name(s):
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"--item",
"--ms-item",
nargs="+",
action="append",
metavar="PMID FILE, ... FILE",
default=[],
help="Item to curate: PMID and list of files")
help="Mass spec item to curate: PMID and list of files")
parser.add_argument(
"--out",
"--expression-item",
nargs="+",
action="append",
metavar="LABEL FILE, ... FILE",
default=[],
help="Expression data to curate: dataset label and list of files")
parser.add_argument(
"--ms-out",
metavar="OUT.csv",
help="Out file path (MS data)")
parser.add_argument(
"--expression-out",
metavar="OUT.csv",
help="Out file path")
help="Out file path (RNA-seq expression)")
parser.add_argument(
"--debug",
action="store_true",
default=False,
help="Leave user in pdb if PMID is unsupported")
HANDLERS = {}
PMID_HANDLERS = {}
EXPRESSION_HANDLERS = {}
def load(filenames, **kwargs):
result = {}
......@@ -146,7 +157,7 @@ def handle_pmid_24616531(filename):
"peptide": peptides,
})
result_df["sample_id"] = "24616531"
result_df["sample_type"] = "B-lymphoblastoid"
result_df["sample_type"] = "B-LCL"
result_df["cell_line"] = "GR"
result_df["pulldown_antibody"] = "W6/32"
......@@ -181,7 +192,6 @@ def handle_pmid_25576301(filename):
rows.append((row.Sequence, sample))
result_df = pandas.DataFrame(rows, columns=["peptide", "sample_id"])
result_df["cell_line"] = ""
result_df["pulldown_antibody"] = "W6/32"
result_df["mhc_class"] = "I"
result_df["format"] = "multiallelic"
......@@ -209,11 +219,21 @@ def handle_pmid_25576301(filename):
'HCC1143': "basal like breast cancer",
'JY': "B-cell",
}
cell_line = {
'Fib': None,
'HCC1937': "HCC1937",
'SupB15WT': None,
'SupB15RT': None,
'HCT116': "HCT116",
'HCC1143': "HCC1143",
'JY': "JY",
}
result_df["hla"] = result_df.sample_id.map(allele_map)
print("Entries before dropping samples with unknown alleles", len(result_df))
result_df = result_df.loc[~result_df.hla.isnull()]
print("Entries after dropping samples with unknown alleles", len(result_df))
result_df["sample_type"] = result_df.sample_id.map(sample_type)
result_df["cell_line"] = result_df.sample_id.map(cell_line)
print(result_df.head(3))
return result_df
......@@ -266,7 +286,7 @@ def handle_pmid_26992070(*filenames):
"HEK293": "hek",
"HL-60": "neutrophil",
"RPMI8226": "b-cell",
"MAVER-1": "b-lymphoblast",
"MAVER-1": "b-LCL",
"THP-1": "monocyte",
})
result_df["mhc_class"] = "I"
......@@ -343,30 +363,30 @@ def handle_pmid_28832583(*filenames):
CM467 B-cell 28832583 HLA-A*01:01 HLA-A*24:02 HLA-B*13:02 HLA-B*39:06 HLA-C*06:02 HLA-C*12:03
GD149 B-cell 28832583 HLA-A*01:01 HLA-A*24:02 HLA-B*38:01 HLA-B*44:03 HLA-C*06:02 HLA-C*12:03
MD155 B-cell 28832583 HLA-A*02:01 HLA-A*24:02 HLA-B*15:01 HLA-B*18:01 HLA-C*03:03 HLA-C*07:01
PD42 B cell 28832583 HLA-A*02:06 HLA-A*24:02 HLA-B*07:02 HLA-B*55:01 HLA-C*01:02 HLA-C*07:02
RA957 B cell 28832583 HLA-A*02:20 HLA-A*68:01 HLA-B*35:03 HLA-B*39:01 HLA-C*04:01 HLA-C*07:02
PD42 B-cell 28832583 HLA-A*02:06 HLA-A*24:02 HLA-B*07:02 HLA-B*55:01 HLA-C*01:02 HLA-C*07:02
RA957 B-cell 28832583 HLA-A*02:20 HLA-A*68:01 HLA-B*35:03 HLA-B*39:01 HLA-C*04:01 HLA-C*07:02
TIL1 TIL 28832583 HLA-A*02:01 HLA-A*02:01 HLA-B*18:01 HLA-B*38:01 HLA-C*05:01
TIL3 TIL 28832583 HLA-A*01:01 HLA-A*23:01 HLA-B*07:02 HLA-B*15:01 HLA-C*12:03 HLA-C*14:02
Apher1 Leukapheresis 28832583 HLA-A*03:01 HLA-A*29:02 HLA-B*44:02 HLA-B*44:03 HLA-C*12:03 HLA-C*16:01
Apher6 Leukapheresis 28832583 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02
pat_AC2 B lymphoblast 27841757 HLA-A*03:01 HLA-A*32:01 HLA-B*27:05 HLA-B*45:01
pat_C B lymphoblast 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02
pat_CELG B lymphoblast 27841757 HLA-A*02:01 HLA-A*24:02 HLA-B*15:01 HLA-B*73:01 HLA-C*03:03 HLA-C*15:05
pat_CP2 B lymphoblast 27841757 HLA-A*11:01 HLA-B*14:02 HLA-B*44:02
pat_FL B lymphoblast 27841757 HLA-A*03:01 HLA-A*11:01 HLA-B*44:03 HLA-B*50:01
pat_J B lymphoblast 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02
pat_JPB3 B lymphoblast 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*27:05 HLA-B*56:01
pat_JT2 B lymphoblast 27841757 HLA-A*11:01 HLA-B*18:03 HLA-B*35:01
pat_M B lymphoblast 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01
pat_MA B lymphoblast 27841757 HLA-A*02:01 HLA-A*29:02 HLA-B*44:03 HLA-B*57:01 HLA-C*07:01 HLA-C*16:01
pat_ML B lymphoblast 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*40:01 HLA-B*44:03
pat_NS2 B lymphoblast 27841757 HLA-A*02:01 HLA-B*13:02 HLA-B*41:01
pat_NT B lymphoblast 27841757 HLA-A*01:01 HLA-A*32:01 HLA-B*08:01
pat_PF1 B lymphoblast 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*07:02 HLA-B*44:03 HLA-C*07:02 HLA-C*16:01
pat_R B lymphoblast 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01
pat_RT B lymphoblast 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*18:01 HLA-B*39:24 HLA-C*05:01 HLA-C*07:01
pat_SR B lymphoblast 27841757 HLA-A*02:01 HLA-A*23:01 HLA-B*18:01 HLA-B*44:03
pat_ST B lymphoblast 27841757 HLA-A*03:01 HLA-A*24:02 HLA-B*07:02 HLA-B*27:05
pat_AC2 B-LCL 27841757 HLA-A*03:01 HLA-A*32:01 HLA-B*27:05 HLA-B*45:01
pat_C B-LCL 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02
pat_CELG B-LCL 27841757 HLA-A*02:01 HLA-A*24:02 HLA-B*15:01 HLA-B*73:01 HLA-C*03:03 HLA-C*15:05
pat_CP2 B-LCL 27841757 HLA-A*11:01 HLA-B*14:02 HLA-B*44:02
pat_FL B-LCL 27841757 HLA-A*03:01 HLA-A*11:01 HLA-B*44:03 HLA-B*50:01
pat_J B-LCL 27841757 HLA-A*02:01 HLA-A*03:01 HLA-B*07:02 HLA-C*07:02
pat_JPB3 B-LCL 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*27:05 HLA-B*56:01
pat_JT2 B-LCL 27841757 HLA-A*11:01 HLA-B*18:03 HLA-B*35:01
pat_M B-LCL 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01
pat_MA B-LCL 27841757 HLA-A*02:01 HLA-A*29:02 HLA-B*44:03 HLA-B*57:01 HLA-C*07:01 HLA-C*16:01
pat_ML B-LCL 27841757 HLA-A*02:01 HLA-A*11:01 HLA-B*40:01 HLA-B*44:03
pat_NS2 B-LCL 27841757 HLA-A*02:01 HLA-B*13:02 HLA-B*41:01
pat_NT B-LCL 27841757 HLA-A*01:01 HLA-A*32:01 HLA-B*08:01
pat_PF1 B-LCL 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*07:02 HLA-B*44:03 HLA-C*07:02 HLA-C*16:01
pat_R B-LCL 27841757 HLA-A*03:01 HLA-A*29:02 HLA-B*08:01 HLA-B*44:03 HLA-C*07:01 HLA-C*16:01
pat_RT B-LCL 27841757 HLA-A*01:01 HLA-A*02:01 HLA-B*18:01 HLA-B*39:24 HLA-C*05:01 HLA-C*07:01
pat_SR B-LCL 27841757 HLA-A*02:01 HLA-A*23:01 HLA-B*18:01 HLA-B*44:03
pat_ST B-LCL 27841757 HLA-A*03:01 HLA-A*24:02 HLA-B*07:02 HLA-B*27:05
"""
info_df = pandas.read_csv(StringIO(info_text), sep="\t", index_col=0)
info_df.index = info_df.index.str.strip()
......@@ -613,59 +633,383 @@ def handle_pmid_31495665(filename):
return result_df
# Add all functions with names like handle_pmid_XXXX to HANDLERS dict.
def handle_pmid_27869121(filename):
"""Bassani-Sternberg, ..., Krackhardt Nature Comm. 2016 [PMID 27869121]"""
# Although this dataset has class II data also, we are only extracting
# class I for now.
df = pandas.read_excel(filename, skiprows=1)
# Taking these from:
# Supplementary Table 2: Information of patients selected for neoepitope
# identification
# For the Mel5 sample, only two-digit alleles are shown (A*01, A*25,
# B*08, B*18) so we are skipping that sample for now.
hla_df = pandas.DataFrame([
("Mel-8", "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*08:01 HLA-C*07:01 HLA-C*07:02"),
("Mel-12", "HLA-A*01:01 HLA-B*08:01 HLA-C*07:01"),
("Mel-15", "HLA-A*03:01 HLA-A*68:01 HLA-B*27:05 HLA-B*35:03 HLA-C*02:02 HLA-C*04:01"),
("Mel-16", "HLA-A*01:01 HLA-A*24:02 HLA-B*07:02 HLA-B*08:01 HLA-C*07:01 HLA-C*07:02"),
], columns=["sample_id", "hla"]).set_index("sample_id")
# We assert below that none of the class I hit peptides were found in any
# of the class II pull downs.
class_ii_cols = [
c for c in df.columns if c.endswith("HLA-II (arbitrary units)")
]
class_ii_hits = set(df.loc[
(df[class_ii_cols].fillna(0.0).sum(1) > 0)
].Sequence.unique())
results = []
for (sample_id, hla) in hla_df.hla.items():
intensity_col = "Intensity %s_HLA-I (arbitrary units)" % sample_id
sub_df = df.loc[
(df[intensity_col].fillna(0.0) > 0)
]
filtered_sub_df = sub_df.loc[
(~sub_df.Sequence.isin(class_ii_hits))
]
peptides = filtered_sub_df.Sequence.unique()
assert not any(p in class_ii_hits for p in peptides)
result_df = pandas.DataFrame({
"peptide": peptides,
})
result_df["sample_id"] = sample_id
result_df["hla"] = hla_df.loc[sample_id, "hla"]
result_df["pulldown_antibody"] = "W6/32"
result_df["format"] = "multiallelic"
result_df["mhc_class"] = "I"
result_df["sample_type"] = "melanoma_met"
result_df["cell_line"] = None
results.append(result_df)
result_df = pandas.concat(results, ignore_index=True)
return result_df
def handle_pmid_31154438(*filenames):
"""Shraibman, ..., Admon Mol Cell Proteomics 2019"""
# Note: this publication also includes analyses of the secreted HLA
# peptidedome (sHLA) but we are using only the data from membrane-bound
# HLA.
(xls, txt) = sorted(filenames, key=lambda s: not s.endswith(".xlsx"))
info = pandas.read_excel(xls, skiprows=1)
df = pandas.read_csv(txt, sep="\t", skiprows=1)
hla_df = info.loc[
~info["mHLA tissue sample"].isnull()
].set_index("mHLA tissue sample")[["HLA typing"]]
def fix_hla(string):
result = []
alleles = string.split(";")
for a in alleles:
a = a.strip()
if "/" in a:
(a1, a2) = a.split("/")
a2 = a1[:2] + a2
lst = [a1, a2]
else:
lst = [a]
for a in lst:
normalized = normalize_allele_name(a)
# Ignore class II
if normalized[4] in ("A", "B", "C"):
result.append(normalized)
return " ".join(result)
hla_df["hla"] = hla_df["HLA typing"].map(fix_hla)
results = []
for (sample_id, hla) in hla_df.hla.items():
intensity_col = "Intensity %s" % sample_id
sub_df = df.loc[
(df[intensity_col].fillna(0.0) > 0)
]
peptides = sub_df.Sequence.unique()
result_df = pandas.DataFrame({
"peptide": peptides,
})
result_df["sample_id"] = sample_id
result_df["hla"] = hla_df.loc[sample_id, "hla"]
result_df["pulldown_antibody"] = "W6/32"
result_df["format"] = "multiallelic"
result_df["mhc_class"] = "I"
result_df["sample_type"] = "glioblastoma_tissue"
result_df["cell_line"] = None
results.append(result_df)
result_df = pandas.concat(results, ignore_index=True)
return result_df
def expression_groups(dataset_identifier, df, groups):
result_df = pandas.DataFrame(index=df.index)
for (label, columns) in groups.items():
result_df[label] = df[columns].mean(1)
return result_df
def handle_expression_GSE113126(*filenames):
"""
Barry, ..., Krummel Nature Medicine 2018 [PMID 29942093]
This is the melanoma met RNA-seq dataset.
"""
df = pandas.read_csv(filenames[0], sep="\t", index_col=0)
df = df[[]] # no columns
for filename in filenames:
df[os.path.basename(filename)] = pandas.read_csv(
filename, sep="\t", index_col=0)["TPM"]
assert len(df.columns) == len(filenames)
groups = {
"sample_type:MELANOMA_MET": df.columns.tolist(),
}
return expression_groups("GSE113126", df, groups)
def handle_expression_expression_atlas_22460905(filename):
df = pandas.read_csv(filename, sep="\t", skiprows=4, index_col=0)
del df["Gene Name"]
df.columns = df.columns.str.lower()
df = df.fillna(0.0)
def matches(*strings):
return [c for c in df.columns if all(s in c for s in strings)]
import ipdb ; ipdb.set_trace()
groups = {
"sample_type:B-LCL": (
matches("b-cell", "lymphoblast") + matches("b acute lymphoblastic")),
"sample_type:B-CELL": matches("b-cell"),
"sample_type:MELANOMA_CELL_LINE": matches("melanoma"),
# For GBM tissue we are just using a mixture of cell lines.
"sample_type:GLIOBLASTOMA_TISSUE": matches("glioblastoma"),
"cell_line:THP-1": ["childhood acute monocytic leukemia, thp-1"],
"cell_line:HL-60": ["adult acute myeloid leukemia, hl-60"],
"cell_line:U-87": ['glioblastoma, u-87 mg'],
"cell_line:LNT-229": ['glioblastoma, ln-229'],
"cell_line:T98G": ['glioblastoma, t98g'],
"cell_line:SK-MEL-5": ['cutaneous melanoma, sk-mel-5'],
'cell_line:MEWO': ['melanoma, mewo'],
"cell_line:HCC1937": ['breast ductal adenocarcinoma, hcc1937'],
"cell_line:HCT116": ['colon carcinoma, hct 116'],
"cell_line:HCC1143": ['breast ductal adenocarcinoma, hcc1143'],
}
return expression_groups("expression_atlas_22460905", df, groups)
def handle_expression_human_protein_atlas(*filenames):
(cell_line_filename,) = [f for f in filenames if "celline" in f]
(blood_filename,) = [f for f in filenames if "blood" in f]
(gtex_filename,) = [f for f in filenames if "gtex" in f]
cell_line_df = pandas.read_csv(cell_line_filename, sep="\t")
blood_df = pandas.read_csv(blood_filename, sep="\t", index_col=0)
gtex_df = pandas.read_csv(gtex_filename, sep="\t")
cell_line_df = cell_line_df.pivot(
index="Gene", columns="Cell line", values="TPM")
gtex_df = gtex_df.pivot(
index="Gene", columns="Tissue", values="TPM")
result_df = pandas.DataFrame(index=cell_line_df.index)
result_df["sample_type:pbmc"] = blood_df[
[c for c in blood_df.columns if "total PBMC" in c]
].mean(1)
result_df["cell_line:HEK293"] = cell_line_df['HEK 293']
result_df["cell_line:RPMI8226"] = cell_line_df['RPMI-8226']
# EXPI293 is based off HEK293
result_df["cell_line:EXPI293"] = cell_line_df['HEK 293']
# For leukapheresis we use pbmc sample
result_df["sample_type:leukapheresis"] = result_df["sample_type:pbmc"]
for tissue in ["lung", "spleen"]:
result_df["sample_type:%s" % tissue.upper()] = gtex_df[tissue]
return result_df
# Add all functions with names like handle_pmid_XXXX to PMID_HANDLERS dict.
for (key, value) in list(locals().items()):
if key.startswith("handle_pmid_"):
HANDLERS[key.replace("handle_pmid_", "")] = value
PMID_HANDLERS[key.replace("handle_pmid_", "")] = value
elif key.startswith("handle_expression_"):
EXPRESSION_HANDLERS[key.replace("handle_expression_", "")] = value
def run():
args = parser.parse_args(sys.argv[1:])
dfs = []
for (i, item_tpl) in enumerate(args.item):
expression_dfs = []
for (i, item_tpl) in enumerate(args.expression_item):
(label, filenames) = (item_tpl[0], item_tpl[1:])
label = label.replace("-", "_")
print(
"Processing expression item %d of %d" % (i + 1, len(args.expression_item)),
label,
*[os.path.abspath(f) for f in filenames])
expression_df = None
handler = None
if label in EXPRESSION_HANDLERS:
handler = EXPRESSION_HANDLERS[label]
expression_df = handler(*filenames)
elif args.debug:
debug(*filenames)
else:
raise NotImplementedError(label)
if expression_df is not None:
print(
"Processed expression data",
label,
"with shape",
expression_df.shape)
print(*expression_df.columns)
expression_dfs.append(expression_df)
expression_df = expression_dfs[0]
for other in expression_dfs[1:]:
expression_df = pandas.merge(
expression_df, other, how='outer', left_index=True, right_index=True)
expression_df = expression_df.fillna(0)
print(
"Genes in each expression dataframe: ",
*[len(e) for e in expression_dfs])
print("Genes in merged expression dataframe", len(expression_df))
ms_dfs = []
for (i, item_tpl) in enumerate(args.ms_item):
(pmid, filenames) = (item_tpl[0], item_tpl[1:])
print(
"Processing item %d / %d" % (i + 1, len(args.item)),
"Processing MS item %d of %d" % (i + 1, len(args.ms_item)),
pmid,
*[os.path.abspath(f) for f in filenames])
df = None
ms_df = None
handler = None
if pmid in HANDLERS:
handler = HANDLERS[pmid]
df = handler(*filenames)
if pmid in PMID_HANDLERS:
handler = PMID_HANDLERS[pmid]
ms_df = handler(*filenames)
elif args.debug:
debug(*filenames)
else:
raise NotImplementedError(args.pmid)
if df is not None:
df["pmid"] = pmid
if "original_pmid" not in df.columns:
df["original_pmid"] = pmid
df = df.applymap(str).applymap(str.upper)
print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
raise NotImplementedError(pmid)
if ms_df is not None:
ms_df["pmid"] = pmid
if "original_pmid" not in ms_df.columns:
ms_df["original_pmid"] = pmid
if "expression_dataset" not in ms_df.columns:
ms_df["expression_dataset"] = ""
ms_df = ms_df.applymap(str).applymap(str.upper)
ms_df["sample_id"] = ms_df.sample_id.str.replace(" ", "")
print("*** PMID %s: %d peptides ***" % (pmid, len(ms_df)))
if handler is not None:
print(handler.__doc__)
print("Counts by sample id:")
print(df.groupby("sample_id").peptide.nunique())
print(ms_df.groupby("sample_id").peptide.nunique())
print("")
print("Counts by sample type:")
print(df.groupby("sample_type").peptide.nunique())
print(ms_df.groupby("sample_type").peptide.nunique())
print("****************************")
dfs.append(df)
for value in ms_df.expression_dataset.unique():
if value and value not in expression_df.columns:
raise ValueError("No such expression dataset", value)
df = pandas.concat(dfs, ignore_index=True, sort=False)
ms_dfs.append(ms_df)
df["cell_line"] = df["cell_line"].fillna("")
ms_df = pandas.concat(ms_dfs, ignore_index=True, sort=False)
ms_df["cell_line"] = ms_df["cell_line"].fillna("")
cols = ["pmid", "sample_id", "peptide", "format", "mhc_class", "hla", ]
cols += [c for c in sorted(df.columns) if c not in cols]
df = df[cols]
sample_table = ms_df[
["sample_id", "pmid", "expression_dataset", "cell_line", "sample_type"]
].drop_duplicates().set_index("sample_id")
null_df = df.loc[df.isnull().any(1)]
sample_id_to_expression_dataset = sample_table.expression_dataset.to_dict()
for (sample_id, value) in sorted(sample_id_to_expression_dataset.items()):
if value:
print("Expression dataset for sample", sample_id, "already assigned")
continue
cell_line_col = "cell_line:" + sample_table.loc[sample_id, "cell_line"]
sample_type_col = "sample_type:" + (
sample_table.loc[sample_id, "sample_type"])
expression_dataset = None
for col in [cell_line_col, sample_type_col]:
if col in expression_df.columns:
expression_dataset = col
break
if not expression_dataset:
print("*" * 20)
print("No expression dataset for sample ", sample_id)
print("Sample info:")
print(sample_table.loc[sample_id])
print("*" * 20)
sample_id_to_expression_dataset[sample_id] = expression_dataset
print(
"Sample", sample_id, "assigned exp. dataset", expression_dataset)
print("Expression dataset usage:")
print(pandas.Series(sample_id_to_expression_dataset).value_counts())
missing = [
key for (key, value) in
sample_id_to_expression_dataset.items()
if value is None
]
if missing:
print("Missing expression data for samples", *missing)
print(
"Missing cell lines: ",
*sample_table.loc[missing, "cell_line"].dropna().drop_duplicates().tolist())
print("Missing sample types: ", *sample_table.loc[
missing, "sample_type"].dropna().drop_duplicates().tolist())
if args.debug:
import ipdb; ipdb.set_trace()
else:
raise ValueError("Missing expression data for samples: ", missing)
ms_df["expression_dataset"] = ms_df.sample_id.map(
sample_id_to_expression_dataset)
cols = [
"pmid",
"sample_id",
"peptide",
"format",
"mhc_class",
"hla",
"expression_dataset",
]
cols += [c for c in sorted(ms_df.columns) if c not in cols]
ms_df = ms_df[cols]
null_df = ms_df.loc[ms_df.isnull().any(1)]
if len(null_df) > 0:
print("Nulls:")
print(null_df)
......@@ -673,11 +1017,14 @@ def run():
print("No nulls.")
# Each sample should be coming from only one experiment.
assert df.groupby("sample_id").pmid.nunique().max() == 1, (
df.groupby("sample_id").pmid.nunique().sort_values())
assert ms_df.groupby("sample_id").pmid.nunique().max() == 1, (
ms_df.groupby("sample_id").pmid.nunique().sort_values())
expression_df.to_csv(args.expression_out, index=True)
print("Wrote: %s" % os.path.abspath(args.expression_out))
df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out))
ms_df.to_csv(args.ms_out, index=False)
print("Wrote: %s" % os.path.abspath(args.ms_out))
if __name__ == '__main__':
run()
......@@ -33,7 +33,7 @@ wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.200
wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt
wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt
mkdir raw
mkdir ms
############################################
# MS: Multiallelic class I
......@@ -43,10 +43,10 @@ mkdir raw
# Pearson, ..., Perreault JCI 2016 [PMID 27841757]
# but was reanalyzed in this work, and we download the reanalyzed version here.
PMID=28832583
mkdir -p raw/$PMID
wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P raw/$PMID # data generated in this work
wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P raw/$PMID # data reanalyzed in this work
cd raw/$PMID
mkdir -p ms/$PMID
wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P ms/$PMID # data generated in this work
wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P ms/$PMID # data reanalyzed in this work
cd ms/$PMID
unzip *.s002
unzip *.s003
mkdir saved
......@@ -58,33 +58,33 @@ cd ../..
# Bassani-Sternberg, ..., Mann Mol Cell Proteomics 2015 [PMID 25576301]
PMID=25576301
mkdir -p raw/$PMID
wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P raw/$PMID
mkdir -p ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P ms/$PMID
# Mommen, ..., Heck PNAS 2014 [PMID 24616531]
PMID=24616531
mkdir -p raw/$PMID
wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P raw/$PMID
mkdir -p ms/$PMID
wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P ms/$PMID
# Gloger, ..., Neri Cancer Immunol Immunother 2016 [PMID 27600516]
# Data extracted from supplemental PDF table.
PMID=27600516
mkdir -p raw/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID
mkdir -p ms/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P ms/$PMID
# Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]
# Supplemental zip downloaded from publication
PMID=26992070
mkdir -p raw/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID
cd raw/$PMID
mkdir -p ms/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P ms/$PMID
cd ms/$PMID
unzip pmic12297-sup-0001-supinfo.zip
cd ../..
# Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690]
PMID=27412690
mkdir -p raw/$PMID
wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID
mkdir -p ms/$PMID
wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P ms/$PMID
# Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
# Note: we do not use the original data from this publicaton, we use 28832583's reanalysis of it.
......@@ -92,17 +92,59 @@ wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1
# Hassan, ..., van Veelen Mol Cell Proteomics 2015 [PMID 23481700]
PMID=23481700
mkdir -p raw/$PMID
wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P raw/$PMID
mkdir -p ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P ms/$PMID
# Shraibman, ..., Admon Mol Cell Proteomics 2019 [PMID 31154438]
PMID=31154438
mkdir -p ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/3/zjw006195963st2.txt -P ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/1/zjw006195963st1.xlsx -P ms/$PMID
# Bassani-Sternberg, ..., Krackhardt Nature Comm. 2016 [PMID 27869121]
PMID=27869121
mkdir -p ms/$PMID
wget -q "https://static-content.springer.com/esm/art%3A10.1038%2Fncomms13404/MediaObjects/41467_2016_BFncomms13404_MOESM1318_ESM.xlsx" -P ms/$PMID
############################################
# MS: Monoallelic class II
############################################
# Abelin, ..., Rooney Immunity 2019 [PMID 31495665]
PMID=31495665
mkdir -p raw/$PMID
wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P raw/$PMID
mkdir -p ms/$PMID
wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P ms/$PMID
############################################
# RNA-seq expression data (TPMs)
############################################
# CCLE as processed by expression atlas
DATASET=expression-atlas-22460905
mkdir -p expression/$DATASET
wget -q https://www.ebi.ac.uk/gxa/experiments-content/E-MTAB-2770/resources/ExperimentDownloadSupplier.RnaSeqBaseline/tpms.tsv -P expression/$DATASET
# Human protein atlas
DATASET=human-protein-atlas
mkdir -p expression/$DATASET
cd expression/$DATASET
wget -q https://www.proteinatlas.org/download/rna_celline.tsv.zip
wget -q https://www.proteinatlas.org/download/rna_blood_cell_sample_tpm_m.tsv.zip
wget -q https://www.proteinatlas.org/download/rna_tissue_gtex.tsv.zip
for i in $(ls *.zip)
do
unzip $i
rm $i
done
cd ../..
# Melanoma. Original publication
# Barry, ..., Krummel Nature Medicine 2018 [PMID 29942093].
DATASET=GSE113126
mkdir -p expression/$DATASET
cd expression/$DATASET
wget -q "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE113126&format=file" -O GSE113126_RAW.tar
tar -xvf GSE113126_RAW.tar
rm GSE113126_RAW.tar
cd ../..
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
......
......@@ -62,15 +62,15 @@ if [ "$2" != "continue-incomplete" ]
then
cp $SCRIPT_DIR/generate_hyperparameters.production.py .
cp $SCRIPT_DIR/generate_hyperparameters.py .
python generate_hyperparameters.production.py > hyperparameters.production.json
python generate_hyperparameters.py hyperparameters.production.json no_pretrain > hyperparameters.no_pretrain.yaml
python generate_hyperparameters.production.py > hyperparameters.production.yaml
python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
fi
for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
do
CONTINUE_INCOMPLETE_ARGS=""
if [ "$2" == "continue-incomplete" ] && [ -d "models.${kind}" ]
if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ]
then
echo "Will continue existing run: $kind"
CONTINUE_INCOMPLETE_ARGS="--continue-incomplete"
......
......@@ -37,10 +37,6 @@ releases:
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2
default: false
- name: data_expression
url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_expression.20191009.tar.bz2
default: false
- name: data_references
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
default: false
......@@ -62,7 +58,7 @@ releases:
default: false
- name: data_published
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190924.tar.bz2
url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_published.20191011.tar.bz2
default: false
- name: data_curated
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment