Skip to content
Snippets Groups Projects
Commit 38c44b56 authored by Tim O'Donnell
Browse files

working on curation

parent ecae1b31
No related merge requests found
......@@ -18,8 +18,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
......@@ -29,6 +29,22 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/curate.py .
cp $SCRIPT_DIR/curate_by_pmid.py .
RAW_DIR="$(mhcflurry-downloads path data_published)/raw"
cp -r "$RAW_DIR" .
# Build one "--item PMID FILE..." group per entry under raw/. A bash array
# (instead of a word-split string built from `ls` output) keeps every path
# a single argument even if it contains whitespace or glob characters.
CURATE_BY_PMID_ARGS=()
for item_path in raw/*
do
    # An unmatched glob stays literal; skip it so an empty raw/ adds nothing.
    [ -e "$item_path" ] || continue
    # The entry name under raw/ is used as the PMID.
    pmid="${item_path##*/}"
    CURATE_BY_PMID_ARGS+=(--item "$pmid" "$item_path"/*)
done
time python curate_by_pmid.py "${CURATE_BY_PMID_ARGS[@]}" --out curated.by_pmid.csv --debug
exit 1
# No mass-spec data
time python curate.py \
......
......@@ -4,6 +4,7 @@ optionally including eluted peptides identified by mass-spec.
"""
import sys
import argparse
import os
import pandas
......@@ -20,14 +21,12 @@ def normalize_allele_name(s):
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"pmid",
metavar="PMID",
help="PMID of dataset to curate")
parser.add_argument(
"files",
"--item",
nargs="+",
metavar="FILE",
help="File paths of data to curate")
action="append",
metavar="PMID FILE, ... FILE",
default=[],
help="Item to curate: PMID and list of files")
parser.add_argument(
"--out",
metavar="OUT.csv",
......@@ -60,7 +59,7 @@ def debug(*filenames):
ipdb.set_trace()
def pmid_27600516(filename):
def handle_pmid_27600516(filename):
df = pandas.read_csv(filename)
sample_to_peptides = {}
......@@ -79,22 +78,108 @@ def pmid_27600516(filename):
rows.append([sample, peptide])
result = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
result["sample_type"] = "melanoma_cell_line"
return result
HANDLERS["27600516"] = pmid_27600516
def handle_pmid_23481700(filename):
    """Curate the peptide list from the spreadsheet for PMID 23481700.

    Parameters
    ----------
    filename : str
        Path to the Excel file for this publication.

    Returns
    -------
    pandas.DataFrame
        One row per peptide with columns ``peptide``, ``sample_id`` and
        ``sample_type``.

    Raises
    ------
    AssertionError
        If the first or last extracted peptide is not the expected one,
        i.e. the file layout differs from what this handler was written for.
    """
    sheet = pandas.read_excel(filename)

    # Peptides are taken from the first column, skipping the first 10 parsed
    # rows.
    peptides = sheet.iloc[10:, 0].values

    # Sanity-check both ends of the list against known values.
    assert peptides[0] == "TPSLVKSTSQL"
    assert peptides[-1] == "LPHSVNSKL"

    return pandas.DataFrame({"peptide": peptides}).assign(
        sample_id="23481700",
        sample_type="B-LCL",
    )
def handle_pmid_24616531(filename):
    """Curate the peptide list from the spreadsheet for PMID 24616531.

    Parameters
    ----------
    filename : str
        Path to the Excel file for this publication; it must contain a sheet
        named "EThcD" with a ``Sequence`` column.

    Returns
    -------
    pandas.DataFrame
        One row per peptide with columns ``peptide``, ``sample_id``,
        ``sample_type``, ``cell_line``, ``pulldown_antibody`` and ``hla``.

    Raises
    ------
    AssertionError
        If the first or last peptide is not the expected one, i.e. the file
        layout differs from what this handler was written for.
    """
    # `sheet_name` is the supported keyword; the older `sheetname` spelling
    # was deprecated and is rejected by current pandas releases.
    df = pandas.read_excel(filename, sheet_name="EThcD")
    peptides = df.Sequence.values

    # Sanity-check both ends of the list against known values.
    assert peptides[0] == "APFLRIAF"
    assert peptides[-1] == "WRQAGLSYIRYSQI"

    result = pandas.DataFrame({
        "peptide": peptides,
    })
    result["sample_id"] = "24616531"
    result["sample_type"] = "B-lymphoblastoid"
    result["cell_line"] = "GR"
    result["pulldown_antibody"] = "W6/32"

    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
    # we are guessing the exact 4 digit alleles based on this.
    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
    return result
def handle_pmid_25576301(filename):
    """Work-in-progress handler for the spreadsheet for PMID 25576301.

    Only the peptide extraction is implemented. The function then stops at an
    interactive ``ipdb`` breakpoint, and the metadata columns attached
    afterwards are placeholders (see notes below).

    Parameters
    ----------
    filename : str
        Path to the Excel file for this publication; it must contain a sheet
        named "Peptides" with a ``Sequence`` column.

    Returns
    -------
    pandas.DataFrame
        One row per peptide with columns ``peptide``, ``sample_id``,
        ``sample_type``, ``cell_line``, ``pulldown_antibody`` and ``hla``.
        The metadata values are NOT valid for this publication yet.

    Raises
    ------
    AssertionError
        If the first or last peptide is not the expected one.
    """
    # `sheet_name` is the supported keyword; the older `sheetname` spelling
    # was deprecated and is rejected by current pandas releases.
    df = pandas.read_excel(filename, sheet_name="Peptides")
    peptides = df.Sequence.values

    # Sanity-check both ends of the list against known values.
    assert peptides[0] == "AAAAAAAQSVY"
    assert peptides[-1] == "YYYNGKAVY"

    # TODO TODO
    # NOTE(review): deliberate development breakpoint. It blocks any
    # non-interactive run and requires ipdb to be installed; remove it once
    # the metadata below is filled in.
    import ipdb; ipdb.set_trace()

    # THIS IS ALL JUNK:
    # NOTE(review): every value below, including the sample_id "24616531" and
    # the HLA comment, is identical to the handler for PMID 24616531 and looks
    # copied from it. Replace with this publication's real sample metadata
    # before trusting the output.
    result = pandas.DataFrame({
        "peptide": peptides,
    })
    result["sample_id"] = "24616531"
    result["sample_type"] = "B-lymphoblastoid"
    result["cell_line"] = "GR"
    result["pulldown_antibody"] = "W6/32"

    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
    # we are guessing the exact 4 digit alleles based on this.
    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
    return result
# Auto-registration: every module-level name of the form handle_pmid_XXXX is
# stored in HANDLERS under the key "XXXX". The namespace is snapshotted with
# list() because the loop itself binds new names (key, value) while running.
for key, value in list(locals().items()):
    if not key.startswith("handle_pmid_"):
        continue
    HANDLERS[key.replace("handle_pmid_", "")] = value
def run():
    """Curate every ``--item`` given on the command line and write one CSV.

    Each ``--item`` is a PMID followed by that dataset's file paths. The PMID
    selects a function from ``HANDLERS``, which is called with the file paths
    and returns a DataFrame. A ``pmid`` column is added to each result, a
    short summary is printed, and all results are concatenated and written to
    ``args.out``.

    If a PMID has no handler and ``--debug`` was given, ``debug`` is called
    on the files and the item contributes no rows.

    Raises
    ------
    NotImplementedError
        If a PMID has no handler and ``--debug`` was not given.
    ValueError
        If no item produced any data, so there is nothing to write.
    """
    args = parser.parse_args(sys.argv[1:])

    dfs = []
    for item_tpl in args.item:
        # First value of each --item group is the PMID, the rest are files.
        (pmid, filenames) = (item_tpl[0], item_tpl[1:])
        print("Processing item", pmid, *[os.path.abspath(f) for f in filenames])

        df = None
        if pmid in HANDLERS:
            df = HANDLERS[pmid](*filenames)
        elif args.debug:
            debug(*filenames)
        else:
            # Report the loop's PMID: the parser no longer defines a `pmid`
            # positional, so `args.pmid` would raise AttributeError here.
            raise NotImplementedError(pmid)

        if df is not None:
            df["pmid"] = pmid
            print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
            print("Counts by sample id:")
            print(df.groupby("sample_id").peptide.nunique())
            print("")
            print("Counts by sample type:")
            print(df.groupby("sample_type").peptide.nunique())
            print("****************************")
            dfs.append(df)

    # pandas.concat raises an opaque ValueError on an empty list; fail with
    # the same exception type but a message that says what went wrong.
    if not dfs:
        raise ValueError(
            "No items were curated; nothing to write to %s" % args.out)

    df = pandas.concat(dfs, ignore_index=True)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % args.out)
......
......@@ -46,7 +46,7 @@ releases:
default: false
- name: data_published
url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
default: false
- name: data_curated
......@@ -109,7 +109,7 @@ releases:
default: false
- name: data_published
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
default: false
- name: data_curated
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment