Skip to content
Snippets Groups Projects
Commit 38c44b56 authored by Tim O'Donnell
Browse files

working on curation

parent ecae1b31
No related branches found
No related tags found
No related merge requests found
......@@ -18,8 +18,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
......@@ -29,6 +29,22 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/curate.py .
cp $SCRIPT_DIR/curate_by_pmid.py .
RAW_DIR="$(mhcflurry-downloads path data_published)/raw"
cp -r "$RAW_DIR" .
CURATE_BY_PMID_ARGS=""
for pmid in $(ls raw)
do
CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ')
done
time python curate_by_pmid.py $CURATE_BY_PMID_ARGS --out curated.by_pmid.csv --debug
exit 1
# No mass-spec data
time python curate.py \
......
......@@ -4,6 +4,7 @@ optionally including eluted peptides identified by mass-spec.
"""
import sys
import argparse
import os
import pandas
......@@ -20,14 +21,12 @@ def normalize_allele_name(s):
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"pmid",
metavar="PMID",
help="PMID of dataset to curate")
parser.add_argument(
"files",
"--item",
nargs="+",
metavar="FILE",
help="File paths of data to curate")
action="append",
metavar="PMID FILE, ... FILE",
default=[],
help="Item to curate: PMID and list of files")
parser.add_argument(
"--out",
metavar="OUT.csv",
......@@ -60,7 +59,7 @@ def debug(*filenames):
ipdb.set_trace()
def pmid_27600516(filename):
def handle_pmid_27600516(filename):
df = pandas.read_csv(filename)
sample_to_peptides = {}
......@@ -79,22 +78,108 @@ def pmid_27600516(filename):
rows.append([sample, peptide])
result = pandas.DataFrame(rows, columns=["sample_id", "peptide"])
result["sample_type"] = "melanoma_cell_line"
return result
HANDLERS["27600516"] = pmid_27600516
def handle_pmid_23481700(filename):
    """
    Curate the peptide list for PMID 23481700.

    `filename` is read as an Excel workbook. Peptides are taken from the
    first column, starting at row offset 10 of the parsed sheet. The first
    and last peptides are checked against known sequences so that a changed
    or mis-parsed file fails loudly.

    Returns a DataFrame with columns: peptide, sample_id, sample_type.
    """
    sheet = pandas.read_excel(filename)
    first_column = sheet.iloc[10:, 0]
    peptides = first_column.values

    # Sanity check the ends of the list against the expected sequences.
    assert peptides[0] == "TPSLVKSTSQL"
    assert peptides[-1] == "LPHSVNSKL"

    return pandas.DataFrame({"peptide": peptides}).assign(
        sample_id="23481700",
        sample_type="B-LCL")
def handle_pmid_24616531(filename):
    """
    Curate the peptide list for PMID 24616531.

    `filename` is read as an Excel workbook; peptides come from the
    "Sequence" column of the "EThcD" sheet. The first and last peptides are
    checked against known sequences so that a changed or mis-parsed file
    fails loudly.

    Returns a DataFrame with columns: peptide, sample_id, sample_type,
    cell_line, pulldown_antibody, hla.
    """
    # `sheet_name` is the supported keyword; the older `sheetname` spelling
    # was deprecated and then removed from pandas.read_excel.
    df = pandas.read_excel(filename, sheet_name="EThcD")
    peptides = df.Sequence.values
    assert peptides[0] == "APFLRIAF"
    assert peptides[-1] == "WRQAGLSYIRYSQI"

    result = pandas.DataFrame({
        "peptide": peptides,
    })
    result["sample_id"] = "24616531"
    result["sample_type"] = "B-lymphoblastoid"
    result["cell_line"] = "GR"
    result["pulldown_antibody"] = "W6/32"

    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
    # we are guessing the exact 4 digit alleles based on this.
    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
    return result
def handle_pmid_25576301(filename):
    """
    Work-in-progress handler for PMID 25576301.

    `filename` is read as an Excel workbook; peptides come from the
    "Sequence" column of the "Peptides" sheet, and the first and last
    peptides are checked against known sequences.

    WARNING: this handler is unfinished. It stops in an interactive ipdb
    breakpoint, and the metadata assigned after the breakpoint is placeholder
    text copied from the PMID 24616531 handler (including that handler's
    sample_id). Do not rely on the returned annotations until the TODO below
    is resolved.

    Returns a DataFrame with columns: peptide, sample_id, sample_type,
    cell_line, pulldown_antibody, hla.
    """
    # `sheet_name` is the supported keyword; the older `sheetname` spelling
    # was deprecated and then removed from pandas.read_excel.
    df = pandas.read_excel(filename, sheet_name="Peptides")
    peptides = df.Sequence.values
    assert peptides[0] == "AAAAAAAQSVY"
    assert peptides[-1] == "YYYNGKAVY"

    # TODO: determine the real per-sample metadata for this publication.
    # The breakpoint is intentional while the handler is being developed.
    import ipdb ; ipdb.set_trace()

    # THIS IS ALL JUNK: placeholder values copied from handle_pmid_24616531.
    result = pandas.DataFrame({
        "peptide": peptides,
    })
    result["sample_id"] = "24616531"
    result["sample_type"] = "B-lymphoblastoid"
    result["cell_line"] = "GR"
    result["pulldown_antibody"] = "W6/32"

    # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07"
    # we are guessing the exact 4 digit alleles based on this.
    result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01"
    return result
# Hack: register every module-level function named handle_pmid_XXXX in the
# HANDLERS dict under the key "XXXX". The namespace is snapshotted with
# list() before anything is added to HANDLERS.
HANDLERS.update(
    (name.replace("handle_pmid_", ""), func)
    for (name, func) in list(globals().items())
    if name.startswith("handle_pmid_"))
def run():
    """
    Command-line entry point: curate every --item and write one combined CSV.

    Each --item value is a list whose first element is a PMID and whose
    remaining elements are the data files for that PMID. For each item:

    * if a handler is registered for the PMID in HANDLERS, it is called with
      the files and its DataFrame is tagged with a "pmid" column, summarized
      on stdout, and collected;
    * otherwise, if --debug was given, the files are passed to debug() and
      the item contributes no rows;
    * otherwise NotImplementedError is raised with the offending PMID.

    The collected DataFrames are concatenated and written to the path given
    by --out.

    Raises
    ------
    NotImplementedError
        If an item's PMID has no handler and --debug was not given.
    ValueError
        If no item produced any data, so there is nothing to write.
    """
    args = parser.parse_args(sys.argv[1:])

    dfs = []
    for item_tpl in args.item:
        (pmid, filenames) = (item_tpl[0], item_tpl[1:])
        print(
            "Processing item", pmid, *[os.path.abspath(f) for f in filenames])

        df = None
        if pmid in HANDLERS:
            df = HANDLERS[pmid](*filenames)
        elif args.debug:
            debug(*filenames)
        else:
            # Report the PMID of this item; the parser has no positional
            # "pmid" argument, so args.pmid would raise AttributeError.
            raise NotImplementedError(pmid)

        if df is not None:
            df["pmid"] = pmid
            print("*** PMID %s: %d peptides ***" % (pmid, len(df)))
            print("Counts by sample id:")
            print(df.groupby("sample_id").peptide.nunique())
            print("")
            print("Counts by sample type:")
            print(df.groupby("sample_type").peptide.nunique())
            print("****************************")
            dfs.append(df)

    if not dfs:
        # pandas.concat refuses an empty list; fail with a clearer message.
        raise ValueError(
            "No data was curated: no --item produced a result")

    df = pandas.concat(dfs, ignore_index=True)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % args.out)
......
......@@ -46,7 +46,7 @@ releases:
default: false
- name: data_published
url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
default: false
- name: data_curated
......@@ -109,7 +109,7 @@ releases:
default: false
- name: data_published
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190920.tar.bz2
url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
default: false
- name: data_curated
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment