Skip to content
Snippets Groups Projects
Commit a9931e96 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

curating some multiallelic ms

parent 53084003
No related merge requests found
"""
Filter and combine various peptide/MHC datasets to derive a composite training set,
optionally including eluted peptides identified by mass-spec.
"""
import sys
import argparse
import pandas
import mhcnames
def normalize_allele_name(s):
    """
    Return the mhcnames-normalized form of allele name `s`.

    This is a best-effort wrapper: if normalization raises any exception,
    the string "UNKNOWN" is returned instead of propagating the error.
    """
    try:
        normalized = mhcnames.normalize_allele_name(s)
    except Exception:
        # Deliberately broad: an unparseable name maps to a sentinel value.
        return "UNKNOWN"
    return normalized
# Command-line interface. The module docstring is used as the usage text.
parser = argparse.ArgumentParser(usage=__doc__)

# Positional arguments: which publication to curate, and its raw data files.
parser.add_argument(
    "pmid",
    help="PMID of dataset to curate",
    metavar="PMID")
parser.add_argument(
    "files",
    help="File paths of data to curate",
    metavar="FILE",
    nargs="+")

# Optional arguments.
parser.add_argument(
    "--out",
    help="Out file path",
    metavar="OUT.csv")
parser.add_argument(
    "--debug",
    help="Leave user in pdb if PMID is unsupported",
    action="store_true")  # store_true already defaults to False

# Registry mapping a PMID string to the function that curates that dataset.
HANDLERS = {}
def load(filenames, **kwargs):
    """
    Read each of the given files and return a dict keyed by filename.

    Files ending in ".csv" are parsed with pandas.read_csv and files ending
    in ".xlsx" or ".xls" with pandas.read_excel; `kwargs` are forwarded to
    whichever reader is used. Any other filename is mapped to itself
    (the path string) without being opened.
    """
    excel_suffixes = (".xlsx", ".xls")
    result = {}
    for path in filenames:
        if path.endswith(".csv"):
            parsed = pandas.read_csv(path, **kwargs)
        elif path.endswith(excel_suffixes):
            parsed = pandas.read_excel(path, **kwargs)
        else:
            # Unrecognized extension: keep the path itself as a placeholder.
            parsed = path
        result[path] = parsed
    return result
def debug(*filenames):
    """
    Load the given files and stop in an interactive ipdb session.

    The result of load() is bound to the local name `loaded`, which is not
    used afterwards; presumably it is there to be inspected from the
    debugger prompt.
    """
    loaded = load(filenames)
    # Imported lazily so ipdb is only needed when debugging is requested.
    from ipdb import set_trace
    set_trace()
def pmid_27600516(filename):
    """
    Curate the peptide list for PMID 27600516 from a CSV file.

    The CSV must have a "peptide" column. A cell starting with "#" opens a
    new sample whose id is the rest of that cell; every following cell is a
    peptide belonging to that sample, and is stripped of surrounding
    whitespace and upper-cased. A repeated sample header starts that sample's
    list afresh.

    Returns a DataFrame with columns "sample_id" and "peptide" holding one
    row per distinct peptide per sample, peptides sorted within each sample.
    """
    table = pandas.read_csv(filename)

    grouped = {}
    active_sample = None
    for entry in table.peptide:
        if not entry.startswith("#"):
            # A peptide must be preceded by some "#sample" header row.
            assert active_sample is not None
            grouped[active_sample].append(entry.strip().upper())
            continue
        active_sample = entry[1:]
        grouped[active_sample] = []

    # De-duplicate and sort the peptides within each sample.
    records = [
        [sample_id, sequence]
        for (sample_id, sequences) in grouped.items()
        for sequence in sorted(set(sequences))
    ]
    return pandas.DataFrame(records, columns=["sample_id", "peptide"])
# Register the curation function for this publication.
HANDLERS["27600516"] = pmid_27600516


def run():
    """
    Command-line entry point: curate one dataset and write it as CSV.

    Parses sys.argv, looks up the handler registered for the given PMID and
    calls it with the input file paths. The resulting DataFrame is written
    to the --out path without its index.

    If no handler is registered for the PMID:
      * with --debug, the files are loaded and the user is dropped into the
        debugger; nothing is written afterwards;
      * otherwise NotImplementedError is raised with the PMID.
    """
    args = parser.parse_args(sys.argv[1:])

    handler = HANDLERS.get(args.pmid)
    if handler is None:
        if not args.debug:
            raise NotImplementedError(args.pmid)
        debug(*args.files)
        # No DataFrame was produced on this path, so there is nothing to
        # write. Previously execution fell through to df.to_csv and failed
        # with an unbound `df` once the debugger session ended.
        return

    df = handler(*args.files)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % args.out)


if __name__ == '__main__':
    run()
...@@ -20,11 +20,7 @@ mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" ...@@ -20,11 +20,7 @@ mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date date
pip freeze
# git rev-parse HEAD
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME cd $SCRATCH_DIR/$DOWNLOAD_NAME
...@@ -65,23 +61,33 @@ wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_f ...@@ -65,23 +61,33 @@ wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_f
# Data extracted from supplemental PDF table. # Data extracted from supplemental PDF table.
PMID=27600516 PMID=27600516
mkdir -p raw/$PMID mkdir -p raw/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/Gloger_Neri_CII_2016_27600516_extracted_from_pdf.csv -P raw/$PMID wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID
# Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070] # Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]
# Supplemental zip downloaded from publication # Supplemental zip downloaded from publication
PMID=26992070 PMID=26992070
mkdir -p raw/$PMID mkdir -p raw/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID
unzip raw/$PMID/pmic12297-sup-0001-supinfo.zip cd raw/$PMID
unzip pmic12297-sup-0001-supinfo.zip
cd ../..
# Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690] # Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690]
PMID=27412690 PMID=27412690
mkdir -p raw/$PMID mkdir -p raw/$PMID
wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID
# Pearson, ..., Perreault 2016 J Clin Invest [PMID 27841757] # Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
# Note: we do not use the original data from this publication, we use 28832583's reanalysis of it. # Note: we do not use the original data from this publication, we use 28832583's reanalysis of it.
# #
# Hassan, ..., van Veelen Mol Cell Proteomics 2015 [PMID 23481700]
PMID=23481700
mkdir -p raw/$PMID
wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P raw/$PMID
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment