Skip to content
Snippets Groups Projects
Commit 7f6fb1dc authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

curating some multiallelic ms

parent c69e8062
No related branches found
No related tags found
No related merge requests found
"""
Filter and combine various peptide/MHC datasets to derive a composite training set,
optionally including eluted peptides identified by mass-spec.
"""
import sys
import argparse
import pandas
import mhcnames
def normalize_allele_name(s):
    """
    Return the mhcnames-normalized form of the allele name `s`.

    This is a best-effort wrapper: any exception raised while normalizing is
    swallowed and the placeholder string "UNKNOWN" is returned instead.
    """
    try:
        normalized = mhcnames.normalize_allele_name(s)
    except Exception:
        normalized = "UNKNOWN"
    return normalized
# Command-line interface. Positional arguments: the PMID selecting which
# curation handler to run, followed by one or more input files.
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument("pmid", metavar="PMID", help="PMID of dataset to curate")
parser.add_argument(
    "files", metavar="FILE", nargs="+", help="File paths of data to curate")

# Optional arguments.
parser.add_argument("--out", metavar="OUT.csv", help="Out file path")
parser.add_argument(
    "--debug",
    default=False,
    action="store_true",
    help="Leave user in pdb if PMID is unsupported")

# Registry mapping a PMID string to the function that curates that dataset.
HANDLERS = dict()
def load(filenames, **kwargs):
    """
    Read each of the given files and return a dict keyed by filename.

    Files ending in ".csv" are parsed with pandas.read_csv, and files ending
    in ".xlsx" or ".xls" with pandas.read_excel; `kwargs` are forwarded to
    whichever reader is used. For any other extension the file is not read
    and the stored value is the filename itself.
    """
    def read_one(path):
        # Dispatch on the file extension.
        if path.endswith(".csv"):
            return pandas.read_csv(path, **kwargs)
        if path.endswith((".xlsx", ".xls")):
            return pandas.read_excel(path, **kwargs)
        return path

    return {path: read_one(path) for path in filenames}
def debug(*filenames):
    """
    Load the given files and drop into an interactive ipdb session.

    The loaded data is bound to the local name `loaded` so that it can be
    inspected from the debugger prompt.
    """
    loaded = load(filenames)  # intentionally unused: inspect it in ipdb
    from ipdb import set_trace
    set_trace()
def pmid_27600516(filename):
    """
    Curate the peptide list for PMID 27600516.

    The input CSV must have a `peptide` column. Entries beginning with "#"
    are sample headers: the text after the "#" is the sample id, and the
    entries that follow belong to that sample until the next header.

    Peptides are stripped of surrounding whitespace, upper-cased, and
    de-duplicated within each sample. If the header for a sample id appears
    more than once, the peptides from all of its sections are combined rather
    than the later section replacing the earlier one.

    Parameters
    ----------
    filename : str or file-like
        Passed directly to pandas.read_csv.

    Returns
    -------
    pandas.DataFrame
        Columns "sample_id" and "peptide", with one row per unique
        (sample, peptide) pair. Samples are in order of first appearance and
        peptides are sorted within each sample.

    Raises
    ------
    ValueError
        If a peptide entry appears before any sample header.
    """
    df = pandas.read_csv(filename)

    sample_to_peptides = {}
    current_peptides = None
    for entry in df.peptide:
        if entry.startswith("#"):
            # setdefault (rather than assigning a fresh container) keeps the
            # peptides already collected when a sample header is repeated.
            current_peptides = sample_to_peptides.setdefault(
                entry[1:], set())
        elif current_peptides is None:
            # Explicit raise instead of `assert`, which is stripped under -O.
            raise ValueError(
                "Peptide %r appears before any '#' sample header" % (entry,))
        else:
            current_peptides.add(entry.strip().upper())

    rows = [
        (sample, peptide)
        for (sample, peptides) in sample_to_peptides.items()
        for peptide in sorted(peptides)
    ]
    return pandas.DataFrame(rows, columns=["sample_id", "peptide"])
# Register the handler under its PMID string so run() can dispatch to it.
HANDLERS["27600516"] = pmid_27600516
def run():
    """
    Command-line entry point: curate one dataset and write it as CSV.

    Parses the command line, looks up the handler registered in HANDLERS for
    the requested PMID, calls it with the input file paths, and writes the
    resulting DataFrame to the --out path.

    If no handler is registered for the PMID and --debug was given, the input
    files are loaded and the user is left in a debugger session; nothing is
    written in that case.

    Raises
    ------
    NotImplementedError
        If the PMID has no registered handler and --debug was not given.
    SystemExit
        Via parser.error, if a handler exists but --out was not given.
    """
    args = parser.parse_args(sys.argv[1:])

    handler = HANDLERS.get(args.pmid)
    if handler is None:
        if not args.debug:
            raise NotImplementedError(args.pmid)
        debug(*args.files)
        # There is no curated result for an unsupported PMID, so stop here
        # rather than falling through to the write step with `df` unbound.
        return

    if not args.out:
        # DataFrame.to_csv(None) returns the CSV text instead of writing a
        # file, so without this check nothing would be saved while
        # "Wrote: None" was still printed. Fail before doing any work.
        parser.error("--out is required to write the curated dataset")

    df = handler(*args.files)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % args.out)


if __name__ == '__main__':
    run()
......@@ -20,11 +20,7 @@ mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
# git rev-parse HEAD
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
......@@ -65,23 +61,33 @@ wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_f
# Data extracted from supplemental PDF table.
PMID=27600516
mkdir -p raw/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/Gloger_Neri_CII_2016_27600516_extracted_from_pdf.csv -P raw/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID
# Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]
# Supplemental zip downloaded from publication
PMID=26992070
mkdir -p raw/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID
unzip raw/$PMID/pmic12297-sup-0001-supinfo.zip
cd raw/$PMID
unzip pmic12297-sup-0001-supinfo.zip
cd ../..
# Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690]
PMID=27412690
mkdir -p raw/$PMID
wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID
# Pearson, ..., Perreault 2016 J Clin Invest [PMID 27841757]
# Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
# Note: we do not use the original data from this publication; we use 28832583's reanalysis of it.
#
# Hassan, ..., van Veelen Mol Cell Proteomics 2015 [PMID 23481700]
PMID=23481700
mkdir -p raw/$PMID
wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P raw/$PMID
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment