Skip to content
Snippets Groups Projects
Commit c2f609a9 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

clean up data_published download

parent f04fac2d
No related branches found
No related tags found
No related merge requests found
......@@ -26,41 +26,12 @@ pip freeze
git status

# Work inside this download's scratch directory; everything left in it is
# archived by the tar command at the end. Variable expansions are quoted so
# that paths containing spaces or glob characters are not word-split.
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
cp "$SCRIPT_DIR/parse.py" .

# Kim et al 2014 [PMID 25017736]
wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2009.mhci.public.1.txt
wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt
wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt

# Abelin et al 2017 [PMID 28228285]
# This is now in IEDB, so commenting out.
# wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/abelin2017.hits.csv.bz2
#
# For the supplementary tables downloaded below, the ID indicates the PMID.
#
# These have all been incorporated into IEDB so we now leave them commented out.
#ID=28904123 # Di Marco et al 2017
#wget -q http://www.jimmunol.org/highwire/filestream/347380/field_highwire_adjunct_files/1/JI_1700938_Supplemental_Table_1.xlsx -O "${ID}.xlsx"
#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
#ID=30410026 # Illing et al 2018
#wget -q https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-018-07109-w/MediaObjects/41467_2018_7109_MOESM3_ESM.xlsx -O "${ID}.xlsx"
#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
#ID=28855257 # Mobbs et al 2017
#wget -q http://www.jbc.org/lookup/suppl/doi:10.1074/jbc.M117.806976/-/DC1/jbc.M117.806976-1.xlsx -O "${ID}.xlsx"
#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
#ID=29437277 # Ramarathinam et al 2018
# NOTE: this URL contains '&', so it must stay quoted if the line is re-enabled;
# unquoted, the shell would background wget and treat "file=..." as a separate command.
#wget -q "https://onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fpmic.201700253&file=pmic12831-sup-0002-Data.xlsx" -O "${ID}.xlsx"
#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
#ID=28218747 # Pymm et al 2017
#wget -q https://media.nature.com/original/nature-assets/nsmb/journal/v24/n4/extref/nsmb.3381-S2.xlsx -O "${ID}.xlsx"
#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"

# Keep a copy of this script next to the data, compress the log, and archive
# the whole directory one level up.
cp "$SCRIPT_ABSOLUTE_PATH" .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
......
"""
Parse various publications' supplementary tables
"""
import sys
import argparse
import pandas
import mhcnames
def normalize_allele_name(s):
    """
    Normalize an allele name with mhcnames, falling back to "UNKNOWN".

    Any exception raised by mhcnames.normalize_allele_name is swallowed and
    the literal string "UNKNOWN" is returned instead of propagating it.
    """
    try:
        normalized = mhcnames.normalize_allele_name(s)
    except Exception:
        normalized = "UNKNOWN"
    return normalized
# Command-line interface for this script. All three options are required:
#   --format   PMID of the publication, used to pick the parser
#   --input    path to the downloaded supplementary table
#   --out-csv  path of the CSV file to write
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--format",
    metavar="PMID",
    required=True,
    help="pubmed ID of paper to parse")
parser.add_argument(
    "--input",
    required=True,
    help="Input data")
# --out-csv must be registered exactly once: adding the same option string a
# second time makes argparse raise ArgumentError (conflicting option string)
# while the module is still being loaded.
parser.add_argument(
    "--out-csv",
    required=True,
    help="Output file to write")
# Registry of per-publication parsers, keyed by PMID string (the value given
# to --format). Each parser receives the object produced by
# pandas.read_excel(..., sheet_name=None) in run() and is expected to return a
# pandas.DataFrame, since run() calls .to_csv() and .shape on the result.
#
# None of the parsers below is implemented yet. They raise NotImplementedError
# instead of dropping into an ipdb breakpoint, so that non-interactive runs
# fail immediately with a clear message and ipdb is not required.
PARSERS = {}


# Di Marco et al 2017
def parse_28904123(input):
    """Parse the supplementary table of Di Marco et al 2017 (not implemented)."""
    raise NotImplementedError(
        "Parser for PMID 28904123 (Di Marco et al 2017) is not implemented")


PARSERS["28904123"] = parse_28904123


# Illing et al 2018
def parse_30410026(input):
    """Parse the supplementary table of Illing et al 2018 (not implemented)."""
    raise NotImplementedError(
        "Parser for PMID 30410026 (Illing et al 2018) is not implemented")


PARSERS["30410026"] = parse_30410026


# Mobbs et al 2017
def parse_28855257(input):
    """Parse the supplementary table of Mobbs et al 2017 (not implemented)."""
    raise NotImplementedError(
        "Parser for PMID 28855257 (Mobbs et al 2017) is not implemented")


PARSERS["28855257"] = parse_28855257


# Ramarathinam et al 2018
def parse_29437277(input):
    """Parse the supplementary table of Ramarathinam et al 2018 (not implemented)."""
    raise NotImplementedError(
        "Parser for PMID 29437277 (Ramarathinam et al 2018) is not implemented")


PARSERS["29437277"] = parse_29437277


# Pymm et al 2017
def parse_28218747(input):
    """Parse the supplementary table of Pymm et al 2017 (not implemented)."""
    raise NotImplementedError(
        "Parser for PMID 28218747 (Pymm et al 2017) is not implemented")


PARSERS["28218747"] = parse_28218747
def run():
    """
    Command-line entry point.

    Parses the command-line arguments, loads the input workbook (only paths
    ending in ".xlsx" are accepted), hands it to the parser registered in
    PARSERS for the requested format, and writes the returned dataframe to
    the --out-csv path without the index.

    Raises ValueError if the input is not an .xlsx path or if no parser is
    registered for the requested format.
    """
    args = parser.parse_args(sys.argv[1:])

    # Reject anything that is not an Excel workbook before trying to read it.
    if not args.input.endswith(".xlsx"):
        raise ValueError("Unsupported input: %s" % args.input)
    # sheet_name=None asks pandas to load every sheet of the workbook.
    sheets = pandas.read_excel(args.input, sheet_name=None)

    parse_function = PARSERS.get(args.format)
    if not parse_function:
        raise ValueError("Unsupported format: %s" % args.format)

    result = parse_function(sheets)
    result.to_csv(args.out_csv, index=False)
    print("Wrote dataframe of shape %s: %s" % (result.shape, args.out_csv))


if __name__ == '__main__':
    run()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment