diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh index e8a37ce6cb9d4ca6e32b58116f1d9f9b9bca7474..de4f01af5a4739ba01c318c9acbedb39208f989c 100755 --- a/downloads-generation/data_published/GENERATE.sh +++ b/downloads-generation/data_published/GENERATE.sh @@ -26,41 +26,12 @@ pip freeze git status cd $SCRATCH_DIR/$DOWNLOAD_NAME -cp $SCRIPT_DIR/parse.py . # Kim et al 2014 [PMID 25017736] wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2009.mhci.public.1.txt wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt -# Abelin et al 2017 [PMID 28228285] -# This is now in IEDB, so commenting out. -# wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/abelin2017.hits.csv.bz2 - -# -# For the supplementary tables downloaded below, the ID indicates the PMID. -# -# These have all been incorporated into IEDB so we now leave them commented out. -#ID=28904123 # Di Marco et al 2017 -#wget -q http://www.jimmunol.org/highwire/filestream/347380/field_highwire_adjunct_files/1/JI_1700938_Supplemental_Table_1.xlsx -O "${ID}.xlsx" -#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv" - -#ID=30410026 # Illing et al 2018 -#wget -q https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-018-07109-w/MediaObjects/41467_2018_7109_MOESM3_ESM.xlsx -O "${ID}.xlsx" -#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv" - -#ID=28855257 # Mobbs et al 2017 -#wget -q http://www.jbc.org/lookup/suppl/doi:10.1074/jbc.M117.806976/-/DC1/jbc.M117.806976-1.xlsx -O "${ID}.xlsx" -#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv" - -#ID=29437277 # Ramarathinam et al 2018 -#wget -q https://onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fpmic.201700253&file=pmic12831-sup-0002-Data.xlsx -O "${ID}.xlsx" -#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv" - -#ID=28218747 # Pymm et al 2017 -#wget -q https://media.nature.com/original/nature-assets/nsmb/journal/v24/n4/extref/nsmb.3381-S2.xlsx -O "${ID}.xlsx" -#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv" - cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * diff --git a/downloads-generation/data_published/parse.py b/downloads-generation/data_published/parse.py deleted file mode 100755 index 7931633c3e6461ae65dfb7bbc9c9a6ab4d9ef6dd..0000000000000000000000000000000000000000 --- a/downloads-generation/data_published/parse.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Parse various publications' supplementary tables -""" -import sys -import argparse - -import pandas - -import mhcnames - - -def normalize_allele_name(s): - try: - return mhcnames.normalize_allele_name(s) - except Exception: - return "UNKNOWN" - - -parser = argparse.ArgumentParser(usage=__doc__) - -parser.add_argument( - "--format", - metavar="PMID", - required=True, - help="pubmed ID of paper to parse") -parser.add_argument( - "--input", - required=True, - help="Input data") -parser.add_argument( - "--out-csv", - required=True, - help="Output file to write") - -parser.add_argument( - "--out-csv", - required=True, - help="Result file") - - -PARSERS = {} - -# Di Marco et al 2017 -def parse_28904123(input): - import ipdb ; ipdb.set_trace() - -PARSERS["28904123"] = parse_28904123 - -# Illing et al 2018 -def parse_30410026(input): - import ipdb ; ipdb.set_trace() - -PARSERS["30410026"] = parse_30410026 - -# Mobbs et al 2017 -def parse_28855257(input): - import ipdb ; ipdb.set_trace() - -PARSERS["28855257"] = parse_28855257 - -# Ramarathinam et al 2018 -def parse_29437277(input): - import ipdb ; ipdb.set_trace() - -PARSERS["29437277"] = parse_29437277 - -# Pymm et al 2017 -def parse_28218747(input): - import ipdb ; ipdb.set_trace() - -PARSERS["28218747"] = parse_28218747 - - -def run(): - args = parser.parse_args(sys.argv[1:]) - - if args.input.endswith(".xlsx"): - handle = pandas.read_excel(args.input, sheet_name=None) - else: - raise ValueError("Unsupported input: %s" % args.input) - - parse_function = PARSERS.get(args.format) - if not parse_function: - raise ValueError("Unsupported format: %s" % args.format) - - result = parse_function(handle) - - result.to_csv(args.out_csv, index=False) - print("Wrote dataframe of shape %s: %s" % (result.shape, args.out_csv)) - -if __name__ == '__main__': - run()