clean up data_published download

c2f609a9 · Tim O'Donnell · f04fac2d · c2f609a9 · f04fac2d
Commit c2f609a9 authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/data_published/GENERATE.sh
+++ b/downloads-generation/data_published/GENERATE.sh
@@ -26,41 +26,12 @@ pip freeze
 git status

 cd $SCRATCH_DIR/$DOWNLOAD_NAME
-cp $SCRIPT_DIR/parse.py .

 # Kim et al 2014 [PMID 25017736]
 wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2009.mhci.public.1.txt
 wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt
 wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt

-# Abelin et al 2017 [PMID 28228285]
-# This is now in IEDB, so commenting out.
-# wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/abelin2017.hits.csv.bz2
-
-#
-# For the supplementary tables downloaded below, the ID indicates the PMID.
-#
-# These have all been incorporated into IEDB so we now leave them commented out.
-#ID=28904123  # Di Marco et al 2017
-#wget -q http://www.jimmunol.org/highwire/filestream/347380/field_highwire_adjunct_files/1/JI_1700938_Supplemental_Table_1.xlsx -O "${ID}.xlsx"
-#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
-
-#ID=30410026  # Illing et al 2018
-#wget -q https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-018-07109-w/MediaObjects/41467_2018_7109_MOESM3_ESM.xlsx -O "${ID}.xlsx"
-#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
-
-#ID=28855257  # Mobbs et al 2017
-#wget -q http://www.jbc.org/lookup/suppl/doi:10.1074/jbc.M117.806976/-/DC1/jbc.M117.806976-1.xlsx -O "${ID}.xlsx"
-#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
-
-#ID=29437277  # Ramarathinam et al 2018
-#wget -q https://onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fpmic.201700253&file=pmic12831-sup-0002-Data.xlsx -O "${ID}.xlsx"
-#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
-
-#ID=28218747  # Pymm et al 2017
-#wget -q https://media.nature.com/original/nature-assets/nsmb/journal/v24/n4/extref/nsmb.3381-S2.xlsx -O "${ID}.xlsx"
-#python parse.py --format "$ID" --input "${ID}.xlsx" --out-csv "${ID}.csv"
-
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
 tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *

--- a/downloads-generation/data_published/parse.py
+++ b/downloads-generation/data_published/parse.py
-"""
-Parse various publications' supplementary tables
-"""
-import sys
-import argparse
-
-import pandas
-
-import mhcnames
-
-
-def normalize_allele_name(s):
-    try:
-        return mhcnames.normalize_allele_name(s)
-    except Exception:
-        return "UNKNOWN"
-
-
-parser = argparse.ArgumentParser(usage=__doc__)
-
-parser.add_argument(
-    "--format",
-    metavar="PMID",
-    required=True,
-    help="pubmed ID of paper to parse")
-parser.add_argument(
-    "--input",
-    required=True,
-    help="Input data")
-parser.add_argument(
-    "--out-csv",
-    required=True,
-    help="Output file to write")
-
-parser.add_argument(
-    "--out-csv",
-    required=True,
-    help="Result file")
-
-
-PARSERS = {}
-
-# Di Marco et al 2017
-def parse_28904123(input):
-    import ipdb ; ipdb.set_trace()
-
-PARSERS["28904123"] = parse_28904123
-
-# Illing et al 2018
-def parse_30410026(input):
-    import ipdb ; ipdb.set_trace()
-
-PARSERS["30410026"] = parse_30410026
-
-# Mobbs et al 2017
-def parse_28855257(input):
-    import ipdb ; ipdb.set_trace()
-
-PARSERS["28855257"] = parse_28855257
-
-# Ramarathinam et al 2018
-def parse_29437277(input):
-    import ipdb ; ipdb.set_trace()
-
-PARSERS["29437277"] = parse_29437277
-
-# Pymm et al 2017
-def parse_28218747(input):
-    import ipdb ; ipdb.set_trace()
-
-PARSERS["28218747"] = parse_28218747
-
-
-def run():
-    args = parser.parse_args(sys.argv[1:])
-
-    if args.input.endswith(".xlsx"):
-        handle = pandas.read_excel(args.input, sheet_name=None)
-    else:
-        raise ValueError("Unsupported input: %s" % args.input)
-
-    parse_function = PARSERS.get(args.format)
-    if not parse_function:
-        raise ValueError("Unsupported format: %s" % args.format)
-
-    result = parse_function(handle)
-
-    result.to_csv(args.out_csv, index=False)
-    print("Wrote dataframe of shape %s: %s" % (result.shape, args.out_csv))
-
-if __name__ == '__main__':
-    run()