diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py index c3eca7cd28b7d66efc5278864bf900507e822af3..1380ade1c7b6bec1858b94697b50f7a86d90d5ad 100644 --- a/mhcflurry/downloads.py +++ b/mhcflurry/downloads.py @@ -16,6 +16,8 @@ from collections import OrderedDict from appdirs import user_data_dir from pkg_resources import resource_string +import pandas + ENVIRONMENT_VARIABLES = [ "MHCFLURRY_DATA_DIR", "MHCFLURRY_DOWNLOADS_CURRENT_RELEASE", @@ -130,15 +132,30 @@ def get_current_release_downloads(): metadata : dict Info about the download from downloads.yml such as URL + + up_to_date : bool or None + Whether the download URL(s) match what was used to download the current + data. This is None if it cannot be determined. """ downloads = ( get_downloads_metadata() ['releases'] [get_current_release()] ['downloads']) + + def up_to_date(dir, urls): + try: + df = pandas.read_csv(join(dir, "DOWNLOAD_INFO.csv")) + return list(df.url) == list(urls) + except IOError: + return None + return OrderedDict( (download["name"], { 'downloaded': exists(join(get_downloads_dir(), download["name"])), + 'up_to_date': up_to_date( + join(get_downloads_dir(), download["name"]), + [download['url']] if 'url' in download else download['part_urls']), 'metadata': download, }) for download in downloads ) diff --git a/mhcflurry/downloads_command.py b/mhcflurry/downloads_command.py index 321c32b4af4f839be8536e5af3c15095ae87782a..b8ded545ca4a6083056850835820e40755b74156 100644 --- a/mhcflurry/downloads_command.py +++ b/mhcflurry/downloads_command.py @@ -36,6 +36,7 @@ from tqdm import tqdm tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 import posixpath +import pandas try: from urllib.request import urlretrieve @@ -262,6 +263,10 @@ def fetch_subcommand(args): for member in tqdm(tar.getmembers(), desc='Extracting'): tar.extractall(path=result_dir, members=[member]) tar.close() + + # Save URLs that were used for this download. + pandas.DataFrame({"url": urls}).to_csv( + os.path.join(result_dir, "DOWNLOAD_INFO.csv"), index=False) qprint("Extracted %d files to: %s" % ( len(names), quote(result_dir))) finally: @@ -298,8 +303,8 @@ def info_subcommand(args): downloads = get_current_release_downloads() - format_string = "%-40s %-12s %-20s " - print(format_string % ("DOWNLOAD NAME", "DOWNLOADED?", "URL")) + format_string = "%-40s %-12s %-12s %-20s " + print(format_string % ("DOWNLOAD NAME", "DOWNLOADED?", "UP TO DATE?", "URL")) for (item, info) in downloads.items(): urls = ( @@ -313,6 +318,10 @@ def info_subcommand(args): print(format_string % ( item, yes_no(info['downloaded']), + "" if not info['downloaded'] else ( + "UNKNOWN" if info['up_to_date'] is None + else yes_no(info['up_to_date']) + ), url_description))