diff --git a/VERSION.txt b/VERSION.txt index 27c97fc4bbcd0a5759d52a9880bc8ad13ce3bfc7..24c49700a8927be8fd2d2f0cc50a4344756c24e2 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -tabScraper 0.2.0 \ No newline at end of file +tabScraper 0.3.0 \ No newline at end of file diff --git a/releases/tabScraper-0.3.0.zip b/releases/tabScraper-0.3.0.zip new file mode 100644 index 0000000000000000000000000000000000000000..7fe7e2273e96ccbbba75e8983bd5d8d6f5254b2a Binary files /dev/null and b/releases/tabScraper-0.3.0.zip differ diff --git a/tabScraper.spec b/tabScraper.spec new file mode 100644 index 0000000000000000000000000000000000000000..f716748fc236ea30aa9e53bfb49663764c7d4776 --- /dev/null +++ b/tabScraper.spec @@ -0,0 +1,32 @@ +# -*- mode: python -*- + +block_cipher = None + + +a = Analysis(['tabscraper\\tabScraper.py'], + pathex=['C:\\Users\\Lindsey.152\\source\\repos\\tabScraper'], + binaries=[('chromedriver.exe', 'bin')], + datas=[], + hiddenimports=['pikepdf._cpphelpers'], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + [], + name='tabScraper', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + runtime_tmpdir=None, + console=True ) diff --git a/tabscraper/scraper.py b/tabscraper/scraper.py index a6c714e3ce0f8a12a739dbb53069f17e7f15eda2..08f78d809b20ef5f7fc2730fc732a1d15aa92626 100644 --- a/tabscraper/scraper.py +++ b/tabscraper/scraper.py @@ -4,61 +4,60 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from time import sleep import os -import shutil -import inspect import sys +import requests # cecl -if getattr(sys, 'frozen', False) : - # running in a bundle - chromedriver_path = os.path.join(sys._MEIPASS, 'bin','chromedriver') -else : - # running live - chromedriver_path = 'chromedriver' - -options = webdriver.ChromeOptions() -prefs = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], - "download.default_directory": os.path.join(os.getcwd(), "reports")} -options.add_experimental_option("prefs", prefs) -# options.add_argument("user-data-dir=chrome_pro") - +if getattr(sys, 'frozen', False): + # running in a bundle + chromedriver_path = os.path.join(sys._MEIPASS, 'bin','chromedriver') +else: + # running live + chromedriver_path = 'chromedriver' def login(start_url): - driver = webdriver.Chrome(chromedriver_path,chrome_options=options) - driver.get(start_url) + ''' + login + :param start_url: tableau server url + returns a requests.Session object with auth cookies preloaded + ''' + + # webdriver setup, disable logging to stderr + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument("--log-level=3") + chrome_options.add_argument("--log-path={0}".format(os.devnull)) + chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) - WebDriverWait(driver, 10).until( + driver = webdriver.Chrome(chromedriver_path,chrome_options=chrome_options) + driver.get(start_url) + WebDriverWait(driver, 20).until( EC.visibility_of_element_located((By.ID, "username"))) - - ##### uncomment and fill out username and pw for testing ##### - ### username = '' - ### password = '' - # driver.find_element_by_id('username').send_keys(username) - # driver.find_element_by_id('password').send_keys(password) - # driver.find_element_by_id('submit').click() - ##### - # wait 30s for user to login WebDriverWait(driver, 30).until( EC.invisibility_of_element_located((By.ID, "username"))) - - # handle buckeyepass duo + # wait for buckeyepass duo frame to load duo_button = '''//*[@id="auth_methods"]/fieldset[1]/div[1]/button''' WebDriverWait(driver, 5).until( EC.visibility_of_element_located((By.TAG_NAME, 'iframe'))) + # switch context to duo iframe iframe = driver.find_element_by_tag_name('iframe') driver.switch_to.frame(iframe) + # send Push notification WebDriverWait(driver, 5).until( EC.visibility_of_element_located((By.XPATH, duo_button))) driver.find_element_by_xpath(duo_button).click() - + # wait for page redirect WebDriverWait(driver, 30).until( - EC.visibility_of_element_located((By.ID, 'ng-app'))) - WebDriverWait(driver, 10).until( - EC.visibility_of_element_located((By.TAG_NAME, 'iframe'))) - iframe = driver.find_element_by_tag_name('iframe') - return driver + EC.url_matches('https://dataviz.rae.osu.edu/#/views')) + # transfer cookies from webdriver to new requests.session + session = requests.Session() + cookies = dict([(x['name'], x['value']) for x in driver.get_cookies()]) + session.cookies = requests.utils.cookiejar_from_dict(cookies) + # close out webdriver + driver.quit() + + return session def wait_for_file(f_name, timeout=10, exist=True): @@ -72,15 +71,15 @@ def wait_for_file(f_name, timeout=10, exist=True): return False return True - -def download(driver, output_folder, default_fp, views): +def download(session, output_folder, views): for f_name, url in views.items(): view_fpath = os.path.join(output_folder, f_name) - driver.get(url) + r = session.get(url) + with open(view_fpath, 'wb') as f: + f.write(r.content) # wait for file to finish downloading - assert wait_for_file(default_fp) - shutil.move(default_fp, view_fpath) - # wait until file has been moved - assert wait_for_file(default_fp, exist=False) - assert wait_for_file(view_fpath) + download_status = wait_for_file(view_fpath) + status_string = "Success" if download_status else "Failed" + print("Downloading {0}...{1}".format(f_name,status_string)) return None + diff --git a/tabscraper/tabScraper.py b/tabscraper/tabScraper.py index cbba32e08d73c7082e96e4eabf4829498ad1b1f3..5d783a834dc310071d739611add1d8ffa5b43655 100644 --- a/tabscraper/tabScraper.py +++ b/tabscraper/tabScraper.py @@ -41,10 +41,10 @@ view_dict = dict([(v, '{0}/{1}'.format(dashboard_url, v)) for v in view_list]) if not os.path.exists(output_folder): os.mkdir(output_folder) -# start up WebDriver +# download individual PDFs driver = login(start_url) -download(driver, output_folder, dashboard_file_path, view_dict) -driver.quit() +download(driver, output_folder, view_dict) + pdf = Pdf.new() pdf_list = [Pdf.open(os.path.join(output_folder, v)) for v in view_dict]