diff --git a/README.md b/README.md
index 3452885..458bbef 100755
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 ## Blackboard marks downloader (UWA)
 
+NOTE: _Who gives a shit about marks? I don't think I am as much of a tryhard as when I first made this script. Either way I'm still patching this when the crappy code breaks at the end of each semester..._
+
 ---
 
 **Dependencies**:
diff --git a/constants/constants.py b/constants/constants.py
index 32d1b92..54daf3d 100755
--- a/constants/constants.py
+++ b/constants/constants.py
@@ -6,6 +6,6 @@ BASE_URL = "https://lms.uwa.edu.au"  # Include protocol.
 DL_DIR = os.getcwd() + os.path.sep + "tmp" + os.path.sep
 Path(DL_DIR).mkdir(parents=True, exist_ok=True)
 
-SAVE_DIR = "grades"
+SAVE_DIR = "grades_2024-07-23_B"
 
 URL_LIST = SAVE_DIR + os.path.sep + "URLS.txt"
diff --git a/main.py b/main.py
index 37339dd..275c0a7 100755
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import time
 from selenium.webdriver.remote.webdriver import WebDriver
 from typing import cast
 import requests
@@ -8,6 +9,7 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.common.exceptions import ElementClickInterceptedException
 # For chrome stuff
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 from selenium.webdriver.chrome.options import Options
@@ -97,15 +99,26 @@ def scrape_further(driver: WebDriver, path, session):
         # cant be arsed to figure out how the pspdfkit js that executes this download works.
         SwitchToIFrame(
             driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
-        SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
+        # New version does not have nested iframe and uses a shadowroot instead...
+        # SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
         get_feedback = True
-    except Exception:
-        print("No feedback to download")
+    except:
         pass
+
     if get_feedback:
-        dl_button = WaitClickable(
-            driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
-        dl_button.click()
+        # dl_button = WaitClickable(driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
+        # New version does not have nested iframe and uses a shadowroot instead...
+        # Loop since it takes a while for the iframe to load...
+        while True:
+            try:
+                dl_button = driver.execute_script("return arguments[0].shadowRoot.querySelector(\"button[title='Download']\")", driver.find_element(By.XPATH, "//div[@class='PSPDFKit-Container']"))
+                dl_button.click()
+                break
+            except:
+                time.sleep(1)
         download_file(path)
+        print("[INFO]: Downloaded feedback")
+    else:
+        print("\x1b[1;31m[WARNING]\x1b\x1b[0m: No feedback to download")
     request_stack.download_all()
     # end of scrape_further
@@ -137,7 +150,7 @@ OPTIONS.add_argument('--disable-dev-shm-usage')
 OPTIONS.add_experimental_option("prefs", prefs)
 # OPTIONS.add_argument("--headless")
 driver = webdriver.Chrome(
-    executable_path='chromedriver.exe',
+    executable_path='chromedriver',
     desired_capabilities=CAPABILITIES,
     options=OPTIONS
 )
@@ -189,8 +202,11 @@ for i, course in enumerate(course_details):
     }
     """)
 
-    WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
-    WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
+    try:
+        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
+        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
+    except ElementClickInterceptedException:  # already clicked on All category - do not do anything
+        pass
 
     table = driver.find_elements(By.XPATH,
                                  "//div[@id='grades_wrapper']/div")
diff --git a/utils/asset.py b/utils/asset.py
index 21a2e33..b474ffe 100755
--- a/utils/asset.py
+++ b/utils/asset.py
@@ -66,15 +66,15 @@
 
     def write_metadata(self, headers):
         metacsv = [
-            ["original_filename", self.original_filename],
-            ["readable_filename", self.filename],
-            ["url", self.url],
+            ["original_filename", self.original_filename or "error"],
+            ["readable_filename", self.filename or "error"],
+            ["url", self.url or "error"],
             ["pathhash", hashlib.md5(
-                self.url.encode()).hexdigest()],
-            ["etag", headers['ETag']],
-            ["etaghash", self.etag_hash],
-            ["last-modified", headers["Last-Modified"]],
-            ["content-length", headers["Content-Length"]],
+                self.url.encode()).hexdigest() or "error"],
+            ["etag", headers['ETag'] or "error"],
+            ["etaghash", self.etag_hash or "error"],
+            ["last-modified", headers["Last-Modified"] or "error"],
+            ["content-length", headers["Content-Length"] or "error"],
             ["age", ""],
         ]
         csvpath = self.path.joinpath("ZZZ_metadata")
diff --git a/utils/utils.py b/utils/utils.py
index 1a9cdb6..c4560d1 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -49,7 +49,7 @@ def save_html(dir, filename, driver: WebDriver, page_log_file=False):
 
 def download_file(dest):
     d = Path(DL_DIR)
-    time.sleep(2)
+    time.sleep(10)  # sorry for blocking!
     downloading = True
     poll = 1.0
     while downloading:
diff --git a/utils/wait.py b/utils/wait.py
index 403f996..d37cd9c 100755
--- a/utils/wait.py
+++ b/utils/wait.py
@@ -1,6 +1,6 @@
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 
-timeout = 5
+timeout = 10
 
 def WaitClickable(driver, locator):
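
For reference (not part of the patch): the `while True` / `time.sleep(1)` busy-wait added to `scrape_further` could also be expressed with Selenium's own `WebDriverWait`, which polls and eventually times out instead of looping forever if PSPDFKit never renders. The sketch below is an untested illustration of that idea; the helper name is invented here, while the XPath, the `shadowRoot.querySelector` call, and `driver` (the `webdriver.Chrome` instance from `main.py`) are copied from the hunks above.

```python
# Sketch only: same shadow-root lookup as the patched scrape_further loop,
# wrapped in WebDriverWait so it gives up after `timeout` seconds instead of
# spinning forever. The helper name is hypothetical, not part of the repo.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait


def wait_for_pspdfkit_download_button(driver, timeout=30):
    def _find_button(drv):
        # NoSuchElementException is in WebDriverWait's default ignored
        # exceptions, so a not-yet-rendered container just triggers a retry.
        container = drv.find_element(By.XPATH, "//div[@class='PSPDFKit-Container']")
        # querySelector returns null until the toolbar exists; a falsy return
        # value makes WebDriverWait keep polling.
        return drv.execute_script(
            "return arguments[0].shadowRoot.querySelector(\"button[title='Download']\")",
            container)

    return WebDriverWait(driver, timeout, poll_frequency=1).until(_find_button)


# Usage inside scrape_further, replacing the while True loop:
#     dl_button = wait_for_pspdfkit_download_button(driver)
#     dl_button.click()
```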