Update to match the new PDF viewer, which uses a shadow root instead of a nested iframe.
2024-11-30 11:40:16 +08:00 · 2024-07-23 02:50:19 +08:00 · 2024-07-23 02:50:19 +08:00 · 357b196613
commit 357b196613
parent c97582bef2
6 changed files with 38 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -1,5 +1,7 @@
 ## Blackboard marks downloader (UWA)

+NOTE: _Who gives a shit about marks? I don't think I am as much of a tryhard as when I first made this script. Either way I'm still patching this when the crappy code breaks at the end of each semester..._
+
 ---

 **Dependencies**:
--- a/constants/constants.py
+++ b/constants/constants.py
@ -6,6 +6,6 @@ BASE_URL = "https://lms.uwa.edu.au"  # Include protocol.
 DL_DIR = os.getcwd() + os.path.sep + "tmp" + os.path.sep
 Path(DL_DIR).mkdir(parents=True, exist_ok=True)

-SAVE_DIR = "grades"
+SAVE_DIR = "grades_2024-07-23_B"

 URL_LIST = SAVE_DIR + os.path.sep + "URLS.txt"
--- a/main.py
+++ b/main.py
@ -1,5 +1,6 @@
 #!/usr/bin/env python3

+import time
 from selenium.webdriver.remote.webdriver import WebDriver
 from typing import cast
 import requests
@ -8,6 +9,7 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.common.exceptions import ElementClickInterceptedException
 # For chrome stuff
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 from selenium.webdriver.chrome.options import Options
@ -97,15 +99,26 @@ def scrape_further(driver: WebDriver, path, session):
        # cant be arsed to figure out how the pspdfkit js that executes this download works.
        SwitchToIFrame(
            driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
-        SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
+        # New version does not have nested iframe and uses a shadowroot instead...
+        # SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
        get_feedback = True
-    except Exception:
-        print("No feedback to download")
+    except: pass
+
    if get_feedback:
-        dl_button = WaitClickable(
-            driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
+        # dl_button = WaitClickable(driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
+        # New version does not have nested iframe and uses a shadowroot instead...
+        # Loop since it takes a while for the iframe to load...
+        while True:
+            try:
+                dl_button = driver.execute_script("return arguments[0].shadowRoot.querySelector(\"button[title='Download']\")", driver.find_element(By.XPATH, "//div[@class='PSPDFKit-Container']"))
                dl_button.click()
+                break
+            except:
+                time.sleep(1)
        download_file(path)
+        print("[INFO]: Downloaded feedback")
+    else:
+        print("\x1b[1;31m[WARNING]\x1b\x1b[0m: No feedback to download")
    request_stack.download_all()
 # end of scrape_further

@ -137,7 +150,7 @@ OPTIONS.add_argument('--disable-dev-shm-usage')
 OPTIONS.add_experimental_option("prefs", prefs)
 # OPTIONS.add_argument("--headless")
 driver = webdriver.Chrome(
-    executable_path='chromedriver.exe',
+    executable_path='chromedriver',
    desired_capabilities=CAPABILITIES,
    options=OPTIONS
 )
@ -189,8 +202,11 @@ for i, course in enumerate(course_details):
    }
    """)

+    try:
        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
+    except ElementClickInterceptedException:    # already clicked on All category - do not do anything
+        pass

    table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div")

--- a/utils/asset.py
+++ b/utils/asset.py
@ -66,15 +66,15 @@ class Asset:

    def write_metadata(self, headers):
        metacsv = [
-            ["original_filename", self.original_filename],
-            ["readable_filename", self.filename],
-            ["url", self.url],
+            ["original_filename", self.original_filename or "error"],
+            ["readable_filename", self.filename or "error"],
+            ["url", self.url or "error"],
            ["pathhash", hashlib.md5(
-                self.url.encode()).hexdigest()],
-            ["etag", headers['ETag']],
-            ["etaghash", self.etag_hash],
-            ["last-modified", headers["Last-Modified"]],
-            ["content-length", headers["Content-Length"]],
+                self.url.encode()).hexdigest() or "error"],
+            ["etag", headers['ETag'] or "error"],
+            ["etaghash", self.etag_hash or "error"],
+            ["last-modified", headers["Last-Modified"] or "error"],
+            ["content-length", headers["Content-Length"] or "error"],
            ["age", ""],
        ]
        csvpath = self.path.joinpath("ZZZ_metadata")
--- a/utils/utils.py
+++ b/utils/utils.py
@ -49,7 +49,7 @@ def save_html(dir, filename, driver: WebDriver, page_log_file=False):

 def download_file(dest):
    d = Path(DL_DIR)
-    time.sleep(2)
+    time.sleep(10)   # sorry for blocking!
    downloading = True
    poll = 1.0
    while downloading:
--- a/utils/wait.py
+++ b/utils/wait.py
@ -1,6 +1,6 @@
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-timeout = 5
+timeout = 10


 def WaitClickable(driver, locator):