Update to match the new PDF viewer, which uses a shadow root instead of a nested iframe.
This commit is contained in:
Peter 2024-07-23 02:50:19 +08:00
parent c97582bef2
commit 357b196613
6 changed files with 38 additions and 20 deletions

View File

@ -1,5 +1,7 @@
## Blackboard marks downloader (UWA)
NOTE: _Who gives a shit about marks? I don't think I am as much of a tryhard as when I first made this script. Either way I'm still patching this when the crappy code breaks at the end of each semester..._
---
**Dependencies**:

View File

@ -6,6 +6,6 @@ BASE_URL = "https://lms.uwa.edu.au" # Include protocol.
DL_DIR = os.getcwd() + os.path.sep + "tmp" + os.path.sep
Path(DL_DIR).mkdir(parents=True, exist_ok=True)
SAVE_DIR = "grades"
SAVE_DIR = "grades_2024-07-23_B"
URL_LIST = SAVE_DIR + os.path.sep + "URLS.txt"

28
main.py
View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
import time
from selenium.webdriver.remote.webdriver import WebDriver
from typing import cast
import requests
@ -8,6 +9,7 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import ElementClickInterceptedException
# For chrome stuff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
@ -97,15 +99,26 @@ def scrape_further(driver: WebDriver, path, session):
# cant be arsed to figure out how the pspdfkit js that executes this download works.
SwitchToIFrame(
driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
# New version does not have nested iframe and uses a shadowroot instead...
# SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
get_feedback = True
except Exception:
print("No feedback to download")
except: pass
if get_feedback:
dl_button = WaitClickable(
driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
# dl_button = WaitClickable(driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
# New version does not have nested iframe and uses a shadowroot instead...
# Loop since it takes a while for the iframe to load...
while True:
try:
dl_button = driver.execute_script("return arguments[0].shadowRoot.querySelector(\"button[title='Download']\")", driver.find_element(By.XPATH, "//div[@class='PSPDFKit-Container']"))
dl_button.click()
break
except:
time.sleep(1)
download_file(path)
print("[INFO]: Downloaded feedback")
else:
print("\x1b[1;31m[WARNING]\x1b\x1b[0m: No feedback to download")
request_stack.download_all()
# end of scrape_further
@ -137,7 +150,7 @@ OPTIONS.add_argument('--disable-dev-shm-usage')
OPTIONS.add_experimental_option("prefs", prefs)
# OPTIONS.add_argument("--headless")
driver = webdriver.Chrome(
executable_path='chromedriver.exe',
executable_path='chromedriver',
desired_capabilities=CAPABILITIES,
options=OPTIONS
)
@ -189,8 +202,11 @@ for i, course in enumerate(course_details):
}
""")
try:
WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
except ElementClickInterceptedException: # already clicked on All category - do not do anything
pass
table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div")

View File

@ -66,15 +66,15 @@ class Asset:
def write_metadata(self, headers):
metacsv = [
["original_filename", self.original_filename],
["readable_filename", self.filename],
["url", self.url],
["original_filename", self.original_filename or "error"],
["readable_filename", self.filename or "error"],
["url", self.url or "error"],
["pathhash", hashlib.md5(
self.url.encode()).hexdigest()],
["etag", headers['ETag']],
["etaghash", self.etag_hash],
["last-modified", headers["Last-Modified"]],
["content-length", headers["Content-Length"]],
self.url.encode()).hexdigest() or "error"],
["etag", headers['ETag'] or "error"],
["etaghash", self.etag_hash or "error"],
["last-modified", headers["Last-Modified"] or "error"],
["content-length", headers["Content-Length"] or "error"],
["age", ""],
]
csvpath = self.path.joinpath("ZZZ_metadata")

View File

@ -49,7 +49,7 @@ def save_html(dir, filename, driver: WebDriver, page_log_file=False):
def download_file(dest):
d = Path(DL_DIR)
time.sleep(2)
time.sleep(10) # sorry for blocking!
downloading = True
poll = 1.0
while downloading:

View File

@ -1,6 +1,6 @@
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
timeout = 5
timeout = 10
def WaitClickable(driver, locator):