Update to match the new PDF viewer, which uses a shadow root instead of a nested iframe.
This commit is contained in:
Peter 2024-07-23 02:50:19 +08:00
parent c97582bef2
commit 357b196613
6 changed files with 38 additions and 20 deletions

View File

@ -1,5 +1,7 @@
## Blackboard marks downloader (UWA) ## Blackboard marks downloader (UWA)
NOTE: _Who gives a shit about marks? I don't think I am as much of a tryhard as when I first made this script. Either way I'm still patching this when the crappy code breaks at the end of each semester..._
--- ---
**Dependencies**: **Dependencies**:

View File

@ -6,6 +6,6 @@ BASE_URL = "https://lms.uwa.edu.au" # Include protocol.
DL_DIR = os.getcwd() + os.path.sep + "tmp" + os.path.sep DL_DIR = os.getcwd() + os.path.sep + "tmp" + os.path.sep
Path(DL_DIR).mkdir(parents=True, exist_ok=True) Path(DL_DIR).mkdir(parents=True, exist_ok=True)
SAVE_DIR = "grades" SAVE_DIR = "grades_2024-07-23_B"
URL_LIST = SAVE_DIR + os.path.sep + "URLS.txt" URL_LIST = SAVE_DIR + os.path.sep + "URLS.txt"

28
main.py
View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import time
from selenium.webdriver.remote.webdriver import WebDriver from selenium.webdriver.remote.webdriver import WebDriver
from typing import cast from typing import cast
import requests import requests
@ -8,6 +9,7 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import ElementClickInterceptedException
# For chrome stuff # For chrome stuff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
@ -97,15 +99,26 @@ def scrape_further(driver: WebDriver, path, session):
# cant be arsed to figure out how the pspdfkit js that executes this download works. # cant be arsed to figure out how the pspdfkit js that executes this download works.
SwitchToIFrame( SwitchToIFrame(
driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']")) driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']")) # New version does not have nested iframe and uses a shadowroot instead...
# SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
get_feedback = True get_feedback = True
except Exception: except: pass
print("No feedback to download")
if get_feedback: if get_feedback:
dl_button = WaitClickable( # dl_button = WaitClickable(driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']")) # New version does not have nested iframe and uses a shadowroot instead...
# Loop since it takes a while for the iframe to load...
while True:
try:
dl_button = driver.execute_script("return arguments[0].shadowRoot.querySelector(\"button[title='Download']\")", driver.find_element(By.XPATH, "//div[@class='PSPDFKit-Container']"))
dl_button.click() dl_button.click()
break
except:
time.sleep(1)
download_file(path) download_file(path)
print("[INFO]: Downloaded feedback")
else:
print("\x1b[1;31m[WARNING]\x1b\x1b[0m: No feedback to download")
request_stack.download_all() request_stack.download_all()
# end of scrape_further # end of scrape_further
@ -137,7 +150,7 @@ OPTIONS.add_argument('--disable-dev-shm-usage')
OPTIONS.add_experimental_option("prefs", prefs) OPTIONS.add_experimental_option("prefs", prefs)
# OPTIONS.add_argument("--headless") # OPTIONS.add_argument("--headless")
driver = webdriver.Chrome( driver = webdriver.Chrome(
executable_path='chromedriver.exe', executable_path='chromedriver',
desired_capabilities=CAPABILITIES, desired_capabilities=CAPABILITIES,
options=OPTIONS options=OPTIONS
) )
@ -189,8 +202,11 @@ for i, course in enumerate(course_details):
} }
""") """)
try:
WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click() WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click() WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
except ElementClickInterceptedException: # already clicked on All category - do not do anything
pass
table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div") table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div")

View File

@ -66,15 +66,15 @@ class Asset:
def write_metadata(self, headers): def write_metadata(self, headers):
metacsv = [ metacsv = [
["original_filename", self.original_filename], ["original_filename", self.original_filename or "error"],
["readable_filename", self.filename], ["readable_filename", self.filename or "error"],
["url", self.url], ["url", self.url or "error"],
["pathhash", hashlib.md5( ["pathhash", hashlib.md5(
self.url.encode()).hexdigest()], self.url.encode()).hexdigest() or "error"],
["etag", headers['ETag']], ["etag", headers['ETag'] or "error"],
["etaghash", self.etag_hash], ["etaghash", self.etag_hash or "error"],
["last-modified", headers["Last-Modified"]], ["last-modified", headers["Last-Modified"] or "error"],
["content-length", headers["Content-Length"]], ["content-length", headers["Content-Length"] or "error"],
["age", ""], ["age", ""],
] ]
csvpath = self.path.joinpath("ZZZ_metadata") csvpath = self.path.joinpath("ZZZ_metadata")

View File

@ -49,7 +49,7 @@ def save_html(dir, filename, driver: WebDriver, page_log_file=False):
def download_file(dest): def download_file(dest):
d = Path(DL_DIR) d = Path(DL_DIR)
time.sleep(2) time.sleep(10) # sorry for blocking!
downloading = True downloading = True
poll = 1.0 poll = 1.0
while downloading: while downloading:

View File

@ -1,6 +1,6 @@
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
timeout = 5 timeout = 10
def WaitClickable(driver, locator): def WaitClickable(driver, locator):