From 8e76eb8b55469fbffd2a84d5569f09ccd955df8f Mon Sep 17 00:00:00 2001
From: peter
Date: Fri, 18 Jun 2021 19:23:19 +0800
Subject: [PATCH] more hackyness - download marked files w/ feedback

---
 .gitignore             |  1 +
 README.md              |  9 ++++-
 constants/constants.py |  6 ++++
 main.py                | 82 ++++++++++++++++++++++++++++++------------
 utils/utils.py         | 41 +++++++++++++++++++++
 utils/wait.py          |  8 ++---
 6 files changed, 119 insertions(+), 28 deletions(-)

diff --git a/.gitignore b/.gitignore
index ec9f6f6..21727bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 grades/
+tmp/
 __pycache__
 chromedriver*
 test*
\ No newline at end of file
diff --git a/README.md b/README.md
index 029a844..2453be0 100644
--- a/README.md
+++ b/README.md
@@ -13,4 +13,11 @@ Made this script to download my marks, receipts and all the stuff I uploaded for
 There is no bulk marks download feature in the current lms, even though it seems other blackboard installations can give students this bulk download ability. It relies on a lot of js crap so I ended up using selenium all the way through. Doesn't download styles to save space, you'll have to download the css and js yourself and it has to be absolute because the script makes no effort to make the links relative.
 
-This one was made for UWA but you may be able to tweak it for your institution (see constants.py).
\ No newline at end of file
+This one was made for UWA but you may be able to tweak it for your institution (see constants.py).
+
+Just added the ability to download the graded results, which may contain annotations. It uses a really hacky method, so no metadata file is created for these downloads.
+
+## Note:
+* Does not download Turnitin reports. You have to click through to the feedback site manually.
+* Does not download multiple submission attempts - only downloads the last/graded attempt.
+* Check that the marks page defaults to the 'All' category rather than something else like 'Submitted'. The script should correct this, but to be safe, click 'All' if it isn't selected already.
\ No newline at end of file
diff --git a/constants/constants.py b/constants/constants.py
index ab2b1bb..bce7679 100644
--- a/constants/constants.py
+++ b/constants/constants.py
@@ -1 +1,7 @@
+import os
+from pathlib import Path
+
 BASE_URL = "https://lms.uwa.edu.au" # Include protocol.
+
+DL_DIR = os.getcwd()+os.path.sep+"tmp"+os.path.sep
+Path(DL_DIR).mkdir(parents=True, exist_ok=True)
\ No newline at end of file
diff --git a/main.py b/main.py
index d866dfd..4364272 100644
--- a/main.py
+++ b/main.py
@@ -11,6 +11,7 @@ from selenium.webdriver.chrome.options import Options
 # ---
 from urllib.parse import parse_qs, urlparse
 import os
+from os.path import sep
 import requests
 import time
 import getpass
@@ -23,12 +24,14 @@ import pathlib
 import utils.selectors
 from utils.asset import Asset, RequestStack
 from utils.wait import SwitchToIFrame, WaitClickable, WaitDiv
-from constants.constants import BASE_URL
+from constants.constants import BASE_URL, DL_DIR
 from utils.login import login
 from utils.selectors import Selectors
-from utils.utils import friendly_filename, get_assignment_name, get_text_excluding_children, save_html
+from utils.utils import download_file, friendly_filename, get_assignment_name, get_text_excluding_children, save_html
 import code
 from random import randint
+from pathlib import Path
+from selenium.common.exceptions import ElementNotInteractableException
 
 testing = False
 try:
@@ -39,8 +42,8 @@
 except:
     cookie = None
 
+# stupid bug
 def click_the_fing_button(driver,button):
-    # https://stackoverflow.com/a/67414801 stupid bug
     try:
         ActionChains(driver).move_to_element(button)
         ActionChains(driver).click(button).perform()
@@ -73,17 +76,46 @@ def scrape_further(driver,path):
     attempts = [ x.get_attribute('href') for x in attempts ]
     for i, attempt in enumerate(attempts):
         request_stack.add_file(attempt,path)
+
+    get_feedback = False
+    try:
+        # download button causes a tab to appear quickly, download, then disappear
+        # need to capture the url to get the metadata and dl to the correct location
+        # can't be arsed to figure out how the pspdfkit js that executes this download works.
+        SwitchToIFrame(driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
+        SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
+        get_feedback = True
+    except:
+        print("No feedback to download")
+    if get_feedback:
+        dl_button = WaitClickable(driver,(By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
+        dl_button.click()
+        download_file(path)
     request_stack.download_all()
+# end of scrape_further
+
 
 parser = argparse.ArgumentParser(description='Automated microsoft SSO login.')
 # parser.add_argument("-p", "--password", help="Automatically use provided password", default="")
 parser.add_argument("-u", "--username", help="Automatically use provided userID", default="")
+path = ['grades']
 args = parser.parse_args()
 
 CAPABILITIES = DesiredCapabilities.CHROME
-CAPABILITIES['goog:loggingPrefs'] = {'performance': 'ALL'}
+CAPABILITIES['goog:loggingPrefs'] = {
+    'performance' : 'ALL',
+}
+
+for f in os.listdir(DL_DIR):
+    os.remove(Path(DL_DIR).joinpath(f))
+prefs = {
+    "profile.default_content_settings.popups": 0,
+    "download.default_directory": DL_DIR,
+    "directory_upgrade": True
+}
 
 OPTIONS = Options()
+OPTIONS.add_experimental_option("prefs", prefs)
 # OPTIONS.add_argument("--headless")
 driver = webdriver.Chrome(
     executable_path='chromedriver',
@@ -94,13 +126,15 @@
 driver.maximize_window()
 
 cookie = {'Cookie': login(args, driver)} # do Login.
+# need to load this page JUST to remove the tos warning so it doesn't fuck up everything down the line.
 driver.get(BASE_URL+"/webapps/gradebook/do/student/viewCourses")
-
 try:
     WaitClickable(driver,(By.CLASS_NAME, "button-1")).click()
 except:
     print("no tos warning - skipped")
-SwitchToIFrame(driver, (By.ID, 'mybbCanvas'))
+
+driver.get(BASE_URL+"/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades")
+save_html(sep.join(path), 'entrypoint', driver.page_source)
 
 # get courseIDs
 courses = driver.find_element_by_id("left_stream_mygrades")\
@@ -118,7 +152,6 @@ for i, course_results in enumerate(courses):
         'url' : course_url
     })
 
-path = ['grades']
 for i, course in enumerate(course_details):
     path.append(course['name']) # course name
     print(course['name'])
@@ -130,12 +163,11 @@ for i, course in enumerate(course_details):
         }
     """)
 
+    WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click()
     WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click()
 
     table = driver.find_elements_by_xpath("//div[@id='grades_wrapper']/div")
 
-    save_html("/".join(path), path[0], driver.page_source)
-
     for i, assignment in enumerate(table):
         print(i)
         buttons = assignment.find_elements_by_tag_name("input")
@@ -151,14 +183,17 @@ for i, course in enumerate(course_details):
         path.append(assignment_name)
         # download information if it exists.
         if information_link:
-            ActionChains(driver).move_to_element(block).click(block).perform()
-            print("Switched "+assignment_name)
-            WebDriverWait(driver,10).until(EC.number_of_windows_to_be(2))
-            driver.switch_to.window(driver.window_handles[1])
-            save_html("/".join(path),"information",driver.page_source)
-            scrape_further(driver, "/".join(path))
-            driver.close()
-            driver.switch_to.window(driver.window_handles[0])
+            try:
+                ActionChains(driver).move_to_element(block).click(block).perform()
+                print("Switched "+assignment_name)
+                WebDriverWait(driver,10).until(EC.number_of_windows_to_be(2))
+                driver.switch_to.window(driver.window_handles[1])
+                save_html(sep.join(path),"information",driver.page_source)
+                scrape_further(driver, sep.join(path))
+                driver.close()
+                driver.switch_to.window(driver.window_handles[0])
+            except ElementNotInteractableException:
+                print('idk')
         # download rubric if it exists.
         for button in buttons:
             action = button.get_attribute("onclick")
             if action != None and "showInLightBox" in action:
                 driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                 driver.switch_to.window(driver.window_handles[1])
                 WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer"))
-                save_html("/".join(path),"rubric",driver.page_source)
+                save_html(sep.join(path),"rubric",driver.page_source)
                 driver.find_element_by_xpath("//li[@id='listViewTab']/a").click()
                 WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList"))
-                save_html("/".join(path),"list",driver.page_source)
+                save_html(sep.join(path),"list",driver.page_source)
                 detailed_buttons = driver.find_elements_by_xpath("//div[@class='u_controlsWrapper']/input")
                 detailed_buttons[1].click()
                 detailed_buttons[0].click()
-                save_html("/".join(path),"list_detailed",driver.page_source)
+                save_html(sep.join(path),"list_detailed",driver.page_source)
                 driver.close()
                 driver.switch_to.window(driver.window_handles[0])
-        path.pop()
+        path.pop()
+    save_html(sep.join(path), path[0], driver.page_source)
     WaitClickable(driver,(By.XPATH,"//a[@value='S']")).click()
-    save_html("/".join(path),"submitted",driver.page_source)
+    save_html(sep.join(path),"submitted",driver.page_source)
     try:
         WaitClickable(driver,(By.XPATH,"//div[@id='submissionReceipts']//a")).click()
         WaitClickable(driver,(By.XPATH,"//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click()
     except:
         print('No items?')
-    save_html("/".join(path),"receipts",driver.page_source)
+    save_html(sep.join(path),"receipts",driver.page_source)
     path.pop()
diff --git a/utils/utils.py b/utils/utils.py
index e53dd4b..068221b 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,5 +1,14 @@
 import pathlib
 import re
+from constants.constants import DL_DIR
+from utils.wait import WaitClickable
+from utils.asset import Asset
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import time
+import os
+from pathlib import Path
+import shutil
 
 def friendly_filename(name):
     name = friendly_dirname(name)
@@ -27,6 +36,38 @@ def save_html(dir,filename,page_source):
     with open(file, "w", encoding="utf-8") as f:
         f.write(page_source)
 
+# Why is it so hard to just get the url of a single tab...
+# def get_fast_dl(driver,button):
+#     windows = len(driver.window_handles)
+#     return
+
+# Because selenium seems to fuck up the url switching to a "download" tab,
+# I have to use the inbuilt download in chrome :(. That also means no etag/metadata
+# but to be honest it's using annotate-au.foundations.blackboard.com and not bbcswebdav system
+# so the tag may not exist in the first place.
+def download_file(dest):
+    d = Path(DL_DIR)
+    time.sleep(2)
+    downloading = True
+    poll = 1.0
+    while downloading:
+        for f in os.listdir(d):
+            if Path(f).suffix == '.crdownload':
+                time.sleep(poll)
+                poll *= 1.5
+                break
+            else:
+                _dest = Path(dest).joinpath("MARKED__"+f)
+                try:
+                    shutil.move(d.joinpath(f),_dest)
+                except shutil.SameFileError:
+                    os.remove(_dest)
+                    shutil.move(d.joinpath(f),_dest)
+
+        if len(os.listdir(d)) == 0:
+            downloading = False
+
 # https://stackoverflow.com/a/19040341
 def get_text_excluding_children(driver, element):
     return driver.execute_script("""
diff --git a/utils/wait.py b/utils/wait.py
index a491830..b7eeca7 100644
--- a/utils/wait.py
+++ b/utils/wait.py
@@ -1,7 +1,7 @@
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-
+timeout = 4
 # find_element_safe = lambda name,timeout=30:WebDriverWait(driver, timeout).until(lambda x: x.find_element_by_id(name))
-WaitClickable = lambda driver,locator:WebDriverWait(driver, 10).until(EC.element_to_be_clickable(locator))
-WaitDiv = lambda driver,locator:WebDriverWait(driver, 5).until(EC.presence_of_element_located(locator))
-SwitchToIFrame = lambda driver,locator:WebDriverWait(driver, 5).until(EC.frame_to_be_available_and_switch_to_it(locator))
\ No newline at end of file
+WaitClickable = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator))
+WaitDiv = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))
+SwitchToIFrame = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.frame_to_be_available_and_switch_to_it(locator))
\ No newline at end of file
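
The marked-file download in this patch rests on two pieces: Chrome is pointed at DL_DIR through the "prefs" experimental option, and download_file() then watches that directory until no *.crdownload partial files remain before moving the finished file next to the other scraped assets. Below is a minimal standalone sketch of that wait-and-move step for reference; the wait_for_marked_file name, the hard timeout, and the fixed poll interval are illustrative assumptions, not code from this repository.

# Illustrative sketch only (not part of the patch): block until Chrome has
# finished writing into DL_DIR, then move every completed file into `dest`
# with the same "MARKED__" prefix the script uses. Assumes DL_DIR holds only
# files from the current download (the script approximates this by emptying
# tmp/ at startup).
import os
import shutil
import time
from pathlib import Path

DL_DIR = Path(os.getcwd()) / "tmp"   # mirrors the layout in constants.py
DL_DIR.mkdir(parents=True, exist_ok=True)

def wait_for_marked_file(dest, timeout=60.0, poll=0.5):
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    deadline = time.time() + timeout
    while time.time() < deadline:
        files = list(DL_DIR.iterdir())
        # Chrome writes partial downloads as <name>.crdownload and renames on completion.
        if files and not any(f.suffix == ".crdownload" for f in files):
            for f in files:
                shutil.move(str(f), str(dest / ("MARKED__" + f.name)))
            return True
        time.sleep(poll)
    return False

download_file() in the patch does the same thing with an unbounded loop and a growing poll interval; a hard timeout like the one above is one way to avoid hanging if the PSPDFKit download button never actually starts a download.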