update selenium, 2023 version

This commit is contained in:
Peter 2022-11-26 15:51:00 +08:00
parent 234cd31bf4
commit c97582bef2
11 changed files with 538 additions and 515 deletions

3
.gitignore vendored Normal file → Executable file
View File

@ -5,4 +5,5 @@ chromedriver*
test*
.vscode/
*.7z
*.tar
*.tar
.venv

52
README.md Normal file → Executable file
View File

@ -1,24 +1,28 @@
## Blackboard marks downloader (UWA)
---
**Dependencies**:
- python
- selenium
- chromedriver, placed relative to this directory
Run the script with `py main.py` and enter your student number and password. I'm not taking your personal details, but *don't take my word for it* - always check the source if you don't trust it!
---
Made this script to download my marks, receipts and all the stuff I uploaded for my first semester. It's a fucking mess of spaghetti python code because to be honest I really just wanted to get this out of the way and have some time for other stuff after the first round of exams. It's a mess of code, with some bits (the login) being picked from the scraper script and some of the scraper asset objects being translated from Ruby to Python here (in a quick and incomplete way). This will probably break in some way when the UI is overhauled for next semester :/
There is no bulk marks download feature in the current lms, even though it seems other blackboard installations can give students this bulk download ability. It relies on a lot of js crap so I ended up using selenium all the way through. Doesn't download styles to save space, you'll have to download the css and js yourself and it has to be absolute because the script makes no effort to make the links relative.
This one was made for UWA but you may be able to tweak it for your institution (see constants.py).
Just made it able to download the graded results which may contain annotations. Using a really hacky method to do it so it doesn't create a metadata file for it.
## Note:
* Does not download turnitin reports. You have to click the link manually to the feedback site.
* Does not download multiple submission attempts - only downloads the last/graded attempt.
* Check that the default page is the 'all' category for the marks instead of something else like the submitted category. The script should correct this but just to be safe click on all if it isn't already
* Sometimes chromedriver closes after logging in, when not in headless mode. Try interacting with the page before logging in.
## Blackboard marks downloader (UWA)
---
**Dependencies**:
- python
- selenium
- chromedriver, placed relative to this directory
Run the script with `py main.py` and enter your student number and password. I'm not taking your personal details, but _don't take my word for it_ - always check the source if you don't trust it!
---
Made this script to download my marks, receipts and all the stuff I uploaded for my first semester.
There is no bulk marks download feature in the current LMS, even though it seems other Blackboard installations can give students this bulk download ability. The script saves the URLs of the pages it visits to `URLS.txt`, so you can use something like the SingleFile extension and its batch "save URLs" feature to archive every visited page (I recommend enabling scripts in the SingleFile settings so that comments are saved).
This one was made for UWA but you may be able to tweak it for your institution (see constants.py).
Just made it able to download the graded results which may contain annotations. Using a really hacky method to do it so it doesn't create a metadata file for it.
## Note:
- Does not download turnitin reports. You have to click the link manually to the feedback site.
- Does not download multiple submission attempts - only downloads the last/graded attempt.
- Check that the default page is the 'all' category for the marks instead of something else like the submitted category. The script should correct this but just to be safe click on all if it isn't already
- Sometimes chromedriver closes after logging in, when not in headless mode. Try interacting with the page before logging in.

10
constants/constants.py Normal file → Executable file
View File

@ -1,7 +1,11 @@
import os
from pathlib import Path
# Root URL of the Blackboard installation; relative LMS paths are appended to it.
BASE_URL = "https://lms.uwa.edu.au" # Include protocol.
BASE_URL = "https://lms.uwa.edu.au" # Include protocol.
# Scratch folder "tmp" under the current working directory (with a trailing
# separator). It is created as a side effect of importing this module.
DL_DIR = os.getcwd()+os.path.sep+"tmp"+os.path.sep
Path(DL_DIR).mkdir(parents=True, exist_ok=True)
DL_DIR = os.getcwd() + os.path.sep + "tmp" + os.path.sep
Path(DL_DIR).mkdir(parents=True, exist_ok=True)
# Name of the output folder, and the path of the text file kept inside it.
SAVE_DIR = "grades"
URL_LIST = SAVE_DIR + os.path.sep + "URLS.txt"

514
main.py Normal file → Executable file
View File

@ -1,252 +1,262 @@
#!/usr/bin/env python3
import requests
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
# For chrome stuff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
# ---
from urllib.parse import parse_qs, urlparse
import os
from os.path import sep
import re
import argparse
from utils.asset import RequestStack
from utils.wait import SwitchToIFrame, WaitClickable, WaitDiv
from constants.constants import BASE_URL, DL_DIR
from utils.login import login
from utils.utils import download_file, get_assignment_name, save_html
from pathlib import Path
from selenium.common.exceptions import ElementNotInteractableException
# ``testing`` is True only when the optional helper module is really available.
testing = False
try:
    from utils.test import get_etc
except Exception:
    # Optional helper missing or broken: install a stub so callers can still
    # invoke get_etc() unconditionally.
    def get_etc(*args):
        """Stand-in for the optional ``utils.test.get_etc``; always returns False."""
        return False
else:
    # Set the flag only after the import succeeded. It used to be set before
    # the import and therefore stayed True even when the import failed.
    testing = True
# stupid bug
def click_the_fing_button(driver, button, max_attempts=5):
    """Click ``button`` and wait until exactly two browser windows are open.

    Selenium sometimes ignores the click, so a failed attempt resizes the
    window (which tends to "wake up" the page) and tries again.

    Args:
        driver: the Selenium driver controlling the browser.
        button: the element to click.
        max_attempts: how many click attempts to make before giving up.

    Raises:
        Exception: whatever the last attempt raised, once ``max_attempts``
            attempts have failed (previously this recursed without limit).
    """
    resized = False
    try:
        for attempt in range(1, max_attempts + 1):
            try:
                # One chain, actually performed: the old separate
                # move_to_element() chain was never executed.
                ActionChains(driver).move_to_element(
                    button).click(button).perform()
                WebDriverWait(driver, 2).until(EC.number_of_windows_to_be(2))
                return
            except Exception:
                if attempt == max_attempts:
                    raise
                # hack to wake selenium up when it doesnt want to click the button!
                driver.set_window_size(1024, 768)
                resized = True
    finally:
        # Restore the window only if one of the retries shrank it.
        if resized:
            driver.maximize_window()
# You can probably replace this with a recursive method like in blackboard
# scraper but tbh i just want to get this script done so i can stop working
# for once.
def scrape_further(driver, path, session):
    """Save everything linked from the information tab that is currently open.

    The information tab is expected to be the second window handle: each test
    attempt is opened as a third window and control returns to handle 1.

    Args:
        driver: Selenium driver, already switched to the information tab.
        path: directory (string) that receives the saved pages and files.
        session: ``requests`` session carrying the login cookies.
    """
    # attempts for bb-held tests
    attempt_links = driver.find_elements(
        By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]")
    attempt_urls = [link.get_attribute('href') for link in attempt_links]
    for index, attempt_url in enumerate(attempt_urls):
        if attempt_url is None:
            continue
        # Not every assessment link carries an attempt id; do not crash on those.
        attempt_id = parse_qs(urlparse(attempt_url).query).get(
            'attempt_id', ['unknown'])[0]
        name = "attempt_" + str(index) + "_[" + attempt_id + "]"
        relative_url = re.sub("^" + re.escape(BASE_URL), "", attempt_url)
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script(
            "window.open(arguments[0])", BASE_URL + relative_url)
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(3))
        driver.switch_to.window(driver.window_handles[2])
        save_html(path, name, driver.page_source)
        if testing:
            get_etc(driver, session, path)
        driver.close()
        driver.switch_to.window(driver.window_handles[1])

    # Comments may contain feedback links
    request_stack = RequestStack(session)
    feedback_links = driver.find_elements(
        By.XPATH, "//a[contains(@href, '/bbcswebdav')]")
    for href in [link.get_attribute('href') for link in feedback_links]:
        if (href is not None) and ("bbcswebdav" in href):
            request_stack.add_file(href, path)

    # submission file for assignment
    submission_links = driver.find_elements(
        By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]")
    for href in [link.get_attribute('href') for link in submission_links]:
        if href is not None:
            request_stack.add_file(href, path)

    get_feedback = False
    try:
        # download button causes a tab to appear quickly, download, then disappear
        # need to capture the url to get the metadata and dl to the correct location
        # cant be arsed to figure out how the pspdfkit js that executes this download works.
        SwitchToIFrame(
            driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
        SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
        get_feedback = True
    except Exception:
        print("No feedback to download")
    if get_feedback:
        dl_button = WaitClickable(
            driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
        dl_button.click()
        download_file(path)
    request_stack.download_all()
# end of scrape_further
parser = argparse.ArgumentParser(description='Automated microsoft SSO login.')
# parser.add_argument("-p", "--password", help="Automatically use provided password", default="")
parser.add_argument("-u", "--username",
                    help="Automatically use provided userID", default="")

# Components of the directory currently being written to; the scraping loop
# appends and pops entries as it walks courses and assignments.
path = ['grades']
args = parser.parse_args()

# Work on a copy: DesiredCapabilities.CHROME is a dict shared by the whole
# selenium package, so editing it in place would leak into every other user.
CAPABILITIES = DesiredCapabilities.CHROME.copy()
CAPABILITIES['goog:loggingPrefs'] = {
    'performance': 'ALL',
}

# Empty the scratch download folder so leftovers from an earlier run are not
# mistaken for files downloaded by this run.
for f in os.listdir(DL_DIR):
    os.remove(Path(DL_DIR).joinpath(f))

# Chrome profile preferences: send downloads to DL_DIR.
prefs = {
    "profile.default_content_settings.popups": 0,
    "download.default_directory": DL_DIR,
    "directory_upgrade": True
}
OPTIONS = Options()
OPTIONS.add_argument('--no-sandbox')
OPTIONS.add_argument('--disable-dev-shm-usage')
OPTIONS.add_experimental_option("prefs", prefs)
# OPTIONS.add_argument("--headless")
driver = webdriver.Chrome(
    executable_path='chromedriver.exe',
    desired_capabilities=CAPABILITIES,
    options=OPTIONS
)
driver.maximize_window()

cookies = login(args, driver)  # do Login.
# Copy the browser's cookies into a requests session so plain HTTP downloads
# are authenticated as well.
session = requests.Session()
for cookie in cookies:
    session.cookies.set(cookie["name"], cookie["value"])
# Everything below drives the browser. The try/finally guarantees the browser
# is shut down even when a step raises; previously an exception left it running.
try:
    # Load this page first purely to dismiss the terms-of-service warning,
    # which otherwise gets in the way of every later step.
    driver.get(BASE_URL+"/webapps/gradebook/do/student/viewCourses")
    try:
        WaitClickable(driver, (By.CLASS_NAME, "button-1")).click()
    except Exception:
        print("no tos warning - skipped")

    driver.get(
        BASE_URL+"/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades")
    save_html(sep.join(path), 'entrypoint', driver.page_source)
    WaitClickable(driver, (By.ID, "left_stream_mygrades"))

    # get courseIDs
    courses = driver.find_element(By.ID, "left_stream_mygrades")\
        .find_elements(By.XPATH, "//div[@role='tab']")
    course_details = []
    for i, course_results in enumerate(courses):
        ActionChains(driver).move_to_element(course_results).perform()
        course_url = course_results.get_attribute("bb:rhs")
        # NOTE(review): the XPath starts with "//", so it searches the whole
        # document rather than this tab; the i-th match is assumed to belong
        # to the i-th tab - confirm against the page markup.
        course_name = course_results.find_elements(
            By.XPATH, "//span[@class='stream_area_name']")[i].text
        course_name += " [" + \
            parse_qs(urlparse(course_url).query)['course_id'][0]+"]"
        course_details.append({
            'name': course_name,
            'url': course_url
        })

    for course in course_details:
        path.append(course['name'])  # course name
        print(course['name'])
        driver.get(BASE_URL+course['url'])
        # Make the page open grade details in a new window instead of an
        # in-page frame, so each one can be saved on its own.
        driver.execute_script(
            "\n"
            "mygrades.loadContentFrame = function(url) {\n"
            "window.open(url);\n"
            "}\n"
        )
        # Select the "A" filter. It was clicked twice in the original;
        # NOTE(review): presumably deliberate, so it is kept.
        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()

        table = driver.find_elements(
            By.XPATH, "//div[@id='grades_wrapper']/div")
        for row_number, assignment in enumerate(table):
            print(row_number)
            buttons = assignment.find_elements(By.TAG_NAME, "input")
            block = None
            assignment_name = None
            information_link = False
            try:
                block = assignment.find_element(
                    By.XPATH, "./div[@class='cell gradable']/a[@onclick]")
                information_link = True
            except Exception:
                # No clickable link in this row: fall back to the cell itself
                # so the row still gets a name.
                block = assignment.find_element(
                    By.XPATH, "./div[@class='cell gradable']")
            assignment_name = get_assignment_name(driver, block)
            path.append(assignment_name)

            # download information if it exists.
            if information_link:
                try:
                    ActionChains(driver).move_to_element(
                        block).click(block).perform()
                    print("Switched "+assignment_name)
                    WebDriverWait(driver, 10).until(
                        EC.number_of_windows_to_be(2))
                    driver.switch_to.window(driver.window_handles[1])
                    save_html(sep.join(path), "information",
                              driver.page_source)
                    scrape_further(driver, sep.join(path), session)
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                except ElementNotInteractableException:
                    print('idk')

            # download rubric if it exists.
            for button in buttons:
                action = button.get_attribute("onclick")
                if action is not None and "showInLightBox" not in action:
                    click_the_fing_button(driver, button)
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight)")
                    driver.switch_to.window(driver.window_handles[1])
                    WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer"))
                    save_html(sep.join(path), "rubric", driver.page_source)
                    driver.find_element(
                        By.XPATH, "//li[@id='listViewTab']/a").click()
                    WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList"))
                    save_html(sep.join(path), "list", driver.page_source)
                    detailed_buttons = driver.find_elements(
                        By.XPATH, "//div[@class='u_controlsWrapper']/input")
                    detailed_buttons[1].click()
                    detailed_buttons[0].click()
                    save_html(sep.join(path), "list_detailed",
                              driver.page_source)
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
            path.pop()  # leave the assignment directory

        # Course-level pages: the full grade list, then the "S" filter.
        save_html(sep.join(path), path[0], driver.page_source)
        WaitClickable(driver, (By.XPATH, "//a[@value='S']")).click()
        save_html(sep.join(path), "submitted", driver.page_source)
        try:
            WaitClickable(
                driver, (By.XPATH, "//div[@id='submissionReceipts']//a")).click()
            WaitClickable(
                driver, (By.XPATH, "//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click()
        except Exception:
            print('No items?')
        save_html(sep.join(path), "receipts", driver.page_source)
        path.pop()  # leave the course directory
finally:
    driver.quit()
#!/usr/bin/env python3
from selenium.webdriver.remote.webdriver import WebDriver
from typing import cast
import requests
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
# For chrome stuff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
# ---
from urllib.parse import parse_qs, urlparse
import os
from os.path import sep
import re
import argparse
from utils.asset import RequestStack
from utils.wait import SwitchToIFrame, WaitClickable, WaitDiv
from constants.constants import BASE_URL, DL_DIR, SAVE_DIR
from utils.login import login
from utils.utils import download_file, get_assignment_name, save_html
from pathlib import Path
from selenium.common.exceptions import ElementNotInteractableException
# Test mode is switched off: the optional ``utils.test`` helper is no longer
# imported, so only the stub below exists.
testing = False


def get_etc(*_unused):
    """Stub for the optional test hook; ignores its arguments and returns False."""
    return False
# stupid bug
def click_the_fing_button(driver: WebDriver, button, max_attempts=5):
    """Click ``button`` and wait until exactly two browser windows are open.

    Selenium sometimes ignores the click, so a failed attempt resizes the
    window (which tends to "wake up" the page) and tries again.

    Args:
        driver: the Selenium driver controlling the browser.
        button: the element to click.
        max_attempts: how many click attempts to make before giving up.

    Raises:
        Exception: whatever the last attempt raised, once ``max_attempts``
            attempts have failed (previously this recursed without limit).
    """
    resized = False
    try:
        for attempt in range(1, max_attempts + 1):
            try:
                # One chain, actually performed: the old separate
                # move_to_element() chain was never executed.
                ActionChains(driver).move_to_element(
                    button).click(button).perform()
                WebDriverWait(driver, 2).until(EC.number_of_windows_to_be(2))
                return
            except Exception:
                if attempt == max_attempts:
                    raise
                # hack to wake selenium up when it doesnt want to click the button!
                driver.set_window_size(1024, 768)
                resized = True
    finally:
        # Restore the window only if one of the retries shrank it.
        if resized:
            driver.maximize_window()
# You can probably replace this with a recursive method like in blackboard
# scraper but tbh i just want to get this script done so i can stop working for
# once.
def scrape_further(driver: WebDriver, path, session):
    """Save everything linked from the information tab that is currently open.

    The information tab is expected to be the second window handle: each test
    attempt is opened as a third window and control returns to handle 1.

    Args:
        driver: Selenium driver, already switched to the information tab.
        path: directory (string) that receives the saved pages and files.
        session: ``requests`` session carrying the login cookies.
    """
    # attempts for bb-held tests
    attempt_links = driver.find_elements(
        By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]")
    attempt_urls = [link.get_attribute('href') for link in attempt_links]
    for index, attempt_url in enumerate(attempt_urls):
        if attempt_url is None:
            continue
        # Not every assessment link carries an attempt id; do not crash on those.
        attempt_id = parse_qs(urlparse(attempt_url).query).get(
            'attempt_id', ['unknown'])[0]
        name = "attempt_" + str(index) + "_[" + attempt_id + "]"
        relative_url = re.sub("^" + re.escape(BASE_URL), "", attempt_url)
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script(
            "window.open(arguments[0])", BASE_URL + relative_url)
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(3))
        driver.switch_to.window(driver.window_handles[2])
        save_html(path, name, driver, True)
        if testing:
            get_etc(driver, session, path)
        driver.close()
        driver.switch_to.window(driver.window_handles[1])

    # Comments may contain feedback links
    request_stack = RequestStack(session)
    feedback_links = driver.find_elements(
        By.XPATH, "//a[contains(@href, '/bbcswebdav')]")
    for href in [link.get_attribute('href') for link in feedback_links]:
        if (href is not None) and ("bbcswebdav" in href):
            request_stack.add_file(href, path)

    # submission file for assignment
    submission_links = driver.find_elements(
        By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]")
    for href in [link.get_attribute('href') for link in submission_links]:
        if href is not None:
            request_stack.add_file(href, path)

    get_feedback = False
    try:
        # download button causes a tab to appear quickly, download, then disappear
        # need to capture the url to get the metadata and dl to the correct location
        # cant be arsed to figure out how the pspdfkit js that executes this download works.
        SwitchToIFrame(
            driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
        SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
        get_feedback = True
    except Exception:
        print("No feedback to download")
    if get_feedback:
        dl_button = WaitClickable(
            driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
        dl_button.click()
        download_file(path)
    request_stack.download_all()
# end of scrape_further
# Selenium 4 takes the chromedriver location through a Service object.
from selenium.webdriver.chrome.service import Service

parser = argparse.ArgumentParser(description='Automated microsoft SSO login.')
# parser.add_argument("-p", "--password", help="Automatically use provided password", default="")
parser.add_argument("-u", "--username",
                    help="Automatically use provided userID", default="")

# Components of the directory currently being written to; the scraping loop
# appends and pops entries as it walks courses and assignments.
path = [SAVE_DIR]
Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)
args = parser.parse_args()

# Private copy of the stock Chrome capabilities plus performance logging.
CAPABILITIES = cast("dict[str, object]", DesiredCapabilities.CHROME.copy())
CAPABILITIES['goog:loggingPrefs'] = {
    'performance': 'ALL',
}

# Empty the scratch download folder so leftovers from an earlier run are not
# mistaken for files downloaded by this run.
for f in os.listdir(DL_DIR):
    os.remove(Path(DL_DIR).joinpath(f))

# Chrome profile preferences: send downloads to DL_DIR.
prefs = {
    "profile.default_content_settings.popups": 0,
    "download.default_directory": DL_DIR,
    "directory_upgrade": True
}
OPTIONS = Options()
OPTIONS.add_argument('--no-sandbox')
OPTIONS.add_argument('--disable-dev-shm-usage')
OPTIONS.add_experimental_option("prefs", prefs)
# OPTIONS.add_argument("--headless")
# The executable_path= and desired_capabilities= keywords are deprecated in
# Selenium 4 and removed in 4.10: capabilities now travel on the options
# object and the driver binary on the Service.
OPTIONS.set_capability('goog:loggingPrefs', CAPABILITIES['goog:loggingPrefs'])
driver = webdriver.Chrome(
    service=Service(executable_path='chromedriver.exe'),
    options=OPTIONS
)
driver.maximize_window()

cookies = login(args, driver)  # do Login.
# Copy the browser's cookies into a requests session so plain HTTP downloads
# are authenticated as well.
session = requests.Session()
for cookie in cookies:
    session.cookies.set(cookie["name"], cookie["value"])
# Everything below drives the browser. The try/finally guarantees the browser
# is shut down even when a step raises; previously an exception left it running.
try:
    # Load this page first purely to dismiss the terms-of-service warning,
    # which otherwise gets in the way of every later step.
    driver.get(BASE_URL + "/webapps/gradebook/do/student/viewCourses")
    try:
        WaitClickable(driver, (By.CLASS_NAME, "button-1")).click()
    except Exception:
        print("no tos warning - skipped")

    driver.get(
        BASE_URL + "/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades")
    save_html(sep.join(path), 'entrypoint', driver, True)
    WaitClickable(driver, (By.ID, "left_stream_mygrades"))

    # get courseIDs
    courses = driver.find_element(By.ID, "left_stream_mygrades")\
        .find_elements(By.XPATH, "//div[@role='tab']")
    course_details = []
    for i, course_results in enumerate(courses):
        ActionChains(driver).move_to_element(course_results).perform()
        course_url = course_results.get_attribute("bb:rhs")
        # NOTE(review): the XPath starts with "//", so it searches the whole
        # document rather than this tab; the i-th match is assumed to belong
        # to the i-th tab - confirm against the page markup.
        course_name = course_results.find_elements(
            By.XPATH, "//span[@class='stream_area_name']")[i].text
        course_name += " [" + \
            parse_qs(urlparse(course_url).query)['course_id'][0] + "]"
        course_details.append({
            'name': course_name,
            'url': course_url
        })

    for course in course_details:
        path.append(course['name'])  # course name
        print(course['name'])
        driver.get(BASE_URL + course['url'])
        # Make the page open grade details in a new window instead of an
        # in-page frame, so each one can be saved on its own.
        driver.execute_script(
            "\n"
            "mygrades.loadContentFrame = function(url) {\n"
            "window.open(url);\n"
            "}\n"
        )
        # Select the "A" filter. It was clicked twice in the original;
        # NOTE(review): presumably deliberate, so it is kept.
        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
        WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()

        table = driver.find_elements(
            By.XPATH, "//div[@id='grades_wrapper']/div")
        for row_number, assignment in enumerate(table):
            print(row_number)
            buttons = assignment.find_elements(By.TAG_NAME, "input")
            block = None
            assignment_name = None
            information_link = False
            try:
                block = assignment.find_element(
                    By.XPATH, "./div[@class='cell gradable']/a[@onclick]")
                information_link = True
            except Exception:
                # No clickable link in this row: fall back to the cell itself
                # so the row still gets a name.
                block = assignment.find_element(
                    By.XPATH, "./div[@class='cell gradable']")
            assignment_name = get_assignment_name(driver, block)
            path.append(assignment_name)

            # download information if it exists.
            if information_link:
                try:
                    ActionChains(driver).move_to_element(
                        block).click(block).perform()
                    print("Switched " + assignment_name)
                    WebDriverWait(driver, 10).until(
                        EC.number_of_windows_to_be(2))
                    driver.switch_to.window(driver.window_handles[1])
                    save_html(sep.join(path), "information", driver, True)
                    scrape_further(driver, sep.join(path), session)
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                except ElementNotInteractableException:
                    print('idk')

            # download rubric if it exists.
            for button in buttons:
                action = button.get_attribute("onclick")
                if action is not None and "showInLightBox" not in action:
                    click_the_fing_button(driver, button)
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight)")
                    driver.switch_to.window(driver.window_handles[1])
                    WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer"))
                    save_html(sep.join(path), "rubric", driver, True)
                    driver.find_element(
                        By.XPATH, "//li[@id='listViewTab']/a").click()
                    WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList"))
                    save_html(sep.join(path), "list", driver, True)
                    detailed_buttons = driver.find_elements(
                        By.XPATH, "//div[@class='u_controlsWrapper']/input")
                    detailed_buttons[1].click()
                    detailed_buttons[0].click()
                    save_html(sep.join(path), "list_detailed", driver, True)
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
            path.pop()  # leave the assignment directory

        # Course-level pages: the full grade list, then the "S" filter.
        save_html(sep.join(path), path[0], driver, True)
        WaitClickable(driver, (By.XPATH, "//a[@value='S']")).click()
        save_html(sep.join(path), "submitted", driver, True)
        try:
            WaitClickable(
                driver, (By.XPATH, "//div[@id='submissionReceipts']//a")).click()
            WaitClickable(
                driver, (By.XPATH, "//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click()
        except Exception:
            print('No items?')
        save_html(sep.join(path), "receipts", driver, True)
        path.pop()  # leave the course directory
finally:
    driver.quit()

BIN
requirements.txt Executable file

Binary file not shown.

8
utils/__init__.py Normal file → Executable file
View File

@ -1,4 +1,4 @@
# https://stackoverflow.com/a/49375740
# Append the directory containing this file to sys.path (rationale in the
# link above).
import os
import sys
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
# https://stackoverflow.com/a/49375740
# Append the directory containing this file to sys.path (rationale in the
# link above).
import os
import sys
sys.path.append(os.path.dirname(os.path.realpath(__file__)))

168
utils/asset.py Normal file → Executable file
View File

@ -1,84 +1,84 @@
from constants.constants import BASE_URL
import re
import hashlib
import requests
import shutil
import csv
from pathlib import Path
def convert_filename(name, hash):
    """Insert ``[hash]`` just before the last dot-separated part of ``name``.

    ``report.pdf`` becomes ``report[hash].pdf``; a name without any dot gets
    the tag appended at the end.
    """
    stem, dot, extension = name.rpartition('.')
    tag = "[" + hash + "]"
    if not dot:
        # No dot at all: rpartition leaves the whole name in ``extension``.
        return extension + tag
    return stem + tag + dot + extension
class RequestStack:
    """Collects files to fetch so they can all be downloaded in one go."""

    def __init__(self, token):
        """Start with an empty queue; ``token`` is handed to every download."""
        self.request_stack = []
        self.token = token
        super().__init__()

    def add_file(self, url, path):
        """Queue the file at ``url`` for saving under directory ``path``."""
        queued = Asset(url, path)
        self.request_stack.append(queued)

    def download_all(self):
        """Download each queued file in the order it was added."""
        for asset in self.request_stack:
            print(f"\tDownloading {asset.url}")
            asset.download(self.token)
class Asset:
    """One remote file together with the local directory it is saved into."""

    # Maximum number of chained 302 responses followed for a single file, so a
    # redirect loop cannot recurse forever.
    MAX_REDIRECTS = 10

    def __init__(self, url, path):
        """Store ``url`` without its BASE_URL prefix and create ``path`` if missing."""
        self.path = Path(path)
        self.url = re.sub("^"+re.escape(BASE_URL), "", url)
        # self.file_id = re.findall('file_id=(.+)&',url)
        self.path.mkdir(parents=True, exist_ok=True)
        super().__init__()

    @staticmethod
    def _strip_quotes(value):
        """Remove leading and trailing double quotes from a header value."""
        return re.sub(r'^"*|"*?$', '', value)

    @staticmethod
    def _pick_filename(content_disposition, url):
        """Choose a safe local file name.

        Prefers the name announced in ``content_disposition`` (extended
        ``filename*=`` form first, then quoted, then bare) and falls back to
        the last path segment of ``url``. Any directory part is dropped so a
        server-supplied name cannot escape the target directory.
        """
        from urllib.parse import unquote

        name = ""
        if content_disposition:
            extended = re.search(
                r"filename\*\s*=\s*[^';]*'[^';]*'([^;]+)",
                content_disposition, re.I)
            quoted = re.search(
                r'filename\s*=\s*"([^"]+)"', content_disposition, re.I)
            bare = re.search(
                r'filename\s*=\s*([^;"\s]+)', content_disposition, re.I)
            if extended:
                name = unquote(extended.group(1).strip().strip('"'))
            elif quoted:
                name = quoted.group(1)
            elif bare:
                name = bare.group(1)
        if not name:
            name = re.sub(".*/", "", url)
        name = re.sub(r'.*[\\/]', '', name).strip()
        return name or "download"

    def download(self, session, redirects_left=MAX_REDIRECTS):
        """Fetch the file with ``session`` and write it plus a metadata CSV.

        Args:
            session: object with a ``requests``-style ``get`` method.
            redirects_left: remaining 302 hops that may still be followed.

        Returns:
            None on success or after following a redirect; the HTTP status
            code when the request fails or redirects too many times.
        """
        from urllib.parse import urljoin

        # urljoin keeps absolute redirect targets intact instead of gluing
        # them onto BASE_URL.
        response = session.get(
            urljoin(BASE_URL+"/", self.url), stream=True, allow_redirects=False)
        try:
            status = response.status_code
            location = response.headers.get('location')
            if status == 302 and location:
                if redirects_left <= 0:
                    print("[!] Error too many redirects")
                    return status
                Asset(location, self.path).download(session, redirects_left - 1)
                return None
            if status != 200:
                print("[!] Error "+str(status))
                return status

            # Lower-cased keys keep later lookups independent of header case.
            headers = {key.lower(): self._strip_quotes(value)
                       for key, value in response.headers.items()}  # ewww regex
            # Parse the raw header: stripping quotes first used to remove the
            # closing quote the file-name pattern needs.
            self.original_filename = self._pick_filename(
                response.headers.get('Content-Disposition'), self.url)
            self.etag_hash = hashlib.md5(
                headers.get('etag', '').encode()).hexdigest()
            self.filename = convert_filename(
                self.original_filename, self.etag_hash[0:6])
            with open(self.path.joinpath(self.filename), 'wb') as f:
                shutil.copyfileobj(response.raw, f)
            self.write_metadata(headers)
            return None
        finally:
            # stream=True keeps the connection checked out until it is closed.
            response.close()

    def write_metadata(self, headers):
        """Write ``ZZZ_metadata/<filename>__metadata.csv`` describing the file.

        ``headers`` is a mapping of response headers; names are matched
        case-insensitively and missing ones are recorded as empty strings.
        """
        lookup = {str(key).lower(): value for key, value in headers.items()}
        metacsv = [
            ["original_filename", self.original_filename],
            ["readable_filename", self.filename],
            ["url", self.url],
            ["pathhash", hashlib.md5(
                self.url.encode()).hexdigest()],
            ["etag", lookup.get('etag', '')],
            ["etaghash", self.etag_hash],
            ["last-modified", lookup.get("last-modified", '')],
            ["content-length", lookup.get("content-length", '')],
            ["age", ""],
        ]
        csvpath = self.path.joinpath("ZZZ_metadata")
        csvpath.mkdir(parents=True, exist_ok=True)
        with open(csvpath.joinpath(self.filename+"__metadata.csv"), "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(metacsv)
from constants.constants import BASE_URL
import re
import hashlib
import requests
import shutil
import csv
from pathlib import Path
def convert_filename(name, hash):
    """Insert ``[hash]`` just before the last dot-separated part of ``name``.

    ``report.pdf`` becomes ``report[hash].pdf``; a name without any dot gets
    the tag appended at the end.
    """
    stem, dot, extension = name.rpartition('.')
    tag = "[" + hash + "]"
    if not dot:
        # No dot at all: rpartition leaves the whole name in ``extension``.
        return extension + tag
    return stem + tag + dot + extension
class RequestStack:
    """Collects files to fetch so they can all be downloaded in one go."""

    def __init__(self, token):
        """Start with an empty queue; ``token`` is handed to every download."""
        self.request_stack = []
        self.token = token
        super().__init__()

    def add_file(self, url, path):
        """Queue the file at ``url`` for saving under directory ``path``."""
        queued = Asset(url, path)
        self.request_stack.append(queued)

    def download_all(self):
        """Download each queued file in the order it was added."""
        for asset in self.request_stack:
            print(f"\tDownloading {asset.url}")
            asset.download(self.token)
class Asset:
    """One remote file together with the local directory it is saved into."""

    # Maximum number of chained 302 responses followed for a single file, so a
    # redirect loop cannot recurse forever.
    MAX_REDIRECTS = 10

    def __init__(self, url, path):
        """Store ``url`` without its BASE_URL prefix and create ``path`` if missing."""
        self.path = Path(path)
        self.url = re.sub("^" + re.escape(BASE_URL), "", url)
        # self.file_id = re.findall('file_id=(.+)&',url)
        self.path.mkdir(parents=True, exist_ok=True)
        super().__init__()

    @staticmethod
    def _strip_quotes(value):
        """Remove leading and trailing double quotes from a header value."""
        return re.sub(r'^"*|"*?$', '', value)

    @staticmethod
    def _pick_filename(content_disposition, url):
        """Choose a safe local file name.

        Prefers the name announced in ``content_disposition`` (extended
        ``filename*=`` form first, then quoted, then bare) and falls back to
        the last path segment of ``url``. Any directory part is dropped so a
        server-supplied name cannot escape the target directory.
        """
        from urllib.parse import unquote

        name = ""
        if content_disposition:
            extended = re.search(
                r"filename\*\s*=\s*[^';]*'[^';]*'([^;]+)",
                content_disposition, re.I)
            quoted = re.search(
                r'filename\s*=\s*"([^"]+)"', content_disposition, re.I)
            bare = re.search(
                r'filename\s*=\s*([^;"\s]+)', content_disposition, re.I)
            if extended:
                name = unquote(extended.group(1).strip().strip('"'))
            elif quoted:
                name = quoted.group(1)
            elif bare:
                name = bare.group(1)
        if not name:
            name = re.sub(".*/", "", url)
        name = re.sub(r'.*[\\/]', '', name).strip()
        return name or "download"

    def download(self, session, redirects_left=MAX_REDIRECTS):
        """Fetch the file with ``session`` and write it plus a metadata CSV.

        Args:
            session: object with a ``requests``-style ``get`` method.
            redirects_left: remaining 302 hops that may still be followed.

        Returns:
            None on success or after following a redirect; the HTTP status
            code when the request fails or redirects too many times.
        """
        from urllib.parse import urljoin

        # urljoin keeps absolute redirect targets intact instead of gluing
        # them onto BASE_URL.
        response = session.get(
            urljoin(BASE_URL + "/", self.url), stream=True, allow_redirects=False)
        try:
            status = response.status_code
            location = response.headers.get('location')
            if status == 302 and location:
                if redirects_left <= 0:
                    print("[!] Error too many redirects")
                    return status
                Asset(location, self.path).download(session, redirects_left - 1)
                return None
            if status != 200:
                print("[!] Error " + str(status))
                return status

            # Lower-cased keys keep later lookups independent of header case.
            headers = {key.lower(): self._strip_quotes(value)
                       for key, value in response.headers.items()}  # ewww regex
            # Parse the raw header: stripping quotes first used to remove the
            # closing quote the file-name pattern needs.
            self.original_filename = self._pick_filename(
                response.headers.get('Content-Disposition'), self.url)
            self.etag_hash = hashlib.md5(
                headers.get('etag', '').encode()).hexdigest()
            self.filename = convert_filename(
                self.original_filename, self.etag_hash[0:6])
            with open(self.path.joinpath(self.filename), 'wb') as f:
                shutil.copyfileobj(response.raw, f)
            self.write_metadata(headers)
            return None
        finally:
            # stream=True keeps the connection checked out until it is closed.
            response.close()

    def write_metadata(self, headers):
        """Write ``ZZZ_metadata/<filename>__metadata.csv`` describing the file.

        ``headers`` is a mapping of response headers; names are matched
        case-insensitively and missing ones are recorded as empty strings.
        """
        lookup = {str(key).lower(): value for key, value in headers.items()}
        metacsv = [
            ["original_filename", self.original_filename],
            ["readable_filename", self.filename],
            ["url", self.url],
            ["pathhash", hashlib.md5(
                self.url.encode()).hexdigest()],
            ["etag", lookup.get('etag', '')],
            ["etaghash", self.etag_hash],
            ["last-modified", lookup.get("last-modified", '')],
            ["content-length", lookup.get("content-length", '')],
            ["age", ""],
        ]
        csvpath = self.path.joinpath("ZZZ_metadata")
        csvpath.mkdir(parents=True, exist_ok=True)
        with open(csvpath.joinpath(self.filename + "__metadata.csv"), "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(metacsv)

92
utils/login.py Normal file → Executable file
View File

@ -1,46 +1,46 @@
import sys
from utils.wait import WaitClickable
from utils.selectors import Selectors
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlparse
from selenium.webdriver.support import expected_conditions as EC
from getpass import getpass
from constants.constants import BASE_URL
import re
import json
def login(args, driver):
    """Sign in through the Microsoft SSO form and return the browser cookies.

    Args:
        args: parsed command-line arguments; ``args.username`` may be empty,
            in which case the user ID is read from standard input.
        driver: Selenium driver used to fill in the login form.

    Returns:
        The non-empty list of cookies reported by ``driver.get_cookies()``.

    Exits the process with status 2 when the password step fails and with
    status 1 when no cookies are available after logging in; the browser is
    closed first in both cases.
    """
    driver.get(BASE_URL)
    username = args.username
    if len(username) == 0:
        print('UserID: ')
        username = input()
    # Add the student domain only to a bare ID, so both "12345678" and a full
    # address work, whether typed or passed with -u.
    if '@' not in username:
        username += '@student.uwa.edu.au'
    print('Password: ')
    password = getpass('')

    WaitClickable(driver, Selectors.BOX_USERNAME).send_keys(username)
    WaitClickable(driver, Selectors.BUTTON_NEXT).click()
    print('Entered username.')

    try:
        WaitClickable(driver, Selectors.BOX_PASSWORD).send_keys(password)
        WaitClickable(driver, Selectors.BUTTON_NEXT).click()
        print('Entered password.')
    except Exception:
        # Show the page's own error text when there is one. The lookup may
        # itself time out, and the browser must still be closed afterwards.
        try:
            print(WebDriverWait(driver, 1).until(
                EC.visibility_of_element_located(Selectors.DIV_USERERROR)).text)
        except Exception:
            print('Could not enter password.', file=sys.stderr)
        driver.quit()
        sys.exit(2)

    WaitClickable(driver, Selectors.BUTTON_DENY).click()
    # WaitClickable(driver,BUTTON_NEXT).click() #IF you want to remember credentials, switch these comments

    # get_cookies() returns a list, so test for emptiness: the old
    # "is not None" check could never fail.
    cookie = driver.get_cookies()
    if cookie:
        return cookie

    print('Could not get auth cookie - Invalid ID or password?', file=sys.stderr)
    driver.quit()
    sys.exit(1)
import sys
from utils.wait import WaitClickable
from utils.selectors import Selectors
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlparse
from selenium.webdriver.support import expected_conditions as EC
from getpass import getpass
from constants.constants import BASE_URL
import re
import json
def login(args, driver):
    """Sign in through the Microsoft SSO form and return the browser cookies.

    Args:
        args: parsed command-line arguments; ``args.username`` may be empty,
            in which case the user ID is read from standard input.
        driver: Selenium driver used to fill in the login form.

    Returns:
        The non-empty list of cookies reported by ``driver.get_cookies()``.

    Exits the process with status 2 when the password step fails and with
    status 1 when no cookies are available after logging in; the browser is
    closed first in both cases.
    """
    driver.get(BASE_URL)
    username = args.username
    if len(username) == 0:
        print('UserID: ')
        username = input()
    # Add the student domain only to a bare ID, so both "12345678" and a full
    # address work, whether typed or passed with -u.
    if '@' not in username:
        username += '@student.uwa.edu.au'
    print('Password: ')
    password = getpass('')

    WaitClickable(driver, Selectors.BOX_USERNAME).send_keys(username)
    WaitClickable(driver, Selectors.BUTTON_NEXT).click()
    print('Entered username.')

    try:
        WaitClickable(driver, Selectors.BOX_PASSWORD).send_keys(password)
        WaitClickable(driver, Selectors.BUTTON_NEXT).click()
        print('Entered password.')
    except Exception:
        # Show the page's own error text when there is one. The lookup may
        # itself time out, and the browser must still be closed afterwards.
        try:
            print(WebDriverWait(driver, 1).until(
                EC.visibility_of_element_located(Selectors.DIV_USERERROR)).text)
        except Exception:
            print('Could not enter password.', file=sys.stderr)
        driver.quit()
        sys.exit(2)

    WaitClickable(driver, Selectors.BUTTON_DENY).click()
    # WaitClickable(driver,BUTTON_NEXT).click() #IF you want to remember credentials, switch these comments

    # get_cookies() returns a list, so test for emptiness: the old
    # "is not None" check could never fail.
    cookie = driver.get_cookies()
    if cookie:
        return cookie

    print('Could not get auth cookie - Invalid ID or password?', file=sys.stderr)
    driver.quit()
    sys.exit(1)

22
utils/selectors.py Normal file → Executable file
View File

@ -1,11 +1,11 @@
from selenium.webdriver.common.by import By
class Selectors:
    """Selenium locator tuples ``(By strategy, value)`` shared across the scraper."""
    # Microsoft login
    # Field that receives the account name.
    BOX_USERNAME = (By.ID, "i0116")
    # Field that receives the password.
    BOX_PASSWORD = (By.ID, "i0118")
    # Element whose text is printed when the password step fails.
    DIV_USERERROR = (By.ID, 'usernameError')
    # Button clicked after the username and again after the password.
    BUTTON_NEXT = (By.ID, "idSIButton9")
    # Button clicked once signed in; presumably declines the "remember
    # credentials" prompt - confirm against the login page.
    BUTTON_DENY = (By.ID, "idBtn_Back")
    # Selectors for grades
from selenium.webdriver.common.by import By
class Selectors:
    """Selenium locator tuples ``(By strategy, value)`` shared across the scraper."""
    # Microsoft login
    # Field that receives the account name.
    BOX_USERNAME = (By.ID, "i0116")
    # Field that receives the password.
    BOX_PASSWORD = (By.ID, "i0118")
    # Element whose text is printed when the password step fails.
    DIV_USERERROR = (By.ID, 'usernameError')
    # Button clicked after the username and again after the password.
    BUTTON_NEXT = (By.ID, "idSIButton9")
    # Button clicked once signed in; presumably declines the "remember
    # credentials" prompt - confirm against the login page.
    BUTTON_DENY = (By.ID, "idBtn_Back")
    # Selectors for grades

153
utils/utils.py Normal file → Executable file
View File

@ -1,74 +1,79 @@
import pathlib
import re
from constants.constants import DL_DIR
from selenium.webdriver.support import expected_conditions as EC
import time
import os
from pathlib import Path
import shutil
def friendly_filename(name):
    """Return ``name`` cleaned by ``friendly_dirname`` with all slashes removed."""
    name = friendly_dirname(name)
    # Raw string: the old "[\\\/]" literal only worked through the invalid
    # escape "\/", which newer Python versions warn about.
    return re.sub(r"[\\/]", '', name)
def friendly_dirname(name):
    """Return ``name`` with characters that are awkward in paths removed.

    Control characters and ``: < > " | ? *`` are dropped, runs of whitespace
    next to a word are reduced to a single space, and the result is stripped.
    Path separators are left untouched.
    """
    # .gsub(/[^\w\s_-]+/, '')
    # .gsub(/\s+/, '_')
    # pipeline:
    name = re.sub(r"[\x00-\x1f]", '', name)
    name = re.sub(r'[:<>"|?*]', '', name)
    # Raw strings matter here: in a normal string "\b" is a backspace
    # character, so this substitution used to match nothing useful.
    name = re.sub(r"(^|\b\s)\s+($|\s?\b)", r'\1\2', name)
    return name.strip()
def get_assignment_name(driver, block):
    """Return a file-system-friendly name built from ``block``'s own text.

    The name is also printed to standard output.
    """
    own_text = get_text_excluding_children(driver, block)
    cleaned = friendly_filename(own_text)
    print("Assesment: "+cleaned)
    return cleaned
def save_html(dir, filename, page_source):
    """Write ``page_source`` to ``<dir>/<filename>.html`` as UTF-8.

    Both path parts are sanitised first and the directory is created if it
    does not exist yet.
    """
    target_dir = pathlib.Path(friendly_dirname(dir))
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir.joinpath(friendly_filename(filename)+".html")
    target.write_text(page_source, encoding="utf-8")
# NOTE: Switching to a "download" tab causes issues so we must use the in built
# download in Chrome, which does not have etag or metadata information.
# Files are using annotate-au.foundations.blackboard.com and not bbcswebdav system
# so the tag may not exist in the first place.
def download_file(dest):
    """Move finished files from the download folder into ``dest``.

    Each file is renamed with a ``MARKED__`` prefix. While a ``.crdownload``
    file is present the function sleeps (1.5x longer each time) and rescans;
    it returns once the download folder is empty.
    """
    download_dir = Path(DL_DIR)
    time.sleep(2)
    poll = 1.0
    while True:
        for entry in os.listdir(download_dir):
            if Path(entry).suffix == '.crdownload':
                # Still downloading: back off, then rescan from the top.
                time.sleep(poll)
                poll *= 1.5
                break
            target = Path(dest).joinpath("MARKED__"+entry)
            try:
                shutil.move(download_dir.joinpath(entry), target)
            except shutil.SameFileError:
                os.remove(target)
                shutil.move(download_dir.joinpath(entry), target)
        if not os.listdir(download_dir):
            return
# https://stackoverflow.com/a/19040341
def get_text_excluding_children(driver, element):
    """Return the text held directly by ``element``, ignoring child elements.

    The work is done in the browser by a jQuery snippet that keeps only the
    element's text nodes.
    """
    script = (
        "\n"
        "return jQuery(arguments[0]).contents().filter(function() {\n"
        "return this.nodeType == Node.TEXT_NODE;\n"
        "}).text();\n"
    )
    return driver.execute_script(script, element)
from selenium.webdriver.remote.webdriver import WebDriver
import pathlib
import re
from typing import Union
from constants.constants import DL_DIR, URL_LIST
from selenium.webdriver.support import expected_conditions as EC
import time
import os
from pathlib import Path
import shutil
def friendly_filename(name):
    """Return ``name`` cleaned by ``friendly_dirname`` with all slashes removed."""
    name = friendly_dirname(name)
    # Raw string: the old "[\\\/]" literal only worked through the invalid
    # escape "\/", which newer Python versions warn about.
    return re.sub(r"[\\/]", '', name)
def friendly_dirname(name):
    """Return ``name`` with characters that are awkward in paths removed.

    Control characters and ``: < > " | ? *`` are dropped, runs of whitespace
    next to a word are reduced to a single space, and the result is stripped.
    Path separators are left untouched.
    """
    # .gsub(/[^\w\s_-]+/, '')
    # .gsub(/\s+/, '_')
    # pipeline:
    name = re.sub(r"[\x00-\x1f]", '', name)
    name = re.sub(r'[:<>"|?*]', '', name)
    # Raw strings matter here: in a normal string "\b" is a backspace
    # character, so this substitution used to match nothing useful.
    name = re.sub(r"(^|\b\s)\s+($|\s?\b)", r'\1\2', name)
    return name.strip()
def get_assignment_name(driver: WebDriver, block):
    """Return a file-system-friendly name built from ``block``'s own text.

    The name is also printed to standard output.
    """
    own_text = get_text_excluding_children(driver, block)
    cleaned = friendly_filename(own_text)
    print("Assesment: " + cleaned)
    return cleaned
def save_html(dir, filename, driver: WebDriver, page_log_file=False):
    """Write the driver's current page to ``<dir>/<filename>.html`` as UTF-8.

    When ``page_log_file`` is true the page's URL is first appended to the
    ``URL_LIST`` file. Both path parts are sanitised and the directory is
    created if it does not exist yet.
    """
    if page_log_file:
        with open(URL_LIST, "a", encoding="utf-8") as url_log:
            url_log.write(driver.current_url + "\n")

    target_dir = pathlib.Path(friendly_dirname(dir))
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir.joinpath(friendly_filename(filename) + ".html")
    with open(target, "w", encoding="utf-8") as html_file:
        html_file.write(driver.page_source)
# NOTE: Switching to a "download" tab causes issues so we must use the in built
# download in Chrome, which does not have etag or metadata information.
# Files are using annotate-au.foundations.blackboard.com and not bbcswebdav system
# so the tag may not exist in the first place.
def download_file(dest):
    """Move finished files from the download folder into ``dest``.

    Each file is renamed with a ``MARKED__`` prefix. While a ``.crdownload``
    file is present the function sleeps (1.5x longer each time) and rescans;
    it returns once the download folder is empty.
    """
    download_dir = Path(DL_DIR)
    time.sleep(2)
    poll = 1.0
    while True:
        for entry in os.listdir(download_dir):
            if Path(entry).suffix == '.crdownload':
                # Still downloading: back off, then rescan from the top.
                time.sleep(poll)
                poll *= 1.5
                break
            target = Path(dest).joinpath("MARKED__" + entry)
            try:
                shutil.move(str(download_dir.joinpath(entry)), target)
            except shutil.SameFileError:
                os.remove(target)
                shutil.move(str(download_dir.joinpath(entry)), target)
        if not os.listdir(download_dir):
            return
# https://stackoverflow.com/a/19040341
def get_text_excluding_children(driver, element):
    """Return the text held directly by ``element``, ignoring child elements.

    The work is done in the browser by a jQuery snippet that keeps only the
    element's text nodes.
    """
    script = (
        "\n"
        "return jQuery(arguments[0]).contents().filter(function() {\n"
        "return this.nodeType == Node.TEXT_NODE;\n"
        "}).text();\n"
    )
    return driver.execute_script(script, element)

31
utils/wait.py Normal file → Executable file
View File

@ -1,16 +1,15 @@
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
timeout = 5
# find_element_safe = lambda name,timeout=30:WebDriverWait(driver, timeout).until(lambda x: x.find_element(By.ID, name))
def WaitClickable(driver, locator):
    """Wait up to ``timeout`` seconds for ``locator`` to become clickable.

    Returns whatever the wait yields for that condition.
    """
    waiter = WebDriverWait(driver, timeout)
    condition = EC.element_to_be_clickable(locator)
    return waiter.until(condition)
def WaitDiv(driver, locator):
    """Wait up to ``timeout`` seconds for ``locator`` to be present in the page.

    Returns whatever the wait yields for that condition.
    """
    waiter = WebDriverWait(driver, timeout)
    condition = EC.presence_of_element_located(locator)
    return waiter.until(condition)
def SwitchToIFrame(driver, locator):
    """Wait up to ``timeout`` seconds for the frame at ``locator`` and switch to it.

    Returns whatever the wait yields for that condition.
    """
    waiter = WebDriverWait(driver, timeout)
    condition = EC.frame_to_be_available_and_switch_to_it(locator)
    return waiter.until(condition)
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
timeout = 5
def WaitClickable(driver, locator):
    """Wait up to ``timeout`` seconds for ``locator`` to become clickable.

    Returns whatever the wait yields for that condition.
    """
    waiter = WebDriverWait(driver, timeout)
    condition = EC.element_to_be_clickable(locator)
    return waiter.until(condition)
def WaitDiv(driver, locator):
    """Wait up to ``timeout`` seconds for ``locator`` to be present in the page.

    Returns whatever the wait yields for that condition.
    """
    waiter = WebDriverWait(driver, timeout)
    condition = EC.presence_of_element_located(locator)
    return waiter.until(condition)
def SwitchToIFrame(driver, locator):
    """Wait up to ``timeout`` seconds for the frame at ``locator`` and switch to it.

    Returns whatever the wait yields for that condition.
    """
    waiter = WebDriverWait(driver, timeout)
    condition = EC.frame_to_be_available_and_switch_to_it(locator)
    return waiter.until(condition)