crappy spaghetti code - initial release

2024-11-30 11:40:16 +08:00 · 2021-06-18 03:01:14 +08:00 · 2021-06-18 03:01:14 +08:00 · a4352dfd3e
commit a4352dfd3e
10 changed files with 398 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+grades/
+__pycache__
+chromedriver*
+test*
--- a/README.md
+++ b/README.md
@ -0,0 +1,16 @@
+## Blackboard marks downloader (UWA)
+---
+**Dependencies**:
+- python
+- selenium
+- chromedriver, placed relative to this directory
+
+Run the script with `py main.py` and enter your student number and password. I'm not taking your personal details, but *don't take my word for it* - always check the source if you don't trust it!
+
+---
+
+Made this script to download my marks, receipts and all the stuff I uploaded for my first semester. It's a fucking mess of spaghetti Python code because to be honest I really just wanted to get this out of the way and have some time for other stuff after the first round of exams. It's a mess of code, with some bits (the login) being picked from the scraper script and some of the scraper asset objects being translated from Ruby to Python here (in a quick and incomplete way). This will probably break in some way when the UI is overhauled for next semester :/
+
+There is no bulk marks download feature in the current lms, even though it seems other blackboard installations can give students this bulk download ability. It relies on a lot of js crap so I ended up using selenium all the way through. Doesn't download styles to save space, you'll have to download the css and js yourself and it has to be absolute because the script makes no effort to make the links relative.
+
+This one was made for UWA but you may be able to tweak it for your institution (see constants.py).
--- a/constants/constants.py
+++ b/constants/constants.py
@ -0,0 +1 @@
+BASE_URL = "https://lms.uwa.edu.au" # Include protocol.
--- a/main.py
+++ b/main.py
@ -0,0 +1,192 @@
+
+from selenium import webdriver
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.action_chains import ActionChains
+# For chrome stuff
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from selenium.webdriver.chrome.options import Options
+# ---
+from urllib.parse import parse_qs, urlparse
+import os
+import requests
+import time
+import getpass
+import json
+import re
+import sys
+import argparse
+import pathlib
+
+import utils.selectors
+from utils.asset import Asset, RequestStack
+from utils.wait import SwitchToIFrame, WaitClickable, WaitDiv
+from constants.constants import BASE_URL
+from utils.login import login
+from utils.selectors import Selectors
+from utils.utils import friendly_filename, get_assignment_name, get_text_excluding_children, save_html
+import code
+from random import randint
+
+# Optional developer-only hook. The ".gitignore" in this commit ignores
+# "test*", so utils/test.py is normally absent and the import fails.
+try:
+    from utils.test import get_etc
+except ImportError:
+    # Only report testing mode when the hook really imported. Previously the
+    # flag was set before the import and therefore stayed True on failure.
+    testing = False
+
+    def get_etc(*args):
+        """No-op stand-in used when utils.test is unavailable; returns False."""
+        return False
+else:
+    testing = True
+
+# Request headers ({'Cookie': ...}) used for authenticated downloads.
+# Filled in after login further down in this script.
+cookie = None
+
+def click_the_fing_button(driver, button, retries=5):
+    """Click ``button`` and wait until exactly two browser windows are open.
+
+    Args:
+        driver: Selenium WebDriver controlling the browser.
+        button: element whose click is expected to open a second window.
+        retries: how many extra attempts to make after the first failure.
+
+    Raises:
+        Exception: the error of the last attempt, once all attempts failed.
+            (The previous version recursed without limit instead.)
+    """
+    # https://stackoverflow.com/a/67414801 stupid bug
+    resized = False
+    try:
+        for attempt in range(retries + 1):
+            try:
+                ActionChains(driver).move_to_element(button).click(button).perform()
+                WebDriverWait(driver, 2).until(EC.number_of_windows_to_be(2))
+                return
+            except Exception:
+                if attempt == retries:
+                    raise
+                # hack to wake selenium up when it doesnt want to click the button!
+                driver.set_window_size(1024, 768)
+                resized = True
+    finally:
+        # Restore the window only if the resize hack was actually used.
+        if resized:
+            driver.maximize_window()
+
+# You can probably replace this with a recursive method like in blackboard scraper but tbh i just want to get this script done so i can stop working for once.
+def scrape_further(driver,path):
+    """Save the attempt pages and submitted files linked from the current page.
+
+    Expects the page being scraped to be ``driver.window_handles[1]``: every
+    attempt is opened as a third window, saved, closed, and control is then
+    returned to window index 1.
+
+    Args:
+        driver: Selenium WebDriver currently focused on the page to scrape.
+        path: directory (as a string) in which HTML and files are stored.
+    """
+    # attempts for bb-held tests
+    attempt_links = driver.find_elements_by_xpath("//a[starts-with(@href, '/webapps/assessment')]")
+    # Read every href up front; the elements are not touched again once
+    # other windows have been opened.
+    attempt_urls = [link.get_attribute('href') for link in attempt_links]
+    for i, attempt in enumerate(attempt_urls):
+        attempt_id = parse_qs(urlparse(attempt).query)['attempt_id'][0]
+        name = "attempt_"+str(i)+"_["+attempt_id+"]"
+        # Normalise to an absolute URL. BASE_URL is escaped so that its dots
+        # are matched literally instead of as regex wildcards.
+        attempt = BASE_URL + re.sub("^"+re.escape(BASE_URL), "", attempt)
+        # Hand the URL over as a script argument rather than splicing it into
+        # JavaScript source, so quotes in the URL cannot break the script.
+        driver.execute_script("window.open(arguments[0])", attempt)
+        WebDriverWait(driver,10).until(EC.number_of_windows_to_be(3))
+        driver.switch_to.window(driver.window_handles[2])
+        save_html(path, name, driver.page_source)
+        if testing:
+            get_etc(driver, cookie, path)
+        driver.close()
+        driver.switch_to.window(driver.window_handles[1])
+
+    # submission file for assignment
+    request_stack = RequestStack(cookie)
+    download_links = driver.find_elements_by_xpath("//a[starts-with(@href, '/webapps/assignment/download')]")
+    for download_url in [link.get_attribute('href') for link in download_links]:
+        request_stack.add_file(download_url, path)
+    request_stack.download_all()
+
+# Command-line interface: only the user ID can be supplied; the password is
+# always requested interactively inside login().
+parser = argparse.ArgumentParser(description='Automated microsoft SSO login.')
+# parser.add_argument("-p", "--password", help="Automatically use provided password", default="")
+parser.add_argument("-u", "--username", help="Automatically use provided userID", default="")
+
+args = parser.parse_args()
+
+# Turn on Chrome's "performance" log; utils.login.try_cookie() reads that log
+# via driver.get_log('performance') to find the session cookie.
+CAPABILITIES = DesiredCapabilities.CHROME
+CAPABILITIES['goog:loggingPrefs'] = {'performance': 'ALL'}
+OPTIONS = Options()
+# OPTIONS.add_argument("--headless")
+# 'chromedriver' is passed as the executable path (the README asks for it to
+# be placed relative to this directory).
+driver = webdriver.Chrome(
+                            executable_path='chromedriver',
+                            desired_capabilities=CAPABILITIES,
+                            options=OPTIONS
+                        )
+driver.maximize_window()
+
+# Headers reused by RequestStack/Asset for direct file downloads.
+cookie = {'Cookie': login(args, driver)} # do Login.
+
+# Open the student "My Grades" course overview.
+driver.get(BASE_URL+"/webapps/gradebook/do/student/viewCourses")
+
+# Click the element with class "button-1" if it becomes clickable; the message
+# below suggests this is a terms-of-service prompt that is not always shown.
+# NOTE(review): the bare except also hides unrelated failures - confirm intent.
+try:
+    WaitClickable(driver,(By.CLASS_NAME, "button-1")).click()
+except:
+    print("no tos warning - skipped")
+# The course list lives inside the 'mybbCanvas' iframe.
+SwitchToIFrame(driver, (By.ID, 'mybbCanvas'))
+
+# get courseIDs
+# NOTE(review): an XPath starting with "//" is presumably evaluated against
+# the whole document rather than the element it is called on - verify.
+courses = driver.find_element_by_id("left_stream_mygrades")\
+                .find_elements_by_xpath("//div[@role='tab']")
+
+# Build a list of {'name': "<course name> [<course_id>]", 'url': <bb:rhs value>}.
+course_details = []
+for i, course_results in enumerate(courses):
+    # Redundant: enumerate() already bound course_results to courses[i].
+    course_results = courses[i]
+    ActionChains(driver).move_to_element(course_results).perform()
+    # The "bb:rhs" attribute holds the URL later appended to BASE_URL.
+    course_url = course_results.get_attribute("bb:rhs")
+    # The i-th matching span is used as the name of the i-th course tab.
+    course_name = course_results.find_elements_by_xpath("//span[@class='stream_area_name']")[i].text
+    # Append the course_id query parameter so that names stay unique.
+    course_name += " ["+parse_qs(urlparse(course_url).query)['course_id'][0]+"]"
+    course_details.append({
+        'name': course_name,
+        'url' : course_url
+    })
+
+# `path` is used as a stack of directory components:
+# ['grades', <course name>, <assignment name>]. Components are appended when
+# a level is entered and popped when it is left.
+path = ['grades']
+for i, course in enumerate(course_details):
+    path.append(course['name']) # course name
+    print(course['name'])
+    driver.get(BASE_URL+course['url'])
+
+    # Replace the page's own loader so that detail links open in a new
+    # window, which the code below then switches to.
+    driver.execute_script("""
+    mygrades.loadContentFrame = function(url) {
+        window.open(url);
+    }
+    """)
+
+    # presumably the "All" filter of the grades list - TODO confirm
+    WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click()
+
+    # One child <div> of #grades_wrapper per graded item.
+    table = driver.find_elements_by_xpath("//div[@id='grades_wrapper']/div")
+
+    # Overview page of the course; path[0] is 'grades', so the file is
+    # written as "grades.html" inside the course directory.
+    save_html("/".join(path), path[0], driver.page_source)
+
+    # NOTE: this loop reuses the name `i` of the enclosing course loop.
+    for i, assignment in enumerate(table):
+        print(i)
+        buttons = assignment.find_elements_by_tag_name("input")
+        block = None
+        assignment_name = None
+        information_link = False
+        # Prefer the clickable link inside the gradable cell; fall back to the
+        # cell itself when the item has no link with an onclick handler.
+        try:
+            block = assignment.find_element_by_xpath("./div[@class='cell gradable']/a[@onclick]")
+            information_link = True
+        except:
+            block = assignment.find_element_by_xpath("./div[@class='cell gradable']")
+        assignment_name = get_assignment_name(driver,block)
+        path.append(assignment_name)
+        # download information if it exists.
+        if information_link:
+            ActionChains(driver).move_to_element(block).click(block).perform()
+            print("Switched "+assignment_name)
+            # The click opens a second window (see the loader override above).
+            WebDriverWait(driver,10).until(EC.number_of_windows_to_be(2))
+            driver.switch_to.window(driver.window_handles[1])
+            save_html("/".join(path),"information",driver.page_source)
+            scrape_further(driver, "/".join(path))
+            driver.close()
+            driver.switch_to.window(driver.window_handles[0])
+        # download rubric if it exists.
+        for button in buttons:
+            action = button.get_attribute("onclick")
+            # Only inputs with an onclick handler that does not mention
+            # "showInLightBox" are treated as rubric buttons.
+            if action != None and "showInLightBox" not in action:
+                click_the_fing_button(driver,button)
+                driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
+                driver.switch_to.window(driver.window_handles[1])
+                # Saved three times: initial view, list view, and the list
+                # view after toggling both controls.
+                WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer"))
+                save_html("/".join(path),"rubric",driver.page_source)
+                driver.find_element_by_xpath("//li[@id='listViewTab']/a").click()
+                WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList"))
+                save_html("/".join(path),"list",driver.page_source)
+                detailed_buttons = driver.find_elements_by_xpath("//div[@class='u_controlsWrapper']/input")
+                # presumably these two inputs expand the detailed descriptions
+                # and feedback - TODO confirm against the rubric page
+                detailed_buttons[1].click()
+                detailed_buttons[0].click()
+                save_html("/".join(path),"list_detailed",driver.page_source)
+                driver.close()
+                driver.switch_to.window(driver.window_handles[0])
+        # Leave the assignment directory.
+        path.pop()
+    # presumably the "Submitted" filter; the page is saved as "submitted"
+    WaitClickable(driver,(By.XPATH,"//a[@value='S']")).click()
+    save_html("/".join(path),"submitted",driver.page_source)
+    # Best effort: follow the link in #submissionReceipts and then a
+    # "pagelink" in the item-count area. Any failure is only reported, and
+    # whatever page is showing afterwards is saved as "receipts".
+    try:
+        WaitClickable(driver,(By.XPATH,"//div[@id='submissionReceipts']//a")).click()
+        WaitClickable(driver,(By.XPATH,"//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click()
+    except:
+        print('No items?')
+    save_html("/".join(path),"receipts",driver.page_source)
+    # Leave the course directory.
+    path.pop()
+
+
+driver.quit()
--- a/utils/init.py
+++ b/utils/init.py
@ -0,0 +1,3 @@
+# https://stackoverflow.com/a/49375740
+# Append the directory that contains this file to sys.path so that modules
+# located next to it can be imported by their bare name.
+import os, sys
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
--- a/utils/asset.py
+++ b/utils/asset.py
@ -0,0 +1,76 @@
+import wget
+from constants.constants import BASE_URL
+import re
+import hashlib
+import requests
+import shutil
+import csv
+from pathlib import Path
+
+def convert_filename(name, hash):
+    """Return ``name`` with ``[hash]`` inserted before its last extension.
+
+    ``"report.pdf"`` becomes ``"report[hash].pdf"``. A name that contains no
+    dot simply gets the tag appended.
+    """
+    tag = "[" + hash + "]"
+    stem, dot, extension = name.rpartition('.')
+    if not dot:
+        # No extension present: tag the bare name.
+        return name + tag
+    return stem + tag + dot + extension
+
+class RequestStack:
+    """Collects Asset objects and downloads them with one shared set of headers."""
+
+    def __init__(self, token):
+        super().__init__()
+        # Headers handed to every Asset.download() call.
+        self.token = token
+        # Pending assets, kept in the order they were added.
+        self.request_stack = []
+
+    def add_file(self, url, path):
+        """Queue the file at ``url`` for download into directory ``path``."""
+        pending = Asset(url, path)
+        self.request_stack.append(pending)
+
+    def download_all(self):
+        """Download every queued asset in insertion order.
+
+        The queue is not emptied afterwards.
+        """
+        for pending in self.request_stack:
+            pending.download(self.token)
+
+class Asset:
+    """One remote file that is downloaded into a local directory.
+
+    Attributes set by ``__init__``:
+        path: ``pathlib.Path`` of the target directory (created if missing).
+        url: the resource URL with a leading ``BASE_URL`` removed.
+
+    Attributes set by a successful ``download``:
+        original_filename: name taken from the response (or the URL).
+        etag_hash: MD5 hex digest of the response's ETag value.
+        filename: ``original_filename`` tagged with the first six characters
+            of ``etag_hash``.
+    """
+
+    def __init__(self,url,path):
+        self.path = Path(path)
+        # BASE_URL is escaped so that its dots only match literal dots.
+        self.url = re.sub("^"+re.escape(BASE_URL),"",url)
+        # self.file_id = re.findall('file_id=(.+)&',url)
+        self.path.mkdir(parents=True, exist_ok=True)
+        super().__init__()
+
+    @staticmethod
+    def _filename_from_headers(headers, url):
+        """Choose a local file name from Content-Disposition, else from ``url``.
+
+        Only the final path component is kept, because the name is supplied
+        by the server and must not be able to point outside the target
+        directory.
+        """
+        disposition = headers.get('Content-Disposition') or ''
+        match = (
+            re.search(r'filename\s*=\s*"([^"]*)"', disposition)
+            or re.search(r'filename\s*=\s*([^;]+)', disposition)
+        )
+        name = match.group(1) if match else re.sub(".*/", "", url)
+        name = re.sub(r'.*[\\/]', '', name).strip()
+        if name in ('', '.', '..'):
+            return "download"
+        return name
+
+    def download(self,req_headers,max_redirects=10):
+        """Fetch the asset, store it under ``self.path`` and write its metadata.
+
+        Args:
+            req_headers: headers sent with the request (e.g. the Cookie).
+            max_redirects: how many 302 responses are followed before giving up.
+
+        Returns:
+            None on success, otherwise the HTTP status code that stopped the
+            download.
+        """
+        # The context manager releases the streamed connection on every path.
+        with requests.get(BASE_URL+self.url, stream=True, headers=req_headers, allow_redirects=False) as response:
+            headers = response.headers
+            # Redirects are followed by hand (allow_redirects=False above).
+            if response.status_code == 302 and headers.get('location'):
+                if max_redirects <= 0:
+                    print("Error "+str(response.status_code))
+                    return response.status_code
+                return Asset(headers['location'], self.path).download(req_headers, max_redirects - 1)
+            if response.status_code != 200:
+                print("Error "+str(response.status_code))
+                return response.status_code
+
+            self.original_filename = self._filename_from_headers(headers, self.url)
+            # Surrounding quotes are dropped before hashing, as before, so
+            # generated file names stay the same. A missing ETag hashes "".
+            etag = (headers.get('ETag') or '').strip('"')
+            self.etag_hash = hashlib.md5(etag.encode()).hexdigest()
+            self.filename = convert_filename(self.original_filename, self.etag_hash[0:6])
+
+            # Ask urllib3 to undo any Content-Encoding (gzip/deflate) while the
+            # raw stream is copied; otherwise the encoded bytes would be saved.
+            response.raw.decode_content = True
+            with open(self.path.joinpath(self.filename), 'wb') as f:
+                shutil.copyfileobj(response.raw, f)
+
+            metadata_headers = {
+                'ETag': etag,
+                'Last-Modified': (headers.get('Last-Modified') or '').strip('"'),
+                'Content-Length': (headers.get('Content-Length') or '').strip('"'),
+            }
+        self.write_metadata(metadata_headers)
+
+    def write_metadata(self,headers):
+        """Write a two-column CSV describing the downloaded file.
+
+        The file is created as
+        ``<path>/ZZZ_metadata/<filename>__metadata.csv``. Header values that
+        are absent from ``headers`` are written as empty strings.
+        """
+        metacsv = [
+            ["original_filename",   self.original_filename],
+            ["readable_filename",   self.filename],
+            ["url",                 self.url],
+            ["pathhash",            hashlib.md5(self.url.encode()).hexdigest()],
+            ["etag",                headers.get('ETag') or ""],
+            ["etaghash",            self.etag_hash],
+            ["last-modified",       headers.get("Last-Modified") or ""],
+            ["content-length",      headers.get("Content-Length") or ""],
+            ["age",                 ""],
+        ]
+        csvpath = self.path.joinpath("ZZZ_metadata")
+        csvpath.mkdir(parents=True, exist_ok=True)
+        with open(csvpath.joinpath(self.filename+"__metadata.csv"), "w", newline="") as f:
+            writer = csv.writer(f)
+            writer.writerows(metacsv)
--- a/utils/login.py
+++ b/utils/login.py
@ -0,0 +1,53 @@
+from utils.wait import WaitClickable
+from utils.selectors import Selectors
+import sys
+from selenium.webdriver.support.wait import WebDriverWait
+from urllib.parse import urlparse
+from selenium.webdriver.support import expected_conditions as EC
+from getpass import getpass
+from constants.constants import BASE_URL
+import re
+import json
+
+def try_cookie(driver):
+    """Return the Cookie request header recorded for the portal page.
+
+    Scans the browser's 'performance' log for an entry whose ``documentURL``
+    points at ``BASE_URL + "/webapps/portal"`` and whose ``redirectResponse``
+    carries request headers that include a Cookie.
+
+    Args:
+        driver: Selenium WebDriver with the performance log enabled.
+
+    Returns:
+        The Cookie header value, or None when no entry provides one.
+    """
+    # Built from BASE_URL (escaped, so dots are literal) instead of a
+    # hard-coded host, matching the rest of the login code.
+    portal_url = re.compile(re.escape(BASE_URL) + r'/webapps/portal')
+    for entry in driver.get_log('performance'):
+        parameters = json.loads(entry["message"])['message']['params']
+        if portal_url.search(parameters.get('documentURL', '')) is None:
+            continue
+        # Matching entries without a redirect response are skipped instead of
+        # raising KeyError.
+        request_headers = parameters.get('redirectResponse', {}).get('requestHeaders', {})
+        if 'Cookie' in request_headers:
+            return request_headers['Cookie']
+    return None
+
+def login(args, driver):
+    """Sign in through the Microsoft login form and return the session cookie.
+
+    Args:
+        args: parsed command-line arguments; when ``args.username`` is empty
+            the user ID is read from standard input.
+        driver: Selenium WebDriver used to drive the login pages.
+
+    Returns:
+        The value returned by ``try_cookie(driver)``.
+
+    The process exits with status 2 when the password step fails, and with
+    status -1 when the browser does not end up on ``BASE_URL``. The browser
+    is closed before exiting in both cases.
+    """
+    username = args.username
+    if len(username) == 0:
+        print('UserID: ')
+        username = input()
+    username += '@student.uwa.edu.au'
+    print('Password: ')
+    password = getpass('')
+
+    driver.get(BASE_URL)
+
+    WaitClickable(driver,Selectors.BOX_USERNAME).send_keys(username)
+    WaitClickable(driver,Selectors.BUTTON_NEXT).click()
+    print('Entered username.')
+
+    try:
+        WaitClickable(driver,Selectors.BOX_PASSWORD).send_keys(password)
+        WaitClickable(driver,Selectors.BUTTON_NEXT).click()
+        print('Entered password.')
+    except Exception:
+        # Show the page's own error text when it has one. If that lookup
+        # fails too, still close the browser instead of leaking it.
+        try:
+            message = WebDriverWait(driver, 1).until(
+                EC.visibility_of_element_located(Selectors.DIV_USERERROR)
+            ).text
+        except Exception:
+            message = 'Could not enter password.'
+        print(message)
+        driver.quit()
+        sys.exit(2)
+
+    WaitClickable(driver,Selectors.BUTTON_DENY).click()
+    # WaitClickable(driver,BUTTON_NEXT).click() #IF you want to remember credentials, switch these comments
+    # After a successful login the browser should be back on the LMS host.
+    current_uri = urlparse(driver.current_url)
+    if '{uri.scheme}://{uri.netloc}'.format(uri=current_uri) != BASE_URL:
+        driver.quit()
+        print("Login failed.")
+        sys.exit(-1)
+
+    return try_cookie(driver)
--- a/utils/selectors.py
+++ b/utils/selectors.py
@ -0,0 +1,10 @@
+from selenium.webdriver.common.by import By
+
+class Selectors:
+    """Selenium locators, as (By strategy, value) tuples, used by the scripts."""
+    # Microsoft login
+    # Text box that receives the user ID.
+    BOX_USERNAME = (By.ID, "i0116")
+    # Text box that receives the password.
+    BOX_PASSWORD = (By.ID, "i0118")
+    # Element whose text is printed when the password step fails.
+    DIV_USERERROR = (By.ID, 'usernameError')
+    # Clicked after the user ID and again after the password.
+    BUTTON_NEXT = (By.ID, "idSIButton9")
+    # Clicked after the password step; presumably declines the "remember
+    # credentials" prompt - TODO confirm
+    BUTTON_DENY = (By.ID, "idBtn_Back")
+    # Selectors for grades
--- a/utils/utils.py
+++ b/utils/utils.py
@ -0,0 +1,36 @@
+import pathlib
+import re
+
+def friendly_filename(name):
+    """Return ``name`` cleaned like a directory name, with path separators removed.
+
+    Applies ``friendly_dirname`` and then deletes every ``/`` and ``\\`` so
+    the result is a single path component.
+    """
+    name = friendly_dirname(name)
+    return re.sub(r"[\\/]",'',name)
+
+def friendly_dirname(name):
+    """Return ``name`` with characters that are awkward in paths removed.
+
+    Path separators are kept, so a whole relative path may be passed in.
+    """
+    #.gsub(/[^\w\s_-]+/, '')
+    # .gsub(/\s+/, '_')
+    # pipeline:
+    # 1. drop ASCII control characters (this includes tabs and newlines)
+    name = re.sub(r"[\x00-\x1f]",'',name)
+    # 2. drop the characters : < > " | ? *
+    name = re.sub(r'[:<>"|?*]','',name)
+    # 3. remove leading/trailing whitespace runs and shrink a run between two
+    #    words to one character. The pattern has to be a raw string: in a
+    #    normal string "\b" is a backspace, not a word boundary, which made
+    #    this step a no-op.
+    name = re.sub(r"(^|\b\s)\s+($|\s?\b)", r'\1\2', name)
+    return name.strip()
+
+
+def get_assignment_name(driver,block):
+    """Return a file-system friendly name for the assessment shown in ``block``.
+
+    The name is built from the element's own text (text of child elements is
+    excluded) and is also printed to standard output.
+    """
+    own_text = get_text_excluding_children(driver, block)
+    assignment = friendly_filename(own_text)
+    print(f"Assesment: {assignment}")
+    return assignment
+
+def save_html(dir,filename,page_source):
+    """Write ``page_source`` to ``<dir>/<filename>.html`` encoded as UTF-8.
+
+    ``dir`` is cleaned with ``friendly_dirname`` and ``filename`` with
+    ``friendly_filename``. Missing directories are created.
+    """
+    target_dir = pathlib.Path(friendly_dirname(dir))
+    target_dir.mkdir(parents=True, exist_ok=True)
+    target_file = target_dir / (friendly_filename(filename) + ".html")
+    target_file.write_text(page_source, encoding="utf-8")
+
+# https://stackoverflow.com/a/19040341
+def get_text_excluding_children(driver, element):
+    """Return the text of ``element``'s direct text nodes only.
+
+    Runs a jQuery snippet in the browser that keeps the element's TEXT_NODE
+    children and concatenates their text, so text that belongs to child
+    elements is left out. The page must provide ``jQuery``.
+    """
+    own_text_script = """
+    return jQuery(arguments[0]).contents().filter(function() {
+        return this.nodeType == Node.TEXT_NODE;
+    }).text();
+    """
+    return driver.execute_script(own_text_script, element)
--- a/utils/wait.py
+++ b/utils/wait.py
@ -0,0 +1,7 @@
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# find_element_safe = lambda name,timeout=30:WebDriverWait(driver, timeout).until(lambda x: x.find_element_by_id(name))
+def WaitClickable(driver, locator, timeout=10):
+    """Wait up to ``timeout`` seconds until ``locator`` is clickable.
+
+    Returns the result of the wait, which callers use as the element
+    (for example to call ``.click()`` or ``.send_keys()`` on it).
+    """
+    return WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator))
+
+
+def WaitDiv(driver, locator, timeout=5):
+    """Wait up to ``timeout`` seconds until ``locator`` is present in the page."""
+    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))
+
+
+def SwitchToIFrame(driver, locator, timeout=5):
+    """Wait up to ``timeout`` seconds for the frame at ``locator`` and switch to it."""
+    return WebDriverWait(driver, timeout).until(EC.frame_to_be_available_and_switch_to_it(locator))
				`@ -0,0 +1 @@`
				`BASE_URL = "https://lms.uwa.edu.au" # Include protocol.`