Download feedback in comments, and format code

2024-11-30 11:40:16 +08:00 · 2022-07-21 00:41:43 +08:00 · 2022-07-21 00:41:43 +08:00 · 234cd31bf4
commit 234cd31bf4
parent e3ed2765d6
8 changed files with 162 additions and 106 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ __pycache__
 chromedriver*
 test*
 .vscode/
+*.7z
+*.tar
--- a/main.py
+++ b/main.py
@@ -31,27 +31,33 @@ try:
 except:
    def get_etc(*args): return False

+
 # stupid bug
-def click_the_fing_button(driver,button):
+def click_the_fing_button(driver, button):
    try:
        ActionChains(driver).move_to_element(button)
        ActionChains(driver).click(button).perform()
-        WebDriverWait(driver,2).until(EC.number_of_windows_to_be(2))
+        WebDriverWait(driver, 2).until(EC.number_of_windows_to_be(2))
    except:
-        driver.set_window_size(1024, 768)   # hack to wake selenium up when it doesnt want to click the button!
-        click_the_fing_button(driver,button)
+        # Hack to wake Selenium up when it doesn't want to click the button!
+        driver.set_window_size(1024, 768)
+        click_the_fing_button(driver, button)
        driver.maximize_window()

 # You can probably replace this with a recursive method like in blackboard scraper but tbh i just want to get this script done so i can stop working for once.
-def scrape_further(driver,path,session):
+
+
+def scrape_further(driver, path, session):
    # attempts for bb-held tests
-    attempts = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]")
-    attempts = [ x.get_attribute('href') for x in attempts ]
+    attempts = driver.find_elements(
+        By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]")
+    attempts = [x.get_attribute('href') for x in attempts]
    for i, attempt in enumerate(attempts):
-        name = "attempt_"+str(i)+"_["+parse_qs(urlparse(attempt).query)['attempt_id'][0]+"]"
-        attempt = re.sub("^"+BASE_URL,"",attempt)
+        name = "attempt_" + \
+            str(i)+"_["+parse_qs(urlparse(attempt).query)['attempt_id'][0]+"]"
+        attempt = re.sub("^"+BASE_URL, "", attempt)
        driver.execute_script("window.open('"+BASE_URL+attempt+"')")
-        WebDriverWait(driver,10).until(EC.number_of_windows_to_be(3))
+        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(3))
        driver.switch_to.window(driver.window_handles[2])
        save_html(path, name, driver.page_source)
        if testing:
@@ -59,25 +65,36 @@ def scrape_further(driver,path,session):
        driver.close()
        driver.switch_to.window(driver.window_handles[1])

-    # submission file for assignment
+    # Comments may contain feedback links
    request_stack = RequestStack(session)
-    attempts = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]")
-    attempts = [ x.get_attribute('href') for x in attempts ]
+    etc_files = driver.find_elements(
+        By.XPATH, "//a[contains(@href, '/bbcswebdav')]")
+    etc_files = [x.get_attribute('href') for x in etc_files]
+    for i, item in enumerate(etc_files):
+        if (not item is None) and ("bbcswebdav" in item):
+            request_stack.add_file(item, path)
+
+    # submission file for assignment
+    attempts = driver.find_elements(
+        By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]")
+    attempts = [x.get_attribute('href') for x in attempts]
    for i, attempt in enumerate(attempts):
-        request_stack.add_file(attempt,path)
+        request_stack.add_file(attempt, path)

    get_feedback = False
    try:
        # download button causes a tab to appear quickly, download, then disappear
        # need to capture the url to get the metadata and dl to the correct location
        # cant be arsed to figure out how the pspdfkit js that executes this download works.
-        SwitchToIFrame(driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
+        SwitchToIFrame(
+            driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
        SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
        get_feedback = True
    except:
        print("No feedback to download")
    if get_feedback:
-        dl_button = WaitClickable(driver,(By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
+        dl_button = WaitClickable(
+            driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
        dl_button.click()
        download_file(path)
    request_stack.download_all()
@@ -86,14 +103,15 @@ def scrape_further(driver,path,session):

 parser = argparse.ArgumentParser(description='Automated microsoft SSO login.')
 # parser.add_argument("-p", "--password", help="Automatically use provided password", default="")
-parser.add_argument("-u", "--username", help="Automatically use provided userID", default="")
+parser.add_argument("-u", "--username",
+                    help="Automatically use provided userID", default="")

 path = ['grades']
 args = parser.parse_args()

 CAPABILITIES = DesiredCapabilities.CHROME
 CAPABILITIES['goog:loggingPrefs'] = {
-    'performance'           : 'ALL',
+    'performance': 'ALL',
 }

 for f in os.listdir(DL_DIR):
@@ -102,15 +120,17 @@ prefs = {
    "profile.default_content_settings.popups": 0,
    "download.default_directory": DL_DIR,
    "directory_upgrade": True
-        }
+}
 OPTIONS = Options()
+OPTIONS.add_argument('--no-sandbox')
+OPTIONS.add_argument('--disable-dev-shm-usage')
 OPTIONS.add_experimental_option("prefs", prefs)
 # OPTIONS.add_argument("--headless")
 driver = webdriver.Chrome(
    executable_path='chromedriver.exe',
    desired_capabilities=CAPABILITIES,
    options=OPTIONS
-                        )
+)
 driver.maximize_window()

 cookies = login(args, driver)  # do Login.
@@ -121,13 +141,15 @@ for cookie in cookies:
 # need to load this page JUST to remove the tos warning so it doesnt fuck up everything down the line.
 driver.get(BASE_URL+"/webapps/gradebook/do/student/viewCourses")
 try:
-    WaitClickable(driver,(By.CLASS_NAME, "button-1")).click()
+    WaitClickable(driver, (By.CLASS_NAME, "button-1")).click()
 except:
    print("no tos warning - skipped")

-driver.get(BASE_URL+"/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades")
+driver.get(
+    BASE_URL+"/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades")
 save_html(sep.join(path), 'entrypoint', driver.page_source)

+WaitClickable(driver, (By.ID, "left_stream_mygrades"))
 # get courseIDs
 courses = driver.find_element(By.ID, "left_stream_mygrades")\
                .find_elements(By.XPATH, "//div[@role='tab']")
@@ -137,11 +159,13 @@ for i, course_results in enumerate(courses):
    course_results = courses[i]
    ActionChains(driver).move_to_element(course_results).perform()
    course_url = course_results.get_attribute("bb:rhs")
-    course_name = course_results.find_elements(By.XPATH, "//span[@class='stream_area_name']")[i].text
-    course_name += " ["+parse_qs(urlparse(course_url).query)['course_id'][0]+"]"
+    course_name = course_results.find_elements(
+        By.XPATH, "//span[@class='stream_area_name']")[i].text
+    course_name += " [" + \
+        parse_qs(urlparse(course_url).query)['course_id'][0]+"]"
    course_details.append({
        'name': course_name,
-        'url' : course_url
+        'url': course_url
    })

 for i, course in enumerate(course_details):
@@ -155,8 +179,8 @@ for i, course in enumerate(course_details):
    }
    """)

-    WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click()
-    WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click()
+    WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
+    WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()

    table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div")

@@ -167,20 +191,23 @@ for i, course in enumerate(course_details):
        assignment_name = None
        information_link = False
        try:
-            block = assignment.find_element(By.XPATH, "./div[@class='cell gradable']/a[@onclick]")
+            block = assignment.find_element(
+                By.XPATH, "./div[@class='cell gradable']/a[@onclick]")
            information_link = True
        except:
-            block = assignment.find_element(By.XPATH, "./div[@class='cell gradable']")
-        assignment_name = get_assignment_name(driver,block)
+            block = assignment.find_element(
+                By.XPATH, "./div[@class='cell gradable']")
+        assignment_name = get_assignment_name(driver, block)
        path.append(assignment_name)
        # download information if it exists.
        if information_link:
            try:
-                ActionChains(driver).move_to_element(block).click(block).perform()
+                ActionChains(driver).move_to_element(
+                    block).click(block).perform()
                print("Switched "+assignment_name)
-                WebDriverWait(driver,10).until(EC.number_of_windows_to_be(2))
+                WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
                driver.switch_to.window(driver.window_handles[1])
-                save_html(sep.join(path),"information",driver.page_source)
+                save_html(sep.join(path), "information", driver.page_source)
                scrape_further(driver, sep.join(path), session)
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
@@ -190,30 +217,35 @@ for i, course in enumerate(course_details):
        for button in buttons:
            action = button.get_attribute("onclick")
            if action != None and "showInLightBox" not in action:
-                click_the_fing_button(driver,button)
-                driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
+                click_the_fing_button(driver, button)
+                driver.execute_script(
+                    "window.scrollTo(0, document.body.scrollHeight)")
                driver.switch_to.window(driver.window_handles[1])
                WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer"))
-                save_html(sep.join(path),"rubric",driver.page_source)
-                driver.find_element(By.XPATH, "//li[@id='listViewTab']/a").click()
+                save_html(sep.join(path), "rubric", driver.page_source)
+                driver.find_element(
+                    By.XPATH, "//li[@id='listViewTab']/a").click()
                WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList"))
-                save_html(sep.join(path),"list",driver.page_source)
-                detailed_buttons = driver.find_elements(By.XPATH, "//div[@class='u_controlsWrapper']/input")
+                save_html(sep.join(path), "list", driver.page_source)
+                detailed_buttons = driver.find_elements(
+                    By.XPATH, "//div[@class='u_controlsWrapper']/input")
                detailed_buttons[1].click()
                detailed_buttons[0].click()
-                save_html(sep.join(path),"list_detailed",driver.page_source)
+                save_html(sep.join(path), "list_detailed", driver.page_source)
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
        path.pop()
    save_html(sep.join(path), path[0], driver.page_source)
-    WaitClickable(driver,(By.XPATH,"//a[@value='S']")).click()
-    save_html(sep.join(path),"submitted",driver.page_source)
+    WaitClickable(driver, (By.XPATH, "//a[@value='S']")).click()
+    save_html(sep.join(path), "submitted", driver.page_source)
    try:
-        WaitClickable(driver,(By.XPATH,"//div[@id='submissionReceipts']//a")).click()
-        WaitClickable(driver,(By.XPATH,"//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click()
+        WaitClickable(
+            driver, (By.XPATH, "//div[@id='submissionReceipts']//a")).click()
+        WaitClickable(
+            driver, (By.XPATH, "//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click()
    except:
        print('No items?')
-    save_html(sep.join(path),"receipts",driver.page_source)
+    save_html(sep.join(path), "receipts", driver.page_source)
    path.pop()


--- a/utils/init.py
+++ b/utils/init.py
@@ -1,3 +1,4 @@
 # https://stackoverflow.com/a/49375740
-import os, sys
+import os
+import sys
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
--- a/utils/asset.py
+++ b/utils/asset.py
@@ -6,6 +6,7 @@ import shutil
 import csv
 from pathlib import Path

+
 def convert_filename(name, hash):
    _name = name.split('.')
    if len(_name) > 1:
@@ -14,30 +15,33 @@ def convert_filename(name, hash):
        _name[0] += ("["+hash+"]")
    return '.'.join(_name)

+
 class RequestStack:
-    def __init__(self,token):
+    def __init__(self, token):
        self.request_stack = []
        self.token = token
        super().__init__()

-    def add_file(self,url,path):
-        self.request_stack.append(Asset(url,path))
+    def add_file(self, url, path):
+        self.request_stack.append(Asset(url, path))

    def download_all(self):
        for file in self.request_stack:
            print(f"\tDownloading {file.url}")
            file.download(self.token)

+
 class Asset:
-    def __init__(self,url,path):
+    def __init__(self, url, path):
        self.path = Path(path)
-        self.url = re.sub("^"+BASE_URL,"",url)
+        self.url = re.sub("^"+BASE_URL, "", url)
        # self.file_id = re.findall('file_id=(.+)&',url)
        self.path.mkdir(parents=True, exist_ok=True)
        super().__init__()

-    def download(self,session):
-        response = session.get(BASE_URL+self.url, stream=True, allow_redirects=False)
+    def download(self, session):
+        response = session.get(
+            BASE_URL+self.url, stream=True, allow_redirects=False)
        headers = response.headers
        if response.status_code == 302 and len(headers['location']) > 0:
            Asset(headers['location'], self.path).download(session)
@@ -45,24 +49,28 @@ class Asset:
        elif response.status_code != 200:
            print("[!] Error "+str(response.status_code))
            return response.status_code
-        headers = { x:re.sub(r'^"*|"*?$', '', headers.get(x)) for x in headers } # ewww regex
+        headers = {x: re.sub(r'^"*|"*?$', '', headers.get(x))
+                   for x in headers}  # ewww regex
        if 'Content-Disposition' in headers.keys():
-            self.original_filename = re.findall('filename="(.+)"', headers['Content-Disposition'])[0]
+            self.original_filename = re.findall(
+                'filename="(.+)"', headers['Content-Disposition'])[0]
        else:
-            self.original_filename = re.sub(".*/","",self.url)
+            self.original_filename = re.sub(".*/", "", self.url)
        self.etag_hash = hashlib.md5(headers['ETag'].encode()).hexdigest()
-        self.filename = convert_filename(self.original_filename, self.etag_hash[0:6])
+        self.filename = convert_filename(
+            self.original_filename, self.etag_hash[0:6])

        with open(self.path.joinpath(self.filename), 'wb') as f:
            shutil.copyfileobj(response.raw, f)
        self.write_metadata(headers)

-    def write_metadata(self,headers):
+    def write_metadata(self, headers):
        metacsv = [
            ["original_filename",   self.original_filename],
            ["readable_filename",   self.filename],
            ["url",                 self.url],
-            ["pathhash",            hashlib.md5(self.url.encode()).hexdigest()],
+            ["pathhash",            hashlib.md5(
+                self.url.encode()).hexdigest()],
            ["etag",                headers['ETag']],
            ["etaghash",            self.etag_hash],
            ["last-modified",       headers["Last-Modified"]],
--- a/utils/login.py
+++ b/utils/login.py
@@ -9,6 +9,7 @@ from constants.constants import BASE_URL
 import re
 import json

+
 def login(args, driver):
    driver.get(BASE_URL)
    USERNAME = args.username
@@ -19,24 +20,26 @@ def login(args, driver):
    print('Password: ')
    PASSWORD = getpass('')

-    WaitClickable(driver,Selectors.BOX_USERNAME).send_keys(USERNAME)
-    WaitClickable(driver,Selectors.BUTTON_NEXT).click()
+    WaitClickable(driver, Selectors.BOX_USERNAME).send_keys(USERNAME)
+    WaitClickable(driver, Selectors.BUTTON_NEXT).click()
    print('Entered username.')

    try:
-        WaitClickable(driver,Selectors.BOX_PASSWORD).send_keys(PASSWORD)
-        WaitClickable(driver,Selectors.BUTTON_NEXT).click()
+        WaitClickable(driver, Selectors.BOX_PASSWORD).send_keys(PASSWORD)
+        WaitClickable(driver, Selectors.BUTTON_NEXT).click()
        print('Entered password.')
    except:
-        print(WebDriverWait(driver, 1).until(EC.visibility_of_element_located(Selectors.DIV_USERERROR)).text)
+        print(WebDriverWait(driver, 1).until(
+            EC.visibility_of_element_located(Selectors.DIV_USERERROR)).text)
        driver.quit()
        exit(2)

-    WaitClickable(driver,Selectors.BUTTON_DENY).click()
+    WaitClickable(driver, Selectors.BUTTON_DENY).click()
    # WaitClickable(driver,BUTTON_NEXT).click() #IF you want to remember credentials, switch these comments

    cookie = driver.get_cookies()
-    if not cookie == None: return cookie
+    if not cookie == None:
+        return cookie

    print('Could not get auth cookie - Invalid ID or password?', file=sys.stderr)
    driver.quit()
--- a/utils/selectors.py
+++ b/utils/selectors.py
@@ -1,5 +1,6 @@
 from selenium.webdriver.common.by import By

+
 class Selectors:
    # Microsoft login
    BOX_USERNAME = (By.ID, "i0116")
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -7,41 +7,41 @@ import os
 from pathlib import Path
 import shutil

+
 def friendly_filename(name):
    name = friendly_dirname(name)
-    return re.sub("[\\\/]",'',name)
+    return re.sub("[\\\/]", '', name)
+

 def friendly_dirname(name):
-    #.gsub(/[^\w\s_-]+/, '')
+    # .gsub(/[^\w\s_-]+/, '')
    # .gsub(/\s+/, '_')
    # pipeline:
-    name = re.sub("[\x00-\x1f]",'',name)
-    name = re.sub("[\:\<\>\"\|\?\*]",'',name)
+    name = re.sub("[\x00-\x1f]", '', name)
+    name = re.sub("[\:\<\>\"\|\?\*]", '', name)
    name = re.sub("(^|\b\s)\s+($|\s?\b)", '\\1\\2', name)
    return name.strip()


-def get_assignment_name(driver,block):
-    s = friendly_filename(get_text_excluding_children(driver,block))
+def get_assignment_name(driver, block):
+    s = friendly_filename(get_text_excluding_children(driver, block))
    print("Assesment: "+s)
    return s

-def save_html(dir,filename,page_source):
+
+def save_html(dir, filename, page_source):
    dir = pathlib.Path(friendly_dirname(dir))
    dir.mkdir(parents=True, exist_ok=True)
    file = dir.joinpath(friendly_filename(filename)+".html")
    with open(file, "w", encoding="utf-8") as f:
        f.write(page_source)

-# Why is it so hard to just get the url of a single tab...
-# def get_fast_dl(driver,button):
-#     windows = len(driver.window_handles)
-#     return 
-
-# Because selenium seems to fuck up the url switching to a "download" tab, 
-# I have to use the inbuilt download in chrome :(. That also means no etag/metadata 
-# but to be honest it's using annotate-au.foundations.blackboard.com and not bbcswebdav system 
+# NOTE: Switching to a "download" tab causes issues, so we must use Chrome's
+# built-in download, which provides no ETag or metadata information.
+# These files are served from annotate-au.foundations.blackboard.com rather than
+# the bbcswebdav system, so the ETag may not exist in the first place.
+
+
 def download_file(dest):
    d = Path(DL_DIR)
    time.sleep(2)
@@ -56,10 +56,10 @@ def download_file(dest):
            else:
                _dest = Path(dest).joinpath("MARKED__"+f)
                try:
-                    shutil.move(d.joinpath(f),_dest)
+                    shutil.move(d.joinpath(f), _dest)
                except shutil.SameFileError:
                    os.remove(_dest)
-                    shutil.move(d.joinpath(f),_dest)
+                    shutil.move(d.joinpath(f), _dest)

        if len(os.listdir(d)) == 0:
            downloading = False
--- a/utils/wait.py
+++ b/utils/wait.py
@@ -2,6 +2,15 @@ from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 timeout = 5
 # find_element_safe = lambda name,timeout=30:WebDriverWait(driver, timeout).until(lambda x: x.find_element(By.ID, name))
-WaitClickable = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator))
-WaitDiv = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))
-SwitchToIFrame = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.frame_to_be_available_and_switch_to_it(locator))
+
+
+def WaitClickable(driver, locator): return WebDriverWait(
+    driver, timeout).until(EC.element_to_be_clickable(locator))
+
+
+def WaitDiv(driver, locator): return WebDriverWait(
+    driver, timeout).until(EC.presence_of_element_located(locator))
+
+
+def SwitchToIFrame(driver, locator): return WebDriverWait(
+    driver, timeout).until(EC.frame_to_be_available_and_switch_to_it(locator))