From 234cd31bf4d68873669bc3808b0472c8839c91ab Mon Sep 17 00:00:00 2001 From: Peter Date: Thu, 21 Jul 2022 00:41:43 +0800 Subject: [PATCH] Download feedback in comments, and format code --- .gitignore | 4 +- main.py | 144 +++++++++++++++++++++++++++------------------ utils/__init__.py | 5 +- utils/asset.py | 38 +++++++----- utils/login.py | 23 ++++---- utils/selectors.py | 3 +- utils/utils.py | 36 ++++++------ utils/wait.py | 15 ++++- 8 files changed, 162 insertions(+), 106 deletions(-) diff --git a/.gitignore b/.gitignore index d4b1961..c2e9b6f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ tmp/ __pycache__ chromedriver* test* -.vscode/ \ No newline at end of file +.vscode/ +*.7z +*.tar \ No newline at end of file diff --git a/main.py b/main.py index ebb0e57..575174e 100644 --- a/main.py +++ b/main.py @@ -27,31 +27,37 @@ from selenium.common.exceptions import ElementNotInteractableException testing = False try: testing = True - from utils.test import get_etc + from utils.test import get_etc except: def get_etc(*args): return False + # stupid bug -def click_the_fing_button(driver,button): +def click_the_fing_button(driver, button): try: ActionChains(driver).move_to_element(button) ActionChains(driver).click(button).perform() - WebDriverWait(driver,2).until(EC.number_of_windows_to_be(2)) + WebDriverWait(driver, 2).until(EC.number_of_windows_to_be(2)) except: - driver.set_window_size(1024, 768) # hack to wake selenium up when it doesnt want to click the button! - click_the_fing_button(driver,button) + # hack to wake selenium up when it doesnt want to click the button! + driver.set_window_size(1024, 768) + click_the_fing_button(driver, button) driver.maximize_window() # You can probably replace this with a recursive method like in blackboard scraper but tbh i just want to get this script done so i can stop working for once. 
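# Sketch, not part of this patch: the retry in click_the_fing_button above
# recurses with no upper bound, so a button that never becomes clickable loops
# forever. A bounded, iterative variant using the same window-resize workaround
# could look like this; `click_button_with_retries` and `max_tries` are assumed
# names, not from this repo.
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


def click_button_with_retries(driver, button, max_tries=3):
    for _ in range(max_tries):
        try:
            # chain move + click into one gesture, then wait for the popup window
            ActionChains(driver).move_to_element(button).click(button).perform()
            WebDriverWait(driver, 2).until(EC.number_of_windows_to_be(2))
            break
        except Exception:
            # same workaround as above: a resize nudges Selenium/Chrome awake
            driver.set_window_size(1024, 768)
    driver.maximize_window()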
-def scrape_further(driver,path,session): + + +def scrape_further(driver, path, session): # attempts for bb-held tests - attempts = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]") - attempts = [ x.get_attribute('href') for x in attempts ] + attempts = driver.find_elements( + By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]") + attempts = [x.get_attribute('href') for x in attempts] for i, attempt in enumerate(attempts): - name = "attempt_"+str(i)+"_["+parse_qs(urlparse(attempt).query)['attempt_id'][0]+"]" - attempt = re.sub("^"+BASE_URL,"",attempt) + name = "attempt_" + \ + str(i)+"_["+parse_qs(urlparse(attempt).query)['attempt_id'][0]+"]" + attempt = re.sub("^"+BASE_URL, "", attempt) driver.execute_script("window.open('"+BASE_URL+attempt+"')") - WebDriverWait(driver,10).until(EC.number_of_windows_to_be(3)) + WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(3)) driver.switch_to.window(driver.window_handles[2]) save_html(path, name, driver.page_source) if testing: @@ -59,25 +65,36 @@ def scrape_further(driver,path,session): driver.close() driver.switch_to.window(driver.window_handles[1]) - # submission file for assignment + # Comments may contain feedback links request_stack = RequestStack(session) - attempts = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]") - attempts = [ x.get_attribute('href') for x in attempts ] + etc_files = driver.find_elements( + By.XPATH, "//a[contains(@href, '/bbcswebdav')]") + etc_files = [x.get_attribute('href') for x in etc_files] + for i, item in enumerate(etc_files): + if (not item is None) and ("bbcswebdav" in item): + request_stack.add_file(item, path) + + # submission file for assignment + attempts = driver.find_elements( + By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]") + attempts = [x.get_attribute('href') for x in attempts] for i, attempt in enumerate(attempts): - request_stack.add_file(attempt,path) + request_stack.add_file(attempt, path) get_feedback = False try: # download button causes a tab to appear quickly, download, then disappear # need to capture the url to get the metadata and dl to the correct location # cant be arsed to figure out how the pspdfkit js that executes this download works. 
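# Sketch, not part of this patch: because 'performance' logging is enabled via
# goog:loggingPrefs further down in main.py, the URL that the PSPDFKit download
# button fetches could in principle be read back from Chrome's DevTools log
# instead of being lost. Field names follow the Chrome DevTools Protocol; the
# helper name `recent_response_urls` is assumed, not from this repo.
import json


def recent_response_urls(driver):
    urls = []
    for entry in driver.get_log("performance"):
        message = json.loads(entry["message"])["message"]
        if message.get("method") == "Network.responseReceived":
            urls.append(message["params"]["response"]["url"])
    return urls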
- SwitchToIFrame(driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']")) + SwitchToIFrame( + driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']")) SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']")) get_feedback = True except: print("No feedback to download") if get_feedback: - dl_button = WaitClickable(driver,(By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']")) + dl_button = WaitClickable( + driver, (By.XPATH, "//button[contains(@class,'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']")) dl_button.click() download_file(path) request_stack.download_all() @@ -86,34 +103,37 @@ def scrape_further(driver,path,session): parser = argparse.ArgumentParser(description='Automated microsoft SSO login.') # parser.add_argument("-p", "--password", help="Automatically use provided password", default="") -parser.add_argument("-u", "--username", help="Automatically use provided userID", default="") +parser.add_argument("-u", "--username", + help="Automatically use provided userID", default="") path = ['grades'] args = parser.parse_args() CAPABILITIES = DesiredCapabilities.CHROME CAPABILITIES['goog:loggingPrefs'] = { - 'performance' : 'ALL', + 'performance': 'ALL', } for f in os.listdir(DL_DIR): os.remove(Path(DL_DIR).joinpath(f)) prefs = { - "profile.default_content_settings.popups": 0, - "download.default_directory": DL_DIR, - "directory_upgrade": True - } + "profile.default_content_settings.popups": 0, + "download.default_directory": DL_DIR, + "directory_upgrade": True +} OPTIONS = Options() +OPTIONS.add_argument('--no-sandbox') +OPTIONS.add_argument('--disable-dev-shm-usage') OPTIONS.add_experimental_option("prefs", prefs) # OPTIONS.add_argument("--headless") driver = webdriver.Chrome( - executable_path='chromedriver.exe', - desired_capabilities=CAPABILITIES, - options=OPTIONS - ) + executable_path='chromedriver.exe', + desired_capabilities=CAPABILITIES, + options=OPTIONS +) driver.maximize_window() -cookies = login(args, driver) # do Login. +cookies = login(args, driver) # do Login. session = requests.Session() for cookie in cookies: session.cookies.set(cookie["name"], cookie["value"]) @@ -121,13 +141,15 @@ for cookie in cookies: # need to load this page JUST to remove the tos warning so it doesnt fuck up everything down the line. 
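# Sketch, not part of this patch: if the commented-out "--headless" option
# above were enabled, Chrome may refuse to save files into DL_DIR at all. A
# common workaround, assuming a Selenium/ChromeDriver combination that exposes
# execute_cdp_cmd, is to allow downloads explicitly; the helper name below is
# assumed, not from this repo.
def allow_headless_downloads(driver, download_dir):
    driver.execute_cdp_cmd(
        "Page.setDownloadBehavior",
        {"behavior": "allow", "downloadPath": download_dir},
    )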
driver.get(BASE_URL+"/webapps/gradebook/do/student/viewCourses") try: - WaitClickable(driver,(By.CLASS_NAME, "button-1")).click() + WaitClickable(driver, (By.CLASS_NAME, "button-1")).click() except: print("no tos warning - skipped") -driver.get(BASE_URL+"/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades") +driver.get( + BASE_URL+"/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades") save_html(sep.join(path), 'entrypoint', driver.page_source) +WaitClickable(driver, (By.ID, "left_stream_mygrades")) # get courseIDs courses = driver.find_element(By.ID, "left_stream_mygrades")\ .find_elements(By.XPATH, "//div[@role='tab']") @@ -137,15 +159,17 @@ for i, course_results in enumerate(courses): course_results = courses[i] ActionChains(driver).move_to_element(course_results).perform() course_url = course_results.get_attribute("bb:rhs") - course_name = course_results.find_elements(By.XPATH, "//span[@class='stream_area_name']")[i].text - course_name += " ["+parse_qs(urlparse(course_url).query)['course_id'][0]+"]" + course_name = course_results.find_elements( + By.XPATH, "//span[@class='stream_area_name']")[i].text + course_name += " [" + \ + parse_qs(urlparse(course_url).query)['course_id'][0]+"]" course_details.append({ 'name': course_name, - 'url' : course_url + 'url': course_url }) for i, course in enumerate(course_details): - path.append(course['name']) # course name + path.append(course['name']) # course name print(course['name']) driver.get(BASE_URL+course['url']) @@ -155,8 +179,8 @@ for i, course in enumerate(course_details): } """) - WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click() - WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click() + WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click() + WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click() table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div") @@ -167,20 +191,23 @@ for i, course in enumerate(course_details): assignment_name = None information_link = False try: - block = assignment.find_element(By.XPATH, "./div[@class='cell gradable']/a[@onclick]") + block = assignment.find_element( + By.XPATH, "./div[@class='cell gradable']/a[@onclick]") information_link = True except: - block = assignment.find_element(By.XPATH, "./div[@class='cell gradable']") - assignment_name = get_assignment_name(driver,block) + block = assignment.find_element( + By.XPATH, "./div[@class='cell gradable']") + assignment_name = get_assignment_name(driver, block) path.append(assignment_name) # download information if it exists. 
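# Sketch, not part of this patch: a standalone illustration of the bracketed-ID
# folder naming used above for courses (and for attempts in scrape_further).
# The URL and course title here are made up.
from urllib.parse import parse_qs, urlparse

course_url = "/webapps/blackboard/execute/modulepage/view?course_id=_12345_1"  # hypothetical
course_name = "Example Course 101"                                             # hypothetical
course_name += " [" + parse_qs(urlparse(course_url).query)["course_id"][0] + "]"
# course_name is now "Example Course 101 [_12345_1]"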
if information_link: try: - ActionChains(driver).move_to_element(block).click(block).perform() + ActionChains(driver).move_to_element( + block).click(block).perform() print("Switched "+assignment_name) - WebDriverWait(driver,10).until(EC.number_of_windows_to_be(2)) + WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2)) driver.switch_to.window(driver.window_handles[1]) - save_html(sep.join(path),"information",driver.page_source) + save_html(sep.join(path), "information", driver.page_source) scrape_further(driver, sep.join(path), session) driver.close() driver.switch_to.window(driver.window_handles[0]) @@ -190,31 +217,36 @@ for i, course in enumerate(course_details): for button in buttons: action = button.get_attribute("onclick") if action != None and "showInLightBox" not in action: - click_the_fing_button(driver,button) - driver.execute_script("window.scrollTo(0, document.body.scrollHeight)") + click_the_fing_button(driver, button) + driver.execute_script( + "window.scrollTo(0, document.body.scrollHeight)") driver.switch_to.window(driver.window_handles[1]) WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer")) - save_html(sep.join(path),"rubric",driver.page_source) - driver.find_element(By.XPATH, "//li[@id='listViewTab']/a").click() + save_html(sep.join(path), "rubric", driver.page_source) + driver.find_element( + By.XPATH, "//li[@id='listViewTab']/a").click() WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList")) - save_html(sep.join(path),"list",driver.page_source) - detailed_buttons = driver.find_elements(By.XPATH, "//div[@class='u_controlsWrapper']/input") + save_html(sep.join(path), "list", driver.page_source) + detailed_buttons = driver.find_elements( + By.XPATH, "//div[@class='u_controlsWrapper']/input") detailed_buttons[1].click() detailed_buttons[0].click() - save_html(sep.join(path),"list_detailed",driver.page_source) + save_html(sep.join(path), "list_detailed", driver.page_source) driver.close() driver.switch_to.window(driver.window_handles[0]) - path.pop() + path.pop() save_html(sep.join(path), path[0], driver.page_source) - WaitClickable(driver,(By.XPATH,"//a[@value='S']")).click() - save_html(sep.join(path),"submitted",driver.page_source) + WaitClickable(driver, (By.XPATH, "//a[@value='S']")).click() + save_html(sep.join(path), "submitted", driver.page_source) try: - WaitClickable(driver,(By.XPATH,"//div[@id='submissionReceipts']//a")).click() - WaitClickable(driver,(By.XPATH,"//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click() + WaitClickable( + driver, (By.XPATH, "//div[@id='submissionReceipts']//a")).click() + WaitClickable( + driver, (By.XPATH, "//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click() except: print('No items?') - save_html(sep.join(path),"receipts",driver.page_source) + save_html(sep.join(path), "receipts", driver.page_source) path.pop() -driver.quit() \ No newline at end of file +driver.quit() diff --git a/utils/__init__.py b/utils/__init__.py index 40137d1..4723f18 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,3 +1,4 @@ # https://stackoverflow.com/a/49375740 -import os, sys -sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file +import os +import sys +sys.path.append(os.path.dirname(os.path.realpath(__file__))) diff --git a/utils/asset.py b/utils/asset.py index b4eb000..a852032 100644 --- a/utils/asset.py +++ b/utils/asset.py @@ -6,6 +6,7 @@ import shutil import csv from pathlib import Path + def convert_filename(name, hash): _name = name.split('.') if 
len(_name) > 1: @@ -14,30 +15,33 @@ def convert_filename(name, hash): _name[0] += ("["+hash+"]") return '.'.join(_name) + class RequestStack: - def __init__(self,token): + def __init__(self, token): self.request_stack = [] self.token = token super().__init__() - def add_file(self,url,path): - self.request_stack.append(Asset(url,path)) - + def add_file(self, url, path): + self.request_stack.append(Asset(url, path)) + def download_all(self): for file in self.request_stack: print(f"\tDownloading {file.url}") file.download(self.token) + class Asset: - def __init__(self,url,path): + def __init__(self, url, path): self.path = Path(path) - self.url = re.sub("^"+BASE_URL,"",url) + self.url = re.sub("^"+BASE_URL, "", url) # self.file_id = re.findall('file_id=(.+)&',url) self.path.mkdir(parents=True, exist_ok=True) super().__init__() - def download(self,session): - response = session.get(BASE_URL+self.url, stream=True, allow_redirects=False) + def download(self, session): + response = session.get( + BASE_URL+self.url, stream=True, allow_redirects=False) headers = response.headers if response.status_code == 302 and len(headers['location']) > 0: Asset(headers['location'], self.path).download(session) @@ -45,24 +49,28 @@ class Asset: elif response.status_code != 200: print("[!] Error "+str(response.status_code)) return response.status_code - headers = { x:re.sub(r'^"*|"*?$', '', headers.get(x)) for x in headers } # ewww regex + headers = {x: re.sub(r'^"*|"*?$', '', headers.get(x)) + for x in headers} # ewww regex if 'Content-Disposition' in headers.keys(): - self.original_filename = re.findall('filename="(.+)"', headers['Content-Disposition'])[0] + self.original_filename = re.findall( + 'filename="(.+)"', headers['Content-Disposition'])[0] else: - self.original_filename = re.sub(".*/","",self.url) + self.original_filename = re.sub(".*/", "", self.url) self.etag_hash = hashlib.md5(headers['ETag'].encode()).hexdigest() - self.filename = convert_filename(self.original_filename, self.etag_hash[0:6]) + self.filename = convert_filename( + self.original_filename, self.etag_hash[0:6]) with open(self.path.joinpath(self.filename), 'wb') as f: shutil.copyfileobj(response.raw, f) self.write_metadata(headers) - def write_metadata(self,headers): + def write_metadata(self, headers): metacsv = [ ["original_filename", self.original_filename], ["readable_filename", self.filename], ["url", self.url], - ["pathhash", hashlib.md5(self.url.encode()).hexdigest()], + ["pathhash", hashlib.md5( + self.url.encode()).hexdigest()], ["etag", headers['ETag']], ["etaghash", self.etag_hash], ["last-modified", headers["Last-Modified"]], @@ -73,4 +81,4 @@ class Asset: csvpath.mkdir(parents=True, exist_ok=True) with open(csvpath.joinpath(self.filename+"__metadata.csv"), "w", newline="") as f: writer = csv.writer(f) - writer.writerows(metacsv) \ No newline at end of file + writer.writerows(metacsv) diff --git a/utils/login.py b/utils/login.py index c74f0d6..9e07f9b 100644 --- a/utils/login.py +++ b/utils/login.py @@ -9,6 +9,7 @@ from constants.constants import BASE_URL import re import json + def login(args, driver): driver.get(BASE_URL) USERNAME = args.username @@ -19,25 +20,27 @@ def login(args, driver): print('Password: ') PASSWORD = getpass('') - WaitClickable(driver,Selectors.BOX_USERNAME).send_keys(USERNAME) - WaitClickable(driver,Selectors.BUTTON_NEXT).click() + WaitClickable(driver, Selectors.BOX_USERNAME).send_keys(USERNAME) + WaitClickable(driver, Selectors.BUTTON_NEXT).click() print('Entered username.') try: - 
WaitClickable(driver,Selectors.BOX_PASSWORD).send_keys(PASSWORD) - WaitClickable(driver,Selectors.BUTTON_NEXT).click() + WaitClickable(driver, Selectors.BOX_PASSWORD).send_keys(PASSWORD) + WaitClickable(driver, Selectors.BUTTON_NEXT).click() print('Entered password.') except: - print(WebDriverWait(driver, 1).until(EC.visibility_of_element_located(Selectors.DIV_USERERROR)).text) + print(WebDriverWait(driver, 1).until( + EC.visibility_of_element_located(Selectors.DIV_USERERROR)).text) driver.quit() exit(2) - WaitClickable(driver,Selectors.BUTTON_DENY).click() + WaitClickable(driver, Selectors.BUTTON_DENY).click() # WaitClickable(driver,BUTTON_NEXT).click() #IF you want to remember credentials, switch these comments - + cookie = driver.get_cookies() - if not cookie == None: return cookie - + if not cookie == None: + return cookie + print('Could not get auth cookie - Invalid ID or password?', file=sys.stderr) driver.quit() - exit(1) \ No newline at end of file + exit(1) diff --git a/utils/selectors.py b/utils/selectors.py index 09698b5..299da91 100644 --- a/utils/selectors.py +++ b/utils/selectors.py @@ -1,5 +1,6 @@ from selenium.webdriver.common.by import By + class Selectors: # Microsoft login BOX_USERNAME = (By.ID, "i0116") @@ -7,4 +8,4 @@ class Selectors: DIV_USERERROR = (By.ID, 'usernameError') BUTTON_NEXT = (By.ID, "idSIButton9") BUTTON_DENY = (By.ID, "idBtn_Back") - # Selectors for grades \ No newline at end of file + # Selectors for grades diff --git a/utils/utils.py b/utils/utils.py index 358fb92..6575ee0 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -7,41 +7,41 @@ import os from pathlib import Path import shutil + def friendly_filename(name): name = friendly_dirname(name) - return re.sub("[\\\/]",'',name) + return re.sub("[\\\/]", '', name) + def friendly_dirname(name): - #.gsub(/[^\w\s_-]+/, '') + # .gsub(/[^\w\s_-]+/, '') # .gsub(/\s+/, '_') # pipeline: - name = re.sub("[\x00-\x1f]",'',name) - name = re.sub("[\:\<\>\"\|\?\*]",'',name) + name = re.sub("[\x00-\x1f]", '', name) + name = re.sub("[\:\<\>\"\|\?\*]", '', name) name = re.sub("(^|\b\s)\s+($|\s?\b)", '\\1\\2', name) return name.strip() -def get_assignment_name(driver,block): - s = friendly_filename(get_text_excluding_children(driver,block)) +def get_assignment_name(driver, block): + s = friendly_filename(get_text_excluding_children(driver, block)) print("Assesment: "+s) return s -def save_html(dir,filename,page_source): + +def save_html(dir, filename, page_source): dir = pathlib.Path(friendly_dirname(dir)) dir.mkdir(parents=True, exist_ok=True) file = dir.joinpath(friendly_filename(filename)+".html") with open(file, "w", encoding="utf-8") as f: f.write(page_source) -# Why is it so hard to just get the url of a single tab... -# def get_fast_dl(driver,button): -# windows = len(driver.window_handles) -# return - -# Because selenium seems to fuck up the url switching to a "download" tab, -# I have to use the inbuilt download in chrome :(. That also means no etag/metadata -# but to be honest it's using annotate-au.foundations.blackboard.com and not bbcswebdav system +# NOTE: Switching to a "download" tab causes issues so we must use the in built +# download in Chrome, which does not have etag or metadata information. +# Files are using annotate-au.foundations.blackboard.com and not bbcswebdav system # so the tag may not exist in the first place. 
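# Sketch, not part of this patch: download_file below starts with a fixed
# time.sleep(2) and then watches DL_DIR. An alternative that waits on Chrome's
# *.crdownload partial files instead could look like this; the helper name and
# timeout parameter are assumed, not from this repo.
import time
from pathlib import Path


def wait_for_chrome_downloads(download_dir, timeout=60):
    deadline = time.time() + timeout
    while time.time() < deadline:
        # Chrome renames *.crdownload to the final filename once it finishes
        if not list(Path(download_dir).glob("*.crdownload")):
            return
        time.sleep(0.5)
    raise TimeoutError(f"Chrome download did not finish within {timeout} seconds")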
+ + def download_file(dest): d = Path(DL_DIR) time.sleep(2) @@ -56,10 +56,10 @@ def download_file(dest): else: _dest = Path(dest).joinpath("MARKED__"+f) try: - shutil.move(d.joinpath(f),_dest) + shutil.move(d.joinpath(f), _dest) except shutil.SameFileError: os.remove(_dest) - shutil.move(d.joinpath(f),_dest) + shutil.move(d.joinpath(f), _dest) if len(os.listdir(d)) == 0: downloading = False @@ -71,4 +71,4 @@ def get_text_excluding_children(driver, element): return jQuery(arguments[0]).contents().filter(function() { return this.nodeType == Node.TEXT_NODE; }).text(); - """, element) \ No newline at end of file + """, element) diff --git a/utils/wait.py b/utils/wait.py index c582169..1e245d1 100644 --- a/utils/wait.py +++ b/utils/wait.py @@ -2,6 +2,15 @@ from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC timeout = 5 # find_element_safe = lambda name,timeout=30:WebDriverWait(driver, timeout).until(lambda x: x.find_element(By.ID, name)) -WaitClickable = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator)) -WaitDiv = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator)) -SwitchToIFrame = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.frame_to_be_available_and_switch_to_it(locator)) \ No newline at end of file + + +def WaitClickable(driver, locator): return WebDriverWait( + driver, timeout).until(EC.element_to_be_clickable(locator)) + + +def WaitDiv(driver, locator): return WebDriverWait( + driver, timeout).until(EC.presence_of_element_located(locator)) + + +def SwitchToIFrame(driver, locator): return WebDriverWait( + driver, timeout).until(EC.frame_to_be_available_and_switch_to_it(locator))
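# Sketch, not part of this patch: the single-line "def ...: return ..." helpers
# above in utils/wait.py behave exactly like conventional multi-line functions;
# for reference, WaitClickable written the usual way would be:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

timeout = 5


def WaitClickable(driver, locator):
    return WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable(locator))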
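# Sketch, not part of this patch: get_text_excluding_children in utils/utils.py
# relies on the page having loaded jQuery. If that assumption ever breaks, a
# plain-DOM equivalent with the same arguments could be substituted; the
# function name below is assumed, not from this repo.
def get_text_excluding_children_plain(driver, element):
    return driver.execute_script("""
        return Array.from(arguments[0].childNodes)
            .filter(function (node) { return node.nodeType === Node.TEXT_NODE; })
            .map(function (node) { return node.textContent; })
            .join('');
    """, element)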