From e3ed2765d6c2dab1d6384b4cf187e31dee5d9b66 Mon Sep 17 00:00:00 2001
From: Peter
Date: Sun, 14 Nov 2021 01:53:15 +0800
Subject: [PATCH] Update to new LMS, update chromedriver to 95, unspaghettified *some* code

---
 .gitignore     |  5 +++--
 README.md      |  3 ++-
 main.py        | 56 ++++++++++++++++++++++----------------------
 utils/asset.py | 10 ++++-----
 utils/login.py | 26 ++++++++---------------
 utils/utils.py |  3 ---
 utils/wait.py  |  4 ++--
 7 files changed, 44 insertions(+), 63 deletions(-)

diff --git a/.gitignore b/.gitignore
index 21727bf..d4b1961 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
-grades/
+grades*
 tmp/
 __pycache__
 chromedriver*
-test*
\ No newline at end of file
+test*
+.vscode/
\ No newline at end of file
diff --git a/README.md b/README.md
index 2453be0..35601b8 100644
--- a/README.md
+++ b/README.md
@@ -20,4 +20,5 @@ Just made it able to download the graded results which may contain annotations.
 ## Note:
 * Does not download turnitin reports. You have to click the link manually to the feedback site.
 * Does not download multiple submission attempts - only downloads the last/graded attempt.
-* Check that the default page is the 'all' category for the marks instead of something else like the submitted category. The script should correct this but just to be safe click on all if it isn't already
\ No newline at end of file
+* Check that the default page is the 'all' category for the marks instead of something else like the submitted category. The script should correct this, but to be safe, click 'All' if it isn't selected already.
+* Sometimes chromedriver closes after logging in when not running in headless mode. Try interacting with the page before logging in.
\ No newline at end of file
diff --git a/main.py b/main.py
index 4364272..ebb0e57 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,10 @@
+#!/usr/bin/env python3
+import requests
 from selenium import webdriver
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.common.by import By
-from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.common.action_chains import ActionChains
 
 # For chrome stuff
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
@@ -12,24 +13,14 @@ from selenium.webdriver.chrome.options import Options
 from urllib.parse import parse_qs, urlparse
 import os
 from os.path import sep
-import requests
-import time
-import getpass
-import json
 import re
-import sys
 import argparse
-import pathlib
 
-import utils.selectors
-from utils.asset import Asset, RequestStack
+from utils.asset import RequestStack
 from utils.wait import SwitchToIFrame, WaitClickable, WaitDiv
 from constants.constants import BASE_URL, DL_DIR
 from utils.login import login
-from utils.selectors import Selectors
-from utils.utils import download_file, friendly_filename, get_assignment_name, get_text_excluding_children, save_html
-import code
-from random import randint
+from utils.utils import download_file, get_assignment_name, save_html
 
 from pathlib import Path
 from selenium.common.exceptions import ElementNotInteractableException
@@ -40,8 +31,6 @@ try:
 except:
     def get_etc(*args): return False
 
-cookie = None
-
 # stupid bug
 def click_the_fing_button(driver,button):
     try:
@@ -54,9 +43,9 @@
     driver.maximize_window()
 
 # You can probably replace this with a recursive method like in blackboard scraper but tbh i just want to get this script done so i can stop working for once.
-def scrape_further(driver,path):
+def scrape_further(driver,path,session):
     # attempts for bb-held tests
-    attempts = driver.find_elements_by_xpath("//a[starts-with(@href, '/webapps/assessment')]")
+    attempts = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]")
     attempts = [ x.get_attribute('href') for x in attempts ]
     for i, attempt in enumerate(attempts):
         name = "attempt_"+str(i)+"_["+parse_qs(urlparse(attempt).query)['attempt_id'][0]+"]"
@@ -66,13 +55,13 @@ def scrape_further(driver,path):
         driver.switch_to.window(driver.window_handles[2])
         save_html(path, name, driver.page_source)
         if testing:
-            get_etc(driver, cookie, path)
+            get_etc(driver, session, path)
         driver.close()
         driver.switch_to.window(driver.window_handles[1])
 
     # submission file for assignment
-    request_stack = RequestStack(cookie)
-    attempts = driver.find_elements_by_xpath("//a[starts-with(@href, '/webapps/assignment/download')]")
+    request_stack = RequestStack(session)
+    attempts = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]")
     attempts = [ x.get_attribute('href') for x in attempts ]
     for i, attempt in enumerate(attempts):
         request_stack.add_file(attempt,path)
@@ -118,13 +107,16 @@ OPTIONS = Options()
 OPTIONS.add_experimental_option("prefs", prefs)
 # OPTIONS.add_argument("--headless")
 driver = webdriver.Chrome(
-    executable_path='chromedriver',
+    executable_path='chromedriver.exe',
     desired_capabilities=CAPABILITIES,
     options=OPTIONS
 )
 driver.maximize_window()
 
-cookie = {'Cookie': login(args, driver)} # do Login.
+cookies = login(args, driver) # do Login.
+session = requests.Session()
+for cookie in cookies:
+    session.cookies.set(cookie["name"], cookie["value"])
 
 # need to load this page JUST to remove the tos warning so it doesnt fuck up everything down the line.
 driver.get(BASE_URL+"/webapps/gradebook/do/student/viewCourses")
@@ -137,15 +129,15 @@ driver.get(BASE_URL+"/webapps/streamViewer/streamViewer?cmd=view&streamName=mygr
 save_html(sep.join(path), 'entrypoint', driver.page_source)
 
 # get courseIDs
-courses = driver.find_element_by_id("left_stream_mygrades")\
-    .find_elements_by_xpath("//div[@role='tab']")
+courses = driver.find_element(By.ID, "left_stream_mygrades")\
+    .find_elements(By.XPATH, "//div[@role='tab']")
 
 course_details = []
 for i, course_results in enumerate(courses):
     course_results = courses[i]
     ActionChains(driver).move_to_element(course_results).perform()
     course_url = course_results.get_attribute("bb:rhs")
-    course_name = course_results.find_elements_by_xpath("//span[@class='stream_area_name']")[i].text
+    course_name = course_results.find_elements(By.XPATH, "//span[@class='stream_area_name']")[i].text
     course_name += " ["+parse_qs(urlparse(course_url).query)['course_id'][0]+"]"
     course_details.append({
         'name': course_name,
@@ -166,19 +158,19 @@ for i, course in enumerate(course_details):
     WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click()
     WaitClickable(driver,(By.XPATH,"//a[@value='A']")).click()
 
-    table = driver.find_elements_by_xpath("//div[@id='grades_wrapper']/div")
+    table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div")
     for i, assignment in enumerate(table):
         print(i)
-        buttons = assignment.find_elements_by_tag_name("input")
+        buttons = assignment.find_elements(By.TAG_NAME, "input")
         block = None
         assignment_name = None
         information_link = False
         try:
-            block = assignment.find_element_by_xpath("./div[@class='cell gradable']/a[@onclick]")
+            block = assignment.find_element(By.XPATH, "./div[@class='cell gradable']/a[@onclick]")
             information_link = True
         except:
-            block = assignment.find_element_by_xpath("./div[@class='cell gradable']")
+            block = assignment.find_element(By.XPATH, "./div[@class='cell gradable']")
         assignment_name = get_assignment_name(driver,block)
         path.append(assignment_name)
 
         # download information if it exists.
@@ -189,7 +181,7 @@ for i, course in enumerate(course_details):
             WebDriverWait(driver,10).until(EC.number_of_windows_to_be(2))
             driver.switch_to.window(driver.window_handles[1])
             save_html(sep.join(path),"information",driver.page_source)
-            scrape_further(driver, sep.join(path))
+            scrape_further(driver, sep.join(path), session)
             driver.close()
             driver.switch_to.window(driver.window_handles[0])
         except ElementNotInteractableException:
@@ -203,10 +195,10 @@ for i, course in enumerate(course_details):
             driver.switch_to.window(driver.window_handles[1])
             WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer"))
             save_html(sep.join(path),"rubric",driver.page_source)
-            driver.find_element_by_xpath("//li[@id='listViewTab']/a").click()
+            driver.find_element(By.XPATH, "//li[@id='listViewTab']/a").click()
             WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList"))
             save_html(sep.join(path),"list",driver.page_source)
-            detailed_buttons = driver.find_elements_by_xpath("//div[@class='u_controlsWrapper']/input")
+            detailed_buttons = driver.find_elements(By.XPATH, "//div[@class='u_controlsWrapper']/input")
             detailed_buttons[1].click()
             detailed_buttons[0].click()
             save_html(sep.join(path),"list_detailed",driver.page_source)
diff --git a/utils/asset.py b/utils/asset.py
index 65f008b..b4eb000 100644
--- a/utils/asset.py
+++ b/utils/asset.py
@@ -1,4 +1,3 @@
-import wget
 from constants.constants import BASE_URL
 import re
 import hashlib
@@ -26,6 +25,7 @@ class RequestStack:
 
     def download_all(self):
         for file in self.request_stack:
+            print(f"\tDownloading {file.url}")
             file.download(self.token)
 
 class Asset:
@@ -36,14 +36,14 @@ class Asset:
         self.path.mkdir(parents=True, exist_ok=True)
         super().__init__()
 
-    def download(self,req_headers):
-        response = requests.get(BASE_URL+self.url, stream=True, headers=req_headers, allow_redirects=False)
+    def download(self,session):
+        response = session.get(BASE_URL+self.url, stream=True, allow_redirects=False)
         headers = response.headers
         if response.status_code == 302 and len(headers['location']) > 0:
-            Asset(headers['location'], self.path).download(req_headers)
+            Asset(headers['location'], self.path).download(session)
             return
         elif response.status_code != 200:
-            print("Error "+str(response.status_code))
+            print("[!] Error "+str(response.status_code))
Error "+str(response.status_code)) return response.status_code headers = { x:re.sub(r'^"*|"*?$', '', headers.get(x)) for x in headers } # ewww regex if 'Content-Disposition' in headers.keys(): diff --git a/utils/login.py b/utils/login.py index daef890..c74f0d6 100644 --- a/utils/login.py +++ b/utils/login.py @@ -1,6 +1,6 @@ +import sys from utils.wait import WaitClickable from utils.selectors import Selectors -import sys from selenium.webdriver.support.wait import WebDriverWait from urllib.parse import urlparse from selenium.webdriver.support import expected_conditions as EC @@ -9,16 +9,8 @@ from constants.constants import BASE_URL import re import json -def try_cookie(driver): - for entry in driver.get_log('performance'): - parameters = json.loads(entry["message"])['message']['params'] - if ( - 'documentURL' in parameters.keys() - and re.search(r'https://lms.uwa.edu.au/webapps/portal.*', parameters['documentURL']) != None - ): - return parameters['redirectResponse']['requestHeaders']['Cookie'] - def login(args, driver): + driver.get(BASE_URL) USERNAME = args.username if len(USERNAME) == 0: print('UserID: ') @@ -26,8 +18,6 @@ def login(args, driver): USERNAME += '@student.uwa.edu.au' print('Password: ') PASSWORD = getpass('') - - driver.get(BASE_URL) WaitClickable(driver,Selectors.BOX_USERNAME).send_keys(USERNAME) WaitClickable(driver,Selectors.BUTTON_NEXT).click() @@ -44,10 +34,10 @@ def login(args, driver): WaitClickable(driver,Selectors.BUTTON_DENY).click() # WaitClickable(driver,BUTTON_NEXT).click() #IF you want to remember credentials, switch these comments - current_uri = urlparse(driver.current_url) - if '{uri.scheme}://{uri.netloc}'.format(uri=current_uri) != BASE_URL: - driver.quit() - print("Login failed.") - exit(-1) - return try_cookie(driver) \ No newline at end of file + cookie = driver.get_cookies() + if not cookie == None: return cookie + + print('Could not get auth cookie - Invalid ID or password?', file=sys.stderr) + driver.quit() + exit(1) \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py index 068221b..358fb92 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -1,9 +1,6 @@ import pathlib import re from constants.constants import DL_DIR -from utils.wait import WaitClickable -from utils.asset import Asset -from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import time import os diff --git a/utils/wait.py b/utils/wait.py index b7eeca7..c582169 100644 --- a/utils/wait.py +++ b/utils/wait.py @@ -1,7 +1,7 @@ from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC -timeout = 4 -# find_element_safe = lambda name,timeout=30:WebDriverWait(driver, timeout).until(lambda x: x.find_element_by_id(name)) +timeout = 5 +# find_element_safe = lambda name,timeout=30:WebDriverWait(driver, timeout).until(lambda x: x.find_element(By.ID, name)) WaitClickable = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator)) WaitDiv = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator)) SwitchToIFrame = lambda driver,locator:WebDriverWait(driver, timeout).until(EC.frame_to_be_available_and_switch_to_it(locator)) \ No newline at end of file