#!/usr/bin/env python3
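"""Scrape every course in Blackboard's "My Grades" stream.

For each course this saves the grade table, per-assignment information pages,
rubrics, submission receipts, and any attempt/feedback files it can find,
using Selenium for navigation and a requests session (seeded with the
Selenium cookies) for file downloads.
"""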

import requests

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
# For chrome stuff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import ElementNotInteractableException
# ---
from urllib.parse import parse_qs, urlparse
import os
from os.path import sep
from pathlib import Path
import re
import argparse

from utils.asset import RequestStack
from utils.wait import SwitchToIFrame, WaitClickable, WaitDiv
from utils.login import login
from utils.utils import download_file, get_assignment_name, save_html
from constants.constants import BASE_URL, DL_DIR

testing = False
try:
    # Optional test helper; fall back to a no-op when it is not available.
    from utils.test import get_etc
    testing = True
except ImportError:
    def get_etc(*args): return False


# Workaround for a stupid bug: Selenium sometimes refuses to click this
# button, so nudge the window size and retry.
def click_the_fing_button(driver, button):
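    """Click `button`, resizing the window and retrying if the click is swallowed."""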
    try:
        ActionChains(driver).move_to_element(button).click(button).perform()
        WebDriverWait(driver, 2).until(EC.number_of_windows_to_be(2))
    except:
        # hack to wake selenium up when it doesn't want to click the button!
        driver.set_window_size(1024, 768)
        click_the_fing_button(driver, button)
    driver.maximize_window()


# You could probably replace this with a recursive method like in the
# blackboard scraper, but tbh I just want to get this script done so I can
# stop working for once.
def scrape_further(driver, path, session):
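    """Save attempt pages and queue file/feedback downloads from a grade-detail window.

    `path` is the sep-joined output directory for this assignment and
    `session` is the requests session holding the Blackboard cookies.
    """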
    # attempts for bb-held tests
    attempts = driver.find_elements(
        By.XPATH, "//a[starts-with(@href, '/webapps/assessment')]")
    attempts = [x.get_attribute('href') for x in attempts]
    for i, attempt in enumerate(attempts):
        name = "attempt_" + str(i) + "_[" + \
            parse_qs(urlparse(attempt).query)['attempt_id'][0] + "]"
        attempt = re.sub("^" + BASE_URL, "", attempt)
        driver.execute_script("window.open('" + BASE_URL + attempt + "')")
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(3))
        driver.switch_to.window(driver.window_handles[2])
        save_html(path, name, driver.page_source)
        if testing:
            get_etc(driver, session, path)
        driver.close()
        driver.switch_to.window(driver.window_handles[1])

    # Comments may contain feedback links
    request_stack = RequestStack(session)
    etc_files = driver.find_elements(
        By.XPATH, "//a[contains(@href, '/bbcswebdav')]")
    etc_files = [x.get_attribute('href') for x in etc_files]
    for item in etc_files:
        if item is not None and "bbcswebdav" in item:
            request_stack.add_file(item, path)

    # submission file for assignment
    attempts = driver.find_elements(
        By.XPATH, "//a[starts-with(@href, '/webapps/assignment/download')]")
    attempts = [x.get_attribute('href') for x in attempts]
    for attempt in attempts:
        request_stack.add_file(attempt, path)

    get_feedback = False
    try:
        # The download button opens a tab that downloads the file and then
        # disappears, so we need to capture the URL to get the metadata and
        # download to the correct location; can't be arsed to figure out how
        # the PSPDFKit JS that executes this download works.
        SwitchToIFrame(
            driver, (By.XPATH, "//iframe[@class='docviewer_iframe_embed']"))
        SwitchToIFrame(driver, (By.XPATH, "//iframe[@title='PSPDFKit']"))
        get_feedback = True
    except:
        print("No feedback to download")
    if get_feedback:
        dl_button = WaitClickable(
            driver, (By.XPATH, "//button[contains(@class, 'PSPDFKit-Toolbar-Button PSPDFKit-Tool-Button')][@title='Download']"))
        dl_button.click()
        download_file(path)
    request_stack.download_all()
# end of scrape_further
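

# Main script: parse arguments, configure Chrome, log in, then walk every
# course in the "My Grades" stream.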
parser = argparse.ArgumentParser(description='Automated Microsoft SSO login.')
# parser.add_argument("-p", "--password", help="Automatically use provided password", default="")
parser.add_argument("-u", "--username",
                    help="Automatically use provided userID", default="")

path = ['grades']
args = parser.parse_args()
CAPABILITIES = DesiredCapabilities.CHROME
CAPABILITIES['goog:loggingPrefs'] = {
    'performance': 'ALL',
}
# Start each run with an empty download directory.
for f in os.listdir(DL_DIR):
    os.remove(Path(DL_DIR).joinpath(f))
# Send Chrome downloads straight to DL_DIR without a save-as prompt.
prefs = {
    "profile.default_content_settings.popups": 0,
    "download.default_directory": DL_DIR,
    "directory_upgrade": True
}
OPTIONS = Options()
OPTIONS.add_argument('--no-sandbox')
OPTIONS.add_argument('--disable-dev-shm-usage')
OPTIONS.add_experimental_option("prefs", prefs)
# OPTIONS.add_argument("--headless")
driver = webdriver.Chrome(
    executable_path='chromedriver.exe',
    desired_capabilities=CAPABILITIES,
    options=OPTIONS
)
driver.maximize_window()
cookies = login(args, driver)  # do login.
# Re-use the Selenium cookies in a plain requests session for direct downloads.
session = requests.Session()
for cookie in cookies:
    session.cookies.set(cookie["name"], cookie["value"])

# need to load this page JUST to remove the tos warning so it doesn't mess up
# everything down the line.
driver.get(BASE_URL + "/webapps/gradebook/do/student/viewCourses")
try:
    WaitClickable(driver, (By.CLASS_NAME, "button-1")).click()
except:
    print("no tos warning - skipped")

driver.get(
    BASE_URL + "/webapps/streamViewer/streamViewer?cmd=view&streamName=mygrades")
save_html(sep.join(path), 'entrypoint', driver.page_source)

WaitClickable(driver, (By.ID, "left_stream_mygrades"))
# get courseIDs
courses = driver.find_element(By.ID, "left_stream_mygrades") \
    .find_elements(By.XPATH, "//div[@role='tab']")
course_details = []
for i, course_results in enumerate(courses):
    ActionChains(driver).move_to_element(course_results).perform()
    course_url = course_results.get_attribute("bb:rhs")
    course_name = course_results.find_elements(
        By.XPATH, "//span[@class='stream_area_name']")[i].text
    course_name += "[" + \
        parse_qs(urlparse(course_url).query)['course_id'][0] + "]"
    course_details.append({
        'name': course_name,
        'url': course_url
    })

for course in course_details:
    path.append(course['name'])  # course name
    print(course['name'])
    driver.get(BASE_URL + course['url'])
    # Make grade links open in a new window instead of the in-page content frame.
    driver.execute_script("""
        mygrades.loadContentFrame = function(url) {
            window.open(url);
        }
    """)
    # Show all grades.
    WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()
    WaitClickable(driver, (By.XPATH, "//a[@value='A']")).click()

    table = driver.find_elements(By.XPATH, "//div[@id='grades_wrapper']/div")
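
    # Walk every row of this course's grades table.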
    for i, assignment in enumerate(table):
        print(i)
        buttons = assignment.find_elements(By.TAG_NAME, "input")
        block = None
        assignment_name = None
        information_link = False
        try:
            block = assignment.find_element(
                By.XPATH, "./div[@class='cell gradable']/a[@onclick]")
            information_link = True
        except:
            block = assignment.find_element(
                By.XPATH, "./div[@class='cell gradable']")
        assignment_name = get_assignment_name(driver, block)
        path.append(assignment_name)
        # download information if it exists.
        if information_link:
            try:
                ActionChains(driver).move_to_element(
                    block).click(block).perform()
                print("Switched " + assignment_name)
                WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
                driver.switch_to.window(driver.window_handles[1])
                save_html(sep.join(path), "information", driver.page_source)
                scrape_further(driver, sep.join(path), session)
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
            except ElementNotInteractableException:
                print("information link not interactable - skipping")
        # download rubric if it exists.
        for button in buttons:
            action = button.get_attribute("onclick")
            if action is not None and "showInLightBox" not in action:
                click_the_fing_button(driver, button)
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight)")
                driver.switch_to.window(driver.window_handles[1])
                WaitDiv(driver, (By.CLASS_NAME, "rubricControlContainer"))
                save_html(sep.join(path), "rubric", driver.page_source)
                driver.find_element(
                    By.XPATH, "//li[@id='listViewTab']/a").click()
                WaitDiv(driver, (By.CLASS_NAME, "rubricGradingList"))
                save_html(sep.join(path), "list", driver.page_source)
                detailed_buttons = driver.find_elements(
                    By.XPATH, "//div[@class='u_controlsWrapper']/input")
                detailed_buttons[1].click()
                detailed_buttons[0].click()
                save_html(sep.join(path), "list_detailed", driver.page_source)
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
        path.pop()
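
    # Per-course summary pages: grade list, submitted items, and receipts.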
    save_html(sep.join(path), path[0], driver.page_source)
    WaitClickable(driver, (By.XPATH, "//a[@value='S']")).click()
    save_html(sep.join(path), "submitted", driver.page_source)
    try:
        WaitClickable(
            driver, (By.XPATH, "//div[@id='submissionReceipts']//a")).click()
        WaitClickable(
            driver, (By.XPATH, "//div[@id='listContainer_itemcount']//a[@class='pagelink']")).click()
    except:
        print('No items?')
    save_html(sep.join(path), "receipts", driver.page_source)
    path.pop()

driver.quit()