import os
import pathlib
import re
import shutil
import time
from pathlib import Path

from selenium.webdriver.support import expected_conditions as EC

from constants.constants import DL_DIR

# Patterns are raw strings so that the regex engine, not the Python string
# parser, interprets the escapes.  In a plain literal "\b" is a backspace
# character (U+0008) rather than a word boundary.
_PATH_SEPARATORS = re.compile(r"[\\/]")
_CONTROL_CHARS = re.compile(r"[\x00-\x1f]")
_RESERVED_CHARS = re.compile(r'[:<>"|?*]')
_EXTRA_WHITESPACE = re.compile(r"(^|\b\s)\s+($|\s?\b)")


def friendly_filename(name: str) -> str:
    """Return *name* sanitised for use as a single file name.

    Applies :func:`friendly_dirname` and then additionally removes forward
    slashes and backslashes, so the result contains no path separator.
    """
    return _PATH_SEPARATORS.sub("", friendly_dirname(name))


def friendly_dirname(name: str) -> str:
    """Return *name* with characters unsuitable for a directory name removed.

    Steps, in order:

    1. remove control characters (code points 0x00-0x1f);
    2. remove the characters ``: < > " | ? *``;
    3. collapse a run of whitespace that sits between word characters (or at
       the start/end of the string next to a word character) down to its
       first whitespace character;
    4. strip leading and trailing whitespace.

    Path separators are left in place; use :func:`friendly_filename` when
    they must be removed as well.
    """
    # Earlier reference notes kept from the original (they look like a Ruby
    # pipeline):
    #   .gsub(/[^\w\s_-]+/, '')
    #   .gsub(/\s+/, '_')
    name = _CONTROL_CHARS.sub("", name)
    name = _RESERVED_CHARS.sub("", name)
    # Groups 1 and 2 hold the whitespace to keep; the run between them is
    # dropped.
    name = _EXTRA_WHITESPACE.sub(r"\1\2", name)
    return name.strip()


def get_assignment_name(driver, block):
    """Return the sanitised text of *block*, ignoring its child elements.

    The text is read through :func:`get_text_excluding_children`, cleaned
    with :func:`friendly_filename`, printed to stdout, and returned.
    """
    title = friendly_filename(get_text_excluding_children(driver, block))
    print("Assesment: " + title)
    return title


def save_html(dir: str, filename: str, page_source: str) -> None:
    """Write *page_source* to ``<dir>/<filename>.html`` as UTF-8.

    *dir* is sanitised with :func:`friendly_dirname` and created (including
    parents) if it does not exist; *filename* is sanitised with
    :func:`friendly_filename`.  An existing file is overwritten.
    """
    # NOTE(review): friendly_dirname also strips ':' so a drive-qualified
    # path such as "C:\\out" would lose its colon -- confirm callers pass
    # relative paths.
    target_dir = pathlib.Path(friendly_dirname(dir))
    target_dir.mkdir(parents=True, exist_ok=True)
    html_path = target_dir.joinpath(friendly_filename(filename) + ".html")
    with open(html_path, "w", encoding="utf-8") as handle:
        handle.write(page_source)


# NOTE: Switching to a "download" tab causes issues, so we must use the
# built-in download in Chrome, which does not have etag or metadata
# information.  Files are served from annotate-au.foundations.blackboard.com
# and not the bbcswebdav system, so the tag may not exist in the first place.
def download_file(dest):
    """Move finished downloads from ``DL_DIR`` into *dest*.

    Each entry of ``DL_DIR`` is moved to ``dest/MARKED__<name>``.  When an
    entry with a ``.crdownload`` suffix is encountered the pass is
    abandoned, the function sleeps (1 s at first, growing by a factor of 1.5
    after every wait) and the directory is listed again.  The function
    returns once ``DL_DIR`` is empty.

    *dest* is anything accepted by :class:`pathlib.Path`.
    """
    downloads = Path(DL_DIR)
    # Fixed initial wait before the first look at the directory (presumably
    # to let the browser start the download).
    time.sleep(2)
    poll = 1.0
    while True:
        for entry in os.listdir(downloads):
            if Path(entry).suffix == ".crdownload":
                # A download is still in progress: back off, then re-list.
                time.sleep(poll)
                poll *= 1.5
                break

            source = downloads.joinpath(entry)
            target = Path(dest).joinpath("MARKED__" + entry)
            try:
                shutil.move(source, target)
            except shutil.SameFileError:
                # Kept from the original: clear the destination and retry.
                # NOTE(review): SameFileError means source and target are
                # the same file, so this retry looks unlikely to help --
                # confirm the intended case.
                os.remove(target)
                shutil.move(source, target)

        if not os.listdir(downloads):
            return


# https://stackoverflow.com/a/19040341
def get_text_excluding_children(driver, element):
    """Return the text of *element*'s own text nodes, excluding descendants.

    Runs a jQuery snippet through ``driver.execute_script`` that keeps only
    the direct child nodes whose type is ``Node.TEXT_NODE`` and returns
    their concatenated text.  The page must have ``jQuery`` available.
    """
    script = (
        " return jQuery(arguments[0]).contents().filter(function() {"
        " return this.nodeType == Node.TEXT_NODE; }).text(); "
    )
    return driver.execute_script(script, element)