"""Synchronise downloadable course content from MMLS Melaka.

The script logs in to the MMLS Melaka site, reads the student's course list
from the landing page, and for every course downloads each file referenced by
the course navigation tree into a local folder named "<course id> - <name>".
Files that already exist locally are left untouched.
"""

import re
import os
import os.path
import getpass
import urllib.request
import urllib.parse
import http.cookiejar
import html.parser

# define some global variables
site_hostname = "mmlsmelaka.mmu.edu.my"
site_url = "http://" + site_hostname
site_url_secure = "https://" + site_hostname

# Hidden login-form tokens.  LoginParser mirrors the values it finds into
# these module-level names (kept for backward compatibility); they stay None
# until a matching <input> has been parsed.
mmls_hst = None
mmls_key = None

# base URL that the frame link and the download links are appended to
_mynav_url = site_url + "/Student/Courses/coursecontent/mynav/"

# prepare our regex
frame_regex = re.compile(r"sconav_new.php\?t=\d+&lo=no", re.IGNORECASE)
tree_regex = re.compile(r"/Student/Courses/coursecontent/mynav/course_tree/[A-Z]{3}\d{4}.*_tree\.js", re.IGNORECASE)
downloadable_regex = re.compile(r"Launch.php\?.*&*path=Contents/.+\.\w{1,4}", re.IGNORECASE)
filename_regex = re.compile(r"/[\w\s%,.-]+\.\w{1,4}$", re.IGNORECASE)


def _first_attr(attrs, wanted, default=None):
    """Return the value of the first attribute named *wanted*, or *default*.

    *attrs* is the list of (name, value) pairs that HTMLParser passes to
    handle_starttag.  A missing attribute yields *default* instead of raising.
    """
    return next((value for key, value in attrs if key == wanted), default)


class LoginParser(html.parser.HTMLParser):
    """Collect the hidden "hst" and "key" fields of the login form.

    The values are stored on the instance as ``hst`` and ``key`` (None until
    seen) and are also written to the module globals ``mmls_hst`` and
    ``mmls_key``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hst = None
        self.key = None

    def handle_starttag(self, tag, attrs):
        """Record the value of <input name="hst"> / <input name="key">."""
        global mmls_hst, mmls_key
        if tag != "input":
            return
        # inputs without a name (e.g. plain submit buttons) are simply ignored
        name = _first_attr(attrs, "name")
        # a missing or valueless "value" attribute is treated as empty
        value = _first_attr(attrs, "value") or ""
        if name == "hst":
            self.hst = mmls_hst = value
        elif name == "key":
            self.key = mmls_key = value


class MMLSParser(html.parser.HTMLParser):
    """Extract (course id, course name, course link) triples from a page.

    A course starts at an <a> whose href matches ``link_regex``; the first
    text seen after that anchor is taken as the course id, and the text that
    follows the third <td> opened after the anchor is taken as the course
    name.  The result is available as ``course_list``, a tuple of
    (id, name, link) tuples, which is empty until a course name was parsed.
    """

    # prepare our regex
    course_regex = re.compile(r"[A-Z]{3}\d{4}")
    link_regex = re.compile(r"/Student/Courses/coursecontent/courses\.php\?crskid=\w{51}&crdid=\d{10}", re.IGNORECASE)
    roman_regex = re.compile(r"^I{1,3}$", re.IGNORECASE)

    # parser state flags
    _handle_a = 0
    _handle_td = 0
    _handle_td_counter = 0
    _handle_td_extract = 0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # per-instance lists, so that separate parsers never share results
        self.course_id = []
        self.course_name = []
        self.course_link = []
        self.course_list = ()

    def handle_starttag(self, tag, attrs):
        """Track course anchors and count the <td> cells that follow them."""
        if tag == "a":
            # anchors without an href (e.g. <a name="...">) yield None here
            href = _first_attr(attrs, "href")
            if href and self.link_regex.match(href):
                # add this link to our course link list
                self.course_link.append(href)
                self._handle_a = 1
                self._handle_td = 1

        if self._handle_td and tag == "td":
            if self._handle_td_counter >= 2:
                # third cell reached: reset flag and counter, extract next text
                self._handle_td = 0
                self._handle_td_counter = 0
                self._handle_td_extract = 1
            else:
                # increment counter
                self._handle_td_counter += 1

    def handle_data(self, data):
        """Consume the text that belongs to a pending course id or name."""
        if self._handle_a:
            # the first text after a course anchor is the course id
            self._handle_a = 0
            self.course_id.append(data.strip())

        if self._handle_td_extract:
            # convert ANNOYING UPPERCASE to Proper Case, but keep the roman
            # numerals matched by roman_regex (I, II, III) in upper case
            words = [
                word.upper() if self.roman_regex.match(word) else word.capitalize()
                for word in data.strip().split()
            ]
            self.course_name.append(" ".join(words))
            # done extracting
            self._handle_td_extract = 0
            # generate the resultant courses list
            self.course_list = tuple(zip(self.course_id, self.course_name, self.course_link))


def _fetch_text(opener, url, **open_kwargs):
    """Open *url* with *opener*, return the decoded body, always closing it."""
    response = opener.open(url, **open_kwargs)
    try:
        return response.read().decode()
    finally:
        response.close()


def _build_download_url(download):
    """Turn a matched "Launch.php?...path=..." link into an absolute URL.

    Every segment of the path value is percent-quoted.  Returns None when the
    link contains neither "&path=" nor "?path=".
    """
    pos = download.find("&path=")
    if pos == -1:
        # downloadable_regex also accepts links where path is the first
        # query parameter, i.e. without a leading "&"
        pos = download.find("?path=")
    if pos == -1:
        return None
    header = download[:pos]
    separator = download[pos:pos + 6]
    path = "/".join([urllib.parse.quote(x) for x in download[pos + 6:].split("/")])
    return _mynav_url + header + separator + path


def _download_file(opener, url, filename):
    """Download *url* into *filename* unless that file already exists."""
    if os.path.exists(filename):
        # already there, nothing to fetch
        print("{0} ... latest".format(filename))
        return

    downstream = opener.open(url, timeout=5)
    try:
        print("{0} ... downloading".format(filename))
        # read everything before creating the file, so that a failed
        # transfer does not leave an empty file behind
        data = downstream.read()
    finally:
        downstream.close()

    with open(filename, "wb") as out:
        out.write(data)


def _sync_download(opener, directory, download):
    """Download one matched link into *directory*, reporting any failure."""
    url = _build_download_url(download)
    name_match = filename_regex.search(download)
    if url is None or name_match is None:
        # ascii() keeps the message printable on any console encoding
        print("{0} ... skipped (unrecognised download link)".format(ascii(download)))
        return

    # extract the file name (drop the leading "/")
    filename = os.path.join(directory, name_match.group()[1:])
    try:
        _download_file(opener, url, filename)
    except UnicodeEncodeError as e:
        # report instead of silently dropping the file; escape the name so
        # that printing cannot fail for the same reason
        print("{0} ... skipped ({1})".format(ascii(filename), e))
    except Exception as e:
        # best effort: report and carry on with the next file
        print("{0} ... {1}".format(filename, e))


def _sync_course(opener, course):
    """Download all contents of one (id, name, link) course tuple."""
    course_id, course_name, course_link = course

    # load and parse the course page for frame link
    page = _fetch_text(opener, site_url + course_link)
    frame = frame_regex.search(page)
    if frame is None:
        print("{0} ... no content frame found, skipping".format(course_id))
        return

    # now load the actual frame and look for the tree script
    page = _fetch_text(opener, _mynav_url + frame.group(), timeout=5)
    tree = tree_regex.search(page)
    if tree is None:
        print("{0} ... no course tree found, skipping".format(course_id))
        return

    # finally we can load the tree script and scan for downloadable contents
    page = _fetch_text(opener, site_url + tree.group(), timeout=5)
    downloads = downloadable_regex.findall(page)

    # create a folder to store these files
    directory = "{0} - {1}".format(course_id, course_name)
    if not os.path.exists(directory):
        os.mkdir(directory)

    # download these files
    for download in downloads:
        _sync_download(opener, directory, download)


def _run(opener):
    """Log in through *opener* and synchronise every course found."""
    # connect to MMLS Melaka
    print("Connecting to MMLS Melaka ...")
    login_parser = LoginParser()
    login_parser.feed(_fetch_text(opener, site_url_secure))
    login_parser.close()

    if login_parser.hst is None or login_parser.key is None:
        # without both hidden fields the login request cannot be built
        print("Something went wrong! (login form fields not found)")
        return

    # attempt to login
    print()
    mmls_userid = input("User ID: ")
    mmls_password = getpass.getpass("Password: ")
    print()

    params = urllib.parse.urlencode(
        {
            "Submit" : " Login ",
            "hst" : login_parser.hst,
            "key" : login_parser.key,
            "txtPassword" : mmls_password,
            "txtUserID" : mmls_userid,
        })

    # POST data has to be bytes; urlencode output is plain ASCII
    response = opener.open(site_url_secure + "/check_login.php", params.encode("ascii"))
    try:
        # is login successful?
        logged_in = response.url == site_url_secure + "/Student/Default/Main.php"
        page = response.read().decode() if logged_in else None
    finally:
        response.close()

    if not logged_in:
        # failed
        print("Something went wrong!")
        return

    # login successful
    print("Login successful ... processing course list")

    # feed the content to our parser to generate the course list
    course_parser = MMLSParser()
    course_parser.feed(page)
    course_parser.close()

    # scan for downloadable contents
    print()
    print("Course list acquired ... now scanning for downloadable contents")
    print()

    for course in course_parser.course_list:
        _sync_course(opener, course)

    # done !
    print()
    print("Content synchronized with MMLS Melaka")


def main():
    """Run the synchronisation, then wait for the user to press Enter."""
    # prepare the connection
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj))
    try:
        _run(opener)
    finally:
        # cleanup
        opener.close()

    # press any key to continue
    input(" ")


if __name__ == "__main__":
    main()