# mmls-sync
#
# public python v1 · immutable
# #1791648
# ·published 2010-02-09 18:43 UTC
"""Synchronise downloadable course content from MMLS Melaka.

The script logs in with the hidden ``hst``/``key`` tokens scraped from the
login page, parses the landing page for the course list, and then, for every
course, follows the navigation frame to the course tree script and downloads
each ``Launch.php`` content link into a ``"<course id> - <course name>"``
directory under the current working directory.  Files that already exist
locally are left untouched.
"""
import re
import os
import os.path
import getpass
import urllib.request
import urllib.parse
import http.cookiejar
import html.parser

# define some global variables
site_hostname = "mmlsmelaka.mmu.edu.my"
site_url = "http://" + site_hostname
site_url_secure = "https://" + site_hostname

# path prefix shared by the navigation frame and the download links
content_nav_path = "/Student/Courses/coursecontent/mynav/"

# timeout (seconds) used for the frame, tree script and file requests
request_timeout = 5

# regexes used while scanning a course for downloadable contents
frame_regex = re.compile(r"sconav_new.php\?t=\d+&lo=no", re.IGNORECASE)
tree_regex = re.compile(r"/Student/Courses/coursecontent/mynav/course_tree/[A-Z]{3}\d{4}.*_tree\.js", re.IGNORECASE)
downloadable_regex = re.compile(r"Launch.php\?.*&*path=Contents/.+\.\w{1,4}", re.IGNORECASE)
filename_regex = re.compile(r"/[\w\s%,.-]+\.\w{1,4}$", re.IGNORECASE)


def _first_attr(attrs, name, default=None):
    """Return the value of the first attribute called *name*, else *default*."""
    for key, value in attrs:
        if key == name:
            return value
    return default


class LoginParser(html.parser.HTMLParser):
    """Collect the ``hst`` and ``key`` input fields of the login form.

    After feeding the login page, the values are available as the ``hst``
    and ``key`` attributes; each stays ``None`` if the field was not seen.
    """

    def __init__(self):
        super().__init__()
        self.hst = None
        self.key = None

    def handle_starttag(self, tag, attrs):
        if tag != "input":
            return

        # inputs without a name (e.g. plain buttons) are simply ignored
        name = _first_attr(attrs, "name")
        value = _first_attr(attrs, "value", "") or ""

        if name == "hst":
            self.hst = value
        elif name == "key":
            self.key = value


class MMLSParser(html.parser.HTMLParser):
    """Extract the course list from the page shown after a successful login.

    A course starts at an ``<a>`` whose ``href`` matches ``link_regex``; the
    first text after that tag is taken as the course id, and the first text
    after the third following ``<td>`` start tag is taken as the course name.
    ``course_list`` holds ``(course_id, course_name, course_link)`` tuples.
    """

    # prepare our regex
    course_regex = re.compile(r"[A-Z]{3}\d{4}")  # not referenced by the parser itself
    link_regex = re.compile(r"/Student/Courses/coursecontent/courses\.php\?crskid=\w{51}&crdid=\d{10}", re.IGNORECASE)
    roman_regex = re.compile(r"^I{1,3}$", re.IGNORECASE)

    def __init__(self):
        super().__init__()

        # parser state
        self._handle_a = False
        self._handle_td = False
        self._handle_td_counter = 0
        self._handle_td_extract = False

        # per-instance result lists (class-level lists would be shared
        # between every parser instance)
        self.course_id = []
        self.course_name = []
        self.course_link = []

        # always defined, so a page without any course yields an empty tuple
        self.course_list = ()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            # anchors without an href are skipped
            href = _first_attr(attrs, "href")
            if href and self.link_regex.match(href):
                # add this link to our course link list
                self.course_link.append(href)
                self._handle_a = True
                self._handle_td = True

        if self._handle_td and tag == "td":
            if self._handle_td_counter >= 2:
                # third <td> after the link: its text is the course name
                self._handle_td = False
                self._handle_td_counter = 0
                self._handle_td_extract = True
            else:
                self._handle_td_counter += 1

    def handle_data(self, data):
        if self._handle_a:
            # text right after the course link is the course id
            self._handle_a = False
            self.course_id.append(data.strip())

        if self._handle_td_extract:
            # convert ANNOYING UPPERCASE to Proper Case, keeping roman
            # numerals (I, II, III) in upper case
            words = [
                word.upper() if self.roman_regex.match(word) else word.capitalize()
                for word in data.strip().split()
            ]
            self.course_name.append(" ".join(words))

            # done extracting
            self._handle_td_extract = False

            # generate the resultant courses list
            self.course_list = tuple(zip(self.course_id, self.course_name, self.course_link))


def _read_text(response):
    """Read *response* fully and decode it without raising on bad bytes."""
    raw = response.read()
    charset = response.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # the server announced a charset Python does not know
        return raw.decode("utf-8", errors="replace")


def _report(subject, status):
    """Print ``"<subject> ... <status>"`` even if the console cannot encode it."""
    line = "{0} ... {1}".format(subject, status)
    try:
        print(line)
    except UnicodeEncodeError:
        print(line.encode("ascii", "backslashreplace").decode("ascii"))


def _fetch_login_tokens(opener):
    """Load the login page and return its ``(hst, key)`` form values."""
    with opener.open(site_url_secure) as response:
        page = _read_text(response)

    parser = LoginParser()
    parser.feed(page)
    parser.close()
    return parser.hst, parser.key


def _login(opener, mmls_hst, mmls_key):
    """Prompt for credentials and log in.

    Returns the parsed course list on success, or ``None`` when the server
    did not redirect to the student main page.
    """
    print()
    mmls_userid = input("User ID: ")
    mmls_password = getpass.getpass("Password: ")
    print()

    # POST data must be bytes; urlencode() output is always plain ASCII
    params = urllib.parse.urlencode(
        {
            "Submit": " Login ",
            "hst": mmls_hst,
            "key": mmls_key,
            "txtPassword": mmls_password,
            "txtUserID": mmls_userid,
        }
    ).encode("ascii")

    with opener.open(site_url_secure + "/check_login.php", params) as response:
        # is login successful?
        if response.url != site_url_secure + "/Student/Default/Main.php":
            return None

        print("Login successful ... processing course list")

        # feed the content to our parser to generate the course list
        parser = MMLSParser()
        parser.feed(_read_text(response))
        parser.close()

    return parser.course_list


def _find_downloads(opener, course_link):
    """Return the download links found in the tree script of one course.

    Raises ``ValueError`` when the navigation frame or the tree script cannot
    be located in the fetched pages.
    """
    # load and parse the course page for frame link
    with opener.open(site_url + course_link) as response:
        page = _read_text(response)
    frame = frame_regex.search(page)
    if frame is None:
        raise ValueError("navigation frame not found")

    # now load the actual frame and look for the tree script
    with opener.open(site_url + content_nav_path + frame.group(), timeout=request_timeout) as response:
        page = _read_text(response)
    tree = tree_regex.search(page)
    if tree is None:
        raise ValueError("course tree script not found")

    # finally we can load the tree script and scan for downloadable contents
    with opener.open(site_url + tree.group(), timeout=request_timeout) as response:
        page = _read_text(response)
    return downloadable_regex.findall(page)


def _build_download_url(download):
    """Return the absolute URL for a download link, or ``None`` if malformed.

    Everything after the first ``&path=`` is percent-quoted segment by
    segment so that the ``/`` separators survive.
    """
    header, separator, path = download.partition("&path=")
    if not separator:
        return None

    quoted = "/".join(urllib.parse.quote(part) for part in path.split("/"))
    return site_url + content_nav_path + header + "&path=" + quoted


def _download(opener, download, directory):
    """Download one content link into *directory* unless it already exists."""
    url = _build_download_url(download)
    name = filename_regex.search(download)
    if url is None or name is None:
        _report(download, "unrecognised download link")
        return

    # extract the file name (drop the leading slash matched by the regex)
    filename = os.path.join(directory, name.group()[1:])

    if os.path.exists(filename):
        # already there: no request is needed to report it
        _report(filename, "latest")
        return

    _report(filename, "downloading")
    try:
        # read everything before creating the file, so a failed transfer does
        # not leave an empty file that later runs would treat as "latest"
        with opener.open(url, timeout=request_timeout) as downstream:
            data = downstream.read()
        with open(filename, "wb") as out:
            out.write(data)
    except Exception as e:
        # best effort: report the failure and carry on with the next file
        _report(filename, e)


def _synchronize(opener, course_list):
    """Scan every course and download the contents that are missing locally."""
    print()
    print("Course list acquired ... now scanning for downloadable contents")
    print()

    for course_id, course_name, course_link in course_list:
        directory = "{0} - {1}".format(course_id, course_name)

        try:
            downloads = _find_downloads(opener, course_link)
        except (OSError, ValueError) as e:
            # one unreachable or unexpected course page must not stop the rest
            _report(directory, e)
            continue

        # create a folder to store these files
        if not os.path.exists(directory):
            os.mkdir(directory)

        for download in downloads:
            _download(opener, download, directory)

    # done !
    print()
    print("Content synchronized with MMLS Melaka")


def main():
    """Run the interactive login and synchronisation."""
    # prepare the connection
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj)
    )

    try:
        # connect to MMLS Melaka
        print("Connecting to MMLS Melaka ...")
        mmls_hst, mmls_key = _fetch_login_tokens(opener)

        # attempt to login (pointless without the form tokens)
        course_list = None
        if mmls_hst is not None and mmls_key is not None:
            course_list = _login(opener, mmls_hst, mmls_key)

        if course_list is None:
            # failed
            print("Something went wrong!")
        else:
            _synchronize(opener, course_list)
    finally:
        # cleanup
        opener.close()

    # press any key to continue
    input(" ")


if __name__ == "__main__":
    main()