commit 1d199bfcfa150e5e29f2990896aef9c41d6d3e2d
Author: Jan Scheiper
Date:   Thu Apr 16 19:54:20 2020 +0200

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..98a1400
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.vscode
+*.txt
+*.csv
+__pycache__
\ No newline at end of file
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..0474422
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,93 @@
+"""Scrape schedule-extension data for a single Mintos loan page."""
+
+from lxml import html, etree
+import requests, re, sys
+
+# Seconds to wait for the server before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+# Two integers separated by a slash, e.g. "1 / 6".
+EXTENSIONS_RE = re.compile(r"(\d+).*?\/.*?(\d+)")
+
+
+def _row_values(dom, title_matches):
+    """Yield the stripped value text of every row whose title matches.
+
+    title_matches is called with the stripped text of each
+    "td.field-description" cell. Raises Exception when a matching row
+    does not contain exactly one "td.value > a" element.
+    """
+    for td in dom.cssselect("tr > td.field-description"):
+        # lxml gives text=None when a cell starts with a child element.
+        if not title_matches((td.text or "").strip()):
+            continue
+
+        links = td.getparent().cssselect("td.value > a")
+        if len(links) != 1:
+            raise Exception("Unexpected page format")
+
+        yield (links[0].text or "").strip()
+
+
+def scrape_id(id):
+    """Fetch a loan page from mintos.com and read its extension data.
+
+    id is interpolated into the page URL as-is (the name shadows the
+    builtin but is kept so keyword callers keep working).
+
+    Returns a dict with the keys "extension" (bool), "no_of_extensions"
+    and "possible_extensions" (int, or None when the loan has no
+    schedule extension or the counter row is missing).
+
+    Raises requests.RequestException on network/HTTP errors and
+    Exception when the page does not have the expected layout.
+    """
+    complete_url = "https://www.mintos.com/webapp/en/{id}/".format(id=id)
+    print("Scraping id {id}".format(id=id))
+
+    # Without a timeout requests can block forever on a stalled server.
+    page = requests.get(complete_url, timeout=REQUEST_TIMEOUT)
+    # Fail on HTTP errors rather than parsing an error page as a loan.
+    page.raise_for_status()
+    dom = html.fromstring(page.content)
+
+    result = {
+        "extension": False,
+        "no_of_extensions": None,
+        "possible_extensions": None,
+    }
+
+    # Only the first "Schedule extension" row is consulted.
+    extension_text = next(
+        _row_values(dom, lambda title: title == "Schedule extension"), None
+    )
+    result["extension"] = extension_text == "Yes"
+
+    if result["extension"]:
+        counter_rows = _row_values(
+            dom, lambda title: "Number of schedule extensions" in title
+        )
+        for extensions_str in counter_rows:
+            matches = EXTENSIONS_RE.match(extensions_str)
+
+            if matches is None:
+                raise Exception("Unexpected page format (regex failed)")
+
+            result["no_of_extensions"] = int(matches.group(1))
+            result["possible_extensions"] = int(matches.group(2))
+
+    return result
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: scrape.py LOAN_ID")
+        sys.exit(1)
+
+    result = scrape_id(sys.argv[1])
+
+    print("Schedule Extension: {}".format(result["extension"]))
+
+    if result["extension"]:
+        print("Extension count: {}".format(result["no_of_extensions"]))
+        print("Max extensions: {}".format(result["possible_extensions"]))
\ No newline at end of file
diff --git a/update_list.py b/update_list.py
new file mode 100644
index 0000000..7e81185
--- /dev/null
+++ b/update_list.py
@@ -0,0 +1,35 @@
+"""Scrape every loan id listed in a file and write one CSV row per id."""
+
+from scrape import scrape_id
+import csv, sys, datetime
+
+if len(sys.argv) < 3:
+    print("Usage: update_list.py LIST_FILE UPDATE_FILE")
+    sys.exit(1)
+
+list_filename = sys.argv[1]
+update_filename = sys.argv[2]
+
+current_time_string = "{0:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now())
+
+# "with" closes both files even when scraping raises; newline="" is what
+# the csv module asks for so it can control line endings itself.
+with open(list_filename) as list_file, \
+        open(update_filename, "w", newline="") as update_file:
+    update_file_writer = csv.writer(update_file)
+
+    for id_str in list_file:
+        loan_id = id_str.strip()
+        if not loan_id:
+            # Blank lines (e.g. a trailing newline) are not loan ids.
+            continue
+
+        result = scrape_id(loan_id)
+
+        update_file_writer.writerow([
+            loan_id,
+            current_time_string,
+            result["extension"],
+            result["no_of_extensions"],
+            result["possible_extensions"],
+        ])
\ No newline at end of file