"""Scrape the "Schedule extension" fields from a mintos.com web page."""

import re
import sys

import requests
from lxml import etree, html

# Matches "<used> ... / ... <possible>", e.g. "1 / 6". Raw string so that
# ``\d`` and ``\/`` reach the regex engine instead of being (invalid) string
# escapes.
_EXTENSIONS_RE = re.compile(r"(\d+).*?\/.*?(\d+)")

# Seconds to wait for the server; without a timeout ``requests`` can block
# forever on an unresponsive connection.
_REQUEST_TIMEOUT = 30


def _value_texts(dom, title_matches):
    """Yield the stripped link text of every description row that matches.

    Walks each ``tr > td.field-description`` cell in *dom*; for every cell
    whose stripped text satisfies ``title_matches(title)``, looks up the
    ``td.value > a`` element in the same row and yields its stripped text.

    Raises:
        Exception: if a matching row does not contain exactly one
            ``td.value > a`` element.
    """
    for td in dom.cssselect("tr > td.field-description"):
        # ``.text`` is None when the cell has no leading text node (e.g. it
        # starts with a child element); treat that as an empty title.
        title = (td.text or "").strip()
        if not title_matches(title):
            continue
        anchors = td.getparent().cssselect("td.value > a")
        if len(anchors) != 1:
            raise Exception("Unexpected page format")
        yield (anchors[0].text or "").strip()


def scrape_id(id):
    """Fetch the page for *id* and extract its schedule-extension details.

    Args:
        id: Identifier interpolated into the page URL. (The name shadows the
            ``id`` builtin but is kept so keyword callers continue to work.)

    Returns:
        A dict with three keys:
          * ``"extension"`` - True when the "Schedule extension" row reads
            "Yes", otherwise False (also False when the row is absent).
          * ``"no_of_extensions"`` - first integer parsed from the
            "Number of schedule extensions" row, or None.
          * ``"possible_extensions"`` - second integer parsed from that row,
            or None.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        Exception: if a matched row does not have the expected structure or
            the extension counts cannot be parsed.
    """
    complete_url = "https://www.mintos.com/webapp/en/{id}/".format(id=id)
    print("Scraping id {id}".format(id=id))

    page = requests.get(complete_url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # reporting "no extension".
    page.raise_for_status()
    dom = html.fromstring(page.content)

    result = {
        "extension": False,
        "no_of_extensions": None,
        "possible_extensions": None,
    }

    # Only the first "Schedule extension" row is consulted.
    extension_text = next(
        _value_texts(dom, lambda title: title == "Schedule extension"),
        None,
    )
    if extension_text is not None:
        result["extension"] = extension_text == "Yes"

    if result["extension"]:
        # Every matching row is processed; if several match, the last wins.
        for extensions_str in _value_texts(
            dom, lambda title: "Number of schedule extensions" in title
        ):
            matches = _EXTENSIONS_RE.match(extensions_str)
            if matches is None:
                raise Exception("Unexpected page format (regex failed)")
            result["no_of_extensions"] = int(matches.group(1))
            result["possible_extensions"] = int(matches.group(2))

    return result


def main():
    """Command-line entry point: scrape the id given as the first argument."""
    if len(sys.argv) < 2:
        print("Usage: scrape.py <id>")
        # NOTE(review): exit status 0 on a usage error is kept from the
        # original; consider a non-zero status.
        sys.exit(0)

    result = scrape_id(sys.argv[1])
    print("Schedule Extension: {}".format(result["extension"]))
    if result["extension"]:
        print("Extension count: {}".format(result["no_of_extensions"]))
        print("Max extensions: {}".format(result["possible_extensions"]))


if __name__ == "__main__":
    main()