69 lines
2.1 KiB
Python
69 lines
2.1 KiB
Python
from lxml import html, etree
|
|
import requests, re, sys
|
|
|
|
# Matches count text of the form "<used> ... / ... <maximum>"; a raw string
# keeps the regex escapes from being read as (invalid) string escapes.
_EXTENSIONS_RE = re.compile(r"(\d+).*?\/.*?(\d+)")

# Upper bound, in seconds, on how long the HTTP request may block.
_REQUEST_TIMEOUT_SECONDS = 30


def _find_value_link(dom, matches_title):
    """Return the value link of the first row whose description matches.

    Walks every ``tr > td.field-description`` cell of *dom*, and for the
    first one whose stripped text satisfies *matches_title* returns the
    single ``td.value > a`` element found in the same row.

    Args:
        dom: Parsed lxml HTML tree of the page.
        matches_title: Callable taking the stripped cell text and returning
            a truthy value for the wanted row.

    Returns:
        The matching ``<a>`` element, or ``None`` when no row matches.

    Raises:
        Exception: If the matching row does not hold exactly one
            ``td.value > a`` element.
    """
    for cell in dom.cssselect("tr > td.field-description"):
        # lxml reports ``.text`` as None when the cell has no leading text
        # (e.g. it starts with a child element); treat that as empty.
        title = (cell.text or "").strip()
        if not matches_title(title):
            continue

        links = cell.getparent().cssselect("td.value > a")
        if len(links) != 1:
            raise Exception("Unexpected page format")
        return links[0]
    return None


def scrape_id(id):
    """Scrape the schedule-extension details from a Mintos web page.

    Downloads ``https://www.mintos.com/webapp/en/{id}/``, reads the
    "Schedule extension" row and, when its value is "Yes", also parses the
    "Number of schedule extensions" row.

    Args:
        id: Identifier substituted into the page URL. (The name shadows the
            builtin but is kept so keyword callers continue to work.)

    Returns:
        A dict with the keys ``"extension"`` (bool), ``"no_of_extensions"``
        and ``"possible_extensions"`` (ints, or ``None`` when the page has
        no extension or the count row is absent).

    Raises:
        Exception: If a matched row does not contain exactly one value link,
            or the extension count text does not match the expected format.
        requests.exceptions.RequestException: If the request times out,
            fails, or returns an HTTP error status.
    """
    complete_url = "https://www.mintos.com/webapp/en/{id}/".format(id=id)
    print("Scraping id {id}".format(id=id))

    # A timeout keeps an unresponsive server from hanging the script, and
    # raise_for_status stops an error page from being parsed as if it were
    # a valid page without a schedule extension.
    page = requests.get(complete_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()
    dom = html.fromstring(page.content)

    result = {
        "extension": False,
        "no_of_extensions": None,
        "possible_extensions": None,
    }

    # Find the "Schedule extension" row.
    extension_link = _find_value_link(
        dom, lambda title: title == "Schedule extension"
    )
    if extension_link is not None:
        result["extension"] = (extension_link.text or "").strip() == "Yes"

    if not result["extension"]:
        return result

    # Only pages with an extension carry the used/maximum counts.
    count_link = _find_value_link(
        dom, lambda title: "Number of schedule extensions" in title
    )
    if count_link is None:
        return result

    matches = _EXTENSIONS_RE.match((count_link.text or "").strip())
    if matches is None:
        raise Exception("Unexpected page format (regex failed)")

    result["no_of_extensions"] = int(matches.group(1))
    result["possible_extensions"] = int(matches.group(2))
    return result
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: expects the page id as the first argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: scrape.py <id>")
        sys.exit(0)

    result = scrape_id(cli_args[0])
    has_extension = result["extension"]

    print("Schedule Extension: {}".format(has_extension))

    # The counts are only reported when the page has a schedule extension.
    if has_extension:
        for template, key in (
            ("Extension count: {}", "no_of_extensions"),
            ("Max extensions: {}", "possible_extensions"),
        ):
            print(template.format(result[key]))