initial commit
This commit is contained in:
commit
1d199bfcfa
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
.vscode
|
||||
*.txt
|
||||
*.csv
|
||||
__pycache__
|
||||
69
scrape.py
Normal file
69
scrape.py
Normal file
@ -0,0 +1,69 @@
|
||||
from lxml import html, etree
|
||||
import requests, re, sys
|
||||
|
||||
def _find_field_value(dom, title_matches):
    """Return the stripped text of the single <td class="value"> link in the
    first table row whose field-description cell satisfies *title_matches*.

    Returns None when no row matches; raises Exception when a matching row
    does not contain exactly one value link (unexpected page layout).
    """
    for td in dom.cssselect("tr > td.field-description"):
        # td.text can be None for empty cells; guard before stripping.
        title = (td.text or "").strip()
        if title_matches(title):
            links = td.getparent().cssselect("td.value > a")
            if len(links) != 1:
                raise Exception("Unexpected page format")
            return (links[0].text or "").strip()
    return None


def scrape_id(id):
    """Scrape the Mintos loan page for *id* and report schedule-extension data.

    Parameters
    ----------
    id : str or int
        Loan identifier interpolated into the Mintos loan URL.

    Returns
    -------
    dict
        {"extension": bool — whether the schedule was extended,
         "no_of_extensions": int or None — extensions used so far,
         "possible_extensions": int or None — maximum allowed extensions}

    Raises
    ------
    Exception
        If the page layout does not match expectations.
    """
    # Raw string: the original pattern was a plain string with "\d" / "\/",
    # which are invalid escape sequences in modern Python. Matches e.g.
    # "1 out of 3" / "1/3" — first number = used, second = allowed.
    extensions_re = re.compile(r"(\d+).*?/.*?(\d+)")

    complete_url = "https://www.mintos.com/webapp/en/{id}/".format(id=id)
    print("Scraping id {id}".format(id=id))

    page = requests.get(complete_url)
    dom = html.fromstring(page.content)

    result = {
        "extension": False,
        "no_of_extensions": None,
        "possible_extensions": None,
    }

    # Locate the "Schedule extension" row; absent row means no extension.
    value = _find_field_value(dom, lambda title: title == "Schedule extension")
    result["extension"] = (value == "Yes")

    if result["extension"]:
        extensions_str = _find_field_value(
            dom, lambda title: "Number of schedule extensions" in title
        )
        if extensions_str is not None:
            matches = extensions_re.match(extensions_str)
            if matches is None:
                raise Exception("Unexpected page format (regex failed)")
            result["no_of_extensions"] = int(matches.group(1))
            result["possible_extensions"] = int(matches.group(2))

    return result
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: scrape a single loan id and print the result.
    if len(sys.argv) < 2:
        print("Usage: scrape.py <id>")
        # sys.exit instead of the site-module exit(); non-zero status so
        # shells and callers can detect the usage error (was exit(0)).
        sys.exit(1)

    result = scrape_id(sys.argv[1])

    print("Schedule Extension: {}".format(result["extension"]))

    # Extension counts are only populated when an extension exists.
    if result["extension"]:
        print("Extension count: {}".format(result["no_of_extensions"]))
        print("Max extensions: {}".format(result["possible_extensions"]))
|
||||
24
update_list.py
Normal file
24
update_list.py
Normal file
@ -0,0 +1,24 @@
|
||||
from scrape import scrape_id
|
||||
import csv, sys, datetime
|
||||
|
||||
# Read loan ids from <list.txt> (one per line), scrape each one, and write a
# CSV row per id: id, timestamp, extension flag, used / allowed extensions.
if len(sys.argv) < 3:
    print("Usage: update_list.py <list.txt> <update_file.csv>")
    # Non-zero exit on usage error (was exit(0)); sys.exit is the explicit form.
    sys.exit(1)

list_filename = sys.argv[1]
update_filename = sys.argv[2]

# One timestamp for the whole run so every row shares the same scrape time.
current_time_string = "{0:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now())

# newline="" is required by the csv module to avoid blank rows on Windows.
# The context managers close both files even when scrape_id raises; the
# original opened the output with "w+" and leaked it on any exception.
with open(update_filename, "w", newline="") as update_file, \
        open(list_filename) as list_file:
    update_file_writer = csv.writer(update_file)

    for id_str in list_file:
        loan_id = id_str.strip()
        if not loan_id:
            # Skip blank lines instead of scraping an empty id.
            continue

        result = scrape_id(loan_id)

        update_file_writer.writerow([
            loan_id,
            current_time_string,
            result["extension"],
            result["no_of_extensions"],
            result["possible_extensions"],
        ])
|
||||
Loading…
Reference in New Issue
Block a user