Update database files · arthurdejong/python-stdnum@583b066 · GitHub
[go: up one dir, main page]

Skip to content

Commit 583b066

Browse files
committed
Update database files
This also updates the script to download updated Chinese location names.
1 parent dd309e4 commit 583b066

File tree

6 files changed

+648
-508
lines changed

6 files changed

+648
-508
lines changed

getcnloc.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# getcnloc.py - script to fetch data from the China (PRC) government site
44
#
55
# Copyright (C) 2014 Jiangge Zhang
6+
# Copyright (C) 2015 Arthur de Jong
67
#
78
# This library is free software; you can redistribute it and/or
89
# modify it under the terms of the GNU Lesser General Public
@@ -43,20 +44,23 @@ def make_etree(response, encoding='utf-8'):
4344
return lxml.html.fromstring(response.text)
4445

4546

46-
def iter_revisions():
47-
html = make_etree(requests.get(revisions_url))
47+
def get_revisions(url):
48+
"""Return the links to versions of the published administrative division
49+
codes."""
50+
html = make_etree(requests.get(url))
4851
anchors = html.xpath('.//div[@class="center_list"]/ul/li/a')
4952
for anchor in anchors:
50-
url = urljoin(revisions_url, anchor.attrib['href'])
53+
url = urljoin(url, anchor.attrib['href'])
5154
date_text = anchor.findtext('.//span/*[@class="cont_tit02"]')
5255
date = datetime.strptime(date_text, '%Y-%m-%d').date()
5356
yield url, date
5457

5558

5659
def iter_records(url):
5760
html = make_etree(requests.get(url))
58-
lines = html.xpath('.//div[@class="xilan_con"]//p/text()')
61+
lines = html.xpath('.//div[@class="xilan_con"]//p')
5962
for line in lines:
63+
line = ' '.join(line.xpath('.//text()'))
6064
try:
6165
city_code, city_name = line.strip().split()
6266
except ValueError:
@@ -66,8 +70,7 @@ def iter_records(url):
6670
yield city_code.strip(), city_name.strip()
6771

6872

69-
def group_records():
70-
url, _ = max(iter_revisions(), key=itemgetter(1)) # latest revision
73+
def group_records(url):
7174

7275
provinces = {}
7376
prefectures = {}
@@ -96,7 +99,9 @@ def print_data_file(file):
9699
print("# generated from National Bureau of Statistics of the People's",
97100
file=file)
98101
print('# Republic of China, downloaded from %s' % revisions_url, file=file)
99-
for city_code, city_data in group_records():
102+
url, dt = max(get_revisions(revisions_url), key=itemgetter(1))
103+
print('# %s (revision %s)' % (url, dt), file=file)
104+
for city_code, city_data in group_records(url):
100105
if not all(city_data.values()):
101106
continue
102107
city_pairs = ' '.join(

0 commit comments

Comments
 (0)
0