# getcnloc.py - script to fetch data from the China (PRC) government site
#
# Copyright (C) 2014 Jiangge Zhang
# Copyright (C) 2015 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -43,20 +44,23 @@ def make_etree(response, encoding='utf-8'):
43
44
return lxml .html .fromstring (response .text )
44
45
45
46
46
- def iter_revisions ():
47
- html = make_etree (requests .get (revisions_url ))
47
def get_revisions(url):
    """Return the links to versions of the published administrative division
    codes.

    The page at `url` is fetched and parsed, and for every anchor found in
    its "center_list" block a (link, date) tuple is yielded. The link is the
    anchor's href resolved against `url` and the date is a datetime.date
    parsed from the text of the anchor's "cont_tit02" element, which is
    expected to be formatted as YYYY-MM-DD."""
    html = make_etree(requests.get(url))
    anchors = html.xpath('.//div[@class="center_list"]/ul/li/a')
    for anchor in anchors:
        # Resolve every href against the index page itself. The `url`
        # parameter must not be rebound here, otherwise each subsequent
        # link would be resolved relative to the previous revision's URL.
        link = urljoin(url, anchor.attrib['href'])
        date_text = anchor.findtext('.//span/*[@class="cont_tit02"]')
        date = datetime.strptime(date_text, '%Y-%m-%d').date()
        yield link, date
54
57
55
58
56
59
def iter_records (url ):
57
60
html = make_etree (requests .get (url ))
58
- lines = html .xpath ('.//div[@class="xilan_con"]//p/text() ' )
61
+ lines = html .xpath ('.//div[@class="xilan_con"]//p' )
59
62
for line in lines :
63
+ line = ' ' .join (line .xpath ('.//text()' ))
60
64
try :
61
65
city_code , city_name = line .strip ().split ()
62
66
except ValueError :
@@ -66,8 +70,7 @@ def iter_records(url):
66
70
yield city_code .strip (), city_name .strip ()
67
71
68
72
69
- def group_records ():
70
- url , _ = max (iter_revisions (), key = itemgetter (1 )) # latest revision
73
+ def group_records (url ):
71
74
72
75
provinces = {}
73
76
prefectures = {}
@@ -96,7 +99,9 @@ def print_data_file(file):
96
99
print ("# generated from National Bureau of Statistics of the People's" ,
97
100
file = file )
98
101
print ('# Republic of China, downloaded from %s' % revisions_url , file = file )
99
- for city_code , city_data in group_records ():
102
+ url , dt = max (get_revisions (revisions_url ), key = itemgetter (1 ))
103
+ print ('# %s (revision %s)' % (url , dt ), file = file )
104
+ for city_code , city_data in group_records (url ):
100
105
if not all (city_data .values ()):
101
106
continue
102
107
city_pairs = ' ' .join (
0 commit comments