81
81
82
82
83
83
remove_ref_re = re .compile (r'<ref>.*?</ref>' )
84
+ remove_comment_re = re .compile (r'{{.*?}}' )
85
+ remove_href_re = re .compile (ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+' +
86
+ ur'[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|' +
87
+ ur'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|' +
88
+ ur'(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>' +
89
+ ur'?\xab\xbb\u201c\u201d\u2018\u2019]))' )
84
90
85
91
86
92
def cleanup_value (val ):
87
93
"""Remove unneeded markup from the value."""
88
94
# remove uninteresting things from value
89
- val = val . replace ( '[ ' , '' ). replace ( ']' , '' ). strip ( )
95
+ val = remove_comment_re . sub ( ' ' , val )
90
96
val = remove_ref_re .sub ('' , val )
97
+ val = remove_href_re .sub ('' , val )
98
+ val = val .replace ('[' , '' ).replace (']' , '' ).replace ('\' \' ' , '' ).strip ()
99
+ val = val .split ('|' )[- 1 ]
91
100
# replace value
101
+ val = val .replace ('Unknown' , '' )
92
102
val = val .replace ('United Kingdom|UK' , 'United Kingdom' )
93
103
val = val .replace ('United States|US' , 'United States' )
94
104
val = val .replace ('New Zealand|NZ' , 'New Zealand' ).strip ()
@@ -105,14 +115,14 @@ def get_mncs_from_wikipedia(data):
105
115
"""Update the collection of Mobile Country Codes from Wikipedia.
106
116
This parses a Wikipedia page to extract the MCC and MNC, the first
107
117
part of any IMSI, and stores the results."""
108
- mnc_country_re = re .compile (r'^==== \s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+==== $' )
109
- mnc_line_re = re .compile (r'^\|\s+ (?P<mcc>[0-9]+)' +
110
- r'\s+\|\|\s+ (?P<mnc>[0-9]+)' +
111
- r'(\s+\|\|\s+ (?P<brand>[^| ]*)' +
112
- r'(\s+\|\|\s+ (?P<operator>[^| ]*)' +
113
- r'(\s+\|\|\s+ (?P<status>[^| ]*)' +
114
- r'(\s+\|\|\s+ (?P<bands>[^| ]*)' +
115
- r'(\s+\|\|\s+ (?P<notes>[^| ]*)' +
118
+ mnc_country_re = re .compile (r'^[=]{2,4} \s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+[=]{2,4} $' )
119
+ mnc_line_re = re .compile (r'^\|\s* (?P<mcc>[0-9]+)' +
120
+ r'\s*\\\\\s* (?P<mnc>[0-9]+)' +
121
+ r'(\s*\\\\\s* (?P<brand>[^\\ ]*)' +
122
+ r'(\s*\\\\\s* (?P<operator>[^\\ ]*)' +
123
+ r'(\s*\\\\\s* (?P<status>[^\\ ]*)' +
124
+ r'(\s*\\\\\s* (?P<bands>[^\\ ]*)' +
125
+ r'(\s*\\\\\s* (?P<notes>[^\\ ]*)' +
116
126
r')?)?)?)?)?' )
117
127
f = urllib .urlopen (mcc_list_url )
118
128
country = cc = ''
@@ -122,13 +132,33 @@ def get_mncs_from_wikipedia(data):
122
132
if match :
123
133
country = match .group ('country' )
124
134
cc = (match .group ('cc' ) or '' ).lower ()
135
+ if '||' not in line :
136
+ continue
137
+ line = line .replace ('||' , '\\ \\ ' )
125
138
match = mnc_line_re .match (line )
126
139
if match :
127
- update_mncs (data , match .group ('mcc' ), match .group ('mnc' ),
128
- country = country , cc = cc , brand = match .group ('brand' ),
129
- operator = match .group ('operator' ),
130
- status = match .group ('status' ),
131
- bands = match .group ('bands' ))
140
+ mnc_list = str2range (match .group ('mnc' ))
141
+ for mnc in mnc_list :
142
+ update_mncs (data , match .group ('mcc' ), mnc ,
143
+ country = country , cc = cc , brand = match .group ('brand' ),
144
+ operator = match .group ('operator' ),
145
+ status = match .group ('status' ),
146
+ bands = match .group ('bands' ))
147
+
148
+
149
def str2range(x):
    """Expand a comma-separated list of values and ranges into strings.

    Each comma-separated part of x is either a single value, which is
    returned as-is (minus surrounding whitespace), or an inclusive range
    written as "start-end", which is expanded into every number in the
    range. Expanded numbers are zero-padded to the width of the longer
    of the two bounds, so "08-11" gives ['08', '09', '10', '11'].

    Empty parts (for instance from a trailing comma) are ignored. A
    ValueError is raised if a range does not consist of exactly two
    integer bounds.
    """
    result = []
    for part in x.split(','):
        part = part.strip()
        if not part:
            # tolerate stray or trailing commas instead of emitting ''
            continue
        if '-' in part:
            # unpacking fails with ValueError on more than one hyphen
            first, last = [bound.strip() for bound in part.split('-')]
            # measure the width on the stripped bounds so that spaces
            # around the hyphen do not inflate the zero padding
            width = max(len(first), len(last))
            for number in range(int(first), int(last) + 1):
                result.append(str(number).zfill(width))
        else:
            result.append(part)
    return result
132
162
133
163
134
164
if __name__ == '__main__' :
0 commit comments