10000 Update parsing in getimsi script · sharoonthomas/python-stdnum@4609a22 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4609a22

Browse files
eneq123 authored and arthurdejong committed
Update parsing in getimsi script
This updates the regexes and includes some optimizations. See: arthurdejong#1
1 parent 9ec3cb0 commit 4609a22

File tree

2 files changed

+1743
-1699
lines changed

2 files changed

+1743
-1699
lines changed

getimsi.py

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,24 @@
8181

8282

8383
remove_ref_re = re.compile(r'<ref>.*?</ref>')
84+
remove_comment_re = re.compile(r'{{.*?}}')
85+
remove_href_re = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+' +
86+
ur'[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|' +
87+
ur'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|' +
88+
ur'(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>' +
89+
ur'?\xab\xbb\u201c\u201d\u2018\u2019]))')
8490

8591

8692
def cleanup_value(val):
8793
"""Remove unneeded markup from the value."""
8894
# remove uninteresting things from value
89-
val = val.replace('[', '').replace(']', '').strip()
95+
val = remove_comment_re.sub('', val)
9096
val = remove_ref_re.sub('', val)
97+
val = remove_href_re.sub('', val)
98+
val = val.replace('[', '').replace(']', '').replace('\'\'', '').strip()
99+
val = val.split('|')[-1]
91100
# replace value
101+
val = val.replace('Unknown', '')
92102
val = val.replace('United Kingdom|UK', 'United Kingdom')
93103
val = val.replace('United States|US', 'United States')
94104
val = val.replace('New Zealand|NZ', 'New Zealand').strip()
@@ -105,14 +115,14 @@ def get_mncs_from_wikipedia(data):
105115
"""Update the collection of Mobile Country Codes from Wikipedia.
106116
This parses a Wikipedia page to extract the MCC and MNC, the first
107117
part of any IMSI, and stores the results."""
108-
mnc_country_re = re.compile(r'^====\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+====$')
109-
mnc_line_re = re.compile(r'^\|\s+(?P<mcc>[0-9]+)' +
110-
r'\s+\|\|\s+(?P<mnc>[0-9]+)' +
111-
r'(\s+\|\|\s+(?P<brand>[^|]*)' +
112-
r'(\s+\|\|\s+(?P<operator>[^|]*)' +
113-
r'(\s+\|\|\s+(?P<status>[^|]*)' +
114-
r'(\s+\|\|\s+(?P<bands>[^|]*)' +
115-
r'(\s+\|\|\s+(?P<notes>[^|]*)' +
118+
mnc_country_re = re.compile(r'^[=]{2,4}\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+[=]{2,4}$')
119+
mnc_line_re = re.compile(r'^\|\s*(?P<mcc>[0-9]+)' +
120+
r'\s*\\\\\s*(?P<mnc>[0-9]+)' +
121+
r'(\s*\\\\\s*(?P<brand>[^\\]*)' +
122+
r'(\s*\\\\\s*(?P<operator>[^\\]*)' +
123+
r'(\s*\\\\\s*(?P<status>[^\\]*)' +
124+
r'(\s*\\\\\s*(?P<bands>[^\\]*)' +
125+
r'(\s*\\\\\s*(?P<notes>[^\\]*)' +
116126
r')?)?)?)?)?')
117127
f = urllib.urlopen(mcc_list_url)
118128
country = cc = ''
@@ -122,13 +132,33 @@ def get_mncs_from_wikipedia(data):
122132
if match:
123133
country = match.group('country')
124134
cc = (match.group('cc') or '').lower()
135+
if '||' not in line:
136+
continue
137+
line = line.replace('||', '\\\\')
125138
match = mnc_line_re.match(line)
126139
if match:
127-
update_mncs(data, match.group('mcc'), match.group('mnc'),
128-
country=country, cc=cc, brand=match.group('brand'),
129-
operator=match.group('operator'),
130-
status=match.group('status'),
131-
bands=match.group('bands'))
140+
mnc_list = str2range(match.group('mnc'))
141+
for mnc in mnc_list:
142+
update_mncs(data, match.group('mcc'), mnc,
143+
country=country, cc=cc, brand=match.group('brand'),
144+
operator=match.group('operator'),
145+
status=match.group('status'),
146+
bands=match.group('bands'))
147+
148+
149+
def str2range(x):
150+
result = []
151+
for part in x.split(','):
152+
if '-' in part:
153+
a, b = part.split('-')
154+
f = '%0' + str(len(b)) + 'd'
155+
a, b = int(a), int(b)
156+
for i in range(a, b + 1):
157+
result.append(f % (i))
158+
else:
159+
a = part
160+
result.append(part)
161+
return result
132162

133163

134164
if __name__ == '__main__':

0 commit comments

Comments
 (0)
0