8000 Only process the shortest matches in the numdb module · Dzikpol/python-stdnum@38c368d · GitHub
[go: up one dir, main page]

Skip to content

Commit 38c368d

Browse files
committed
Only process the shortest matches in the numdb module
This ensures that matching numbers is done consistently when the numdb database file has conflicting information about the length of numbers. This also refactors the _find() function to be simpler and reduces the number of recursive calls that have to be done. The tests have been re-formatted to use pprint to make it easier to spot differences if any of the tests fail (instead of just saying expected True, got False). Closes arthurdejong#257
1 parent b7901d6 commit 38c368d

File tree

2 files changed

+34
-71
lines changed

2 files changed

+34
-71
lines changed

stdnum/numdb.py

Lines changed: 31 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# numdb.py - module for handling hierarchically organised numbers
22
#
3-
# Copyright (C) 2010-2019 Arthur de Jong
3+
# Copyright (C) 2010-2021 Arthur de Jong
44
#
55
# This library is free software; you can redistribute it and/or
66
# modify it under the terms of the GNU Lesser General Public
@@ -39,47 +39,21 @@
3939
4040
To split the number and get properties for each part:
4141
42-
>>> dbfile.info('01006') == [
43-
... ('0', {'prop1': 'foo'}),
44-
... ('100', {'prop2': 'bar'}),
45-
... ('6', {}),
46-
... ]
47-
True
48-
>>> dbfile.info('02006') == [
49-
... ('0', {'prop1': 'foo'}),
50-
... ('200', {'prop2': 'bar', 'prop3': 'baz'}),
51-
... ('6', {}),
52-
... ]
53-
True
54-
>>> dbfile.info('03456') == [
55-
... ('0', {'prop1': 'foo'}),
56-
... ('345', {'prop2': 'bar', 'prop3': 'baz'}),
57-
... ('6', {}),
58-
... ]
59-
True
60-
>>> dbfile.info('902006') == [
61-
... ('90', {'prop1': 'booz'}),
62-
... ('20', {'prop2': 'foo'}),
63-
... ('06', {}),
64-
... ]
65-
True
66-
>>> dbfile.info('909856') == [
67-
... ('90', {'prop1': 'booz'}),
68-
... ('985', {'prop2': 'fooz'}),
69-
... ('6', {}),
70-
... ]
71-
True
72-
>>> dbfile.info('9889') == [
73-
... ('98', {'prop1': 'booz'}),
74-
... ('89', {'prop2': 'foo'}),
75-
... ]
76-
True
77-
>>> dbfile.info('633322') == [
78-
... ('6', {'prop1': 'boo'}),
79-
... ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}),
80-
... ('22', {}),
81-
... ]
82-
True
42+
>>> import pprint
43+
>>> pprint.pprint(dbfile.info('01006'))
44+
[('0', {'prop1': 'foo'}), ('100', {'prop2': 'bar'}), ('6', {})]
45+
>>> pprint.pprint(dbfile.info('02006'))
46+
[('0', {'prop1': 'foo'}), ('200', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})]
47+
>>> pprint.pprint(dbfile.info('03456'))
48+
[('0', {'prop1': 'foo'}), ('345', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})]
49+
>>> pprint.pprint(dbfile.info('902006'))
50+
[('90', {'prop1': 'booz'}), ('20', {'prop2': 'foo'}), ('06', {})]
51+
>>> pprint.pprint(dbfile.info('909856'))
52+
[('90', {'prop1': 'booz'}), ('985', {'prop2': 'fooz'}), ('6', {})]
53+
>>> pprint.pprint(dbfile.info('9889'))
54+
[('98', {'prop1': 'booz'}), ('89', {'prop2': 'foo'})]
55+
>>> pprint.pprint(dbfile.info('633322'))
56+
[('6', {'prop1': 'boo'}), ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}), ('22', {})]
8357
8458
"""
8559

@@ -114,41 +88,27 @@ def __init__(self):
11488
"""Construct an empty database."""
11589
self.prefixes = []
11690

117-
@staticmethod
118-
def _merge(results):
119-
"""Merge the provided list of possible results into a single result
120-
list (this is a generator)."""
121-
# expand the results to all have the same length
122-
ml = max(len(x) for x in results)
123-
results = [x + (ml - len(x)) * [None]
124-
for x in results]
125-
# go over each part
126-
for parts in zip(*results):
127-
# regroup parts into parts list and properties list
128-
partlist, proplist = list(zip(*(x for x in parts if x)))
129-
part = min(partlist, key=len)
130-
props = {}
131-
for p in proplist:
132-
props.update(p)
133-
yield part, props
134-
13591
@staticmethod
13692
def _find(number, prefixes):
13793
"""Lookup the specified number in the list of prefixes, this will
13894
return basically what info() should return but works recursively."""
13995
if not number:
14096
return []
141-
results = []
142-
if prefixes:
143-
for length, low, high, props, children in prefixes:
144-
if low <= number[:length] <= high and len(number) >= length:
145-
results.append([(number[:length], props)] +
146-
NumDB._find(number[length:], children))
147-
# not-found fallback
148-
if not results:
149-
return [(number, {})]
150-
# merge the results into a single result
151-
return list(NumDB._merge(results))
97+
part = number
98+
properties = {}
99+
next_prefixes = []
100+
# go over prefixes and find matches
101+
for length, low, high, props, children in prefixes:
102+
if len(part) >= length and low <= part[:length] <= high:
103+
# only use information from the shortest match
104+
if length < len(part):
105+
part = part[:length]
106+
properties = {}
107+
next_prefixes = []
108+
properties.update(props)
109+
next_prefixes.extend(children or [])
110+
# return first part and recursively find next matches
111+
return [(part, properties)] + NumDB._find(number[len(part):], next_prefixes)
152112

153113
def info(self, number):
154114
"""Split the provided number in components and associate properties

tests/numdb-test.dat

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
# numdb-test.dat: used for testing the stdnum.numdb module
12
# this is a comment line
23
0-8 prop1="foo"
34
100-999 prop2="bar"
45
200,300-399 prop3="baz"
56
6 prop1="boo"
67
333 prop4="bla"
78
90-99 prop1="booz"
9+
200 comment1="this value will be ignored because a shorter one matches"
810
00-89 prop2="foo"
11+
200 comment2="this value will also be ignored"
912
900-999 prop2="fooz"

0 commit comments

Comments
 (0)
0