Change prefix handling to support prefixes on first names, fix #60 · derek73/python-nameparser@ff6d888 · GitHub
[go: up one dir, main page]

Skip to content

Commit ff6d888

Browse files
committed
Change prefix handling to support prefixes on first names, fix #60
1 parent 03e580c commit ff6d888

File tree

5 files changed

+76
-22
lines changed

5 files changed

+76
-22
lines changed

docs/customize.rst

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below).
3939
Editable attributes of nameparser.config.CONSTANTS
4040
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4141

42-
* :py:obj:`~nameparser.config.Constants.titles` - Pieces that come before the name. Cannot include things that may be first names
43-
* :py:obj:`~nameparser.config.Constants.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David"
44-
* :py:obj:`~nameparser.config.Constants.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d."
45-
* :py:obj:`~nameparser.config.Constants.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr."
46-
* :py:obj:`~nameparser.config.Constants.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
47-
* :py:obj:`~nameparser.config.Constants.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding
48-
* :py:obj:`~nameparser.config.Constants.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D"
49-
* :py:obj:`~nameparser.config.Constants.regexes` - Regular expressions used to find words, initials, nicknames, etc.
42+
* :py:class:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
43+
* :py:class:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David".
44+
* :py:class:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
45+
* :py:class:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
46+
* :py:class:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
47+
* :py:class:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
48+
* :py:class:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
49+
* :py:class:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.
5050

5151
Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning
5252
the constants for your project. These methods automatically lower case and

docs/release_log.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ Release Log
22
===========
33
* 1.0.0 - August 30, 2018
44
- Fix support for nicknames in single quotes (#74)
5+
- Change prefix handling to support prefixes on first names (#60)
56
- No other big changes, just bumping to v1 to indicate appropriate project maturity
67
* 0.5.8 - August 19, 2018
78
- Add "Junior" to suffixes (#76)

docs/usage.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ pass the parameter `force=True`.
9696
Nickname Handling
9797
------------------
9898

99-
The content of parenthesis or double quotes in the name will be
99+
The content of parentheses or quotes in the name will be
100100
available from the nickname attribute.
101101

102102
.. doctest:: nicknames

nameparser/parser.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ def is_suffix(self, piece):
319319
return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \
320320
or (lc(piece) in self.C.suffix_not_acronyms)) \
321321
and not self.is_an_initial(piece)
322-
322+
323323
def are_suffixes(self, pieces):
324324
"""Return True if all pieces are suffixes."""
325325
for piece in pieces:
@@ -444,6 +444,7 @@ def parse_full_name(self):
444444
self.last_list = []
445445
self.suffix_list = []
446446
self.nickname_list = []
447+
self.prefix_joins = []
447448
self.unparsable = True
448449

449450

@@ -489,6 +490,14 @@ def parse_full_name(self):
489490
self.last_list.append(piece)
490491
self.suffix_list += pieces[i+1:]
491492
break
493+
if piece in self.prefix_joins:
494+
last_piece = pieces[-1:][0]
495+
if self.is_suffix(last_piece):
496+
self.last_list += pieces[i:-1]
497+
self.suffix = last_piece
498+
else:
499+
self.last_list += pieces[i:]
500+
break
492501
if not nxt:
493502
self.last_list.append(piece)
494503
continue
@@ -528,6 +537,14 @@ def parse_full_name(self):
528537
self.last_list.append(piece)
529538
self.suffix_list = pieces[i+1:] + self.suffix_list
530539
break
540+
if piece in self.prefix_joins:
541+
last_piece = pieces[-1:][0]
542+
if self.is_suffix(last_piece):
543+
self.last_list += pieces[i:-1]
544+
self.suffix_list.insert(0, last_piece)
545+
else:
546+
self.last_list += pieces[i:]
547+
break
531548
if not nxt:
532549
self.last_list.append(piece)
533550
continue
@@ -544,7 +561,7 @@ def parse_full_name(self):
544561
# lastname part may have suffixes in it
545562
lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
546563
for piece in lastname_pieces:
547-
# the first one is always a last name, even if it look like
564+
# the first one is always a last name, even if it looks like
548565
# a suffix
549566
if self.is_suffix(piece) and len(self.last_list) > 0:
550567
self.suffix_list.append(piece)
@@ -568,6 +585,9 @@ def parse_full_name(self):
568585
if self.is_suffix(piece):
569586
self.suffix_list.append(piece)
570587
continue
588+
if piece in self.prefix_joins:
589+
self.last_list += pieces[i:]
590+
break
571591
self.middle_list.append(piece)
572592
try:
573593
if parts[2]:
@@ -742,15 +762,16 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
742762
prefixes = list(filter(self.is_prefix, pieces))
743763
if prefixes:
744764
i = pieces.index(prefixes[0])
745-
# join everything after the prefix until the next suffix
746-
next_suffix = list(filter(self.is_suffix, pieces[i:]))
747-
if next_suffix:
748-
j = pieces.index(next_suffix[0])
749-
new_piece = ' '.join(pieces[i:j])
750-
pieces = pieces[:i] + [new_piece] + pieces[j:]
751-
else:
752-
new_piece = ' '.join(pieces[i:])
753-
pieces = pieces[:i] + [new_piece]
765+
# join everything after the prefix until the next non prefix
766+
# store joined pieces in prefix_joins. When a prefix occurs in a last name,
767+
# I think it means the rest of the name is part of the last name, so prefix_joins
768+
# lets us do that in the parser flow.
769+
non_suffixes = list(filter(lambda x: not self.is_prefix(x), pieces[i:]))
770+
if non_suffixes:
771+
j = pieces.index(non_suffixes[0])
772+
new_piece = ' '.join(pieces[i:j + 1])
773+
self.prefix_joins += [new_piece]
774+
pieces = pieces[:i] + [new_piece] + pieces[j + 1:]
754775

755776
log.debug("pieces: {0}".format(pieces))
756777
return pieces

tests.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1256,8 +1256,8 @@ def test_portuguese_dos(self):
12561256
def test_portuguese_prefixes(self):
12571257
hn = HumanName("Joao da Silva do Amaral de Souza")
12581258
self.m(hn.first, "Joao", hn)
1259-
self.m(hn.middle, "da Silva do Amaral de", hn)
1260-
self.m(hn.last, "Souza", hn)
1259+
self.m(hn.middle, "", hn)
1260+
self.m(hn.last, "da Silva do Amaral de Souza", hn)
12611261

12621262

12631263
class ConstantsCustomization(HumanNameTestBase):
@@ -1474,6 +1474,12 @@ def test_prefix_before_two_part_last_name_with_suffix(self):
14741474
self.m(hn.last, "von bergen wessels", hn)
14751475
self.m(hn.suffix, "III", hn)
14761476

1477+
def test_prefix_before_two_part_last_name_with_acronym_suffix(self):
1478+
hn = HumanName("pennie von bergen wessels M.D.")
1479+
self.m(hn.first, "pennie", hn)
1480+
self.m(hn.last, "von bergen wessels", hn)
1481+
self.m(hn.suffix, "M.D.", hn)
1482+
14771483
def test_two_part_last_name_with_suffix_comma(self):
14781484
hn = HumanName("pennie von bergen wessels, III")
14791485
self.m(hn.first, "pennie", hn)
@@ -1486,6 +1492,32 @@ def test_two_part_last_name_with_suffix(self):
14861492
self.m(hn.last, "von bergen wessels", hn)
14871493
self.m(hn.suffix, "III", hn)
14881494

1495+
def test_last_name_two_part_last_name_with_two_suffixes(self):
1496+
hn = HumanName("von bergen wessels MD, pennie III")
1497+
self.m(hn.first, "pennie", hn)
1498+
self.m(hn.last, "von bergen wessels", hn)
1499+
self.m(hn.suffix, "MD, III", hn)
1500+
1501+
def test_comma_two_part_last_name_with_acronym_suffix(self):
1502+
hn = HumanName("von bergen wessels, pennie MD")
1503+
self.m(hn.first, "pennie", hn)
1504+
self.m(hn.last, "von bergen wessels", hn)
1505+
self.m(hn.suffix, "MD", hn)
1506+
1507+
def test_comma_two_part_last_name_with_suffix_in_first_part(self):
1508+
# I'm kinda surprised this works, not really sure if this is a
1509+
# realistic place for a suffix to be.
1510+
hn = HumanName("von bergen wessels MD, pennie")
1511+
self.m(hn.first, "pennie", hn)
1512+
self.m(hn.last, "von bergen wessels", hn)
1513+
self.m(hn.suffix, "MD", hn)
1514+
1515+
def test_title_two_part_last_name_with_suffix_in_first_part(self):
1516+
hn = HumanName("pennie von bergen wessels MD, III")
1517+
self.m(hn.first, "pennie", hn)
1518+
self.m(hn.last, "von bergen wessels", hn)
1519+
self.m(hn.suffix, "MD, III", hn)
1520+
14891521

14901522
class SuffixesTestCase(HumanNameTestBase):
14911523

0 commit comments

Comments
 (0)
0