function to process first and last names · adrianhust/python_reference@044d334 · GitHub
[go: up one dir, main page]

Skip to content

Commit 044d334

Browse files
committed
function to process first and last names
1 parent 731425d commit 044d334

File tree

1 file changed

+82
-0
lines changed

1 file changed

+82
-0
lines changed
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Sebastian Raschka 2014
2+
#
3+
# A Python function to generalize first and last names.
4+
# The typical use case of such a function is to merge data that have been collected
5+
# from different sources (e.g., names of soccer players as shown in the doctest.)
6+
#
7+
8+
import unicodedata
9+
import string
10+
import re
11+
12+
def preprocess_names(name, output_sep=' ', firstname_output_letters=1):
    """
    Normalize a person's name to the lowercase format
    <last_name><separator><firstname letter(s)>.

    The input may be given as "<first> [middle ...] <last>" or, when it
    contains a comma, as "<last>, <first> [middle ...]". Middle names are
    dropped, accents are stripped, and every character that is not an ASCII
    letter (apostrophes, hyphens, digits, ...) is removed.

    Args:
        name: The name to normalize.
        output_sep: String placed between the last name and the first-name
            letters. It is omitted when no first-name letters are emitted.
        firstname_output_letters: Number of leading first-name letters to
            keep; 0 returns the last name only.

    Returns:
        The normalized, lowercased name. A single-word name is returned as
        that word; an input without any usable token returns ''.

    Note:
        In the comma form only the first token before the comma is treated
        as the last name, so multi-word last names ("de Bruyne, Kevin") are
        not supported.

    >>> preprocess_names("Samuel Eto'o")
    'etoo s'

    >>> preprocess_names("Eto'o, Samuel")
    'etoo s'

    >>> preprocess_names("Eto'o,Samuel")
    'etoo s'

    >>> preprocess_names('Xavi')
    'xavi'

    >>> preprocess_names('Yaya Touré')
    'toure y'

    >>> preprocess_names('José Ángel Pozo')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
    'pozo jo'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
    'etoo sa'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
    'etoo, s'

    >>> preprocess_names("  Samuel Eto'o")
    'etoo s'

    >>> preprocess_names('')
    ''

    """

    # A comma signals "<last>, <first>"; otherwise the last name comes last.
    last_name_first = ',' in name
    if last_name_first:
        name = name.replace(',', ' ')

    tokens = name.split()
    if not tokens:
        # Empty / whitespace-only / comma-only input: nothing to normalize.
        return ''

    # Drop middle names: keep the leading token plus either the token right
    # after it (comma form) or the final token (natural order).
    if len(tokens) > 2:
        tokens = [tokens[0], tokens[1] if last_name_first else tokens[-1]]

    # Re-joining with single spaces also discards leading/trailing whitespace
    # that would otherwise hide the first word from the parsing step below.
    name = ' '.join(tokens)

    # Remove accents: NFKD splits accented letters into a base letter plus
    # combining marks; keeping only ASCII letters and spaces drops the marks
    # together with punctuation and digits.
    cleaned = ''.join(
        ch for ch in unicodedata.normalize('NFKD', name)
        if ch in string.ascii_letters or ch == ' '
    )

    words = cleaned.split()
    if len(words) < 2:
        # Single-word names (or names that lost a token during cleaning)
        # are returned as they are.
        return cleaned.lower().strip()

    if last_name_first:
        last_name, first_name = words[0], words[1]
    else:
        first_name, last_name = words[0], words[1]

    first_letters = first_name[:firstname_output_letters]
    if not first_letters:
        # Avoid a dangling separator (e.g. 'etoo,') when no first-name
        # letters were requested.
        return last_name.lower()

    output = '%s%s%s' % (last_name, output_sep, first_letters)
    return output.lower().strip()
78+
79+
80+
if __name__ == "__main__":
    # When executed as a script, run the doctest examples embedded in the
    # docstrings of this module.
    from doctest import testmod
    testmod()

0 commit comments

Comments
 (0)
0