1
+ # Sebastian Raschka 2014
2
+ #
3
+ # A Python function to generalize first and last names.
4
+ # The typical use case of such a function is to merge data that have been collected
5
+ # from different sources (e.g., names of soccer players as shown in the doctest.)
6
+ #
7
+
8
+ import unicodedata
9
+ import string
10
+ import re
11
+
12
def preprocess_names(name, output_sep=' ', firstname_output_letters=1):
    """
    Generalize a person's name to the format
    <last_name><separator><firstname letter(s)> (all lowercase).

    The input is interpreted as "<first> [middle ...] <last>" or, if it
    contains a comma, as "<last>, <first> [middle ...]". Middle names are
    ignored. Accented characters are reduced to their ASCII base letter and
    every character that is not an ASCII letter or a space (apostrophes,
    hyphens, digits, ...) is dropped.

    Parameters:
        name: the name to generalize (a string).
        output_sep: string placed between the last name and the first-name
            letters. It is left out when no first-name letters are emitted.
        firstname_output_letters: number of leading first-name letters to
            keep; 0 returns the last name only.

    Returns:
        The generalized, lowercased name. A single-word name is returned as
        that word; an empty or whitespace-only input returns ''.

    Note: in the comma form only the first two whitespace-separated tokens
    are used, so multi-word last names ("van der Vaart, Rafael") are not
    supported.

    >>> preprocess_names("Samuel Eto'o")
    'etoo s'

    >>> preprocess_names("Eto'o, Samuel")
    'etoo s'

    >>> preprocess_names("Eto'o,Samuel")
    'etoo s'

    >>> preprocess_names('Xavi')
    'xavi'

    >>> preprocess_names('Yaya Touré')
    'toure y'

    >>> preprocess_names('José Ángel Pozo')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
    'pozo jo'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
    'etoo sa'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
    'etoo, s'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ', firstname_output_letters=0)
    'etoo'

    >>> preprocess_names('')
    ''

    """

    # `last` / `first` hold the *regex group names* that contain the last and
    # first name. Without a comma the first token is the first name; with a
    # comma the order is reversed, so the group names are swapped.
    last, first = 'last', 'first'
    last_pos = -1

    if ',' in name:
        last, first = first, last
        name = name.replace(',', ' ')
        last_pos = 1

    # Drop middle names: keep the leading token plus either the final token
    # (no comma) or the second token (comma form).
    spl = name.split()
    if len(spl) > 2:
        name = '%s %s' % (spl[0], spl[last_pos])

    # Remove accents: NFKD splits e.g. 'é' into 'e' + combining mark, and the
    # filter then keeps only ASCII letters and spaces.
    allowed = string.ascii_letters + ' '
    name = ''.join(x for x in unicodedata.normalize('NFKD', name) if x in allowed)

    # Get first and last name if applicable (needs at least two words).
    m = re.match(r'(?P<first>\w+)\W+(?P<last>\w+)', name)
    if m:
        initials = m.group(first)[:firstname_output_letters]
        if initials:
            output = '%s%s%s' % (m.group(last), output_sep, initials)
        else:
            # No first-name letters requested: avoid a dangling separator.
            output = m.group(last)
    else:
        output = name
    return output.lower().strip()
78
+
79
+
80
if __name__ == "__main__":
    # When executed as a script, run the examples embedded in the module's
    # docstrings as a self-test.
    import doctest

    doctest.testmod()
0 commit comments