14
14
import os
15
15
import time
16
16
import re
17
+ from unidecode import unidecode
17
18
18
19
import pexpect
19
20
def remove_id(word):
    """Remove the numeric suffix from a parsed recognized word.

    e.g. 'word-2' -> 'word'.  A token with no '-' is returned unchanged.

    Fix: the original ``count(...) == 0 and word or word[...]`` idiom
    crashed on the empty string -- ``"" `` is falsy, so the ``or`` branch
    ran and ``"".rindex("-")`` raised ValueError.  An explicit conditional
    avoids the falsy-operand pitfall of ``and``/``or`` expressions.
    """
    if "-" in word:
        # Strip everything from the last '-' onward ('a-b-3' -> 'a-b').
        return word[:word.rindex("-")]
    return word
27
28
29
def parse_bracketed(s):
    """Parse a CoreNLP word-feature string such as ``Text=word POS=NN ...``.

    Attribute values may themselves contain XML markup (which can hold
    '=' and whitespace); those spans are temporarily masked with
    ``^^^N^^^`` placeholders before the key=value split and restored
    afterwards.

    Returns a ``(word, attrs)`` tuple: ``word`` is the value of the
    ``Text`` attribute (``None`` if absent) and ``attrs`` maps every
    other attribute name to its value.
    """
    masked = {}
    # Mask embedded XML spans so their contents don't confuse the
    # key=value split below.
    for index, xml_span in enumerate(re.findall(r"(<[^<>]+>.*<\/[^<>]+>)", s)):
        placeholder = "^^^%d^^^" % index
        masked[placeholder] = xml_span
        s = s.replace(xml_span, placeholder)

    word = None
    attrs = {}
    # Split out key=value pairs, unmasking XML values as we go.
    for key, value in re.findall(r"([^=\s]*)=([^=\s]*)", s):
        value = masked.get(value, value)
        if key == 'Text':
            word = value
        else:
            attrs[key] = value
    return (word, attrs)
28
50
def parse_parser_results (text ):
29
51
""" This is the nasty bit of code to interact with the command-line
30
52
interface of the CoreNLP tools. Takes a string of the parser results
@@ -33,12 +55,14 @@ def parse_parser_results(text):
33
55
"""
34
56
state = 0
35
57
tmp = {}
36
- results = []
58
+ coref_set = []
59
+ results = { "sentences" : [] }
60
+ text = unidecode (text ) # Force output conversion to ASCII to avoid RPC error
37
61
for line in text .split ("\n " ):
38
62
if line .startswith ("Sentence #" ):
39
63
state = 1
40
64
if len (tmp .keys ()) != 0 :
41
- results .append (tmp )
65
+ results [ "sentences" ] .append (tmp ) # Put results in "sentences" key so "corefs" can exist outside
42
66
tmp = {}
43
67
elif state == 1 :
44
68
tmp ['text' ] = line .strip ()
@@ -51,28 +75,17 @@ def parse_parser_results(text):
51
75
exp = re .compile ('\[([^\]]+)\]' )
52
76
matches = exp .findall (line )
53
77
for s in matches :
54
- print s
55
- # split into attribute-value list
56
- av = re .split ("=| " , s )
57
- # make [ignore,ignore,a,b,c,d] into [[a,b],[c,d]]
58
- # and save as attr-value dict, convert numbers into ints
59
- #tmp['words'].append((av[1], dict(zip(*[av[2:][x::2] for x in (0, 1)]))))
60
- # tried to convert digits to ints instead of strings, but
61
- # it seems the results of this can't be serialized into JSON?
62
- word = av [1 ]
63
- attributes = {}
64
- for a ,v in zip (* [av [2 :][x ::2 ] for x in (0 , 1 )]):
65
- if v .isdigit ():
66
- attributes [a ] = int (v )
67
- else :
68
- attributes [a ] = v
69
- tmp ['words' ].append ((word , attributes ))
78
+ tmp ['words' ].append (parse_bracketed (s ))
70
79
state = 3
80
+ tmp ['parsetree' ] = []
71
81
elif state == 3 :
72
- # skip over parse tree
82
+ # Output parse tree as well (useful especially if you want to pull this into NLTK)
73
83
if not (line .startswith (" " ) or line .startswith ("(ROOT" )):
74
84
state = 4
75
- tmp ['tuples' ] = []
85
+ tmp ['parsetree' ] = " " .join (tmp ['parsetree' ])
86
+ tmp ['tuples' ] = []
87
+ else :
88
+ tmp ['parsetree' ].append (line .strip ())
76
89
if state == 4 :
77
90
# dependency parse
78
91
line = line .rstrip ()
@@ -81,27 +94,34 @@ def parse_parser_results(text):
81
94
if len (split_entry ) == 3 :
82
95
rel , left , right = map (lambda x : remove_id (x ), split_entry )
83
96
tmp ['tuples' ].append (tuple ([rel ,left ,right ]))
84
- elif "Coreference links " in line :
97
+ elif "Coreference set " in line :
85
98
state = 5
99
+ coref_set = []
86
100
elif state == 5 :
87
- crexp = re .compile ('\s(\d*)\s(\d*)\s\-\>\s(\d*)\s(\d*), that is' )
101
+ if "Coreference set" in line : # Create new coreference set if needed
102
+ if len (coref_set ) > 0 :
103
+ if results .has_key ('coref' ):
104
+ results ['coref' ].append (coref_set )
105
+ else :
106
+ results ['coref' ] = [coref_set ]
107
+ coref_set = []
108
+ else :
109
+ # Updated for new coreference format
110
+ crexp = re .compile (r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"" )
88
111
matches = crexp .findall (line )
89
- for src_i , src_pos , sink_i , sink_pos in matches :
90
- # TODO: src_i and sink_i correspond to the sentences.
91
- # this was built for single sentences, and thus ignores
92
- # the sentence number. Should be fixed, but would require
93
- # restructuring the entire output.
94
- print "COREF MATCH" , src_i , sink_i
95
- src = tmp ['words' ][int (src_pos )- 1 ][0 ]
96
- sink = tmp ['words' ][int (sink_pos )- 1 ][0 ]
97
- if tmp .has_key ('coref' ):
98
- tmp ['coref' ].append ((src , sink ))
99
- else :
100
- tmp ['coref' ] = [(src , sink )]
101
-
112
+ for src_i , src_pos , src_l , src_r , sink_i , sink_pos , sink_l , sink_r , src_word , sink_word in matches :
113
+ src_i , src_pos , src_l , src_r = int (src_i )- 1 , int (src_pos )- 1 , int (src_l )- 1 , int (src_r )- 1
114
+ sink_i , sink_pos , sink_l , sink_r = int (sink_i )- 1 , int (sink_pos )- 1 , int (sink_l )- 1 , int (sink_r )- 1
115
+ print "COREF MATCH" , src_i , sink_i
116
+ coref_set .append (((src_word , src_i , src_pos , src_l , src_r ), (sink_word , sink_i , sink_pos , sink_l , sink_r )))
102
117
print "CR" , line
103
118
if len (tmp .keys ()) != 0 :
104
- results .append (tmp )
119
+ results ["sentences" ].append (tmp )
120
+ if len (coref_set ) > 0 : # Add final coreference set if needed
121
+ if results .has_key ('coref' ):
122
+ results ['coref' ].append (coref_set )
123
+ else :
124
+ results ['coref' ] = [coref_set ]
105
125
return results
106
126
107
127
class StanfordCoreNLP (object ):
@@ -116,8 +136,8 @@ def __init__(self):
116
136
Spawns the server as a process.
117
137
"""
118
138
119
- jars = ["stanford-corenlp-2011-09-16 .jar" ,
120
- "stanford-corenlp-2011-09-14 -models.jar" ,
139
+ jars = ["stanford-corenlp-2012-04-09 .jar" ,
140
+ "stanford-corenlp-2012-04-09 -models.jar" ,
121
141
"joda-time.jar" ,
122
142
"xom.jar" ]
123
143
0 commit comments