19
19
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20
20
21
21
22
- import json , optparse , os , re , sys , time , traceback
22
+ import json
23
+ import optparse
24
+ import os
25
+ import re
26
+ import sys
27
+ import traceback
23
28
import pexpect
24
29
import tempfile
25
30
import shutil
34
39
35
40
DIRECTORY = "stanford-corenlp-full-2013-06-20"
36
41
42
+
37
43
class bc :
38
44
HEADER = '\033 [95m'
39
45
OKBLUE = '\033 [94m'
@@ -44,31 +50,38 @@ class bc:
44
50
45
51
46
52
class ProcessError (Exception ):
53
+
47
54
def __init__ (self , value ):
48
55
self .value = value
56
+
49
57
def __str__ (self ):
50
58
return repr (self .value )
51
59
60
+
52
61
class ParserError (Exception ):
62
+
53
63
def __init__ (self , value ):
54
64
self .value = value
65
+
55
66
def __str__ (self ):
56
67
return repr (self .value )
57
68
69
+
58
70
class TimeoutError (Exception ):
71
+
59
72
def __init__ (self , value ):
60
73
self .value = value
74
+
61
75
def __str__ (self ):
62
76
return repr (self .value )
63
77
64
78
65
- def init_corenlp_command (corenlp_path , memory ):
79
+ def init_corenlp_command (corenlp_path , memory , properties ):
66
80
"""
67
81
Checks the location of the jar files.
68
82
Spawns the server as a process.
69
83
"""
70
84
71
-
72
85
# TODO: Can edit jar constants
73
86
jars = ["stanford-corenlp-3.2.0.jar" ,
74
87
"stanford-corenlp-3.2.0-models.jar" ,
@@ -80,17 +93,16 @@ def init_corenlp_command(corenlp_path, memory):
80
93
classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
81
94
# include the properties file, so you can change defaults
82
95
# but any changes in output format will break parse_parser_results()
83
- property_name = "default.properties"
84
- current_dir_pr = os .path .dirname (os .path .abspath ( __file__ )) + "/" + property_name
85
- if os .path .exists (property_name ):
86
- props = "-props %s" % (property_name )
96
+ current_dir_pr = os .path .dirname (os .path .abspath (__file__ )) + "/" + properties
97
+ if os .path .exists (properties ):
98
+ props = "-props %s" % (properties )
87
99
elif os .path .exists (current_dir_pr ):
88
100
props = "-props %s" % (current_dir_pr )
89
101
else :
90
- raise Exception ("Error! Cannot locate: default.properties" )
102
+ raise Exception ("Error! Cannot locate: %s" % properties )  # %s placeholder is required, otherwise the % operator raises TypeError
91
103
92
104
# add and check classpaths
93
- jars = [corenlp_path + "/" + jar for jar in jars ]
105
+ jars = [corenlp_path + "/" + jar for jar in jars ]
94
106
for jar in jars :
95
107
if not os .path .exists (jar ):
96
108
raise Exception ("Error! Cannot locate: %s" % jar )
@@ -143,7 +155,7 @@ def parse_parser_results(text):
143
155
line = line .strip ()
144
156
145
157
if line .startswith ("Sentence #" ):
146
- sentence = {'words' :[], 'parsetree' :[], 'dependencies' :[]}
158
+ sentence = {'words' : [], 'parsetree' : [], 'dependencies' : []}
147
159
results ["sentences" ].append (sentence )
148
160
state = STATE_TEXT
149
161
@@ -172,7 +184,7 @@ def parse_parser_results(text):
172
184
split_entry = re .split ("\(|, " , line [:- 1 ])
173
185
if len (split_entry ) == 3 :
174
186
rel , left , right = map (lambda x : remove_id (x ), split_entry )
175
- sentence ['dependencies' ].append (tuple ([rel ,left ,right ]))
187
+ sentence ['dependencies' ].append (tuple ([rel , left , right ]))
176
188
177
189
elif state == STATE_COREFERENCE :
178
190
if "Coreference set" in line :
@@ -182,12 +194,13 @@ def parse_parser_results(text):
182
194
results ['coref' ].append (coref_set )
183
195
else :
184
196
for src_i , src_pos , src_l , src_r , sink_i , sink_pos , sink_l , sink_r , src_word , sink_word in CR_PATTERN .findall (line ):
185
- src_i , src_pos , src_l , src_r = int (src_i )- 1 , int (src_pos )- 1 , int (src_l )- 1 , int (src_r )- 1
186
- sink_i , sink_pos , sink_l , sink_r = int (sink_i )- 1 , int (sink_pos )- 1 , int (sink_l )- 1 , int (sink_r )- 1
197
+ src_i , src_pos , src_l , src_r = int (src_i ) - 1 , int (src_pos ) - 1 , int (src_l ) - 1 , int (src_r ) - 1
198
+ sink_i , sink_pos , sink_l , sink_r = int (sink_i ) - 1 , int (sink_pos ) - 1 , int (sink_l ) - 1 , int (sink_r ) - 1
187
199
coref_set .append (((src_word , src_i , src_pos , src_l , src_r ), (sink_word , sink_i , sink_pos , sink_l , sink_r )))
188
200
189
201
return results
190
202
203
+
191
204
def parse_parser_xml_results (xml , file_name = "" , raw_output = False ):
192
205
import xmltodict
193
206
from collections import OrderedDict
@@ -214,10 +227,10 @@ def extract_words_from_xml(sent_node):
214
227
raw_coref_list = document [u'coreference' ][u'coreference' ]
215
228
216
229
# Subtract 1 from each value because the given indices differ from list indices
217
- coref_index = [[[int (raw_coref_list [j ][u'mention' ][i ]['sentence' ])- 1 ,
218
- int (raw_coref_list [j ][u'mention' ][i ]['head' ])- 1 ,
219
- int (raw_coref_list [j ][u'mention' ][i ]['start' ])- 1 ,
220
- int (raw_coref_list [j ][u'mention' ][i ]['end' ])- 1 ]
230
+ coref_index = [[[int (raw_coref_list [j ][u'mention' ][i ]['sentence' ]) - 1 ,
231
+ int (raw_coref_list [j ][u'mention' ][i ]['head' ]) - 1 ,
232
+ int (raw_coref_list [j ][u'mention' ][i ]['start' ]) - 1 ,
233
+ int (raw_coref_list [j ][u'mention' ][i ]['end' ]) - 1 ]
221
234
for i in xrange (len (raw_coref_list [j ][u'mention' ]))]
222
235
for j in xrange (len (raw_coref_list ))]
223
236
@@ -230,7 +243,7 @@ def extract_words_from_xml(sent_node):
230
243
coref_list [j ][k ].insert (0 , ' ' .join (exted_words ))
231
244
232
245
coref_list = [[[coref_list [j ][i ], coref_list [j ][0 ]]
233
- for i in xrange (len (coref_list [j ])) if i != 0 ]
246
+ for i in xrange (len (coref_list [j ])) if i != 0 ]
234
247
for j in xrange (len (coref_list ))]
235
248
else :
236
249
coref_flag = False
@@ -243,7 +256,7 @@ def extract_words_from_xml(sent_node):
243
256
dep ['dep' ][i ]['dependent' ]['#text' ]]
244
257
for dep in raw_sent_list [j ][u'dependencies' ]
245
258
for i in xrange (len (dep ['dep' ]))
246
- if dep ['@type' ]== 'basic-dependencies' ],
259
+ if dep ['@type' ] == 'basic-dependencies' ],
247
260
'text' : extract_words_from_xml (raw_sent_list [j ]),
248
261
'parsetree' : str (raw_sent_list [j ]['parse' ]),
249
262
'words' : [[str (token ['word' ]), OrderedDict ([
@@ -252,12 +265,12 @@ def extract_words_from_xml(sent_node):
252
265
('CharacterOffsetBegin' , str (token ['CharacterOffsetBegin' ])),
253
266
('PartOfSpeech' , str (token ['POS' ])),
254
267
('Lemma' , str (token ['lemma' ]))])]
255
- for token in raw_sent_list [j ][u'tokens' ][u'token' ]]}
268
+ for token in raw_sent_list [j ][u'tokens' ][u'token' ]]}
256
269
257
- for j in xrange (len (raw_sent_list )) ]
270
+ for j in xrange (len (raw_sent_list ))]
258
271
259
272
if coref_flag :
260
- results = {'coref' :coref_list , 'sentences' :sentences }
273
+ results = {'coref' : coref_list , 'sentences' : sentences }
261
274
else :
262
275
results = {'sentences' : sentences }
263
276
@@ -266,7 +279,8 @@ def extract_words_from_xml(sent_node):
266
279
267
280
return results
268
281
269
- def parse_xml_output (input_dir , corenlp_path = DIRECTORY , memory = "3g" , raw_output = False ):
282
+
283
+ def parse_xml_output (input_dir , corenlp_path = DIRECTORY , memory = "3g" , raw_output = False , properties = 'default.properties' ):
270
284
"""Because interaction with the command-line interface of the CoreNLP
271
285
tools is limited to very short text bits, it is necessary to parse xml
272
286
output"""
@@ -278,15 +292,15 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
278
292
279
293
#we get a list of the cleaned files that we want to parse:
280
294
281
- files = [input_dir + '/' + f for f in os .listdir (input_dir )]
295
+ files = [input_dir + '/' + f for f in os .listdir (input_dir )]
282
296
283
297
#creating the file list of files to parse
284
298
285
299
file_list .write ('\n ' .join (files ))
286
300
file_list .seek (0 )
287
301
288
- command = init_corenlp_command (corenlp_path , memory )\
289
- + ' -filelist %s -outputDirectory %s' % (file_list .name , xml_dir )
302
+ command = init_corenlp_command (corenlp_path , memory , properties )\
303
+ + ' -filelist %s -outputDirectory %s' % (file_list .name , xml_dir )
290
304
291
305
#creates the xml file of parser output:
292
306
@@ -296,7 +310,7 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
296
310
# result = []
297
311
try :
298
312
for output_file in os .listdir (xml_dir ):
299
- with open (xml_dir + '/' + output_file , 'r' ) as xml :
313
+ with open (xml_dir + '/' + output_file , 'r' ) as xml :
300
314
# parsed = xml.read()
301
315
file_name = re .sub (r'\.xml$' , '' , os .path .basename (output_file ))  # escape the dot so only a literal ".xml" suffix is stripped
302
316
# result.append(parse_parser_xml_results(xml.read(), file_name,
@@ -308,38 +322,42 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
308
322
shutil .rmtree (xml_dir )
309
323
# return result
310
324
325
+
311
326
class StanfordCoreNLP :
327
+
312
328
"""
313
329
Command-line interaction with Stanford's CoreNLP java utilities.
314
330
Can be run as a JSON-RPC server or imported as a module.
315
331
"""
316
- def __init__ (self , corenlp_path = DIRECTORY , memory = "3g" ):
332
+
333
+ def __init__ (self , corenlp_path = DIRECTORY , memory = "3g" , properties = 'default.properties' ):
317
334
"""
318
335
Checks the location of the jar files.
319
336
Spawns the server as a process.
320
337
"""
321
338
322
339
# spawn the server
323
- start_corenlp = init_corenlp_command (corenlp_path , memory )
324
- if VERBOSE : print start_corenlp
340
+ start_corenlp = init_corenlp_command (corenlp_path , memory , properties )
341
+ if VERBOSE :
342
+ print start_corenlp
325
343
self .corenlp = pexpect .spawn (start_corenlp )
326
344
327
345
# show progress bar while loading the models
328
346
if VERBOSE :
329
347
widgets = ['Loading Models: ' , Fraction ()]
330
348
pbar = ProgressBar (widgets = widgets , maxval = 5 , force_update = True ).start ()
331
- self . corenlp . expect ( "done." , timeout = 20 ) # Load pos tagger model (~5sec)
332
- if VERBOSE : pbar . update ( 1 )
333
- self . corenlp . expect ( "done." , timeout = 200 ) # Load NER-all classifier (~33sec)
334
- if VERBOSE : pbar . update ( 2 )
335
- self . corenlp . expect ( "done." , timeout = 600 ) # Load NER-muc classifier (~60sec )
336
- if VERBOSE : pbar . update ( 3 )
337
- self . corenlp . expect ( "done." , timeout = 600 ) # Load CoNLL classifier (~50sec)
338
- if VERBOSE : pbar . update ( 4 )
339
- self .corenlp .expect ("done." , timeout = 200 ) # Loading PCFG (~3sec)
340
- if VERBOSE : pbar .update (5 )
341
- self .corenlp .expect ("Entering interactive shell." )
342
- if VERBOSE : pbar .finish ()
349
+ # Model timeouts:
350
+ # pos tagger model (~5sec )
351
+ # NER-all classifier (~33sec)
352
+ # NER-muc classifier (~60sec )
353
+ # CoNLL classifier (~50sec )
354
+ # PCFG (~3sec )
355
+ timeouts = [ 20 , 200 , 600 , 600 , 200 ]  # seconds, one per model listed above; PCFG keeps the 200s limit it had before the loop refactor
356
+ for i in xrange ( 5 ):
357
+ self .corenlp .expect ("done." , timeout = timeouts [ i ]) # Load model
358
+ pbar .update (i + 1 )
359
+ self .corenlp .expect ("Entering interactive shell." )
360
+ pbar .finish ()
343
361
344
362
# interactive shell
345
363
self .corenlp .expect ("\n NLP> " , timeout = 3 )
@@ -373,7 +391,7 @@ def _parse(self, text):
373
391
def clean_up ():
374
392
while True :
375
393
try :
376
- self .corenlp .read_nonblocking (8192 , 0.1 )
394
+ self .corenlp .read_nonblocking (8192 , 0.1 )
377
395
except pexpect .TIMEOUT :
378
396
break
379
397
clean_up ()
@@ -412,11 +430,13 @@ def clean_up():
412
430
'output' : incoming }
413
431
return
414
432
415
- if VERBOSE : print "%s\n %s" % ('=' * 40 , incoming )
433
+ if VERBOSE :
434
+ print "%s\n %s" % ('=' * 40 , incoming )
416
435
try :
417
436
results = parse_parser_results (incoming )
418
- except Exception , e :
419
- if VERBOSE : print traceback .format_exc ()
437
+ except Exception as e :
438
+ if VERBOSE :
439
+ print traceback .format_exc ()
420
440
raise e
421
441
422
442
return results
@@ -459,21 +479,25 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
459
479
The code below starts an JSONRPC server
460
480
"""
461
481
from jsonrpclib .SimpleJSONRPCServer import SimpleJSONRPCServer
462
- VERBOSE = True
463
482
parser = optparse .OptionParser (usage = "%prog [OPTIONS]" )
464
483
parser .add_option ('-p' , '--port' , default = '8080' ,
465
484
help = 'Port to serve on (default 8080)' )
466
485
parser .add_option ('-H' , '--host' , default = '127.0.0.1' ,
467
486
help = 'Host to serve on (default localhost; 0.0.0.0 to make public)' )
487
+ parser .add_option ('-q' , '--quiet' , action = 'store_false' , default = True , dest = 'verbose' ,
488
+ help = "Quiet mode, don't print status msgs to stdout" )
468
489
parser .add_option ('-S' , '--corenlp' , default = DIRECTORY ,
469
490
help = 'Stanford CoreNLP tool directory (default %s)' % DIRECTORY )
491
+ parser .add_option ('-P' , '--properties' , default = 'default.properties' ,
492
+ help = 'Stanford CoreNLP properties file (default: default.properties)' )
470
493
options , args = parser .parse_args ()
494
+ VERBOSE = options .verbose
471
495
# server = jsonrpc.Server(jsonrpc.JsonRpc20(),
472
496
# jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
473
497
try :
474
498
server = SimpleJSONRPCServer ((options .host , int (options .port )))
475
499
476
- nlp = StanfordCoreNLP (options .corenlp )
500
+ nlp = StanfordCoreNLP (options .corenlp , properties = options . properties )
477
501
server .register_function (nlp .parse )
478
502
479
503
print 'Serving on http://%s:%s' % (options .host , options .port )
0 commit comments