10000 Pep8, added properties and quiet options · potatochip/corenlp-python@b013eea · GitHub
[go: up one dir, main page]

Skip to content

Commit b013eea

Browse files
committed
Pep8, added properties and quiet options
1 parent 12d4134 commit b013eea

File tree

1 file changed

+72
-48
lines changed

1 file changed

+72
-48
lines changed

corenlp/corenlp.py

Lines changed: 72 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@
1919
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
2020

2121

22-
import json, optparse, os, re, sys, time, traceback
22+
import json
23+
import optparse
24+
import os
25+
import re
26+
import sys
27+
import traceback
2328
import pexpect
2429
import tempfile
2530
import shutil
@@ -34,6 +39,7 @@
3439

3540
DIRECTORY = "stanford-corenlp-full-2013-06-20"
3641

42+
3743
class bc:
3844
HEADER = '\033[95m'
3945
OKBLUE = '\033[94m'
@@ -44,31 +50,38 @@ class bc:
4450

4551

4652
class ProcessError(Exception):
53+
4754
def __init__(self, value):
4855
self.value = value
56+
4957
def __str__(self):
5058
return repr(self.value)
5159

60+
5261
class ParserError(Exception):
62+
5363
def __init__(self, value):
5464
self.value = value
65+
5566
def __str__(self):
5667
return repr(self.value)
5768

69+
5870
class TimeoutError(Exception):
71+
5972
def __init__(self, value):
6073
self.value = value
74+
6175
def __str__(self):
6276
return repr(self.value)
6377

6478

65-
def init_corenlp_command(corenlp_path, memory):
79+
def init_corenlp_command(corenlp_path, memory, properties):
6680
"""
6781
Checks the location of the jar files.
6882
Spawns the server as a process.
6983
"""
7084

71-
7285
# TODO: Can edit jar constants
7386
jars = ["stanford-corenlp-3.2.0.jar",
7487
"stanford-corenlp-3.2.0-models.jar",
@@ -80,17 +93,16 @@ def init_corenlp_command(corenlp_path, memory):
8093
classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
8194
# include the properties file, so you can change defaults
8295
# but any changes in output format will break parse_parser_results()
83-
property_name = "default.properties"
84-
current_dir_pr = os.path.dirname(os.path.abspath( __file__ )) +"/"+ property_name
85-
if os.path.exists(property_name):
86-
props = "-props %s" % (property_name)
96+
current_dir_pr = os.path.dirname(os.path.abspath(__file__)) + "/" + properties
97+
if os.path.exists(properties):
98+
props = "-props %s" % (properties)
8799
elif os.path.exists(current_dir_pr):
88100
props = "-props %s" % (current_dir_pr)
89101
else:
90-
raise Exception("Error! Cannot locate: default.properties")
102+
raise Exception("Error! Cannot locate: " % properties)
91103

92104
# add and check classpaths
93-
jars = [corenlp_path +"/"+ jar for jar in jars]
105+
jars = [corenlp_path + "/" + jar for jar in jars]
94106
for jar in jars:
95107
if not os.path.exists(jar):
96108
raise Exception("Error! Cannot locate: %s" % jar)
@@ -143,7 +155,7 @@ def parse_parser_results(text):
143155
line = line.strip()
144156

145157
if line.startswith("Sentence #"):
146-
sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
158+
sentence = {'words': [], 'parsetree': [], 'dependencies': []}
147159
results["sentences"].append(sentence)
148160
state = STATE_TEXT
149161

@@ -172,7 +184,7 @@ def parse_parser_results(text):
172184
split_entry = re.split("\(|, ", line[:-1])
173185
if len(split_entry) == 3:
174186
rel, left, right = map(lambda x: remove_id(x), split_entry)
175-
sentence['dependencies'].append(tuple([rel,left,right]))
187+
sentence['dependencies'].append(tuple([rel, left, right]))
176188

177189
elif state == STATE_COREFERENCE:
178190
if "Coreference set" in line:
@@ -182,12 +194,13 @@ def parse_parser_results(text):
182194
results['coref'].append(coref_set)
183195
else:
184196
for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line):
185-
src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
186-
sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
197+
src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1
198+
sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1
187199
coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
188200

189201
return results
190202

203+
191204
def parse_parser_xml_results(xml, file_name="", raw_output=False):
192205
import xmltodict
193206
from collections import OrderedDict
@@ -214,10 +227,10 @@ def extract_words_from_xml(sent_node):
214227
raw_coref_list = document[u'coreference'][u'coreference']
215228

216229
# Decrement each index by one: CoreNLP indices are 1-based, list indices are 0-based
217-
coref_index = [[[int(raw_coref_list[j][u'mention'][i]['sentence'])-1,
218-
int(raw_coref_list[j][u'mention'][i]['head'])-1,
219-
int(raw_coref_list[j][u'mention'][i]['start'])-1,
220-
int(raw_coref_list[j][u'mention'][i]['end'])-1]
230+
coref_index = [[[int(raw_coref_list[j][u'mention'][i]['sentence']) - 1,
231+
int(raw_coref_list[j][u'mention'][i]['head']) - 1,
232+
int(raw_coref_list[j][u'mention'][i]['start']) - 1,
233+
int(raw_coref_list[j][u'mention'][i]['end']) - 1]
221234
for i in xrange(len(raw_coref_list[j][u'mention']))]
222235
for j in xrange(len(raw_coref_list))]
223236

@@ -230,7 +243,7 @@ def extract_words_from_xml(sent_node):
230243
coref_list[j][k].insert(0, ' '.join(exted_words))
231244

232245
coref_list = [[[coref_list[j][i], coref_list[j][0]]
233-
for i in xrange(len(coref_list[j])) if i != 0]
246+
for i in xrange(len(coref_list[j])) if i != 0]
234247
for j in xrange(len(coref_list))]
235248
else:
236249
coref_flag = False
@@ -243,7 +256,7 @@ def extract_words_from_xml(sent_node):
243256
dep['dep'][i]['dependent']['#text']]
244257
for dep in raw_sent_list[j][u'dependencies']
245258
for i in xrange(len(dep['dep']))
246-
if dep['@type']=='basic-dependencies'],
259+
if dep['@type'] == 'basic-dependencies'],
247260
'text': extract_words_from_xml(raw_sent_list[j]),
248261
'parsetree': str(raw_sent_list[j]['parse']),
249262
'words': [[str(token['word']), OrderedDict([
@@ -252,12 +265,12 @@ def extract_words_from_xml(sent_node):
252265
('CharacterOffsetBegin', str(token['CharacterOffsetBegin'])),
253266
('PartOfSpeech', str(token['POS'])),
254267
('Lemma', str(token['lemma']))])]
255-
for token in raw_sent_list[j][u'tokens'][u'token']]}
268+
for token in raw_sent_list[j][u'tokens'][u'token']]}
256269

257-
for j in xrange(len(raw_sent_list)) ]
270+
for j in xrange(len(raw_sent_list))]
258271

259272
if coref_flag:
260-
results = {'coref':coref_list, 'sentences':sentences}
273+
results = {'coref': coref_list, 'sentences': sentences}
261274
else:
262275
results = {'sentences': sentences}
263276

@@ -266,7 +279,8 @@ def extract_words_from_xml(sent_node):
266279

267280
return results
268281

269-
def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=False):
282+
283+
def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=False, properties='default.properties'):
270284
"""Because interaction with the command-line interface of the CoreNLP
271285
tools is limited to very short text bits, it is necessary to parse xml
272286
output"""
@@ -278,15 +292,15 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
278292

279293
#we get a list of the cleaned files that we want to parse:
280294

281-
files = [input_dir+'/'+f for f in os.listdir(input_dir)]
295+
files = [input_dir + '/' + f for f in os.listdir(input_dir)]
282296

283297
#creating the file list of files to parse
284298

285299
file_list.write('\n'.join(files))
286300
file_list.seek(0)
287301

288-
command = init_corenlp_command(corenlp_path, memory)\
289-
+ ' -filelist %s -outputDirectory %s' % (file_list.name, xml_dir)
302+
command = init_corenlp_command(corenlp_path, memory, properties)\
303+
+ ' -filelist %s -outputDirectory %s' % (file_list.name, xml_dir)
290304

291305
#creates the xml file of parser output:
292306

@@ -296,7 +310,7 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
296310
# result = []
297311
try:
298312
for output_file in os.listdir(xml_dir):
299-
with open(xml_dir+'/'+output_file, 'r') as xml:
313+
with open(xml_dir + '/' + output_file, 'r') as xml:
300314
# parsed = xml.read()
301315
file_name = re.sub('.xml$', '', os.path.basename(output_file))
302316
# result.append(parse_parser_xml_results(xml.read(), file_name,
@@ -308,38 +322,42 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
308322
shutil.rmtree(xml_dir)
309323
# return result
310324

325+
311326
class StanfordCoreNLP:
327+
312328
"""
313329
Command-line interaction with Stanford's CoreNLP java utilities.
314330
Can be run as a JSON-RPC server or imported as a module.
315331
"""
316-
def __init__(self, corenlp_path=DIRECTORY, memory="3g"):
332+
333+
def __init__(self, corenlp_path=DIRECTORY, memory="3g", properties='default.properties'):
317334
"""
318335
Checks the location of the jar files.
319336
Spawns the server as a process.
320337
"""
321338

322339
# spawn the server
323-
start_corenlp = init_corenlp_command(corenlp_path, memory)
324-
if VERBOSE: print start_corenlp
340+
start_corenlp = init_corenlp_command(corenlp_path, memory, properties)
341+
if VERBOSE:
342+
print start_corenlp
325343
self.corenlp = pexpect.spawn(start_corenlp)
326344

327345
# show progress bar while loading the models
328346
if VERBOSE:
329347
widgets = ['Loading Models: ', Fraction()]
330348
pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
331-
self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec)
332-
if VERBOSE: pbar.update(1)
333-
self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec)
334-
if VERBOSE: pbar.update(2)
335-
self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec)
336-
if VERBOSE: pbar.update(3)
337-
self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec)
338-
if VERBOSE: pbar.update(4)
339-
self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec)
340-
if VERBOSE: pbar.update(5)
341-
self.corenlp.expect("Entering interactive shell.")
342-
if VERBOSE: pbar.finish()
349+
# Model timeouts:
350+
# pos tagger model (~5sec)
351+
# NER-all classifier (~33sec)
352+
# NER-muc classifier (~60sec)
353+
# CoNLL classifier (~50sec)
354+
# PCFG (~3sec)
355+
timeouts = [20, 200, 600, 600, 20]
356+
for i in xrange(5):
357+
self.corenlp.expect("done.", timeout=timeouts[i]) # Load model
358+
pbar.update(i + 1)
359+
self.corenlp.expect("Entering interactive shell.")
360+
pbar.finish()
343361

344362
# interactive shell
345363
self.corenlp.expect("\nNLP> ", timeout=3)
@@ -373,7 +391,7 @@ def _parse(self, text):
373391
def clean_up():
374392
while True:
375393
try:
376-
self.corenlp.read_nonblocking (8192, 0.1)
394+
self.corenlp.read_nonblocking(8192, 0.1)
377395
except pexpect.TIMEOUT:
378396
break
379397
clean_up()
@@ -412,11 +430,13 @@ def clean_up():
412430
'output': incoming}
413431
return
414432

415-
if VERBOSE: print "%s\n%s" % ('='*40, incoming)
433+
if VERBOSE:
434+
print "%s\n%s" % ('=' * 40, incoming)
416435
try:
417436
results = parse_parser_results(incoming)
418-
except Exception, e:
419-
if VERBOSE: print traceback.format_exc()
437+
except Exception as e:
438+
if VERBOSE:
439+
print traceback.format_exc()
420440
raise e
421441

422442
return results
@@ -459,21 +479,25 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
459479
The code below starts an JSONRPC server
460480
"""
461481
from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer
462-
VERBOSE = True
463482
parser = optparse.OptionParser(usage="%prog [OPTIONS]")
464483
parser.add_option('-p', '--port', default='8080',
465484
help='Port to serve on (default 8080)')
466485
parser.add_option('-H', '--host', default='127.0.0.1',
467486
help='Host to serve on (default localhost; 0.0.0.0 to make public)')
487+
parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose',
488+
help="Quiet mode, don't print status msgs to stdout")
468489
parser.add_option('-S', '--corenlp', default=DIRECTORY,
469490
help='Stanford CoreNLP tool directory (default %s)' % DIRECTORY)
491+
parser.add_option('-P', '--properties', default='default.properties',
492+
help='Stanford CoreNLP properties file (default: default.properties)')
470493
options, args = parser.parse_args()
494+
VERBOSE = options.verbose
471495
# server = jsonrpc.Server(jsonrpc.JsonRpc20(),
472496
# jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
473497
try:
474498
server = SimpleJSONRPCServer((options.host, int(options.port)))
475499

476-
nlp = StanfordCoreNLP(options.corenlp)
500+
nlp = StanfordCoreNLP(options.corenlp, properties=options.properties)
477501
server.register_function(nlp.parse)
478502

479503
print 'Serving on http://%s:%s' % (options.host, options.port)

0 commit comments

Comments
 (0)
0