33
33
from subprocess import call
34
34
35
35
# Module-level verbosity switch, off by default; the code that reads it is outside this excerpt.
VERBOSE = False
36
+ #VERBOSE = True
36
37
# State identifiers: six consecutive integers, starting at 0, in the order listed.
(STATE_START, STATE_TEXT, STATE_WORDS,
 STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE) = range(6)
37
38
# Captures the text between one pair of square brackets, e.g. "[a b]" -> "a b".
# Written as a raw string so the regex backslashes are not read as (invalid)
# string escape sequences; the compiled pattern itself is unchanged.
WORD_PATTERN = re.compile(r'\[([^\]]+)\]')
38
39
# Matches one coreference line of the form
#   (a,b,[c,d)) -> (e,f,[g,h)), that is: "source text" -> "sink text"
# and yields ten groups: the eight numeric fields followed by the two quoted texts.
# Every numeric field is captured whole with (\d*); the former (\d)* spelling on
# the 2nd and 6th fields repeated a one-character group and therefore kept only
# the last digit of a multi-digit number.
CR_PATTERN = re.compile(r"\((\d*),(\d*),\[(\d*),(\d*)\)\) -> \((\d*),(\d*),\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
@@ -92,10 +93,13 @@ def init_corenlp_command(corenlp_path, memory, properties):
92
93
"""
93
94
94
95
# TODO: Can edit jar constants
95
- jars = ["stanford-corenlp-3.2.0.jar" ,
96
- "stanford-corenlp-3.2.0-models.jar" ,
96
+ # jars = ["stanford-corenlp-3.2.0.jar",
97
+ # "stanford-corenlp-3.2.0-models.jar",
98
+ jars = ["stanford-corenlp-3.4.jar" ,
99
+ "stanford-corenlp-3.4-models.jar" ,
97
100
"xom.jar" ,
98
101
"joda-time.jar" ,
102
+ "ejml-0.23.jar" ,
99
103
"jollyday.jar"
100
104
]
101
105
@@ -241,6 +245,8 @@ def extract_words_from_xml(sent_node):
241
245
if 'dep' in dep
242
246
for i in xrange (len (dep ['dep' ]))
243
247
if dep ['@type' ] == 'collapsed-ccprocessed-dependencies' ],
248
+ 'sentimentValue' : str (raw_sent_list [j ]['@sentimentValue' ]),
249
+ 'sentiment' : str (raw_sent_list [j ]['@sentiment' ]),
244
250
'text' : extract_words_from_xml (raw_sent_list [j ]),
245
251
'parsetree' : str (raw_sent_list [j ]['parse' ]),
246
252
'words' : [[str (token ['word' ]), OrderedDict ([
@@ -261,7 +267,8 @@ def extract_words_from_xml(sent_node):
261
267
return results
262
268
263
269
264
- def parse_xml_output (input_dir , corenlp_path = DIRECTORY , memory = "3g" , raw_output = False , properties = 'default.properties' ):
270
+ def parse_xml_output (input_dir , corenlp_path = DIRECTORY , memory = "3g" ,
271
+ raw_output = False , properties = 'default.properties' ):
265
272
"""Because interaction with the command-line interface of the CoreNLP
266
273
tools is limited to very short text bits, it is necessary to parse xml
267
274
output"""
@@ -451,7 +458,8 @@ def parse(self, text):
451
458
return json .dumps (self .raw_parse (text ))
452
459
453
460
454
- def batch_parse (input_folder , corenlp_path = DIRECTORY , memory = "3g" , raw_output = False ):
461
+ def batch_parse (input_folder , corenlp_path = DIRECTORY , properties = 'default.properties' , raw_output = False , memory = "3g" ):
462
+
455
463
"""
456
464
This function takes input files,
457
465
sends list of input files to the Stanford parser,
@@ -464,7 +472,8 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
464
472
if not os .path .exists (input_folder ):
465
473
raise Exception ("input_folder does not exist" )
466
474
467
- return parse_xml_output (input_folder , corenlp_path , memory , raw_output = raw_output )
475
+ return parse_xml_output (input_folder , corenlp_path , memory ,
476
+ raw_output = raw_output , properties = properties )
468
477
469
478
470
479
if __name__ == '__main__' :
0 commit comments