@@ -188,7 +188,7 @@ def parse_parser_results(text):
188
188
189
189
return results
190
190
191
- def parse_parser_xml_results (xml , file_name = "" ):
191
+ def parse_parser_xml_results (xml , file_name = "" , raw_output = False ):
192
192
import xmltodict
193
193
from collections import OrderedDict
194
194
@@ -198,6 +198,9 @@ def extract_words_from_xml(sent_node):
198
198
199
199
# Turning the raw xml into a raw python dictionary:
200
200
raw_dict = xmltodict .parse (xml )
201
+ if raw_output :
202
+ return raw_dict
203
+
201
204
document = raw_dict [u'root' ][u'document' ]
202
205
203
206
# Making a raw sentence list of dictionaries:
@@ -263,7 +266,7 @@ def extract_words_from_xml(sent_node):
263
266
264
267
return results
265
268
266
- def parse_xml_output (input_dir , corenlp_path = DIRECTORY , memory = "3g" ):
269
+ def parse_xml_output (input_dir , corenlp_path = DIRECTORY , memory = "3g" , raw_output = False ):
267
270
"""Because interaction with the command-line interface of the CoreNLP
268
271
tools is limited to very short text bits, it is necessary to parse xml
269
272
output"""
@@ -296,7 +299,8 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g"):
296
299
with open (xml_dir + '/' + output_file , 'r' ) as xml :
297
300
# parsed = xml.read()
298
301
file_name = re .sub ('.xml$' , '' , os .path .basename (output_file ))
299
- result .append (parse_parser_xml_results (xml .read (), file_name ))
302
+ result .append (parse_parser_xml_results (xml .read (), file_name ,
303
+ raw_output = raw_output ))
300
304
finally :
301
305
file_list .close ()
302
306
shutil .rmtree (xml_dir )
@@ -432,19 +436,20 @@ def parse(self, text):
432
436
return json .dumps (self .raw_parse (text ))
433
437
434
438
435
- def batch_parse (input_folder , corenlp_path = DIRECTORY , memory = "3g" ):
439
+ def batch_parse (input_folder , corenlp_path = DIRECTORY , memory = "3g" , raw_output = False ):
436
440
"""
437
441
This function takes input files,
438
442
sends list of input files to the Stanford parser,
439
443
reads in the results from temporary folder in your OS and
440
444
returns a generator object of a list that consists of dictionary entries.
445
+ If raw_output is True, each returned dictionary is the unprocessed xmltodict parse of the corresponding XML output file.
441
446
( The function needs xmltodict,
442
447
and doesn't need init 'StanfordCoreNLP' class. )
443
448
"""
444
449
if not os .path .exists (input_folder ):
445
450
raise Exception ("Not exist input_folder" )
446
451
447
- return parse_xml_output (input_folder , corenlp_path , memory )
452
+ return parse_xml_output (input_folder , corenlp_path , memory , raw_output = raw_output )
448
453
449
454
450
455
if __name__ == '__main__' :
0 commit comments