Merged in andrewyates/corenlp-python (pull request #1) · kowey/stanford-corenlp-python@ab2b8a5 · GitHub
[go: up one dir, main page]

Skip to content

Commit ab2b8a5

Browse files
committed
Merged in andrewyates/corenlp-python (pull request relwell#1)
add raw_output option to return CoreNLP's XML as a dictionary without converting the format
2 parents 44f97db + 966f6c1 commit ab2b8a5

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

corenlp/corenlp.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def parse_parser_results(text):
188188

189189
return results
190190

191-
def parse_parser_xml_results(xml, file_name=""):
191+
def parse_parser_xml_results(xml, file_name="", raw_output=False):
192192
import xmltodict
193193
from collections import OrderedDict
194194

@@ -198,6 +198,9 @@ def extract_words_from_xml(sent_node):
198198

199199
# Turning the raw xml into a raw python dictionary:
200200
raw_dict = xmltodict.parse(xml)
201+
if raw_output:
202+
return raw_dict
203+
201204
document = raw_dict[u'root'][u'document']
202205

203206
# Making a raw sentence list of dictionaries:
@@ -263,7 +266,7 @@ def extract_words_from_xml(sent_node):
263266

264267
return results
265268

266-
def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g"):
269+
def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=False):
267270
"""Because interaction with the command-line interface of the CoreNLP
268271
tools is limited to very short text bits, it is necessary to parse xml
269272
output"""
@@ -296,7 +299,8 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g"):
296299
with open(xml_dir+'/'+output_file, 'r') as xml:
297300
# parsed = xml.read()
298301
file_name = re.sub('.xml$', '', os.path.basename(output_file))
299-
result.append(parse_parser_xml_results(xml.read(), file_name))
302+
result.append(parse_parser_xml_results(xml.read(), file_name,
303+
raw_output=raw_output))
300304
finally:
301305
file_list.close()
302306
shutil.rmtree(xml_dir)
@@ -432,19 +436,20 @@ def parse(self, text):
432436
return json.dumps(self.raw_parse(text))
433437

434438

435-
def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g"):
439+
def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=False):
436440
"""
437441
This function takes input files,
438442
sends list of input files to the Stanford parser,
439443
reads in the results from temporary folder in your OS and
440444
returns a generator object of list that consist of dictionary entry.
445+
If raw_output is true, the dictionary returned will correspond exactly to XML.
441446
( The function needs xmltodict,
442447
and doesn't need init 'StanfordCoreNLP' class. )
443448
"""
444449
if not os.path.exists(input_folder):
445450
raise Exception("Not exist input_folder")
446451

447-
return parse_xml_output(input_folder, corenlp_path, memory)
452+
return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output)
448453

449454

450455
if __name__ == '__main__':

0 commit comments

Comments
 (0)
0