8000 updated to load 3.4 stanford nlp library · cytora/stanford-corenlp-python@4fb60de · GitHub
[go: up one dir, main page]

Skip to content

Commit 4fb60de

Browse files
committed
updated to load 3.4 stanford nlp library
Minor changes to include sentiment analysis in the runs. NOTE: sentiment analysis does not currently come through the server, only via batch XML parsing.
1 parent c17d656 commit 4fb60de

File tree

2 files changed

+15
-6
lines changed

2 files changed

+15
-6
lines changed

corenlp/corenlp.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from subprocess import call
3434

3535
VERBOSE = False
36+
#VERBOSE = True
3637
STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
3738
WORD_PATTERN = re.compile('\[([^\]]+)\]')
3839
CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
@@ -92,10 +93,13 @@ def init_corenlp_command(corenlp_path, memory, properties):
9293
"""
9394

9495
# TODO: Can edit jar constants
95-
jars = ["stanford-corenlp-3.2.0.jar",
96-
"stanford-corenlp-3.2.0-models.jar",
96+
# jars = ["stanford-corenlp-3.2.0.jar",
97+
# "stanford-corenlp-3.2.0-models.jar",
98+
jars = ["stanford-corenlp-3.4.jar",
99+
"stanford-corenlp-3.4-models.jar",
97100
"xom.jar",
98101
"joda-time.jar",
102+
"ejml-0.23.jar",
99103
"jollyday.jar"
100104
]
101105

@@ -241,6 +245,8 @@ def extract_words_from_xml(sent_node):
241245
if 'dep' in dep
242246
for i in xrange(len(dep['dep']))
243247
if dep['@type'] == 'collapsed-ccprocessed-dependencies'],
248+
'sentimentValue': str(raw_sent_list[j]['@sentimentValue']),
249+
'sentiment': str(raw_sent_list[j]['@sentiment']),
244250
'text': extract_words_from_xml(raw_sent_list[j]),
245251
'parsetree': str(raw_sent_list[j]['parse']),
246252
'words': [[str(token['word']), OrderedDict([
@@ -261,7 +267,8 @@ def extract_words_from_xml(sent_node):
261267
return results
262268

263269

264-
def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=False, properties='default.properties'):
270+
def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g",
271+
raw_output=False, properties='default.properties'):
265272
"""Because interaction with the command-line interface of the CoreNLP
266273
tools is limited to very short text bits, it is necessary to parse xml
267274
output"""
@@ -451,7 +458,8 @@ def parse(self, text):
451458
return json.dumps(self.raw_parse(text))
452459

453460

454-
def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=False):
461+
def batch_parse(input_folder, corenlp_path=DIRECTORY, properties='default.properties', raw_output=False, memory="3g"):
462+
455463
"""
456464
This function takes input files,
457465
sends list of input files to the Stanford parser,
@@ -464,7 +472,8 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
464472
if not os.path.exists(input_folder):
465473
raise Exception("input_folder does not exist")
466474

467-
return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output)
475+
return parse_xml_output(input_folder, corenlp_path, memory,
476+
raw_output=raw_output, properties=properties)
468477

469478

470479
if __name__ == '__main__':

corenlp/default.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
annotators = tokenize, ssplit, pos, lemma, parse
1+
annotators = tokenize, ssplit, pos, lemma, parse, sentiment
22

33
# A true-casing annotator is also available (see below)
44
#annotators = tokenize, ssplit, pos, lemma, truecase

0 commit comments

Comments (0)
0