Updated to load the Stanford CoreNLP 3.4 library

zigeuner · zigeuner · commit 4fb60de7d2fa · 2014-07-25T10:22:36.000-07:00
Minor changes to include sentiment analysis in the runs.
   NOTE: sentiment analysis is currently not available through the server; it is only produced via batch XML parsing.
diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py
@@ -33,6 +33,7 @@
 from subprocess import call
 
 VERBOSE = False
+#VERBOSE = True
 STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
 WORD_PATTERN = re.compile('\[([^\]]+)\]')
 CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
@@ -92,10 +93,13 @@ def init_corenlp_command(corenlp_path, memory, properties):
     """
 
     # TODO: Can edit jar constants
-    jars = ["stanford-corenlp-3.2.0.jar",
-            "stanford-corenlp-3.2.0-models.jar",
+#    jars = ["stanford-corenlp-3.2.0.jar",
+#            "stanford-corenlp-3.2.0-models.jar",
+    jars = ["stanford-corenlp-3.4.jar",
+            "stanford-corenlp-3.4-models.jar",            
             "xom.jar",
             "joda-time.jar",
+            "ejml-0.23.jar",
             "jollyday.jar"
             ]
 
@@ -241,6 +245,8 @@ def extract_words_from_xml(sent_node):
                                    if 'dep' in dep
                                    for i in xrange(len(dep['dep']))
                                    if dep['@type'] == 'collapsed-ccprocessed-dependencies'],
+                  'sentimentValue': str(raw_sent_list[j]['@sentimentValue']),
+                  'sentiment': str(raw_sent_list[j]['@sentiment']),
                   'text': extract_words_from_xml(raw_sent_list[j]),
                   'parsetree': str(raw_sent_list[j]['parse']),
                   'words': [[str(token['word']), OrderedDict([
@@ -261,7 +267,8 @@ def extract_words_from_xml(sent_node):
     return results
 
 
-def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=False, properties='default.properties'):
+def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g",
+                     raw_output=False, properties='default.properties'):
     """Because interaction with the command-line interface of the CoreNLP
     tools is limited to very short text bits, it is necessary to parse xml
     output"""
@@ -451,7 +458,8 @@ def parse(self, text):
         return json.dumps(self.raw_parse(text))
 
 
-def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=False):
+def batch_parse(input_folder, corenlp_path=DIRECTORY, properties='default.properties', raw_output=False, memory="3g"):
+                
     """
     This function takes input files,
     sends list of input files to the Stanford parser,
@@ -464,7 +472,8 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
     if not os.path.exists(input_folder):
         raise Exception("input_folder does not exist")
 
-    return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output)
+    return parse_xml_output(input_folder, corenlp_path, memory,
+                            raw_output=raw_output, properties=properties)
 
 
 if __name__ == '__main__':
diff --git a/corenlp/default.properties b/corenlp/default.properties
@@ -1,4 +1,4 @@
-annotators = tokenize, ssplit, pos, lemma, parse
+annotators = tokenize, ssplit, pos, lemma, parse, sentiment
 
 # A true-casing annotator is also available (see below)
 #annotators = tokenize, ssplit, pos, lemma, truecase

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-annotators = tokenize, ssplit, pos, lemma, parse`
	`1`	`+annotators = tokenize, ssplit, pos, lemma, parse, sentiment`
`2`	`2`
`3`	`3`	`# A true-casing annotator is also available (see below)`
`4`	`4`	`#annotators = tokenize, ssplit, pos, lemma, truecase`