Added setup.py script by abendebury · Pull Request #1 · silverasm/stanford-corenlp-python · GitHub

Open · wants to merge 29 commits into base: master

Commits (29):
9c99ab0 Added setup.py script (abendebury, May 11, 2014)
c17d656 Updated README, updated setup.py with newer info (abendebury, Jul 16, 2014)
e9f5973 Removed the remove_id method, let's see what happens (abendebury, Jul 27, 2014)
8593c0a Removed remove_id method altogether. (abendebury, Jul 28, 2014)
b664532 Fixed bug which caused weird issues with quotes in numeric values. (abendebury, Jul 30, 2014)
b4b9348 Parser shouldn't eat equals signs anymore (abendebury, Aug 2, 2014)
aec4b35 Removed debug code (abendebury, Aug 2, 2014)
3e47fbc Added support for winpexpect, hopefully not to hacky (abendebury, Aug 6, 2014)
deb7db6 Updated version (abendebury, Aug 8, 2014)
7746e82 Disabled PBT3 escaping. (abendebury, Aug 9, 2014)
0d6394a Pushed version 3.3.4 to pypi. (abendebury, Aug 9, 2014)
0f2f99b Revert "Disabled PBT3 escaping." (abendebury, Aug 10, 2014)
beb06d1 Reverted and uploaded. (abendebury, Aug 10, 2014)
8f0450f Replace PTB3 escapes with characters in the parsing. (abendebury, Aug 10, 2014)
1ec5bb7 Script should now use winpexpect if necessary. (abendebury, Aug 12, 2014)
0dfbf2c Incremented version. (abendebury, Aug 13, 2014)
5be6d4b Meged in upstream (abendebury, Aug 13, 2014)
b73ddaf Removed debug code (abendebury, Aug 13, 2014)
c68fd4f Fixed commit which made the script think that windows was mac. (abendebury, Aug 13, 2014)
e1d7c56 Now with pep 396! (abendebury, Aug 13, 2014)
84eed35 Added support windows (jannah, Sep 13, 2014)
e4126f5 Generalized jar loading (abendebury, Sep 13, 2014)
847367f Incremented version counter (abendebury, Sep 16, 2014)
754c062 Fixed errors causing masks to fail. (abendebury, Sep 16, 2014)
e72ce40 handle different text output from depparse annotator (macfarlandian, Jun 13, 2015)
9bd284e update README (macfarlandian, Jun 13, 2015)
34ed4b6 increment version (macfarlandian, Jun 13, 2015)
6030814 readme update (macfarlandian, Jun 13, 2015)
eb071de Merge pull request #4 from Wordseer/update-to-coreNLP-3.5.2 (macfarlandian, Jun 13, 2015)

20 changes: 7 additions & 13 deletions README.md
@@ -1,11 +1,11 @@
# A Python wrapper for the Java Stanford Core NLP tools
---------------------------

This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either be used as a Python package or run as a JSON-RPC server.
This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either be used as a Python package or run as a JSON-RPC server.

## Edited
* Tested only with the current annotator configuration: not a general-purpose wrapper
* Update to Stanford CoreNLP v3.5.2
* Added multi-threaded load balancing
* Update to Stanford CoreNLP v3.2.0
* Fix many bugs & improve performance
* Using jsonrpclib for stability and performance
* Constants such as the Stanford CoreNLP directory can be edited via arguments
@@ -22,15 +22,6 @@ This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/da

To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the zip file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run.


In other words:

sudo pip install pexpect unidecode jsonrpclib # jsonrpclib is optional
git clone https://bitbucket.org/torotoki/corenlp-python.git
cd corenlp-python
wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-06-20.zip
unzip stanford-corenlp-full-2013-06-20.zip

Then, to launch a server:

python corenlp/corenlp.py
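
Once the server is running, any JSON-RPC client can talk to it. A minimal sketch, assuming this fork keeps upstream's default port 8080 and its `parse` method (verify both against `corenlp.py`):

    import json
    import jsonrpclib  # sudo pip install jsonrpclib

    # Connect to the server started above; 8080 is upstream's default port.
    server = jsonrpclib.Server("http://localhost:8080")

    # parse() returns a JSON string describing each sentence's text,
    # words, and dependencies.
    result = json.loads(server.parse("Hello world. It is so beautiful."))
    print(result["sentences"][0]["dependencies"])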
@@ -159,8 +150,11 @@ The function uses XML output feature of Stanford CoreNLP, and you can take all i

(note: the function now requires xmltodict; install it with `sudo pip install xmltodict`)

## Developer
## Developers
* Hiroyoshi Komatsu [hiroyoshi.komat@gmail.com]
* Johannes Castner [jac2130@columbia.edu]
* Robert Elwell [robert@wikia-inc.com]
* Tristan Chong [tristan@wikia-inc.com]
* Aditi Muralidharan [aditi.shrikumar@gmail.com]
* Ian MacFarland [ianmacfarland@ischool.berkeley.edu]
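
For package-style use (the alternative to the JSON-RPC server mentioned at the top of this README), something along these lines should work; it assumes `parse()` returns a JSON string as in Dustin Smith's upstream, and that the unpacked CoreNLP folder sits where `corenlp.py` expects it:

    import json
    from corenlp import StanfordCoreNLP

    # corenlp_path, memory and properties mirror the constructor defaults
    # visible in corenlp.py below; the folder name assumes the 3.5.2 release.
    parser = StanfordCoreNLP(corenlp_path="stanford-corenlp-full-2015-04-20")
    result = json.loads(parser.parse("The quick brown fox jumped over the lazy dog."))
    for rel, left, leftindex, right, rightindex in result["sentences"][0]["dependencies"]:
        print("%s(%s-%s, %s-%s)" % (rel, left, leftindex, right, rightindex))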

100 changes: 61 additions & 39 deletions corenlp/corenlp.py
file mode changed: 100755 → 100644
@@ -18,7 +18,6 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


import json
import optparse
import os
@@ -28,9 +27,18 @@
import pexpect
import tempfile
import shutil
import re
from progressbar import ProgressBar, Fraction
from unidecode import unidecode
from subprocess import call
import glob

use_winpexpect = True

try:
import winpexpect
except ImportError:
use_winpexpect = False

VERBOSE = False
STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
@@ -92,31 +100,21 @@ def init_corenlp_command(corenlp_path, memory, properties):
"""

# TODO: Can edit jar constants
jars = ["stanford-corenlp-3.2.0.jar",
"stanford-corenlp-3.2.0-models.jar",
"xom.jar",
"joda-time.jar",
"jollyday.jar"
]
jar_mask = "*.jar"
jars = glob.glob(os.path.join(corenlp_path, jar_mask))

java_path = "java"
classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
# include the properties file, so you can change defaults
# but any changes in output format will break parse_parser_results()
current_dir_pr = os.path.dirname(os.path.abspath(__file__)) + "/" + properties
current_dir_pr = os.path.join(os.path.dirname(os.path.abspath(__file__)), properties)
if os.path.exists(properties):
props = "-props %s" % (properties.replace(" ", "\\ "))
elif os.path.exists(current_dir_pr):
props = "-props %s" % (current_dir_pr.replace(" ", "\\ "))
else:
raise Exception("Error! Cannot locate: %s" % properties)

# add and check classpaths
jars = [corenlp_path + "/" + jar for jar in jars]
for jar in jars:
if not os.path.exists(jar):
raise Exception("Error! Cannot locate: %s" % jar)

# add memory limit on JVM
if memory:
limit = "-Xmx%s" % memory
@@ -125,12 +123,6 @@ def init_corenlp_command(corenlp_path, memory, properties):

return "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(jars), classname, props)


def remove_id(word):
"""Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word' """
return word.replace("'", "")


def parse_bracketed(s):
'''Parse word features [abc=... def = ...]
Also manages to parse out features that have XML within them
@@ -143,13 +135,13 @@ def parse_bracketed(s):
temp["^^^%d^^^" % i] = tag
s = s.replace(tag, "^^^%d^^^" % i)
# Load key-value pairs, substituting as necessary
for attr, val in re.findall(r"([^=\s]*)=([^=\s]*)", s):
for attr, val in re.findall(r"([^=\s]*)=([^\s]*)", s):
if val in temp:
val = temp[val]
val = remove_escapes(temp[val])
if attr == 'Text':
word = val
word = remove_escapes(val)
else:
attrs[attr] = val
attrs[attr] = remove_escapes(val)
return (word, attrs)


@@ -161,7 +153,8 @@ def parse_parser_results(text):
"""
results = {"sentences": []}
state = STATE_START
for line in unidecode(text.decode('utf-8')).split("\n"):
lines = unidecode(text.decode('utf-8')).split("\n")
for index, line in enumerate(lines):
line = line.strip()

if line.startswith("Sentence #"):
@@ -170,31 +163,31 @@
state = STATE_TEXT

elif state == STATE_TEXT:
sentence['text'] = line
sentence['text'] = remove_escapes(line)
state = STATE_WORDS

elif state == STATE_WORDS:
if not line.startswith("[Text="):
raise ParserError('Parse error. Could not find "[Text=" in: %s' % line)
for s in WORD_PATTERN.findall(line):
sentence['words'].append(parse_bracketed(s))
state = STATE_TREE

elif state == STATE_TREE:
if len(line) == 0:
if not lines[index + 1].startswith("[Text="):
state = STATE_DEPENDENCY
sentence['parsetree'] = " ".join(sentence['parsetree'])
else:
sentence['parsetree'].append(line)
# skipping TREE because the new depparse annotator doesn't make a parse tree


elif state == STATE_DEPENDENCY:
if len(line) == 0:
state = STATE_COREFERENCE
else:
split_entry = re.split("\(|, |-", line[:-1])
if len(split_entry) == 5:
rel, left, leftindex, right, rightindex = map(lambda x: remove_id(x), split_entry)
sentence['dependencies'].append(tuple([rel, left, leftindex, right, rightindex]))
rel, left, leftindex, right, rightindex = split_entry
leftindex = re.sub("[^0-9]", "", leftindex)
rightindex = re.sub("[^0-9]", "", rightindex)
sentence['dependencies'].append(tuple([rel,
remove_escapes(left), leftindex, remove_escapes(right),
rightindex]))

elif state == STATE_COREFERENCE:
if "Coreference set" in line:
@@ -273,7 +266,7 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=

#we get a list of the cleaned files that we want to parse:

files = [input_dir + '/' + f for f in os.listdir(input_dir) if f.endswith(".txt")]
files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith(".txt")]

#creating the file list of files to parse

@@ -291,7 +284,7 @@
# result = []
try:
for output_file in os.listdir(xml_dir):
with open(xml_dir + '/' + output_file, 'r') as xml:
with open(os.path.join(xml_dir, output_file), 'r') as xml:
# parsed = xml.read()
file_name = re.sub('.xml$', '', os.path.basename(output_file))
# result.append(parse_parser_xml_results(xml.read(), file_name,
@@ -314,7 +307,12 @@ class StanfordCoreNLP:
def _spawn_corenlp(self):
if VERBOSE:
print self.start_corenlp
self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192, searchwindowsize=80)
if use_winpexpect:
self.corenlp = winpexpect.winspawn(self.start_corenlp, maxread=8192,
searchwindowsize=80)
else:
self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192,
searchwindowsize=80)

# show progress bar while loading the models
if VERBOSE:
@@ -348,7 +346,12 @@ def __init__(self, corenlp_path=DIRECTORY, memory="3g", properties='default.prop
self._spawn_corenlp()

def close(self, force=True):
self.corenlp.terminate(force)
global use_winpexpect
if use_winpexpect:
self.corenlp.terminate()
else:
self.corenlp.terminate(force)


def isalive(self):
return self.corenlp.isalive()
@@ -466,6 +469,24 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa

return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output)

def remove_escapes(text):
"""Given a string, remove PTB3 escape characters.
"""
escapes = {"-lrb-": "(",
"-rrb-": ")",
"-lsb-": "[",
"-rsb-": "]",
"-lcb-": "{",
"-rcb-": "}",
"-LRB-": "(",
"-RRB-": ")",
"-LSB-": "[",
"-RSB-": "]",
"-LCB-": "{",
"-RCB-": "}"}
if text:
pattern = re.compile('|'.join(re.escape(key) for key in escapes.keys()))
return pattern.sub(lambda x: escapes[x.group()], text)
return text

if __name__ == '__main__':
"""
Expand Down Expand Up @@ -500,3 +521,4 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
except KeyboardInterrupt:
print >>sys.stderr, "Bye."
exit()
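
The `remove_escapes` helper added above can be sanity-checked on its own. A quick sketch, assuming the module is importable as `corenlp.corenlp` once installed:

    from corenlp.corenlp import remove_escapes

    print(remove_escapes("-LRB-CNN-RRB-"))  # -> "(CNN)"
    print(remove_escapes("-lsb-sic-rsb-"))  # -> "[sic]"
    print(remove_escapes(""))               # -> "" (falsy input passes through)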

6 changes: 5 additions & 1 deletion corenlp/default.properties
@@ -1,4 +1,8 @@
annotators = tokenize, ssplit, pos, lemma, parse
annotators = tokenize, ssplit, pos, lemma, depparse

# specify Stanford Dependencies format for backwards compatibility
# (new default is Universal Dependencies in 3.5.2)
depparse.model = edu/stanford/nlp/models/parser/nndep/english_SD.gz

# A true-casing annotator is also available (see below)
#annotators = tokenize, ssplit, pos, lemma, truecase
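
To run a different annotator configuration, point the wrapper at your own properties file. A sketch assuming the `StanfordCoreNLP` constructor shown in `corenlp.py` above ("my.properties" is a hypothetical file name):

    from corenlp.corenlp import StanfordCoreNLP

    # Relative property paths are tried against the working directory first,
    # then against corenlp.py's own directory (see init_corenlp_command).
    parser = StanfordCoreNLP(corenlp_path="stanford-corenlp-full-2015-04-20",
                             memory="3g",
                             properties="my.properties")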
38 changes: 38 additions & 0 deletions setup.py
@@ -0,0 +1,38 @@
import sys
from setuptools import setup, find_packages

PACKAGE = "corenlp"
NAME = "stanford-corenlp-python"
DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)"
AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan, Ian MacFarland"
AUTHOR_EMAIL = "aditi.shrikumar@gmail.com"
URL = "https://github.com/Wordseer/stanford-corenlp-python"
VERSION = "3.3.10"
INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"]

PEXPECT = "pexpect >= 2.4"
WINPEXPECT = "winpexpect >= 1.5"

if "win32" in sys.platform or "cygwin" in sys.platform:
INSTALLATION_REQS.append(WINPEXPECT)
else:
INSTALLATION_REQS.append(PEXPECT)

setup(
name=NAME,
version=VERSION,
description=DESCRIPTION,
author=AUTHOR,
author_email=AUTHOR_EMAIL,
url=URL,
packages=find_packages(),
package_data={"": ["*.properties"],
"corenlp": ["*.properties"]},
install_requires=INSTALLATION_REQS,
classifiers=[
("License :: OSI Approved :: GNU General Public License v2 or later "
"(GPLv2+)"),
"Programming Language :: Python",
],
)
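
Since commit e1d7c56 above targets PEP 396, the installed package should also expose its version at runtime. A sketch, assuming `__version__` is set in `corenlp/__init__.py`:

    import corenlp

    print(corenlp.__version__)  # e.g. "3.3.10", matching VERSION above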
