diff --git a/README.md b/README.md
index 0f7a2c6..e3a1400 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,11 @@
 # A Python wrapper for the Java Stanford Core NLP tools
 ----------------------------
-This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server.
+This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either be used as a Python package or run as a JSON-RPC server.
 
 ## Edited
+ * Tested only with the current annotator configuration: not a general-purpose wrapper
+ * Update to Stanford CoreNLP v3.5.2
  * Added multi-threaded load balancing
- * Update to Stanford CoreNLP v3.2.0
  * Fix many bugs & improve performance
  * Using jsonrpclib for stability and performance
  * Can edit the constants as argument such as Stanford Core NLP directory
@@ -22,15 +22,6 @@ This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/da
 To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the zip file containing Stanford's CoreNLP package.  By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run.
 
-
-In other words:
-
-    sudo pip install pexpect unidecode jsonrpclib   # jsonrpclib is optional
-    git clone https://bitbucket.org/torotoki/corenlp-python.git
-    cd corenlp-python
-    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-06-20.zip
-    unzip stanford-corenlp-full-2013-06-20.zip
-
 Then, to launch a server:
 
     python corenlp/corenlp.py
@@ -159,8 +150,11 @@ The function uses XML output feature of Stanford CoreNLP, and you can take all i
 
 (note: The function requires xmltodict now, you should install it by `sudo pip install xmltodict`)
 
-## Developer
+## Developers
  * Hiroyoshi Komatsu [hiroyoshi.komat@gmail.com]
  * Johannes Castner [jac2130@columbia.edu]
  * Robert Elwell [robert@wikia-inc.com]
  * Tristan Chong [tristan@wikia-inc.com]
+ * Aditi Muralidharan [aditi.shrikumar@gmail.com]
+ * Ian MacFarland [ianmacfarland@ischool.berkeley.edu]
+
diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py
old mode 100755
new mode 100644
index 6e21ad9..f2c9a16
--- a/corenlp/corenlp.py
+++ b/corenlp/corenlp.py
@@ -18,7 +18,6 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 
-
 import json
 import optparse
 import os
@@ -28,9 +27,18 @@
 import pexpect
 import tempfile
 import shutil
+import re
 from progressbar import ProgressBar, Fraction
 from unidecode import unidecode
 from subprocess import call
+import glob
+
+use_winpexpect = True
+
+try:
+    import winpexpect
+except ImportError:
+    use_winpexpect = False
 
 VERBOSE = False
 STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
@@ -92,18 +100,14 @@ def init_corenlp_command(corenlp_path, memory, properties):
     """
 
     # TODO: Can edit jar constants
-    jars = ["stanford-corenlp-3.2.0.jar",
-            "stanford-corenlp-3.2.0-models.jar",
-            "xom.jar",
-            "joda-time.jar",
-            "jollyday.jar"
-            ]
+    jar_mask = "*.jar"
+    jars = glob.glob(os.path.join(corenlp_path, jar_mask))
 
     java_path = "java"
     classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
     # include the properties file, so you can change defaults
     # but any changes in output format will break parse_parser_results()
-    current_dir_pr = os.path.dirname(os.path.abspath(__file__)) + "/" + properties
+    current_dir_pr = os.path.join(os.path.dirname(os.path.abspath(__file__)), properties)
     if os.path.exists(properties):
         props = "-props %s" % (properties.replace(" ", "\\ "))
     elif os.path.exists(current_dir_pr):
@@ -111,12 +115,6 @@ def init_corenlp_command(corenlp_path, memory, properties):
     else:
         raise Exception("Error! Cannot locate: %s" % properties)
 
-    # add and check classpaths
-    jars = [corenlp_path + "/" + jar for jar in jars]
-    for jar in jars:
-        if not os.path.exists(jar):
-            raise Exception("Error! Cannot locate: %s" % jar)
-
     # add memory limit on JVM
     if memory:
         limit = "-Xmx%s" % memory
@@ -125,12 +123,6 @@ def init_corenlp_command(corenlp_path, memory, properties):
 
     return "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(jars), classname, props)
 
-
-def remove_id(word):
-    """Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word' """
-    return word.replace("'", "")
-
-
 def parse_bracketed(s):
     '''Parse word features [abc=... def = ...]
     Also manages to parse out features that have XML within them
@@ -143,13 +135,13 @@ def parse_bracketed(s):
         temp["^^^%d^^^" % i] = tag
         s = s.replace(tag, "^^^%d^^^" % i)
     # Load key-value pairs, substituting as necessary
-    for attr, val in re.findall(r"([^=\s]*)=([^=\s]*)", s):
+    for attr, val in re.findall(r"([^=\s]*)=([^\s]*)", s):
         if val in temp:
-            val = temp[val]
+            val = remove_escapes(temp[val])
         if attr == 'Text':
-            word = val
+            word = remove_escapes(val)
         else:
-            attrs[attr] = val
+            attrs[attr] = remove_escapes(val)
     return (word, attrs)
 
@@ -161,7 +153,8 @@ def parse_parser_results(text):
     """
     results = {"sentences": []}
     state = STATE_START
-    for line in unidecode(text.decode('utf-8')).split("\n"):
+    lines = unidecode(text.decode('utf-8')).split("\n")
+    for index, line in enumerate(lines):
         line = line.strip()
 
         if line.startswith("Sentence #"):
@@ -170,7 +163,7 @@ def parse_parser_results(text):
             state = STATE_TEXT
 
         elif state == STATE_TEXT:
-            sentence['text'] = line
+            sentence['text'] = remove_escapes(line)
             state = STATE_WORDS
 
         elif state == STATE_WORDS:
@@ -178,23 +171,23 @@ def parse_parser_results(text):
             if not line.startswith("[Text="):
                 raise ParserError('Parse error. Could not find "[Text=" in: %s' % line)
             for s in WORD_PATTERN.findall(line):
                 sentence['words'].append(parse_bracketed(s))
-            state = STATE_TREE
-
-        elif state == STATE_TREE:
-            if len(line) == 0:
+            if not lines[index + 1].startswith("[Text="):
                 state = STATE_DEPENDENCY
-                sentence['parsetree'] = " ".join(sentence['parsetree'])
-            else:
-                sentence['parsetree'].append(line)
+        # skipping TREE because the new depparse annotator doesn't make a parse tree
+
         elif state == STATE_DEPENDENCY:
             if len(line) == 0:
                 state = STATE_COREFERENCE
             else:
                 split_entry = re.split("\(|, |-", line[:-1])
                 if len(split_entry) == 5:
-                    rel, left, leftindex, right, rightindex = map(lambda x: remove_id(x), split_entry)
-                    sentence['dependencies'].append(tuple([rel, left, leftindex, right, rightindex]))
+                    rel, left, leftindex, right, rightindex = split_entry
+                    leftindex = re.sub("[^0-9]", "", leftindex)
+                    rightindex = re.sub("[^0-9]", "", rightindex)
+                    sentence['dependencies'].append(tuple([rel,
+                        remove_escapes(left), leftindex, remove_escapes(right),
+                        rightindex]))
 
         elif state == STATE_COREFERENCE:
             if "Coreference set" in line:
@@ -273,7 +266,7 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
 
     #we get a list of the cleaned files that we want to parse:
 
-    files = [input_dir + '/' + f for f in os.listdir(input_dir) if f.endswith(".txt")]
+    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith(".txt")]
 
     #creating the file list of files to parse
 
@@ -291,7 +284,7 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
 #        result = []
     try:
         for output_file in os.listdir(xml_dir):
-            with open(xml_dir + '/' + output_file, 'r') as xml:
+            with open(os.path.join(xml_dir, output_file), 'r') as xml:
 #                parsed = xml.read()
                 file_name = re.sub('.xml$', '', os.path.basename(output_file))
 #            result.append(parse_parser_xml_results(xml.read(), file_name,
@@ -314,7 +307,12 @@ class StanfordCoreNLP:
 
     def _spawn_corenlp(self):
         if VERBOSE: print self.start_corenlp
-        self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192, searchwindowsize=80)
+        if use_winpexpect:
+            self.corenlp = winpexpect.winspawn(self.start_corenlp, maxread=8192,
+                                               searchwindowsize=80)
+        else:
+            self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192,
+                                         searchwindowsize=80)
 
         # show progress bar while loading the models
         if VERBOSE:
@@ -348,7 +346,12 @@ def __init__(self, corenlp_path=DIRECTORY, memory="3g", properties='default.prop
         self._spawn_corenlp()
 
     def close(self, force=True):
-        self.corenlp.terminate(force)
+        global use_winpexpect
+        if use_winpexpect:
+            self.corenlp.terminate()
+        else:
+            self.corenlp.terminate(force)
+
 
     def isalive(self):
         return self.corenlp.isalive()
@@ -466,6 +469,25 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
     return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output)
 
 
+def remove_escapes(text):
+    """Given a string, remove PTB3 escape characters.
+    """
+    escapes = {"-lrb-": "(",
+               "-rrb-": ")",
+               "-lsb-": "[",
+               "-rsb-": "]",
+               "-lcb-": "{",
+               "-rcb-": "}",
+               "-LRB-": "(",
+               "-RRB-": ")",
+               "-LSB-": "[",
+               "-RSB-": "]",
+               "-LCB-": "{",
+               "-RCB-": "}"}
+    if text:
+        pattern = re.compile('|'.join(re.escape(key) for key in escapes.keys()))
+        return pattern.sub(lambda x: escapes[x.group()], text)
+    return text
 
 if __name__ == '__main__':
     """
@@ -500,3 +522,4 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
     except KeyboardInterrupt:
         print >>sys.stderr, "Bye."
         exit()
+
diff --git a/corenlp/default.properties b/corenlp/default.properties
index c475c92..70ac093 100644
--- a/corenlp/default.properties
+++ b/corenlp/default.properties
@@ -1,4 +1,8 @@
-annotators = tokenize, ssplit, pos, lemma, parse
+annotators = tokenize, ssplit, pos, lemma, depparse
+
+# specify Stanford Dependencies format for backwards compatibility
+# (new default is Universal Dependencies in 3.5.2)
+depparse.model = edu/stanford/nlp/models/parser/nndep/english_SD.gz
 
 # A true-casing annotator is also available (see below)
 #annotators = tokenize, ssplit, pos, lemma, truecase
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..869427c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,38 @@
+import sys
+from setuptools import setup, find_packages
+
+PACKAGE = "corenlp"
+NAME = "stanford-corenlp-python"
+DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)"
+AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan, Ian MacFarland"
+AUTHOR_EMAIL = "aditi.shrikumar@gmail.com"
+URL = "https://github.com/Wordseer/stanford-corenlp-python"
+VERSION = "3.3.10"
+INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"]
+
+PEXPECT = "pexpect >= 2.4"
+WINPEXPECT = "winpexpect >= 1.5"
+
+if "win32" in sys.platform or "cygwin" in sys.platform:
+    INSTALLATION_REQS.append(WINPEXPECT)
+else:
+    INSTALLATION_REQS.append(PEXPECT)
+
+setup(
+    name=NAME,
+    version=VERSION,
+    description=DESCRIPTION,
+    author=AUTHOR,
+    author_email=AUTHOR_EMAIL,
+    url=URL,
+    packages=find_packages(),
+    package_data={"": ["*.properties"],
+                  "corenlp": ["*.properties"]},
+    install_requires=INSTALLATION_REQS,
+    classifiers=[
+        ("License :: OSI Approved :: GNU General Public License v2 or later "
+         "(GPLv2+)"),
+        "Programming Language :: Python",
+    ],
+)
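
For reviewers: a short, self-contained way to exercise the patched entry points. This sketch is illustrative rather than part of the patch. The `sample_texts/` folder is hypothetical, the import path assumes this repo's package layout, and a CoreNLP 3.5.2 distribution is assumed to sit at the default `DIRECTORY` location (a subdirectory of the working directory, per the README).

    # -*- coding: utf-8 -*-
    # Smoke-test sketch for this patch (Python 2, matching the codebase).
    # Hypothetical: the sample_texts/ folder; assumed: CoreNLP 3.5.2 unpacked
    # at the default DIRECTORY location.
    from corenlp.corenlp import StanfordCoreNLP, batch_parse

    # Interactive mode: spawns the Java pipeline (pexpect on POSIX, winpexpect
    # on Windows) using corenlp/default.properties, i.e. the depparse annotator.
    nlp = StanfordCoreNLP()
    print nlp.isalive()  # the child JVM should be running at this point
    nlp.close()          # terminates the child process via the patched close()

    # Batch mode: runs CoreNLP's XML-output mode over every .txt file in the
    # folder; with raw_output=False each document is converted via xmltodict.
    for parsed in batch_parse("sample_texts", memory="3g"):
        print parsed

On Windows the same snippet should run unchanged: setup.py installs winpexpect instead of pexpect, and `_spawn_corenlp()`/`close()` branch on `use_winpexpect` at runtime.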