From 9c99ab0298c25cff6c0663b0f47631ab532b942e Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sat, 10 May 2014 17:50:16 -0700 Subject: [PATCH 01/27] Added setup.py script --- setup.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..93a0261 --- /dev/null +++ b/setup.py @@ -0,0 +1,31 @@ +from setuptools import setup, find_packages + +PACKAGE = "corenlp" +NAME = "wordseer-stanford-corenlp-python" +DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)" +AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" +AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" +URL = "https://github.com/silverasm/stanford-corenlp-python" +VERSION = "3.3.0-0" + +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + author=AUTHOR, + author_email=AUTHOR_EMAIL, + url=URL, + packages=find_packages(), + package_data = {"": ["*.properties"], + "corenlp": ["*.properties"]}, + install_requires=[ + "pexpect >= 2.4", + "unidecode >= 0.04.12", + "xmltodict >= 0.4.6", + ], + classifiers=[ + ("License :: OSI Approved :: GNU General Public License v2 or later " + "(GPLv2+)"), + "Programming Language :: Python", + ], +) From c17d656bb398bffa6030dd0d34bb8fe90b0a78d8 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Tue, 15 Jul 2014 18:45:31 -0700 Subject: [PATCH 02/27] Updated README, updated setup.py with newer info --- README.md | 5 +++-- setup.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0f7a2c6..f0fb897 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ # A Python wrapper for the Java Stanford Core NLP tools ---------------------------- This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server. @@ -159,8 +158,10 @@ The function uses XML output feature of Stanford CoreNLP, and you can take all i (note: The function requires xmltodict now, you should install it by `sudo pip install xmltodict`) -## Developer +## Developers * Hiroyoshi Komatsu [hiroyoshi.komat@gmail.com] * Johannes Castner [jac2130@columbia.edu] * Robert Elwell [robert@wikia-inc.com] * Tristan Chong [tristan@wikia-inc.com] + * Aditi Muralidharan [aditi.shrikumar@gmail.com] + diff --git a/setup.py b/setup.py index 93a0261..396bdda 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ from setuptools import setup, find_packages PACKAGE = "corenlp" -NAME = "wordseer-stanford-corenlp-python" +NAME = "stanford-corenlp-python" DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)" AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" -URL = "https://github.com/silverasm/stanford-corenlp-python" +URL = "https://github.com/Wordseer/stanford-corenlp-python" VERSION = "3.3.0-0" setup( @@ -29,3 +29,4 @@ "Programming Language :: Python", ], ) + From e9f5973278f08f2e0667619d16b0193341e7cc17 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sun, 27 Jul 2014 13:33:59 +0000 Subject: [PATCH 03/27] Removed the remove_id method, let's see what happens --- corenlp/corenlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index 6e21ad9..1017735 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -18,7 +18,6 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - import json import optparse import os @@ -193,7 +192,7 @@ def parse_parser_results(text): else: split_entry = re.split("\(|, |-", line[:-1]) if len(split_entry) == 5: - rel, left, leftindex, right, rightindex = map(lambda x: remove_id(x), split_entry) + rel, left, leftindex, right, rightindex = split_entry sentence['dependencies'].append(tuple([rel, left, leftindex, right, rightindex])) elif state == STATE_COREFERENCE: @@ -500,3 +499,4 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa except KeyboardInterrupt: print >>sys.stderr, "Bye." exit() + From 8593c0aaa50296a049c833125a9b546ce65c423f Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Mon, 28 Jul 2014 12:42:46 +0000 Subject: [PATCH 04/27] Removed remove_id method altogether. --- corenlp/corenlp.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index 1017735..23418fa 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -124,12 +124,6 @@ def init_corenlp_command(corenlp_path, memory, properties): return "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(jars), classname, props) - -def remove_id(word): - """Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word' """ - return word.replace("'", "") - - def parse_bracketed(s): '''Parse word features [abc=... def = ...] Also manages to parse out features that have XML within them From b664532367ce77fc576cadccbbba7e7fc35af2fb Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Wed, 30 Jul 2014 16:59:09 -0700 Subject: [PATCH 05/27] Fixed bug which caused weird issues with quotes in numeric values. --- corenlp/corenlp.py | 3 +++ setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index 23418fa..a818728 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -27,6 +27,7 @@ import pexpect import tempfile import shutil +import re from progressbar import ProgressBar, Fraction from unidecode import unidecode from subprocess import call @@ -187,6 +188,8 @@ def parse_parser_results(text): split_entry = re.split("\(|, |-", line[:-1]) if len(split_entry) == 5: rel, left, leftindex, right, rightindex = split_entry + leftindex = re.sub("[^0-9]", "", leftindex) + rightindex = re.sub("[^0-9]", "", rightindex) sentence['dependencies'].append(tuple([rel, left, leftindex, right, rightindex])) elif state == STATE_COREFERENCE: diff --git a/setup.py b/setup.py index 396bdda..80ae7a7 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.0-0" +VERSION = "3.3.2-0" setup( name=NAME, From b4b9348ce2c7ce052ad3ebbaa300beb1dbce100c Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sat, 2 Aug 2014 13:32:05 -0700 Subject: [PATCH 06/27] Parser shouldn't eat equals signs anymore --- corenlp/corenlp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index a818728..e53eca8 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - +import pdb import json import optparse import os @@ -137,7 +137,7 @@ def parse_bracketed(s): temp["^^^%d^^^" % i] = tag s = s.replace(tag, "^^^%d^^^" % i) # Load key-value pairs, substituting as necessary - for attr, val in re.findall(r"([^=\s]*)=([^=\s]*)", s): + for attr, val in re.findall(r"([^=\s]*)=([^\s]*)", s): if val in temp: val = temp[val] if attr == 'Text': @@ -171,6 +171,7 @@ def parse_parser_results(text): if not line.startswith("[Text="): raise ParserError('Parse error. Could not find "[Text=" in: %s' % line) for s in WORD_PATTERN.findall(line): + pdb.set_trace() sentence['words'].append(parse_bracketed(s)) state = STATE_TREE From aec4b351c3e7d5ddbbedc2f72c43bb38a32fb708 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sat, 2 Aug 2014 13:33:16 -0700 Subject: [PATCH 07/27] Removed debug code --- corenlp/corenlp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index e53eca8..625d75d 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -import pdb + import json import optparse import os @@ -171,7 +171,6 @@ def parse_parser_results(text): if not line.startswith("[Text="): raise ParserError('Parse error. Could not find "[Text=" in: %s' % line) for s in WORD_PATTERN.findall(line): - pdb.set_trace() sentence['words'].append(parse_bracketed(s)) state = STATE_TREE From 3e47fbc0a2d1913dd71ee5201260d0fa0fe1fdf4 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Wed, 6 Aug 2014 11:42:38 -0700 Subject: [PATCH 08/27] Added support for winpexpect, hopefully not to hacky --- corenlp/corenlp.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index 625d75d..9d127d1 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -32,6 +32,13 @@ from unidecode import unidecode from subprocess import call +use_winpexpect = True + +try: + import winpexpect +except ImportError: + use_winpexpect = False + VERBOSE = False STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5 WORD_PATTERN = re.compile('\[([^\]]+)\]') @@ -310,7 +317,12 @@ class StanfordCoreNLP: def _spawn_corenlp(self): if VERBOSE: print self.start_corenlp - self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192, searchwindowsize=80) + if use_winpexpect: + self.corenlp = winpexpect.winspawn(self.start_corenlp, maxread=8192, + searchwindowsize=80) + else: + self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192, + searchwindowsize=80) # show progress bar while loading the models if VERBOSE: From deb7db65d392d3fe6e44542aaff618056211ab1e Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Fri, 8 Aug 2014 12:20:46 -0700 Subject: [PATCH 09/27] Updated version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 80ae7a7..553fb52 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.2-0" +VERSION = "3.3.3-0" setup( name=NAME, From 7746e822f7ee9459a2de2a8f42eddd18e4c2be1c Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Fri, 8 Aug 2014 20:30:44 -0700 Subject: [PATCH 10/27] Disabled PBT3 escaping. --- corenlp/default.properties | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/corenlp/default.properties b/corenlp/default.properties index c475c92..3f6c32c 100644 --- a/corenlp/default.properties +++ b/corenlp/default.properties @@ -49,7 +49,7 @@ annotators = tokenize, ssplit, pos, lemma, parse #dcoref.female = /scr/nlp/data/Bergsma-Gender/female.unigrams.txt #dcoref.plural = /scr/nlp/data/Bergsma-Gender/plural.unigrams.txt #dcoref.singular = /scr/nlp/data/Bergsma-Gender/singular.unigrams.txt - +tokenize.options = ptb3Escaping=false # This is the regular expression that describes which xml tags to keep # the text from. In order to on off the xml removal, add cleanxml @@ -63,3 +63,4 @@ annotators = tokenize, ssplit, pos, lemma, parse # Whether or not to allow malformed xml # StanfordCoreNLP.properties #wordnet.dir=models/wordnet-3.0-prolog + From 0d6394a6e12e2d05eacca9df8bb80e47f5678088 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Fri, 8 Aug 2014 20:46:15 -0700 Subject: [PATCH 11/27] Pushed version 3.3.4 to pypi. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 553fb52..8e72630 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.3-0" +VERSION = "3.3.4-0" setup( name=NAME, From 0f2f99b6a02095356f1d083cca91ce6a79be6879 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sat, 9 Aug 2014 21:20:16 -0700 Subject: [PATCH 12/27] Revert "Disabled PBT3 escaping." This reverts commit 7746e822f7ee9459a2de2a8f42eddd18e4c2be1c. --- corenlp/default.properties | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/corenlp/default.properties b/corenlp/default.properties index 3f6c32c..c475c92 100644 --- a/corenlp/default.properties +++ b/corenlp/default.properties @@ -49,7 +49,7 @@ annotators = tokenize, ssplit, pos, lemma, parse #dcoref.female = /scr/nlp/data/Bergsma-Gender/female.unigrams.txt #dcoref.plural = /scr/nlp/data/Bergsma-Gender/plural.unigrams.txt #dcoref.singular = /scr/nlp/data/Bergsma-Gender/singular.unigrams.txt -tokenize.options = ptb3Escaping=false + # This is the regular expression that describes which xml tags to keep # the text from. In order to on off the xml removal, add cleanxml @@ -63,4 +63,3 @@ tokenize.options = ptb3Escaping=false # Whether or not to allow malformed xml # StanfordCoreNLP.properties #wordnet.dir=models/wordnet-3.0-prolog - From beb06d185f4c5a28d28e7e3c765f7dd79e894c1f Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sat, 9 Aug 2014 21:21:50 -0700 Subject: [PATCH 13/27] Reverted and uploaded. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8e72630..d8966e7 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.4-0" +VERSION = "3.3.5-0" setup( name=NAME, From 8f0450fe4e12e7e6292369156f1c9002ba777b2a Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sun, 10 Aug 2014 08:58:20 -0700 Subject: [PATCH 14/27] Replace PTB3 escapes with characters in the parsing. --- corenlp/corenlp.py | 32 ++++++++++++++++++++++++++------ setup.py | 2 +- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index 9d127d1..0cae5c7 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -146,11 +146,11 @@ def parse_bracketed(s): # Load key-value pairs, substituting as necessary for attr, val in re.findall(r"([^=\s]*)=([^\s]*)", s): if val in temp: - val = temp[val] + val = remove_escapes(temp[val]) if attr == 'Text': - word = val + word = remove_escapes(val) else: - attrs[attr] = val + attrs[attr] = remove_escapes(val) return (word, attrs) @@ -171,7 +171,7 @@ def parse_parser_results(text): state = STATE_TEXT elif state == STATE_TEXT: - sentence['text'] = line + sentence['text'] = remove_escapes(line) state = STATE_WORDS elif state == STATE_WORDS: @@ -186,7 +186,7 @@ def parse_parser_results(text): state = STATE_DEPENDENCY sentence['parsetree'] = " ".join(sentence['parsetree']) else: - sentence['parsetree'].append(line) + sentence['parsetree'].append(remove_escapes(line)) elif state == STATE_DEPENDENCY: if len(line) == 0: @@ -197,7 +197,9 @@ def parse_parser_results(text): rel, left, leftindex, right, rightindex = split_entry leftindex = re.sub("[^0-9]", "", leftindex) rightindex = re.sub("[^0-9]", "", rightindex) - sentence['dependencies'].append(tuple([rel, left, leftindex, right, rightindex])) + sentence['dependencies'].append(tuple([rel, + remove_escapes(left), leftindex, remove_escapes(right), + rightindex])) elif state == STATE_COREFERENCE: if "Coreference set" in line: @@ -474,6 +476,24 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output) +def remove_escapes(text): + """Given a string, remove PTB3 escape characters. + """ + escapes = {"-lrb-": "(", + "-rrb-": ")", + "-lsb-": "[", + "-rsb-": "]", + "-lcb-": "{", + "-rcb-": "}", + "-LRB-": "(", + "-RRB-": ")", + "-LSB-": "[", + "-RSB-": "]", + "-LCB-": "{", + "-RCB-": "}"} + if text: + pattern = re.compile('|'.join(re.escape(key) for key in escapes.keys())) + return pattern.sub(lambda x: escapes[x.group()], text) if __name__ == '__main__': """ diff --git a/setup.py b/setup.py index d8966e7..124cf91 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.5-0" +VERSION = "3.3.6-0" setup( name=NAME, From 1ec5bb7dafae0cdb9a60ff52980497113b6c8895 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Mon, 11 Aug 2014 19:29:39 -0700 Subject: [PATCH 15/27] Script should now use winpexpect if necessary. --- setup.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 124cf91..ab439a8 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,6 @@ +import sys from setuptools import setup, find_packages +import pdb PACKAGE = "corenlp" NAME = "stanford-corenlp-python" @@ -7,7 +9,18 @@ AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" VERSION = "3.3.6-0" +INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"] +PEXPECT = "pexpect >= 2.4" +WINPEXPECT = "winpexpect >= 1.5" + +pdb.set_trace() + +if "win" in sys.platform: + INSTALLATION_REQS.append(WINPEXPECT) +else: + INSTALLATION_REQS.append(PEXPECT) + setup( name=NAME, version=VERSION, @@ -18,11 +31,7 @@ packages=find_packages(), package_data = {"": ["*.properties"], "corenlp": ["*.properties"]}, - install_requires=[ - "pexpect >= 2.4", - "unidecode >= 0.04.12", - "xmltodict >= 0.4.6", - ], + install_requires=INSTALLATION_REQS, classifiers=[ ("License :: OSI Approved :: GNU General Public License v2 or later " "(GPLv2+)"), From 0dfbf2c81437f602667dc04fde25c95642ea8054 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Tue, 12 Aug 2014 18:58:57 -0700 Subject: [PATCH 16/27] Incremented version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 124cf91..b51f392 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.6-0" +VERSION = "3.3.7-0" setup( name=NAME, From b73ddaf926258849e6bbeef2332e22a99640f43a Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Tue, 12 Aug 2014 19:07:38 -0700 Subject: [PATCH 17/27] Removed debug code --- setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.py b/setup.py index f26d008..1f230e0 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ import sys from setuptools import setup, find_packages -import pdb PACKAGE = "corenlp" NAME = "stanford-corenlp-python" @@ -14,8 +13,6 @@ PEXPECT = "pexpect >= 2.4" WINPEXPECT = "winpexpect >= 1.5" -pdb.set_trace() - if "win" in sys.platform: INSTALLATION_REQS.append(WINPEXPECT) else: From c68fd4fe814ab3ff2fb7f47a92428e375f61dcae Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Tue, 12 Aug 2014 19:56:38 -0700 Subject: [PATCH 18/27] Fixed commit which made the script think that windows was mac. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 1f230e0..6dedbdd 100644 --- a/setup.py +++ b/setup.py @@ -7,13 +7,13 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.7-0" +VERSION = "3.3.8-0" INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"] PEXPECT = "pexpect >= 2.4" WINPEXPECT = "winpexpect >= 1.5" -if "win" in sys.platform: +if "win32" in sys.platform or "cygwin" in sys.platform: INSTALLATION_REQS.append(WINPEXPECT) else: INSTALLATION_REQS.append(PEXPECT) From e1d7c5619bfc2ab09fdd6589c1bc50530dd50f52 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Wed, 13 Aug 2014 11:04:58 -0700 Subject: [PATCH 19/27] Now with pep 396! --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6dedbdd..35c75eb 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.8-0" +VERSION = "3.3.8" INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"] PEXPECT = "pexpect >= 2.4" From 84eed35eb0d705fb98766641f531db89bdfd7fdd Mon Sep 17 00:00:00 2001 From: jannah Date: Sat, 13 Sep 2014 15:05:33 -0700 Subject: [PATCH 20/27] Added support windows made file paths OS neutral and changed jars to * --- corenlp/corenlp.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index 0cae5c7..cb0a7de 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -105,12 +105,14 @@ def init_corenlp_command(corenlp_path, memory, properties): "joda-time.jar", "jollyday.jar" ] + + jars = ["*"] java_path = "java" classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" # include the properties file, so you can change defaults # but any changes in output format will break parse_parser_results() - current_dir_pr = os.path.dirname(os.path.abspath(__file__)) + "/" + properties + current_dir_pr = os.path.join(os.path.dirname(os.path.abspath(__file__)), properties) if os.path.exists(properties): props = "-props %s" % (properties.replace(" ", "\\ ")) elif os.path.exists(current_dir_pr): @@ -119,9 +121,9 @@ def init_corenlp_command(corenlp_path, memory, properties): raise Exception("Error! Cannot locate: %s" % properties) # add and check classpaths - jars = [corenlp_path + "/" + jar for jar in jars] + jars = [os.path.join(corenlp_path,jar) for jar in jars] for jar in jars: - if not os.path.exists(jar): + if not os.path.exists(jar) and not "*" in jar: raise Exception("Error! Cannot locate: %s" % jar) # add memory limit on JVM @@ -278,7 +280,7 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output= #we get a list of the cleaned files that we want to parse: - files = [input_dir + '/' + f for f in os.listdir(input_dir) if f.endswith(".txt")] + files = [os.path.join(input_dir , f) for f in os.listdir(input_dir) if f.endswith(".txt")] #creating the file list of files to parse @@ -296,7 +298,7 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output= # result = [] try: for output_file in os.listdir(xml_dir): - with open(xml_dir + '/' + output_file, 'r') as xml: + with open(os.path.join(xml_dir + output_file), 'r') as xml: # parsed = xml.read() file_name = re.sub('.xml$', '', os.path.basename(output_file)) # result.append(parse_parser_xml_results(xml.read(), file_name, @@ -358,7 +360,12 @@ def __init__(self, corenlp_path=DIRECTORY, memory="3g", properties='default.prop self._spawn_corenlp() def close(self, force=True): - self.corenlp.terminate(force) + global use_winpexpect + if use_winpexpect: + self.corenlp.terminate() + else: + self.corenlp.terminate(force) + def isalive(self): return self.corenlp.isalive() From e4126f5b711ef7f3e7a663a3c52dba5dc088531a Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Sat, 13 Sep 2014 16:26:02 -0700 Subject: [PATCH 21/27] Generalized jar loading --- corenlp/corenlp.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index cb0a7de..ce419d4 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -31,6 +31,7 @@ from progressbar import ProgressBar, Fraction from unidecode import unidecode from subprocess import call +import glob use_winpexpect = True @@ -99,14 +100,8 @@ def init_corenlp_command(corenlp_path, memory, properties): """ # TODO: Can edit jar constants - jars = ["stanford-corenlp-3.2.0.jar", - "stanford-corenlp-3.2.0-models.jar", - "xom.jar", - "joda-time.jar", - "jollyday.jar" - ] - - jars = ["*"] + jar_mask = ["*.jar"] + jars = glob.glob(os.path.join(corenlp_path, jar)) java_path = "java" classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" @@ -120,12 +115,6 @@ def init_corenlp_command(corenlp_path, memory, properties): else: raise Exception("Error! Cannot locate: %s" % properties) - # add and check classpaths - jars = [os.path.join(corenlp_path,jar) for jar in jars] - for jar in jars: - if not os.path.exists(jar) and not "*" in jar: - raise Exception("Error! Cannot locate: %s" % jar) - # add memory limit on JVM if memory: limit = "-Xmx%s" % memory From 847367f0b30908851184c7bd0f60b580f867263b Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Tue, 16 Sep 2014 11:22:10 -0700 Subject: [PATCH 22/27] Incremented version counter --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 35c75eb..badbb0a 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.8" +VERSION = "3.3.9" INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"] PEXPECT = "pexpect >= 2.4" From 754c06292cab727c8b0fb8316b27e14cfdd51502 Mon Sep 17 00:00:00 2001 From: PlasmaSheep Date: Tue, 16 Sep 2014 11:38:04 -0700 Subject: [PATCH 23/27] Fixed errors causing masks to fail. --- corenlp/corenlp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index ce419d4..4592acc 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -100,8 +100,8 @@ def init_corenlp_command(corenlp_path, memory, properties): """ # TODO: Can edit jar constants - jar_mask = ["*.jar"] - jars = glob.glob(os.path.join(corenlp_path, jar)) + jar_mask = "*.jar" + jars = glob.glob(os.path.join(corenlp_path, jar_mask)) java_path = "java" classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" @@ -354,7 +354,7 @@ def close(self, force=True): self.corenlp.terminate() else: self.corenlp.terminate(force) - + def isalive(self): return self.corenlp.isalive() From e72ce4074c67f05de9d320d785bdf06481fbc2da Mon Sep 17 00:00:00 2001 From: Ian MacFarland Date: Fri, 12 Jun 2015 17:40:01 -0700 Subject: [PATCH 24/27] handle different text output from depparse annotator --- corenlp/corenlp.py | 13 +++++-------- corenlp/default.properties | 6 +++++- 2 files changed, 10 insertions(+), 9 deletions(-) mode change 100755 => 100644 corenlp/corenlp.py diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py old mode 100755 new mode 100644 index 4592acc..f2c9a16 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -153,7 +153,8 @@ def parse_parser_results(text): """ results = {"sentences": []} state = STATE_START - for line in unidecode(text.decode('utf-8')).split("\n"): + lines = unidecode(text.decode('utf-8')).split("\n") + for index, line in enumerate(lines): line = line.strip() if line.startswith("Sentence #"): @@ -170,15 +171,11 @@ def parse_parser_results(text): raise ParserError('Parse error. Could not find "[Text=" in: %s' % line) for s in WORD_PATTERN.findall(line): sentence['words'].append(parse_bracketed(s)) - state = STATE_TREE - - elif state == STATE_TREE: - if len(line) == 0: + if not lines[index + 1].startswith("[Text="): state = STATE_DEPENDENCY - sentence['parsetree'] = " ".join(sentence['parsetree']) - else: - sentence['parsetree'].append(remove_escapes(line)) + # skipping TREE because the new depparse annotator doesn't make a parse tree + elif state == STATE_DEPENDENCY: if len(line) == 0: state = STATE_COREFERENCE diff --git a/corenlp/default.properties b/corenlp/default.properties index c475c92..70ac093 100644 --- a/corenlp/default.properties +++ b/corenlp/default.properties @@ -1,4 +1,8 @@ -annotators = tokenize, ssplit, pos, lemma, parse +annotators = tokenize, ssplit, pos, lemma, depparse + +# specify Stanford Dependencies format for backwards compatibility +# (new default is Universal Dependencies in 3.5.2) +depparse.model = edu/stanford/nlp/models/parser/nndep/english_SD.gz # A true-casing annotator is also available (see below) #annotators = tokenize, ssplit, pos, lemma, truecase From 9bd284ec168aec7624c4916fd5e1afec55f587e9 Mon Sep 17 00:00:00 2001 From: Ian MacFarland Date: Fri, 12 Jun 2015 17:49:06 -0700 Subject: [PATCH 25/27] update README --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f0fb897..4771019 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # A Python wrapper for the Java Stanford Core NLP tools -This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server. +This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server. ## Edited + * Tested only with the current annotator configuration: not a general-purpose wrapper + * Update to Stanford CoreNLP v3.5.2 * Added multi-threaded load balancing - * Update to Stanford CoreNLP v3.2.0 * Fix many bugs & improve performance * Using jsonrpclib for stability and performance * Can edit the constants as argument such as Stanford Core NLP directory @@ -164,4 +165,5 @@ The function uses XML output feature of Stanford CoreNLP, and you can take all i * Robert Elwell [robert@wikia-inc.com] * Tristan Chong [tristan@wikia-inc.com] * Aditi Muralidharan [aditi.shrikumar@gmail.com] + * Ian MacFarland [ianmacfarland@ischool.berkeley.edu] From 34ed4b6ff74e2da7892749b3c05c1de115fe282e Mon Sep 17 00:00:00 2001 From: Ian MacFarland Date: Fri, 12 Jun 2015 17:51:37 -0700 Subject: [PATCH 26/27] increment version --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index badbb0a..869427c 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,10 @@ PACKAGE = "corenlp" NAME = "stanford-corenlp-python" DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)" -AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" +AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan, Ian MacFarland" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.9" +VERSION = "3.3.10" INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"] PEXPECT = "pexpect >= 2.4" From 6030814dc624b63ce2eef4b2fe0c88e12e002df8 Mon Sep 17 00:00:00 2001 From: Ian MacFarland Date: Sat, 13 Jun 2015 15:28:14 -0700 Subject: [PATCH 27/27] readme update --- README.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/README.md b/README.md index 4771019..e3a1400 100644 --- a/README.md +++ b/README.md @@ -22,15 +22,6 @@ This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](htt To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the zip file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run. - -In other words: - - sudo pip install pexpect unidecode jsonrpclib # jsonrpclib is optional - git clone https://bitbucket.org/torotoki/corenlp-python.git - cd corenlp-python - wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-06-20.zip - unzip stanford-corenlp-full-2013-06-20.zip - Then, to launch a server: python corenlp/corenlp.py