bugfix · kowey/stanford-corenlp-python@65f5c53 · GitHub

Commit 65f5c53

committed
bugfix
1 parent a78f131 commit 65f5c53

File tree

2 files changed: +70 -198 lines changed

README.md

Lines changed: 7 additions & 156 deletions
@@ -7,15 +7,15 @@ This is a fork of [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python
 * Update to Stanford CoreNLP v1.3.5
 * Fix many bugs & improve performance
 * Using jsonrpclib for stability and performance
-* Can edit the constants as argument such as Stanford Core NLP directory.
+* Can edit the constants as argument such as Stanford Core NLP directory
 * Adjust parameters not to timeout in high load
-* Fix a problem on input long texts by Johannes Castner [stanford-corenlp-python](https://github.com/jac2130/stanford-corenlp-python)
+* Fix a problem on input long texts, by Johannes Castner [stanford-corenlp-python](https://github.com/jac2130/stanford-corenlp-python)
 * Packaging

 ## Requirements
-* [jsonrpclib](https://github.com/joshmarshall/jsonrpclib)
 * [pexpect](http://www.noah.org/wiki/pexpect)
-* [unidecode](http://pypi.python.org/pypi/Unidecode) (optionally)
+* [unidecode](http://pypi.python.org/pypi/Unidecode)
+* [jsonrpclib](https://github.com/joshmarshall/jsonrpclib) (optionally)

 ## Download and Usage

@@ -124,165 +124,16 @@ Not to use JSON-RPC, load the module instead:

 If you need to parse long texts (more than 30-50 sentences), you have to use a batch_parse() function. It reads text files from input directory and returns a generator object of dictionaries parsed each file results:

-    from corenlp import batch_process
+    from corenlp import batch_parse
+    corenlp_dir = "stanford-corenlp-full-2013-04-04/"
     raw_text_directory = "sample_raw_text/"
-    parsed = batch_process(raw_text_directory)  # It returns a generator object
+    parsed = batch_process(raw_text_directory, corenlp_dir)  # It returns a generator object
     print parsed #=> [{'coref': ..., 'sentences': ..., 'file_name': 'new_sample.txt'}]

 ## Developer
 * Hiroyoshi Komatsu [hiroyoshi.komat@gmail.com]
 * Johannes Castner [jac2130@columbia.edu]

-
-Following are the README in original stanford-corenlp-python.
-
--------------------------------------
-
-Python interface to Stanford Core NLP tools v1.3.3
-
-This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM on 64-bit machines and usually a few minutes loading time), most applications will probably want to run it as a server.
-
-
-* Python interface to Stanford CoreNLP tools: tagging, phrase-structure parsing, dependency parsing, named entity resolution, and coreference resolution.
-* Runs an JSON-RPC server that wraps the Java server and outputs JSON.
-* Outputs parse trees which can be used by [nltk](http://nltk.googlecode.com/svn/trunk/doc/howto/tree.html).
-
-
-It requires [pexpect](http://www.noah.org/wiki/pexpect) and (optionally) [unidecode](http://pypi.python.org/pypi/Unidecode) to handle non-ASCII text. This script includes and uses code from [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/).
-
-It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 1.3.3** released 2012-07-09.
-
-## Download and Usage
-
-To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the tgz file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run.
-
-In other words:
-
-    sudo pip install pexpect unidecode # unidecode is optional
-    git clone git://github.com/dasmith/stanford-corenlp-python.git
-    cd stanford-corenlp-python
-    wget http://nlp.stanford.edu/software/stanford-corenlp-2012-07-09.tgz
-    tar xvfz stanford-corenlp-2012-07-09.tgz
-
-Then, to launch a server:
-
-    python corenlp.py
-
-Optionally, you can specify a host or port:
-
-    python corenlp.py -H 0.0.0.0 -p 3456
-
-That will run a public JSON-RPC server on port 3456.
-
-Assuming you are running on port 8080, the code in `client.py` shows an example parse:
-
-    import jsonrpc
-    from simplejson import loads
-    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
-                                 jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))
-
-    result = loads(server.parse("Hello world. It is so beautiful"))
-    print "Result", result
-
-That returns a dictionary containing the keys `sentences` and (when applicable) `corefs`. The key `sentences` contains a list of dictionaries for each sentence, which contain `parsetree`, `text`, `tuples` containing the dependencies, and `words`, containing information about parts of speech, NER, etc:
-
-    {u'sentences': [{u'parsetree': u'(ROOT (S (VP (NP (INTJ (UH Hello)) (NP (NN world)))) (. !)))',
-                     u'text': u'Hello world!',
-                     u'tuples': [[u'dep', u'world', u'Hello'],
-                                 [u'root', u'ROOT', u'world']],
-                     u'words': [[u'Hello',
-                                 {u'CharacterOffsetBegin': u'0',
-                                  u'CharacterOffsetEnd': u'5',
-                                  u'Lemma': u'hello',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'UH'}],
-                                [u'world',
-                                 {u'CharacterOffsetBegin': u'6',
-                                  u'CharacterOffsetEnd': u'11',
-                                  u'Lemma': u'world',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'NN'}],
-                                [u'!',
-                                 {u'CharacterOffsetBegin': u'11',
-                                  u'CharacterOffsetEnd': u'12',
-                                  u'Lemma': u'!',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'.'}]]},
-                    {u'parsetree': u'(ROOT (S (NP (PRP It)) (VP (VBZ is) (ADJP (RB so) (JJ beautiful))) (. .)))',
-                     u'text': u'It is so beautiful.',
-                     u'tuples': [[u'nsubj', u'beautiful', u'It'],
-                                 [u'cop', u'beautiful', u'is'],
-                                 [u'advmod', u'beautiful', u'so'],
-                                 [u'root', u'ROOT', u'beautiful']],
-                     u'words': [[u'It',
-                                 {u'CharacterOffsetBegin': u'14',
-                                  u'CharacterOffsetEnd': u'16',
-                                  u'Lemma': u'it',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'PRP'}],
-                                [u'is',
-                                 {u'CharacterOffsetBegin': u'17',
-                                  u'CharacterOffsetEnd': u'19',
-                                  u'Lemma': u'be',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'VBZ'}],
-                                [u'so',
-                                 {u'CharacterOffsetBegin': u'20',
-                                  u'CharacterOffsetEnd': u'22',
-                                  u'Lemma': u'so',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'RB'}],
-                                [u'beautiful',
-                                 {u'CharacterOffsetBegin': u'23',
-                                  u'CharacterOffsetEnd': u'32',
-                                  u'Lemma': u'beautiful',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'JJ'}],
-                                [u'.',
-                                 {u'CharacterOffsetBegin': u'32',
-                                  u'CharacterOffsetEnd': u'33',
-                                  u'Lemma': u'.',
-                                  u'NamedEntityTag': u'O',
-                                  u'PartOfSpeech': u'.'}]]}],
-     u'coref': [[[[u'It', 1, 0, 0, 1], [u'Hello world', 0, 1, 0, 2]]]]}
-
-To use it in a regular script or to edit/debug it (because errors via RPC are opaque), load the module instead:
-
-    from corenlp import *
-    corenlp = StanfordCoreNLP() # wait a few minutes...
-    corenlp.parse("Parse it")
-
-<!--
-
-## Adding WordNet
-
-Note: wordnet doesn't seem to be supported using this approach. Looks like you'll need Java.
-
-Download WordNet-3.0 Prolog: http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
-tar xvfz WNprolog-3.0.tar.gz
-
--->
-
-
-## Questions
-
-**Stanford CoreNLP tools require a large amount of free memory**. Java 5+ uses about 50% more RAM on 64-bit machines than 32-bit machines. 32-bit machine users can lower the memory requirements by changing `-Xmx3g` to `-Xmx2g` or even less.
-If pexpect timesout while loading models, check to make sure you have enough memory and can run the server alone without your kernel killing the java process:
-
-    java -cp stanford-corenlp-2012-07-09.jar:stanford-corenlp-2012-07-06-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
-
-You can reach me, Dustin Smith, by sending a message on GitHub or through email (contact information is available [on my webpage](http://web.media.mit.edu/~dustin)).
-
-
-# Contributors
-
-This is free and open source software and has benefited from the contribution and feedback of others. Like Stanford's CoreNLP tools, it is covered under the [GNU General Public License v2 +](http://www.gnu.org/licenses/gpl-2.0.html), which in short means that modifications to this program must maintain the same free and open source distribution policy.
-
-This project has benefited from the contributions of:
-
-* @jcc Justin Cheng
-* Abhaya Agarwal
-
 ## Related Projects

 These two projects are python wrappers for the [Stanford Parser](http://nlp.stanford.edu/software/lex-parser.shtml), which includes the Stanford Parser, although the Stanford Parser is another project.
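For illustration, a minimal usage sketch of the batch interface as this commit leaves it. Note that the README snippet above still calls batch_process in the assignment line while importing batch_parse; the function actually defined in corenlp/corenlp.py is batch_parse, so the sketch below assumes that name, and the directory paths are placeholders:

    from corenlp import batch_parse

    corenlp_dir = "stanford-corenlp-full-2013-04-04/"  # unpacked CoreNLP distribution (placeholder path)
    raw_text_directory = "sample_raw_text/"            # folder of plain-text files to parse (placeholder path)

    # Runs the CoreNLP jar over every file in the folder and collects one
    # result per input file, each with 'sentences', 'file_name', and,
    # when coreference was found, 'coref'.
    parsed = batch_parse(raw_text_directory, corenlp_dir)
    for doc in parsed:
        print doc['file_name'], len(doc['sentences'])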

corenlp/corenlp.py

Lines changed: 63 additions & 42 deletions
@@ -25,7 +25,7 @@
 import shutil
 from progressbar import ProgressBar, Fraction
 from unidecode import unidecode
-from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer
+from subprocess import call

 VERBOSE = False
 STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
@@ -194,40 +194,49 @@ def extract_words_from_xml(sent_node):
         exted = map(lambda x: x['word'], sent_node['tokens']['token'])
         return exted

-    #turning the raw xml into a raw python dictionary:
+    # Turning the raw xml into a raw python dictionary:
     raw_dict = xmltodict.parse(xml)
+    document = raw_dict[u'root'][u'document']
+
+    # Making a raw sentence list of dictionaries:
+    raw_sent_list = document[u'sentences'][u'sentence']
+
+    if document.get(u'coreference') and document[u'coreference'].get(u'coreference'):
+        # Convert coreferences to the format like python
+        coref_flag = True
+
+        # Making a raw coref dictionary:
+        raw_coref_list = document[u'coreference'][u'coreference']
+
+        # To dicrease is for given index different from list index
+        coref_index = [[[int(raw_coref_list[j][u'mention'][i]['sentence'])-1,
+                         int(raw_coref_list[j][u'mention'][i]['head'])-1,
+                         int(raw_coref_list[j][u'mention'][i]['start'])-1,
+                         int(raw_coref_list[j][u'mention'][i]['end'])-1]
+                        for i in xrange(len(raw_coref_list[j][u'mention']))]
+                       for j in xrange(len(raw_coref_list))]
+
+        coref_list = []
+        for j in xrange(len(coref_index)):
+            coref_list.append(coref_index[j])
+            for k, coref in enumerate(coref_index[j]):
+                exted = raw_sent_list[coref[0]]['tokens']['token'][coref[2]:coref[3]]
+                exted_words = map(lambda x: x['word'], exted)
+                coref_list[j][k].insert(0, ' '.join(exted_words))
+
+        coref_list = [[[coref_list[j][i], coref_list[j][0]]
+                       for i in xrange(len(coref_list[j])) if i != 0]
+                      for j in xrange(len(coref_list))]
+    else:
+        coref_flag = False

-    #making a raw sentence list of dictionaries:
-    raw_sent_list = raw_dict[u'root'][u'document'][u'sentences'][u'sentence']
-    #making a raw coref dictionary:
-    raw_coref_list = raw_dict[u'root'][u'document'][u'coreference'][u'coreference']
-
-    #cleaning up the list ...the problem is that this doesn't come in pairs, as the command line version:
-
-    # To dicrease is for given index different from list index
-    coref_index = [[[eval(raw_coref_list[j][u'mention'][i]['sentence'])-1,
-                     eval(raw_coref_list[j][u'mention'][i]['head'])-1,
-                     eval(raw_coref_list[j][u'mention'][i]['start'])-1,
-                     eval(raw_coref_list[j][u'mention'][i]['end'])-1]
-                    for i in xrange(len(raw_coref_list[j][u'mention']))]
-                   for j in xrange(len(raw_coref_list))]
-
-    coref_list = []
-    for j in xrange(len(coref_index)):
-        coref_list.append(coref_index[j])
-        for k, coref in enumerate(coref_index[j]):
-            exted = raw_sent_list[coref[0]]['tokens']['token'][coref[2]:coref[3]]
-            exted_words = map(lambda x: x['word'], exted)
-            coref_list[j][k].insert(0, ' '.join(exted_words))
-
-    coref_list = [[[coref_list[j][i], coref_list[j][0]]
-                   for i in xrange(len(coref_list[j])) if i != 0]
-                  for j in xrange(len(coref_list))]
-
+    # Convert sentences to the format like python
+    # TODO: If there is only one sentence in input sentence,
+    # raw_sent_list is dict and cannot decode following code...
     sentences = [{'dependencies': [[dep['dep'][i]['@type'],
                                     dep['dep'][i]['governor']['#text'],
                                     dep['dep'][i]['dependent']['#text']]
-                                   for dep in raw_sent_list[j][u'dependencies']
+                                   for dep in raw_sent_list.values()[j][u'dependencies']
                                    for i in xrange(len(dep['dep']))
                                    if dep['@type']=='basic-dependencies'],
                   'text': extract_words_from_xml(raw_sent_list[j]),
@@ -238,11 +247,15 @@ def extract_words_from_xml(sent_node):
                                  ('CharacterOffsetBegin', str(token['CharacterOffsetBegin'])),
                                  ('PartOfSpeech', str(token['POS'])),
                                  ('Lemma', str(token['lemma']))])]
-                                for token in raw_sent_list[j]['tokens'][u'token']]}
+                                for token in raw_sent_list[j][u'tokens'][u'token']]}

-                 for j in xrange(len(raw_sent_list))]
+                 for j in xrange(len(raw_sent_list)) ]
+
+    if coref_flag:
+        results = {'coref':coref_list, 'sentences':sentences}
+    else:
+        results = {'sentences': sentences}

-    results = {'coref':coref_list, 'sentences':sentences}
     if file_name:
         results['file_name'] = file_name

@@ -261,7 +274,6 @@ def parse_xml_output(input_dir, corenlp_path="stanford-corenlp-full-2013-04-04/"
     #we get a list of the cleaned files that we want to parse:

     files = [input_dir+'/'+f for f in os.listdir(input_dir)]
-    file_name = re.sub('.xml$', '', f)

     #creating the file list of files to parse

@@ -273,19 +285,20 @@ def parse_xml_output(input_dir, corenlp_path="stanford-corenlp-full-2013-04-04/"

     #creates the xml file of parser output:

-    os.system(command)
+    call(command, shell=True)

     #reading in the raw xml file:
+    result = []
     try:
         for output_file in os.listdir(xml_dir):
             with open(xml_dir+'/'+output_file, 'r') as xml:
-                parsed = xml.read()
-                yield parse_parser_xml_results(parsed, file_name)
+                # parsed = xml.read()
+                file_name = re.sub('.xml$', '', os.path.basename(output_file))
+                result.append(parse_parser_xml_results(xml.read(), file_name))
     finally:
         file_list.close()
-        try:
-            shutil.rmtree(xml_dir)
-        except: pass
+        shutil.rmtree(xml_dir)
+    return result

 class StanfordCoreNLP:
     """
@@ -366,11 +379,12 @@ def clean_up():
         max_expected_time = max(300.0, len(to_send) / 3.0)

         # repeated_input = self.corenlp.except("\n") # confirm it
-        t = self.corenlp.expect(["\nNLP> ", pexpect.TIMEOUT, pexpect.EOF],
+        t = self.corenlp.expect(["\nNLP> ", pexpect.TIMEOUT, pexpect.EOF,
+                                 "\nWARNING: Parsing of sentence failed, possibly because of out of memory."],
                                 timeout=max_expected_time)
         incoming = self.corenlp.before
         if t == 1:
-            # TIMEOUT, clean up anything when raise pexpect.TIMEOUT error
+            # TIMEOUT, clean up anything left in buffer
             clean_up()
             print >>sys.stderr, {'error': "timed out after %f seconds" % max_expected_time,
                                  'input': to_send,
@@ -383,6 +397,12 @@ def clean_up():
                                  'output': incoming}
             self.corenlp.close()
             raise ProcessError("CoreNLP process terminates abnormally while parsing")
+        elif t == 3:
+            # out of memory
+            print >>sys.stderr, {'error': "WARNING: Parsing of sentence failed, possibly because of out of memory.",
+                                 'input': to_send,
+                                 'output': incoming}
+            return

         if VERBOSE: print "%s\n%s" % ('='*40, incoming)
         try:
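The expect() call above returns the index of whichever pattern matches first in the child's output, which is what the new t == 3 branch keys on: the out-of-memory warning is reported and the sentence skipped instead of waiting for a timeout. A small stand-alone sketch of that idiom; the spawned command is only a stand-in that prints the warning text, not CoreNLP itself:

    import sys
    import pexpect

    # Stand-in process that emits the same warning line CoreNLP prints.
    child = pexpect.spawn('/bin/echo',
                          ['WARNING: Parsing of sentence failed, possibly because of out of memory.'])

    t = child.expect(["NLP> ",          # 0: prompt came back, parse succeeded
                      pexpect.TIMEOUT,  # 1: nothing arrived in time
                      pexpect.EOF,      # 2: the process died
                      "WARNING: Parsing of sentence failed, possibly because of out of memory."],  # 3
                     timeout=5)

    if t == 3:
        # Same branch as the new `elif t == 3` above: report and move on.
        print >>sys.stderr, {'error': 'out of memory while parsing', 'output': child.before}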
@@ -429,6 +449,7 @@ def batch_parse(input_folder, corenlp_path="stanford-corenlp-full-2013-04-04/",
     """
     The code below starts an JSONRPC server
     """
+    from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer
     VERBOSE = True
     parser = optparse.OptionParser(usage="%prog [OPTIONS]")
     parser.add_option('-p', '--port', default='8080',
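Deferring the jsonrpclib import into the __main__ block is what lets the README list jsonrpclib as optional: importing corenlp just for batch_parse no longer requires it, and the dependency is only touched when the module is run as a server. A rough sketch of what that server block wires together, based on the README's defaults; the host, port, and registered method here are assumptions, not copied from the file:

    from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer
    from corenlp import StanfordCoreNLP

    server = SimpleJSONRPCServer(('127.0.0.1', 8080))  # default port mentioned in the README (assumed)
    nlp = StanfordCoreNLP()                            # loads the models; takes a few minutes
    server.register_function(nlp.parse)                # expose parse() over JSON-RPC
    server.serve_forever()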

0 commit comments