diff --git a/README.md b/README.md index 4cadded..fb7e533 100644 --- a/README.md +++ b/README.md @@ -1,151 +1,32 @@ -# Python interface to Stanford Core NLP tools v1.3.3 +# A Python wrapper for the Java Stanford Core NLP tools -This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM on 64-bit machines and usually a few minutes loading time), most applications will probably want to run it as a server. +This is a fork of [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), originally written by Dustin Smith, and it incorporates the fixes made by Hiroyoshi Komatsu [corenlp-python](https://bitbucket.org/torotoki/corenlp-python). +## New changes: - * Python interface to Stanford CoreNLP tools: tagging, phrase-structure parsing, dependency parsing, named entity resolution, and coreference resolution. - * Runs an JSON-RPC server that wraps the Java server and outputs JSON. - * Outputs parse trees which can be used by [nltk](http://nltk.googlecode.com/svn/trunk/doc/howto/tree.html). +The original wrapper only worked for extremely short texts and it did not accept files. This new version cleans the raw text files that are placed in the "raw_text" folder, places the cleaned files into the "clean_text" folder, runs the Stanford tools on those cleaned files and creates a large xml file which is placed in the xml folder. The "parse_xml_output()" calls the Stanford tools and parses the xml file to create a python object that is identical to the object that was created in the old version, except that the text length is now unlimited. All changes were made to one file: "corenlp.py", which can be found in the "corenlp" folder. For more information, please consult the "README" file (by Hiroyoshi Komatsu) in the "corenlp-python" folder!
I am constantly working on this project as these tools help me in my work and I welcome any feedback [jac2130@columbia.edu]. +## New Requirements + * [xmltodict](https://github.com/martinblech/xmltodict) -It requires [pexpect](http://www.noah.org/wiki/pexpect) and (optionally) [unidecode](http://pypi.python.org/pypi/Unidecode) to handle non-ASCII text. This script includes and uses code from [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/). +## To use the new feature: -It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 1.3.3** released 2012-07-09. + import sys -## Download and Usage + sys.path.append("[full-path]/corenlp-wrapper/corenlp") -To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the tgz file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run. 
+ from collections import OrderedDict -In other words: + from corenlp import StanfordCoreNLP - sudo pip install pexpect unidecode # unidecode is optional - git clone git://github.com/dasmith/stanford-corenlp-python.git - cd stanford-corenlp-python - wget http://nlp.stanford.edu/software/stanford-corenlp-2012-07-09.tgz - tar xvfz stanford-corenlp-2012-07-09.tgz + corenlp_dir = "[full-path]/corenlp-wrapper/stanford-corenlp-full-2013-04-04" -Then, to launch a server: + corenlp = StanfordCoreNLP(corenlp_dir) - python corenlp.py + parse_dict=eval(corenlp.parse()) -Optionally, you can specify a host or port: +If you instead type: - python corenlp.py -H 0.0.0.0 -p 3456 - -That will run a public JSON-RPC server on port 3456. - -Assuming you are running on port 8080, the code in `client.py` shows an example parse: - - import jsonrpc - from simplejson import loads - server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(), - jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080))) - - result = loads(server.parse("Hello world. It is so beautiful")) - print "Result", result - -That returns a dictionary containing the keys `sentences` and (when applicable) `corefs`. The key `sentences` contains a list of dictionaries for each sentence, which contain `parsetree`, `text`, `tuples` containing the dependencies, and `words`, containing information about parts of speech, NER, etc: - - {u'sentences': [{u'parsetree': u'(ROOT (S (VP (NP (INTJ (UH Hello)) (NP (NN world)))) (. 
!)))', - u'text': u'Hello world!', - u'tuples': [[u'dep', u'world', u'Hello'], - [u'root', u'ROOT', u'world']], - u'words': [[u'Hello', - {u'CharacterOffsetBegin': u'0', - u'CharacterOffsetEnd': u'5', - u'Lemma': u'hello', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'UH'}], - [u'world', - {u'CharacterOffsetBegin': u'6', - u'CharacterOffsetEnd': u'11', - u'Lemma': u'world', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'NN'}], - [u'!', - {u'CharacterOffsetBegin': u'11', - u'CharacterOffsetEnd': u'12', - u'Lemma': u'!', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'.'}]]}, - {u'parsetree': u'(ROOT (S (NP (PRP It)) (VP (VBZ is) (ADJP (RB so) (JJ beautiful))) (. .)))', - u'text': u'It is so beautiful.', - u'tuples': [[u'nsubj', u'beautiful', u'It'], - [u'cop', u'beautiful', u'is'], - [u'advmod', u'beautiful', u'so'], - [u'root', u'ROOT', u'beautiful']], - u'words': [[u'It', - {u'CharacterOffsetBegin': u'14', - u'CharacterOffsetEnd': u'16', - u'Lemma': u'it', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'PRP'}], - [u'is', - {u'CharacterOffsetBegin': u'17', - u'CharacterOffsetEnd': u'19', - u'Lemma': u'be', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'VBZ'}], - [u'so', - {u'CharacterOffsetBegin': u'20', - u'CharacterOffsetEnd': u'22', - u'Lemma': u'so', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'RB'}], - [u'beautiful', - {u'CharacterOffsetBegin': u'23', - u'CharacterOffsetEnd': u'32', - u'Lemma': u'beautiful', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'JJ'}], - [u'.', - {u'CharacterOffsetBegin': u'32', - u'CharacterOffsetEnd': u'33', - u'Lemma': u'.', - u'NamedEntityTag': u'O', - u'PartOfSpeech': u'.'}]]}], - u'coref': [[[[u'It', 1, 0, 0, 1], [u'Hello world', 0, 1, 0, 2]]]]} - -To use it in a regular script or to edit/debug it (because errors via RPC are opaque), load the module instead: - - from corenlp import * - corenlp = StanfordCoreNLP() # wait a few minutes... 
- corenlp.parse("Parse it") - - - - -## Questions - -**Stanford CoreNLP tools require a large amount of free memory**. Java 5+ uses about 50% more RAM on 64-bit machines than 32-bit machines. 32-bit machine users can lower the memory requirements by changing `-Xmx3g` to `-Xmx2g` or even less. -If pexpect timesout while loading models, check to make sure you have enough memory and can run the server alone without your kernel killing the java process: - - java -cp stanford-corenlp-2012-07-09.jar:stanford-corenlp-2012-07-06-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties - -You can reach me, Dustin Smith, by sending a message on GitHub or through email (contact information is available [on my webpage](http://web.media.mit.edu/~dustin)). - - -# Contributors - -This is free and open source software and has benefited from the contribution and feedback of others. Like Stanford's CoreNLP tools, it is covered under the [GNU General Public License v2 +](http://www.gnu.org/licenses/gpl-2.0.html), which in short means that modifications to this program must maintain the same free and open source distribution policy. - -This project has benefited from the contributions of: - - * @jcc Justin Cheng - * Abhaya Agarwal - -## Related Projects - -These two projects are python wrappers for the [Stanford Parser](http://nlp.stanford.edu/software/lex-parser.shtml), which includes the Stanford Parser, although the Stanford Parser is another project. - - [stanford-parser-python](http://projects.csail.mit.edu/spatial/Stanford_Parser) uses [JPype](http://jpype.sourceforge.net/) (interface to JVM) - - [stanford-parser-jython](http://blog.gnucom.cc/2010/using-the-stanford-parser-with-jython/) uses Python + parse_dict=eval(corenlp.parse(text)) +where "text" is any non-empty string, the behavior is identical to the behavior before the new changes. 
Thus, if you want to use the new feature, you can just leave the text blank and place your raw files into the "raw_text" folder. The rest is as before. diff --git a/LICENSE b/corenlp-wrapper/LICENSE similarity index 100% rename from LICENSE rename to corenlp-wrapper/LICENSE diff --git a/corenlp-wrapper/MANIFEST b/corenlp-wrapper/MANIFEST new file mode 100644 index 0000000..3e64867 --- /dev/null +++ b/corenlp-wrapper/MANIFEST @@ -0,0 +1,7 @@ +# file GENERATED by distutils, do NOT edit +setup.py +corenlp/__init__.py +corenlp/client.py +corenlp/corenlp.py +corenlp/default.properties +corenlp/progressbar.py diff --git a/corenlp-wrapper/README.md b/corenlp-wrapper/README.md new file mode 100644 index 0000000..590a02c --- /dev/null +++ b/corenlp-wrapper/README.md @@ -0,0 +1,289 @@ +# A Python wrapper of Stanford Core NLP tools + + +This is a fork of [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python). + +## Edited + * Update to Stanford CoreNLP v1.3.5 + * Fix many bugs & improve performance + * Using jsonrpclib for stability and performance + * Can edit the constants as argument such as Stanford Core NLP directory. + * Adjust parameters not to timeout in high load + * Packaging (beta) + +## Requirements + * [jsonrpclib](https://github.com/joshmarshall/jsonrpclib) + * [pexpect](http://www.noah.org/wiki/pexpect) + * [unidecode](http://pypi.python.org/pypi/Unidecode) (optionally) + +## Download and Usage + +To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the zip file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run. 
+ + +In other words: + + sudo pip install jsonrpclib pexpect unidecode # unidecode is optional + git clone https://bitbucket.org/torotoki/corenlp-python.git + cd corenlp-python + wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-04-04.zip + unzip stanford-corenlp-full-2013-04-04.zip + +Then, to launch a server: + + python corenlp/corenlp.py + +Optionally, you can specify a host or port: + + python corenlp/corenlp.py -H 0.0.0.0 -p 3456 + +That will run a public JSON-RPC server on port 3456. +And you can specify Stanford CoreNLP directory: + + python corenlp/corenlp.py -S stanford-corenlp-full-2013-04-04/ + + +Assuming you are running on port 8080 and CoreNLP directory is `stanford-corenlp-full-2013-04-04/` in current directory, the code in `client.py` shows an example parse: + + import jsonrpclib + from simplejson import loads + server = jsonrpclib.Server("http://localhost:8080") + + result = loads(server.parse("Hello world. It is so beautiful")) + print "Result", result + +That returns a dictionary containing the keys `sentences` and (when applicable) `corefs`. The key `sentences` contains a list of dictionaries for each sentence, which contain `parsetree`, `text`, `tuples` containing the dependencies, and `words`, containing information about parts of speech, NER, etc: + + {u'sentences': [{u'parsetree': u'(ROOT (S (VP (NP (INTJ (UH Hello)) (NP (NN world)))) (. 
!)))', + u'text': u'Hello world!', + u'tuples': [[u'dep', u'world', u'Hello'], + [u'root', u'ROOT', u'world']], + u'words': [[u'Hello', + {u'CharacterOffsetBegin': u'0', + u'CharacterOffsetEnd': u'5', + u'Lemma': u'hello', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'UH'}], + [u'world', + {u'CharacterOffsetBegin': u'6', + u'CharacterOffsetEnd': u'11', + u'Lemma': u'world', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'NN'}], + [u'!', + {u'CharacterOffsetBegin': u'11', + u'CharacterOffsetEnd': u'12', + u'Lemma': u'!', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'.'}]]}, + {u'parsetree': u'(ROOT (S (NP (PRP It)) (VP (VBZ is) (ADJP (RB so) (JJ beautiful))) (. .)))', + u'text': u'It is so beautiful.', + u'tuples': [[u'nsubj', u'beautiful', u'It'], + [u'cop', u'beautiful', u'is'], + [u'advmod', u'beautiful', u'so'], + [u'root', u'ROOT', u'beautiful']], + u'words': [[u'It', + {u'CharacterOffsetBegin': u'14', + u'CharacterOffsetEnd': u'16', + u'Lemma': u'it', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'PRP'}], + [u'is', + {u'CharacterOffsetBegin': u'17', + u'CharacterOffsetEnd': u'19', + u'Lemma': u'be', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'VBZ'}], + [u'so', + {u'CharacterOffsetBegin': u'20', + u'CharacterOffsetEnd': u'22', + u'Lemma': u'so', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'RB'}], + [u'beautiful', + {u'CharacterOffsetBegin': u'23', + u'CharacterOffsetEnd': u'32', + u'Lemma': u'beautiful', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'JJ'}], + [u'.', + {u'CharacterOffsetBegin': u'32', + u'CharacterOffsetEnd': u'33', + u'Lemma': u'.', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'.'}]]}], + u'coref': [[[[u'It', 1, 0, 0, 1], [u'Hello world', 0, 1, 0, 2]]]]} + +To use it in a regular script or to edit/debug it (because errors via RPC are opaque), load the module instead: + + from corenlp import StanfordCoreNLP + corenlp_dir = "stanford-corenlp-full-2013-04-04/" + corenlp = StanfordCoreNLP(corenlp_dir) # wait a few minutes... 
+ corenlp.parse("Parse it") + + + + +Following original README in stanford-corenlp-python. + +------------------------------------- + + Python interface to Stanford Core NLP tools v1.3.3 + +This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM on 64-bit machines and usually a few minutes loading time), most applications will probably want to run it as a server. + + + * Python interface to Stanford CoreNLP tools: tagging, phrase-structure parsing, dependency parsing, named entity resolution, and coreference resolution. + * Runs an JSON-RPC server that wraps the Java server and outputs JSON. + * Outputs parse trees which can be used by [nltk](http://nltk.googlecode.com/svn/trunk/doc/howto/tree.html). + + +It requires [pexpect](http://www.noah.org/wiki/pexpect) and (optionally) [unidecode](http://pypi.python.org/pypi/Unidecode) to handle non-ASCII text. This script includes and uses code from [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/). + +It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 1.3.3** released 2012-07-09. + +## Download and Usage + +To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the tgz file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run. 
+ +In other words: + + sudo pip install pexpect unidecode # unidecode is optional + git clone git://github.com/dasmith/stanford-corenlp-python.git + cd stanford-corenlp-python + wget http://nlp.stanford.edu/software/stanford-corenlp-2012-07-09.tgz + tar xvfz stanford-corenlp-2012-07-09.tgz + +Then, to launch a server: + + python corenlp.py + +Optionally, you can specify a host or port: + + python corenlp.py -H 0.0.0.0 -p 3456 + +That will run a public JSON-RPC server on port 3456. + +Assuming you are running on port 8080, the code in `client.py` shows an example parse: + + import jsonrpc + from simplejson import loads + server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(), + jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080))) + + result = loads(server.parse("Hello world. It is so beautiful")) + print "Result", result + +That returns a dictionary containing the keys `sentences` and (when applicable) `corefs`. The key `sentences` contains a list of dictionaries for each sentence, which contain `parsetree`, `text`, `tuples` containing the dependencies, and `words`, containing information about parts of speech, NER, etc: + + {u'sentences': [{u'parsetree': u'(ROOT (S (VP (NP (INTJ (UH Hello)) (NP (NN world)))) (. !)))', + u'text': u'Hello world!', + u'tuples': [[u'dep', u'world', u'Hello'], + [u'root', u'ROOT', u'world']], + u'words': [[u'Hello', + {u'CharacterOffsetBegin': u'0', + u'CharacterOffsetEnd': u'5', + u'Lemma': u'hello', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'UH'}], + [u'world', + {u'CharacterOffsetBegin': u'6', + u'CharacterOffsetEnd': u'11', + u'Lemma': u'world', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'NN'}], + [u'!', + {u'CharacterOffsetBegin': u'11', + u'CharacterOffsetEnd': u'12', + u'Lemma': u'!', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'.'}]]}, + {u'parsetree': u'(ROOT (S (NP (PRP It)) (VP (VBZ is) (ADJP (RB so) (JJ beautiful))) (. 
.)))', + u'text': u'It is so beautiful.', + u'tuples': [[u'nsubj', u'beautiful', u'It'], + [u'cop', u'beautiful', u'is'], + [u'advmod', u'beautiful', u'so'], + [u'root', u'ROOT', u'beautiful']], + u'words': [[u'It', + {u'CharacterOffsetBegin': u'14', + u'CharacterOffsetEnd': u'16', + u'Lemma': u'it', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'PRP'}], + [u'is', + {u'CharacterOffsetBegin': u'17', + u'CharacterOffsetEnd': u'19', + u'Lemma': u'be', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'VBZ'}], + [u'so', + {u'CharacterOffsetBegin': u'20', + u'CharacterOffsetEnd': u'22', + u'Lemma': u'so', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'RB'}], + [u'beautiful', + {u'CharacterOffsetBegin': u'23', + u'CharacterOffsetEnd': u'32', + u'Lemma': u'beautiful', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'JJ'}], + [u'.', + {u'CharacterOffsetBegin': u'32', + u'CharacterOffsetEnd': u'33', + u'Lemma': u'.', + u'NamedEntityTag': u'O', + u'PartOfSpeech': u'.'}]]}], + u'coref': [[[[u'It', 1, 0, 0, 1], [u'Hello world', 0, 1, 0, 2]]]]} + +To use it in a regular script or to edit/debug it (because errors via RPC are opaque), load the module instead: + + from corenlp import * + corenlp = StanfordCoreNLP() # wait a few minutes... + corenlp.parse("Parse it") + + + + +## Questions + +**Stanford CoreNLP tools require a large amount of free memory**. Java 5+ uses about 50% more RAM on 64-bit machines than 32-bit machines. 32-bit machine users can lower the memory requirements by changing `-Xmx3g` to `-Xmx2g` or even less. 
+If pexpect times out while loading models, check to make sure you have enough memory and can run the server alone without your kernel killing the java process: + + java -cp stanford-corenlp-2012-07-09.jar:stanford-corenlp-2012-07-06-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties + +You can reach me, Dustin Smith, by sending a message on GitHub or through email (contact information is available [on my webpage](http://web.media.mit.edu/~dustin)). + + +# Contributors + +This is free and open source software and has benefited from the contribution and feedback of others. Like Stanford's CoreNLP tools, it is covered under the [GNU General Public License v2 +](http://www.gnu.org/licenses/gpl-2.0.html), which in short means that modifications to this program must maintain the same free and open source distribution policy. + +This project has benefited from the contributions of: + + * @jcc Justin Cheng + * Abhaya Agarwal + +## Related Projects + +These two projects are python wrappers for the [Stanford Parser](http://nlp.stanford.edu/software/lex-parser.shtml), which includes the Stanford Parser, although the Stanford Parser is another project. + - [stanford-parser-python](http://projects.csail.mit.edu/spatial/Stanford_Parser) uses [JPype](http://jpype.sourceforge.net/) (interface to JVM) + - [stanford-parser-jython](http://blog.gnucom.cc/2010/using-the-stanford-parser-with-jython/) uses Python diff --git a/corenlp-wrapper/corenlp/__init__.py b/corenlp-wrapper/corenlp/__init__.py new file mode 100644 index 0000000..2ed1ffc --- /dev/null +++ b/corenlp-wrapper/corenlp/__init__.py @@ -0,0 +1,12 @@ +# corenlp
# Copyright 2013- Hiroyoshi Komatsu
# See LICENSE for details. 
+ +""" +Stanford CoreNLP Python wrapper +""" +__version__ = '1.0.3' +__author__ = 'Hiroyoshi Komatsu' +__license__ = 'GNU v2+' + +from corenlp import StanfordCoreNLP, ParserError, TimeoutError, ProcessError diff --git a/client.py b/corenlp-wrapper/corenlp/client.py old mode 100644 new mode 100755 similarity index 60% rename from client.py rename to corenlp-wrapper/corenlp/client.py index 11097ce..c5c4a98 --- a/client.py +++ b/corenlp-wrapper/corenlp/client.py @@ -1,12 +1,13 @@ import json -from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp +# from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp +import jsonrpclib from pprint import pprint + class StanfordNLP: - def __init__(self): - self.server = ServerProxy(JsonRpc20(), - TransportTcpIp(addr=("127.0.0.1", 8080))) - + def __init__(self, port_number=8080): + self.server = jsonrpclib.Server("http://localhost:%d" % port_number) + def parse(self, text): return json.loads(self.server.parse(text)) diff --git a/corenlp-wrapper/corenlp/corenlp.py b/corenlp-wrapper/corenlp/corenlp.py new file mode 100755 index 0000000..c3413ab --- /dev/null +++ b/corenlp-wrapper/corenlp/corenlp.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python +# +# corenlp - Python interface to Stanford Core NLP tools +# Copyright (c) 2012 Dustin Smith +# https://github.com/dasmith/stanford-corenlp-python +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +import json, optparse, os, re, sys, time, traceback +import pexpect +from progressbar import ProgressBar, Fraction +from unidecode import unidecode +from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer +import nltk, nltk.data + + +class bc: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + + +VERBOSE = True +STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5 +WORD_PATTERN = re.compile('\[([^\]]+)\]') +CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"") + +class ProcessError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +class ParserError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +class TimeoutError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +def clean_raw_text(files=[]): + #cleans all files contained in the directory "files/raw_text/" and places + #them into the "files/clean_text" directory. 
+ import re + import nltk, nltk.data + + sent_detector=nltk.data.load('tokenizers/punkt/english.pickle') + + raw_files=['files/raw_text/' + f for f in os.listdir('files/raw_text/')] if not files else files + clean_files=['files/clean_text/' + raw.split('/')[-1][:-4] + '_clean.txt' for raw in raw_files] + + for raw, clean in zip(raw_files, clean_files): + raw_text=open(raw, 'r').read() + text=re.sub(r'-+(\n)\s*', '', raw_text) + text=re.sub(r'(\n)+', '', text) + text= ' '.join([' '.join(nltk.word_tokenize(sent)) for sent in sent_detector.tokenize(text.strip())]) + open(clean, 'w').write(text) + + +def remove_id(word): + """Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word' """ + return word.count("-") == 0 and word or word[0:word.rindex("-")] + + +def parse_bracketed(s): + '''Parse word features [abc=... def = ...] + Also manages to parse out features that have XML within them + ''' + word = None + attrs = {} + temp = {} + # Substitute XML tags, to replace them later + for i, tag in enumerate(re.findall(r"(<[^<>]+>.*<\/[^<>]+>)", s)): + temp["^^^%d^^^" % i] = tag + s = s.replace(tag, "^^^%d^^^" % i) + # Load key-value pairs, substituting as necessary + for attr, val in re.findall(r"([^=\s]*)=([^=\s]*)", s): + if val in temp: + val = temp[val] + if attr == 'Text': + word = val + else: + attrs[attr] = val + return (word, attrs) + +def parse_xml_output(): + import os + import nltk, nltk.data + import xmltodict + from collections import OrderedDict + """Because interaction with the command-line interface of the CoreNLP + tools is limited to very short text bits, it is necessary to parse xml + output""" + #First, we change to the directory where we place the xml files from the + #parser: + here = os.path.dirname(os.path.abspath( __file__ )) + os.chdir(here) + os.chdir('../../files/xml') + + #we get a list of the cleaned files that we want to parse: + + files=['../clean_text/' + f for f in os.listdir('../clean_text')] + + #creating the 
file list of files to parse; the stanford tools require a file of file-names, with each file name on its own line: + + with open('../files.txt', 'w') as write_files: + write_files.write('\n'.join(files)) + + sent_detector=nltk.data.load('tokenizers/punkt/english.pickle') + lines=[] + #extracting the sentences from the text: + + [lines.extend(sent_detector.tokenize(open(text, 'r').read().strip())) for text in files] + + command='java -Xmx3g -cp ../../corenlp-wrapper/stanford-corenlp-full-2013-04-04/stanford-corenlp-1.3.5.jar:../../corenlp-wrapper/stanford-corenlp-full-2013-04-04/stanford-corenlp-1.3.5-models.jar:../../corenlp-wrapper/stanford-corenlp-full-2013-04-04/xom.jar:../../corenlp-wrapper/stanford-corenlp-full-2013-04-04/joda-time.jar:../../corenlp-wrapper/stanford-corenlp-full-2013-04-04/jollyday.jar edu.stanford.nlp.pipeline.StanfordCoreNLP -props ../../corenlp-wrapper/corenlp/default.properties -filelist ../files.txt' + + #creates the xml file of parser output: + + os.system(command) + + #reading in the raw xml file: + xml=open(os.listdir('.')[0], 'r').read() + + #turning the raw xml into a raw python dictionary: + raw_dict=xmltodict.parse(xml) + + #making a raw sentence list of dictionaries: + raw_sent_list=raw_dict[u'root'][u'document'][u'sentences'][u'sentence'] + #making a raw coref dictionary: + raw_coref_list=raw_dict[u'root'][u'document'][u'coreference'][u'coreference'] + + #cleaning up the list ...the problem is that this doesn't come in pairs, as the command line version: + + coref_list=[[[eval(raw_coref_list[j][u'mention'][i]['sentence'])-1, eval(raw_coref_list[j][u'mention'][i]['head'])-1, eval(raw_coref_list[j][u'mention'][i]['start'])-1, eval(raw_coref_list[j][u'mention'][i]['end'])-1] for i in range(len(raw_coref_list[j][u'mention']))] for j in range(len(raw_coref_list))] + + [[coref.insert(0,' '.join(lines[coref[0]].split()[coref[-2]:coref[-1]])) for coref in coref_list[j]] for j in range(len(coref_list))] + os.chdir('../..') + + 
coref_list=[[[coref_list[j][i], coref_list[j][0]] for i in range(len(coref_list[j]))] for j in range(len(coref_list))] + + sentences=[{'dependencies': [[dep['dep'][i]['@type'], dep['dep'][i]['governor']['#text'], dep['dep'][i]['dependent']['#text']] for dep in raw_sent_list[j][u'dependencies'] for i in range(len(dep['dep'])) if dep['@type']=='basic-dependencies'], 'text': lines[j], 'parsetree': str(raw_sent_list[j]['parse']), 'words': [[str(token['word']), OrderedDict([('NamedEntityTag', str(token['NER'])), ('CharacterOffsetEnd', str(token['CharacterOffsetEnd'])), ('CharacterOffsetBegin', str(token['CharacterOffsetBegin'])), ('PartOfSpeech', str(token['POS'])), ('Lemma', str(token['lemma']))])] for token in raw_sent_list[j]['tokens'][u'token']]} for j in range(len(lines))] + + results={'coref':coref_list, 'sentences':sentences} + + return results + +def parse_parser_results(text): + """ This is the nasty bit of code to interact with the command-line + interface of the CoreNLP tools. Takes a string of the parser results + and then returns a Python list of dictionaries, one for each parsed + sentence. + """ + results = {"sentences": []} + state = STATE_START + for line in unidecode(text.decode('utf-8')).split("\n"): + line = line.strip() + + if line.startswith("Sentence #"): + sentence = {'words':[], 'parsetree':[], 'dependencies':[]} + results["sentences"].append(sentence) + state = STATE_TEXT + + elif state == STATE_TEXT: + sentence['text'] = line + state = STATE_WORDS + + elif state == STATE_WORDS: + if not line.startswith("[Text="): + raise ParserError('Parse error. 
Could not find "[Text=" in: %s' % line) + for s in WORD_PATTERN.findall(line): + sentence['words'].append(parse_bracketed(s)) + state = STATE_TREE + + elif state == STATE_TREE: + if len(line) == 0: + state = STATE_DEPENDENCY + sentence['parsetree'] = " ".join(sentence['parsetree']) + else: + sentence['parsetree'].append(line) + + elif state == STATE_DEPENDENCY: + if len(line) == 0: + state = STATE_COREFERENCE + else: + split_entry = re.split("\(|, ", line[:-1]) + if len(split_entry) == 3: + rel, left, right = map(lambda x: remove_id(x), split_entry) + sentence['dependencies'].append(tuple([rel,left,right])) + + elif state == STATE_COREFERENCE: + if "Coreference set" in line: + if 'coref' not in results: + results['coref'] = [] + coref_set = [] + results['coref'].append(coref_set) + else: + for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line): + src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1 + sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1 + coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) + + return results + + +class StanfordCoreNLP(object): + """ + Command-line interaction with Stanford's CoreNLP java utilities. + Can be run as a JSON-RPC server or imported as a module. + """ + def __init__(self, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"): + """ + Checks the location of the jar files. + Spawns the server as a process. 
+ """ + + jars = ["stanford-corenlp-1.3.5.jar", + "stanford-corenlp-1.3.5-models.jar", + "xom.jar", + "joda-time.jar", + "jollyday.jar"] + + java_path = "java" + classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" + # include the properties file, so you can change defaults + # but any changes in output format will break parse_parser_results() + property_name = "default.properties" + current_dir_pr = os.path.dirname(os.path.abspath( __file__ )) +"/"+ property_name + if os.path.exists(property_name): + props = "-props %s" % (property_name) + elif os.path.exists(current_dir_pr): + props = "-props %s" % (current_dir_pr) + else: + raise Exception("Error! Cannot locate: default.properties") + + # add and check classpaths + jars = [corenlp_path +"/"+ jar for jar in jars] + for jar in jars: + if not os.path.exists(jar): + raise Exception("Error! Cannot locate: %s" % jar) + + # add memory limit on JVM + if memory: + limit = "-Xmx%s" % memory + else: + limit = "" + + # spawn the server + start_corenlp = "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(jars), classname, props) + if VERBOSE: print "===========================================\n", start_corenlp + self.corenlp = pexpect.spawn(start_corenlp) + + # show progress bar while loading the models + if VERBOSE: + widgets = ['Loading Models: ', Fraction()] + pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start() + self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec) + if VERBOSE: pbar.update(1) + self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec) + if VERBOSE: pbar.update(2) + self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec) + if VERBOSE: pbar.update(3) + self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec) + if VERBOSE: pbar.update(4) + self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec) + if VERBOSE: pbar.update(5) + self.corenlp.expect("Entering interactive shell.") + if VERBOSE: 
pbar.finish() + + # interactive shell + self.corenlp.expect("\nNLP> ", timeout=3) + + def close(self, force=True): + self.corenlp.terminate(force) + + def isalive(self): + return self.corenlp.isalive() + + def __del__(self): + # If our child process is still around, kill it + if self.isalive(): + self.close() + + def _parse(self, text): + """ + This is the core interaction with the parser. + + It returns a Python data-structure, while the parse() + function returns a JSON object + """ + + # CoreNLP interactive shell cannot recognize newline + if '\n' in text or '\r' in text: + to_send = re.sub("[\r\n]", " ", text).strip() + else: + to_send = text + + # clean up anything leftover + def clean_up(): + while True: + try: + self.corenlp.read_nonblocking (8192, 0.1) + except pexpect.TIMEOUT: + break + clean_up() + bytes_written = self.corenlp.sendline(to_send) + print bc.HEADER, "bytes written", bytes_written, bc.ENDC + + # How much time should we give the parser to parse it? + # the idea here is that you increase the timeout as a + # function of the text's length. 
+ # max_expected_time = max(5.0, 3 + len(to_send) / 5.0) + max_expected_time = max(300.0, len(to_send) / 3.0)*9000000000 + + # repeated_input = self.corenlp.except("\n") # confirm it + t = self.corenlp.expect(["\nNLP> ", pexpect.TIMEOUT, pexpect.EOF], + timeout=max_expected_time) + incoming = self.corenlp.before + if t == 1: + # TIMEOUT, clean up anything when raise pexpect.TIMEOUT error + clean_up() + print >>sys.stderr, {'error': "timed out after %f seconds" % max_expected_time, + 'input': to_send, + 'output': incoming} + raise TimeoutError("Timed out after %d seconds" % max_expected_time) + elif t == 2: + # EOF, probably crash CoreNLP process + print >>sys.stderr, {'error': "CoreNLP terminates abnormally while parsing", + 'input': to_send, + 'output': incoming} + self.corenlp.close() + raise ProcessError("CoreNLP process terminates abnormally while parsing") + + if VERBOSE: print "%s\n%s" % ('='*40, incoming) + try: + results = parse_parser_results(incoming) + except Exception, e: + if VERBOSE: print traceback.format_exc() + raise e + + return results + + def raw_parse(self, text): + """ + This function takes a text string, sends it to the Stanford parser, + reads in the result, parses the results and returns a list + with one dictionary entry for each parsed sentence. + """ + return self._parse(text) + + def parse(self, text='', files=[]): + """ + This function takes a text string, sends it to the Stanford parser, + reads in the result, parses the results and returns a list + with one dictionary entry for each parsed sentence, in JSON format. 
+ """ + if text: + return json.dumps(self._parse(text)) + else: + clean_raw_text(files=files) + return str(parse_xml_output()) + + +if __name__ == '__main__': + """ + The code below starts an JSONRPC server + """ + VERBOSE = True + parser = optparse.OptionParser(usage="%prog [OPTIONS]") + parser.add_option('-p', '--port', default='8080', + help='Port to serve on (default 8080)') + parser.add_option('-H', '--host', default='127.0.0.1', + help='Host to serve on (default localhost; 0.0.0.0 to make public)') + parser.add_option('-S', '--corenlp', default="stanford-corenlp-full-2013-04-04", + help='Stanford CoreNLP tool directory (default stanford-corenlp-full-2013-04-04/)') + options, args = parser.parse_args() + # server = jsonrpc.Server(jsonrpc.JsonRpc20(), + # jsonrpc.TransportTcpIp(addr=(options.host, int(options.port)))) + try: + server = SimpleJSONRPCServer((options.host, int(options.port))) + + nlp = StanfordCoreNLP(options.corenlp) + server.register_function(nlp.parse) + + print 'Serving on http://%s:%s' % (options.host, options.port) + # server.serve() + server.serve_forever() + except KeyboardInterrupt: + print >>sys.stderr, "Bye." + exit() diff --git a/default.properties b/corenlp-wrapper/corenlp/default.properties similarity index 98% rename from default.properties rename to corenlp-wrapper/corenlp/default.properties index e069ea9..ee99077 100644 --- a/default.properties +++ b/corenlp-wrapper/corenlp/default.properties @@ -57,7 +57,7 @@ annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref #clean.xmltags = .* # A set of tags which will force the end of a sentence. HTML example: # you would not want to end on , but you would want to end on

. -# Once again, a regular expression. +# Once again, a regular expression. # (Blank means there are no sentence enders.) #clean.sentenceendingtags = # Whether or not to allow malformed xml diff --git a/progressbar.py b/corenlp-wrapper/corenlp/progressbar.py similarity index 100% rename from progressbar.py rename to corenlp-wrapper/corenlp/progressbar.py diff --git a/corenlp-wrapper/setup.py b/corenlp-wrapper/setup.py new file mode 100644 index 0000000..c5eeb8b --- /dev/null +++ b/corenlp-wrapper/setup.py @@ -0,0 +1,43 @@ +import os +from distutils.core import setup + +PACKAGE = "corenlp" +NAME = "corenlp-python" +DESCRIPTION = "A Stanford Core NLP wrapper" +AUTHOR = "Hiroyoshi Komatsu" +AUTHOR_EMAIL = "hiroyoshi.komat@gmail.com" +URL = "https://bitbucket.org/torotoki/corenlp-python" +VERSION = "1.0.3" + +# Utility function to read the README file. +# Used for the long_description. It's nice, because now 1) we have a top level +# README file and 2) it's easier to type in the README file than to put a raw +# string in below ... 
+def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=read("README.md"), + author=AUTHOR, + author_email=AUTHOR_EMAIL, + url=URL, + packages=['corenlp'], + package_dir = {'corenlp': 'corenlp'}, + package_data = { + "corenlp": ["default.properties"] + }, + # data_files = [ + # ('corenlp', ["default.properties"]), + # ], + # package_data=find_package_data( + # PACKAGE, + # only_in_packages=False + # ) + classifiers=[ + "License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)", + "Programming Language :: Python", + ], +) diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04.zip b/corenlp-wrapper/stanford-corenlp-full-2013-04-04.zip new file mode 100644 index 0000000..26fae5a Binary files /dev/null and b/corenlp-wrapper/stanford-corenlp-full-2013-04-04.zip differ diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/CoreNLP-to-HTML.xsl b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/CoreNLP-to-HTML.xsl new file mode 100755 index 0000000..ccb9f8b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/CoreNLP-to-HTML.xsl @@ -0,0 +1,141 @@ + + + + + + + + + +

Stanford CoreNLP XML Output

+
+

Document

+ + + + + + + + +
Sentences
+ + + +
Coreference resolution graph
+ +
+ + + + + + + + Sentence # + +

+ Tokens
+ +

+ +

+ Parse tree
+ +

+ +

+ Uncollapsed dependencies +

    + + + +
+

+ +

+ Collapsed dependencies +

    + + + +
+

+ +

+ Collapsed dependencies with CC processed +

    + + + +
+

+
+ + + + + + + + + + + + + + + + + + + + + + + + + +
IdWordLemmaChar beginChar endPOSNERNormalized NER
+
+ + + + + + +
  • + + ( + ^- + , + ^- + ) + (extra) +
  • +
    + + +
      + +
    1. +
        + +
      • sentence , + headword + (gov) +
      • +
        +
      +
    2. +
      +
    +
    + + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/LIBRARY-LICENSES b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/LIBRARY-LICENSES new file mode 100644 index 0000000..e1fa147 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/LIBRARY-LICENSES @@ -0,0 +1,20 @@ +xom-1.2.6.jar + +Url: http://www.xom.nu/ + +License: http://www.xom.nu/license.xhtml + +LGPL +Gnu lesser general public license + +----------------------------------------- + +joda-time-2.0.jar + +Url: http://joda-time.sourceforge.net/ + +License: http://joda-time.sourceforge.net/license.html + +Apache License 2.0 + +----------------------------------------- diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/LICENSE.txt b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/LICENSE.txt new file mode 100644 index 0000000..60549be --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/LICENSE.txt @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/Makefile b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/Makefile new file mode 100644 index 0000000..2c23e8e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/Makefile @@ -0,0 +1,25 @@ +# This is a rudimentary Makefile for rebuilding Stanford CoreNLP. +# We actually use ant (q.v.) or a Java IDE. + +JAVAC = javac +JAVAFLAGS = -O -d classes -encoding utf-8 + +# Builds the classes' jar file +corenlp: source + mkdir -p classes + $(JAVAC) $(JAVAFLAGS) src/edu/stanford/nlp/*/*.java \ + src/edu/stanford/nlp/*/*/*.java \ + src/edu/stanford/nlp/*/*/*/*.java \ + src/edu/stanford/nlp/*/*/*/*/*.java \ + src/edu/stanford/nlp/*/*/*/*/*/*.java + cd classes ; jar -cfm ../stanford-corenlp-`date +%Y-%m-%d`.jar ../src/META-INF/MANIFEST.MF edu ; cd .. + +# Before making, unjar the source jar file in the 'src' directory +source: + if [ ! -e src ] ; then \ + mkdir src ; cd src ; jar -xf ../stanford-corenlp-*-sources.jar; \ + fi; + +clean: + rm -rf classes + rm -rf src diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/README.txt b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/README.txt new file mode 100644 index 0000000..fe34173 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/README.txt @@ -0,0 +1,80 @@ +Stanford CoreNLP v1.3.5 - 2013-04-04 +Stanford's Suite of NLP Tools +----------------------------- + +Copyright (c) 2009-2012 The Board of Trustees of +The Leland Stanford Junior University. All Rights Reserved. + +DOCUMENTATION + +Please look at the URL below for documention for Stanford CoreNLP: + + http://nlp.stanford.edu/software/corenlp.shtml + +LICENSE + +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2012 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. 
+// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +--------------------------------- +CHANGES +--------------------------------- + +2013-04-04 1.3.5 Speed improvements, coref improvements, + Chinese version, -nthreads option + +2012-11-12 1.3.4 Improved ner model and dependency code, + now possible to change annotator pool for + later StanfordCoreNLP objects + +2012-07-09 1.3.3 Minor bug fixes + +2012-05-22 1.3.2 Improvements to sutime + +2012-03-09 1.3.1 Now supports caseless models (available as DLC) + +2011-12-16 1.3.0 Threadsafe! 
+ Bugs in time annotation fixed + +2011-09-14 1.2.0 Time expression recognizer added to ner annotator + Output bugfixes + Parser can now substitute for tagger + +2011-06-19 1.1.0 Improved coref release + +2011-05-15 1.0.4 More efficient dcoref data structure + Supports already-tokenized input text + +2011-04-17 1.0.3 Compatible with other releases + Support loading arbitrary annotators + Tagger bug fixes, such as "EOS" token + +2010-11-11 1.0.2 Remove wn.jar + +2010-11-11 1.0.1 Add xml removal + +2010-10-07 1.0 Initial release + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/StanfordCoreNlpDemo.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/StanfordCoreNlpDemo.java new file mode 100644 index 0000000..1756e0b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/StanfordCoreNlpDemo.java @@ -0,0 +1,50 @@ + +import java.io.*; +import java.util.*; + +import edu.stanford.nlp.io.*; +import edu.stanford.nlp.ling.*; +import edu.stanford.nlp.pipeline.*; +import edu.stanford.nlp.trees.*; +import edu.stanford.nlp.util.*; + +public class StanfordCoreNlpDemo { + + public static void main(String[] args) throws IOException { + PrintWriter out; + if (args.length > 1) { + out = new PrintWriter(args[1]); + } else { + out = new PrintWriter(System.out); + } + PrintWriter xmlOut = null; + if (args.length > 2) { + xmlOut = new PrintWriter(args[2]); + } + + StanfordCoreNLP pipeline = new StanfordCoreNLP(); + Annotation annotation; + if (args.length > 0) { + annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[0])); + } else { + annotation = new Annotation("Kosgi Santosh sent an email to Stanford University. He didn't get a reply."); + } + + pipeline.annotate(annotation); + pipeline.prettyPrint(annotation, out); + if (xmlOut != null) { + pipeline.xmlPrint(annotation, xmlOut); + } + // An Annotation is a Map and you can get and use the various analyses individually. 
+ // For instance, this gets the parse tree of the first sentence in the text. + List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); + if (sentences != null && sentences.size() > 0) { + CoreMap sentence = sentences.get(0); + Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + out.println(); + out.println("The first sentence parsed is:"); + tree.pennPrint(out); + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/build.xml b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/build.xml new file mode 100644 index 0000000..d5c4694 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/build.xml @@ -0,0 +1,134 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/corenlp.sh b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/corenlp.sh new file mode 100755 index 0000000..862de38 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/corenlp.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# +# Runs Stanford CoreNLP. +# Simple uses for xml and plain text output to files are: +# ./corenlp.sh -file filename +# ./corenlp.sh -file filename -outputFormat text + +scriptdir=`dirname $0` + +echo java -mx3g -cp \"$scriptdir/*\" edu.stanford.nlp.pipeline.StanfordCoreNLP $* +java -mx3g -cp "$scriptdir/*" edu.stanford.nlp.pipeline.StanfordCoreNLP $* diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/input.txt b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/input.txt new file mode 100644 index 0000000..7e2de09 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/input.txt @@ -0,0 +1 @@ +Stanford University is located in California. It is a great university, founded in 1891. 
diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/input.txt.xml b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/input.txt.xml new file mode 100644 index 0000000..63baf71 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/input.txt.xml @@ -0,0 +1,348 @@ + + + + + + + + + Stanford + Stanford + 0 + 8 + NNP + ORGANIZATION + + + University + University + 9 + 19 + NNP + ORGANIZATION + + + is + be + 20 + 22 + VBZ + O + + + located + located + 23 + 30 + JJ + O + + + in + in + 31 + 33 + IN + O + + + California + California + 34 + 44 + NNP + LOCATION + + + . + . + 44 + 45 + . + O + + + (ROOT (S (NP (NNP Stanford) (NNP University)) (VP (VBZ is) (ADJP (JJ located) (PP (IN in) (NP (NNP California))))) (. .))) + + + ROOT + located + + + University + Stanford + + + located + University + + + located + is + + + located + in + + + in + California + + + + + ROOT + located + + + University + Stanford + + + located + University + + + located + is + + + located + California + + + + + ROOT + located + + + University + Stanford + + + located + University + + + located + is + + + located + California + + + + + + + It + it + 46 + 48 + PRP + O + + + is + be + 49 + 51 + VBZ + O + + + a + a + 52 + 53 + DT + O + + + great + great + 54 + 59 + JJ + O + + + university + university + 60 + 70 + NN + O + + + , + , + 70 + 71 + , + O + + + founded + found + 72 + 79 + VBN + O + + + in + in + 80 + 82 + IN + O + + + 1891 + 1891 + 83 + 87 + CD + DATE + 1891 + 1891 + + + . + . + 87 + 88 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (NP (DT a) (JJ great) (NN university)) (, ,) (VP (VBN founded) (PP (IN in) (NP (CD 1891)))))) (. 
.))) + + + ROOT + university + + + university + It + + + university + is + + + university + a + + + university + great + + + university + founded + + + founded + in + + + in + 1891 + + + + + ROOT + university + + + university + It + + + university + is + + + university + a + + + university + great + + + university + founded + + + founded + 1891 + + + + + ROOT + university + + + university + It + + + university + is + + + university + a + + + university + great + + + university + founded + + + founded + 1891 + + + + + + + + 1 + 1 + 3 + 2 + + + 2 + 1 + 2 + 1 + + + 2 + 3 + 10 + 5 + + + 2 + 3 + 6 + 5 + + + + + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/joda-time-2.1-sources.jar b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/joda-time-2.1-sources.jar new file mode 100644 index 0000000..44e4ed8 Binary files /dev/null and b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/joda-time-2.1-sources.jar differ diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/joda-time.jar b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/joda-time.jar new file mode 100644 index 0000000..b2aca95 Binary files /dev/null and b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/joda-time.jar differ diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/jollyday-0.4.7-sources.jar b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/jollyday-0.4.7-sources.jar new file mode 100644 index 0000000..55e0204 Binary files /dev/null and b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/jollyday-0.4.7-sources.jar differ diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/jollyday.jar b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/jollyday.jar new file mode 100644 index 0000000..a6bf8b3 Binary files /dev/null and b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/jollyday.jar differ diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/new_sample.txt.xml b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/new_sample.txt.xml new file mode 100644 index 
0000000..2e11218 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/new_sample.txt.xml @@ -0,0 +1,114285 @@ + + + + + + + + + Mr. + Mr. + 0 + 3 + NNP + O + + + MITCHELL + MITCHELL + 4 + 12 + NNP + PERSON + + + . + . + 12 + 13 + . + O + + + (ROOT (NP (NNP Mr.) (NNP MITCHELL) (. .))) + + + ROOT + MITCHELL + + + MITCHELL + Mr. + + + + + ROOT + MITCHELL + + + MITCHELL + Mr. + + + + + ROOT + MITCHELL + + + MITCHELL + Mr. + + + + + + + Mr. + Mr. + 14 + 17 + NNP + O + + + President + President + 18 + 27 + NNP + O + + + , + , + 27 + 28 + , + O + + + fol + fol + 29 + 32 + NN + O + + + - + - + 32 + 33 + : + O + + + lowing + low + 35 + 41 + VBG + O + + + the + the + 42 + 45 + DT + O + + + conclusion + conclusion + 46 + 56 + NN + O + + + of + of + 57 + 59 + IN + O + + + morning + morning + 60 + 67 + NN + TIME + TMO + TMO + + + busi + busus + 68 + 72 + NNS + O + + + - + - + 72 + 73 + : + O + + + ness + ness + 75 + 79 + NN + O + + + at + at + 80 + 82 + IN + O + + + 3:30 + 3:30 + 83 + 87 + CD + TIME + T15:30 + T15:30 + + + p.m. + p.m. + 88 + 92 + RB + TIME + T15:30 + T15:30 + + + , + , + 92 + 93 + , + O + + + I + I + 94 + 95 + PRP + O + + + will + will + 96 + 100 + MD + O + + + seek + seek + 101 + 105 + VB + O + + + unani + unani + 106 + 111 + JJ + O + + + - + - + 111 + 112 + : + O + + + mous + mous + 114 + 118 + JJ + O + + + consent + consent + 119 + 126 + NN + O + + + to + to + 127 + 129 + TO + O + + + proceed + proceed + 130 + 137 + VB + O + + + to + to + 138 + 140 + TO + O + + + the + the + 141 + 144 + DT + O + + + consid + consid + 145 + 151 + NN + O + + + - + - + 151 + 152 + : + O + + + eration + eration + 154 + 161 + NN + O + + + of + of + 162 + 164 + IN + O + + + calendar + calendar + 165 + 173 + NN + O + + + item + item + 174 + 178 + NN + O + + + No. + no. + 179 + 182 + NN + O + + + 427 + 427 + 183 + 186 + CD + NUMBER + 427.0 + + + , + , + 186 + 187 + , + O + + + that + that + 188 + 192 + WDT + O + + + Is + be + 194 + 196 + VBZ + O + + + S. + S. 
+ 197 + 199 + NNP + O + + + 1630 + 1630 + 200 + 204 + CD + DATE + 1630 + 1630 + + + , + , + 204 + 205 + , + O + + + the + the + 206 + 209 + DT + O + + + clean + clean + 210 + 215 + JJ + O + + + air + air + 216 + 219 + NN + O + + + legislation + legislation + 220 + 231 + NN + O + + + . + . + 231 + 232 + . + O + + + (ROOT (NP (NP (NP (NNP Mr.) (NNP President)) (, ,) (NP (NN fol))) (: -) (S (S (VP (VBG lowing) (NP (NP (NP (DT the) (NN conclusion)) (PP (IN of) (NP (NN morning) (NNS busi)))) (: -) (NP (NP (NN ness)) (PP (IN at) (NP (CD 3:30) (RB p.m.))))))) (, ,) (NP (PRP I)) (VP (MD will) (VP (VB seek) (ADJP (ADJP (JJ unani)) (: -) (NP (NP (JJ mous) (NN consent)) (SBAR (S (VP (TO to) (VP (VB proceed) (PP (TO to) (NP (DT the) (NN consid) (: -) (NN eration))))))) (PP (IN of) (NP (NP (NN calendar) (NN item) (NN No.)) (NP-TMP (NP (CD 427)) (, ,) (SBAR (WHNP (WDT that)) (S (VP (VBZ Is) (NP (NP (NNP S.) (CD 1630)) (, ,) (NP (DT the) (JJ clean) (NN air) (NN legislation)))))))))))))) (. .))) + + + ROOT + President + + + President + Mr. + + + President + fol + + + seek + lowing + + + conclusion + the + + + lowing + conclusion + + + conclusion + of + + + busi + morning + + + of + busi + + + conclusion + ness + + + ness + at + + + at + 3:30 + + + 3:30 + p.m. + + + seek + I + + + seek + will + + + President + seek + + + seek + unani + + + consent + mous + + + unani + consent + + + proceed + to + + + consent + proceed + + + proceed + to + + + eration + the + + + eration + consid + + + to + eration + + + consent + of + + + No. + calendar + + + No. + item + + + of + No. + + + No. + 427 + + + S. + that + + + S. + Is + + + 427 + S. + + + S. + 1630 + + + legislation + the + + + legislation + clean + + + legislation + air + + + S. + legislation + + + + + ROOT + President + + + President + Mr. + + + President + fol + + + seek + lowing + + + conclusion + the + + + lowing + conclusion + + + busi + morning + + + conclusion + busi + + + conclusion + ness + + + ness + 3:30 + + + 3:30 + p.m. 
+ + + seek + I + + + seek + will + + + President + seek + + + seek + unani + + + consent + mous + + + unani + consent + + + proceed + to + + + consent + proceed + + + eration + the + + + eration + consid + + + proceed + eration + + + No. + calendar + + + No. + item + + + consent + No. + + + No. + 427 + + + S. + that + + + S. + Is + + + 427 + S. + + + S. + 1630 + + + legislation + the + + + legislation + clean + + + legislation + air + + + S. + legislation + + + + + ROOT + President + + + President + Mr. + + + President + fol + + + seek + lowing + + + conclusion + the + + + lowing + conclusion + + + busi + morning + + + conclusion + busi + + + conclusion + ness + + + ness + 3:30 + + + 3:30 + p.m. + + + seek + I + + + seek + will + + + President + seek + + + seek + unani + + + consent + mous + + + unani + consent + + + proceed + to + + + consent + proceed + + + eration + the + + + eration + consid + + + proceed + eration + + + No. + calendar + + + No. + item + + + consent + No. + + + No. + 427 + + + S. + that + + + S. + Is + + + 427 + S. + + + S. + 1630 + + + legislation + the + + + legislation + clean + + + legislation + air + + + S. + legislation + + + + + + + THE + the + 241 + 244 + DT + O + + + SENATE + SENATE + 245 + 251 + NNP + MISC + + + AGENDA + AGENDA + 252 + 258 + NNP + MISC + + + Mr. + Mr. + 262 + 265 + NNP + O + + + MITCHELL + MITCHELL + 266 + 274 + NNP + PERSON + + + . + . + 274 + 275 + . + O + + + (ROOT (NP (NP (DT THE) (NNP SENATE) (NNP AGENDA)) (NP (NNP Mr.) (NNP MITCHELL)) (. .))) + + + ROOT + AGENDA + + + AGENDA + THE + + + AGENDA + SENATE + + + MITCHELL + Mr. + + + AGENDA + MITCHELL + + + + + ROOT + AGENDA + + + AGENDA + THE + + + AGENDA + SENATE + + + MITCHELL + Mr. + + + AGENDA + MITCHELL + + + + + ROOT + AGENDA + + + AGENDA + THE + + + AGENDA + SENATE + + + MITCHELL + Mr. + + + AGENDA + MITCHELL + + + + + + + Mr. + Mr. 
+ 276 + 279 + NNP + O + + + President + President + 280 + 289 + NNP + O + + + , + , + 289 + 290 + , + O + + + we + we + 291 + 293 + PRP + O + + + begin + begin + 295 + 300 + VBP + O + + + this + this + 301 + 305 + DT + O + + + session + session + 306 + 313 + NN + O + + + with + with + 314 + 318 + IN + O + + + the + the + 319 + 322 + DT + O + + + Clean + Clean + 323 + 328 + NNP + MISC + + + Air + Air + 329 + 332 + NNP + MISC + + + Act + Act + 334 + 337 + NNP + MISC + + + . + . + 337 + 338 + . + O + + + (ROOT (S (NP (NNP Mr.) (NNP President)) (, ,) (NP (PRP we)) (VP (VBP begin) (NP (NP (DT this) (NN session)) (PP (IN with) (NP (DT the) (NNP Clean) (NNP Air) (NNP Act))))) (. .))) + + + ROOT + begin + + + President + Mr. + + + begin + President + + + begin + we + + + session + this + + + begin + session + + + session + with + + + Act + the + + + Act + Clean + + + Act + Air + + + with + Act + + + + + ROOT + begin + + + President + Mr. + + + begin + President + + + begin + we + + + session + this + + + begin + session + + + Act + the + + + Act + Clean + + + Act + Air + + + session + Act + + + + + ROOT + begin + + + President + Mr. + + + begin + President + + + begin + we + + + session + this + + + begin + session + + + Act + the + + + Act + Clean + + + Act + Air + + + session + Act + + + + + + + This + this + 339 + 343 + DT + O + + + is + be + 344 + 346 + VBZ + O + + + critical + critical + 347 + 355 + JJ + O + + + legislation + legislation + 356 + 367 + NN + O + + + . + . + 367 + 368 + . + O + + + (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (JJ critical) (NN legislation))) (. 
.))) + + + ROOT + legislation + + + legislation + This + + + legislation + is + + + legislation + critical + + + + + ROOT + legislation + + + legislation + This + + + legislation + is + + + legislation + critical + + + + + ROOT + legislation + + + legislation + This + + + legislation + is + + + legislation + critical + + + + + + + It + it + 369 + 371 + PRP + O + + + has + have + 372 + 375 + VBZ + O + + + been + be + 377 + 381 + VBN + O + + + over + over + 382 + 386 + IN + O + + + 12 + 12 + 387 + 389 + CD + DURATION + 12.0 + P12Y + + + years + year + 390 + 395 + NNS + NUMBER + 0.0 + P12Y + + + since + since + 396 + 401 + IN + O + + + the + the + 402 + 405 + DT + O + + + Clean + Clean + 406 + 411 + NNP + MISC + + + Air + Air + 412 + 415 + NNP + MISC + + + Act + Act + 417 + 420 + NNP + MISC + + + was + be + 421 + 424 + VBD + O + + + last + last + 425 + 429 + JJ + O + + + debated + debate + 430 + 437 + VBN + O + + + in + in + 438 + 440 + IN + O + + + the + the + 441 + 444 + DT + O + + + Senate + Senate + 445 + 451 + NNP + ORGANIZATION + + + . + . + 451 + 452 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ has) (VP (VBN been) (PP (IN over) (NP (CD 12) (NNS years))) (SBAR (IN since) (S (NP (DT the) (NNP Clean) (NNP Air) (NNP Act)) (VP (VBD was) (ADVP (JJ last)) (VP (VBN debated) (PP (IN in) (NP (DT the) (NNP Senate))))))))) (. 
.))) + + + ROOT + been + + + been + It + + + been + has + + + been + over + + + years + 12 + + + over + years + + + debated + since + + + Act + the + + + Act + Clean + + + Act + Air + + + debated + Act + + + debated + was + + + debated + last + + + been + debated + + + debated + in + + + Senate + the + + + in + Senate + + + + + ROOT + been + + + been + It + + + been + has + + + years + 12 + + + been + years + + + debated + since + + + Act + the + + + Act + Clean + + + Act + Air + + + debated + Act + + + debated + was + + + debated + last + + + been + debated + + + Senate + the + + + debated + Senate + + + + + ROOT + been + + + been + It + + + been + has + + + years + 12 + + + been + years + + + debated + since + + + Act + the + + + Act + Clean + + + Act + Air + + + debated + Act + + + debated + was + + + debated + last + + + been + debated + + + Senate + the + + + debated + Senate + + + + + + + Since + since + 454 + 459 + IN + O + + + then + then + 460 + 464 + RB + O + + + , + , + 464 + 465 + , + O + + + our + we + 466 + 469 + PRP$ + O + + + population + population + 470 + 480 + NN + O + + + has + have + 481 + 484 + VBZ + O + + + grown + grow + 485 + 490 + VBN + O + + + , + , + 490 + 491 + , + O + + + automobile + automobile + 493 + 503 + NN + O + + + use + use + 504 + 507 + NN + O + + + Increased + increase + 508 + 517 + VBN + O + + + , + , + 517 + 518 + , + O + + + and + and + 519 + 522 + CC + O + + + the + the + 523 + 526 + DT + O + + + economy + economy + 528 + 535 + NN + O + + + expanded + expand + 536 + 544 + VBN + O + + + , + , + 544 + 545 + , + O + + + with + with + 546 + 550 + IN + O + + + the + the + 551 + 554 + DT + O + + + accompa + accompa + 555 + 562 + NN + O + + + - + - + 562 + 563 + : + O + + + nying + nying + 565 + 570 + JJ + O + + + increases + increase + 571 + 580 + NNS + O + + + in + in + 581 + 583 + IN + O + + + production + production + 584 + 594 + NN + O + + + facili + facilus + 595 + 601 + NNS + O + + + - + - + 601 + 602 + : + O + + + ties + 
tie + 604 + 608 + NNS + O + + + , + , + 608 + 609 + , + O + + + energy + energy + 610 + 616 + NN + O + + + use + use + 617 + 620 + NN + O + + + , + , + 620 + 621 + , + O + + + congestion + congestion + 622 + 632 + NN + O + + + , + , + 632 + 633 + , + O + + + and + and + 634 + 637 + CC + O + + + inevi + inevus + 638 + 643 + NN + O + + + - + - + 643 + 644 + : + O + + + tably + tably + 646 + 651 + RB + O + + + , + , + 651 + 652 + , + O + + + pollution + pollution + 653 + 662 + NN + O + + + . + . + 662 + 663 + . + O + + + (ROOT (S (PP (IN Since) (NP (RB then))) (, ,) (NP (PRP$ our) (NN population)) (VP (VBZ has) (VP (VBN grown) (, ,) (NP (NP (NP (NN automobile) (NN use)) (VP (VBN Increased))) (, ,) (CC and) (NP (NP (DT the) (NN economy)) (VP (VBN expanded) (, ,) (PP (IN with) (NP (NP (DT the) (NN accompa)) (: -) (NP (NP (NP (JJ nying) (NNS increases)) (PP (IN in) (NP (NN production) (NNS facili)))) (: -) (NP (NP (NNS ties)) (, ,) (NP (NN energy) (NN use)) (, ,) (NP (NN congestion)) (, ,) (CC and) (NP (NN inevi)))) (: -)) (ADVP (RB tably))) (, ,) (S (NP (NN pollution)))))))) (. 
.))) + + + ROOT + grown + + + grown + Since + + + Since + then + + + population + our + + + grown + population + + + grown + has + + + use + automobile + + + grown + use + + + use + Increased + + + use + and + + + economy + the + + + use + economy + + + economy + expanded + + + expanded + with + + + accompa + the + + + with + accompa + + + increases + nying + + + accompa + increases + + + increases + in + + + facili + production + + + in + facili + + + increases + ties + + + use + energy + + + ties + use + + + ties + congestion + + + ties + and + + + ties + inevi + + + with + tably + + + expanded + pollution + + + + + ROOT + grown + + + grown + Since + + + Since + then + + + population + our + + + grown + population + + + grown + has + + + use + automobile + + + grown + use + + + use + Increased + + + economy + the + + + use + economy + + + economy + expanded + + + accompa + the + + + expanded + accompa + + + increases + nying + + + accompa + increases + + + facili + production + + + increases + facili + + + increases + ties + + + use + energy + + + ties + use + + + ties + congestion + + + ties + inevi + + + expanded + tably + + + expanded + pollution + + + + + ROOT + grown + + + grown + Since + + + Since + then + + + population + our + + + grown + population + + + grown + has + + + use + automobile + + + grown + use + + + use + Increased + + + economy + the + + + grown + economy + + + use + economy + + + economy + expanded + + + accompa + the + + + expanded + accompa + + + increases + nying + + + accompa + increases + + + facili + production + + + increases + facili + + + increases + ties + + + use + energy + + + increases + use + + + ties + use + + + increases + congestion + + + ties + congestion + + + increases + inevi + + + ties + inevi + + + expanded + tably + + + expanded + pollution + + + + + + + These + these + 667 + 672 + DT + O + + + factors + factor + 673 + 680 + NNS + O + + + have + have + 681 + 685 + VBP + O + + + overwhelmed + overwhelm + 686 + 697 + 
VBN + O + + + our + we + 698 + 701 + PRP$ + O + + + efforts + effort + 703 + 710 + NNS + O + + + to + to + 711 + 713 + TO + O + + + improve + improve + 714 + 721 + VB + O + + + air + air + 722 + 725 + NN + O + + + quality + quality + 726 + 733 + NN + O + + + In + in + 734 + 736 + IN + O + + + the + the + 737 + 740 + DT + O + + + places + place + 742 + 748 + NNS + O + + + where + where + 749 + 754 + WRB + O + + + the + the + 755 + 758 + DT + O + + + majority + majority + 759 + 767 + NN + O + + + of + of + 768 + 770 + IN + O + + + Ameri + Ameri + 771 + 776 + NNP + LOCATION + + + - + - + 776 + 777 + : + O + + + cans + can + 779 + 783 + NNS + O + + + live + live + 784 + 788 + VBP + O + + + and + and + 789 + 792 + CC + O + + + work + work + 793 + 797 + VBP + O + + + . + . + 797 + 798 + . + O + + + (ROOT (S (NP (DT These) (NNS factors)) (VP (VBP have) (VP (VBN overwhelmed) (NP (PRP$ our) (NNS efforts)) (S (VP (TO to) (VP (VB improve) (NP (NN air) (NN quality)))) (PP (IN In) (NP (NP (DT the) (NNS places)) (SBAR (WHADVP (WRB where)) (S (NP (NP (DT the) (NN majority)) (PP (IN of) (NP (NP (NNP Ameri)) (: -) (NP (NNS cans))))) (VP (VBP live) (CC and) (VBP work))))))))) (. 
.))) + + + ROOT + overwhelmed + + + factors + These + + + overwhelmed + factors + + + overwhelmed + have + + + efforts + our + + + overwhelmed + efforts + + + improve + to + + + overwhelmed + improve + + + quality + air + + + improve + quality + + + improve + In + + + places + the + + + In + places + + + live + where + + + majority + the + + + live + majority + + + majority + of + + + of + Ameri + + + Ameri + cans + + + places + live + + + live + and + + + live + work + + + + + ROOT + overwhelmed + + + factors + These + + + overwhelmed + factors + + + overwhelmed + have + + + efforts + our + + + overwhelmed + efforts + + + improve + to + + + overwhelmed + improve + + + quality + air + + + improve + quality + + + places + the + + + improve + places + + + live + where + + + majority + the + + + live + majority + + + majority + Ameri + + + Ameri + cans + + + places + live + + + live + work + + + + + ROOT + overwhelmed + + + factors + These + + + overwhelmed + factors + + + overwhelmed + have + + + efforts + our + + + overwhelmed + efforts + + + improve + to + + + overwhelmed + improve + + + quality + air + + + improve + quality + + + places + the + + + improve + places + + + live + where + + + majority + the + + + live + majority + + + work + majority + + + majority + Ameri + + + Ameri + cans + + + places + live + + + places + work + + + live + work + + + + + + + Today + today + 802 + 807 + NN + DATE + THIS P1D + + + + , + , + 807 + 808 + , + O + + + more + more + 809 + 813 + JJR + O + + + than + than + 814 + 818 + IN + O + + + half + half + 819 + 823 + PDT + O + + + the + the + 824 + 827 + DT + O + + + American + american + 828 + 836 + JJ + LOCATION + + + people + people + 838 + 844 + NNS + O + + + are + be + 845 + 848 + VBP + O + + + forced + force + 849 + 855 + VBN + O + + + to + to + 856 + 858 + TO + O + + + breathe + breathe + 859 + 866 + VB + O + + + air + air + 867 + 870 + NN + O + + + that + that + 871 + 875 + WDT + O + + + does + do + 877 + 881 + VBZ + O + + 
+ not + not + 882 + 885 + RB + O + + + meet + meet + 886 + 890 + VB + O + + + national + national + 891 + 899 + JJ + O + + + health + health + 900 + 906 + NN + O + + + stand + stand + 907 + 912 + NN + O + + + - + - + 912 + 913 + : + O + + + ards + ard + 915 + 919 + NNS + O + + + . + . + 919 + 920 + . + O + + + (ROOT (S (NP-TMP (NN Today)) (, ,) (NP (QP (JJR more) (IN than) (PDT half)) (NAC (DT the) (JJ American)) (NNS people)) (VP (VBP are) (VP (VBN forced) (S (VP (TO to) (VP (VB breathe) (NP (NP (NN air)) (SBAR (WHNP (WDT that)) (S (VP (VBZ does) (RB not) (VP (VB meet) (NP (NP (JJ national) (NN health) (NN stand)) (: -) (NP (NNS ards))))))))))))) (. .))) + + + ROOT + forced + + + forced + Today + + + half + more + + + half + than + + + people + half + + + American + the + + + people + American + + + forced + people + + + forced + are + + + breathe + to + + + forced + breathe + + + breathe + air + + + meet + that + + + meet + does + + + meet + not + + + air + meet + + + stand + national + + + stand + health + + + meet + stand + + + stand + ards + + + + + ROOT + forced + + + forced + Today + + + half + more + + + half + than + + + people + half + + + American + the + + + people + American + + + forced + people + + + forced + are + + + breathe + to + + + forced + breathe + + + breathe + air + + + meet + that + + + meet + does + + + meet + not + + + air + meet + + + stand + national + + + stand + health + + + meet + stand + + + stand + ards + + + + + ROOT + forced + + + forced + Today + + + half + more + + + half + than + + + people + half + + + American + the + + + people + American + + + forced + people + + + forced + are + + + breathe + to + + + forced + breathe + + + breathe + air + + + meet + that + + + meet + does + + + meet + not + + + air + meet + + + stand + national + + + stand + health + + + meet + stand + + + stand + ards + + + + + + + This + this + 924 + 928 + DT + O + + + will + will + 929 + 933 + MD + O + + + be + be + 934 + 936 + VB + O + + + a + a + 
937 + 938 + DT + O + + + substantive + substantive + 939 + 950 + JJ + O + + + debate + debate + 951 + 957 + NN + O + + + and + and + 959 + 962 + CC + O + + + , + , + 962 + 963 + , + O + + + on + on + 964 + 966 + IN + O + + + some + some + 967 + 971 + DT + O + + + issues + issue + 972 + 978 + NNS + O + + + , + , + 978 + 979 + , + O + + + a + a + 980 + 981 + DT + O + + + controversial + controversial + 982 + 995 + JJ + O + + + one + one + 997 + 1000 + CD + NUMBER + 1.0 + + + . + . + 1000 + 1001 + . + O + + + (ROOT (S (NP (DT This)) (VP (MD will) (VP (VB be) (NP (NP (DT a) (JJ substantive) (NN debate)) (CC and) (PRN (, ,) (PP (IN on) (NP (DT some) (NNS issues))) (, ,)) (NP (DT a) (JJ controversial) (CD one))))) (. .))) + + + ROOT + debate + + + debate + This + + + debate + will + + + debate + be + + + debate + a + + + debate + substantive + + + debate + and + + + debate + on + + + issues + some + + + on + issues + + + one + a + + + one + controversial + + + debate + one + + + + + ROOT + debate + + + debate + This + + + debate + will + + + debate + be + + + debate + a + + + debate + substantive + + + debate + on + + + issues + some + + + on + issues + + + one + a + + + one + controversial + + + debate + one + + + + + ROOT + debate + + + debate + This + + + debate + will + + + debate + be + + + debate + a + + + debate + substantive + + + debate + on + + + issues + some + + + on + issues + + + one + a + + + one + controversial + + + debate + one + + + + + + + Air + Air + 1002 + 1005 + NNP + O + + + quality + quality + 1006 + 1013 + NN + O + + + issues + issue + 1014 + 1020 + NNS + O + + + vary + vary + 1021 + 1025 + VBP + O + + + by + by + 1026 + 1028 + IN + O + + + region + region + 1029 + 1035 + NN + O + + + . + . + 1035 + 1036 + . + O + + + (ROOT (S (NP (NNP Air) (NN quality) (NNS issues)) (VP (VBP vary) (PP (IN by) (NP (NN region)))) (. 
.))) + + + ROOT + vary + + + issues + Air + + + issues + quality + + + vary + issues + + + vary + by + + + by + region + + + + + ROOT + vary + + + issues + Air + + + issues + quality + + + vary + issues + + + vary + region + + + + + ROOT + vary + + + issues + Air + + + issues + quality + + + vary + issues + + + vary + region + + + + + + + Some + some + 1038 + 1042 + DT + O + + + regions + region + 1045 + 1052 + NNS + O + + + are + be + 1053 + 1056 + VBP + O + + + at + at + 1057 + 1059 + IN + O + + + significantly + significantly + 1060 + 1073 + RB + O + + + greater + greater + 1075 + 1082 + JJR + O + + + risk + risk + 1083 + 1087 + NN + O + + + from + from + 1088 + 1092 + IN + O + + + the + the + 1093 + 1096 + DT + O + + + effects + effect + 1097 + 1104 + NNS + O + + + of + of + 1105 + 1107 + IN + O + + + acid + acid + 1108 + 1112 + JJ + O + + + rain + rain + 1114 + 1118 + NN + O + + + ; + ; + 1118 + 1119 + : + O + + + some + some + 1120 + 1124 + DT + O + + + rural + rural + 1125 + 1130 + JJ + O + + + areas + area + 1131 + 1136 + NNS + O + + + do + do + 1137 + 1139 + VBP + O + + + not + not + 1140 + 1143 + RB + O + + + suffer + suffer + 1144 + 1150 + VB + O + + + as + as + 1151 + 1153 + RB + O + + + much + much + 1155 + 1159 + RB + O + + + from + from + 1160 + 1164 + IN + O + + + ozone + ozone + 1165 + 1170 + NN + O + + + as + as + 1171 + 1173 + IN + O + + + cities + city + 1174 + 1180 + NNS + O + + + ; + ; + 1180 + 1181 + : + O + + + congested + congest + 1182 + 1191 + VBN + O + + + urban + urban + 1193 + 1198 + JJ + O + + + areas + area + 1199 + 1204 + NNS + O + + + are + be + 1205 + 1208 + VBP + O + + + seeing + see + 1209 + 1215 + VBG + O + + + a + a + 1216 + 1217 + DT + O + + + further + further + 1218 + 1225 + JJ + O + + + degra + degra + 1226 + 1231 + NN + O + + + - + - + 1231 + 1232 + : + O + + + dation + dation + 1234 + 1240 + NN + O + + + in + in + 1241 + 1243 + IN + O + + + air + air + 1244 + 1247 + NN + O + + + quality + quality + 1248 + 1255 + NN + O + 
+ + . + . + 1255 + 1256 + . + O + + + (ROOT (S (S (NP (DT Some) (NNS regions)) (VP (VBP are) (PP (IN at) (NP (NP (ADJP (RB significantly) (JJR greater)) (NN risk)) (PP (IN from) (NP (NP (DT the) (NNS effects)) (PP (IN of) (NP (JJ acid) (NN rain))))))))) (: ;) (S (NP (DT some) (JJ rural) (NNS areas)) (VP (VBP do) (RB not) (VP (VB suffer) (ADVP (RB as) (RB much)) (PP (IN from) (NP (NP (NN ozone)) (PP (IN as) (NP (NNS cities)))))))) (: ;) (S (S (VP (VBN congested) (NP (JJ urban) (NNS areas)))) (VP (VBP are) (VP (VBG seeing) (NP (NP (DT a) (JJ further) (NN degra)) (: -) (NP (NP (NN dation)) (PP (IN in) (NP (NN air) (NN quality)))))))) (. .))) + + + ROOT + are + + + regions + Some + + + are + regions + + + are + at + + + greater + significantly + + + risk + greater + + + at + risk + + + risk + from + + + effects + the + + + from + effects + + + effects + of + + + rain + acid + + + of + rain + + + areas + some + + + areas + rural + + + suffer + areas + + + suffer + do + + + suffer + not + + + are + suffer + + + much + as + + + suffer + much + + + suffer + from + + + from + ozone + + + ozone + as + + + as + cities + + + seeing + congested + + + areas + urban + + + congested + areas + + + seeing + are + + + are + seeing + + + degra + a + + + degra + further + + + seeing + degra + + + degra + dation + + + dation + in + + + quality + air + + + in + quality + + + + + ROOT + are + + + regions + Some + + + are + regions + + + greater + significantly + + + risk + greater + + + are + risk + + + effects + the + + + risk + effects + + + rain + acid + + + effects + rain + + + areas + some + + + areas + rural + + + suffer + areas + + + suffer + do + + + suffer + not + + + are + suffer + + + much + as + + + suffer + much + + + suffer + ozone + + + ozone + cities + + + seeing + congested + + + areas + urban + + + congested + areas + + + seeing + are + + + are + seeing + + + degra + a + + + degra + further + + + seeing + degra + + + degra + dation + + + quality + air + + + dation + 
quality + + + + + ROOT + are + + + regions + Some + + + are + regions + + + greater + significantly + + + risk + greater + + + are + risk + + + effects + the + + + risk + effects + + + rain + acid + + + effects + rain + + + areas + some + + + areas + rural + + + suffer + areas + + + suffer + do + + + suffer + not + + + are + suffer + + + much + as + + + suffer + much + + + suffer + ozone + + + ozone + cities + + + seeing + congested + + + areas + urban + + + congested + areas + + + seeing + are + + + are + seeing + + + degra + a + + + degra + further + + + seeing + degra + + + degra + dation + + + quality + air + + + dation + quality + + + + + + + I + I + 1260 + 1261 + PRP + O + + + welcome + welcome + 1262 + 1269 + VBP + O + + + the + the + 1270 + 1273 + DT + O + + + President + President + 1274 + 1283 + NNP + O + + + 's + 's + 1283 + 1285 + POS + O + + + strong + strong + 1286 + 1292 + JJ + O + + + call + call + 1293 + 1297 + NN + O + + + for + for + 1299 + 1302 + IN + O + + + action + action + 1303 + 1309 + NN + O + + + on + on + 1310 + 1312 + IN + O + + + a + a + 1313 + 1314 + DT + O + + + Clean + Clean + 1315 + 1320 + NNP + MISC + + + Air + Air + 1321 + 1324 + NNP + MISC + + + Act + Act + 1325 + 1328 + NNP + MISC + + + . + . + 1328 + 1329 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP welcome) (NP (NP (NP (DT the) (NNP President) (POS 's)) (JJ strong) (NN call)) (PP (IN for) (NP (NP (NN action)) (PP (IN on) (NP (DT a) (NNP Clean) (NNP Air) (NNP Act))))))) (. 
.))) + + + ROOT + welcome + + + welcome + I + + + President + the + + + call + President + + + President + 's + + + call + strong + + + welcome + call + + + call + for + + + for + action + + + action + on + + + Act + a + + + Act + Clean + + + Act + Air + + + on + Act + + + + + ROOT + welcome + + + welcome + I + + + President + the + + + call + President + + + call + strong + + + welcome + call + + + call + action + + + Act + a + + + Act + Clean + + + Act + Air + + + action + Act + + + + + ROOT + welcome + + + welcome + I + + + President + the + + + call + President + + + call + strong + + + welcome + call + + + call + action + + + Act + a + + + Act + Clean + + + Act + Air + + + action + Act + + + + + + + I + I + 1330 + 1331 + PRP + O + + + com + com + 1332 + 1335 + NN + O + + + - + - + 1335 + 1336 + : + O + + + mend + mend + 1338 + 1342 + VB + O + + + him + he + 1343 + 1346 + PRP + O + + + for + for + 1347 + 1350 + IN + O + + + it + it + 1351 + 1353 + PRP + O + + + . + . + 1353 + 1354 + . + O + + + (ROOT (NP (NP (PRP I) (NN com)) (: -) (S (VP (VB mend) (NP (PRP him)) (PP (IN for) (NP (PRP it))))) (. 
.))) + + + ROOT + com + + + com + I + + + com + mend + + + mend + him + + + mend + for + + + for + it + + + + + ROOT + com + + + com + I + + + com + mend + + + mend + him + + + mend + it + + + + + ROOT + com + + + com + I + + + com + mend + + + mend + him + + + mend + it + + + + + + + It + it + 1355 + 1357 + PRP + O + + + is + be + 1358 + 1360 + VBZ + O + + + my + my + 1361 + 1363 + PRP$ + O + + + intention + intention + 1364 + 1373 + NN + O + + + that + that + 1375 + 1379 + IN + O + + + the + the + 1380 + 1383 + DT + O + + + Senate + Senate + 1384 + 1390 + NNP + ORGANIZATION + + + give + give + 1391 + 1395 + VB + O + + + him + he + 1396 + 1399 + PRP + O + + + a + a + 1400 + 1401 + DT + O + + + strong + strong + 1402 + 1408 + JJ + O + + + Clean + Clean + 1410 + 1415 + NNP + MISC + + + Air + Air + 1416 + 1419 + NNP + MISC + + + Act + Act + 1420 + 1423 + NNP + MISC + + + . + . + 1423 + 1424 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (PRP$ my) (NN intention)) (SBAR (IN that) (S (NP (DT the) (NNP Senate)) (VP (VB give) (NP (PRP him)) (NP (DT a) (JJ strong) (NNP Clean) (NNP Air) (NNP Act)))))) (. 
.))) + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + give + that + + + Senate + the + + + give + Senate + + + intention + give + + + give + him + + + Act + a + + + Act + strong + + + Act + Clean + + + Act + Air + + + give + Act + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + give + that + + + Senate + the + + + give + Senate + + + intention + give + + + give + him + + + Act + a + + + Act + strong + + + Act + Clean + + + Act + Air + + + give + Act + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + give + that + + + Senate + the + + + give + Senate + + + intention + give + + + give + him + + + Act + a + + + Act + strong + + + Act + Clean + + + Act + Air + + + give + Act + + + + + + + There + there + 1428 + 1433 + EX + O + + + are + be + 1434 + 1437 + VBP + O + + + many + many + 1438 + 1442 + JJ + O + + + aspects + aspect + 1443 + 1450 + NNS + O + + + to + to + 1451 + 1453 + TO + O + + + this + this + 1454 + 1458 + DT + O + + + issue + issue + 1459 + 1464 + NN + O + + + . + . + 1464 + 1465 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NP (JJ many) (NNS aspects)) (PP (TO to) (NP (DT this) (NN issue))))) (. .))) + + + ROOT + are + + + are + There + + + aspects + many + + + are + aspects + + + aspects + to + + + issue + this + + + to + issue + + + + + ROOT + are + + + are + There + + + aspects + many + + + are + aspects + + + issue + this + + + aspects + issue + + + + + ROOT + are + + + are + There + + + aspects + many + + + are + aspects + + + issue + this + + + aspects + issue + + + + + + + One + one + 1467 + 1470 + CD + NUMBER + 1.0 + + + overrides + override + 1471 + 1480 + VBZ + O + + + all + all + 1481 + 1484 + DT + O + + + others + other + 1485 + 1491 + NNS + O + + + . + . + 1491 + 1492 + . + O + + + (ROOT (S (NP (CD One)) (VP (VBZ overrides) (NP (DT all) (NNS others))) (. 
.))) + + + ROOT + overrides + + + overrides + One + + + others + all + + + overrides + others + + + + + ROOT + overrides + + + overrides + One + + + others + all + + + overrides + others + + + + + ROOT + overrides + + + overrides + One + + + others + all + + + overrides + others + + + + + + + We + we + 1493 + 1495 + PRP + O + + + must + must + 1496 + 1500 + MD + O + + + pro- + pro- + 1501 + 1505 + JJ + O + + + tect + tect + 1507 + 1511 + NN + O + + + the + the + 1512 + 1515 + DT + O + + + health + health + 1516 + 1522 + NN + O + + + of + of + 1523 + 1525 + IN + O + + + Americans + Americans + 1526 + 1535 + NNPS + MISC + + + . + . + 1535 + 1536 + . + O + + + (ROOT (S (NP (PRP We)) (VP (MD must) (VP (NP (NP (JJ pro-) (NN tect)) (NP (NP (DT the) (NN health)) (PP (IN of) (NP (NNPS Americans))))))) (. .))) + + + ROOT + tect + + + tect + We + + + tect + must + + + tect + pro- + + + health + the + + + tect + health + + + health + of + + + of + Americans + + + + + ROOT + tect + + + tect + We + + + tect + must + + + tect + pro- + + + health + the + + + tect + health + + + health + Americans + + + + + ROOT + tect + + + tect + We + + + tect + must + + + tect + pro- + + + health + the + + + tect + health + + + health + Americans + + + + + + + We + we + 1540 + 1542 + PRP + O + + + will + will + 1543 + 1547 + MD + O + + + , + , + 1547 + 1548 + , + O + + + as + as + 1549 + 1551 + IN + O + + + we + we + 1552 + 1554 + PRP + O + + + should + should + 1555 + 1561 + MD + O + + + , + , + 1561 + 1562 + , + O + + + debate + debate + 1563 + 1569 + VB + O + + + the + the + 1570 + 1573 + DT + O + + + costs + cost + 1575 + 1580 + NNS + O + + + of + of + 1581 + 1583 + IN + O + + + this + this + 1584 + 1588 + DT + O + + + bill + bill + 1589 + 1593 + NN + O + + + . + . + 1593 + 1594 + . + O + + + (ROOT (S (NP (PRP We)) (VP (MD will) (, ,) (SBAR (IN as) (S (NP (PRP we)) (VP (MD should)))) (, ,) (VP (VB debate) (NP (NP (DT the) (NNS costs)) (PP (IN of) (NP (DT this) (NN bill)))))) (. 
.))) + + + ROOT + debate + + + debate + We + + + debate + will + + + should + as + + + should + we + + + debate + should + + + costs + the + + + debate + costs + + + costs + of + + + bill + this + + + of + bill + + + + + ROOT + debate + + + debate + We + + + debate + will + + + should + as + + + should + we + + + debate + should + + + costs + the + + + debate + costs + + + bill + this + + + costs + bill + + + + + ROOT + debate + + + debate + We + + + debate + will + + + should + as + + + should + we + + + debate + should + + + costs + the + + + debate + costs + + + bill + this + + + costs + bill + + + + + + + In + in + 1595 + 1597 + IN + O + + + that + that + 1598 + 1602 + DT + O + + + regard + regard + 1603 + 1609 + NN + O + + + , + , + 1609 + 1610 + , + O + + + I + I + 1611 + 1612 + PRP + O + + + em + em + 1613 + 1615 + SYM + O + + + - + - + 1615 + 1616 + : + O + + + phasize + phasize + 1618 + 1625 + VB + O + + + two + two + 1626 + 1629 + CD + NUMBER + 2.0 + + + points + point + 1630 + 1636 + NNS + O + + + . + . + 1636 + 1637 + . + O + + + (ROOT (S (PP (IN In) (NP (NP (DT that)) (SBAR (S (VP (NP (NP (NP (NN regard)) (, ,) (NP (PRP I))) (X (SYM em)))))))) (: -) (VP (VB phasize) (NP (CD two) (NNS points))) (. 
.))) + + + ROOT + phasize + + + phasize + In + + + In + that + + + that + regard + + + regard + I + + + regard + em + + + points + two + + + phasize + points + + + + + ROOT + phasize + + + phasize + that + + + that + regard + + + regard + I + + + regard + em + + + points + two + + + phasize + points + + + + + ROOT + phasize + + + phasize + that + + + that + regard + + + regard + I + + + regard + em + + + points + two + + + phasize + points + + + + + + + First + first + 1641 + 1646 + RB + ORDINAL + 1.0 + + + , + , + 1646 + 1647 + , + O + + + if + if + 1648 + 1650 + IN + O + + + measured + measure + 1651 + 1659 + VBN + O + + + solely + solely + 1660 + 1666 + RB + O + + + in + in + 1667 + 1669 + IN + O + + + dollars + dollar + 1670 + 1677 + NNS + O + + + and + and + 1679 + 1682 + CC + O + + + cents + cent + 1683 + 1688 + NNS + O + + + , + , + 1688 + 1689 + , + O + + + this + this + 1690 + 1694 + DT + O + + + bill + bill + 1695 + 1699 + NN + O + + + should + should + 1700 + 1706 + MD + O + + + pass + pass + 1707 + 1711 + VB + O + + + because + because + 1712 + 1719 + IN + O + + + the + the + 1721 + 1724 + DT + O + + + cost + cost + 1725 + 1729 + NN + O + + + of + of + 1730 + 1732 + IN + O + + + inaction + inaction + 1733 + 1741 + NN + O + + + is + be + 1742 + 1744 + VBZ + O + + + higher + higher + 1745 + 1751 + JJR + O + + + than + than + 1752 + 1756 + IN + O + + + the + the + 1757 + 1760 + DT + O + + + cost + cost + 1762 + 1766 + NN + O + + + of + of + 1767 + 1769 + IN + O + + + action + action + 1770 + 1776 + NN + O + + + . + . + 1776 + 1777 + . + O + + + (ROOT (S (ADVP (RB First)) (, ,) (SBAR (IN if) (S (VP (VBN measured) (ADVP (RB solely)) (PP (IN in) (NP (NNS dollars) (CC and) (NNS cents)))))) (, ,) (NP (DT this) (NN bill)) (VP (MD should) (VP (VB pass) (SBAR (IN because) (S (NP (NP (DT the) (NN cost)) (PP (IN of) (NP (NN inaction)))) (VP (VBZ is) (ADJP (ADJP (JJR higher)) (PP (IN than) (NP (NP (DT the) (NN cost)) (PP (IN of) (NP (NN action))))))))))) (. 
.))) + + + ROOT + pass + + + pass + First + + + measured + if + + + pass + measured + + + measured + solely + + + measured + in + + + in + dollars + + + dollars + and + + + dollars + cents + + + bill + this + + + pass + bill + + + pass + should + + + higher + because + + + cost + the + + + higher + cost + + + cost + of + + + of + inaction + + + higher + is + + + pass + higher + + + higher + than + + + cost + the + + + than + cost + + + cost + of + + + of + action + + + + + ROOT + pass + + + pass + First + + + measured + if + + + pass + measured + + + measured + solely + + + measured + dollars + + + dollars + cents + + + bill + this + + + pass + bill + + + pass + should + + + higher + because + + + cost + the + + + higher + cost + + + cost + inaction + + + higher + is + + + pass + higher + + + cost + the + + + higher + cost + + + cost + action + + + + + ROOT + pass + + + pass + First + + + measured + if + + + pass + measured + + + measured + solely + + + measured + dollars + + + measured + cents + + + dollars + cents + + + bill + this + + + pass + bill + + + pass + should + + + higher + because + + + cost + the + + + higher + cost + + + cost + inaction + + + higher + is + + + pass + higher + + + cost + the + + + higher + cost + + + cost + action + + + + + + + It + it + 1781 + 1783 + PRP + O + + + costs + cost + 1784 + 1789 + VBZ + O + + + the + the + 1790 + 1793 + DT + O + + + United + United + 1794 + 1800 + NNP + LOCATION + + + States + States + 1801 + 1807 + NNPS + LOCATION + + + more + more + 1808 + 1812 + RBR + O + + + in + in + 1813 + 1815 + IN + O + + + health + health + 1817 + 1823 + NN + O + + + care + care + 1824 + 1828 + NN + O + + + and + and + 1829 + 1832 + CC + O + + + lost + lose + 1833 + 1837 + VBD + O + + + productivity + productivity + 1838 + 1850 + NN + O + + + than + than + 1851 + 1855 + IN + O + + + it + it + 1857 + 1859 + PRP + O + + + would + would + 1860 + 1865 + MD + O + + + to + to + 1866 + 1868 + TO + O + + + clean + clean + 1869 + 1874 + 
VB + O + + + up + up + 1875 + 1877 + RP + O + + + air + air + 1878 + 1881 + NN + O + + + pollution + pollution + 1882 + 1891 + NN + O + + + . + . + 1891 + 1892 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VP (VBZ costs) (NP (DT the) (NNP United) (NNPS States)) (NP (NP (RBR more)) (PP (IN in) (NP (NN health) (NN care))))) (CC and) (VP (VBD lost) (NP (NN productivity)) (SBAR (IN than) (S (NP (PRP it)) (VP (MD would) (S (VP (TO to) (VP (VB clean) (PRT (RP up)) (NP (NN air) (NN pollution)))))))))) (. .))) + + + ROOT + costs + + + costs + It + + + States + the + + + States + United + + + costs + States + + + costs + more + + + more + in + + + care + health + + + in + care + + + costs + and + + + costs + lost + + + lost + productivity + + + would + than + + + would + it + + + lost + would + + + clean + to + + + would + clean + + + clean + up + + + pollution + air + + + clean + pollution + + + + + ROOT + costs + + + costs + It + + + States + the + + + States + United + + + costs + States + + + costs + more + + + care + health + + + more + care + + + costs + lost + + + lost + productivity + + + would + than + + + would + it + + + lost + would + + + clean + to + + + would + clean + + + clean + up + + + pollution + air + + + clean + pollution + + + + + ROOT + costs + + + costs + It + + + lost + It + + + States + the + + + States + United + + + costs + States + + + costs + more + + + care + health + + + more + care + + + costs + lost + + + lost + productivity + + + would + than + + + would + it + + + lost + would + + + clean + to + + + would + clean + + + clean + up + + + pollution + air + + + clean + pollution + + + + + + + This + this + 1893 + 1897 + DT + O + + + 0 + 0 + 1901 + 1902 + CD + NUMBER + 0.0 + + + This + this + 1903 + 1907 + DT + O + + + `` + `` + 1908 + 1909 + `` + O + + + bullet + bullet + 1909 + 1915 + NN + O + + + '' + '' + 1915 + 1916 + '' + O + + + symbol + symbol + 1917 + 1923 + NN + O + + + identifies + identify + 1924 + 1934 + VBZ + O + + + statements + 
statement + 1935 + 1945 + NNS + O + + + or + or + 1946 + 1948 + CC + O + + + insertions + insertion + 1949 + 1959 + NNS + O + + + which + which + 1960 + 1965 + WDT + O + + + are + be + 1966 + 1969 + VBP + O + + + not + not + 1970 + 1973 + RB + O + + + spoken + speak + 1974 + 1980 + VBN + O + + + by + by + 1981 + 1983 + IN + O + + + a + a + 1984 + 1985 + DT + O + + + Member + Member + 1986 + 1992 + NNP + O + + + of + of + 1993 + 1995 + IN + O + + + the + the + 1996 + 1999 + DT + O + + + Senate + Senate + 2000 + 2006 + NNP + ORGANIZATION + + + on + on + 2007 + 2009 + IN + O + + + the + the + 2010 + 2013 + DT + O + + + floor + floor + 2014 + 2019 + NN + O + + + . + . + 2019 + 2020 + . + O + + + (ROOT (S (NP (NP (DT This) (CD 0)) (NP (DT This) (`` ``) (NN bullet) ('' '') (NN symbol))) (VP (VBZ identifies) (NP (NP (NNS statements) (CC or) (NNS insertions)) (SBAR (WHNP (WDT which)) (S (VP (VBP are) (RB not) (VP (VBN spoken) (PP (IN by) (NP (NP (DT a) (NNP Member)) (PP (IN of) (NP (DT the) (NNP Senate))))) (PP (IN on) (NP (DT the) (NN floor))))))))) (. 
.))) + + + ROOT + identifies + + + 0 + This + + + identifies + 0 + + + symbol + This + + + symbol + bullet + + + 0 + symbol + + + identifies + statements + + + statements + or + + + statements + insertions + + + spoken + which + + + spoken + are + + + spoken + not + + + statements + spoken + + + spoken + by + + + Member + a + + + by + Member + + + Member + of + + + Senate + the + + + of + Senate + + + spoken + on + + + floor + the + + + on + floor + + + + + ROOT + identifies + + + 0 + This + + + identifies + 0 + + + symbol + This + + + symbol + bullet + + + 0 + symbol + + + identifies + statements + + + statements + insertions + + + spoken + which + + + spoken + are + + + spoken + not + + + statements + spoken + + + Member + a + + + spoken + Member + + + Senate + the + + + Member + Senate + + + floor + the + + + spoken + floor + + + + + ROOT + identifies + + + 0 + This + + + identifies + 0 + + + symbol + This + + + symbol + bullet + + + 0 + symbol + + + identifies + statements + + + identifies + insertions + + + statements + insertions + + + spoken + which + + + spoken + are + + + spoken + not + + + statements + spoken + + + Member + a + + + spoken + Member + + + Senate + the + + + Member + Senate + + + floor + the + + + spoken + floor + + + + + + + CONGRESSIONAL + congressional + 2028 + 2041 + JJ + O + + + RECORD-SENATE + record-senate + 2042 + 2055 + NN + O + + + bill + bill + 2059 + 2063 + NN + O + + + will + will + 2064 + 2068 + MD + O + + + save + save + 2069 + 2073 + VB + O + + + the + the + 2074 + 2077 + DT + O + + + American + american + 2078 + 2086 + JJ + LOCATION + + + people + people + 2087 + 2093 + NNS + O + + + money + money + 2095 + 2100 + NN + O + + + . + . + 2100 + 2101 + . + O + + + (ROOT (S (NP (JJ CONGRESSIONAL) (NN RECORD-SENATE) (NN bill)) (VP (MD will) (VP (VB save) (NP (DT the) (JJ American) (NNS people) (NN money)))) (. 
.))) + + + ROOT + save + + + bill + CONGRESSIONAL + + + bill + RECORD-SENATE + + + save + bill + + + save + will + + + money + the + + + money + American + + + money + people + + + save + money + + + + + ROOT + save + + + bill + CONGRESSIONAL + + + bill + RECORD-SENATE + + + save + bill + + + save + will + + + money + the + + + money + American + + + money + people + + + save + money + + + + + ROOT + save + + + bill + CONGRESSIONAL + + + bill + RECORD-SENATE + + + save + bill + + + save + will + + + money + the + + + money + American + + + money + people + + + save + money + + + + + + + Second + second + 2105 + 2111 + RB + ORDINAL + 2.0 + + + , + , + 2111 + 2112 + , + O + + + the + the + 2113 + 2116 + DT + O + + + bill + bill + 2117 + 2121 + NN + O + + + ought + ought + 2122 + 2127 + MD + O + + + not + not + 2128 + 2131 + RB + O + + + be + be + 2132 + 2134 + VB + O + + + meas + meas + 2135 + 2139 + SYM + O + + + - + - + 2139 + 2140 + : + O + + + ured + ured + 2142 + 2146 + JJ + O + + + solely + solely + 2147 + 2153 + RB + O + + + in + in + 2154 + 2156 + IN + O + + + dollars + dollar + 2157 + 2164 + NNS + O + + + and + and + 2165 + 2168 + CC + O + + + cents + cent + 2169 + 2174 + NNS + O + + + . + . + 2174 + 2175 + . + O + + + (ROOT (S (ADVP (RB Second)) (, ,) (NP (DT the) (NN bill)) (VP (MD ought) (RB not) (VP (VB be) (X (X (SYM meas)) (: -) (ADJP (JJ ured) (PP (RB solely) (IN in) (NP (NNS dollars) (CC and) (NNS cents)))) (. 
.)))))) + + + ROOT + ought + + + ought + Second + + + bill + the + + + ought + bill + + + ought + not + + + ought + be + + + ured + meas + + + be + ured + + + in + solely + + + ured + in + + + in + dollars + + + dollars + and + + + dollars + cents + + + + + ROOT + ought + + + ought + Second + + + bill + the + + + ought + bill + + + ought + not + + + ought + be + + + ured + meas + + + be + ured + + + ured + solely + + + ured + dollars + + + dollars + cents + + + + + ROOT + ought + + + ought + Second + + + bill + the + + + ought + bill + + + ought + not + + + ought + be + + + ured + meas + + + be + ured + + + ured + solely + + + ured + dollars + + + ured + cents + + + dollars + cents + + + + + + + That + that + 2176 + 2180 + DT + O + + + would + would + 2182 + 2187 + MD + O + + + exclude + exclude + 2188 + 2195 + VB + O + + + consideration + consideration + 2196 + 2209 + NN + O + + + of + of + 2210 + 2212 + IN + O + + + the + the + 2213 + 2216 + DT + O + + + most + most + 2218 + 2222 + RBS + O + + + important + important + 2223 + 2232 + JJ + O + + + of + of + 2233 + 2235 + IN + O + + + our + we + 2236 + 2239 + PRP$ + O + + + values-human + values-human + 2240 + 2252 + NN + O + + + values + value + 2254 + 2260 + NNS + O + + + . + . + 2260 + 2261 + . + O + + + (ROOT (S (NP (DT That)) (VP (MD would) (VP (VB exclude) (NP (NP (NN consideration)) (PP (IN of) (NP (NP (DT the) (ADJP (RBS most) (JJ important))) (PP (IN of) (NP (PRP$ our) (NN values-human) (NNS values)))))))) (. 
.))) + + + ROOT + exclude + + + exclude + That + + + exclude + would + + + exclude + consideration + + + consideration + of + + + important + the + + + important + most + + + of + important + + + important + of + + + values + our + + + values + values-human + + + of + values + + + + + ROOT + exclude + + + exclude + That + + + exclude + would + + + exclude + consideration + + + important + the + + + important + most + + + consideration + important + + + values + our + + + values + values-human + + + important + values + + + + + ROOT + exclude + + + exclude + That + + + exclude + would + + + exclude + consideration + + + important + the + + + important + most + + + consideration + important + + + values + our + + + values + values-human + + + important + values + + + + + + + The + the + 2265 + 2268 + DT + O + + + evidence + evidence + 2269 + 2277 + NN + O + + + is + be + 2278 + 2280 + VBZ + O + + + clear + clear + 2281 + 2286 + JJ + O + + + , + , + 2286 + 2287 + , + O + + + compelling + compelling + 2288 + 2298 + JJ + O + + + and + and + 2300 + 2303 + CC + O + + + undisputed + undisputed + 2304 + 2314 + JJ + O + + + that + that + 2315 + 2319 + IN + O + + + air + air + 2320 + 2323 + NN + O + + + pollution + pollution + 2324 + 2333 + NN + O + + + causes + cause + 2335 + 2341 + VBZ + O + + + thousands + thousand + 2342 + 2351 + NNS + O + + + of + of + 2352 + 2354 + IN + O + + + premature + premature + 2355 + 2364 + JJ + O + + + deaths + death + 2365 + 2371 + NNS + O + + + and + and + 2373 + 2376 + CC + O + + + millions + million + 2377 + 2385 + NNS + O + + + of + of + 2386 + 2388 + IN + O + + + illnesses + illness + 2389 + 2398 + NNS + O + + + each + each + 2399 + 2403 + DT + SET + P1Y + + + year + year + 2404 + 2408 + NN + SET + P1Y + + + . + . + 2408 + 2409 + . 
+ O + + + (ROOT (S (NP (DT The) (NN evidence)) (VP (VBZ is) (ADJP (JJ clear) (, ,) (JJ compelling) (CC and) (JJ undisputed)) (SBAR (IN that) (S (NP (NN air) (NN pollution)) (VP (VBZ causes) (NP (NP (NNS thousands)) (PP (IN of) (NP (NP (JJ premature) (NNS deaths) (CC and) (NNS millions)) (PP (IN of) (NP (NP (NNS illnesses)) (NP (DT each) (NN year))))))))))) (. .))) + + + ROOT + clear + + + evidence + The + + + clear + evidence + + + clear + is + + + clear + compelling + + + clear + and + + + clear + undisputed + + + causes + that + + + pollution + air + + + causes + pollution + + + clear + causes + + + causes + thousands + + + thousands + of + + + deaths + premature + + + of + deaths + + + deaths + and + + + deaths + millions + + + deaths + of + + + of + illnesses + + + year + each + + + illnesses + year + + + + + ROOT + clear + + + evidence + The + + + clear + evidence + + + clear + is + + + clear + compelling + + + clear + undisputed + + + causes + that + + + pollution + air + + + causes + pollution + + + clear + causes + + + causes + thousands + + + deaths + premature + + + thousands + deaths + + + deaths + millions + + + deaths + illnesses + + + year + each + + + illnesses + year + + + + + ROOT + clear + + + evidence + The + + + clear + evidence + + + compelling + evidence + + + undisputed + evidence + + + clear + is + + + clear + compelling + + + clear + undisputed + + + causes + that + + + pollution + air + + + causes + pollution + + + clear + causes + + + causes + thousands + + + deaths + premature + + + thousands + deaths + + + thousands + millions + + + deaths + millions + + + deaths + illnesses + + + year + each + + + illnesses + year + + + + + + + Es + es + 2410 + 2412 + SYM + O + + + - + - + 2412 + 2413 + : + O + + + pecially + pecially + 2415 + 2423 + RB + O + + + vulnerable + vulnerable + 2424 + 2434 + JJ + O + + + are + be + 2435 + 2438 + VBP + O + + + children + child + 2439 + 2447 + NNS + O + + + . + . + 2447 + 2448 + . 
+ O + + + (ROOT (FRAG (X (SYM Es)) (: -) (VP (ADVP (ADVP (RB pecially)) (JJ vulnerable)) (VBP are) (NP (NNS children))) (. .))) + + + ROOT + children + + + children + Es + + + children + pecially + + + pecially + vulnerable + + + children + are + + + + + ROOT + children + + + children + Es + + + children + pecially + + + pecially + vulnerable + + + children + are + + + + + ROOT + children + + + children + Es + + + children + pecially + + + pecially + vulnerable + + + children + are + + + + + + + I + I + 2452 + 2453 + PRP + O + + + ask + ask + 2454 + 2457 + VBP + O + + + each + each + 2458 + 2462 + DT + O + + + Senator + Senator + 2463 + 2470 + NNP + O + + + , + , + 2470 + 2471 + , + O + + + what + what + 2472 + 2476 + WP + O + + + Is + be + 2477 + 2479 + VBZ + O + + + the + the + 2480 + 2483 + DT + O + + + dollar + dollar + 2485 + 2491 + NN + O + + + value + value + 2492 + 2497 + NN + O + + + of + of + 2498 + 2500 + IN + O + + + a + a + 2501 + 2502 + DT + O + + + human + human + 2503 + 2508 + JJ + O + + + life + life + 2509 + 2513 + NN + O + + + ? + ? + 2513 + 2514 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP ask) (NP (NP (DT each) (NNP Senator)) (, ,) (SBAR (WHNP (WP what)) (S (VP (VBZ Is) (NP (NP (DT the) (NN dollar) (NN value)) (PP (IN of) (NP (DT a) (JJ human) (NN life))))))))) (. 
?))) + + + ROOT + ask + + + ask + I + + + Senator + each + + + ask + Senator + + + value + what + + + value + Is + + + value + the + + + value + dollar + + + Senator + value + + + value + of + + + life + a + + + life + human + + + of + life + + + + + ROOT + ask + + + ask + I + + + Senator + each + + + ask + Senator + + + value + what + + + value + Is + + + value + the + + + value + dollar + + + Senator + value + + + life + a + + + life + human + + + value + life + + + + + ROOT + ask + + + ask + I + + + Senator + each + + + ask + Senator + + + value + what + + + value + Is + + + value + the + + + value + dollar + + + Senator + value + + + life + a + + + life + human + + + value + life + + + + + + + What + what + 2515 + 2519 + WP + O + + + is + be + 2520 + 2522 + VBZ + O + + + the + the + 2524 + 2527 + DT + O + + + dollar + dollar + 2528 + 2534 + NN + O + + + value + value + 2535 + 2540 + NN + O + + + of + of + 2541 + 2543 + IN + O + + + a + a + 2544 + 2545 + DT + O + + + child + child + 2546 + 2551 + NN + O + + + 's + 's + 2551 + 2553 + POS + O + + + health + health + 2554 + 2560 + NN + O + + + ? + ? + 2560 + 2561 + . + O + + + (ROOT (SBARQ (WHNP (WP What)) (SQ (VBZ is) (NP (NP (DT the) (NN dollar) (NN value)) (PP (IN of) (NP (NP (DT a) (NN child) (POS 's)) (NN health))))) (. ?))) + + + ROOT + is + + + is + What + + + value + the + + + value + dollar + + + is + value + + + value + of + + + child + a + + + health + child + + + child + 's + + + of + health + + + + + ROOT + is + + + is + What + + + value + the + + + value + dollar + + + is + value + + + child + a + + + health + child + + + value + health + + + + + ROOT + is + + + is + What + + + value + the + + + value + dollar + + + is + value + + + child + a + + + health + child + + + value + health + + + + + + + Your + you + 2563 + 2567 + PRP$ + O + + + child + child + 2568 + 2573 + NN + O + + + 's + 's + 2573 + 2575 + POS + O + + + health + health + 2576 + 2582 + NN + O + + + ? + ? + 2582 + 2583 + . 
+ O + + + (ROOT (FRAG (NP (NP (PRP$ Your) (NN child) (POS 's)) (NN health)) (. ?))) + + + ROOT + health + + + child + Your + + + health + child + + + child + 's + + + + + ROOT + health + + + child + Your + + + health + child + + + + + ROOT + health + + + child + Your + + + health + child + + + + + + + Obviously + obviously + 2584 + 2593 + RB + O + + + , + , + 2593 + 2594 + , + O + + + these + these + 2595 + 2600 + DT + O + + + are + be + 2602 + 2605 + VBP + O + + + unanswerable + unanswerable + 2606 + 2618 + JJ + O + + + questions + question + 2619 + 2628 + NNS + O + + + . + . + 2628 + 2629 + . + O + + + (ROOT (S (ADVP (RB Obviously)) (, ,) (NP (DT these)) (VP (VBP are) (NP (JJ unanswerable) (NNS questions))) (. .))) + + + ROOT + questions + + + questions + Obviously + + + questions + these + + + questions + are + + + questions + unanswerable + + + + + ROOT + questions + + + questions + Obviously + + + questions + these + + + questions + are + + + questions + unanswerable + + + + + ROOT + questions + + + questions + Obviously + + + questions + these + + + questions + are + + + questions + unanswerable + + + + + + + But + but + 2630 + 2633 + CC + O + + + just + just + 2634 + 2638 + RB + O + + + because + because + 2640 + 2647 + IN + O + + + we + we + 2648 + 2650 + PRP + O + + + ca + can + 2651 + 2653 + MD + O + + + n't + not + 2653 + 2656 + RB + O + + + put + put + 2657 + 2660 + VB + O + + + a + a + 2661 + 2662 + DT + O + + + dollar + dollar + 2663 + 2669 + NN + O + + + value + value + 2670 + 2675 + NN + O + + + on + on + 2676 + 2678 + IN + O + + + a + a + 2680 + 2681 + DT + O + + + child + child + 2682 + 2687 + NN + O + + + 's + 's + 2687 + 2689 + POS + O + + + health + health + 2690 + 2696 + NN + O + + + does + do + 2697 + 2701 + VBZ + O + + + n't + not + 2701 + 2704 + RB + O + + + mean + mean + 2705 + 2709 + VB + O + + + we + we + 2710 + 2712 + PRP + O + + + should + should + 2714 + 2720 + MD + O + + + exclude + exclude + 2721 + 2728 + VB + O + + + the + the + 
2729 + 2732 + DT + O + + + health + health + 2733 + 2739 + NN + O + + + of + of + 2740 + 2742 + IN + O + + + Ameri + Ameri + 2743 + 2748 + NNP + LOCATION + + + - + - + 2748 + 2749 + : + O + + + can + can + 2751 + 2754 + MD + O + + + children + child + 2755 + 2763 + NNS + O + + + from + from + 2764 + 2768 + IN + O + + + this + this + 2769 + 2773 + DT + O + + + debate + debate + 2774 + 2780 + NN + O + + + . + . + 2780 + 2781 + . + O + + + (ROOT (S (CC But) (SBAR (RB just) (IN because) (S (NP (PRP we)) (VP (MD ca) (RB n't) (VP (VB put) (NP (DT a) (NN dollar) (NN value)) (PP (IN on) (NP (NP (DT a) (NN child) (POS 's)) (NN health))))))) (VP (VBZ does) (RB n't) (VP (VB mean) (SBAR (S (NP (PRP we)) (VP (MD should) (VP (VB exclude) (S (NP (NP (DT the) (NN health)) (PP (IN of) (NP (NNP Ameri)))) (: -) (VP (MD can) (VP (NP (NNS children)) (PP (IN from) (NP (DT this) (NN debate)))))))))))) (. .))) + + + ROOT + mean + + + mean + But + + + put + just + + + put + because + + + put + we + + + put + ca + + + put + n't + + + mean + put + + + value + a + + + value + dollar + + + put + value + + + put + on + + + child + a + + + health + child + + + child + 's + + + on + health + + + mean + does + + + mean + n't + + + exclude + we + + + exclude + should + + + mean + exclude + + + health + the + + + children + health + + + health + of + + + of + Ameri + + + children + can + + + exclude + children + + + children + from + + + debate + this + + + from + debate + + + + + ROOT + mean + + + mean + But + + + put + just + + + put + because + + + put + we + + + put + ca + + + put + n't + + + mean + put + + + value + a + + + value + dollar + + + put + value + + + child + a + + + health + child + + + put + health + + + mean + does + + + mean + n't + + + exclude + we + + + exclude + should + + + mean + exclude + + + health + the + + + children + health + + + health + Ameri + + + children + can + + + exclude + children + + + debate + this + + + children + debate + + + + + ROOT + mean + + + mean + 
But + + + put + just + + + put + because + + + put + we + + + put + ca + + + put + n't + + + mean + put + + + value + a + + + value + dollar + + + put + value + + + child + a + + + health + child + + + put + health + + + mean + does + + + mean + n't + + + exclude + we + + + exclude + should + + + mean + exclude + + + health + the + + + children + health + + + health + Ameri + + + children + can + + + exclude + children + + + debate + this + + + children + debate + + + + + + + To + to + 2782 + 2784 + TO + O + + + the + the + 2785 + 2788 + DT + O + + + contrary + contrary + 2790 + 2798 + NN + O + + + , + , + 2798 + 2799 + , + O + + + it + it + 2800 + 2802 + PRP + O + + + is + be + 2803 + 2805 + VBZ + O + + + and + and + 2806 + 2809 + CC + O + + + should + should + 2810 + 2816 + MD + O + + + be + be + 2817 + 2819 + VB + O + + + central + central + 2820 + 2827 + JJ + O + + + to + to + 2828 + 2830 + TO + O + + + this + this + 2832 + 2836 + DT + O + + + debate + debate + 2837 + 2843 + NN + O + + + . + . + 2843 + 2844 + . + O + + + (ROOT (S (PP (TO To) (NP (DT the) (NN contrary))) (, ,) (NP (PRP it)) (VP (VP (VBZ is)) (CC and) (VP (MD should) (VP (VB be) (ADJP (JJ central) (PP (TO to) (NP (DT this) (NN debate))))))) (. 
.))) + + + ROOT + is + + + is + To + + + contrary + the + + + To + contrary + + + is + it + + + is + and + + + central + should + + + central + be + + + is + central + + + central + to + + + debate + this + + + to + debate + + + + + ROOT + is + + + contrary + the + + + is + contrary + + + is + it + + + central + should + + + central + be + + + is + central + + + debate + this + + + central + debate + + + + + ROOT + is + + + contrary + the + + + is + contrary + + + is + it + + + central + it + + + central + should + + + central + be + + + is + central + + + debate + this + + + central + debate + + + + + + + We + we + 2848 + 2850 + PRP + O + + + should + should + 2851 + 2857 + MD + O + + + get + get + 2858 + 2861 + VB + O + + + in + in + 2862 + 2864 + IN + O + + + perspective + perspective + 2865 + 2876 + NN + O + + + and + and + 2877 + 2880 + CC + O + + + keep + keep + 2882 + 2886 + VB + O + + + in + in + 2887 + 2889 + IN + O + + + perspective + perspective + 2890 + 2901 + NN + O + + + the + the + 2902 + 2905 + DT + O + + + fact + fact + 2906 + 2910 + NN + O + + + that + that + 2911 + 2915 + IN + O + + + we + we + 2916 + 2918 + PRP + O + + + are + be + 2920 + 2923 + VBP + O + + + considering + consider + 2924 + 2935 + VBG + O + + + a + a + 2936 + 2937 + DT + O + + + health + health + 2938 + 2944 + NN + O + + + bill + bill + 2945 + 2949 + NN + O + + + . + . + 2949 + 2950 + . + O + + + (ROOT (S (NP (PRP We)) (VP (MD should) (VP (VP (VB get) (PP (IN in) (NP (NN perspective)))) (CC and) (VP (VB keep) (PP (IN in) (NP (NN perspective))) (NP (NP (DT the) (NN fact)) (SBAR (IN that) (S (NP (PRP we)) (VP (VBP are) (VP (VBG considering) (NP (DT a) (NN health) (NN bill)))))))))) (. 
.))) + + + ROOT + get + + + get + We + + + get + should + + + get + in + + + in + perspective + + + get + and + + + get + keep + + + keep + in + + + in + perspective + + + fact + the + + + keep + fact + + + considering + that + + + considering + we + + + considering + are + + + fact + considering + + + bill + a + + + bill + health + + + considering + bill + + + + + ROOT + get + + + get + We + + + get + should + + + get + perspective + + + get + keep + + + keep + perspective + + + fact + the + + + keep + fact + + + considering + that + + + considering + we + + + considering + are + + + fact + considering + + + bill + a + + + bill + health + + + considering + bill + + + + + ROOT + get + + + get + We + + + keep + We + + + get + should + + + get + perspective + + + get + keep + + + keep + perspective + + + fact + the + + + keep + fact + + + considering + that + + + considering + we + + + considering + are + + + fact + considering + + + bill + a + + + bill + health + + + considering + bill + + + + + + + Its + its + 2951 + 2954 + PRP$ + O + + + pur + pur + 2955 + 2958 + SYM + O + + + - + - + 2958 + 2959 + : + O + + + pose + pose + 2961 + 2965 + VB + O + + + is + be + 2966 + 2968 + VBZ + O + + + basic + basic + 2969 + 2974 + JJ + O + + + : + : + 2974 + 2975 + : + O + + + To + to + 2976 + 2978 + TO + O + + + make + make + 2979 + 2983 + VB + O + + + the + the + 2984 + 2987 + DT + O + + + air + air + 2988 + 2991 + NN + O + + + we + we + 2992 + 2994 + PRP + O + + + must + must + 2995 + 2999 + MD + O + + + all + all + 3001 + 3004 + RB + O + + + breathe + breathe + 3005 + 3012 + VB + O + + + fit + fit + 3013 + 3016 + NN + O + + + for + for + 3017 + 3020 + IN + O + + + human + human + 3021 + 3026 + JJ + O + + + lungs + lung + 3027 + 3032 + NNS + O + + + . + . + 3032 + 3033 + . 
+ O + + + (ROOT (NP (NP (PRP$ Its) (SYM pur)) (: -) (S (S (VP (VB pose) (SBAR (S (VP (VBZ is) (ADJP (JJ basic))))))) (: :) (S (VP (TO To) (VP (VB make) (NP (NP (DT the) (NN air)) (SBAR (S (NP (PRP we)) (VP (MD must) (RB all) (VP (VB breathe) (NP (NN fit)) (PP (IN for) (NP (JJ human) (NNS lungs)))))))))))) (. .))) + + + ROOT + Its + + + Its + pur + + + Its + pose + + + basic + is + + + pose + basic + + + make + To + + + pose + make + + + air + the + + + make + air + + + breathe + we + + + breathe + must + + + breathe + all + + + air + breathe + + + breathe + fit + + + breathe + for + + + lungs + human + + + for + lungs + + + + + ROOT + Its + + + Its + pur + + + Its + pose + + + basic + is + + + pose + basic + + + make + To + + + pose + make + + + air + the + + + make + air + + + breathe + we + + + breathe + must + + + breathe + all + + + air + breathe + + + breathe + fit + + + lungs + human + + + breathe + lungs + + + + + ROOT + Its + + + Its + pur + + + Its + pose + + + basic + is + + + pose + basic + + + make + To + + + pose + make + + + air + the + + + make + air + + + breathe + we + + + breathe + must + + + breathe + all + + + air + breathe + + + breathe + fit + + + lungs + human + + + breathe + lungs + + + + + + + Beyond + beyond + 3037 + 3043 + IN + O + + + the + the + 3044 + 3047 + DT + O + + + Clean + Clean + 3048 + 3053 + NNP + MISC + + + Air + Air + 3054 + 3057 + NNP + MISC + + + Act + Act + 3058 + 3061 + NNP + MISC + + + , + , + 3061 + 3062 + , + O + + + this + this + 3063 + 3067 + DT + O + + + 2d + 2d + 3068 + 3070 + JJ + O + + + session + session + 3072 + 3079 + NN + O + + + of + of + 3080 + 3082 + IN + O + + + the + the + 3083 + 3086 + DT + O + + + 101st + 101st + 3087 + 3092 + CD + NUMBER + 101.0 + + + Congress + Congress + 3093 + 3101 + NNP + ORGANIZATION + + + will + will + 3102 + 3106 + MD + O + + + be + be + 3107 + 3109 + VB + O + + + busy + busy + 3111 + 3115 + JJ + O + + + . + . + 3115 + 3116 + . 
+ O + + + (ROOT (S (PP (IN Beyond) (NP (DT the) (NNP Clean) (NNP Air) (NNP Act))) (, ,) (NP (NP (DT this) (JJ 2d) (NN session)) (PP (IN of) (NP (DT the) (CD 101st)))) (NP (NNP Congress)) (VP (MD will) (VP (VB be) (ADJP (JJ busy)))) (. .))) + + + ROOT + busy + + + busy + Beyond + + + Act + the + + + Act + Clean + + + Act + Air + + + Beyond + Act + + + session + this + + + session + 2d + + + busy + session + + + session + of + + + 101st + the + + + of + 101st + + + busy + Congress + + + busy + will + + + busy + be + + + + + ROOT + busy + + + Act + the + + + Act + Clean + + + Act + Air + + + busy + Act + + + session + this + + + session + 2d + + + busy + session + + + 101st + the + + + session + 101st + + + busy + Congress + + + busy + will + + + busy + be + + + + + ROOT + busy + + + Act + the + + + Act + Clean + + + Act + Air + + + busy + Act + + + session + this + + + session + 2d + + + busy + session + + + 101st + the + + + session + 101st + + + busy + Congress + + + busy + will + + + busy + be + + + + + + + We + we + 3117 + 3119 + PRP + O + + + have + have + 3120 + 3124 + VBP + O + + + unfinished + unfinished + 3125 + 3135 + JJ + O + + + business + business + 3136 + 3144 + NN + O + + + from + from + 3146 + 3150 + IN + O + + + the + the + 3151 + 3154 + DT + O + + + first + first + 3155 + 3160 + JJ + ORDINAL + 1.0 + + + session + session + 3161 + 3168 + NN + O + + + to + to + 3169 + 3171 + TO + O + + + complete + complete + 3172 + 3180 + VB + O + + + , + , + 3180 + 3181 + , + O + + + im + im + 3182 + 3184 + SYM + O + + + - + - + 3184 + 3185 + : + O + + + portant + portant + 3187 + 3194 + JJ + O + + + reauthorizations + reauthorization + 3195 + 3211 + NNS + O + + + to + to + 3212 + 3214 + TO + O + + + write + write + 3215 + 3220 + VB + O + + + , + , + 3220 + 3221 + , + O + + + a + a + 3222 + 3223 + DT + O + + + dramatically + dramatically + 3225 + 3237 + RB + O + + + different + different + 3238 + 3247 + JJ + O + + + world + world + 3248 + 3253 + NN + O + + + against 
+ against + 3254 + 3261 + IN + O + + + which + which + 3263 + 3268 + WDT + O + + + to + to + 3269 + 3271 + TO + O + + + weigh + weigh + 3272 + 3277 + VB + O + + + our + we + 3278 + 3281 + PRP$ + O + + + Nation + Nation + 3282 + 3288 + NNP + O + + + 's + 's + 3288 + 3290 + POS + O + + + security + security + 3291 + 3299 + NN + O + + + needs + need + 3301 + 3306 + NNS + O + + + and + and + 3307 + 3310 + CC + O + + + priorities + priority + 3311 + 3321 + NNS + O + + + , + , + 3321 + 3322 + , + O + + + as + as + 3323 + 3325 + RB + O + + + well + well + 3326 + 3330 + RB + O + + + as + as + 3331 + 3333 + IN + O + + + the + the + 3334 + 3337 + DT + O + + + re + re + 3338 + 3340 + NN + O + + + - + - + 3340 + 3341 + : + O + + + quired + quire + 3343 + 3349 + VBN + O + + + budget + budget + 3351 + 3357 + NN + O + + + and + and + 3360 + 3363 + CC + O + + + appropriations + appropriation + 3366 + 3380 + NNS + O + + + measures + measure + 3382 + 3390 + NNS + O + + + for + for + 3391 + 3394 + IN + O + + + 1991 + 1991 + 3395 + 3399 + CD + DATE + 1991 + 1991 + + + . + . + 3399 + 3400 + . + O + + + (ROOT (S (NP (PRP We)) (VP (VBP have) (NP (NP (JJ unfinished) (NN business)) (PP (IN from) (NP (DT the) (JJ first) (NN session) (S (VP (TO to) (VP (VB complete) (, ,) (FRAG (X (SYM im)) (: -) (NP (JJ portant) (NNS reauthorizations))))))))) (S (VP (TO to) (VP (VP (VB write)) (, ,) (NP (NP (DT a) (ADJP (RB dramatically) (JJ different)) (NN world)) (PP (IN against) (SBAR (WHNP (WDT which)) (S (VP (TO to) (VP (VB weigh) (S (NP (NP (PRP$ our) (NP (NP (NNP Nation) (POS 's)) (NN security) (NNS needs) (CC and) (NNS priorities))) (, ,) (CONJP (RB as) (RB well) (IN as)) (NP (DT the) (NN re)) (: -)) (VP (VBN quired) (NP (NP (NN budget) (CC and) (NNS appropriations)) (NNS measures)) (PP (IN for) (NP (CD 1991))))))))))))))) (. 
.))) + + + ROOT + have + + + have + We + + + business + unfinished + + + have + business + + + business + from + + + session + the + + + session + first + + + from + session + + + complete + to + + + session + complete + + + reauthorizations + im + + + reauthorizations + portant + + + complete + reauthorizations + + + write + to + + + have + write + + + world + a + + + different + dramatically + + + world + different + + + write + world + + + world + against + + + weigh + which + + + weigh + to + + + against + weigh + + + needs + our + + + needs + Nation + + + Nation + 's + + + needs + security + + + quired + needs + + + needs + and + + + needs + priorities + + + well + as + + + needs + well + + + well + as + + + re + the + + + needs + re + + + weigh + quired + + + measures + budget + + + budget + and + + + budget + appropriations + + + quired + measures + + + quired + for + + + for + 1991 + + + + + ROOT + have + + + have + We + + + business + unfinished + + + have + business + + + session + the + + + session + first + + + business + session + + + complete + to + + + session + complete + + + reauthorizations + im + + + reauthorizations + portant + + + complete + reauthorizations + + + write + to + + + have + write + + + world + a + + + different + dramatically + + + world + different + + + write + world + + + weigh + which + + + weigh + to + + + world + weigh + + + needs + our + + + needs + Nation + + + needs + security + + + quired + needs + + + needs + priorities + + + re + the + + + needs + re + + + weigh + quired + + + measures + budget + + + budget + appropriations + + + quired + measures + + + quired + 1991 + + + + + ROOT + have + + + have + We + + + business + unfinished + + + have + business + + + session + the + + + session + first + + + business + session + + + complete + to + + + session + complete + + + reauthorizations + im + + + reauthorizations + portant + + + complete + reauthorizations + + + write + to + + + have + write + + + world + a + + + 
different + dramatically + + + world + different + + + write + world + + + weigh + which + + + weigh + to + + + world + weigh + + + needs + our + + + needs + Nation + + + needs + security + + + quired + needs + + + needs + priorities + + + quired + priorities + + + re + the + + + needs + re + + + quired + re + + + weigh + quired + + + measures + budget + + + budget + appropriations + + + measures + appropriations + + + quired + measures + + + quired + 1991 + + + + + + + I + I + 3404 + 3405 + PRP + O + + + hope + hope + 3406 + 3410 + VBP + O + + + conferees + conferee + 3411 + 3420 + NNS + O + + + on + on + 3421 + 3423 + IN + O + + + the + the + 3424 + 3427 + DT + O + + + unfinished + unfinished + 3428 + 3438 + JJ + O + + + business + business + 3440 + 3448 + NN + O + + + of + of + 3449 + 3451 + IN + O + + + the + the + 3452 + 3455 + DT + O + + + first + first + 3456 + 3461 + JJ + ORDINAL + 1.0 + + + session + session + 3462 + 3469 + NN + O + + + , + , + 3469 + 3470 + , + O + + + drug + drug + 3471 + 3475 + NN + O + + + treatment + treatment + 3477 + 3486 + NN + O + + + legislation + legislation + 3487 + 3498 + NN + O + + + and + and + 3499 + 3502 + CC + O + + + oilspill + oilspill + 3503 + 3511 + NN + O + + + liabil + liabil + 3512 + 3518 + NN + O + + + - + - + 3518 + 3519 + : + O + + + ity + ity + 3521 + 3524 + NN + O + + + , + , + 3524 + 3525 + , + O + + + will + will + 3526 + 3530 + MD + O + + + act + act + 3531 + 3534 + VB + O + + + promptly + promptly + 3535 + 3543 + RB + O + + + . + . + 3543 + 3544 + . + O + + + (ROOT (S (S (NP (PRP I)) (VP (VBP hope) (NP (NNS conferees)) (PP (IN on) (NP (NP (DT the) (JJ unfinished) (NN business)) (PP (IN of) (NP (DT the) (JJ first) (NN session))))))) (, ,) (NP (NP (NP (NN drug) (NN treatment) (NN legislation)) (CC and) (NP (NN oilspill) (NN liabil))) (: -) (NP (NN ity)) (, ,)) (VP (MD will) (VP (VB act) (ADVP (RB promptly)))) (. 
.))) + + + ROOT + act + + + hope + I + + + act + hope + + + hope + conferees + + + hope + on + + + business + the + + + business + unfinished + + + on + business + + + business + of + + + session + the + + + session + first + + + of + session + + + legislation + drug + + + legislation + treatment + + + act + legislation + + + legislation + and + + + liabil + oilspill + + + legislation + liabil + + + legislation + ity + + + act + will + + + act + promptly + + + + + ROOT + act + + + hope + I + + + act + hope + + + hope + conferees + + + business + the + + + business + unfinished + + + hope + business + + + session + the + + + session + first + + + business + session + + + legislation + drug + + + legislation + treatment + + + act + legislation + + + liabil + oilspill + + + legislation + liabil + + + legislation + ity + + + act + will + + + act + promptly + + + + + ROOT + act + + + hope + I + + + act + hope + + + hope + conferees + + + business + the + + + business + unfinished + + + hope + business + + + session + the + + + session + first + + + business + session + + + legislation + drug + + + legislation + treatment + + + act + legislation + + + liabil + oilspill + + + legislation + liabil + + + act + liabil + + + legislation + ity + + + act + will + + + act + promptly + + + + + + + These + these + 3545 + 3550 + DT + O + + + are + be + 3551 + 3554 + VBP + O + + + im + im + 3555 + 3557 + SYM + O + + + - + - + 3557 + 3558 + : + O + + + portant + portant + 3560 + 3567 + JJ + O + + + matters + matter + 3568 + 3575 + NNS + O + + + we + we + 3576 + 3578 + PRP + O + + + should + should + 3579 + 3585 + MD + O + + + be + be + 3586 + 3588 + VB + O + + + able + able + 3589 + 3593 + JJ + O + + + to + to + 3594 + 3596 + TO + O + + + finish + finish + 3598 + 3604 + VB + O + + + swiftly + swiftly + 3605 + 3612 + RB + O + + + . + . + 3612 + 3613 + . 
+ O + + + (ROOT (S (NP (DT These)) (VP (VBP are) (FRAG (X (SYM im)) (: -) (NP (NP (JJ portant) (NNS matters)) (SBAR (S (NP (PRP we)) (VP (MD should) (VP (VB be) (ADJP (JJ able) (S (VP (TO to) (VP (VB finish) (ADVP (RB swiftly))))))))))) (. .))))) + + + ROOT + are + + + are + These + + + matters + im + + + matters + portant + + + are + matters + + + able + we + + + able + should + + + able + be + + + matters + able + + + finish + to + + + able + finish + + + finish + swiftly + + + + + ROOT + are + + + are + These + + + matters + im + + + matters + portant + + + are + matters + + + able + we + + + able + should + + + able + be + + + matters + able + + + finish + to + + + able + finish + + + finish + swiftly + + + + + ROOT + are + + + are + These + + + matters + im + + + matters + portant + + + are + matters + + + able + we + + + able + should + + + able + be + + + matters + able + + + finish + to + + + able + finish + + + finish + swiftly + + + + + + + Tomorrow + tomorrow + 3617 + 3625 + NN + DATE + OFFSET P1D + + + + , + , + 3625 + 3626 + , + O + + + the + the + 3627 + 3630 + DT + O + + + House + House + 3631 + 3636 + NNP + ORGANIZATION + + + will + will + 3637 + 3641 + MD + O + + + override + override + 3642 + 3650 + VB + O + + + the + the + 3652 + 3655 + DT + O + + + President + President + 3656 + 3665 + NNP + O + + + 's + 's + 3665 + 3667 + POS + O + + + veto + veto + 3668 + 3672 + NN + O + + + of + of + 3673 + 3675 + IN + O + + + legislation + legislation + 3676 + 3687 + NN + O + + + ex + ex + 3688 + 3690 + FW + O + + + - + - + 3690 + 3691 + : + O + + + tending + tend + 3693 + 3700 + VBG + O + + + the + the + 3701 + 3704 + DT + O + + + visa + visa + 3705 + 3709 + NN + O + + + protections + protection + 3710 + 3721 + NNS + O + + + of + of + 3722 + 3724 + IN + O + + + Chi + Chi + 3725 + 3728 + NNP + O + + + - + - + 3728 + 3729 + : + O + + + nese + nese + 3731 + 3735 + JJ + O + + + students + student + 3736 + 3744 + NNS + O + + + and + and + 3745 + 3748 + CC + O + 
+ + exchange + exchange + 3749 + 3757 + NN + O + + + scholars + scholar + 3758 + 3766 + NNS + O + + + . + . + 3766 + 3767 + . + O + + + (ROOT (S (NP-TMP (NN Tomorrow)) (, ,) (NP (DT the) (NNP House)) (VP (MD will) (VP (VB override) (NP (NP (NP (NP (NP (DT the) (NNP President) (POS 's)) (NN veto)) (PP (IN of) (NP (NN legislation)))) (NP (FW ex))) (PRN (: -) (VP (VBG tending) (NP (NP (DT the) (NN visa) (NNS protections)) (PP (IN of) (NP (NNP Chi))))) (: -)) (NP (NP (JJ nese) (NNS students)) (CC and) (NP (NN exchange) (NNS scholars)))))) (. .))) + + + ROOT + override + + + override + Tomorrow + + + House + the + + + override + House + + + override + will + + + President + the + + + veto + President + + + President + 's + + + override + veto + + + veto + of + + + of + legislation + + + veto + ex + + + veto + tending + + + protections + the + + + protections + visa + + + tending + protections + + + protections + of + + + of + Chi + + + students + nese + + + veto + students + + + students + and + + + scholars + exchange + + + students + scholars + + + + + ROOT + override + + + override + Tomorrow + + + House + the + + + override + House + + + override + will + + + President + the + + + veto + President + + + override + veto + + + veto + legislation + + + veto + ex + + + veto + tending + + + protections + the + + + protections + visa + + + tending + protections + + + protections + Chi + + + students + nese + + + veto + students + + + scholars + exchange + + + students + scholars + + + + + ROOT + override + + + override + Tomorrow + + + House + the + + + override + House + + + override + will + + + President + the + + + veto + President + + + override + veto + + + veto + legislation + + + veto + ex + + + veto + tending + + + protections + the + + + protections + visa + + + tending + protections + + + protections + Chi + + + students + nese + + + veto + students + + + scholars + exchange + + + veto + scholars + + + students + scholars + + + + + + + It + it + 3769 + 3771 + 
PRP + O + + + is + be + 3772 + 3774 + VBZ + O + + + my + my + 3775 + 3777 + PRP$ + O + + + intention + intention + 3778 + 3787 + NN + O + + + to + to + 3788 + 3790 + TO + O + + + ask + ask + 3791 + 3794 + VB + O + + + the + the + 3795 + 3798 + DT + O + + + Senate + Senate + 3799 + 3805 + NNP + ORGANIZATION + + + to + to + 3806 + 3808 + TO + O + + + move + move + 3810 + 3814 + VB + O + + + promptly + promptly + 3815 + 3823 + RB + O + + + to + to + 3824 + 3826 + TO + O + + + that + that + 3827 + 3831 + DT + O + + + proposal + proposal + 3832 + 3840 + NN + O + + + . + . + 3840 + 3841 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (PRP$ my) (NN intention) (S (VP (TO to) (VP (VB ask) (S (NP (DT the) (NNP Senate)) (VP (TO to) (VP (VB move) (ADVP (RB promptly)) (PP (TO to) (NP (DT that) (NN proposal))))))))))) (. .))) + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + ask + to + + + intention + ask + + + Senate + the + + + move + Senate + + + move + to + + + ask + move + + + move + promptly + + + move + to + + + proposal + that + + + to + proposal + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + ask + to + + + intention + ask + + + Senate + the + + + move + Senate + + + move + to + + + ask + move + + + move + promptly + + + proposal + that + + + move + proposal + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + ask + to + + + intention + ask + + + Senate + the + + + move + Senate + + + move + to + + + ask + move + + + move + promptly + + + proposal + that + + + move + proposal + + + + + + + I + I + 3845 + 3846 + PRP + O + + + regret + regret + 3847 + 3853 + VBP + O + + + the + the + 3854 + 3857 + DT + O + + + President + President + 3858 + 3867 + NNP + O + + + 's + 's + 3867 + 3869 + POS + O + + + veto + veto + 3870 + 3874 + NN + O + + + of + of + 3875 + 3877 + IN + O + + + this + this + 3878 + 3882 + DT + O + + + bill + bill + 3884 + 
3888 + NN + O + + + . + . + 3888 + 3889 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP regret) (NP (NP (NP (DT the) (NNP President) (POS 's)) (NN veto)) (PP (IN of) (NP (DT this) (NN bill))))) (. .))) + + + ROOT + regret + + + regret + I + + + President + the + + + veto + President + + + President + 's + + + regret + veto + + + veto + of + + + bill + this + + + of + bill + + + + + ROOT + regret + + + regret + I + + + President + the + + + veto + President + + + regret + veto + + + bill + this + + + veto + bill + + + + + ROOT + regret + + + regret + I + + + President + the + + + veto + President + + + regret + veto + + + bill + this + + + veto + bill + + + + + + + His + he + 3890 + 3893 + PRP$ + O + + + claim + claim + 3894 + 3899 + NN + O + + + that + that + 3900 + 3904 + IN + O + + + he + he + 3905 + 3907 + PRP + O + + + is + be + 3908 + 3910 + VBZ + O + + + doing + do + 3911 + 3916 + VBG + O + + + as + as + 3917 + 3919 + RB + O + + + much + much + 3920 + 3924 + RB + O + + + through + through + 3926 + 3933 + IN + O + + + a + a + 3934 + 3935 + DT + O + + + Presidential + presidential + 3936 + 3948 + JJ + O + + + memorandum + memorandum + 3949 + 3959 + NN + O + + + of + of + 3961 + 3963 + IN + O + + + disapproval + disapproval + 3964 + 3975 + NN + O + + + as + as + 3976 + 3978 + IN + O + + + the + the + 3979 + 3982 + DT + O + + + bill + bill + 3983 + 3987 + NN + O + + + would + would + 3988 + 3993 + MD + O + + + do + do + 3994 + 3996 + VB + O + + + through + through + 3998 + 4005 + IN + O + + + the + the + 4006 + 4009 + DT + O + + + law + law + 4010 + 4013 + NN + O + + + is + be + 4014 + 4016 + VBZ + O + + + unpersuasive + unpersuasive + 4017 + 4029 + JJ + O + + + . + . + 4029 + 4030 + . 
+ O + + + (ROOT (S (NP (NP (PRP$ His) (NN claim)) (SBAR (IN that) (S (NP (PRP he)) (VP (VBZ is) (VP (VBG doing) (ADVP (RB as) (RB much)) (PP (IN through) (NP (NP (DT a) (JJ Presidential) (NN memorandum)) (PP (IN of) (NP (NN disapproval))))) (SBAR (IN as) (S (NP (DT the) (NN bill)) (VP (MD would) (VP (VB do) (PP (IN through) (NP (DT the) (NN law)))))))))))) (VP (VBZ is) (ADJP (JJ unpersuasive))) (. .))) + + + ROOT + unpersuasive + + + claim + His + + + unpersuasive + claim + + + doing + that + + + doing + he + + + doing + is + + + claim + doing + + + much + as + + + doing + much + + + doing + through + + + memorandum + a + + + memorandum + Presidential + + + through + memorandum + + + memorandum + of + + + of + disapproval + + + do + as + + + bill + the + + + do + bill + + + do + would + + + doing + do + + + do + through + + + law + the + + + through + law + + + unpersuasive + is + + + + + ROOT + unpersuasive + + + claim + His + + + unpersuasive + claim + + + doing + that + + + doing + he + + + doing + is + + + claim + doing + + + much + as + + + doing + much + + + memorandum + a + + + memorandum + Presidential + + + doing + memorandum + + + memorandum + disapproval + + + do + as + + + bill + the + + + do + bill + + + do + would + + + doing + do + + + law + the + + + do + law + + + unpersuasive + is + + + + + ROOT + unpersuasive + + + claim + His + + + unpersuasive + claim + + + doing + that + + + doing + he + + + doing + is + + + claim + doing + + + much + as + + + doing + much + + + memorandum + a + + + memorandum + Presidential + + + doing + memorandum + + + memorandum + disapproval + + + do + as + + + bill + the + + + do + bill + + + do + would + + + doing + do + + + law + the + + + do + law + + + unpersuasive + is + + + + + + + The + the + 4034 + 4037 + DT + O + + + President + President + 4038 + 4047 + NNP + O + + + 's + 's + 4047 + 4049 + POS + O + + + memorandum + memorandum + 4050 + 4060 + NN + O + + + of + of + 4061 + 4063 + IN + O + + + dis + di + 4064 + 
4067 + NN + O + + + - + - + 4067 + 4068 + : + O + + + approval + approval + 4070 + 4078 + NN + O + + + is + be + 4079 + 4081 + VBZ + O + + + only + only + 4082 + 4086 + RB + O + + + an + a + 4087 + 4089 + DT + O + + + administrative + administrative + 4090 + 4104 + JJ + O + + + action + action + 4106 + 4112 + NN + O + + + . + . + 4112 + 4113 + . + O + + + (ROOT (NP (NP (NP (NP (DT The) (NNP President) (POS 's)) (NN memorandum)) (PP (IN of) (NP (NN dis)))) (: -) (NP (NP (NN approval)) (SBAR (S (VP (VBZ is) (ADVP (RB only)) (NP (DT an) (JJ administrative) (NN action)))))) (. .))) + + + ROOT + memorandum + + + President + The + + + memorandum + President + + + President + 's + + + memorandum + of + + + of + dis + + + memorandum + approval + + + action + is + + + action + only + + + action + an + + + action + administrative + + + approval + action + + + + + ROOT + memorandum + + + President + The + + + memorandum + President + + + memorandum + dis + + + memorandum + approval + + + action + is + + + action + only + + + action + an + + + action + administrative + + + approval + action + + + + + ROOT + memorandum + + + President + The + + + memorandum + President + + + memorandum + dis + + + memorandum + approval + + + action + is + + + action + only + + + action + an + + + action + administrative + + + approval + action + + + + + + + It + it + 4114 + 4116 + PRP + O + + + provides + provide + 4117 + 4125 + VBZ + O + + + no + no + 4126 + 4128 + DT + O + + + statutory + statutory + 4129 + 4138 + JJ + O + + + legal + legal + 4139 + 4144 + JJ + O + + + protection + protection + 4146 + 4156 + NN + O + + + for + for + 4157 + 4160 + IN + O + + + the + the + 4161 + 4164 + DT + O + + + Chinese + chinese + 4165 + 4172 + JJ + MISC + + + students + student + 4173 + 4181 + NNS + O + + + . + . + 4181 + 4182 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ provides) (NP (NP (DT no) (JJ statutory) (JJ legal) (NN protection)) (PP (IN for) (NP (DT the) (JJ Chinese) (NNS students))))) (. 
.))) + + + ROOT + provides + + + provides + It + + + protection + no + + + protection + statutory + + + protection + legal + + + provides + protection + + + protection + for + + + students + the + + + students + Chinese + + + for + students + + + + + ROOT + provides + + + provides + It + + + protection + no + + + protection + statutory + + + protection + legal + + + provides + protection + + + students + the + + + students + Chinese + + + protection + students + + + + + ROOT + provides + + + provides + It + + + protection + no + + + protection + statutory + + + protection + legal + + + provides + protection + + + students + the + + + students + Chinese + + + protection + students + + + + + + + It + it + 4183 + 4185 + PRP + O + + + can + can + 4187 + 4190 + MD + O + + + be + be + 4191 + 4193 + VB + O + + + revoked + revoke + 4194 + 4201 + VBN + O + + + by + by + 4202 + 4204 + IN + O + + + the + the + 4205 + 4208 + DT + O + + + President + President + 4209 + 4218 + NNP + O + + + or + or + 4219 + 4221 + CC + O + + + the + the + 4222 + 4225 + DT + O + + + Attorney + Attorney + 4227 + 4235 + NNP + O + + + General + General + 4236 + 4243 + NNP + O + + + at + at + 4244 + 4246 + IN + O + + + their + they + 4247 + 4252 + PRP$ + O + + + discretion + discretion + 4253 + 4263 + NN + O + + + . + . + 4263 + 4264 + . + O + + + (ROOT (S (NP (PRP It)) (VP (MD can) (VP (VB be) (VP (VBN revoked) (PP (IN by) (NP (NP (DT the) (NNP President)) (CC or) (NP (DT the) (NNP Attorney) (NNP General)))) (PP (IN at) (NP (PRP$ their) (NN discretion)))))) (. 
.))) + + + ROOT + revoked + + + revoked + It + + + revoked + can + + + revoked + be + + + revoked + by + + + President + the + + + by + President + + + President + or + + + General + the + + + General + Attorney + + + President + General + + + revoked + at + + + discretion + their + + + at + discretion + + + + + ROOT + revoked + + + revoked + It + + + revoked + can + + + revoked + be + + + President + the + + + revoked + President + + + General + the + + + General + Attorney + + + President + General + + + discretion + their + + + revoked + discretion + + + + + ROOT + revoked + + + revoked + It + + + revoked + can + + + revoked + be + + + President + the + + + revoked + President + + + General + the + + + General + Attorney + + + revoked + General + + + President + General + + + discretion + their + + + revoked + discretion + + + + + + + This + this + 4268 + 4272 + DT + O + + + administrative + administrative + 4273 + 4287 + JJ + O + + + action + action + 4288 + 4294 + NN + O + + + could + could + 4295 + 4300 + MD + O + + + also + also + 4301 + 4305 + RB + O + + + be + be + 4307 + 4309 + VB + O + + + challenged + challenge + 4310 + 4320 + VBN + O + + + because + because + 4321 + 4328 + IN + O + + + immigration + immigration + 4329 + 4340 + NN + O + + + law + law + 4342 + 4345 + NN + O + + + does + do + 4346 + 4350 + VBZ + O + + + not + not + 4351 + 4354 + RB + O + + + , + , + 4354 + 4355 + , + O + + + in + in + 4356 + 4358 + IN + O + + + general + general + 4359 + 4366 + JJ + O + + + , + , + 4366 + 4367 + , + O + + + permit + permit + 4368 + 4374 + VB + O + + + aliens + alien + 4375 + 4381 + NNS + O + + + to + to + 4383 + 4385 + TO + O + + + adjust + adjust + 4386 + 4392 + VB + O + + + their + they + 4393 + 4398 + PRP$ + O + + + status + status + 4399 + 4405 + NN + O + + + if + if + 4406 + 4408 + IN + O + + + they + they + 4409 + 4413 + PRP + O + + + apply + apply + 4414 + 4419 + VBP + O + + + to + to + 4420 + 4422 + TO + O + + + do + do + 4424 + 4426 + VB + O + + 
+ so + so + 4427 + 4429 + RB + O + + + while + while + 4430 + 4435 + IN + O + + + they + they + 4436 + 4440 + PRP + O + + + are + be + 4441 + 4444 + VBP + O + + + technically + technically + 4445 + 4456 + RB + O + + + in + in + 4457 + 4459 + IN + O + + + ille + ille + 4460 + 4464 + NN + O + + + - + - + 4464 + 4465 + : + O + + + gal + gal + 4467 + 4470 + NN + O + + + status + status + 4471 + 4477 + NN + O + + + . + . + 4477 + 4478 + . + O + + + (ROOT (S (NP (DT This) (JJ administrative) (NN action)) (VP (MD could) (ADVP (RB also)) (VP (VB be) (VP (VBN challenged) (SBAR (IN because) (S (NP (NN immigration) (NN law)) (VP (VBZ does) (ADVP (RB not) (PRN (, ,) (PP (IN in) (ADJP (JJ general))) (, ,)) (S (VP (VB permit) (S (NP (NNS aliens)) (VP (TO to) (VP (VB adjust) (NP (PRP$ their) (NN status)))))))) (SBAR (IN if) (S (NP (PRP they)) (VP (VBP apply) (S (VP (TO to) (VP (VB do) (SBAR (RB so) (IN while) (S (NP (PRP they)) (VP (VBP are) (ADVP (RB technically)) (PP (IN in) (NP (NN ille))))))))) (: -) (NP (NN gal) (NN status))))))))))) (. 
.))) + + + ROOT + challenged + + + action + This + + + action + administrative + + + challenged + action + + + challenged + could + + + challenged + also + + + challenged + be + + + does + because + + + law + immigration + + + does + law + + + challenged + does + + + does + not + + + not + in + + + in + general + + + not + permit + + + adjust + aliens + + + adjust + to + + + permit + adjust + + + status + their + + + adjust + status + + + apply + if + + + apply + they + + + does + apply + + + do + to + + + apply + do + + + are + so + + + are + while + + + are + they + + + do + are + + + are + technically + + + are + in + + + in + ille + + + status + gal + + + apply + status + + + + + ROOT + challenged + + + action + This + + + action + administrative + + + challenged + action + + + challenged + could + + + challenged + also + + + challenged + be + + + does + because + + + law + immigration + + + does + law + + + challenged + does + + + does + not + + + not + in + + + in + general + + + not + permit + + + adjust + aliens + + + adjust + to + + + permit + adjust + + + status + their + + + adjust + status + + + apply + if + + + apply + they + + + does + apply + + + do + to + + + apply + do + + + are + so + + + are + while + + + are + they + + + do + are + + + are + technically + + + are + ille + + + status + gal + + + apply + status + + + + + ROOT + challenged + + + action + This + + + action + administrative + + + challenged + action + + + challenged + could + + + challenged + also + + + challenged + be + + + does + because + + + law + immigration + + + does + law + + + challenged + does + + + does + not + + + not + in + + + in + general + + + not + permit + + + adjust + aliens + + + adjust + to + + + permit + adjust + + + status + their + + + adjust + status + + + apply + if + + + apply + they + + + does + apply + + + do + to + + + apply + do + + + are + so + + + are + while + + + are + they + + + do + are + + + are + technically + + + are + ille + + + status + gal + 
+ + apply + status + + + + + + + It + it + 4479 + 4481 + PRP + O + + + is + be + 4482 + 4484 + VBZ + O + + + an + a + 4485 + 4487 + DT + O + + + open + open + 4488 + 4492 + JJ + O + + + question + question + 4493 + 4501 + NN + O + + + whether + whether + 4503 + 4510 + IN + O + + + the + the + 4511 + 4514 + DT + O + + + administration + administration + 4515 + 4529 + NN + O + + + has + have + 4530 + 4533 + VBZ + O + + + the + the + 4534 + 4537 + DT + O + + + authority + authority + 4539 + 4548 + NN + O + + + to + to + 4549 + 4551 + TO + O + + + grant + grant + 4552 + 4557 + VB + O + + + such + such + 4558 + 4562 + PDT + O + + + a + a + 4563 + 4564 + DT + O + + + generalized + generalize + 4565 + 4576 + VBN + O + + + waiver + waiver + 4578 + 4584 + NN + O + + + of + of + 4585 + 4587 + IN + O + + + a + a + 4588 + 4589 + DT + O + + + congressionally + congressionally + 4590 + 4605 + RB + O + + + mandated + mandate + 4606 + 4614 + VBN + O + + + stipulation + stipulation + 4616 + 4627 + NN + O + + + . + . + 4627 + 4628 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (DT an) (JJ open) (NN question)) (SBAR (IN whether) (S (NP (DT the) (NN administration)) (VP (VBZ has) (NP (DT the) (NN authority) (S (VP (TO to) (VP (VB grant) (NP (NP (PDT such) (DT a)) (VP (VBN generalized) (NP (NP (NN waiver)) (PP (IN of) (NP (DT a) (ADJP (RB congressionally) (VBN mandated)) (NN stipulation)))))))))))))) (. 
.))) + + + ROOT + question + + + question + It + + + question + is + + + question + an + + + question + open + + + has + whether + + + administration + the + + + has + administration + + + question + has + + + authority + the + + + has + authority + + + grant + to + + + authority + grant + + + a + such + + + grant + a + + + a + generalized + + + generalized + waiver + + + waiver + of + + + stipulation + a + + + mandated + congressionally + + + stipulation + mandated + + + of + stipulation + + + + + ROOT + question + + + question + It + + + question + is + + + question + an + + + question + open + + + has + whether + + + administration + the + + + has + administration + + + question + has + + + authority + the + + + has + authority + + + grant + to + + + authority + grant + + + a + such + + + grant + a + + + a + generalized + + + generalized + waiver + + + stipulation + a + + + mandated + congressionally + + + stipulation + mandated + + + waiver + stipulation + + + + + ROOT + question + + + question + It + + + question + is + + + question + an + + + question + open + + + has + whether + + + administration + the + + + has + administration + + + question + has + + + authority + the + + + has + authority + + + grant + to + + + authority + grant + + + a + such + + + grant + a + + + a + generalized + + + generalized + waiver + + + stipulation + a + + + mandated + congressionally + + + stipulation + mandated + + + waiver + stipulation + + + + + + + The + the + 4629 + 4632 + DT + O + + + best + best + 4633 + 4637 + JJS + O + + + way + way + 4638 + 4641 + NN + O + + + to + to + 4642 + 4644 + TO + O + + + answer + answer + 4645 + 4651 + VB + O + + + that + that + 4653 + 4657 + DT + O + + + question + question + 4658 + 4666 + NN + O + + + and + and + 4667 + 4670 + CC + O + + + to + to + 4671 + 4673 + TO + O + + + resolve + resolve + 4674 + 4681 + VB + O + + + all + all + 4682 + 4685 + DT + O + + + doubt + doubt + 4686 + 4691 + NN + O + + + is + be + 4693 + 4695 + VBZ + O + + 
+ to + to + 4696 + 4698 + TO + O + + + do + do + 4699 + 4701 + VB + O + + + what + what + 4702 + 4706 + WP + O + + + Congress + Congress + 4707 + 4715 + NNP + ORGANIZATION + + + did + do + 4716 + 4719 + VBD + O + + + last + last + 4720 + 4724 + JJ + DATE + THIS P1Y OFFSET P-1Y + + + + year + year + 4725 + 4729 + NN + DATE + THIS P1Y OFFSET P-1Y + + + + : + : + 4729 + 4730 + : + O + + + Change + Change + 4732 + 4738 + NNP + O + + + the + the + 4739 + 4742 + DT + O + + + law + law + 4743 + 4746 + NN + O + + + . + . + 4746 + 4747 + . + O + + + (ROOT (S (NP (NP (DT The) (JJS best) (NN way)) (SBAR (S (VP (VP (TO to) (VP (VB answer) (NP (DT that) (NN question)))) (CC and) (VP (TO to) (VP (VB resolve) (ADVP (DT all) (NN doubt)))))))) (VP (VBZ is) (S (VP (TO to) (VP (VB do) (NP (SBAR (WHNP (WP what)) (S (NP (NNP Congress)) (VP (VBD did) (NP-TMP (JJ last) (NN year))))) (: :) (NP (NP (NNP Change)) (NP (DT the) (NN law)))))))) (. .))) + + + ROOT + is + + + way + The + + + way + best + + + is + way + + + answer + to + + + way + answer + + + question + that + + + answer + question + + + answer + and + + + resolve + to + + + answer + resolve + + + doubt + all + + + resolve + doubt + + + do + to + + + is + do + + + did + what + + + did + Congress + + + Change + did + + + year + last + + + did + year + + + do + Change + + + law + the + + + Change + law + + + + + ROOT + is + + + way + The + + + way + best + + + is + way + + + answer + to + + + way + answer + + + question + that + + + answer + question + + + resolve + to + + + answer + resolve + + + doubt + all + + + resolve + doubt + + + do + to + + + is + do + + + did + what + + + did + Congress + + + Change + did + + + year + last + + + did + year + + + do + Change + + + law + the + + + Change + law + + + + + ROOT + is + + + way + The + + + way + best + + + is + way + + + answer + to + + + way + answer + + + question + that + + + answer + question + + + resolve + to + + + way + resolve + + + answer + resolve + + + doubt + all + + 
+ resolve + doubt + + + do + to + + + is + do + + + did + what + + + did + Congress + + + Change + did + + + year + last + + + did + year + + + do + Change + + + law + the + + + Change + law + + + + + + + That + that + 4753 + 4757 + DT + O + + + is + be + 4758 + 4760 + VBZ + O + + + why + why + 4761 + 4764 + WRB + O + + + we + we + 4765 + 4767 + PRP + O + + + must + must + 4768 + 4772 + MD + O + + + now + now + 4773 + 4776 + RB + DATE + PRESENT_REF + PRESENT_REF + + + override + override + 4777 + 4785 + VB + O + + + the + the + 4787 + 4790 + DT + O + + + veto + veto + 4791 + 4795 + NN + O + + + . + . + 4795 + 4796 + . + O + + + (ROOT (S (NP (DT That)) (VP (VBZ is) (SBAR (WHADVP (WRB why)) (S (NP (PRP we)) (VP (MD must) (ADVP (RB now)) (VP (VB override) (NP (DT the) (NN veto))))))) (. .))) + + + ROOT + is + + + is + That + + + override + why + + + override + we + + + override + must + + + override + now + + + is + override + + + veto + the + + + override + veto + + + + + ROOT + is + + + is + That + + + override + why + + + override + we + + + override + must + + + override + now + + + is + override + + + veto + the + + + override + veto + + + + + ROOT + is + + + is + That + + + override + why + + + override + we + + + override + must + + + override + now + + + is + override + + + veto + the + + + override + veto + + + + + + + Equally + equally + 4800 + 4807 + RB + O + + + important + important + 4808 + 4817 + JJ + O + + + , + , + 4817 + 4818 + , + O + + + the + the + 4819 + 4822 + DT + O + + + veto + veto + 4823 + 4827 + NN + O + + + sends + send + 4828 + 4833 + VBZ + O + + + exactly + exactly + 4835 + 4842 + RB + O + + + the + the + 4843 + 4846 + DT + O + + + wrong + wrong + 4847 + 4852 + JJ + O + + + signal + signal + 4853 + 4859 + NN + O + + + . + . + 4859 + 4860 + . + O + + + (ROOT (S (S (ADJP (RB Equally) (JJ important))) (, ,) (NP (DT the) (NN veto)) (VP (VBZ sends) (NP (RB exactly) (DT the) (JJ wrong) (NN signal))) (. 
.))) + + + ROOT + sends + + + important + Equally + + + sends + important + + + veto + the + + + sends + veto + + + signal + exactly + + + signal + the + + + signal + wrong + + + sends + signal + + + + + ROOT + sends + + + important + Equally + + + sends + important + + + veto + the + + + sends + veto + + + signal + exactly + + + signal + the + + + signal + wrong + + + sends + signal + + + + + ROOT + sends + + + important + Equally + + + sends + important + + + veto + the + + + sends + veto + + + signal + exactly + + + signal + the + + + signal + wrong + + + sends + signal + + + + + + + The + the + 4864 + 4867 + DT + O + + + President + President + 4868 + 4877 + NNP + O + + + says + say + 4878 + 4882 + VBZ + O + + + he + he + 4883 + 4885 + PRP + O + + + does + do + 4886 + 4890 + VBZ + O + + + not + not + 4891 + 4894 + RB + O + + + want + want + 4895 + 4899 + VB + O + + + to + to + 4901 + 4903 + TO + O + + + isolate + isolate + 4904 + 4911 + VB + O + + + the + the + 4912 + 4915 + DT + O + + + Government + government + 4916 + 4926 + NN + O + + + of + of + 4927 + 4929 + IN + O + + + China + China + 4930 + 4935 + NNP + LOCATION + + + . + . + 4935 + 4936 + . + O + + + (ROOT (S (NP (DT The) (NNP President)) (VP (VBZ says) (SBAR (S (NP (PRP he)) (VP (VBZ does) (RB not) (VP (VB want) (S (VP (TO to) (VP (VB isolate) (NP (NP (DT the) (NN Government)) (PP (IN of) (NP (NNP China)))))))))))) (. 
.))) + + + ROOT + says + + + President + The + + + says + President + + + want + he + + + want + does + + + want + not + + + says + want + + + isolate + to + + + want + isolate + + + Government + the + + + isolate + Government + + + Government + of + + + of + China + + + + + ROOT + says + + + President + The + + + says + President + + + want + he + + + want + does + + + want + not + + + says + want + + + isolate + to + + + want + isolate + + + Government + the + + + isolate + Government + + + Government + China + + + + + ROOT + says + + + President + The + + + says + President + + + want + he + + + want + does + + + want + not + + + says + want + + + isolate + to + + + want + isolate + + + Government + the + + + isolate + Government + + + Government + China + + + + + + + Neither + neither + 4938 + 4945 + DT + O + + + do + do + 4946 + 4948 + VBP + O + + + I. + i. + 4949 + 4951 + NN + O + + + But + but + 4955 + 4958 + CC + O + + + to + to + 4959 + 4961 + TO + O + + + the + the + 4962 + 4965 + DT + O + + + extent + extent + 4966 + 4972 + NN + O + + + that + that + 4973 + 4977 + IN + O + + + it + it + 4978 + 4980 + PRP + O + + + is + be + 4981 + 4983 + VBZ + O + + + isolated + isolate + 4984 + 4992 + VBN + O + + + , + , + 4992 + 4993 + , + O + + + the + the + 4995 + 4998 + DT + O + + + Government + government + 4999 + 5009 + NN + O + + + of + of + 5010 + 5012 + IN + O + + + China + China + 5013 + 5018 + NNP + LOCATION + + + isolated + isolate + 5019 + 5027 + VBD + O + + + itself + itself + 5029 + 5035 + PRP + O + + + . + . + 5035 + 5036 + . + O + + + (ROOT (S (S (NP (DT Neither)) (VP (VBP do) (NP (NN I.)))) (CC But) (S (PP (TO to) (NP (NP (DT the) (NN extent)) (SBAR (IN that) (S (NP (PRP it)) (VP (VBZ is) (ADJP (VBN isolated))))))) (, ,) (NP (NP (DT the) (NN Government)) (PP (IN of) (NP (NNP China)))) (VP (VBD isolated) (NP (PRP itself)))) (. .))) + + + ROOT + do + + + do + Neither + + + do + I. 
+ + + do + But + + + isolated + to + + + extent + the + + + to + extent + + + isolated + that + + + isolated + it + + + isolated + is + + + extent + isolated + + + Government + the + + + isolated + Government + + + Government + of + + + of + China + + + do + isolated + + + isolated + itself + + + + + ROOT + do + + + do + Neither + + + do + I. + + + extent + the + + + isolated + extent + + + isolated + that + + + isolated + it + + + isolated + is + + + extent + isolated + + + Government + the + + + isolated + Government + + + Government + China + + + do + isolated + + + isolated + itself + + + + + ROOT + do + + + do + Neither + + + do + I. + + + extent + the + + + isolated + extent + + + isolated + that + + + isolated + it + + + isolated + is + + + extent + isolated + + + Government + the + + + isolated + Government + + + Government + China + + + do + isolated + + + isolated + itself + + + + + + + It + it + 5037 + 5039 + PRP + O + + + isolated + isolate + 5040 + 5048 + VBD + O + + + itself + itself + 5049 + 5055 + PRP + O + + + from + from + 5056 + 5060 + IN + O + + + its + its + 5061 + 5064 + PRP$ + O + + + own + own + 5065 + 5068 + JJ + O + + + people + people + 5070 + 5076 + NNS + O + + + and + and + 5077 + 5080 + CC + O + + + from + from + 5081 + 5085 + IN + O + + + the + the + 5086 + 5089 + DT + O + + + community + community + 5090 + 5099 + NN + O + + + of + of + 5100 + 5102 + IN + O + + + na + na + 5103 + 5105 + TO + O + + + - + - + 5105 + 5106 + : + O + + + tions + tion + 5108 + 5113 + NNS + O + + + by + by + 5114 + 5116 + IN + O + + + murdering + murder + 5117 + 5126 + VBG + O + + + its + its + 5127 + 5130 + PRP$ + O + + + own + own + 5131 + 5134 + JJ + O + + + citizens + citizen + 5135 + 5143 + NNS + O + + + , + , + 5143 + 5144 + , + O + + + by + by + 5145 + 5147 + IN + O + + + denying + deny + 5149 + 5156 + VBG + O + + + to + to + 5157 + 5159 + TO + O + + + those + those + 5160 + 5165 + DT + O + + + citizens + citizen + 5166 + 5174 + NNS + O + + + even + 
even + 5175 + 5179 + RB + O + + + the + the + 5180 + 5183 + DT + O + + + most + most + 5185 + 5189 + JJS + O + + + basic + basic + 5190 + 5195 + JJ + O + + + of + of + 5196 + 5198 + IN + O + + + human + human + 5199 + 5204 + JJ + O + + + rights + rights + 5205 + 5211 + NNS + O + + + . + . + 5211 + 5212 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBD isolated) (NP (PRP itself)) (PP (IN from) (NP (PRP$ its) (NP (NP (JJ own) (NNS people)) (CC and) (PP (IN from) (NP (NP (DT the) (NN community)) (SBAR (IN of) (S (VP (TO na))))) (: -))) (NNS tions))) (PP (IN by) (S (VP (VBG murdering) (NP (PRP$ its) (JJ own) (NNS citizens))))) (, ,) (PP (IN by) (S (VP (VBG denying) (PP (TO to) (NP (DT those) (NNS citizens) (RB even))) (NP (NP (DT the) (JJS most) (JJ basic)) (PP (IN of) (NP (JJ human) (NNS rights)))))))) (. .))) + + + ROOT + isolated + + + isolated + It + + + isolated + itself + + + isolated + from + + + tions + its + + + people + own + + + tions + people + + + people + and + + + people + from + + + community + the + + + from + community + + + na + of + + + community + na + + + from + tions + + + isolated + by + + + by + murdering + + + citizens + its + + + citizens + own + + + murdering + citizens + + + isolated + by + + + by + denying + + + denying + to + + + citizens + those + + + to + citizens + + + citizens + even + + + basic + the + + + basic + most + + + denying + basic + + + basic + of + + + rights + human + + + of + rights + + + + + ROOT + isolated + + + isolated + It + + + isolated + itself + + + tions + its + + + people + own + + + tions + people + + + people + from + + + community + the + + + from + community + + + na + of + + + community + na + + + isolated + tions + + + isolated + murdering + + + citizens + its + + + citizens + own + + + murdering + citizens + + + isolated + denying + + + citizens + those + + + denying + citizens + + + citizens + even + + + basic + the + + + basic + most + + + denying + basic + + + rights + human + + + basic + rights + + + + + 
ROOT + isolated + + + isolated + It + + + isolated + itself + + + tions + its + + + people + own + + + tions + people + + + people + from + + + tions + from + + + community + the + + + from + community + + + na + of + + + community + na + + + isolated + tions + + + isolated + murdering + + + citizens + its + + + citizens + own + + + murdering + citizens + + + isolated + denying + + + citizens + those + + + denying + citizens + + + citizens + even + + + basic + the + + + basic + most + + + denying + basic + + + rights + human + + + basic + rights + + + + + + + Our + we + 5213 + 5216 + PRP$ + O + + + re + re + 5217 + 5219 + SYM + O + + + - + - + 5219 + 5220 + : + O + + + sponse + sponse + 5222 + 5228 + NN + O + + + to + to + 5229 + 5231 + TO + O + + + the + the + 5232 + 5235 + DT + O + + + urgent + urgent + 5236 + 5242 + JJ + O + + + and + and + 5243 + 5246 + CC + O + + + well-founded + well-founded + 5247 + 5259 + JJ + O + + + fears + fear + 5261 + 5266 + NNS + O + + + of + of + 5267 + 5269 + IN + O + + + the + the + 5270 + 5273 + DT + O + + + Chinese + chinese + 5274 + 5281 + JJ + MISC + + + students + student + 5282 + 5290 + NNS + O + + + in + in + 5291 + 5293 + IN + O + + + our + we + 5294 + 5297 + PRP$ + O + + + country + country + 5299 + 5306 + NN + O + + + was + be + 5307 + 5310 + VBD + O + + + not + not + 5311 + 5314 + RB + O + + + taken + take + 5315 + 5320 + VBN + O + + + to + to + 5321 + 5323 + TO + O + + + isolate + isolate + 5324 + 5331 + VB + O + + + anyone + anyone + 5333 + 5339 + NN + O + + + : + : + 5339 + 5340 + : + O + + + It + it + 5341 + 5343 + PRP + O + + + was + be + 5344 + 5347 + VBD + O + + + an + a + 5348 + 5350 + DT + O + + + appropriate + appropriate + 5351 + 5362 + JJ + O + + + Ameri + Ameri + 5363 + 5368 + NNP + LOCATION + + + - + - + 5368 + 5369 + : + O + + + can + can + 5371 + 5374 + MD + O + + + response + response + 5375 + 5383 + VB + O + + + to + to + 5384 + 5386 + TO + O + + + the + the + 5387 + 5390 + DT + O + + + victims + victim 
+ 5391 + 5398 + NNS + O + + + of + of + 5399 + 5401 + IN + O + + + murder + murder + 5402 + 5408 + NN + O + + + by + by + 5410 + 5412 + IN + O + + + government + government + 5413 + 5423 + NN + O + + + . + . + 5423 + 5424 + . + O + + + (ROOT (FRAG (NP (NP (PRP$ Our)) (SBAR (S (SBAR (X (SYM re)) (S (NP (NP (: -) (NP (NP (NN sponse)) (PP (TO to) (NP (DT the) (JJ urgent)))) (CC and) (NP (NP (JJ well-founded) (NNS fears)) (PP (IN of) (NP (DT the) (JJ Chinese) (NNS students))))) (PP (IN in) (NP (PRP$ our) (NN country)))) (VP (VBD was) (RB not) (VP (VBN taken) (S (VP (TO to) (VP (VB isolate) (NP (NN anyone))))))))) (: :) (NP (PRP It)) (VP (VBD was) (NP (DT an) (JJ appropriate) (NNP Ameri)))))) (: -) (VP (MD can) (VP (VB response) (PP (TO to) (NP (NP (DT the) (NNS victims)) (PP (IN of) (NP (NN murder))))) (PP (IN by) (NP (NN government))))) (. .))) + + + ROOT + Our + + + taken + re + + + taken + sponse + + + sponse + to + + + urgent + the + + + to + urgent + + + sponse + and + + + fears + well-founded + + + sponse + fears + + + fears + of + + + students + the + + + students + Chinese + + + of + students + + + sponse + in + + + country + our + + + in + country + + + taken + was + + + taken + not + + + Ameri + taken + + + isolate + to + + + taken + isolate + + + isolate + anyone + + + Ameri + It + + + Ameri + was + + + Ameri + an + + + Ameri + appropriate + + + Our + Ameri + + + response + can + + + Our + response + + + response + to + + + victims + the + + + to + victims + + + victims + of + + + of + murder + + + response + by + + + by + government + + + + + ROOT + Our + + + taken + re + + + taken + sponse + + + urgent + the + + + sponse + urgent + + + fears + well-founded + + + sponse + fears + + + students + the + + + students + Chinese + + + fears + students + + + country + our + + + sponse + country + + + taken + was + + + taken + not + + + Ameri + taken + + + isolate + to + + + taken + isolate + + + isolate + anyone + + + Ameri + It + + + Ameri + was + + + Ameri + an 
+ + + Ameri + appropriate + + + Our + Ameri + + + response + can + + + Our + response + + + victims + the + + + response + victims + + + victims + murder + + + response + government + + + + + ROOT + Our + + + taken + re + + + taken + sponse + + + urgent + the + + + sponse + urgent + + + fears + well-founded + + + sponse + fears + + + taken + fears + + + students + the + + + students + Chinese + + + fears + students + + + country + our + + + sponse + country + + + taken + was + + + taken + not + + + Ameri + taken + + + isolate + to + + + taken + isolate + + + isolate + anyone + + + Ameri + It + + + Ameri + was + + + Ameri + an + + + Ameri + appropriate + + + Our + Ameri + + + response + can + + + Our + response + + + victims + the + + + response + victims + + + victims + murder + + + response + government + + + + + + + I + I + 5428 + 5429 + PRP + O + + + hope + hope + 5430 + 5434 + VBP + O + + + my + my + 5435 + 5437 + PRP$ + O + + + colleagues + colleague + 5438 + 5448 + NNS + O + + + will + will + 5449 + 5453 + MD + O + + + repeat + repeat + 5454 + 5460 + VB + O + + + their + they + 5462 + 5467 + PRP$ + O + + + unanimous + unanimous + 5468 + 5477 + JJ + O + + + approval + approval + 5478 + 5486 + NN + O + + + of + of + 5487 + 5489 + IN + O + + + the + the + 5490 + 5493 + DT + O + + + bill + bill + 5494 + 5498 + NN + O + + + last + last + 5500 + 5504 + JJ + DATE + THIS P1Y OFFSET P-1Y + + + + year + year + 5505 + 5509 + NN + DATE + THIS P1Y OFFSET P-1Y + + + + with + with + 5510 + 5514 + IN + O + + + an + a + 5515 + 5517 + DT + O + + + equally + equally + 5518 + 5525 + RB + O + + + strong + strong + 5526 + 5532 + JJ + O + + + vote + vote + 5533 + 5537 + NN + O + + + to + to + 5539 + 5541 + TO + O + + + override + override + 5542 + 5550 + VB + O + + + the + the + 5551 + 5554 + DT + O + + + veto + veto + 5555 + 5559 + NN + O + + + . + . + 5559 + 5560 + . 
+ O + + + (ROOT (S (NP (PRP I)) (VP (VBP hope) (SBAR (S (NP (PRP$ my) (NNS colleagues)) (VP (MD will) (VP (VB repeat) (NP (NP (PRP$ their) (JJ unanimous) (NN approval)) (PP (IN of) (NP (DT the) (NN bill)))) (NP-TMP (JJ last) (NN year)) (PP (IN with) (NP (DT an) (ADJP (RB equally) (JJ strong)) (NN vote) (S (VP (TO to) (VP (VB override) (NP (DT the) (NN veto)))))))))))) (. .))) + + + ROOT + hope + + + hope + I + + + colleagues + my + + + repeat + colleagues + + + repeat + will + + + hope + repeat + + + approval + their + + + approval + unanimous + + + repeat + approval + + + approval + of + + + bill + the + + + of + bill + + + year + last + + + repeat + year + + + repeat + with + + + vote + an + + + strong + equally + + + vote + strong + + + with + vote + + + override + to + + + vote + override + + + veto + the + + + override + veto + + + + + ROOT + hope + + + hope + I + + + colleagues + my + + + repeat + colleagues + + + repeat + will + + + hope + repeat + + + approval + their + + + approval + unanimous + + + repeat + approval + + + bill + the + + + approval + bill + + + year + last + + + repeat + year + + + vote + an + + + strong + equally + + + vote + strong + + + repeat + vote + + + override + to + + + vote + override + + + veto + the + + + override + veto + + + + + ROOT + hope + + + hope + I + + + colleagues + my + + + repeat + colleagues + + + repeat + will + + + hope + repeat + + + approval + their + + + approval + unanimous + + + repeat + approval + + + bill + the + + + approval + bill + + + year + last + + + repeat + year + + + vote + an + + + strong + equally + + + vote + strong + + + repeat + vote + + + override + to + + + vote + override + + + veto + the + + + override + veto + + + + + + + It + it + 5561 + 5563 + PRP + O + + + is + be + 5564 + 5566 + VBZ + O + + + the + the + 5567 + 5570 + DT + O + + + right + right + 5571 + 5576 + JJ + O + + + thing + thing + 5578 + 5583 + NN + O + + + to + to + 5584 + 5586 + TO + O + + + do + do + 5587 + 5589 + VB + O + 
+ + . + . + 5589 + 5590 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (DT the) (JJ right) (NN thing) (S (VP (TO to) (VP (VB do)))))) (. .))) + + + ROOT + thing + + + thing + It + + + thing + is + + + thing + the + + + thing + right + + + do + to + + + thing + do + + + + + ROOT + thing + + + thing + It + + + thing + is + + + thing + the + + + thing + right + + + do + to + + + thing + do + + + + + ROOT + thing + + + thing + It + + + thing + is + + + thing + the + + + thing + right + + + do + to + + + thing + do + + + + + + + It + it + 5594 + 5596 + PRP + O + + + is + be + 5597 + 5599 + VBZ + O + + + my + my + 5600 + 5602 + PRP$ + O + + + intention + intention + 5603 + 5612 + NN + O + + + to + to + 5613 + 5615 + TO + O + + + proceed + proceed + 5616 + 5623 + VB + O + + + to + to + 5624 + 5626 + TO + O + + + the + the + 5627 + 5630 + DT + O + + + crime + crime + 5632 + 5637 + NN + O + + + legislation + legislation + 5638 + 5649 + NN + O + + + on + on + 5650 + 5652 + IN + O + + + or + or + 5653 + 5655 + CC + O + + + about + about + 5656 + 5661 + IN + DATE + XXXX-02-07 + XXXX-02-07 + + + February + February + 5662 + 5670 + NNP + DATE + XXXX-02-07 + XXXX-02-07 + + + 7 + 7 + 5672 + 5673 + CD + DATE + XXXX-02-07 + XXXX-02-07 + + + , + , + 5673 + 5674 + , + O + + + as + as + 5675 + 5677 + IN + O + + + provided + provide + 5678 + 5686 + VBN + O + + + in + in + 5687 + 5689 + IN + O + + + the + the + 5690 + 5693 + DT + O + + + agreement + agreement + 5694 + 5703 + NN + O + + + we + we + 5704 + 5706 + PRP + O + + + reached + reach + 5708 + 5715 + VBD + O + + + last + last + 5716 + 5720 + JJ + DATE + THIS P1Y OFFSET P-1Y + + + + year + year + 5721 + 5725 + NN + DATE + THIS P1Y OFFSET P-1Y + + + + . + . + 5725 + 5726 + . 
+ O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (PRP$ my) (NN intention) (S (VP (TO to) (VP (VB proceed) (PP (TO to) (NP (DT the) (NN crime) (NN legislation))) (PP (IN on) (CC or) (IN about) (NP (NNP February) (CD 7))) (, ,) (SBAR (IN as) (S (PP (VBN provided) (PP (IN in) (NP (DT the) (NN agreement)))) (NP (PRP we)) (VP (VBD reached) (NP-TMP (JJ last) (NN year)))))))))) (. .))) + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + proceed + to + + + intention + proceed + + + proceed + to + + + legislation + the + + + legislation + crime + + + to + legislation + + + proceed + on + + + on + or + + + on + about + + + on + February + + + February + 7 + + + reached + as + + + reached + provided + + + provided + in + + + agreement + the + + + in + agreement + + + reached + we + + + proceed + reached + + + year + last + + + reached + year + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + proceed + to + + + intention + proceed + + + proceed + proceed + + + legislation + the + + + legislation + crime + + + proceed + legislation + + + proceed + February + + + proceed + February + + + February + 7 + + + reached + as + + + reached + provided + + + provided + in + + + agreement + the + + + in + agreement + + + reached + we + + + proceed + reached + + + year + last + + + reached + year + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + proceed + to + + + intention + proceed + + + intention + proceed + + + proceed + proceed + + + legislation + the + + + legislation + crime + + + proceed + legislation + + + proceed + February + + + proceed + February + + + February + 7 + + + reached + as + + + reached + provided + + + provided + in + + + agreement + the + + + in + agreement + + + reached + we + + + proceed + reached + + + year + last + + + reached + year + + + + + + + Senator + Senator + 5730 + 5737 + NNP + O + + + BIDEN + BIDEN + 5738 + 5743 + NNP + 
PERSON + + + , + , + 5743 + 5744 + , + O + + + the + the + 5745 + 5748 + DT + O + + + chairman + chairman + 5749 + 5757 + NN + O + + + of + of + 5758 + 5760 + IN + O + + + the + the + 5761 + 5764 + DT + O + + + Judiciary + Judiciary + 5766 + 5775 + NNP + ORGANIZATION + + + Committee + Committee + 5776 + 5785 + NNP + ORGANIZATION + + + , + , + 5785 + 5786 + , + O + + + has + have + 5787 + 5790 + VBZ + O + + + proposed + propose + 5791 + 5799 + VBN + O + + + a + a + 5800 + 5801 + DT + O + + + vehicle + vehicle + 5803 + 5810 + NN + O + + + which + which + 5811 + 5816 + WDT + O + + + incorporates + incorporate + 5817 + 5829 + VBZ + O + + + the + the + 5830 + 5833 + DT + O + + + three + three + 5834 + 5839 + CD + NUMBER + 3.0 + + + uncompleted + uncompleted + 5841 + 5852 + JJ + O + + + items + item + 5856 + 5861 + NNS + O + + + of + of + 5863 + 5865 + IN + O + + + the + the + 5867 + 5870 + DT + O + + + Bush + Bush + 5872 + 5876 + NNP + PERSON + + + agenda-another + agenda-another + 5878 + 5892 + JJ + O + + + Federal + Federal + 5893 + 5900 + NNP + O + + + death + death + 5901 + 5906 + NN + O + + + penal + penal + 5907 + 5912 + NN + O + + + - + - + 5912 + 5913 + : + O + + + ty + ty + 5915 + 5917 + NN + O + + + , + , + 5917 + 5918 + , + O + + + habeus + habeus + 5919 + 5925 + NN + O + + + corpus + corpus + 5926 + 5932 + NN + O + + + reform + reform + 5933 + 5939 + NN + O + + + , + , + 5939 + 5940 + , + O + + + and + and + 5941 + 5944 + CC + O + + + exclu + exclu + 5945 + 5950 + NN + O + + + - + - + 5950 + 5951 + : + O + + + sionary + sionary + 5953 + 5960 + JJ + O + + + rule + rule + 5961 + 5965 + NN + O + + + changes-along + changes-along + 5966 + 5979 + NN + O + + + with + with + 5980 + 5984 + IN + O + + + some + some + 5985 + 5989 + DT + O + + + important + important + 5991 + 6000 + JJ + O + + + additional + additional + 6001 + 6011 + JJ + O + + + elements + element + 6012 + 6020 + NNS + O + + + to + to + 6021 + 6023 + TO + O + + + curb + curb + 6024 + 6028 + VB + O + 
+ + drug + drug + 6030 + 6034 + NN + O + + + money-laundering + money-laundering + 6035 + 6051 + NN + O + + + , + , + 6051 + 6052 + , + O + + + the + the + 6053 + 6056 + DT + O + + + DeConcini + DeConcini + 6057 + 6066 + NNP + PERSON + + + assault + assault + 6068 + 6075 + NN + O + + + weapons + weapon + 6076 + 6083 + NNS + O + + + bill + bill + 6084 + 6088 + NN + O + + + , + , + 6088 + 6089 + , + O + + + language + language + 6090 + 6098 + NN + O + + + to + to + 6099 + 6101 + TO + O + + + curb + curb + 6102 + 6106 + VB + O + + + the + the + 6108 + 6111 + DT + O + + + export + export + 6112 + 6118 + NN + O + + + of + of + 6119 + 6121 + IN + O + + + assault + assault + 6122 + 6129 + NN + O + + + weapons + weapon + 6130 + 6137 + NNS + O + + + to + to + 6138 + 6140 + TO + O + + + drug + drug + 6141 + 6145 + NN + O + + + dealers + dealer + 6147 + 6154 + NNS + O + + + in + in + 6155 + 6157 + IN + O + + + Latin + Latin + 6158 + 6163 + NNP + LOCATION + + + America + America + 6164 + 6171 + NNP + LOCATION + + + , + , + 6171 + 6172 + , + O + + + as + as + 6173 + 6175 + RB + O + + + well + well + 6176 + 6180 + RB + O + + + as + as + 6181 + 6183 + IN + O + + + ad + ad + 6184 + 6186 + NN + O + + + - + - + 6186 + 6187 + : + O + + + ditional + ditional + 6189 + 6197 + JJ + O + + + funding + funding + 6198 + 6205 + NN + O + + + for + for + 6206 + 6209 + IN + O + + + law + law + 6210 + 6213 + NN + O + + + enforcement + enforcement + 6214 + 6225 + NN + O + + + personnel + personnel + 6227 + 6236 + NNS + O + + + and + and + 6237 + 6240 + CC + O + + + other + other + 6241 + 6246 + JJ + O + + + matters + matter + 6247 + 6254 + NNS + O + + + . + . + 6254 + 6255 + . 
+ O + + + (ROOT (S (NP (NP (NNP Senator) (NNP BIDEN)) (, ,) (NP (NP (DT the) (NN chairman)) (PP (IN of) (NP (DT the) (NNP Judiciary) (NNP Committee)))) (, ,)) (VP (VBZ has) (VP (VBN proposed) (NP (NP (NP (DT a) (NN vehicle)) (SBAR (WHNP (WDT which)) (S (VP (VBZ incorporates) (S (NP (NP (DT the) (CD three) (JJ uncompleted) (NNS items)) (PP (IN of) (NP (NP (DT the) (NNP Bush)) (NP (NP (NP (JJ agenda-another) (NNP Federal)) (NP (NN death) (NN penal))) (: -) (NP (NP (NN ty)) (, ,) (NP (NN habeus) (NN corpus) (NN reform)) (, ,) (CC and) (NP (NN exclu))) (: -) (NP (NP (JJ sionary) (NN rule) (NN changes-along)) (PP (IN with) (NP (DT some) (JJ important) (JJ additional) (NNS elements)))))))) (VP (TO to) (VP (VB curb) (NP (NP (NP (NN drug) (NN money-laundering)) (, ,) (NP (DT the) (NNP DeConcini) (NN assault) (NNS weapons) (NN bill)) (, ,) (NP (NN language) (S (VP (TO to) (VP (VB curb) (NP (NP (DT the) (NN export)) (PP (IN of) (NP (NN assault) (NNS weapons)))) (PP (TO to) (NP (NP (NN drug) (NNS dealers)) (PP (IN in) (NP (NNP Latin) (NNP America)))))))))) (, ,) (CONJP (RB as) (RB well) (IN as)) (NP (NN ad)))))))))) (: -) (NP (NP (JJ ditional) (NN funding)) (PP (IN for) (NP (NN law) (NN enforcement) (NNS personnel)))) (CC and) (NP (JJ other) (NNS matters))))) (. 
.))) + + + ROOT + proposed + + + BIDEN + Senator + + + proposed + BIDEN + + + chairman + the + + + BIDEN + chairman + + + chairman + of + + + Committee + the + + + Committee + Judiciary + + + of + Committee + + + proposed + has + + + vehicle + a + + + proposed + vehicle + + + incorporates + which + + + vehicle + incorporates + + + items + the + + + items + three + + + items + uncompleted + + + curb + items + + + items + of + + + Bush + the + + + of + Bush + + + Federal + agenda-another + + + Bush + Federal + + + penal + death + + + Federal + penal + + + Federal + ty + + + reform + habeus + + + reform + corpus + + + ty + reform + + + ty + and + + + ty + exclu + + + changes-along + sionary + + + changes-along + rule + + + Federal + changes-along + + + changes-along + with + + + elements + some + + + elements + important + + + elements + additional + + + with + elements + + + curb + to + + + incorporates + curb + + + money-laundering + drug + + + curb + money-laundering + + + bill + the + + + bill + DeConcini + + + bill + assault + + + bill + weapons + + + money-laundering + bill + + + money-laundering + language + + + curb + to + + + language + curb + + + export + the + + + curb + export + + + export + of + + + weapons + assault + + + of + weapons + + + curb + to + + + dealers + drug + + + to + dealers + + + dealers + in + + + America + Latin + + + in + America + + + well + as + + + money-laundering + well + + + well + as + + + money-laundering + ad + + + funding + ditional + + + vehicle + funding + + + funding + for + + + personnel + law + + + personnel + enforcement + + + for + personnel + + + vehicle + and + + + matters + other + + + vehicle + matters + + + + + ROOT + proposed + + + BIDEN + Senator + + + proposed + BIDEN + + + chairman + the + + + BIDEN + chairman + + + Committee + the + + + Committee + Judiciary + + + chairman + Committee + + + proposed + has + + + vehicle + a + + + proposed + vehicle + + + incorporates + which + + + vehicle + incorporates + + + 
items + the + + + items + three + + + items + uncompleted + + + curb + items + + + Bush + the + + + items + Bush + + + Federal + agenda-another + + + Bush + Federal + + + penal + death + + + Federal + penal + + + Federal + ty + + + reform + habeus + + + reform + corpus + + + ty + reform + + + ty + exclu + + + changes-along + sionary + + + changes-along + rule + + + Federal + changes-along + + + elements + some + + + elements + important + + + elements + additional + + + changes-along + elements + + + curb + to + + + incorporates + curb + + + money-laundering + drug + + + curb + money-laundering + + + bill + the + + + bill + DeConcini + + + bill + assault + + + bill + weapons + + + money-laundering + bill + + + money-laundering + language + + + curb + to + + + language + curb + + + export + the + + + curb + export + + + weapons + assault + + + export + weapons + + + dealers + drug + + + curb + dealers + + + America + Latin + + + dealers + America + + + money-laundering + ad + + + funding + ditional + + + vehicle + funding + + + personnel + law + + + personnel + enforcement + + + funding + personnel + + + matters + other + + + vehicle + matters + + + + + ROOT + proposed + + + BIDEN + Senator + + + proposed + BIDEN + + + chairman + the + + + BIDEN + chairman + + + Committee + the + + + Committee + Judiciary + + + chairman + Committee + + + proposed + has + + + vehicle + a + + + proposed + vehicle + + + incorporates + which + + + vehicle + incorporates + + + items + the + + + items + three + + + items + uncompleted + + + curb + items + + + Bush + the + + + items + Bush + + + Federal + agenda-another + + + Bush + Federal + + + penal + death + + + Federal + penal + + + Federal + ty + + + reform + habeus + + + reform + corpus + + + Federal + reform + + + ty + reform + + + Federal + exclu + + + ty + exclu + + + changes-along + sionary + + + changes-along + rule + + + Federal + changes-along + + + elements + some + + + elements + important + + + elements + additional + + + 
changes-along + elements + + + curb + to + + + incorporates + curb + + + money-laundering + drug + + + curb + money-laundering + + + bill + the + + + bill + DeConcini + + + bill + assault + + + bill + weapons + + + money-laundering + bill + + + money-laundering + language + + + curb + to + + + language + curb + + + export + the + + + curb + export + + + weapons + assault + + + export + weapons + + + dealers + drug + + + curb + dealers + + + America + Latin + + + dealers + America + + + curb + ad + + + money-laundering + ad + + + funding + ditional + + + proposed + funding + + + vehicle + funding + + + personnel + law + + + personnel + enforcement + + + funding + personnel + + + matters + other + + + proposed + matters + + + vehicle + matters + + + + + + + I + I + 6259 + 6260 + PRP + O + + + know + know + 6261 + 6265 + VBP + O + + + other + other + 6266 + 6271 + JJ + O + + + Senators + senator + 6272 + 6280 + NNS + O + + + have + have + 6281 + 6285 + VBP + O + + + propos + propo + 6286 + 6292 + NNS + O + + + - + - + 6292 + 6293 + : + O + + + als + al + 6295 + 6298 + NNS + O + + + in + in + 6299 + 6301 + IN + O + + + this + this + 6302 + 6306 + DT + O + + + field + field + 6307 + 6312 + NN + O + + + as + as + 6313 + 6315 + RB + O + + + well + well + 6316 + 6320 + RB + O + + + . + . + 6320 + 6321 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP know) (SBAR (S (NP (JJ other) (NNS Senators)) (VP (VBP have) (NP (NP (NNS propos)) (: -) (NP (NP (NNS als)) (PP (IN in) (NP (NP (DT this) (NN field)) (ADVP (RB as) (RB well)))))))))) (. 
.))) + + + ROOT + know + + + know + I + + + Senators + other + + + have + Senators + + + know + have + + + have + propos + + + propos + als + + + als + in + + + field + this + + + in + field + + + well + as + + + field + well + + + + + ROOT + know + + + know + I + + + Senators + other + + + have + Senators + + + know + have + + + have + propos + + + propos + als + + + field + this + + + als + field + + + well + as + + + field + well + + + + + ROOT + know + + + know + I + + + Senators + other + + + have + Senators + + + know + have + + + have + propos + + + propos + als + + + field + this + + + als + field + + + well + as + + + field + well + + + + + + + The + the + 6325 + 6328 + DT + O + + + most + most + 6329 + 6333 + RBS + O + + + effective + effective + 6334 + 6343 + JJ + O + + + direct + direct + 6344 + 6350 + JJ + O + + + assistance + assistance + 6351 + 6361 + NN + O + + + the + the + 6363 + 6366 + DT + O + + + Federal + Federal + 6367 + 6374 + NNP + O + + + Government + Government + 6375 + 6385 + NNP + O + + + can + can + 6386 + 6389 + MD + O + + + provide + provide + 6390 + 6397 + VB + O + + + to + to + 6399 + 6401 + TO + O + + + States + States + 6402 + 6408 + NNPS + LOCATION + + + for + for + 6409 + 6412 + IN + O + + + the + the + 6413 + 6416 + DT + O + + + purpose + purpose + 6417 + 6424 + NN + O + + + of + of + 6425 + 6427 + IN + O + + + curbing + curb + 6428 + 6435 + VBG + O + + + violent + violent + 6437 + 6444 + JJ + O + + + crime + crime + 6445 + 6450 + NN + O + + + is + be + 6451 + 6453 + VBZ + O + + + additional + additional + 6454 + 6464 + JJ + O + + + resources + resource + 6465 + 6474 + NNS + O + + + for + for + 6476 + 6479 + IN + O + + + law + law + 6480 + 6483 + NN + O + + + enforcement + enforcement + 6484 + 6495 + NN + O + + + , + , + 6495 + 6496 + , + O + + + prosecution + prosecution + 6497 + 6508 + NN + O + + + , + , + 6508 + 6509 + , + O + + + and + and + 6510 + 6513 + CC + O + + + detention + detention + 6515 + 6524 + NN + O + + + . 
+ . + 6524 + 6525 + . + O + + + (ROOT (S (NP (NP (DT The) (ADJP (RBS most) (JJ effective)) (JJ direct) (NN assistance)) (SBAR (S (NP (DT the) (NNP Federal) (NNP Government)) (VP (MD can) (VP (VB provide) (PP (TO to) (NP (NNPS States))) (PP (IN for) (NP (NP (DT the) (NN purpose)) (PP (IN of) (S (VP (VBG curbing) (NP (JJ violent) (NN crime)))))))))))) (VP (VBZ is) (NP (NP (JJ additional) (NNS resources)) (PP (IN for) (NP (NP (NN law) (NN enforcement)) (, ,) (NP (NN prosecution)) (, ,) (CC and) (NP (NN detention)))))) (. .))) + + + ROOT + resources + + + assistance + The + + + effective + most + + + assistance + effective + + + assistance + direct + + + resources + assistance + + + Government + the + + + Government + Federal + + + provide + Government + + + provide + can + + + assistance + provide + + + provide + to + + + to + States + + + provide + for + + + purpose + the + + + for + purpose + + + purpose + of + + + of + curbing + + + crime + violent + + + curbing + crime + + + resources + is + + + resources + additional + + + resources + for + + + enforcement + law + + + for + enforcement + + + enforcement + prosecution + + + enforcement + and + + + enforcement + detention + + + + + ROOT + resources + + + assistance + The + + + effective + most + + + assistance + effective + + + assistance + direct + + + resources + assistance + + + Government + the + + + Government + Federal + + + provide + Government + + + provide + can + + + assistance + provide + + + provide + States + + + purpose + the + + + provide + purpose + + + purpose + curbing + + + crime + violent + + + curbing + crime + + + resources + is + + + resources + additional + + + enforcement + law + + + resources + enforcement + + + enforcement + prosecution + + + enforcement + detention + + + + + ROOT + resources + + + assistance + The + + + effective + most + + + assistance + effective + + + assistance + direct + + + resources + assistance + + + Government + the + + + Government + Federal + + + provide + 
Government + + + provide + can + + + assistance + provide + + + provide + States + + + purpose + the + + + provide + purpose + + + purpose + curbing + + + crime + violent + + + curbing + crime + + + resources + is + + + resources + additional + + + enforcement + law + + + resources + enforcement + + + resources + prosecution + + + enforcement + prosecution + + + resources + detention + + + enforcement + detention + + + + + + + We + we + 6526 + 6528 + PRP + O + + + made + make + 6529 + 6533 + VBD + O + + + a + a + 6534 + 6535 + DT + O + + + good + good + 6536 + 6540 + JJ + O + + + start + start + 6541 + 6546 + NN + O + + + on + on + 6547 + 6549 + IN + O + + + fi + fi + 6550 + 6552 + SYM + O + + + - + - + 6552 + 6553 + : + O + + + nancing + nancing + 6555 + 6562 + NN + O + + + that + that + 6563 + 6567 + WDT + O + + + assistance + assistance + 6568 + 6578 + NN + O + + + last + last + 6579 + 6583 + JJ + DATE + THIS P1Y OFFSET P-1Y + + + + year + year + 6584 + 6588 + NN + DATE + THIS P1Y OFFSET P-1Y + + + + . + . + 6588 + 6589 + . + O + + + (ROOT (S (NP (PRP We)) (VP (VBD made) (NP (DT a) (JJ good) (NN start)) (PP (PP (IN on) (FRAG (X (SYM fi)) (: -) (NP (NN nancing)))) (NP (WDT that) (NN assistance))) (NP-TMP (JJ last) (NN year))) (. 
.))) + + + ROOT + made + + + made + We + + + start + a + + + start + good + + + made + start + + + made + on + + + nancing + fi + + + on + nancing + + + assistance + that + + + on + assistance + + + year + last + + + made + year + + + + + ROOT + made + + + made + We + + + start + a + + + start + good + + + made + start + + + made + on + + + nancing + fi + + + on + nancing + + + assistance + that + + + on + assistance + + + year + last + + + made + year + + + + + ROOT + made + + + made + We + + + start + a + + + start + good + + + made + start + + + made + on + + + nancing + fi + + + on + nancing + + + assistance + that + + + on + assistance + + + year + last + + + made + year + + + + + + + I + I + 6590 + 6591 + PRP + O + + + hope + hope + 6593 + 6597 + VBP + O + + + the + the + 6598 + 6601 + DT + O + + + President + President + 6602 + 6611 + NNP + O + + + 's + 's + 6611 + 6613 + POS + O + + + budget + budget + 6614 + 6620 + NN + O + + + for + for + 6621 + 6624 + IN + O + + + 1991 + 1991 + 6625 + 6629 + CD + DATE + 1991 + + + builds + build + 6631 + 6637 + NNS + O + + + on + on + 6638 + 6640 + IN + O + + + that + that + 6641 + 6645 + DT + O + + + beginning + beginning + 6646 + 6655 + NN + O + + + . + . + 6655 + 6656 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP hope) (NP (NP (NP (DT the) (NNP President) (POS 's)) (NN budget)) (PP (IN for) (NP (NP (CD 1991) (NNS builds)) (PP (IN on) (NP (DT that) (NN beginning))))))) (. 
.))) + + + ROOT + hope + + + hope + I + + + President + the + + + budget + President + + + President + 's + + + hope + budget + + + budget + for + + + builds + 1991 + + + for + builds + + + builds + on + + + beginning + that + + + on + beginning + + + + + ROOT + hope + + + hope + I + + + President + the + + + budget + President + + + hope + budget + + + builds + 1991 + + + budget + builds + + + beginning + that + + + builds + beginning + + + + + ROOT + hope + + + hope + I + + + President + the + + + budget + President + + + hope + budget + + + builds + 1991 + + + budget + builds + + + beginning + that + + + builds + beginning + + + + + + + Following + follow + 6660 + 6669 + VBG + O + + + passage + passage + 6670 + 6677 + NN + O + + + of + of + 6678 + 6680 + IN + O + + + clean + clean + 6681 + 6686 + JJ + O + + + air + air + 6687 + 6690 + NN + O + + + legis + legi + 6691 + 6696 + NN + O + + + - + - + 6696 + 6697 + : + O + + + lation + lation + 6699 + 6705 + NN + O + + + , + , + 6705 + 6706 + , + O + + + we + we + 6707 + 6709 + PRP + O + + + will + will + 6710 + 6714 + MD + O + + + consider + consider + 6715 + 6723 + VB + O + + + national + national + 6724 + 6732 + JJ + O + + + serv + serv + 6733 + 6737 + NN + O + + + - + - + 6737 + 6738 + : + O + + + ice + ice + 6740 + 6743 + NN + O + + + legislation + legislation + 6744 + 6755 + NN + O + + + . + . + 6755 + 6756 + . + O + + + (ROOT (S (PP (VBG Following) (NP (NP (NP (NN passage)) (PP (IN of) (NP (JJ clean) (NN air) (NN legis)))) (: -) (NP (NN lation)))) (, ,) (NP (PRP we)) (VP (MD will) (VP (VB consider) (NP (NP (JJ national) (NN serv)) (: -) (NP (NN ice) (NN legislation))))) (. 
.))) + + + ROOT + consider + + + consider + Following + + + Following + passage + + + passage + of + + + legis + clean + + + legis + air + + + of + legis + + + passage + lation + + + consider + we + + + consider + will + + + serv + national + + + consider + serv + + + legislation + ice + + + serv + legislation + + + + + ROOT + consider + + + consider + passage + + + legis + clean + + + legis + air + + + passage + legis + + + passage + lation + + + consider + we + + + consider + will + + + serv + national + + + consider + serv + + + legislation + ice + + + serv + legislation + + + + + ROOT + consider + + + consider + passage + + + legis + clean + + + legis + air + + + passage + legis + + + passage + lation + + + consider + we + + + consider + will + + + serv + national + + + consider + serv + + + legislation + ice + + + serv + legislation + + + + + + + The + the + 6760 + 6763 + DT + O + + + national + national + 6764 + 6772 + JJ + O + + + 3ervice + 3ervice + 6773 + 6780 + NN + O + + + concept + concept + 6781 + 6788 + NN + O + + + seeks + seek + 6789 + 6794 + VBZ + O + + + to + to + 6796 + 6798 + TO + O + + + reinstate + reinstate + 6799 + 6808 + VB + O + + + at + at + 6809 + 6811 + IN + O + + + a + a + 6812 + 6813 + DT + O + + + national + national + 6814 + 6822 + JJ + O + + + level + level + 6823 + 6828 + NN + O + + + the + the + 6829 + 6832 + DT + O + + + sense + sense + 6834 + 6839 + NN + O + + + of + of + 6840 + 6842 + IN + O + + + community + community + 6843 + 6852 + NN + O + + + , + , + 6852 + 6853 + , + O + + + participation + participation + 6854 + 6867 + NN + O + + + , + , + 6867 + 6868 + , + O + + + and + and + 6869 + 6872 + CC + O + + + self-help + self-help + 6874 + 6883 + NN + O + + + that + that + 6884 + 6888 + WDT + O + + + are + be + 6889 + 6892 + VBP + O + + + all + all + 6893 + 6896 + DT + O + + + part + part + 6897 + 6901 + NN + O + + + of + of + 6902 + 6904 + IN + O + + + the + the + 6905 + 6908 + DT + O + + + Amer + Amer + 6909 + 6913 + NNP + 
O + + + - + - + 6913 + 6914 + : + O + + + ican + ican + 6916 + 6920 + JJ + O + + + tradition + tradition + 6921 + 6930 + NN + O + + + . + . + 6930 + 6931 + . + O + + + (ROOT (S (NP (DT The) (JJ national) (NN 3ervice) (NN concept)) (VP (VBZ seeks) (S (VP (TO to) (VP (VB reinstate) (PP (IN at) (NP (DT a) (JJ national) (NN level))) (NP (NP (DT the) (NN sense)) (PP (IN of) (NP (NP (NN community) (, ,) (NN participation) (, ,) (CC and) (NN self-help)) (SBAR (WHNP (WDT that)) (S (VP (VBP are) (NP (NP (NP (DT all) (NN part)) (PP (IN of) (NP (DT the) (NNP Amer)))) (: -) (NP (JJ ican) (NN tradition))))))))))))) (. .))) + + + ROOT + seeks + + + concept + The + + + concept + national + + + concept + 3ervice + + + seeks + concept + + + reinstate + to + + + seeks + reinstate + + + reinstate + at + + + level + a + + + level + national + + + at + level + + + sense + the + + + reinstate + sense + + + sense + of + + + of + community + + + community + participation + + + community + and + + + community + self-help + + + part + that + + + part + are + + + part + all + + + community + part + + + part + of + + + Amer + the + + + of + Amer + + + tradition + ican + + + part + tradition + + + + + ROOT + seeks + + + concept + The + + + concept + national + + + concept + 3ervice + + + seeks + concept + + + reinstate + to + + + seeks + reinstate + + + level + a + + + level + national + + + reinstate + level + + + sense + the + + + reinstate + sense + + + sense + community + + + community + participation + + + community + self-help + + + part + that + + + part + are + + + part + all + + + community + part + + + Amer + the + + + part + Amer + + + tradition + ican + + + part + tradition + + + + + ROOT + seeks + + + concept + The + + + concept + national + + + concept + 3ervice + + + seeks + concept + + + reinstate + to + + + seeks + reinstate + + + level + a + + + level + national + + + reinstate + level + + + sense + the + + + reinstate + sense + + + sense + community + + + sense + 
participation + + + community + participation + + + sense + self-help + + + community + self-help + + + part + that + + + part + are + + + part + all + + + community + part + + + Amer + the + + + part + Amer + + + tradition + ican + + + part + tradition + + + + + + + National + National + 6935 + 6943 + NNP + O + + + service + service + 6944 + 6951 + NN + O + + + will + will + 6952 + 6956 + MD + O + + + give + give + 6957 + 6961 + VB + O + + + our + we + 6962 + 6965 + PRP$ + O + + + young + young + 6966 + 6971 + JJ + O + + + people + people + 6973 + 6979 + NNS + O + + + an + a + 6980 + 6982 + DT + O + + + opportunity + opportunity + 6983 + 6994 + NN + O + + + to + to + 6995 + 6997 + TO + O + + + use + use + 6998 + 7001 + VB + O + + + their + they + 7002 + 7007 + PRP$ + O + + + energy + energy + 7009 + 7015 + NN + O + + + and + and + 7016 + 7019 + CC + O + + + ideals + ideal + 7020 + 7026 + NNS + O + + + to + to + 7027 + 7029 + TO + O + + + help + help + 7030 + 7034 + VB + O + + + the + the + 7035 + 7038 + DT + O + + + larger + larger + 7039 + 7045 + JJR + O + + + so + so + 7046 + 7048 + IN + O + + + - + - + 7048 + 7049 + : + O + + + ciety + ciety + 7051 + 7056 + NN + O + + + . + . + 7056 + 7057 + . + O + + + (ROOT (S (NP (NNP National) (NN service)) (VP (MD will) (VP (VB give) (NP (PRP$ our) (JJ young) (NNS people)) (NP (DT an) (NN opportunity) (S (VP (TO to) (VP (VB use) (S (NP (PRP$ their) (NN energy) (CC and) (NNS ideals)) (VP (TO to) (VP (VB help) (NP (DT the) (JJR larger)) (ADVP (IN so)) (: -) (FRAG (NP (NN ciety)))))))))))) (. 
.))) + + + ROOT + give + + + service + National + + + give + service + + + give + will + + + people + our + + + people + young + + + give + people + + + opportunity + an + + + give + opportunity + + + use + to + + + opportunity + use + + + energy + their + + + help + energy + + + energy + and + + + energy + ideals + + + help + to + + + use + help + + + larger + the + + + help + larger + + + help + so + + + help + ciety + + + + + ROOT + give + + + service + National + + + give + service + + + give + will + + + people + our + + + people + young + + + give + people + + + opportunity + an + + + give + opportunity + + + use + to + + + opportunity + use + + + energy + their + + + help + energy + + + energy + ideals + + + help + to + + + use + help + + + larger + the + + + help + larger + + + help + so + + + help + ciety + + + + + ROOT + give + + + service + National + + + give + service + + + give + will + + + people + our + + + people + young + + + give + people + + + opportunity + an + + + give + opportunity + + + use + to + + + opportunity + use + + + energy + their + + + help + energy + + + energy + ideals + + + help + ideals + + + help + to + + + use + help + + + larger + the + + + help + larger + + + help + so + + + help + ciety + + + + + + + It + it + 7058 + 7060 + PRP + O + + + can + can + 7061 + 7064 + MD + O + + + give + give + 7065 + 7069 + VB + O + + + an + a + 7070 + 7072 + DT + O + + + alternative + alternative + 7073 + 7084 + JJ + O + + + to + to + 7085 + 7087 + TO + O + + + that + that + 7088 + 7092 + DT + O + + + half + half + 7094 + 7098 + NN + O + + + of + of + 7099 + 7101 + IN + O + + + our + we + 7102 + 7105 + PRP$ + O + + + young + young + 7106 + 7111 + JJ + O + + + people + people + 7112 + 7118 + NNS + O + + + who + who + 7119 + 7122 + WP + O + + + do + do + 7123 + 7125 + VBP + O + + + not + not + 7126 + 7129 + RB + O + + + go + go + 7131 + 7133 + VB + O + + + to + to + 7134 + 7136 + TO + O + + + college + college + 7137 + 7144 + NN + O + + + . + . 
+ 7144 + 7145 + . + O + + + (ROOT (S (NP (PRP It)) (VP (MD can) (VP (VB give) (NP (DT an) (JJ alternative)) (PP (TO to) (NP (NP (DT that) (NN half)) (PP (IN of) (NP (PRP$ our) (JJ young) (NNS people))) (SBAR (WHNP (WP who)) (S (VP (VBP do) (RB not) (VP (VB go) (PP (TO to) (NP (NN college))))))))))) (. .))) + + + ROOT + give + + + give + It + + + give + can + + + alternative + an + + + give + alternative + + + give + to + + + half + that + + + to + half + + + half + of + + + people + our + + + people + young + + + of + people + + + go + who + + + go + do + + + go + not + + + half + go + + + go + to + + + to + college + + + + + ROOT + give + + + give + It + + + give + can + + + alternative + an + + + give + alternative + + + half + that + + + give + half + + + people + our + + + people + young + + + half + people + + + go + who + + + go + do + + + go + not + + + half + go + + + go + college + + + + + ROOT + give + + + give + It + + + give + can + + + alternative + an + + + give + alternative + + + half + that + + + give + half + + + people + our + + + people + young + + + half + people + + + go + who + + + go + do + + + go + not + + + half + go + + + go + college + + + + + + + It + it + 7146 + 7148 + PRP + O + + + will + will + 7149 + 7153 + MD + O + + + give + give + 7154 + 7158 + VB + O + + + them + they + 7159 + 7163 + PRP + O + + + a + a + 7164 + 7165 + DT + O + + + way + way + 7166 + 7169 + NN + O + + + to + to + 7171 + 7173 + TO + O + + + make + make + 7174 + 7178 + VB + O + + + a + a + 7179 + 7180 + DT + O + + + contribution + contribution + 7181 + 7193 + NN + O + + + and + and + 7194 + 7197 + CC + O + + + , + , + 7197 + 7198 + , + O + + + at + at + 7199 + 7201 + IN + O + + + the + the + 7202 + 7205 + DT + O + + + same + same + 7207 + 7211 + JJ + O + + + time + time + 7212 + 7216 + NN + O + + + , + , + 7216 + 7217 + , + O + + + earn + earn + 7218 + 7222 + VBP + O + + + a + a + 7223 + 7224 + DT + O + + + stake + stake + 7225 + 7230 + NN + O + + + in + in + 7231 
+ 7233 + IN + O + + + their + they + 7234 + 7239 + PRP$ + O + + + own + own + 7240 + 7243 + JJ + O + + + education + education + 7245 + 7254 + NN + O + + + or + or + 7255 + 7257 + CC + O + + + their + they + 7258 + 7263 + PRP$ + O + + + first + first + 7264 + 7269 + JJ + ORDINAL + 1.0 + + + home + home + 7270 + 7274 + NN + O + + + . + . + 7274 + 7275 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VP (MD will) (VP (VB give) (NP (PRP them)) (NP (DT a) (NN way) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN contribution)))))))) (CC and) (PRN (, ,) (PP (IN at) (NP (DT the) (JJ same) (NN time))) (, ,)) (VP (VBP earn) (NP (DT a) (NN stake)) (PP (IN in) (NP (NP (PRP$ their) (JJ own) (NN education)) (CC or) (NP (PRP$ their) (JJ first) (NN home)))))) (. .))) + + + ROOT + give + + + give + It + + + give + will + + + give + them + + + way + a + + + give + way + + + make + to + + + way + make + + + contribution + a + + + make + contribution + + + give + and + + + give + at + + + time + the + + + time + same + + + at + time + + + give + earn + + + stake + a + + + earn + stake + + + earn + in + + + education + their + + + education + own + + + in + education + + + education + or + + + home + their + + + home + first + + + education + home + + + + + ROOT + give + + + give + It + + + give + will + + + give + them + + + way + a + + + give + way + + + make + to + + + way + make + + + contribution + a + + + make + contribution + + + give + at + + + time + the + + + time + same + + + at + time + + + give + earn + + + stake + a + + + earn + stake + + + education + their + + + education + own + + + earn + education + + + home + their + + + home + first + + + education + home + + + + + ROOT + give + + + give + It + + + earn + It + + + give + will + + + give + them + + + way + a + + + give + way + + + make + to + + + way + make + + + contribution + a + + + make + contribution + + + give + at + + + time + the + + + time + same + + + at + time + + + give + earn + + + stake + a + + + earn + stake + 
+ + education + their + + + education + own + + + earn + education + + + home + their + + + home + first + + + earn + home + + + education + home + + + + + + + For + for + 7279 + 7282 + IN + O + + + the + the + 7283 + 7286 + DT + O + + + many + many + 7287 + 7291 + JJ + O + + + young + young + 7292 + 7297 + JJ + O + + + people + people + 7298 + 7304 + NNS + O + + + who + who + 7305 + 7308 + WP + O + + + know + know + 7310 + 7314 + VBP + O + + + that + that + 7315 + 7319 + IN + O + + + their + they + 7320 + 7325 + PRP$ + O + + + desire + desire + 7326 + 7332 + NN + O + + + to + to + 7333 + 7335 + TO + O + + + attend + attend + 7336 + 7342 + VB + O + + + col + col + 7343 + 7346 + NN + O + + + - + - + 7346 + 7347 + : + O + + + lege + lege + 7349 + 7353 + NN + O + + + poses + pose + 7354 + 7359 + VBZ + O + + + an + a + 7360 + 7362 + DT + O + + + enormous + enormous + 7363 + 7371 + JJ + O + + + financial + financial + 7372 + 7381 + JJ + O + + + sacri + sacrus + 7382 + 7387 + NN + O + + + - + - + 7387 + 7388 + : + O + + + fice + fice + 7390 + 7394 + NN + O + + + to + to + 7395 + 7397 + TO + O + + + their + they + 7398 + 7403 + PRP$ + O + + + parents + parent + 7404 + 7411 + NNS + O + + + , + , + 7411 + 7412 + , + O + + + national + national + 7413 + 7421 + JJ + O + + + service + service + 7422 + 7429 + NN + O + + + can + can + 7433 + 7436 + MD + O + + + be + be + 7437 + 7439 + VB + O + + + a + a + 7440 + 7441 + DT + O + + + way + way + 7442 + 7445 + NN + O + + + to + to + 7446 + 7448 + TO + O + + + help + help + 7449 + 7453 + VB + O + + + themselves + themselves + 7454 + 7464 + PRP + O + + + , + , + 7464 + 7465 + , + O + + + by + by + 7466 + 7468 + IN + O + + + earning + earn + 7470 + 7477 + VBG + O + + + their + they + 7478 + 7483 + PRP$ + O + + + tuition + tuition + 7484 + 7491 + NN + O + + + costs + cost + 7492 + 7497 + NNS + O + + + in + in + 7498 + 7500 + IN + O + + + advance + advance + 7501 + 7508 + NN + O + + + of + of + 7510 + 7512 + IN + O + + + school + school 
+ 7513 + 7519 + NN + O + + + , + , + 7519 + 7520 + , + O + + + rather + rather + 7521 + 7527 + RB + O + + + than + than + 7528 + 7532 + IN + O + + + graduating + graduate + 7533 + 7543 + VBG + O + + + with + with + 7544 + 7548 + IN + O + + + an + a + 7550 + 7552 + DT + O + + + enormous + enormous + 7553 + 7561 + JJ + O + + + debt + debt + 7562 + 7566 + NN + O + + + load + load + 7567 + 7571 + NN + O + + + . + . + 7571 + 7572 + . + O + + + (ROOT (S (PP (IN For) (NP (NP (NP (DT the) (JJ many) (JJ young) (NNS people)) (SBAR (WHNP (WP who)) (S (VP (VBP know) (SBAR (IN that) (S (NP (PRP$ their) (NN desire)) (VP (TO to) (VP (VB attend) (NP (NN col)))))))))) (: -) (NP (NP (NN lege)) (VP (VBZ poses) (NP (DT an) (JJ enormous) (JJ financial) (NN sacri)))) (: -) (NP (NP (NN fice)) (PP (TO to) (NP (PRP$ their) (NNS parents)))))) (, ,) (NP (JJ national) (NN service)) (VP (MD can) (VP (VB be) (NP (DT a) (NN way) (S (VP (TO to) (VP (VB help) (NP (PRP themselves)))))) (, ,) (PP (IN by) (S (VP (VP (VBG earning) (NP (PRP$ their) (NN tuition) (NNS costs)) (PP (IN in) (NP (NP (NN advance)) (PP (IN of) (NP (NN school)))))) (, ,) (CONJP (RB rather) (IN than)) (VP (VBG graduating) (PP (IN with) (NP (DT an) (JJ enormous) (NN debt) (NN load))))))))) (. 
.))) + + + ROOT + way + + + way + For + + + people + the + + + people + many + + + people + young + + + For + people + + + know + who + + + people + know + + + attend + that + + + desire + their + + + attend + desire + + + attend + to + + + know + attend + + + attend + col + + + people + lege + + + lege + poses + + + sacri + an + + + sacri + enormous + + + sacri + financial + + + poses + sacri + + + people + fice + + + fice + to + + + parents + their + + + to + parents + + + service + national + + + way + service + + + way + can + + + way + be + + + way + a + + + help + to + + + way + help + + + help + themselves + + + way + by + + + by + earning + + + costs + their + + + costs + tuition + + + earning + costs + + + earning + in + + + in + advance + + + advance + of + + + of + school + + + earning + rather + + + rather + than + + + earning + graduating + + + graduating + with + + + load + an + + + load + enormous + + + load + debt + + + with + load + + + + + ROOT + way + + + people + the + + + people + many + + + people + young + + + way + people + + + know + who + + + people + know + + + attend + that + + + desire + their + + + attend + desire + + + attend + to + + + know + attend + + + attend + col + + + people + lege + + + lege + poses + + + sacri + an + + + sacri + enormous + + + sacri + financial + + + poses + sacri + + + people + fice + + + parents + their + + + fice + parents + + + service + national + + + way + service + + + way + can + + + way + be + + + way + a + + + help + to + + + way + help + + + help + themselves + + + way + earning + + + costs + their + + + costs + tuition + + + earning + costs + + + earning + advance + + + advance + school + + + earning + graduating + + + load + an + + + load + enormous + + + load + debt + + + graduating + load + + + + + ROOT + way + + + people + the + + + people + many + + + people + young + + + way + people + + + know + who + + + people + know + + + attend + that + + + desire + their + + + attend + desire + + + 
attend + to + + + know + attend + + + attend + col + + + people + lege + + + lege + poses + + + sacri + an + + + sacri + enormous + + + sacri + financial + + + poses + sacri + + + people + fice + + + parents + their + + + fice + parents + + + service + national + + + way + service + + + way + can + + + way + be + + + way + a + + + help + to + + + way + help + + + help + themselves + + + way + earning + + + costs + their + + + costs + tuition + + + earning + costs + + + earning + advance + + + advance + school + + + way + graduating + + + earning + graduating + + + load + an + + + load + enormous + + + load + debt + + + graduating + load + + + + + + + Most + most + 7576 + 7580 + RBS + O + + + important + important + 7581 + 7590 + JJ + O + + + , + , + 7590 + 7591 + , + O + + + national + national + 7592 + 7600 + JJ + O + + + service + service + 7601 + 7608 + NN + O + + + will + will + 7609 + 7613 + MD + O + + + show + show + 7615 + 7619 + VB + O + + + young + young + 7620 + 7625 + JJ + O + + + people + people + 7626 + 7632 + NNS + O + + + in + in + 7633 + 7635 + IN + O + + + very + very + 7636 + 7640 + RB + O + + + direct + direct + 7641 + 7647 + JJ + O + + + and + and + 7648 + 7651 + CC + O + + + practical + practical + 7653 + 7662 + JJ + O + + + terms + term + 7663 + 7668 + NNS + O + + + that + that + 7669 + 7673 + IN + O + + + their + they + 7674 + 7679 + PRP$ + O + + + efforts + effort + 7680 + 7687 + NNS + O + + + , + , + 7687 + 7688 + , + O + + + their + they + 7689 + 7694 + PRP$ + O + + + talents + talent + 7696 + 7703 + NNS + O + + + and + and + 7704 + 7707 + CC + O + + + their + they + 7708 + 7713 + PRP$ + O + + + ideals + ideal + 7714 + 7720 + NNS + O + + + are + be + 7721 + 7724 + VBP + O + + + valued + value + 7725 + 7731 + VBN + O + + + by + by + 7732 + 7734 + IN + O + + + their + they + 7736 + 7741 + PRP$ + O + + + society + society + 7742 + 7749 + NN + O + + + and + and + 7750 + 7753 + CC + O + + + needed + need + 7754 + 7760 + VBN + O + + + by + by + 
7761 + 7763 + IN + O + + + many + many + 7764 + 7768 + JJ + O + + + mil + mil + 7769 + 7772 + NN + O + + + - + - + 7772 + 7773 + : + O + + + lions + lion + 7775 + 7780 + NNS + O + + + of + of + 7781 + 7783 + IN + O + + + their + they + 7784 + 7789 + PRP$ + O + + + fellow + fellow + 7790 + 7796 + JJ + O + + + citizens + citizen + 7797 + 7805 + NNS + O + + + . + . + 7805 + 7806 + . + O + + + (ROOT (S (ADVP (RBS Most) (JJ important)) (, ,) (NP (JJ national) (NN service)) (VP (MD will) (VP (VB show) (NP (JJ young) (NNS people)) (PP (IN in) (NP (ADJP (RB very) (JJ direct) (CC and) (JJ practical)) (NNS terms))) (SBAR (IN that) (S (NP (NP (PRP$ their) (NNS efforts)) (, ,) (NP (PRP$ their) (NNS talents)) (CC and) (NP (PRP$ their) (NNS ideals))) (VP (VBP are) (VP (VP (VBN valued) (PP (IN by) (NP (PRP$ their) (NN society)))) (CC and) (VP (VBN needed) (PP (IN by) (NP (NP (JJ many) (NN mil)) (: -) (NP (NP (NNS lions)) (PP (IN of) (NP (PRP$ their) (JJ fellow) (NNS citizens))))))))))))) (. .))) + + + ROOT + show + + + important + Most + + + show + important + + + service + national + + + show + service + + + show + will + + + people + young + + + show + people + + + show + in + + + direct + very + + + terms + direct + + + direct + and + + + direct + practical + + + in + terms + + + valued + that + + + efforts + their + + + valued + efforts + + + talents + their + + + efforts + talents + + + efforts + and + + + ideals + their + + + efforts + ideals + + + valued + are + + + show + valued + + + valued + by + + + society + their + + + by + society + + + valued + and + + + valued + needed + + + needed + by + + + mil + many + + + by + mil + + + mil + lions + + + lions + of + + + citizens + their + + + citizens + fellow + + + of + citizens + + + + + ROOT + show + + + important + Most + + + show + important + + + service + national + + + show + service + + + show + will + + + people + young + + + show + people + + + direct + very + + + terms + direct + + + direct + practical + + + show 
+ terms + + + valued + that + + + efforts + their + + + valued + efforts + + + talents + their + + + efforts + talents + + + ideals + their + + + efforts + ideals + + + valued + are + + + show + valued + + + society + their + + + valued + society + + + valued + needed + + + mil + many + + + needed + mil + + + mil + lions + + + citizens + their + + + citizens + fellow + + + lions + citizens + + + + + ROOT + show + + + important + Most + + + show + important + + + service + national + + + show + service + + + show + will + + + people + young + + + show + people + + + direct + very + + + terms + direct + + + direct + practical + + + terms + practical + + + show + terms + + + valued + that + + + efforts + their + + + valued + efforts + + + needed + efforts + + + talents + their + + + efforts + talents + + + valued + talents + + + ideals + their + + + efforts + ideals + + + valued + ideals + + + valued + are + + + show + valued + + + society + their + + + valued + society + + + show + needed + + + valued + needed + + + mil + many + + + needed + mil + + + mil + lions + + + citizens + their + + + citizens + fellow + + + lions + citizens + + + + + + + The + the + 7810 + 7813 + DT + O + + + bill + bill + 7814 + 7818 + NN + O + + + we + we + 7819 + 7821 + PRP + O + + + will + will + 7822 + 7826 + MD + O + + + debate + debate + 7827 + 7833 + VB + O + + + includes + include + 7834 + 7842 + VBZ + O + + + a + a + 7843 + 7844 + DT + O + + + voluntary + voluntary + 7846 + 7855 + JJ + O + + + service + service + 7856 + 7863 + NN + O + + + component + component + 7864 + 7873 + NN + O + + + , + , + 7873 + 7874 + , + O + + + a + a + 7875 + 7876 + DT + O + + + conser + conser + 7877 + 7883 + NN + O + + + - + - + 7883 + 7884 + : + O + + + vation + vation + 7886 + 7892 + NN + O + + + component + component + 7893 + 7902 + NN + O + + + , + , + 7902 + 7903 + , + O + + + and + and + 7904 + 7907 + CC + O + + + a + a + 7908 + 7909 + DT + O + + + pilot + pilot + 7910 + 7915 + NN + O + + + 
program + program + 7916 + 7923 + NN + O + + + for + for + 7925 + 7928 + IN + O + + + the + the + 7929 + 7932 + DT + O + + + core + core + 7933 + 7937 + NN + O + + + idea + idea + 7938 + 7942 + NN + O + + + of + of + 7943 + 7945 + IN + O + + + national + national + 7946 + 7954 + JJ + O + + + service + service + 7955 + 7962 + NN + O + + + in + in + 7963 + 7965 + IN + O + + + exchange + exchange + 7967 + 7975 + NN + O + + + for + for + 7976 + 7979 + IN + O + + + education + education + 7980 + 7989 + NN + O + + + or + or + 7990 + 7992 + CC + O + + + home + home + 7993 + 7997 + NN + O + + + own + own + 7998 + 8001 + JJ + O + + + - + - + 8001 + 8002 + : + O + + + ership + ership + 8004 + 8010 + NN + O + + + credits + credit + 8011 + 8018 + NNS + O + + + . + . + 8018 + 8019 + . + O + + + (ROOT (S (NP (NP (DT The) (NN bill)) (SBAR (S (NP (PRP we)) (VP (MD will) (VP (VB debate)))))) (VP (VBZ includes) (NP (NP (DT a) (JJ voluntary) (NN service) (NN component)) (, ,) (NP (NP (DT a) (NN conser)) (: -) (NP (NN vation) (NN component))) (, ,) (CC and) (NP (NP (DT a) (NN pilot) (NN program)) (PP (IN for) (NP (NP (DT the) (NN core) (NN idea)) (PP (IN of) (NP (JJ national) (NN service))))))) (PP (IN in) (NP (NP (NP (NP (NN exchange)) (PP (IN for) (NP (NN education) (CC or) (NN home)))) (JJ own)) (: -) (NP (NN ership) (NNS credits))))) (. 
.))) + + + ROOT + includes + + + bill + The + + + includes + bill + + + debate + we + + + debate + will + + + bill + debate + + + component + a + + + component + voluntary + + + component + service + + + includes + component + + + conser + a + + + component + conser + + + component + vation + + + conser + component + + + component + and + + + program + a + + + program + pilot + + + component + program + + + program + for + + + idea + the + + + idea + core + + + for + idea + + + idea + of + + + service + national + + + of + service + + + includes + in + + + in + exchange + + + exchange + for + + + for + education + + + education + or + + + education + home + + + exchange + own + + + credits + ership + + + exchange + credits + + + + + ROOT + includes + + + bill + The + + + includes + bill + + + debate + we + + + debate + will + + + bill + debate + + + component + a + + + component + voluntary + + + component + service + + + includes + component + + + conser + a + + + component + conser + + + component + vation + + + conser + component + + + program + a + + + program + pilot + + + component + program + + + idea + the + + + idea + core + + + program + idea + + + service + national + + + idea + service + + + includes + exchange + + + exchange + education + + + education + home + + + exchange + own + + + credits + ership + + + exchange + credits + + + + + ROOT + includes + + + bill + The + + + includes + bill + + + debate + we + + + debate + will + + + bill + debate + + + component + a + + + component + voluntary + + + component + service + + + includes + component + + + conser + a + + + includes + conser + + + component + conser + + + component + vation + + + conser + component + + + program + a + + + program + pilot + + + includes + program + + + component + program + + + idea + the + + + idea + core + + + program + idea + + + service + national + + + idea + service + + + includes + exchange + + + exchange + education + + + exchange + home + + + education + home + + + 
exchange + own + + + credits + ership + + + exchange + credits + + + + + + + It + it + 8023 + 8025 + PRP + O + + + is + be + 8026 + 8028 + VBZ + O + + + my + my + 8029 + 8031 + PRP$ + O + + + intention + intention + 8032 + 8041 + NN + O + + + also + also + 8042 + 8046 + RB + O + + + to + to + 8047 + 8049 + TO + O + + + move + move + 8050 + 8054 + VB + O + + + promptly + promptly + 8056 + 8064 + RB + O + + + to + to + 8065 + 8067 + TO + O + + + address + address + 8068 + 8075 + VB + O + + + the + the + 8076 + 8079 + DT + O + + + Nation + Nation + 8080 + 8086 + NNP + O + + + 's + 's + 8086 + 8088 + POS + O + + + key + key + 8089 + 8092 + JJ + O + + + education + education + 8094 + 8103 + NN + O + + + needs + need + 8104 + 8109 + NNS + O + + + . + . + 8109 + 8110 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (PRP$ my) (NN intention)) (ADVP (RB also)) (S (VP (TO to) (VP (VB move) (ADVP (RB promptly)) (S (VP (TO to) (VP (VB address) (NP (NP (DT the) (NNP Nation) (POS 's)) (JJ key) (NN education) (NNS needs))))))))) (. 
.))) + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + intention + also + + + move + to + + + intention + move + + + move + promptly + + + address + to + + + move + address + + + Nation + the + + + needs + Nation + + + Nation + 's + + + needs + key + + + needs + education + + + address + needs + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + intention + also + + + move + to + + + intention + move + + + move + promptly + + + address + to + + + move + address + + + Nation + the + + + needs + Nation + + + needs + key + + + needs + education + + + address + needs + + + + + ROOT + intention + + + intention + It + + + intention + is + + + intention + my + + + intention + also + + + move + to + + + intention + move + + + move + promptly + + + address + to + + + move + address + + + Nation + the + + + needs + Nation + + + needs + key + + + needs + education + + + address + needs + + + + + + + Our + we + 8114 + 8117 + PRP$ + O + + + higher + higher + 8118 + 8124 + JJR + O + + + education + education + 8125 + 8134 + NN + O + + + system + system + 8135 + 8141 + NN + O + + + is + be + 8142 + 8144 + VBZ + O + + + among + among + 8146 + 8151 + IN + O + + + the + the + 8152 + 8155 + DT + O + + + finest + finest + 8156 + 8162 + JJS + O + + + in + in + 8163 + 8165 + IN + O + + + the + the + 8166 + 8169 + DT + O + + + world + world + 8170 + 8175 + NN + O + + + . + . + 8175 + 8176 + . + O + + + (ROOT (S (NP (NP (PRP$ Our) (JJR higher) (NN education)) (NP (NN system))) (VP (VBZ is) (PP (IN among) (NP (NP (DT the) (JJS finest)) (PP (IN in) (NP (DT the) (NN world)))))) (. 
.))) + + + ROOT + is + + + education + Our + + + education + higher + + + is + education + + + education + system + + + is + among + + + finest + the + + + among + finest + + + finest + in + + + world + the + + + in + world + + + + + ROOT + is + + + education + Our + + + education + higher + + + is + education + + + education + system + + + finest + the + + + is + finest + + + world + the + + + finest + world + + + + + ROOT + is + + + education + Our + + + education + higher + + + is + education + + + education + system + + + finest + the + + + is + finest + + + world + the + + + finest + world + + + + + + + But + but + 8177 + 8180 + CC + O + + + half + half + 8182 + 8186 + PDT + O + + + our + we + 8187 + 8190 + PRP$ + O + + + students + student + 8191 + 8199 + NNS + O + + + do + do + 8200 + 8202 + VBP + O + + + not + not + 8203 + 8206 + RB + O + + + go + go + 8207 + 8209 + VB + O + + + on + on + 8210 + 8212 + IN + O + + + to + to + 8213 + 8215 + TO + O + + + higher + higher + 8217 + 8223 + JJR + O + + + education + education + 8224 + 8233 + NN + O + + + . + . + 8233 + 8234 + . + O + + + (ROOT (S (CC But) (NP (PDT half) (PRP$ our) (NNS students)) (VP (VBP do) (RB not) (VP (VB go) (PP (IN on) (PP (TO to) (NP (JJR higher) (NN education)))))) (. 
.))) + + + ROOT + go + + + go + But + + + students + half + + + students + our + + + go + students + + + go + do + + + go + not + + + go + on + + + on + to + + + education + higher + + + to + education + + + + + ROOT + go + + + go + But + + + students + half + + + students + our + + + go + students + + + go + do + + + go + not + + + go + on + + + on + to + + + education + higher + + + to + education + + + + + ROOT + go + + + go + But + + + students + half + + + students + our + + + go + students + + + go + do + + + go + not + + + go + on + + + on + to + + + education + higher + + + to + education + + + + + + + The + the + 8235 + 8238 + DT + O + + + education + education + 8239 + 8248 + NN + O + + + crisis + crisis + 8249 + 8255 + NN + O + + + is + be + 8257 + 8259 + VBZ + O + + + not + not + 8260 + 8263 + RB + O + + + at + at + 8264 + 8266 + IN + O + + + the + the + 8267 + 8270 + DT + O + + + college + college + 8271 + 8278 + NN + O + + + level + level + 8279 + 8284 + NN + O + + + ; + ; + 8284 + 8285 + : + O + + + it + it + 8286 + 8288 + PRP + O + + + is + be + 8289 + 8291 + VBZ + O + + + at + at + 8292 + 8294 + IN + O + + + the + the + 8295 + 8298 + DT + O + + + elementary + elementary + 8300 + 8310 + JJ + O + + + and + and + 8313 + 8316 + CC + O + + + secondary + secondary + 8319 + 8328 + JJ + O + + + levels + level + 8331 + 8337 + NNS + O + + + , + , + 8337 + 8338 + , + O + + + where + where + 8340 + 8345 + WRB + O + + + the + the + 8346 + 8349 + DT + O + + + basic + basic + 8350 + 8355 + JJ + O + + + foundations + foundation + 8356 + 8367 + NNS + O + + + of + of + 8368 + 8370 + IN + O + + + liter + liter + 8371 + 8376 + NN + O + + + - + - + 8376 + 8377 + : + O + + + acy + acy + 8379 + 8382 + NN + O + + + , + , + 8382 + 8383 + , + O + + + mathematical + mathematical + 8384 + 8396 + JJ + O + + + skills + skill + 8397 + 8403 + NNS + O + + + , + , + 8403 + 8404 + , + O + + + and + and + 8405 + 8408 + CC + O + + + learning + learn + 8409 + 8417 + VBG + O + + + 
skills + skill + 8419 + 8425 + NNS + O + + + are + be + 8426 + 8429 + VBP + O + + + established + establish + 8430 + 8441 + VBN + O + + + . + . + 8441 + 8442 + . + O + + + (ROOT (S (S (NP (DT The) (NN education) (NN crisis)) (VP (VBZ is) (RB not) (PP (IN at) (NP (DT the) (NN college) (NN level))))) (: ;) (S (NP (PRP it)) (VP (VBZ is) (PP (IN at) (NP (DT the) (ADJP (JJ elementary) (CC and) (JJ secondary)) (NNS levels))) (, ,) (SBAR (WHADVP (WRB where)) (S (NP (NP (DT the) (JJ basic) (NNS foundations)) (PP (IN of) (NP (NP (NN liter)) (: -) (NP (NP (NN acy)) (, ,) (NP (NP (JJ mathematical) (NNS skills)) (, ,) (CC and) (VP (VBG learning) (NP (NNS skills)))))))) (VP (VBP are) (VP (VBN established))))))) (. .))) + + + ROOT + is + + + crisis + The + + + crisis + education + + + is + crisis + + + is + not + + + is + at + + + level + the + + + level + college + + + at + level + + + is + it + + + is + is + + + is + at + + + levels + the + + + levels + elementary + + + elementary + and + + + elementary + secondary + + + at + levels + + + established + where + + + foundations + the + + + foundations + basic + + + established + foundations + + + foundations + of + + + of + liter + + + liter + acy + + + skills + mathematical + + + acy + skills + + + skills + and + + + skills + learning + + + learning + skills + + + established + are + + + is + established + + + + + ROOT + is + + + crisis + The + + + crisis + education + + + is + crisis + + + is + not + + + level + the + + + level + college + + + is + level + + + is + it + + + is + is + + + levels + the + + + levels + elementary + + + elementary + secondary + + + is + levels + + + established + where + + + foundations + the + + + foundations + basic + + + established + foundations + + + foundations + liter + + + liter + acy + + + skills + mathematical + + + acy + skills + + + skills + learning + + + learning + skills + + + established + are + + + is + established + + + + + ROOT + is + + + crisis + The + + + crisis + education + + 
+ is + crisis + + + is + not + + + level + the + + + level + college + + + is + level + + + is + it + + + is + is + + + levels + the + + + levels + elementary + + + elementary + secondary + + + levels + secondary + + + is + levels + + + established + where + + + foundations + the + + + foundations + basic + + + established + foundations + + + foundations + liter + + + liter + acy + + + skills + mathematical + + + acy + skills + + + acy + learning + + + skills + learning + + + learning + skills + + + established + are + + + is + established + + + + + + + We + we + 8446 + 8448 + PRP + O + + + will + will + 8449 + 8453 + MD + O + + + debate + debate + 8454 + 8460 + VB + O + + + the + the + 8461 + 8464 + DT + O + + + Educational + Educational + 8465 + 8476 + NNP + O + + + Ex + ex + 8477 + 8479 + FW + O + + + - + - + 8479 + 8480 + : + O + + + cellence + cellence + 8482 + 8490 + NN + O + + + Act + act + 8491 + 8494 + NN + O + + + , + , + 8494 + 8495 + , + O + + + which + which + 8496 + 8501 + WDT + O + + + contains + contain + 8502 + 8510 + VBZ + O + + + the + the + 8511 + 8514 + DT + O + + + Presi + Presi + 8515 + 8520 + NNP + ORGANIZATION + + + - + - + 8520 + 8521 + : + O + + + dent + dent + 8523 + 8527 + NN + O + + + 's + 's + 8527 + 8529 + POS + O + + + proposals + proposal + 8530 + 8539 + NNS + O + + + to + to + 8540 + 8542 + TO + O + + + give + give + 8543 + 8547 + VB + O + + + awards + award + 8548 + 8554 + NNS + O + + + to + to + 8555 + 8557 + TO + O + + + schools + school + 8559 + 8566 + NNS + O + + + and + and + 8567 + 8570 + CC + O + + + teachers + teacher + 8571 + 8579 + NNS + O + + + for + for + 8580 + 8583 + IN + O + + + excellence + excellence + 8584 + 8594 + NN + O + + + , + , + 8594 + 8595 + , + O + + + encourage + encourage + 8597 + 8606 + VBP + O + + + Innovative + innovative + 8607 + 8617 + JJ + MISC + + + teaching + teaching + 8618 + 8626 + NN + O + + + meth + meth + 8627 + 8631 + NN + O + + + - + - + 8631 + 8632 + : + O + + + ods + od + 8634 + 8637 
+ NNS + O + + + , + , + 8637 + 8638 + , + O + + + and + and + 8639 + 8642 + CC + O + + + reduce + reduce + 8643 + 8649 + VB + O + + + student + student + 8650 + 8657 + NN + O + + + loan + loan + 8658 + 8662 + NN + O + + + defaults + default + 8663 + 8671 + NNS + O + + + . + . + 8671 + 8672 + . + O + + + (ROOT (S (NP (PRP We)) (VP (MD will) (VP (VP (VB debate) (NP (NP (NP (DT the) (NNP Educational)) (NP (FW Ex))) (: -) (S (NP (NP (NN cellence) (NN Act)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBZ contains) (NP (DT the) (NNP Presi)) (: -) (S (NP (NP (NN dent) (POS 's)) (NNS proposals)) (VP (TO to) (VP (VB give) (NP (NNS awards)) (PP (TO to) (NP (NP (NNS schools) (CC and) (NNS teachers)) (PP (IN for) (NP (NN excellence))))))))))) (, ,)) (VP (VBP encourage) (NP (JJ Innovative) (NN teaching) (NN meth)) (: -) (NP (NNS ods)))))) (, ,) (CC and) (VP (VB reduce) (NP (NN student) (NN loan) (NNS defaults))))) (. .))) + + + ROOT + debate + + + debate + We + + + debate + will + + + Educational + the + + + debate + Educational + + + Educational + Ex + + + Act + cellence + + + encourage + Act + + + contains + which + + + Act + contains + + + Presi + the + + + contains + Presi + + + proposals + dent + + + dent + 's + + + give + proposals + + + give + to + + + contains + give + + + give + awards + + + give + to + + + to + schools + + + schools + and + + + schools + teachers + + + schools + for + + + for + excellence + + + Educational + encourage + + + meth + Innovative + + + meth + teaching + + + encourage + meth + + + encourage + ods + + + debate + and + + + debate + reduce + + + defaults + student + + + defaults + loan + + + reduce + defaults + + + + + ROOT + debate + + + debate + We + + + debate + will + + + Educational + the + + + debate + Educational + + + Educational + Ex + + + Act + cellence + + + encourage + Act + + + contains + which + + + Act + contains + + + Presi + the + + + contains + Presi + + + proposals + dent + + + give + proposals + + + give + to + + + contains + 
give + + + give + awards + + + give + schools + + + schools + teachers + + + schools + excellence + + + Educational + encourage + + + meth + Innovative + + + meth + teaching + + + encourage + meth + + + encourage + ods + + + debate + reduce + + + defaults + student + + + defaults + loan + + + reduce + defaults + + + + + ROOT + debate + + + debate + We + + + reduce + We + + + debate + will + + + Educational + the + + + debate + Educational + + + Educational + Ex + + + Act + cellence + + + encourage + Act + + + contains + which + + + Act + contains + + + Presi + the + + + contains + Presi + + + proposals + dent + + + give + proposals + + + give + to + + + contains + give + + + give + awards + + + give + schools + + + give + teachers + + + schools + teachers + + + schools + excellence + + + Educational + encourage + + + meth + Innovative + + + meth + teaching + + + encourage + meth + + + encourage + ods + + + debate + reduce + + + defaults + student + + + defaults + loan + + + reduce + defaults + + + + + + + I + I + 8676 + 8677 + PRP + O + + + also + also + 8678 + 8682 + RB + O + + + want + want + 8683 + 8687 + VBP + O + + + to + to + 8688 + 8690 + TO + O + + + consider + consider + 8691 + 8699 + VB + O + + + the + the + 8700 + 8703 + DT + O + + + National + National + 8704 + 8712 + NNP + O + + + Literacy + literacy + 8714 + 8722 + NN + O + + + Act + Act + 8723 + 8726 + NNP + O + + + , + , + 8726 + 8727 + , + O + + + which + which + 8728 + 8733 + WDT + O + + + is + be + 8734 + 8736 + VBZ + O + + + designed + design + 8737 + 8745 + VBN + O + + + to + to + 8746 + 8748 + TO + O + + + eliminate + eliminate + 8750 + 8759 + VB + O + + + illiteracy + illiteracy + 8760 + 8770 + NN + O + + + in + in + 8771 + 8773 + IN + O + + + this + this + 8774 + 8778 + DT + O + + + Nation + nation + 8779 + 8785 + NN + O + + + by + by + 8786 + 8788 + IN + O + + + the + the + 8790 + 8793 + DT + DATE + 2000 + 2000 + + + year + year + 8794 + 8798 + NN + DATE + 2000 + 2000 + + + 2000 + 2000 + 
8799 + 8803 + CD + DATE + 2000 + 2000 + + + . + . + 8803 + 8804 + . + O + + + (ROOT (S (NP (PRP I)) (ADVP (RB also)) (VP (VBP want) (S (VP (TO to) (VP (VB consider) (NP (DT the) (NNP National) (NN Literacy))))) (NP-TMP (NP (NNP Act)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN designed) (S (VP (TO to) (VP (VB eliminate) (NP (NP (NN illiteracy)) (PP (IN in) (NP (DT this) (NN Nation)))) (PP (IN by) (NP (DT the) (NN year) (CD 2000)))))))))))) (. .))) + + + ROOT + want + + + want + I + + + want + also + + + consider + to + + + want + consider + + + Literacy + the + + + Literacy + National + + + consider + Literacy + + + want + Act + + + designed + which + + + designed + is + + + Act + designed + + + eliminate + to + + + designed + eliminate + + + eliminate + illiteracy + + + illiteracy + in + + + Nation + this + + + in + Nation + + + eliminate + by + + + year + the + + + by + year + + + year + 2000 + + + + + ROOT + want + + + want + I + + + want + also + + + consider + to + + + want + consider + + + Literacy + the + + + Literacy + National + + + consider + Literacy + + + want + Act + + + designed + which + + + designed + is + + + Act + designed + + + eliminate + to + + + designed + eliminate + + + eliminate + illiteracy + + + Nation + this + + + illiteracy + Nation + + + year + the + + + eliminate + year + + + year + 2000 + + + + + ROOT + want + + + want + I + + + want + also + + + consider + to + + + want + consider + + + Literacy + the + + + Literacy + National + + + consider + Literacy + + + want + Act + + + designed + which + + + designed + is + + + Act + designed + + + eliminate + to + + + designed + eliminate + + + eliminate + illiteracy + + + Nation + this + + + illiteracy + Nation + + + year + the + + + eliminate + year + + + year + 2000 + + + + + + + No + no + 8805 + 8807 + DT + O + + + single + single + 8808 + 8814 + JJ + O + + + action + action + 8815 + 8821 + NN + O + + + is + be + 8822 + 8824 + VBZ + O + + + more + more + 8825 + 8829 + RBR + O 
+ + + critical + critical + 8831 + 8839 + JJ + O + + + to + to + 8840 + 8842 + TO + O + + + our + we + 8843 + 8846 + PRP$ + O + + + future + future + 8847 + 8853 + JJ + DATE + FUTURE_REF + FUTURE_REF + + + economic + economic + 8854 + 8862 + JJ + O + + + securi + securus + 8863 + 8869 + NN + O + + + - + - + 8869 + 8870 + : + O + + + ty + ty + 8872 + 8874 + NN + O + + + . + . + 8874 + 8875 + . + O + + + (ROOT (S (NP (DT No) (JJ single) (NN action)) (VP (VBZ is) (ADJP (RBR more) (JJ critical) (PP (TO to) (NP (PRP$ our) (JJ future) (JJ economic) (NN securi) (: -) (NN ty))))) (. .))) + + + ROOT + critical + + + action + No + + + action + single + + + critical + action + + + critical + is + + + critical + more + + + critical + to + + + ty + our + + + ty + future + + + ty + economic + + + ty + securi + + + to + ty + + + + + ROOT + critical + + + action + No + + + action + single + + + critical + action + + + critical + is + + + critical + more + + + ty + our + + + ty + future + + + ty + economic + + + ty + securi + + + critical + ty + + + + + ROOT + critical + + + action + No + + + action + single + + + critical + action + + + critical + is + + + critical + more + + + ty + our + + + ty + future + + + ty + economic + + + ty + securi + + + critical + ty + + + + + + + By + by + 8876 + 8878 + IN + O + + + the + the + 8879 + 8882 + DT + DATE + THIS P100Y + + + + end + end + 8883 + 8886 + NN + DATE + THIS P100Y + + + + of + of + 8887 + 8889 + IN + DATE + THIS P100Y + + + + the + the + 8890 + 8893 + DT + DATE + THIS P100Y + + + + century + century + 8894 + 8901 + NN + DATE + THIS P100Y + + + + , + , + 8901 + 8902 + , + O + + + ade + ade + 8903 + 8906 + SYM + O + + + - + - + 8906 + 8907 + : + O + + + quate + quate + 8909 + 8914 + NN + O + + + literacy + literacy + 8915 + 8923 + NN + O + + + will + will + 8924 + 8928 + MD + O + + + be + be + 8929 + 8931 + VB + O + + + an + a + 8932 + 8934 + DT + O + + + essential + essential + 8935 + 8944 + JJ + O + + + pre + pre + 8945 + 8948 + 
JJ + O + + + - + - + 8948 + 8949 + : + O + + + condition + condition + 8951 + 8960 + NN + O + + + to + to + 8961 + 8963 + TO + O + + + living + live + 8964 + 8970 + VBG + O + + + in + in + 8971 + 8973 + IN + O + + + our + we + 8974 + 8977 + PRP$ + O + + + society + society + 8978 + 8985 + NN + O + + + . + . + 8985 + 8986 + . + O + + + (ROOT (S (PP (IN By) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (NN century))) (, ,) (X (SYM ade)))) (: -) (NP (NN quate) (NN literacy)) (VP (MD will) (VP (VB be) (NP (NP (DT an) (ADJP (JJ essential) (JJ pre))) (: -) (NP (NP (NN condition)) (PP (TO to) (S (VP (VBG living) (PP (IN in) (NP (PRP$ our) (NN society)))))))))) (. .))) + + + ROOT + pre + + + pre + By + + + end + the + + + By + end + + + end + of + + + century + the + + + of + century + + + end + ade + + + literacy + quate + + + pre + literacy + + + pre + will + + + pre + be + + + pre + an + + + pre + essential + + + pre + condition + + + condition + to + + + to + living + + + living + in + + + society + our + + + in + society + + + + + ROOT + pre + + + end + the + + + pre + end + + + century + the + + + end + century + + + end + ade + + + literacy + quate + + + pre + literacy + + + pre + will + + + pre + be + + + pre + an + + + pre + essential + + + pre + condition + + + condition + living + + + society + our + + + living + society + + + + + ROOT + pre + + + end + the + + + pre + end + + + century + the + + + end + century + + + end + ade + + + literacy + quate + + + pre + literacy + + + pre + will + + + pre + be + + + pre + an + + + pre + essential + + + pre + condition + + + condition + living + + + society + our + + + living + society + + + + + + + Only + only + 8990 + 8994 + RB + O + + + 14 + 14 + 8995 + 8997 + CD + PERCENT + %14.0 + + + percent + percent + 8998 + 9005 + NN + PERCENT + %14.0 + + + of + of + 9006 + 9008 + IN + O + + + the + the + 9009 + 9012 + DT + O + + + jobs + job + 9013 + 9017 + NNS + O + + + available + available + 9018 + 9027 + JJ + O + + + 
then + then + 9029 + 9033 + RB + O + + + will + will + 9034 + 9038 + MD + O + + + be + be + 9039 + 9041 + VB + O + + + adequately + adequately + 9042 + 9052 + RB + O + + + performed + perform + 9053 + 9062 + VBN + O + + + by + by + 9063 + 9065 + IN + O + + + high + high + 9067 + 9071 + JJ + O + + + school + school + 9072 + 9078 + NN + O + + + graduates + graduate + 9079 + 9088 + NNS + O + + + . + . + 9088 + 9089 + . + O + + + (ROOT (S (NP (NP (RB Only) (CD 14) (NN percent)) (PP (IN of) (NP (NP (DT the) (NNS jobs)) (ADJP (JJ available))))) (ADVP (RB then)) (VP (MD will) (VP (VB be) (VP (ADVP (RB adequately)) (VBN performed) (PP (IN by) (NP (JJ high) (NN school) (NNS graduates)))))) (. .))) + + + ROOT + performed + + + percent + Only + + + percent + 14 + + + performed + percent + + + percent + of + + + jobs + the + + + of + jobs + + + jobs + available + + + performed + then + + + performed + will + + + performed + be + + + performed + adequately + + + performed + by + + + graduates + high + + + graduates + school + + + by + graduates + + + + + ROOT + performed + + + percent + Only + + + percent + 14 + + + performed + percent + + + jobs + the + + + percent + jobs + + + jobs + available + + + performed + then + + + performed + will + + + performed + be + + + performed + adequately + + + graduates + high + + + graduates + school + + + performed + graduates + + + + + ROOT + performed + + + percent + Only + + + percent + 14 + + + performed + percent + + + jobs + the + + + percent + jobs + + + jobs + available + + + performed + then + + + performed + will + + + performed + be + + + performed + adequately + + + graduates + high + + + graduates + school + + + performed + graduates + + + + + + + Most + most + 9090 + 9094 + JJS + O + + + Jobs + Jobs + 9095 + 9099 + NNP + O + + + will + will + 9100 + 9104 + MD + O + + + need + need + 9106 + 9110 + VB + O + + + higher + higher + 9111 + 9117 + JJR + O + + + skills + skill + 9118 + 9124 + NNS + O + + + . + . + 9124 + 9125 + . 
+ O + + + (ROOT (S (NP (NP (JJS Most)) (NP (NNP Jobs))) (VP (MD will) (VP (VB need) (NP (JJR higher) (NNS skills)))) (. .))) + + + ROOT + need + + + need + Most + + + Most + Jobs + + + need + will + + + skills + higher + + + need + skills + + + + + ROOT + need + + + need + Most + + + Most + Jobs + + + need + will + + + skills + higher + + + need + skills + + + + + ROOT + need + + + need + Most + + + Most + Jobs + + + need + will + + + skills + higher + + + need + skills + + + + + + + Brt + Brt + 9126 + 9129 + NNP + O + + + 80 + 80 + 9130 + 9132 + CD + PERCENT + %80.0 + + + percent + percent + 9133 + 9140 + NN + PERCENT + %80.0 + + + of + of + 9141 + 9143 + IN + O + + + new + new + 9145 + 9148 + JJ + O + + + job + job + 9149 + 9152 + NN + O + + + seekers + seeker + 9153 + 9160 + NNS + O + + + at + at + 9161 + 9163 + IN + O + + + that + that + 9164 + 9168 + DT + O + + + time + time + 9169 + 9173 + NN + O + + + will + will + 9174 + 9178 + MD + O + + + be + be + 9179 + 9181 + VB + O + + + minorities + minority + 9183 + 9193 + NNS + O + + + , + , + 9193 + 9194 + , + O + + + immigrants + immigrant + 9195 + 9205 + NNS + O + + + , + , + 9205 + 9206 + , + O + + + and + and + 9207 + 9210 + CC + O + + + women + woman + 9211 + 9216 + NNS + O + + + . + . + 9216 + 9217 + . + O + + + (ROOT (S (NP (NP (NNP Brt) (CD 80) (NN percent)) (PP (IN of) (NP (NP (JJ new) (NN job) (NNS seekers)) (PP (IN at) (NP (DT that) (NN time)))))) (VP (MD will) (VP (VB be) (NP (NP (NNS minorities)) (, ,) (NP (NNS immigrants)) (, ,) (CC and) (NP (NNS women))))) (. 
.))) + + + ROOT + minorities + + + percent + Brt + + + percent + 80 + + + minorities + percent + + + percent + of + + + seekers + new + + + seekers + job + + + of + seekers + + + seekers + at + + + time + that + + + at + time + + + minorities + will + + + minorities + be + + + minorities + immigrants + + + minorities + and + + + minorities + women + + + + + ROOT + minorities + + + percent + Brt + + + percent + 80 + + + minorities + percent + + + seekers + new + + + seekers + job + + + percent + seekers + + + time + that + + + seekers + time + + + minorities + will + + + minorities + be + + + minorities + immigrants + + + minorities + women + + + + + ROOT + minorities + + + percent + Brt + + + percent + 80 + + + minorities + percent + + + seekers + new + + + seekers + job + + + percent + seekers + + + time + that + + + seekers + time + + + minorities + will + + + minorities + be + + + minorities + immigrants + + + minorities + women + + + + + + + If + if + 9218 + 9220 + IN + O + + + we + we + 9222 + 9224 + PRP + O + + + have + have + 9225 + 9229 + VBP + O + + + not + not + 9230 + 9233 + RB + O + + + substantially + substantially + 9234 + 9247 + RB + O + + + improved + improve + 9248 + 9256 + VBN + O + + + our + we + 9258 + 9261 + PRP$ + O + + + literacy + literacy + 9262 + 9270 + NN + O + + + levels + level + 9271 + 9277 + NNS + O + + + by + by + 9278 + 9280 + IN + O + + + that + that + 9281 + 9285 + DT + O + + + time + time + 9286 + 9290 + NN + O + + + , + , + 9290 + 9291 + , + O + + + we + we + 9292 + 9294 + PRP + O + + + risk + risk + 9296 + 9300 + VBP + O + + + seeing + see + 9301 + 9307 + VBG + O + + + those + those + 9308 + 9313 + DT + O + + + jobs + job + 9314 + 9318 + NNS + O + + + exported + export + 9319 + 9327 + VBN + O + + + over + over + 9328 + 9332 + IN + O + + + - + - + 9332 + 9333 + : + O + + + seas + sea + 9335 + 9339 + NNS + O + + + . + . + 9339 + 9340 + . 
+ O + + + (ROOT (S (SBAR (IN If) (S (NP (PRP we)) (VP (VBP have) (RB not) (ADVP (RB substantially)) (VP (VBN improved) (NP (PRP$ our) (NN literacy) (NNS levels)) (PP (IN by) (NP (DT that) (NN time))))))) (, ,) (NP (PRP we)) (VP (VBP risk) (S (VP (VBG seeing) (NP (NP (DT those) (NNS jobs)) (VP (VBN exported) (PP (IN over))) (: -) (NP (NNS seas)))))) (. .))) + + + ROOT + risk + + + improved + If + + + improved + we + + + improved + have + + + improved + not + + + improved + substantially + + + risk + improved + + + levels + our + + + levels + literacy + + + improved + levels + + + improved + by + + + time + that + + + by + time + + + risk + we + + + risk + seeing + + + jobs + those + + + seeing + jobs + + + jobs + exported + + + exported + over + + + jobs + seas + + + + + ROOT + risk + + + improved + If + + + improved + we + + + improved + have + + + improved + not + + + improved + substantially + + + risk + improved + + + levels + our + + + levels + literacy + + + improved + levels + + + time + that + + + improved + time + + + risk + we + + + risk + seeing + + + jobs + those + + + seeing + jobs + + + jobs + exported + + + exported + over + + + jobs + seas + + + + + ROOT + risk + + + improved + If + + + improved + we + + + improved + have + + + improved + not + + + improved + substantially + + + risk + improved + + + levels + our + + + levels + literacy + + + improved + levels + + + time + that + + + improved + time + + + risk + we + + + risk + seeing + + + jobs + those + + + seeing + jobs + + + jobs + exported + + + exported + over + + + jobs + seas + + + + + + + For + for + 9344 + 9347 + IN + O + + + the + the + 9348 + 9351 + DT + DATE + PREV_IMMEDIATE P10Y + + + + last + last + 9352 + 9356 + JJ + DATE + PREV_IMMEDIATE P10Y + + + + decade + decade + 9357 + 9363 + NN + DATE + PREV_IMMEDIATE P10Y + + + + , + , + 9363 + 9364 + , + O + + + we + we + 9365 + 9367 + PRP + O + + + have + have + 9368 + 9372 + VBP + O + + + read + read + 9373 + 9377 + VBN + O + + + re + re + 
9378 + 9380 + SYM + O + + + - + - + 9380 + 9381 + : + O + + + ports + port + 9383 + 9388 + NNS + O + + + and + and + 9389 + 9392 + CC + O + + + analyses + analysis + 9393 + 9401 + NNS + O + + + of + of + 9402 + 9404 + IN + O + + + the + the + 9405 + 9408 + DT + O + + + shortcom + shortcom + 9409 + 9417 + NN + O + + + - + - + 9417 + 9418 + : + O + + + ings + ing + 9420 + 9424 + NNS + O + + + in + in + 9425 + 9427 + IN + O + + + basic + basic + 9428 + 9433 + JJ + O + + + educational + educational + 9434 + 9445 + JJ + O + + + achievement + achievement + 9446 + 9457 + NN + O + + + in + in + 9459 + 9461 + IN + O + + + our + we + 9462 + 9465 + PRP$ + O + + + country + country + 9466 + 9473 + NN + O + + + . + . + 9473 + 9474 + . + O + + + (ROOT (S (PP (IN For) (NP (DT the) (JJ last) (NN decade))) (, ,) (NP (PRP we)) (VP (VBP have) (VP (VBN read) (FRAG (PP (X (SYM re)) (: -) (PP (NP (NNS ports) (CC and) (NNS analyses)) (IN of) (NP (NP (DT the) (NN shortcom)) (: -) (NP (NP (NNS ings)) (PP (IN in) (NP (NP (JJ basic) (JJ educational) (NN achievement)) (PP (IN in) (NP (PRP$ our) (NN country)))))))) (. 
.))))))) + + + ROOT + read + + + read + For + + + decade + the + + + decade + last + + + For + decade + + + read + we + + + read + have + + + of + re + + + of + ports + + + ports + and + + + ports + analyses + + + read + of + + + shortcom + the + + + of + shortcom + + + shortcom + ings + + + ings + in + + + achievement + basic + + + achievement + educational + + + in + achievement + + + achievement + in + + + country + our + + + in + country + + + + + ROOT + read + + + decade + the + + + decade + last + + + read + decade + + + read + we + + + read + have + + + of + re + + + of + ports + + + ports + analyses + + + read + of + + + shortcom + the + + + of + shortcom + + + shortcom + ings + + + achievement + basic + + + achievement + educational + + + ings + achievement + + + country + our + + + achievement + country + + + + + ROOT + read + + + decade + the + + + decade + last + + + read + decade + + + read + we + + + read + have + + + of + re + + + of + ports + + + ports + analyses + + + of + analyses + + + read + of + + + shortcom + the + + + of + shortcom + + + shortcom + ings + + + achievement + basic + + + achievement + educational + + + ings + achievement + + + country + our + + + achievement + country + + + + + + + It + it + 9475 + 9477 + PRP + O + + + is + be + 9478 + 9480 + VBZ + O + + + time + time + 9481 + 9485 + NN + O + + + to + to + 9486 + 9488 + TO + O + + + act + act + 9489 + 9492 + VB + O + + + on + on + 9493 + 9495 + IN + O + + + what + what + 9497 + 9501 + WP + O + + + we + we + 9502 + 9504 + PRP + O + + + know + know + 9505 + 9509 + VBP + O + + + , + , + 9509 + 9510 + , + O + + + both + both + 9511 + 9515 + CC + O + + + as + as + 9516 + 9518 + IN + O + + + to + to + 9519 + 9521 + TO + O + + + shortcom + shortcom + 9522 + 9530 + VB + O + + + - + - + 9530 + 9531 + : + O + + + ings + ing + 9533 + 9537 + NNS + O + + + and + and + 9538 + 9541 + CC + O + + + the + the + 9542 + 9545 + DT + O + + + best + best + 9546 + 9550 + JJS + O + + + way + way + 9551 
+ 9554 + NN + O + + + to + to + 9555 + 9557 + TO + O + + + correct + correct + 9558 + 9565 + VB + O + + + them + they + 9566 + 9570 + PRP + O + + + . + . + 9570 + 9571 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (NN time) (S (VP (TO to) (VP (VB act) (SBAR (SBAR (WHPP (IN on) (WHNP (WP what))) (S (NP (PRP we)) (VP (VBP know)))) (, ,) (CC both) (SBAR (IN as) (S (VP (TO to) (VP (VB shortcom) (: -) (S (NP (NP (NNS ings)) (CC and) (NP (DT the) (JJS best) (NN way))) (VP (TO to) (VP (VB correct) (NP (PRP them))))))))))))))) (. .))) + + + ROOT + time + + + time + It + + + time + is + + + act + to + + + time + act + + + know + on + + + on + what + + + know + we + + + act + know + + + know + both + + + shortcom + as + + + shortcom + to + + + know + shortcom + + + correct + ings + + + ings + and + + + way + the + + + way + best + + + ings + way + + + correct + to + + + shortcom + correct + + + correct + them + + + + + ROOT + time + + + time + It + + + time + is + + + act + to + + + time + act + + + know + what + + + know + we + + + act + know + + + know + both + + + shortcom + as + + + shortcom + to + + + know + shortcom + + + correct + ings + + + way + the + + + way + best + + + ings + way + + + correct + to + + + shortcom + correct + + + correct + them + + + + + ROOT + time + + + time + It + + + time + is + + + act + to + + + time + act + + + know + what + + + know + we + + + act + know + + + know + both + + + shortcom + as + + + shortcom + to + + + know + shortcom + + + correct + ings + + + way + the + + + way + best + + + ings + way + + + correct + way + + + correct + to + + + shortcom + correct + + + correct + them + + + + + + + We + we + 9575 + 9577 + PRP + O + + + know + know + 9578 + 9582 + VBP + O + + + that + that + 9583 + 9587 + IN + O + + + a + a + 9588 + 9589 + DT + O + + + third + third + 9590 + 9595 + JJ + ORDINAL + 3.0 + + + of + of + 9596 + 9598 + IN + O + + + our + we + 9599 + 9602 + PRP$ + O + + + math + math + 9603 + 9607 + NN + O + + + and + and 
+ 9609 + 9612 + CC + O + + + science + science + 9613 + 9620 + NN + O + + + teachers + teacher + 9621 + 9629 + NNS + O + + + today + today + 9630 + 9635 + NN + DATE + THIS P1D + + + + are + be + 9636 + 9639 + VBP + O + + + un + un + 9640 + 9642 + SYM + O + + + - + - + 9642 + 9643 + : + O + + + qualified + qualify + 9645 + 9654 + VBN + O + + + to + to + 9655 + 9657 + TO + O + + + teach + teach + 9658 + 9663 + VB + O + + + in + in + 9664 + 9666 + IN + O + + + those + those + 9667 + 9672 + DT + O + + + subjects + subject + 9673 + 9681 + NNS + O + + + ; + ; + 9681 + 9682 + : + O + + + we + we + 9683 + 9685 + PRP + O + + + know + know + 9687 + 9691 + VBP + O + + + we + we + 9692 + 9694 + PRP + O + + + face + face + 9695 + 9699 + VBP + O + + + a + a + 9700 + 9701 + DT + O + + + shortfall + shortfall + 9702 + 9711 + NN + O + + + of + of + 9712 + 9714 + IN + O + + + teachers + teacher + 9715 + 9723 + NNS + O + + + in + in + 9724 + 9726 + IN + O + + + the + the + 9728 + 9731 + DT + DATE + NEXT_IMMEDIATE P10Y + + + + next + next + 9732 + 9736 + JJ + DATE + NEXT_IMMEDIATE P10Y + + + + decade + decade + 9737 + 9743 + NN + DATE + NEXT_IMMEDIATE P10Y + + + + that + that + 9744 + 9748 + WDT + O + + + could + could + 9749 + 9754 + MD + O + + + reach + reach + 9755 + 9760 + VB + O + + + 2 + 2 + 9761 + 9762 + CD + MONEY + $2000000.0 + + + million + million + 9764 + 9771 + CD + MONEY + $2000000.0 + + + , + , + 9771 + 9772 + , + O + + + we + we + 9773 + 9775 + PRP + O + + + know + know + 9776 + 9780 + VBP + O + + + American + american + 9781 + 9789 + JJ + LOCATION + + + children + child + 9790 + 9798 + NNS + O + + + score + score + 9800 + 9805 + VBP + NUMBER + 20.0 + + + consistently + consistently + 9806 + 9818 + RB + O + + + lower + lower + 9819 + 9824 + JJR + O + + + on + on + 9825 + 9827 + IN + O + + + math + math + 9828 + 9832 + NN + O + + + and + and + 9833 + 9836 + CC + O + + + science + science + 9838 + 9845 + NN + O + + + tests + test + 9846 + 9851 + NNS + O + + + than + than 
+ 9852 + 9856 + IN + O + + + children + child + 9857 + 9865 + NNS + O + + + from + from + 9866 + 9870 + IN + O + + + Asian + asian + 9871 + 9876 + JJ + LOCATION + + + and + and + 9878 + 9881 + CC + O + + + European + european + 9882 + 9890 + JJ + MISC + + + countries + country + 9891 + 9900 + NNS + O + + + . + . + 9900 + 9901 + . + O + + + (ROOT (S (S (S (NP (PRP We)) (VP (VBP know) (SBAR (IN that) (S (NP (NP (DT a) (JJ third)) (PP (IN of) (NP (NP (PRP$ our) (NN math)) (CC and) (NP (NN science) (NNS teachers))))) (NP-TMP (NN today)) (VP (VBP are) (S (VP (X (SYM un)) (: -) (VP (VBN qualified) (S (VP (TO to) (VP (VB teach) (PP (IN in) (NP (DT those) (NNS subjects)))))))))))))) (: ;) (S (NP (PRP we)) (VP (VBP know) (SBAR (S (NP (PRP we)) (VP (VBP face) (NP (NP (DT a) (NN shortfall)) (PP (IN of) (NP (NP (NNS teachers)) (PP (IN in) (NP (NP (DT the) (JJ next) (NN decade)) (SBAR (WHNP (WDT that)) (S (VP (MD could) (VP (VB reach) (NP (QP (CD 2) (CD million)))))))))))))))))) (, ,) (NP (PRP we)) (VP (VBP know) (SBAR (S (NP (JJ American) (NNS children)) (VP (VBP score) (ADJP (ADJP (RB consistently) (JJR lower)) (PP (IN on) (NP (NP (NN math) (CC and) (NN science)) (NNS tests)))) (PP (IN than) (NP (NP (NNS children)) (PP (IN from) (NP (ADJP (JJ Asian) (CC and) (JJ European)) (NNS countries))))))))) (. 
.))) + + + ROOT + know + + + know + We + + + know + know + + + are + that + + + third + a + + + are + third + + + third + of + + + math + our + + + of + math + + + math + and + + + teachers + science + + + math + teachers + + + are + today + + + know + are + + + qualified + un + + + are + qualified + + + teach + to + + + qualified + teach + + + teach + in + + + subjects + those + + + in + subjects + + + know + we + + + know + know + + + face + we + + + know + face + + + shortfall + a + + + face + shortfall + + + shortfall + of + + + of + teachers + + + teachers + in + + + decade + the + + + decade + next + + + in + decade + + + reach + that + + + reach + could + + + decade + reach + + + million + 2 + + + reach + million + + + know + we + + + children + American + + + score + children + + + know + score + + + lower + consistently + + + score + lower + + + lower + on + + + tests + math + + + math + and + + + math + science + + + on + tests + + + score + than + + + than + children + + + children + from + + + countries + Asian + + + Asian + and + + + Asian + European + + + from + countries + + + + + ROOT + know + + + know + We + + + know + know + + + are + that + + + third + a + + + are + third + + + math + our + + + third + math + + + teachers + science + + + math + teachers + + + are + today + + + know + are + + + qualified + un + + + are + qualified + + + teach + to + + + qualified + teach + + + subjects + those + + + teach + subjects + + + know + we + + + know + know + + + face + we + + + know + face + + + shortfall + a + + + face + shortfall + + + shortfall + teachers + + + decade + the + + + decade + next + + + teachers + decade + + + reach + that + + + reach + could + + + decade + reach + + + million + 2 + + + reach + million + + + know + we + + + children + American + + + score + children + + + know + score + + + lower + consistently + + + score + lower + + + tests + math + + + math + science + + + lower + tests + + + score + children + + + countries + Asian + 
+ + Asian + European + + + children + countries + + + + + ROOT + know + + + know + We + + + know + know + + + are + that + + + third + a + + + are + third + + + math + our + + + third + math + + + teachers + science + + + third + teachers + + + math + teachers + + + are + today + + + know + are + + + qualified + un + + + are + qualified + + + teach + to + + + qualified + teach + + + subjects + those + + + teach + subjects + + + know + we + + + know + know + + + face + we + + + know + face + + + shortfall + a + + + face + shortfall + + + shortfall + teachers + + + decade + the + + + decade + next + + + teachers + decade + + + reach + that + + + reach + could + + + decade + reach + + + million + 2 + + + reach + million + + + know + we + + + children + American + + + score + children + + + know + score + + + lower + consistently + + + score + lower + + + tests + math + + + math + science + + + tests + science + + + lower + tests + + + score + children + + + countries + Asian + + + Asian + European + + + countries + European + + + children + countries + + + + + + + We + we + 9902 + 9904 + PRP + O + + + know + know + 9905 + 9909 + VBP + O + + + reading + reading + 9911 + 9918 + NN + O + + + and + and + 9919 + 9922 + CC + O + + + writing + write + 9923 + 9930 + VBG + O + + + skills + skill + 9931 + 9937 + NNS + O + + + need + need + 9938 + 9942 + VBP + O + + + sub + sub + 9943 + 9946 + SYM + O + + + - + - + 9946 + 9947 + : + O + + + stantial + stantial + 9949 + 9957 + JJ + O + + + improvement + improvement + 9958 + 9969 + NN + O + + + . + . + 9969 + 9970 + . + O + + + (ROOT (S (NP (PRP We)) (VP (VBP know) (SBAR (S (NP (NP (NN reading)) (CC and) (NP (VBG writing) (NNS skills))) (VP (VBP need) (VP (FRAG (PP (X (SYM sub)) (: -) (PP (ADVP (JJ stantial)) (ADJP (NN improvement))) (. 
.)))))))))) + + + ROOT + know + + + know + We + + + need + reading + + + reading + and + + + skills + writing + + + reading + skills + + + know + need + + + stantial + sub + + + need + stantial + + + stantial + improvement + + + + + ROOT + know + + + know + We + + + need + reading + + + skills + writing + + + reading + skills + + + know + need + + + stantial + sub + + + need + stantial + + + stantial + improvement + + + + + ROOT + know + + + know + We + + + need + reading + + + skills + writing + + + reading + skills + + + need + skills + + + know + need + + + stantial + sub + + + need + stantial + + + stantial + improvement + + + + + + + We + we + 9974 + 9976 + PRP + O + + + also + also + 9977 + 9981 + RB + O + + + know + know + 9982 + 9986 + VBP + O + + + that + that + 9987 + 9991 + IN + O + + + early + early + 9992 + 9997 + JJ + O + + + interven + interven + 9998 + 10006 + NN + O + + + - + - + 10006 + 10007 + : + O + + + tion + tion + 10009 + 10013 + NN + O + + + and + and + 10014 + 10017 + CC + O + + + focused + focus + 10018 + 10025 + VBD + O + + + resources + resource + 10026 + 10035 + NNS + O + + + help + help + 10036 + 10040 + VB + O + + + . + . + 10040 + 10041 + . + O + + + (ROOT (S (NP (PRP We)) (ADVP (RB also)) (VP (VP (VBP know) (PP (IN that) (NP (JJ early) (NN interven) (: -) (NN tion)))) (CC and) (VP (VBD focused) (S (NP (NNS resources)) (VP (VB help))))) (. 
.))) + + + ROOT + know + + + know + We + + + know + also + + + know + that + + + tion + early + + + tion + interven + + + that + tion + + + know + and + + + know + focused + + + help + resources + + + focused + help + + + + + ROOT + know + + + know + We + + + know + also + + + tion + early + + + tion + interven + + + know + tion + + + know + focused + + + help + resources + + + focused + help + + + + + ROOT + know + + + know + We + + + focused + We + + + know + also + + + tion + early + + + tion + interven + + + know + tion + + + know + focused + + + help + resources + + + focused + help + + + + + + + We + we + 10042 + 10044 + PRP + O + + + know + know + 10046 + 10050 + VBP + O + + + that + that + 10051 + 10055 + IN + O + + + extra + extra + 10056 + 10061 + JJ + O + + + help + help + 10062 + 10066 + NN + O + + + to + to + 10067 + 10069 + TO + O + + + the + the + 10070 + 10073 + DT + O + + + disadvan + disadvan + 10074 + 10082 + NN + O + + + - + - + 10082 + 10083 + : + O + + + taged + tage + 10085 + 10090 + VBN + O + + + in + in + 10091 + 10093 + IN + O + + + elementary + elementary + 10094 + 10104 + JJ + O + + + schools + school + 10105 + 10112 + NNS + O + + + raises + raise + 10113 + 10119 + VBZ + O + + + edu + edu + 10120 + 10123 + SYM + O + + + - + - + 10123 + 10124 + : + O + + + cational + cational + 10126 + 10134 + JJ + O + + + achievement + achievement + 10135 + 10146 + NN + O + + + levels + level + 10147 + 10153 + NNS + O + + + in + in + 10154 + 10156 + IN + O + + + high + high + 10157 + 10161 + JJ + O + + + January + January + 10165 + 10172 + NNP + DATE + 1990-01-23 + 1990-01-23 + + + 23 + 23 + 10173 + 10175 + CD + DATE + 1990-01-23 + 1990-01-23 + + + , + , + 10175 + 10176 + , + DATE + 1990-01-23 + 1990-01-23 + + + 1990 + 1990 + 10177 + 10181 + CD + DATE + 1990-01-23 + 1990-01-23 + + + January + January + 10189 + 10196 + NNP + DATE + 1990-01-23 + 1990-01-23 + + + 23 + 23 + 10197 + 10199 + CD + DATE + 1990-01-23 + 1990-01-23 + + + , + , + 10199 + 10200 + , + 
DATE + 1990-01-23 + 1990-01-23 + + + 1990 + 1990 + 10201 + 10205 + CD + DATE + 1990-01-23 + 1990-01-23 + + + CO + CO + 10224 + 10226 + NNP + O + + + schools + school + 10228 + 10235 + NNS + O + + + ; + ; + 10235 + 10236 + : + O + + + we + we + 10237 + 10239 + PRP + O + + + know + know + 10240 + 10244 + VBP + O + + + Head + Head + 10249 + 10253 + NNP + O + + + Start + Start + 10254 + 10259 + NNP + O + + + and + and + 10260 + 10263 + CC + O + + + other + other + 10265 + 10270 + JJ + O + + + enrichment + enrichment + 10273 + 10283 + NN + O + + + programs + program + 10284 + 10292 + NNS + O + + + bring + bring + 10298 + 10303 + VBP + O + + + gains + gain + 10305 + 10310 + NNS + O + + + that + that + 10311 + 10315 + WDT + O + + + continue + continue + 10316 + 10324 + VBP + O + + + through + through + 10325 + 10332 + IN + O + + + a + a + 10333 + 10334 + DT + O + + + child + child + 10335 + 10340 + NN + O + + + 's + 's + 10340 + 10342 + POS + O + + + school + school + 10344 + 10350 + NN + O + + + life + life + 10351 + 10355 + NN + O + + + . + . + 10355 + 10356 + . + O + + + (ROOT (S (NP (PRP We)) (VP (VBP know) (SBAR (IN that) (S (NP (NP (JJ extra) (NN help)) (PP (TO to) (NP (NP (DT the) (NN disadvan)) (: -) (VP (VBN taged) (PP (IN in) (NP (JJ elementary) (NNS schools))))))) (VP (VBZ raises) (NP (SBAR (FRAG (X (SYM edu)) (: -) (NP (NP (JJ cational) (NN achievement) (NNS levels)) (PP (IN in) (NP (ADJP (JJ high) (NP-TMP (NNP January) (CD 23) (, ,) (CD 1990))) (NP (NP (NP (NNP January) (CD 23) (, ,) (CD 1990)) (NP (NNP CO) (NNS schools))) (: ;) (S (NP (PRP we)) (VP (VBP know) (SBAR (S (NP (NP (NNP Head) (NNP Start)) (CC and) (NP (JJ other) (NN enrichment) (NNS programs))) (VP (VBP bring) (NP (NP (NNS gains)) (SBAR (WHNP (WDT that)) (S (VP (VBP continue) (PP (IN through) (NP (NP (DT a) (NN child) (POS 's)) (NN school) (NN life)))))))))))))))) (. 
.)))))))))) + + + ROOT + know + + + know + We + + + raises + that + + + help + extra + + + raises + help + + + help + to + + + disadvan + the + + + to + disadvan + + + disadvan + taged + + + taged + in + + + schools + elementary + + + in + schools + + + know + raises + + + levels + edu + + + levels + cational + + + levels + achievement + + + raises + levels + + + levels + in + + + January + high + + + high + January + + + January + 23 + + + January + 1990 + + + in + January + + + January + 23 + + + January + 1990 + + + schools + CO + + + January + schools + + + know + we + + + January + know + + + Start + Head + + + bring + Start + + + Start + and + + + programs + other + + + programs + enrichment + + + Start + programs + + + know + bring + + + bring + gains + + + continue + that + + + gains + continue + + + continue + through + + + child + a + + + life + child + + + child + 's + + + life + school + + + through + life + + + + + ROOT + know + + + know + We + + + raises + that + + + help + extra + + + raises + help + + + disadvan + the + + + help + disadvan + + + disadvan + taged + + + schools + elementary + + + taged + schools + + + know + raises + + + levels + edu + + + levels + cational + + + levels + achievement + + + raises + levels + + + January + high + + + high + January + + + January + 23 + + + January + 1990 + + + levels + January + + + January + 23 + + + January + 1990 + + + schools + CO + + + January + schools + + + know + we + + + January + know + + + Start + Head + + + bring + Start + + + programs + other + + + programs + enrichment + + + Start + programs + + + know + bring + + + bring + gains + + + continue + that + + + gains + continue + + + child + a + + + life + child + + + life + school + + + continue + life + + + + + ROOT + know + + + know + We + + + raises + that + + + help + extra + + + raises + help + + + disadvan + the + + + help + disadvan + + + disadvan + taged + + + schools + elementary + + + taged + schools + + + know + raises + + + levels 
+ edu + + + levels + cational + + + levels + achievement + + + raises + levels + + + January + high + + + high + January + + + January + 23 + + + January + 1990 + + + levels + January + + + January + 23 + + + January + 1990 + + + schools + CO + + + January + schools + + + know + we + + + January + know + + + Start + Head + + + bring + Start + + + programs + other + + + programs + enrichment + + + Start + programs + + + bring + programs + + + know + bring + + + bring + gains + + + continue + that + + + gains + continue + + + child + a + + + life + child + + + life + school + + + continue + life + + + + + + + This + this + 10360 + 10364 + DT + DATE + THIS P1Y + + + + year + year + 10365 + 10369 + NN + DATE + THIS P1Y + + + + , + , + 10369 + 10370 + , + O + + + it + it + 10371 + 10373 + PRP + O + + + is + be + 10374 + 10376 + VBZ + O + + + time + time + 10377 + 10381 + NN + O + + + to + to + 10382 + 10384 + TO + O + + + put + put + 10385 + 10388 + VB + O + + + what + what + 10389 + 10393 + WP + O + + + we + we + 10394 + 10396 + PRP + O + + + know + know + 10398 + 10402 + VBP + O + + + Is + be + 10403 + 10405 + VBZ + O + + + needed + need + 10406 + 10412 + VBN + O + + + together + together + 10413 + 10421 + RB + O + + + with + with + 10422 + 10426 + IN + O + + + what + what + 10427 + 10431 + WP + O + + + we + we + 10432 + 10434 + PRP + O + + + know + know + 10436 + 10440 + VBP + O + + + will + will + 10441 + 10445 + MD + O + + + make + make + 10446 + 10450 + VB + O + + + a + a + 10451 + 10452 + DT + O + + + difference + difference + 10453 + 10463 + NN + O + + + , + , + 10463 + 10464 + , + O + + + and + and + 10465 + 10468 + CC + O + + + get + get + 10469 + 10472 + VB + O + + + our + we + 10474 + 10477 + PRP$ + O + + + school + school + 10478 + 10484 + NN + O + + + system + system + 10485 + 10491 + NN + O + + + back + back + 10492 + 10496 + RB + O + + + on + on + 10497 + 10499 + IN + O + + + track + track + 10500 + 10505 + NN + O + + + . + . + 10505 + 10506 + . 
+ O + + + (ROOT (S (NP-TMP (DT This) (NN year)) (, ,) (NP (PRP it)) (VP (VBZ is) (NP (NN time) (S (VP (TO to) (VP (VB put) (SBAR (WHNP (WP what)) (S (NP (PRP we)) (VP (VBP know) (SBAR (S (VP (VBZ Is) (VP (VBN needed) (ADVP (RB together)) (PP (IN with) (SBAR (WHNP (WP what)) (S (NP (PRP we)) (VP (VBP know) (SBAR (S (VP (MD will) (VP (VP (VB make) (NP (DT a) (NN difference))) (, ,) (CC and) (VP (VB get) (NP (PRP$ our) (NN school) (NN system)) (ADVP (RB back)) (PP (IN on) (NP (NN track)))))))))))))))))))))))) (. .))) + + + ROOT + time + + + year + This + + + time + year + + + time + it + + + time + is + + + put + to + + + time + put + + + know + what + + + know + we + + + put + know + + + needed + Is + + + know + needed + + + needed + together + + + needed + with + + + know + what + + + know + we + + + with + know + + + make + will + + + know + make + + + difference + a + + + make + difference + + + make + and + + + make + get + + + system + our + + + system + school + + + get + system + + + get + back + + + get + on + + + on + track + + + + + ROOT + time + + + year + This + + + time + year + + + time + it + + + time + is + + + put + to + + + time + put + + + know + what + + + know + we + + + put + know + + + needed + Is + + + know + needed + + + know + what + + + know + we + + + needed + know + + + make + will + + + know + make + + + difference + a + + + make + difference + + + make + get + + + system + our + + + system + school + + + get + system + + + get + back + + + get + track + + + + + ROOT + time + + + year + This + + + time + year + + + time + it + + + time + is + + + put + to + + + time + put + + + know + what + + + know + we + + + put + know + + + needed + Is + + + know + needed + + + know + what + + + know + we + + + needed + know + + + make + will + + + know + make + + + difference + a + + + make + difference + + + know + get + + + make + get + + + system + our + + + system + school + + + get + system + + + get + back + + + get + track + + + + + + + After 
+ after + 10507 + 10512 + IN + O + + + a + a + 10514 + 10515 + DT + DURATION + P10Y + + + decade + decade + 10516 + 10522 + NN + DURATION + P10Y + + + of + of + 10523 + 10525 + IN + O + + + reports + report + 10526 + 10533 + NNS + O + + + and + and + 10534 + 10537 + CC + O + + + rhetoric + rhetoric + 10538 + 10546 + NN + O + + + , + , + 10546 + 10547 + , + O + + + Americans + Americans + 10549 + 10558 + NNPS + MISC + + + expect + expect + 10559 + 10565 + VBP + O + + + action + action + 10566 + 10572 + NN + O + + + and + and + 10573 + 10576 + CC + O + + + I + I + 10577 + 10578 + PRP + O + + + think + think + 10579 + 10584 + VBP + O + + + we + we + 10586 + 10588 + PRP + O + + + should + should + 10589 + 10595 + MD + O + + + provide + provide + 10596 + 10603 + VB + O + + + It + it + 10604 + 10606 + PRP + O + + + . + . + 10606 + 10607 + . + O + + + (ROOT (S (PP (IN After) (NP (NP (DT a) (NN decade)) (PP (IN of) (NP (NNS reports) (CC and) (NN rhetoric))))) (, ,) (S (NP (NNPS Americans)) (VP (VBP expect) (NP (NN action)))) (CC and) (S (NP (PRP I)) (VP (VBP think) (SBAR (S (NP (PRP we)) (VP (MD should) (VP (VB provide) (NP (PRP It)))))))) (. 
.))) + + + ROOT + expect + + + expect + After + + + decade + a + + + After + decade + + + decade + of + + + of + reports + + + reports + and + + + reports + rhetoric + + + expect + Americans + + + expect + action + + + expect + and + + + think + I + + + expect + think + + + provide + we + + + provide + should + + + think + provide + + + provide + It + + + + + ROOT + expect + + + decade + a + + + expect + decade + + + decade + reports + + + reports + rhetoric + + + expect + Americans + + + expect + action + + + think + I + + + expect + think + + + provide + we + + + provide + should + + + think + provide + + + provide + It + + + + + ROOT + expect + + + decade + a + + + expect + decade + + + decade + reports + + + decade + rhetoric + + + reports + rhetoric + + + expect + Americans + + + expect + action + + + think + I + + + expect + think + + + provide + we + + + provide + should + + + think + provide + + + provide + It + + + + + + + Americans + american + 10611 + 10620 + NNS + MISC + + + also + also + 10621 + 10625 + RB + O + + + expect + expect + 10626 + 10632 + VBP + O + + + action + action + 10633 + 10639 + NN + O + + + on + on + 10640 + 10642 + IN + O + + + child + child + 10644 + 10649 + NN + O + + + care + care + 10650 + 10654 + NN + O + + + legislation + legislation + 10655 + 10666 + NN + O + + + . + . + 10666 + 10667 + . + O + + + (ROOT (S (NP (NNS Americans)) (ADVP (RB also)) (VP (VBP expect) (NP (NP (NN action)) (PP (IN on) (NP (NN child) (NN care) (NN legislation))))) (. 
.))) + + + ROOT + expect + + + expect + Americans + + + expect + also + + + expect + action + + + action + on + + + legislation + child + + + legislation + care + + + on + legislation + + + + + ROOT + expect + + + expect + Americans + + + expect + also + + + expect + action + + + legislation + child + + + legislation + care + + + action + legislation + + + + + ROOT + expect + + + expect + Americans + + + expect + also + + + expect + action + + + legislation + child + + + legislation + care + + + action + legislation + + + + + + + I + I + 10668 + 10669 + PRP + O + + + hope + hope + 10670 + 10674 + VBP + O + + + the + the + 10675 + 10678 + DT + O + + + dif + dif + 10679 + 10682 + NN + O + + + - + - + 10682 + 10683 + : + O + + + ferences + ference + 10685 + 10693 + NNS + O + + + there + there + 10694 + 10699 + RB + O + + + can + can + 10700 + 10703 + MD + O + + + be + be + 10704 + 10706 + VB + O + + + worked + work + 10707 + 10713 + VBN + O + + + out + out + 10714 + 10717 + RP + O + + + shortly + shortly + 10719 + 10726 + RB + O + + + so + so + 10727 + 10729 + IN + O + + + that + that + 10730 + 10734 + IN + O + + + a + a + 10735 + 10736 + DT + O + + + final + final + 10737 + 10742 + JJ + O + + + form + form + 10743 + 10747 + NN + O + + + of + of + 10748 + 10750 + IN + O + + + this + this + 10751 + 10755 + DT + O + + + bill + bill + 10756 + 10760 + NN + O + + + can + can + 10762 + 10765 + MD + O + + + be + be + 10766 + 10768 + VB + O + + + voted + vote + 10769 + 10774 + VBN + O + + + upon + upon + 10775 + 10779 + IN + O + + + and + and + 10780 + 10783 + CC + O + + + sent + send + 10784 + 10788 + VBN + O + + + to + to + 10789 + 10791 + TO + O + + + the + the + 10792 + 10795 + DT + O + + + President + President + 10797 + 10806 + NNP + O + + + . + . + 10806 + 10807 + . 
+ O + + + (ROOT (S (S (NP (PRP I)) (VP (VBP hope) (NP (DT the) (NN dif)))) (: -) (S (NP (NNS ferences)) (ADVP (RB there)) (VP (MD can) (VP (VB be) (VP (VBN worked) (PRT (RP out)) (ADVP (RB shortly)) (SBAR (IN so) (IN that) (S (NP (NP (DT a) (JJ final) (NN form)) (PP (IN of) (NP (DT this) (NN bill)))) (VP (MD can) (VP (VB be) (VP (VP (VBN voted) (PP (IN upon))) (CC and) (VP (VBN sent) (PP (TO to) (NP (DT the) (NNP President))))))))))))) (. .))) + + + ROOT + hope + + + hope + I + + + dif + the + + + hope + dif + + + worked + ferences + + + worked + there + + + worked + can + + + worked + be + + + hope + worked + + + worked + out + + + worked + shortly + + + voted + so + + + voted + that + + + form + a + + + form + final + + + voted + form + + + form + of + + + bill + this + + + of + bill + + + voted + can + + + voted + be + + + worked + voted + + + voted + upon + + + voted + and + + + voted + sent + + + sent + to + + + President + the + + + to + President + + + + + ROOT + hope + + + hope + I + + + dif + the + + + hope + dif + + + worked + ferences + + + worked + there + + + worked + can + + + worked + be + + + hope + worked + + + worked + out + + + worked + shortly + + + voted + so + + + voted + that + + + form + a + + + form + final + + + voted + form + + + bill + this + + + form + bill + + + voted + can + + + voted + be + + + worked + voted + + + voted + upon + + + voted + sent + + + President + the + + + sent + President + + + + + ROOT + hope + + + hope + I + + + dif + the + + + hope + dif + + + worked + ferences + + + worked + there + + + worked + can + + + worked + be + + + hope + worked + + + worked + out + + + worked + shortly + + + voted + so + + + voted + that + + + form + a + + + form + final + + + voted + form + + + sent + form + + + bill + this + + + form + bill + + + voted + can + + + voted + be + + + worked + voted + + + voted + upon + + + worked + sent + + + voted + sent + + + President + the + + + sent + President + + + + + + + Working + work + 10808 
+ 10815 + VBG + O + + + parents + parent + 10816 + 10823 + NNS + O + + + need + need + 10824 + 10828 + VBP + O + + + af + af + 10829 + 10831 + SYM + O + + + - + - + 10831 + 10832 + : + O + + + fordable + fordable + 10834 + 10842 + JJ + O + + + care + care + 10843 + 10847 + NN + O + + + for + for + 10848 + 10851 + IN + O + + + their + they + 10852 + 10857 + PRP$ + O + + + children + child + 10858 + 10866 + NNS + O + + + but + but + 10867 + 10870 + CC + O + + + they + they + 10872 + 10876 + PRP + O + + + also + also + 10877 + 10881 + RB + O + + + want + want + 10882 + 10886 + VBP + O + + + quality + quality + 10887 + 10894 + NN + O + + + care + care + 10895 + 10899 + NN + O + + + . + . + 10899 + 10900 + . + O + + + (ROOT (S (VP (VBG Working) (SBAR (S (NP (NNS parents)) (VP (VBP need) (FRAG (X (SYM af)) (: -) (NP (NP (JJ fordable) (NN care)) (SBAR (IN for) (S (NP (NP (PRP$ their) (NNS children)) (PP (CC but) (NP (PRP they)))) (ADVP (RB also)) (VP (VBP want) (NP (NN quality) (NN care)))))) (. 
.)))))))) + + + ROOT + Working + + + need + parents + + + Working + need + + + care + af + + + care + fordable + + + need + care + + + want + for + + + children + their + + + want + children + + + children + but + + + but + they + + + want + also + + + care + want + + + care + quality + + + want + care + + + + + ROOT + Working + + + need + parents + + + Working + need + + + care + af + + + care + fordable + + + need + care + + + want + for + + + children + their + + + want + children + + + children + but + + + but + they + + + want + also + + + care + want + + + care + quality + + + want + care + + + + + ROOT + Working + + + need + parents + + + Working + need + + + care + af + + + care + fordable + + + need + care + + + want + for + + + children + their + + + want + children + + + children + but + + + but + they + + + want + also + + + care + want + + + care + quality + + + want + care + + + + + + + The + the + 10901 + 10904 + DT + O + + + Con + con + 10905 + 10908 + NN + O + + + - + - + 10908 + 10909 + : + O + + + gress + gress + 10911 + 10916 + NN + O + + + should + should + 10917 + 10923 + MD + O + + + pass + pass + 10924 + 10928 + VB + O + + + a + a + 10929 + 10930 + DT + O + + + bill + bill + 10931 + 10935 + NN + O + + + to + to + 10936 + 10938 + TO + O + + + ensure + ensure + 10939 + 10945 + VB + O + + + both + both + 10946 + 10950 + DT + O + + + . + . + 10950 + 10951 + . + O + + + (ROOT (FRAG (NP (DT The) (NN Con)) (: -) (S (NP (NN gress)) (VP (MD should) (VP (VB pass) (NP (DT a) (NN bill) (S (VP (TO to) (VP (VB ensure) (NP (DT both))))))))) (. 
.))) + + + ROOT + Con + + + Con + The + + + pass + gress + + + pass + should + + + Con + pass + + + bill + a + + + pass + bill + + + ensure + to + + + bill + ensure + + + ensure + both + + + + + ROOT + Con + + + Con + The + + + pass + gress + + + pass + should + + + Con + pass + + + bill + a + + + pass + bill + + + ensure + to + + + bill + ensure + + + ensure + both + + + + + ROOT + Con + + + Con + The + + + pass + gress + + + pass + should + + + Con + pass + + + bill + a + + + pass + bill + + + ensure + to + + + bill + ensure + + + ensure + both + + + + + + + Those + those + 10955 + 10960 + DT + O + + + immediate + immediate + 10963 + 10972 + JJ + O + + + concerns-clean + concerns-clean + 10976 + 10990 + JJ + O + + + air + air + 10992 + 10995 + NN + O + + + , + , + 10995 + 10996 + , + O + + + national + national + 10997 + 11005 + JJ + O + + + service + service + 11006 + 11013 + NN + O + + + , + , + 11013 + 11014 + , + O + + + crime + crime + 11015 + 11020 + NN + O + + + legislation + legislation + 11021 + 11032 + NN + O + + + , + , + 11032 + 11033 + , + O + + + and + and + 11035 + 11038 + CC + O + + + education + education + 11039 + 11048 + NN + O + + + reform + reform + 11049 + 11055 + NN + O + + + measures-are + measures-are + 11056 + 11068 + NN + O + + + a + a + 11069 + 11070 + DT + O + + + good + good + 11072 + 11076 + JJ + O + + + start + start + 11077 + 11082 + NN + O + + + for + for + 11083 + 11086 + IN + O + + + our + we + 11087 + 11090 + PRP$ + O + + + work + work + 11091 + 11095 + NN + O + + + this + this + 11096 + 11100 + DT + DATE + THIS P1Y + + + + year + year + 11101 + 11105 + NN + DATE + THIS P1Y + + + + . + . + 11105 + 11106 + . 
+ O + + + (ROOT (X (NP (NP (DT Those) (JJ immediate) (JJ concerns-clean) (NN air)) (, ,) (NP (JJ national) (NN service)) (, ,) (NP (NN crime) (NN legislation)) (, ,) (CC and) (NP (NN education) (NN reform) (NN measures-are))) (NP (NP (DT a) (JJ good) (NN start)) (PP (IN for) (NP (NP (PRP$ our) (NN work)) (NP (DT this) (NN year))))) (. .))) + + + ROOT + start + + + air + Those + + + air + immediate + + + air + concerns-clean + + + start + air + + + service + national + + + air + service + + + legislation + crime + + + air + legislation + + + air + and + + + measures-are + education + + + measures-are + reform + + + air + measures-are + + + start + a + + + start + good + + + start + for + + + work + our + + + for + work + + + year + this + + + work + year + + + + + ROOT + start + + + air + Those + + + air + immediate + + + air + concerns-clean + + + start + air + + + service + national + + + air + service + + + legislation + crime + + + air + legislation + + + measures-are + education + + + measures-are + reform + + + air + measures-are + + + start + a + + + start + good + + + work + our + + + start + work + + + year + this + + + work + year + + + + + ROOT + start + + + air + Those + + + air + immediate + + + air + concerns-clean + + + start + air + + + service + national + + + air + service + + + start + service + + + legislation + crime + + + air + legislation + + + start + legislation + + + measures-are + education + + + measures-are + reform + + + air + measures-are + + + start + measures-are + + + start + a + + + start + good + + + work + our + + + start + work + + + year + this + + + work + year + + + + + + + But + but + 11107 + 11110 + CC + O + + + they + they + 11112 + 11116 + PRP + O + + + do + do + 11117 + 11119 + VBP + O + + + not + not + 11120 + 11123 + RB + O + + + exhaust + exhaust + 11124 + 11131 + VB + O + + + our + we + 11132 + 11135 + PRP$ + O + + + agenda + agenda + 11136 + 11142 + NN + O + + + . + . + 11142 + 11143 + . 
+ O + + + (ROOT (S (CC But) (NP (PRP they)) (VP (VBP do) (RB not) (VP (VB exhaust) (NP (PRP$ our) (NN agenda)))) (. .))) + + + ROOT + exhaust + + + exhaust + But + + + exhaust + they + + + exhaust + do + + + exhaust + not + + + agenda + our + + + exhaust + agenda + + + + + ROOT + exhaust + + + exhaust + But + + + exhaust + they + + + exhaust + do + + + exhaust + not + + + agenda + our + + + exhaust + agenda + + + + + ROOT + exhaust + + + exhaust + But + + + exhaust + they + + + exhaust + do + + + exhaust + not + + + agenda + our + + + exhaust + agenda + + + + + + + One + one + 11147 + 11150 + CD + NUMBER + 1.0 + + + issue + issue + 11151 + 11156 + NN + O + + + of + of + 11157 + 11159 + IN + O + + + particular + particular + 11160 + 11170 + JJ + O + + + importance + importance + 11171 + 11181 + NN + O + + + to + to + 11183 + 11185 + TO + O + + + all + all + 11186 + 11189 + DT + O + + + of + of + 11190 + 11192 + IN + O + + + us + we + 11193 + 11195 + PRP + O + + + is + be + 11196 + 11198 + VBZ + O + + + campaign + campaign + 11199 + 11207 + NN + O + + + finance + finance + 11208 + 11215 + NN + O + + + reform + reform + 11217 + 11223 + NN + O + + + . + . + 11223 + 11224 + . + O + + + (ROOT (S (NP (NP (CD One) (NN issue)) (PP (IN of) (NP (NP (JJ particular) (NN importance)) (PP (TO to) (NP (NP (DT all)) (PP (IN of) (NP (PRP us)))))))) (VP (VBZ is) (NP (NN campaign) (NN finance) (NN reform))) (. 
.))) + + + ROOT + reform + + + issue + One + + + reform + issue + + + issue + of + + + importance + particular + + + of + importance + + + importance + to + + + to + all + + + all + of + + + of + us + + + reform + is + + + reform + campaign + + + reform + finance + + + + + ROOT + reform + + + issue + One + + + reform + issue + + + importance + particular + + + issue + importance + + + importance + all + + + all + us + + + reform + is + + + reform + campaign + + + reform + finance + + + + + ROOT + reform + + + issue + One + + + reform + issue + + + importance + particular + + + issue + importance + + + importance + all + + + all + us + + + reform + is + + + reform + campaign + + + reform + finance + + + + + + + It + it + 11228 + 11230 + PRP + O + + + is + be + 11231 + 11233 + VBZ + O + + + evident + evident + 11234 + 11241 + JJ + O + + + that + that + 11242 + 11246 + IN + O + + + if + if + 11247 + 11249 + IN + O + + + we + we + 11250 + 11252 + PRP + O + + + do + do + 11253 + 11255 + VBP + O + + + not + not + 11256 + 11259 + RB + O + + + reform + reform + 11260 + 11266 + VB + O + + + the + the + 11268 + 11271 + DT + O + + + manner + manner + 11272 + 11278 + NN + O + + + in + in + 11279 + 11281 + IN + O + + + which + which + 11282 + 11287 + WDT + O + + + election + election + 11288 + 11296 + NN + O + + + cam + cam + 11297 + 11300 + NN + O + + + - + - + 11300 + 11301 + : + O + + + paigns + paign + 11303 + 11309 + NNS + O + + + are + be + 11310 + 11313 + VBP + O + + + financed + finance + 11314 + 11322 + VBN + O + + + , + , + 11322 + 11323 + , + O + + + we + we + 11324 + 11326 + PRP + O + + + will + will + 11327 + 11331 + MD + O + + + forfeit + forfeit + 11332 + 11339 + VB + O + + + the + the + 11340 + 11343 + DT + O + + + trust + trust + 11345 + 11350 + NN + O + + + of + of + 11351 + 11353 + IN + O + + + the + the + 11354 + 11357 + DT + O + + + American + american + 11358 + 11366 + JJ + LOCATION + + + people + people + 11367 + 11373 + NNS + O + + + . + . 
+ 11373 + 11374 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (ADJP (JJ evident)) (SBAR (IN that) (S (SBAR (IN if) (S (NP (PRP we)) (VP (VBP do) (RB not) (VP (VB reform) (NP (NP (DT the) (NN manner)) (SBAR (WHPP (IN in) (WHNP (WDT which))) (S (NP (NP (NN election) (NN cam)) (: -) (NP (NNS paigns))) (VP (VBP are) (VP (VBN financed)))))))))) (, ,) (NP (PRP we)) (VP (MD will) (VP (VB forfeit) (NP (NP (DT the) (NN trust)) (PP (IN of) (NP (DT the) (JJ American) (NNS people))))))))) (. .))) + + + ROOT + evident + + + evident + It + + + evident + is + + + forfeit + that + + + reform + if + + + reform + we + + + reform + do + + + reform + not + + + forfeit + reform + + + manner + the + + + reform + manner + + + financed + in + + + in + which + + + cam + election + + + financed + cam + + + cam + paigns + + + financed + are + + + manner + financed + + + forfeit + we + + + forfeit + will + + + evident + forfeit + + + trust + the + + + forfeit + trust + + + trust + of + + + people + the + + + people + American + + + of + people + + + + + ROOT + evident + + + evident + It + + + evident + is + + + forfeit + that + + + reform + if + + + reform + we + + + reform + do + + + reform + not + + + forfeit + reform + + + manner + the + + + reform + manner + + + financed + which + + + cam + election + + + financed + cam + + + cam + paigns + + + financed + are + + + manner + financed + + + forfeit + we + + + forfeit + will + + + evident + forfeit + + + trust + the + + + forfeit + trust + + + people + the + + + people + American + + + trust + people + + + + + ROOT + evident + + + evident + It + + + evident + is + + + forfeit + that + + + reform + if + + + reform + we + + + reform + do + + + reform + not + + + forfeit + reform + + + manner + the + + + reform + manner + + + financed + which + + + cam + election + + + financed + cam + + + cam + paigns + + + financed + are + + + manner + financed + + + forfeit + we + + + forfeit + will + + + evident + forfeit + + + trust + the + + + forfeit 
+ trust + + + people + the + + + people + American + + + trust + people + + + + + + + The + the + 11375 + 11378 + DT + O + + + enormous + enormous + 11380 + 11388 + JJ + O + + + costs + cost + 11389 + 11394 + NNS + O + + + of + of + 11395 + 11397 + IN + O + + + campaigning + campaigning + 11398 + 11409 + NN + O + + + are + be + 11410 + 11413 + VBP + O + + + making + make + 11415 + 11421 + VBG + O + + + it + it + 11422 + 11424 + PRP + O + + + more + more + 11425 + 11429 + RBR + O + + + and + and + 11430 + 11433 + CC + O + + + more + more + 11434 + 11438 + RBR + O + + + difficult + difficult + 11439 + 11448 + JJ + O + + + for + for + 11449 + 11452 + IN + O + + + any + any + 11454 + 11457 + DT + O + + + other + other + 11458 + 11463 + JJ + O + + + than + than + 11464 + 11468 + IN + O + + + the + the + 11469 + 11472 + DT + O + + + very + very + 11473 + 11477 + RB + O + + + wealthy + wealthy + 11478 + 11485 + JJ + O + + + to + to + 11486 + 11488 + TO + O + + + contemplate + contemplate + 11490 + 11501 + VB + O + + + serving + serve + 11502 + 11509 + VBG + O + + + in + in + 11510 + 11512 + IN + O + + + the + the + 11513 + 11516 + DT + O + + + Congress + Congress + 11517 + 11525 + NNP + ORGANIZATION + + + . + . + 11525 + 11526 + . + O + + + (ROOT (S (NP (NP (DT The) (JJ enormous) (NNS costs)) (PP (IN of) (NP (NN campaigning)))) (VP (VBP are) (VP (VBG making) (S (NP (PRP it)) (ADJP (ADJP (ADVP (RBR more) (CC and) (RBR more)) (JJ difficult)) (PP (IN for) (NP (NP (DT any) (JJ other)) (PP (IN than) (NP (DT the) (ADJP (RB very) (JJ wealthy))))))) (S (VP (TO to) (VP (VB contemplate) (S (VP (VBG serving) (PP (IN in) (NP (DT the) (NNP Congress))))))))))) (. 
.))) + + + ROOT + making + + + costs + The + + + costs + enormous + + + making + costs + + + costs + of + + + of + campaigning + + + making + are + + + contemplate + it + + + difficult + more + + + more + and + + + more + more + + + contemplate + difficult + + + difficult + for + + + other + any + + + for + other + + + other + than + + + wealthy + the + + + wealthy + very + + + than + wealthy + + + contemplate + to + + + making + contemplate + + + contemplate + serving + + + serving + in + + + Congress + the + + + in + Congress + + + + + ROOT + making + + + costs + The + + + costs + enormous + + + making + costs + + + costs + campaigning + + + making + are + + + contemplate + it + + + difficult + more + + + more + more + + + contemplate + difficult + + + other + any + + + difficult + other + + + wealthy + the + + + wealthy + very + + + other + wealthy + + + contemplate + to + + + making + contemplate + + + contemplate + serving + + + Congress + the + + + serving + Congress + + + + + ROOT + making + + + costs + The + + + costs + enormous + + + making + costs + + + costs + campaigning + + + making + are + + + contemplate + it + + + difficult + more + + + more + more + + + difficult + more + + + contemplate + difficult + + + other + any + + + difficult + other + + + wealthy + the + + + wealthy + very + + + other + wealthy + + + contemplate + to + + + making + contemplate + + + contemplate + serving + + + Congress + the + + + serving + Congress + + + + + + + The + the + 11528 + 11531 + DT + O + + + demands + demand + 11532 + 11539 + NNS + O + + + of + of + 11540 + 11542 + IN + O + + + election + election + 11543 + 11551 + NN + O + + + campaigns + campaign + 11552 + 11561 + NNS + O + + + force + force + 11563 + 11568 + VBP + O + + + far + far + 11569 + 11572 + RB + O + + + too + too + 11573 + 11576 + RB + O + + + much + much + 11577 + 11581 + JJ + O + + + attention + attention + 11582 + 11591 + NN + O + + + to + to + 11592 + 11594 + TO + O + + + be + be + 11595 + 11597 
+ VB + O + + + paid + pay + 11599 + 11603 + VBN + O + + + to + to + 11604 + 11606 + TO + O + + + fundraising + fundraise + 11607 + 11618 + VBG + O + + + activities + activity + 11619 + 11629 + NNS + O + + + . + . + 11629 + 11630 + . + O + + + (ROOT (S (NP (NP (DT The) (NNS demands)) (PP (IN of) (NP (NN election) (NNS campaigns)))) (VP (VBP force) (NP (ADJP (RB far) (RB too) (JJ much)) (NN attention)) (S (VP (TO to) (VP (VB be) (VP (VBN paid) (PP (TO to) (S (VP (VBG fundraising) (NP (NNS activities)))))))))) (. .))) + + + ROOT + force + + + demands + The + + + force + demands + + + demands + of + + + campaigns + election + + + of + campaigns + + + much + far + + + much + too + + + attention + much + + + force + attention + + + paid + to + + + paid + be + + + force + paid + + + paid + to + + + to + fundraising + + + fundraising + activities + + + + + ROOT + force + + + demands + The + + + force + demands + + + campaigns + election + + + demands + campaigns + + + much + far + + + much + too + + + attention + much + + + force + attention + + + paid + to + + + paid + be + + + force + paid + + + paid + fundraising + + + fundraising + activities + + + + + ROOT + force + + + demands + The + + + force + demands + + + campaigns + election + + + demands + campaigns + + + much + far + + + much + too + + + attention + much + + + force + attention + + + paid + to + + + paid + be + + + force + paid + + + paid + fundraising + + + fundraising + activities + + + + + + + The + the + 11634 + 11637 + DT + O + + + appearance + appearance + 11638 + 11648 + NN + O + + + is + be + 11649 + 11651 + VBZ + O + + + one + one + 11652 + 11655 + CD + NUMBER + 1.0 + + + that + that + 11656 + 11660 + WDT + O + + + under + under + 11661 + 11666 + IN + O + + + - + - + 11666 + 11667 + : + O + + + mines + mine + 11669 + 11674 + NNS + O + + + confidence + confidence + 11675 + 11685 + NN + O + + + in + in + 11686 + 11688 + IN + O + + + Congress + Congress + 11689 + 11697 + NNP + ORGANIZATION + + + . + . 
+ 11697 + 11698 + . + O + + + (ROOT (S (NP (DT The) (NN appearance)) (VP (VBZ is) (NP (NP (NP (CD one)) (SBAR (WHNP (WDT that)))) (PP (IN under) (: -) (NP (NP (NNS mines) (NN confidence)) (PP (IN in) (NP (NNP Congress))))))) (. .))) + + + ROOT + one + + + appearance + The + + + one + appearance + + + one + is + + + one + that + + + one + under + + + confidence + mines + + + under + confidence + + + confidence + in + + + in + Congress + + + + + ROOT + one + + + appearance + The + + + one + appearance + + + one + is + + + one + that + + + confidence + mines + + + one + confidence + + + confidence + Congress + + + + + ROOT + one + + + appearance + The + + + one + appearance + + + one + is + + + one + that + + + confidence + mines + + + one + confidence + + + confidence + Congress + + + + + + + The + the + 11699 + 11702 + DT + O + + + re + re + 11703 + 11705 + SYM + O + + + - + - + 11705 + 11706 + : + O + + + ality + ality + 11708 + 11713 + NN + O + + + is + be + 11714 + 11716 + VBZ + O + + + one + one + 11717 + 11720 + CD + NUMBER + 1.0 + + + that + that + 11721 + 11725 + WDT + O + + + distorts + distort + 11726 + 11734 + VBZ + O + + + Congress + Congress + 11735 + 11743 + NNP + ORGANIZATION + + + ' + ' + 11743 + 11744 + POS + O + + + ability + ability + 11746 + 11753 + NN + O + + + to + to + 11754 + 11756 + TO + O + + + function + function + 11757 + 11765 + VB + O + + + . + . + 11765 + 11766 + . + O + + + (ROOT (FRAG (NP (NP (DT The)) (SBAR (FRAG (X (SYM re)) (: -) (NP (NP (NN ality)) (SBAR (S (VP (VBZ is) (NP (NP (CD one)) (SBAR (WHNP (WDT that)) (S (VP (VBZ distorts) (NP (NP (NNP Congress) (POS ')) (NN ability) (S (VP (TO to) (VP (VB function))))))))))))) (. 
.)))))) + + + ROOT + The + + + ality + re + + + The + ality + + + one + is + + + ality + one + + + distorts + that + + + one + distorts + + + ability + Congress + + + distorts + ability + + + function + to + + + ability + function + + + + + ROOT + The + + + ality + re + + + The + ality + + + one + is + + + ality + one + + + distorts + that + + + one + distorts + + + ability + Congress + + + distorts + ability + + + function + to + + + ability + function + + + + + ROOT + The + + + ality + re + + + The + ality + + + one + is + + + ality + one + + + distorts + that + + + one + distorts + + + ability + Congress + + + distorts + ability + + + function + to + + + ability + function + + + + + + + Campaign + campaign + 11770 + 11778 + NN + O + + + finance + finance + 11779 + 11786 + NN + O + + + reform + reform + 11787 + 11793 + NN + O + + + is + be + 11794 + 11796 + VBZ + O + + + a + a + 11797 + 11798 + DT + O + + + goal + goal + 11799 + 11803 + NN + O + + + I + I + 11804 + 11805 + PRP + O + + + have + have + 11807 + 11811 + VBP + O + + + pursued + pursue + 11812 + 11819 + VBN + O + + + for + for + 11820 + 11823 + IN + O + + + 8 + 8 + 11824 + 11825 + CD + DURATION + 8.0 + P8Y + + + years + year + 11826 + 11831 + NNS + NUMBER + 0.0 + P8Y + + + . + . + 11831 + 11832 + . + O + + + (ROOT (S (NP (NN Campaign) (NN finance) (NN reform)) (VP (VBZ is) (NP (NP (DT a) (NN goal)) (SBAR (S (NP (PRP I)) (VP (VBP have) (VP (VBN pursued) (PP (IN for) (NP (CD 8) (NNS years))))))))) (. 
.))) + + + ROOT + goal + + + reform + Campaign + + + reform + finance + + + goal + reform + + + goal + is + + + goal + a + + + pursued + I + + + pursued + have + + + goal + pursued + + + pursued + for + + + years + 8 + + + for + years + + + + + ROOT + goal + + + reform + Campaign + + + reform + finance + + + goal + reform + + + goal + is + + + goal + a + + + pursued + I + + + pursued + have + + + goal + pursued + + + years + 8 + + + pursued + years + + + + + ROOT + goal + + + reform + Campaign + + + reform + finance + + + goal + reform + + + goal + is + + + goal + a + + + pursued + I + + + pursued + have + + + goal + pursued + + + years + 8 + + + pursued + years + + + + + + + I + I + 11833 + 11834 + PRP + O + + + shall + shall + 11835 + 11840 + MD + O + + + con + con + 11841 + 11844 + VB + O + + + - + - + 11844 + 11845 + : + O + + + tinue + tinue + 11847 + 11852 + NN + O + + + to + to + 11853 + 11855 + TO + O + + + press + press + 11856 + 11861 + VB + O + + + for + for + 11862 + 11865 + IN + O + + + It + it + 11866 + 11868 + PRP + O + + + , + , + 11868 + 11869 + , + O + + + and + and + 11870 + 11873 + CC + O + + + I + I + 11874 + 11875 + PRP + O + + + hope + hope + 11876 + 11880 + VBP + O + + + that + that + 11881 + 11885 + IN + O + + + this + this + 11887 + 11891 + DT + DATE + THIS P1Y + + + + year + year + 11892 + 11896 + NN + DATE + THIS P1Y + + + + we + we + 11897 + 11899 + PRP + O + + + will + will + 11900 + 11904 + MD + O + + + finally + finally + 11905 + 11912 + RB + O + + + see + see + 11913 + 11916 + VB + O + + + an + a + 11917 + 11919 + DT + O + + + oppor + oppor + 11920 + 11925 + NN + O + + + - + - + 11925 + 11926 + : + O + + + tunity + tunity + 11928 + 11934 + NN + O + + + to + to + 11935 + 11937 + TO + O + + + take + take + 11938 + 11942 + VB + O + + + effective + effective + 11943 + 11952 + JJ + O + + + action + action + 11953 + 11959 + NN + O + + + . + . + 11959 + 11960 + . 
+ O + + + (ROOT (S (S (NP (PRP I)) (VP (MD shall) (VP (VB con) (: -) (S (NP (NN tinue)) (VP (TO to) (VP (VB press) (PP (IN for) (NP (PRP It))))))))) (, ,) (CC and) (S (NP (PRP I)) (VP (VBP hope) (SBAR (IN that) (S (NP-TMP (DT this) (NN year)) (NP (PRP we)) (VP (MD will) (ADVP (RB finally)) (VP (VB see) (NP (DT an) (NN oppor) (: -) (NN tunity)) (S (VP (TO to) (VP (VB take) (NP (JJ effective) (NN action))))))))))) (. .))) + + + ROOT + con + + + con + I + + + con + shall + + + press + tinue + + + press + to + + + con + press + + + press + for + + + for + It + + + con + and + + + hope + I + + + con + hope + + + see + that + + + year + this + + + see + year + + + see + we + + + see + will + + + see + finally + + + hope + see + + + tunity + an + + + tunity + oppor + + + see + tunity + + + take + to + + + see + take + + + action + effective + + + take + action + + + + + ROOT + con + + + con + I + + + con + shall + + + press + tinue + + + press + to + + + con + press + + + press + It + + + hope + I + + + con + hope + + + see + that + + + year + this + + + see + year + + + see + we + + + see + will + + + see + finally + + + hope + see + + + tunity + an + + + tunity + oppor + + + see + tunity + + + take + to + + + see + take + + + action + effective + + + take + action + + + + + ROOT + con + + + con + I + + + con + shall + + + press + tinue + + + press + to + + + con + press + + + press + It + + + hope + I + + + con + hope + + + see + that + + + year + this + + + see + year + + + see + we + + + see + will + + + see + finally + + + hope + see + + + tunity + an + + + tunity + oppor + + + see + tunity + + + take + to + + + see + take + + + action + effective + + + take + action + + + + + + + THE + the + 11967 + 11970 + DT + O + + + RETIREMENT + retirement + 11971 + 11981 + NN + O + + + OF + of + 11982 + 11984 + IN + O + + + MAX + MAX + 11985 + 11988 + NNP + O + + + BARBER + BARBER + 12004 + 12010 + NNP + O + + + Mr. + Mr. 
+ 12014 + 12017 + NNP + O + + + MITCHELL + MITCHELL + 12018 + 12026 + NNP + PERSON + + + . + . + 12026 + 12027 + . + O + + + (ROOT (NP (NP (DT THE) (NN RETIREMENT)) (PP (IN OF) (NP (NNP MAX) (NNP BARBER) (NNP Mr.) (NNP MITCHELL))) (. .))) + + + ROOT + RETIREMENT + + + RETIREMENT + THE + + + RETIREMENT + OF + + + MITCHELL + MAX + + + MITCHELL + BARBER + + + MITCHELL + Mr. + + + OF + MITCHELL + + + + + ROOT + RETIREMENT + + + RETIREMENT + THE + + + MITCHELL + MAX + + + MITCHELL + BARBER + + + MITCHELL + Mr. + + + RETIREMENT + MITCHELL + + + + + ROOT + RETIREMENT + + + RETIREMENT + THE + + + MITCHELL + MAX + + + MITCHELL + BARBER + + + MITCHELL + Mr. + + + RETIREMENT + MITCHELL + + + + + + + Mr. + Mr. + 12028 + 12031 + NNP + O + + + President + President + 12032 + 12041 + NNP + O + + + , + , + 12041 + 12042 + , + O + + + I + I + 12043 + 12044 + PRP + O + + + would + would + 12046 + 12051 + MD + O + + + like + like + 12052 + 12056 + VB + O + + + to + to + 12057 + 12059 + TO + O + + + take + take + 12060 + 12064 + VB + O + + + this + this + 12065 + 12069 + DT + O + + + opportunity + opportunity + 12070 + 12081 + NN + O + + + to + to + 12082 + 12084 + TO + O + + + acknowledge + acknowledge + 12086 + 12097 + VB + O + + + the + the + 12098 + 12101 + DT + O + + + recent + recent + 12102 + 12108 + JJ + O + + + retirement + retirement + 12109 + 12119 + NN + O + + + of + of + 12120 + 12122 + IN + O + + + Max + Max + 12124 + 12127 + NNP + PERSON + + + Barber + Barber + 12128 + 12134 + NNP + PERSON + + + , + , + 12134 + 12135 + , + O + + + who + who + 12136 + 12139 + WP + O + + + served + serve + 12140 + 12146 + VBD + O + + + as + as + 12147 + 12149 + IN + O + + + superin + superin + 12150 + 12157 + NN + O + + + - + - + 12157 + 12158 + : + O + + + tendent + tendent + 12160 + 12167 + NN + O + + + of + of + 12168 + 12170 + IN + O + + + the + the + 12171 + 12174 + DT + O + + + Senate + Senate + 12175 + 12181 + NNP + ORGANIZATION + + + Radio-TV + Radio-TV + 12182 + 12190 + NNP + O + 
+ + Gal + Gal + 12191 + 12194 + NNP + O + + + - + - + 12194 + 12195 + : + O + + + lery + lery + 12197 + 12201 + NN + O + + + . + . + 12201 + 12202 + . + O + + + (ROOT (S (NP (NNP Mr.) (NNP President)) (, ,) (NP (PRP I)) (VP (MD would) (VP (VB like) (S (VP (TO to) (VP (VB take) (NP (DT this) (NN opportunity) (S (VP (TO to) (VP (VB acknowledge) (NP (NP (DT the) (JJ recent) (NN retirement)) (PP (IN of) (NP (NP (NNP Max) (NNP Barber)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBD served) (PP (IN as) (NP (NP (NN superin)) (: -) (NP (NP (NN tendent)) (PP (IN of) (NP (DT the) (NNP Senate) (NNP Radio-TV) (NNP Gal)))) (: -) (NP (NN lery))))))))))))))))))) (. .))) + + + ROOT + like + + + President + Mr. + + + like + President + + + like + I + + + like + would + + + take + to + + + like + take + + + opportunity + this + + + take + opportunity + + + acknowledge + to + + + opportunity + acknowledge + + + retirement + the + + + retirement + recent + + + acknowledge + retirement + + + retirement + of + + + Barber + Max + + + of + Barber + + + served + who + + + Barber + served + + + served + as + + + as + superin + + + superin + tendent + + + tendent + of + + + Gal + the + + + Gal + Senate + + + Gal + Radio-TV + + + of + Gal + + + superin + lery + + + + + ROOT + like + + + President + Mr. + + + like + President + + + like + I + + + like + would + + + take + to + + + like + take + + + opportunity + this + + + take + opportunity + + + acknowledge + to + + + opportunity + acknowledge + + + retirement + the + + + retirement + recent + + + acknowledge + retirement + + + Barber + Max + + + retirement + Barber + + + served + who + + + Barber + served + + + served + superin + + + superin + tendent + + + Gal + the + + + Gal + Senate + + + Gal + Radio-TV + + + tendent + Gal + + + superin + lery + + + + + ROOT + like + + + President + Mr. 
+ + + like + President + + + like + I + + + like + would + + + take + to + + + like + take + + + opportunity + this + + + take + opportunity + + + acknowledge + to + + + opportunity + acknowledge + + + retirement + the + + + retirement + recent + + + acknowledge + retirement + + + Barber + Max + + + retirement + Barber + + + served + who + + + Barber + served + + + served + superin + + + superin + tendent + + + Gal + the + + + Gal + Senate + + + Gal + Radio-TV + + + tendent + Gal + + + superin + lery + + + + + + + Max + Max + 12206 + 12209 + NNP + PERSON + + + has + have + 12210 + 12213 + VBZ + O + + + been + be + 12214 + 12218 + VBN + O + + + a + a + 12219 + 12220 + DT + O + + + familiar + familiar + 12221 + 12229 + JJ + O + + + face + face + 12230 + 12234 + NN + O + + + in + in + 12235 + 12237 + IN + O + + + the + the + 12238 + 12241 + DT + O + + + U.S. + U.S. + 12243 + 12247 + NNP + LOCATION + + + Capitol + Capitol + 12248 + 12255 + NNP + O + + + for + for + 12256 + 12259 + IN + O + + + 38 + 38 + 12260 + 12262 + CD + DURATION + 38.0 + P38Y + + + years + year + 12263 + 12268 + NNS + NUMBER + 0.0 + P38Y + + + . + . + 12268 + 12269 + . + O + + + (ROOT (S (NP (NNP Max)) (VP (VBZ has) (VP (VBN been) (NP (NP (DT a) (JJ familiar) (NN face)) (PP (IN in) (NP (DT the) (NNP U.S.) (NNP Capitol)))) (PP (IN for) (NP (CD 38) (NNS years))))) (. .))) + + + ROOT + face + + + face + Max + + + face + has + + + face + been + + + face + a + + + face + familiar + + + face + in + + + Capitol + the + + + Capitol + U.S. + + + in + Capitol + + + face + for + + + years + 38 + + + for + years + + + + + ROOT + face + + + face + Max + + + face + has + + + face + been + + + face + a + + + face + familiar + + + Capitol + the + + + Capitol + U.S. + + + face + Capitol + + + years + 38 + + + face + years + + + + + ROOT + face + + + face + Max + + + face + has + + + face + been + + + face + a + + + face + familiar + + + Capitol + the + + + Capitol + U.S. 
+ + + face + Capitol + + + years + 38 + + + face + years + + + + + + + During + during + 12270 + 12276 + IN + O + + + those + those + 12277 + 12282 + DT + O + + + years + year + 12284 + 12289 + NNS + DURATION + PXY + + + , + , + 12289 + 12290 + , + O + + + Max + Max + 12291 + 12294 + NNP + PERSON + + + worked + work + 12295 + 12301 + VBD + O + + + as + as + 12302 + 12304 + IN + O + + + an + a + 12305 + 12307 + DT + O + + + elevator + elevator + 12308 + 12316 + NN + O + + + oper + oper + 12317 + 12321 + SYM + O + + + - + - + 12321 + 12322 + : + O + + + tor + tor + 12324 + 12327 + NN + O + + + , + , + 12327 + 12328 + , + O + + + served + serve + 12329 + 12335 + VBD + O + + + on + on + 12336 + 12338 + IN + O + + + the + the + 12339 + 12342 + DT + O + + + Capitol + Capitol + 12343 + 12350 + NNP + ORGANIZATION + + + Police + Police + 12351 + 12357 + NNP + ORGANIZATION + + + Force + Force + 12359 + 12364 + NNP + ORGANIZATION + + + , + , + 12364 + 12365 + , + O + + + and + and + 12366 + 12369 + CC + O + + + most + most + 12370 + 12374 + RBS + O + + + recently + recently + 12375 + 12383 + RB + DATE + PAST_REF + PAST_REF + + + was + be + 12384 + 12387 + VBD + O + + + the + the + 12388 + 12391 + DT + O + + + su + su + 12392 + 12394 + NN + O + + + - + - + 12394 + 12395 + : + O + + + perintendent + perintendent + 12397 + 12409 + NN + O + + + of + of + 12410 + 12412 + IN + O + + + the + the + 12413 + 12416 + DT + O + + + Senate + Senate + 12417 + 12423 + NNP + ORGANIZATION + + + Radio-TV + Radio-TV + 12424 + 12432 + NNP + O + + + Gallery + Gallery + 12434 + 12441 + NNP + O + + + , + , + 12441 + 12442 + , + O + + + where + where + 12443 + 12448 + WRB + O + + + for + for + 12449 + 12452 + IN + O + + + 17 + 17 + 12453 + 12455 + CD + DURATION + 17.0 + P17Y + + + years + year + 12456 + 12461 + NNS + NUMBER + 0.0 + P17Y + + + he + he + 12462 + 12464 + PRP + O + + + assisted + assist + 12465 + 12473 + VBD + O + + + our + we + 12475 + 12478 + PRP$ + O + + + friends + friend + 12479 + 
12486 + NNS + O + + + in + in + 12487 + 12489 + IN + O + + + the + the + 12490 + 12493 + DT + O + + + broadcast + broadcast + 12494 + 12503 + NN + O + + + media + media + 12504 + 12509 + NNS + O + + + . + . + 12509 + 12510 + . + O + + + (ROOT (S (PP (IN During) (NP (DT those) (NNS years))) (, ,) (NP (NNP Max)) (VP (VBD worked) (PP (IN as) (NP (NP (DT an) (NN elevator)) (SBAR (S (FRAG (X (SYM oper)) (: -) (NP (NN tor))) (, ,) (VP (VP (VBD served) (PP (IN on) (NP (DT the) (NNP Capitol) (NNP Police) (NNP Force)))) (, ,) (CC and) (VP (ADVP (RBS most) (RB recently)) (VBD was) (NP (NP (DT the) (NN su)) (: -) (NP (NP (NN perintendent)) (PP (IN of) (NP (DT the) (NNP Senate) (NNP Radio-TV) (NNP Gallery))))))) (, ,))))) (SBAR (WHADVP (WRB where)) (S (PP (IN for) (NP (CD 17) (NNS years))) (NP (PRP he)) (VP (VBD assisted) (NP (PRP$ our) (NNS friends)) (PP (IN in) (NP (DT the) (NN broadcast) (NNS media))))))) (. .))) + + + ROOT + worked + + + worked + During + + + years + those + + + During + years + + + worked + Max + + + worked + as + + + elevator + an + + + as + elevator + + + tor + oper + + + served + tor + + + elevator + served + + + served + on + + + Force + the + + + Force + Capitol + + + Force + Police + + + on + Force + + + served + and + + + recently + most + + + su + recently + + + su + was + + + su + the + + + served + su + + + su + perintendent + + + perintendent + of + + + Gallery + the + + + Gallery + Senate + + + Gallery + Radio-TV + + + of + Gallery + + + assisted + where + + + assisted + for + + + years + 17 + + + for + years + + + assisted + he + + + worked + assisted + + + friends + our + + + assisted + friends + + + assisted + in + + + media + the + + + media + broadcast + + + in + media + + + + + ROOT + worked + + + years + those + + + worked + years + + + worked + Max + + + elevator + an + + + worked + elevator + + + tor + oper + + + served + tor + + + elevator + served + + + Force + the + + + Force + Capitol + + + Force + Police + + + served + Force + + 
+ recently + most + + + su + recently + + + su + was + + + su + the + + + served + su + + + su + perintendent + + + Gallery + the + + + Gallery + Senate + + + Gallery + Radio-TV + + + perintendent + Gallery + + + assisted + where + + + years + 17 + + + assisted + years + + + assisted + he + + + worked + assisted + + + friends + our + + + assisted + friends + + + media + the + + + media + broadcast + + + assisted + media + + + + + ROOT + worked + + + years + those + + + worked + years + + + worked + Max + + + elevator + an + + + worked + elevator + + + tor + oper + + + served + tor + + + elevator + served + + + Force + the + + + Force + Capitol + + + Force + Police + + + served + Force + + + recently + most + + + su + recently + + + su + was + + + su + the + + + elevator + su + + + served + su + + + su + perintendent + + + Gallery + the + + + Gallery + Senate + + + Gallery + Radio-TV + + + perintendent + Gallery + + + assisted + where + + + years + 17 + + + assisted + years + + + assisted + he + + + worked + assisted + + + friends + our + + + assisted + friends + + + media + the + + + media + broadcast + + + assisted + media + + + + + + + Max + max + 12514 + 12517 + NN + PERSON + + + was + be + 12518 + 12521 + VBD + O + + + privileged + privileged + 12522 + 12532 + JJ + O + + + to + to + 12533 + 12535 + TO + O + + + witness + witness + 12536 + 12543 + NN + O + + + many + many + 12544 + 12548 + JJ + O + + + changes + change + 12550 + 12557 + NNS + O + + + that + that + 12558 + 12562 + WDT + O + + + have + have + 12563 + 12567 + VBP + O + + + occurred + occur + 12568 + 12576 + VBN + O + + + in + in + 12577 + 12579 + IN + O + + + the + the + 12580 + 12583 + DT + O + + + Congress + Congress + 12585 + 12593 + NNP + ORGANIZATION + + + . + . + 12593 + 12594 + . 
+ O + + + (ROOT (S (NP (NN Max)) (VP (VBD was) (ADJP (JJ privileged) (PP (TO to) (NP (NP (NN witness) (JJ many) (NNS changes)) (SBAR (WHNP (WDT that)) (S (VP (VBP have) (VP (VBN occurred) (PP (IN in) (NP (DT the) (NNP Congress))))))))))) (. .))) + + + ROOT + privileged + + + privileged + Max + + + privileged + was + + + privileged + to + + + changes + witness + + + changes + many + + + to + changes + + + occurred + that + + + occurred + have + + + changes + occurred + + + occurred + in + + + Congress + the + + + in + Congress + + + + + ROOT + privileged + + + privileged + Max + + + privileged + was + + + changes + witness + + + changes + many + + + privileged + changes + + + occurred + that + + + occurred + have + + + changes + occurred + + + Congress + the + + + occurred + Congress + + + + + ROOT + privileged + + + privileged + Max + + + privileged + was + + + changes + witness + + + changes + many + + + privileged + changes + + + occurred + that + + + occurred + have + + + changes + occurred + + + Congress + the + + + occurred + Congress + + + + + + + I + I + 12595 + 12596 + PRP + O + + + was + be + 12597 + 12600 + VBD + O + + + privileged + privileged + 12601 + 12611 + JJ + O + + + to + to + 12612 + 12614 + TO + O + + + have + have + 12615 + 12619 + VB + O + + + his + he + 12620 + 12623 + PRP$ + O + + + support + support + 12625 + 12632 + NN + O + + + and + and + 12633 + 12636 + CC + O + + + assistance + assistance + 12637 + 12647 + NN + O + + + during + during + 12648 + 12654 + IN + O + + + my + my + 12655 + 12657 + PRP$ + O + + + first + first + 12658 + 12663 + JJ + ORDINAL + 1.0 + + + year + year + 12665 + 12669 + NN + DURATION + P1Y + + + as + as + 12670 + 12672 + IN + O + + + majority + majority + 12673 + 12681 + NN + O + + + leader + leader + 12682 + 12688 + NN + O + + + . + . + 12688 + 12689 + . 
+ O + + + (ROOT (S (NP (PRP I)) (VP (VBD was) (ADJP (JJ privileged) (S (VP (TO to) (VP (VB have) (NP (PRP$ his) (NN support) (CC and) (NN assistance)) (PP (IN during) (NP (NP (PRP$ my) (JJ first) (NN year)) (PP (IN as) (NP (NN majority) (NN leader)))))))))) (. .))) + + + ROOT + privileged + + + privileged + I + + + privileged + was + + + have + to + + + privileged + have + + + support + his + + + have + support + + + support + and + + + support + assistance + + + have + during + + + year + my + + + year + first + + + during + year + + + year + as + + + leader + majority + + + as + leader + + + + + ROOT + privileged + + + privileged + I + + + privileged + was + + + have + to + + + privileged + have + + + support + his + + + have + support + + + support + assistance + + + year + my + + + year + first + + + have + year + + + leader + majority + + + year + leader + + + + + ROOT + privileged + + + privileged + I + + + privileged + was + + + have + to + + + privileged + have + + + support + his + + + have + support + + + have + assistance + + + support + assistance + + + year + my + + + year + first + + + have + year + + + leader + majority + + + year + leader + + + + + + + Shortly + shortly + 12693 + 12700 + RB + O + + + before + before + 12701 + 12707 + IN + O + + + the + the + 12708 + 12711 + DT + O + + + holidays + holiday + 12712 + 12720 + NNS + O + + + , + , + 12720 + 12721 + , + O + + + Max + Max + 12722 + 12725 + NNP + PERSON + + + an + a + 12726 + 12728 + DT + O + + + - + - + 12728 + 12729 + : + O + + + nounced + nounce + 12731 + 12738 + VBD + O + + + his + he + 12739 + 12742 + PRP$ + O + + + retirement + retirement + 12743 + 12753 + NN + O + + + . + . + 12753 + 12754 + . + O + + + (ROOT (FRAG (PP (RB Shortly) (IN before) (NP (NP (DT the) (NNS holidays)) (, ,) (NP (NNP Max) (DT an)))) (: -) (S (VP (VBD nounced) (NP (PRP$ his) (NN retirement)))) (. 
.))) + + + ROOT + nounced + + + before + Shortly + + + nounced + before + + + holidays + the + + + before + holidays + + + holidays + Max + + + Max + an + + + retirement + his + + + nounced + retirement + + + + + ROOT + nounced + + + nounced + Shortly + + + holidays + the + + + nounced + holidays + + + holidays + Max + + + Max + an + + + retirement + his + + + nounced + retirement + + + + + ROOT + nounced + + + nounced + Shortly + + + holidays + the + + + nounced + holidays + + + holidays + Max + + + Max + an + + + retirement + his + + + nounced + retirement + + + + + + + I + I + 12755 + 12756 + PRP + O + + + understand + understand + 12757 + 12767 + VBP + O + + + that + that + 12769 + 12773 + IN + O + + + he + he + 12774 + 12776 + PRP + O + + + and + and + 12777 + 12780 + CC + O + + + his + he + 12781 + 12784 + PRP$ + O + + + wife + wife + 12785 + 12789 + NN + O + + + , + , + 12789 + 12790 + , + O + + + Sylvia + Sylvia + 12791 + 12797 + NNP + PERSON + + + , + , + 12797 + 12798 + , + O + + + are + be + 12799 + 12802 + VBP + O + + + now + now + 12803 + 12806 + RB + DATE + PRESENT_REF + PRESENT_REF + + + enjoying + enjoy + 12808 + 12816 + VBG + O + + + the + the + 12817 + 12820 + DT + O + + + sunny + sunny + 12821 + 12826 + JJ + O + + + skies + sky + 12827 + 12832 + NNS + O + + + of + of + 12833 + 12835 + IN + O + + + Florida + Florida + 12836 + 12843 + NNP + LOCATION + + + for + for + 12844 + 12847 + IN + O + + + NGRESSIONAL + NGRESSIONAL + 12851 + 12862 + NNP + O + + + RECORD-SENATE + RECORD-SENATE + 12863 + 12876 + NNP + O + + + the + the + 12880 + 12883 + DT + O + + + winter + winter + 12884 + 12890 + NN + DATE + XXXX-WI + XXXX-WI + + + months + month + 12891 + 12897 + NNS + DURATION + PXM + + + . + . + 12897 + 12898 + . 
+ O + + + (ROOT (S (NP (PRP I)) (VP (VBP understand) (SBAR (IN that) (S (NP (NP (NP (PRP he)) (CC and) (NP (PRP$ his) (NN wife))) (, ,) (NP (NNP Sylvia)) (, ,)) (VP (VBP are) (ADVP (RB now)) (VP (VBG enjoying) (NP (NP (DT the) (JJ sunny) (NNS skies)) (PP (IN of) (NP (NNP Florida)))) (PP (IN for) (NP (NP (NNP NGRESSIONAL) (NNP RECORD-SENATE)) (NP (DT the) (NN winter) (NNS months))))))))) (. .))) + + + ROOT + understand + + + understand + I + + + enjoying + that + + + enjoying + he + + + he + and + + + wife + his + + + he + wife + + + he + Sylvia + + + enjoying + are + + + enjoying + now + + + understand + enjoying + + + skies + the + + + skies + sunny + + + enjoying + skies + + + skies + of + + + of + Florida + + + enjoying + for + + + RECORD-SENATE + NGRESSIONAL + + + for + RECORD-SENATE + + + months + the + + + months + winter + + + RECORD-SENATE + months + + + + + ROOT + understand + + + understand + I + + + enjoying + that + + + enjoying + he + + + wife + his + + + he + wife + + + he + Sylvia + + + enjoying + are + + + enjoying + now + + + understand + enjoying + + + skies + the + + + skies + sunny + + + enjoying + skies + + + skies + Florida + + + RECORD-SENATE + NGRESSIONAL + + + enjoying + RECORD-SENATE + + + months + the + + + months + winter + + + RECORD-SENATE + months + + + + + ROOT + understand + + + understand + I + + + enjoying + that + + + enjoying + he + + + wife + his + + + he + wife + + + enjoying + wife + + + he + Sylvia + + + enjoying + are + + + enjoying + now + + + understand + enjoying + + + skies + the + + + skies + sunny + + + enjoying + skies + + + skies + Florida + + + RECORD-SENATE + NGRESSIONAL + + + enjoying + RECORD-SENATE + + + months + the + + + months + winter + + + RECORD-SENATE + months + + + + + + + On + on + 12899 + 12901 + IN + O + + + behalf + behalf + 12902 + 12908 + NN + O + + + of + of + 12909 + 12911 + IN + O + + + all + all + 12912 + 12915 + DT + O + + + my + my + 12917 + 12919 + PRP$ + O + + + colleagues + colleague + 
12920 + 12930 + NNS + O + + + in + in + 12931 + 12933 + IN + O + + + Congress + Congress + 12934 + 12942 + NNP + ORGANIZATION + + + , + , + 12942 + 12943 + , + O + + + I + I + 12944 + 12945 + PRP + O + + + wish + wish + 12946 + 12950 + VBP + O + + + Max + max + 12951 + 12954 + NN + PERSON + + + and + and + 12956 + 12959 + CC + O + + + Sylvia + Sylvia + 12960 + 12966 + NNP + PERSON + + + a + a + 12967 + 12968 + DT + O + + + most + most + 12969 + 12973 + RBS + O + + + happy + happy + 12974 + 12979 + JJ + O + + + and + and + 12980 + 12983 + CC + O + + + healthy + healthy + 12984 + 12991 + JJ + O + + + retirement + retirement + 12993 + 13003 + NN + O + + + . + . + 13003 + 13004 + . + O + + + (ROOT (S (PP (IN On) (NP (NP (NN behalf)) (PP (IN of) (NP (DT all) (PRP$ my) (NNS colleagues))) (PP (IN in) (NP (NNP Congress))))) (, ,) (NP (PRP I)) (VP (VBP wish) (NP (NP (NN Max)) (CC and) (NP (NP (NNP Sylvia)) (NP (DT a) (ADJP (RBS most) (JJ happy) (CC and) (JJ healthy)) (NN retirement))))) (. .))) + + + ROOT + wish + + + wish + On + + + On + behalf + + + behalf + of + + + colleagues + all + + + colleagues + my + + + of + colleagues + + + behalf + in + + + in + Congress + + + wish + I + + + wish + Max + + + Max + and + + + Max + Sylvia + + + retirement + a + + + happy + most + + + retirement + happy + + + happy + and + + + happy + healthy + + + Sylvia + retirement + + + + + ROOT + wish + + + colleagues + all + + + colleagues + my + + + wish + colleagues + + + wish + Congress + + + wish + I + + + wish + Max + + + Max + Sylvia + + + retirement + a + + + happy + most + + + retirement + happy + + + happy + healthy + + + Sylvia + retirement + + + + + ROOT + wish + + + colleagues + all + + + colleagues + my + + + wish + colleagues + + + wish + Congress + + + wish + I + + + wish + Max + + + wish + Sylvia + + + Max + Sylvia + + + retirement + a + + + happy + most + + + retirement + happy + + + happy + healthy + + + retirement + healthy + + + Sylvia + retirement + + + + + + + He + he + 
13005 + 13007 + PRP + O + + + will + will + 13008 + 13012 + MD + O + + + be + be + 13013 + 13015 + VB + O + + + missed + miss + 13016 + 13022 + VBN + O + + + . + . + 13022 + 13023 + . + O + + + (ROOT (S (NP (PRP He)) (VP (MD will) (VP (VB be) (VP (VBN missed)))) (. .))) + + + ROOT + missed + + + missed + He + + + missed + will + + + missed + be + + + + + ROOT + missed + + + missed + He + + + missed + will + + + missed + be + + + + + ROOT + missed + + + missed + He + + + missed + will + + + missed + be + + + + + + + I + I + 13027 + 13028 + PRP + O + + + also + also + 13029 + 13033 + RB + O + + + want + want + 13034 + 13038 + VBP + O + + + to + to + 13039 + 13041 + TO + O + + + take + take + 13042 + 13046 + VB + O + + + this + this + 13047 + 13051 + DT + O + + + opportunity + opportunity + 13052 + 13063 + NN + O + + + to + to + 13065 + 13067 + TO + O + + + extend + extend + 13068 + 13074 + VB + O + + + to + to + 13075 + 13077 + TO + O + + + his + he + 13078 + 13081 + PRP$ + O + + + successor + successor + 13082 + 13091 + NN + O + + + , + , + 13091 + 13092 + , + O + + + Larry + Larry + 13093 + 13098 + NNP + PERSON + + + Jane + Jane + 13099 + 13103 + NNP + PERSON + + + - + - + 13103 + 13104 + : + O + + + zich + zich + 13106 + 13110 + NN + O + + + , + , + 13110 + 13111 + , + O + + + the + the + 13112 + 13115 + DT + O + + + very + very + 13116 + 13120 + RB + O + + + best + best + 13121 + 13125 + JJS + O + + + wishes + wish + 13126 + 13132 + NNS + O + + + in + in + 13133 + 13135 + IN + O + + + his + he + 13136 + 13139 + PRP$ + O + + + new + new + 13140 + 13143 + JJ + O + + + role + role + 13145 + 13149 + NN + O + + + . + . + 13149 + 13150 + . 
+ O + + + (ROOT (S (NP (PRP I)) (ADVP (RB also)) (VP (VBP want) (S (VP (TO to) (VP (VB take) (NP (DT this) (NN opportunity) (S (VP (TO to) (VP (VB extend) (PP (TO to) (NP (NP (PRP$ his) (NN successor)) (, ,) (NP (NP (NNP Larry) (NNP Jane)) (: -) (NP (NP (NN zich)) (, ,) (NP (NP (DT the) (ADJP (RB very) (JJS best)) (NNS wishes)) (PP (IN in) (NP (PRP$ his) (JJ new) (NN role)))))))))))))))) (. .))) + + + ROOT + want + + + want + I + + + want + also + + + take + to + + + want + take + + + opportunity + this + + + take + opportunity + + + extend + to + + + opportunity + extend + + + extend + to + + + successor + his + + + to + successor + + + Jane + Larry + + + successor + Jane + + + Jane + zich + + + wishes + the + + + best + very + + + wishes + best + + + zich + wishes + + + wishes + in + + + role + his + + + role + new + + + in + role + + + + + ROOT + want + + + want + I + + + want + also + + + take + to + + + want + take + + + opportunity + this + + + take + opportunity + + + extend + to + + + opportunity + extend + + + successor + his + + + extend + successor + + + Jane + Larry + + + successor + Jane + + + Jane + zich + + + wishes + the + + + best + very + + + wishes + best + + + zich + wishes + + + role + his + + + role + new + + + wishes + role + + + + + ROOT + want + + + want + I + + + want + also + + + take + to + + + want + take + + + opportunity + this + + + take + opportunity + + + extend + to + + + opportunity + extend + + + successor + his + + + extend + successor + + + Jane + Larry + + + successor + Jane + + + Jane + zich + + + wishes + the + + + best + very + + + wishes + best + + + zich + wishes + + + role + his + + + role + new + + + wishes + role + + + + + + + I + I + 13151 + 13152 + PRP + O + + + know + know + 13153 + 13157 + VBP + O + + + he + he + 13158 + 13160 + PRP + O + + + is + be + 13161 + 13163 + VBZ + O + + + up + up + 13164 + 13166 + RB + O + + + to + to + 13167 + 13169 + TO + O + + + the + the + 13170 + 13173 + DT + O + + + task + task + 
13174 + 13178 + NN + O + + + . + . + 13178 + 13179 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP know) (SBAR (S (NP (PRP he)) (VP (VBZ is) (ADVP (RB up) (PP (TO to) (NP (DT the) (NN task)))))))) (. .))) + + + ROOT + know + + + know + I + + + is + he + + + know + is + + + is + up + + + up + to + + + task + the + + + to + task + + + + + ROOT + know + + + know + I + + + is + he + + + know + is + + + is + up + + + task + the + + + up + task + + + + + ROOT + know + + + know + I + + + is + he + + + know + is + + + is + up + + + task + the + + + up + task + + + + + + + AN + a + 13182 + 13184 + DT + O + + + ENVIRONMENTAL + environmental + 13185 + 13198 + JJ + O + + + DIVIDEND + dividend + 13199 + 13207 + NN + O + + + : + : + 13207 + 13208 + : + O + + + CAPITALIZING + capitalize + 13212 + 13224 + VBG + O + + + ON + on + 13225 + 13227 + IN + O + + + NEW + NEW + 13228 + 13231 + NNP + O + + + OPPOR + OPPOR + 13232 + 13237 + NNP + O + + + - + - + 13237 + 13238 + : + O + + + TUNITIES + TUNITIES + 13242 + 13250 + NNP + O + + + FOR + for + 13251 + 13254 + IN + O + + + INTERNATION + INTERNATION + 13255 + 13266 + NNP + ORGANIZATION + + + - + - + 13266 + 13267 + : + O + + + AL + AL + 13271 + 13273 + NNP + O + + + ACTION + ACTION + 13274 + 13280 + NNP + O + + + Mr. + Mr. + 13284 + 13287 + NNP + O + + + MITCHELL + MITCHELL + 13288 + 13296 + NNP + PERSON + + + . + . + 13296 + 13297 + . + O + + + (ROOT (NP (NP (DT AN) (JJ ENVIRONMENTAL) (NN DIVIDEND)) (: :) (NP (NP (VBG CAPITALIZING)) (PP (IN ON) (NP (NNP NEW) (NNP OPPOR)))) (: -) (NP (NP (NNP TUNITIES)) (PP (IN FOR) (NP (NNP INTERNATION)))) (: -) (NP (NP (NNP AL) (NNP ACTION)) (NP (NNP Mr.) (NNP MITCHELL))) (. .))) + + + ROOT + DIVIDEND + + + DIVIDEND + AN + + + DIVIDEND + ENVIRONMENTAL + + + DIVIDEND + CAPITALIZING + + + CAPITALIZING + ON + + + OPPOR + NEW + + + ON + OPPOR + + + DIVIDEND + TUNITIES + + + TUNITIES + FOR + + + FOR + INTERNATION + + + ACTION + AL + + + DIVIDEND + ACTION + + + MITCHELL + Mr. 
+ + + ACTION + MITCHELL + + + + + ROOT + DIVIDEND + + + DIVIDEND + AN + + + DIVIDEND + ENVIRONMENTAL + + + DIVIDEND + CAPITALIZING + + + OPPOR + NEW + + + CAPITALIZING + OPPOR + + + DIVIDEND + TUNITIES + + + TUNITIES + INTERNATION + + + ACTION + AL + + + DIVIDEND + ACTION + + + MITCHELL + Mr. + + + ACTION + MITCHELL + + + + + ROOT + DIVIDEND + + + DIVIDEND + AN + + + DIVIDEND + ENVIRONMENTAL + + + DIVIDEND + CAPITALIZING + + + OPPOR + NEW + + + CAPITALIZING + OPPOR + + + DIVIDEND + TUNITIES + + + TUNITIES + INTERNATION + + + ACTION + AL + + + DIVIDEND + ACTION + + + MITCHELL + Mr. + + + ACTION + MITCHELL + + + + + + + Mr. + Mr. + 13298 + 13301 + NNP + O + + + President + President + 13302 + 13311 + NNP + O + + + , + , + 13311 + 13312 + , + O + + + I + I + 13313 + 13314 + PRP + O + + + ask + ask + 13316 + 13319 + VBP + O + + + unanimous + unanimous + 13320 + 13329 + JJ + O + + + consent + consent + 13330 + 13337 + NN + O + + + that + that + 13338 + 13342 + IN + O + + + a + a + 13343 + 13344 + DT + O + + + speech + speech + 13345 + 13351 + NN + O + + + given + give + 13353 + 13358 + VBN + O + + + by + by + 13359 + 13361 + IN + O + + + the + the + 13362 + 13365 + DT + O + + + distinguished + distinguished + 13366 + 13379 + JJ + O + + + chairman + chairman + 13380 + 13388 + NN + O + + + of + of + 13390 + 13392 + IN + O + + + the + the + 13393 + 13396 + DT + O + + + Senate + Senate + 13397 + 13403 + NNP + ORGANIZATION + + + Foreign + Foreign + 13404 + 13411 + NNP + ORGANIZATION + + + Relations + Relations + 13412 + 13421 + NNP + ORGANIZATION + + + Com + Com + 13422 + 13425 + NNP + ORGANIZATION + + + - + - + 13425 + 13426 + : + O + + + mittee + mittee + 13428 + 13434 + NN + O + + + , + , + 13434 + 13435 + , + O + + + Senator + Senator + 13436 + 13443 + NNP + O + + + CLAIBORNE + CLAIBORNE + 13444 + 13453 + NNP + O + + + PELL + PELL + 13454 + 13458 + NNP + O + + + of + of + 13459 + 13461 + IN + O + + + Rhode + Rhode + 13463 + 13468 + NNP + LOCATION + + + Island + Island + 
13469 + 13475 + NNP + LOCATION + + + , + , + 13475 + 13476 + , + O + + + be + be + 13477 + 13479 + VB + O + + + inserted + insert + 13480 + 13488 + VBN + O + + + in + in + 13489 + 13491 + IN + O + + + the + the + 13492 + 13495 + DT + O + + + RECORD + record + 13497 + 13503 + NN + O + + + . + . + 13503 + 13504 + . + O + + + (ROOT (S (NP (NNP Mr.) (NNP President)) (PRN (, ,) (S (NP (PRP I)) (VP (VBP ask) (NP (JJ unanimous) (NN consent)) (SBAR (IN that) (S (NP (DT a) (NN speech)) (VP (VBN given) (PP (IN by) (NP (NP (NP (DT the) (JJ distinguished) (NN chairman)) (PP (IN of) (NP (DT the) (NNP Senate) (NNP Foreign) (NNP Relations) (NNP Com)))) (: -) (NP (NP (NN mittee)) (, ,) (NP (NP (NNP Senator) (NNP CLAIBORNE) (NNP PELL)) (PP (IN of) (NP (NNP Rhode) (NNP Island)))))))))))) (, ,)) (VP (VB be) (VP (VBN inserted) (PP (IN in) (NP (DT the) (NN RECORD))))) (. .))) + + + ROOT + inserted + + + President + Mr. + + + inserted + President + + + ask + I + + + inserted + ask + + + consent + unanimous + + + ask + consent + + + given + that + + + speech + a + + + given + speech + + + ask + given + + + given + by + + + chairman + the + + + chairman + distinguished + + + by + chairman + + + chairman + of + + + Com + the + + + Com + Senate + + + Com + Foreign + + + Com + Relations + + + of + Com + + + chairman + mittee + + + PELL + Senator + + + PELL + CLAIBORNE + + + mittee + PELL + + + PELL + of + + + Island + Rhode + + + of + Island + + + inserted + be + + + inserted + in + + + RECORD + the + + + in + RECORD + + + + + ROOT + inserted + + + President + Mr. 
+ + + inserted + President + + + ask + I + + + inserted + ask + + + consent + unanimous + + + ask + consent + + + given + that + + + speech + a + + + given + speech + + + ask + given + + + chairman + the + + + chairman + distinguished + + + given + chairman + + + Com + the + + + Com + Senate + + + Com + Foreign + + + Com + Relations + + + chairman + Com + + + chairman + mittee + + + PELL + Senator + + + PELL + CLAIBORNE + + + mittee + PELL + + + Island + Rhode + + + PELL + Island + + + inserted + be + + + RECORD + the + + + inserted + RECORD + + + + + ROOT + inserted + + + President + Mr. + + + inserted + President + + + ask + I + + + inserted + ask + + + consent + unanimous + + + ask + consent + + + given + that + + + speech + a + + + given + speech + + + ask + given + + + chairman + the + + + chairman + distinguished + + + given + chairman + + + Com + the + + + Com + Senate + + + Com + Foreign + + + Com + Relations + + + chairman + Com + + + chairman + mittee + + + PELL + Senator + + + PELL + CLAIBORNE + + + mittee + PELL + + + Island + Rhode + + + PELL + Island + + + inserted + be + + + RECORD + the + + + inserted + RECORD + + + + + + + The + the + 13508 + 13511 + DT + O + + + honorable + honorable + 13512 + 13521 + JJ + O + + + chairman + chairman + 13522 + 13530 + NN + O + + + of + of + 13531 + 13533 + IN + O + + + the + the + 13534 + 13537 + DT + O + + + For + for + 13538 + 13541 + IN + O + + + - + - + 13541 + 13542 + : + O + + + eign + eign + 13544 + 13548 + NN + O + + + Relations + Relations + 13549 + 13558 + NNP + ORGANIZATION + + + Committee + Committee + 13559 + 13568 + NNP + ORGANIZATION + + + recently + recently + 13569 + 13577 + RB + DATE + PAST_REF + PAST_REF + + + ad + ad + 13578 + 13580 + NN + O + + + - + - + 13580 + 13581 + : + O + + + dressed + dress + 13583 + 13590 + VBN + O + + + the + the + 13591 + 13594 + DT + O + + + Global + global + 13595 + 13601 + JJ + O + + + Forum + Forum + 13602 + 13607 + NNP + O + + + on + on + 13608 + 13610 + IN + O 
+ + + Environ + Environ + 13611 + 13618 + NNP + LOCATION + + + - + - + 13618 + 13619 + : + O + + + ment + ment + 13621 + 13625 + NN + O + + + and + and + 13626 + 13629 + CC + O + + + Development + development + 13630 + 13641 + NN + O + + + for + for + 13642 + 13645 + IN + O + + + Survival + Survival + 13646 + 13654 + NNP + O + + + in + in + 13655 + 13657 + IN + O + + + Moscow + Moscow + 13659 + 13665 + NNP + LOCATION + + + . + . + 13665 + 13666 + . + O + + + (ROOT (X (X (NP (NP (DT The) (JJ honorable) (NN chairman)) (PP (IN of) (FRAG (X (X (DT the)) (PP (IN For))) (: -) (NP (NN eign))))) (NP (NNP Relations) (NNP Committee)) (NP (NP (RB recently)) (NP (NN ad)))) (: -) (VP (VBN dressed) (NP (NP (NP (DT the) (JJ Global) (NNP Forum)) (PP (IN on) (NP (NNP Environ)))) (: -) (NP (NP (NN ment) (CC and) (NN Development)) (PP (IN for) (NP (NNP Survival))))) (PP (IN in) (NP (NNP Moscow)))) (. .))) + + + ROOT + dressed + + + chairman + The + + + chairman + honorable + + + recently + chairman + + + chairman + of + + + For + the + + + eign + For + + + of + eign + + + Committee + Relations + + + recently + Committee + + + dressed + recently + + + recently + ad + + + Forum + the + + + Forum + Global + + + dressed + Forum + + + Forum + on + + + on + Environ + + + Forum + ment + + + ment + and + + + ment + Development + + + ment + for + + + for + Survival + + + dressed + in + + + in + Moscow + + + + + ROOT + dressed + + + chairman + The + + + chairman + honorable + + + recently + chairman + + + chairman + of + + + For + the + + + eign + For + + + of + eign + + + Committee + Relations + + + recently + Committee + + + dressed + recently + + + recently + ad + + + Forum + the + + + Forum + Global + + + dressed + Forum + + + Forum + Environ + + + Forum + ment + + + ment + Development + + + ment + Survival + + + dressed + Moscow + + + + + ROOT + dressed + + + chairman + The + + + chairman + honorable + + + recently + chairman + + + chairman + of + + + For + the + + + eign + For + + + of + 
eign + + + Committee + Relations + + + recently + Committee + + + dressed + recently + + + recently + ad + + + Forum + the + + + Forum + Global + + + dressed + Forum + + + Forum + Environ + + + Forum + ment + + + Forum + Development + + + ment + Development + + + ment + Survival + + + dressed + Moscow + + + + + + + His + he + 13667 + 13670 + PRP$ + O + + + remarks + remark + 13671 + 13678 + NNS + O + + + focus + focus + 13679 + 13684 + VBP + O + + + on + on + 13685 + 13687 + IN + O + + + the + the + 13688 + 13691 + DT + O + + + cat + cat + 13692 + 13695 + NN + O + + + - + - + 13695 + 13696 + : + O + + + astrophic + astrophic + 13698 + 13707 + JJ + O + + + threats + threat + 13708 + 13715 + NNS + O + + + to + to + 13716 + 13718 + TO + O + + + the + the + 13719 + 13722 + DT + O + + + world + world + 13723 + 13728 + NN + O + + + 's + 's + 13728 + 13730 + POS + O + + + envi + envi + 13731 + 13735 + SYM + O + + + - + - + 13735 + 13736 + : + O + + + ronment-including + ronment-including + 13738 + 13755 + NN + O + + + global + global + 13760 + 13766 + JJ + O + + + climate + climate + 13769 + 13776 + NN + O + + + change + change + 13778 + 13784 + NN + O + + + , + , + 13784 + 13785 + , + O + + + ozone + ozone + 13786 + 13791 + NN + O + + + depletion + depletion + 13792 + 13801 + NN + O + + + and + and + 13802 + 13805 + CC + O + + + a + a + 13806 + 13807 + DT + O + + + host + host + 13808 + 13812 + NN + O + + + of + of + 13813 + 13815 + IN + O + + + problems + problem + 13817 + 13825 + NNS + O + + + that + that + 13826 + 13830 + WDT + O + + + require + require + 13831 + 13838 + VBP + O + + + international + international + 13839 + 13852 + JJ + O + + + cooperation + cooperation + 13854 + 13865 + NN + O + + + . + . + 13865 + 13866 + . 
+ O + + + (ROOT (S (NP (PRP$ His) (NNS remarks)) (VP (VBP focus) (PP (IN on) (NP (NP (DT the) (NN cat)) (: -) (NP (NP (JJ astrophic) (NNS threats)) (PP (TO to) (NP (NP (DT the) (NN world) (POS 's)) (NP (FRAG (X (SYM envi)) (: -) (NP (NN ronment-including))))))))) (NP (NP (JJ global) (NN climate) (NN change)) (, ,) (NP (NN ozone) (NN depletion)) (CC and) (NP (NP (DT a) (NN host)) (PP (IN of) (NP (NP (NNS problems)) (SBAR (WHNP (WDT that)) (S (VP (VBP require) (NP (JJ international) (NN cooperation)))))))))) (. .))) + + + ROOT + focus + + + remarks + His + + + focus + remarks + + + focus + on + + + cat + the + + + on + cat + + + threats + astrophic + + + cat + threats + + + threats + to + + + world + the + + + to + world + + + world + 's + + + ronment-including + envi + + + world + ronment-including + + + change + global + + + change + climate + + + focus + change + + + depletion + ozone + + + change + depletion + + + change + and + + + host + a + + + change + host + + + host + of + + + of + problems + + + require + that + + + problems + require + + + cooperation + international + + + require + cooperation + + + + + ROOT + focus + + + remarks + His + + + focus + remarks + + + cat + the + + + focus + cat + + + threats + astrophic + + + cat + threats + + + world + the + + + threats + world + + + world + 's + + + ronment-including + envi + + + world + ronment-including + + + change + global + + + change + climate + + + focus + change + + + depletion + ozone + + + change + depletion + + + host + a + + + change + host + + + host + problems + + + require + that + + + problems + require + + + cooperation + international + + + require + cooperation + + + + + ROOT + focus + + + remarks + His + + + focus + remarks + + + cat + the + + + focus + cat + + + threats + astrophic + + + cat + threats + + + world + the + + + threats + world + + + world + 's + + + ronment-including + envi + + + world + ronment-including + + + change + global + + + change + climate + + + focus + change + 
+ + depletion + ozone + + + focus + depletion + + + change + depletion + + + host + a + + + focus + host + + + change + host + + + host + problems + + + require + that + + + problems + require + + + cooperation + international + + + require + cooperation + + + + + + + I + I + 13870 + 13871 + PRP + O + + + would + would + 13872 + 13877 + MD + O + + + like + like + 13878 + 13882 + VB + O + + + to + to + 13883 + 13885 + TO + O + + + call + call + 13886 + 13890 + VB + O + + + this + this + 13891 + 13895 + DT + O + + + important + important + 13896 + 13905 + JJ + O + + + speech + speech + 13907 + 13913 + NN + O + + + to + to + 13914 + 13916 + TO + O + + + the + the + 13917 + 13920 + DT + O + + + attention + attention + 13921 + 13930 + NN + O + + + of + of + 13931 + 13933 + IN + O + + + my + my + 13934 + 13936 + PRP$ + O + + + col + col + 13937 + 13940 + NN + O + + + - + - + 13940 + 13941 + : + O + + + leagues + league + 13943 + 13950 + NNS + O + + + . + . + 13950 + 13951 + . + O + + + (ROOT (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (VP (TO to) (VP (VB call) (NP (DT this) (JJ important) (NN speech)) (PP (TO to) (NP (NP (NP (DT the) (NN attention)) (PP (IN of) (NP (PRP$ my) (NN col)))) (: -) (NP (NNS leagues))))))))) (. 
.))) + + + ROOT + like + + + like + I + + + like + would + + + call + to + + + like + call + + + speech + this + + + speech + important + + + call + speech + + + call + to + + + attention + the + + + to + attention + + + attention + of + + + col + my + + + of + col + + + attention + leagues + + + + + ROOT + like + + + like + I + + + like + would + + + call + to + + + like + call + + + speech + this + + + speech + important + + + call + speech + + + attention + the + + + call + attention + + + col + my + + + attention + col + + + attention + leagues + + + + + ROOT + like + + + like + I + + + like + would + + + call + to + + + like + call + + + speech + this + + + speech + important + + + call + speech + + + attention + the + + + call + attention + + + col + my + + + attention + col + + + attention + leagues + + + + + + + Not + not + 13952 + 13955 + RB + O + + + only + only + 13956 + 13960 + RB + O + + + does + do + 13961 + 13965 + VBZ + O + + + it + it + 13966 + 13968 + PRP + O + + + deal + deal + 13969 + 13973 + VB + O + + + with + with + 13974 + 13978 + IN + O + + + one + one + 13979 + 13982 + CD + NUMBER + 1.0 + + + of + of + 13984 + 13986 + IN + O + + + the + the + 13987 + 13990 + DT + O + + + most + most + 13991 + 13995 + RBS + O + + + significant + significant + 13996 + 14007 + JJ + O + + + problems + problem + 14008 + 14016 + NNS + O + + + of + of + 14017 + 14019 + IN + O + + + our + we + 14021 + 14024 + PRP$ + O + + + times + time + 14025 + 14030 + NNS + O + + + , + , + 14030 + 14031 + , + O + + + it + it + 14032 + 14034 + PRP + O + + + does + do + 14035 + 14039 + VBZ + O + + + so + so + 14040 + 14042 + RB + O + + + with + with + 14043 + 14047 + IN + O + + + eloquence + eloquence + 14048 + 14057 + NN + O + + + and + and + 14059 + 14062 + CC + O + + + clarity + clarity + 14063 + 14070 + NN + O + + + . + . + 14070 + 14071 + . 
+ O + + + (ROOT (S (SINV (CONJP (RB Not) (RB only)) (VBZ does) (NP (PRP it)) (VP (VB deal) (PP (IN with) (NP (NP (CD one)) (PP (IN of) (NP (NP (DT the) (RBS most) (JJ significant) (NNS problems)) (PP (IN of) (NP (PRP$ our) (NNS times))))))))) (, ,) (NP (PRP it)) (VP (VBZ does) (ADVP (RB so)) (PP (IN with) (NP (NN eloquence) (CC and) (NN clarity)))) (. .))) + + + ROOT + does + + + only + Not + + + deal + only + + + deal + does + + + deal + it + + + does + deal + + + deal + with + + + with + one + + + one + of + + + problems + the + + + problems + most + + + problems + significant + + + of + problems + + + problems + of + + + times + our + + + of + times + + + does + it + + + does + so + + + does + with + + + with + eloquence + + + eloquence + and + + + eloquence + clarity + + + + + ROOT + does + + + only + Not + + + deal + only + + + deal + does + + + deal + it + + + does + deal + + + deal + one + + + problems + the + + + problems + most + + + problems + significant + + + one + problems + + + times + our + + + problems + times + + + does + it + + + does + so + + + does + eloquence + + + eloquence + clarity + + + + + ROOT + does + + + only + Not + + + deal + only + + + deal + does + + + deal + it + + + does + deal + + + deal + one + + + problems + the + + + problems + most + + + problems + significant + + + one + problems + + + times + our + + + problems + times + + + does + it + + + does + so + + + does + eloquence + + + does + clarity + + + eloquence + clarity + + + + + + + I + I + 14072 + 14073 + PRP + O + + + hope + hope + 14074 + 14078 + VBP + O + + + other + other + 14079 + 14084 + JJ + O + + + Senators + senator + 14085 + 14093 + NNS + O + + + will + will + 14094 + 14098 + MD + O + + + take + take + 14100 + 14104 + VB + O + + + the + the + 14105 + 14108 + DT + O + + + time + time + 14109 + 14113 + NN + O + + + to + to + 14114 + 14116 + TO + O + + + review + review + 14117 + 14123 + VB + O + + + Senator + Senator + 14124 + 14131 + NNP + O + + + PELL + PELL + 
14132 + 14136 + NNP + O + + + 'S + 's + 14136 + 14138 + POS + O + + + statement + statement + 14140 + 14149 + NN + O + + + . + . + 14149 + 14150 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP hope) (SBAR (S (NP (JJ other) (NNS Senators)) (VP (MD will) (VP (VB take) (NP (DT the) (NN time) (S (VP (TO to) (VP (VB review) (NP (NP (NNP Senator) (NNP PELL) (POS 'S)) (NN statement))))))))))) (. .))) + + + ROOT + hope + + + hope + I + + + Senators + other + + + take + Senators + + + take + will + + + hope + take + + + time + the + + + take + time + + + review + to + + + time + review + + + PELL + Senator + + + statement + PELL + + + PELL + 'S + + + review + statement + + + + + ROOT + hope + + + hope + I + + + Senators + other + + + take + Senators + + + take + will + + + hope + take + + + time + the + + + take + time + + + review + to + + + time + review + + + PELL + Senator + + + statement + PELL + + + review + statement + + + + + ROOT + hope + + + hope + I + + + Senators + other + + + take + Senators + + + take + will + + + hope + take + + + time + the + + + take + time + + + review + to + + + time + review + + + PELL + Senator + + + statement + PELL + + + review + statement + + + + + + + There + there + 14154 + 14159 + EX + O + + + being + be + 14160 + 14165 + VBG + O + + + no + no + 14166 + 14168 + DT + O + + + objection + objection + 14169 + 14178 + NN + O + + + , + , + 14178 + 14179 + , + O + + + the + the + 14180 + 14183 + DT + O + + + re + re + 14184 + 14186 + SYM + O + + + - + - + 14186 + 14187 + : + O + + + marks + mark + 14189 + 14194 + NNS + O + + + werc + werc + 14195 + 14199 + JJ + O + + + ordered + order + 14200 + 14207 + VBN + O + + + to + to + 14208 + 14210 + TO + O + + + be + be + 14211 + 14213 + VB + O + + + printed + print + 14214 + 14221 + VBN + O + + + in + in + 14222 + 14224 + IN + O + + + the + the + 14226 + 14229 + DT + O + + + RECORD + RECORD + 14230 + 14236 + NNP + O + + + , + , + 14236 + 14237 + , + O + + + as + as + 14238 + 14240 + IN + O + + + 
follows + follow + 14241 + 14248 + VBZ + O + + + : + : + 14248 + 14249 + : + O + + + AN + a + 14251 + 14253 + DT + O + + + ENVIRONMENTAL + environmental + 14254 + 14267 + JJ + O + + + DIVIDEND + dividend + 14268 + 14276 + NN + O + + + : + : + 14276 + 14277 + : + O + + + CAPITALIZING + capitalize + 14278 + 14290 + VBG + O + + + ON + on + 14294 + 14296 + IN + O + + + NEw + NEw + 14297 + 14300 + NNP + O + + + OPPORTUNITIES + OPPORTUNITIES + 14301 + 14314 + NNP + O + + + FOR + for + 14315 + 14318 + IN + O + + + INTERNATIONAL + INTERNATIONAL + 14319 + 14332 + NNP + O + + + AcTION + AcTION + 14336 + 14342 + NNP + O + + + -LRB- + -lrb- + 14344 + 14345 + -LRB- + O + + + Remarks + remark + 14345 + 14352 + NNS + O + + + by + by + 14353 + 14355 + IN + O + + + Senator + Senator + 14356 + 14363 + NNP + O + + + Claiborne + Claiborne + 14364 + 14373 + NNP + PERSON + + + Pell + Pell + 14374 + 14378 + NNP + PERSON + + + , + , + 14378 + 14379 + , + O + + + Global + Global + 14380 + 14386 + NNP + O + + + Forum + Forum + 14390 + 14395 + NNP + O + + + on + on + 14396 + 14398 + IN + O + + + Environment + Environment + 14399 + 14410 + NNP + O + + + and + and + 14411 + 14414 + CC + O + + + Development + Development + 14415 + 14426 + NNP + O + + + , + , + 14426 + 14427 + , + O + + + Moscow + Moscow + 14431 + 14437 + NNP + LOCATION + + + , + , + 14437 + 14438 + , + O + + + U.S.S.R. + U.S.S.R. 
+ 14439 + 14447 + NNP + ORGANIZATION + + + , + , + 14447 + 14448 + , + O + + + January + January + 14449 + 14456 + NNP + DATE + 1990-01-17 + 1990-01-17 + + + 17 + 17 + 14457 + 14459 + CD + DATE + 1990-01-17 + 1990-01-17 + + + , + , + 14459 + 14460 + , + DATE + 1990-01-17 + 1990-01-17 + + + 1990 + 1990 + 14461 + 14465 + CD + DATE + 1990-01-17 + 1990-01-17 + + + -RRB- + -rrb- + 14465 + 14466 + -RRB- + O + + + We + we + 14470 + 14472 + PRP + O + + + are + be + 14473 + 14476 + VBP + O + + + gathered + gather + 14477 + 14485 + VBN + O + + + here + here + 14486 + 14490 + RB + O + + + at + at + 14491 + 14493 + IN + O + + + an + a + 14494 + 14496 + DT + O + + + extraordinary + extraordinary + 14497 + 14510 + JJ + O + + + time + time + 14512 + 14516 + NN + O + + + in + in + 14517 + 14519 + IN + O + + + human + human + 14520 + 14525 + JJ + O + + + history + history + 14526 + 14533 + NN + O + + + . + . + 14533 + 14534 + . + O + + + (ROOT (UCP (S (NP (EX There)) (VP (VBG being) (NP (NP (NP (DT no) (NN objection)) (, ,) (NP (DT the))) (X (SYM re))))) (: -) (NP (NP (NP (NNS marks) (JJ werc)) (VP (VBN ordered) (S (VP (TO to) (VP (VB be) (VP (VBN printed) (PP (IN in) (NP (DT the) (NNP RECORD))))))) (, ,) (SBAR (IN as) (S (VP (VBZ follows) (: :) (S (NP (DT AN) (JJ ENVIRONMENTAL) (NN DIVIDEND)) (: :) (VP (VBG CAPITALIZING)))))))) (PP (IN ON) (NP (NP (NNP NEw) (NNP OPPORTUNITIES)) (PP (IN FOR) (NP (NP (NP (NNP INTERNATIONAL) (NNP AcTION)) (PRN (-LRB- -LRB-) (NP (NP (NP (NNS Remarks)) (PP (IN by) (NP (NNP Senator) (NNP Claiborne) (NNP Pell)))) (, ,) (NP (NP (NNP Global) (NNP Forum)) (PP (IN on) (NP (NNP Environment) (CC and) (NNP Development)))) (, ,) (NP (NP (NNP Moscow)) (, ,) (NP (NNP U.S.S.R.) (, ,) (NNP January) (CD 17) (, ,) (CD 1990)))) (-RRB- -RRB-))) (SBAR (S (NP (PRP We)) (VP (VBP are) (VP (VBN gathered) (ADVP (RB here)) (PP (IN at) (NP (NP (DT an) (JJ extraordinary) (NN time)) (PP (IN in) (NP (JJ human) (NN history)))))))))))))) (. 
.))) + + + ROOT + being + + + being + There + + + objection + no + + + being + objection + + + objection + the + + + objection + re + + + being + marks + + + marks + werc + + + marks + ordered + + + printed + to + + + printed + be + + + ordered + printed + + + printed + in + + + RECORD + the + + + in + RECORD + + + follows + as + + + ordered + follows + + + DIVIDEND + AN + + + DIVIDEND + ENVIRONMENTAL + + + CAPITALIZING + DIVIDEND + + + follows + CAPITALIZING + + + marks + ON + + + OPPORTUNITIES + NEw + + + ON + OPPORTUNITIES + + + OPPORTUNITIES + FOR + + + AcTION + INTERNATIONAL + + + FOR + AcTION + + + AcTION + Remarks + + + Remarks + by + + + Pell + Senator + + + Pell + Claiborne + + + by + Pell + + + Forum + Global + + + Remarks + Forum + + + Forum + on + + + on + Environment + + + Environment + and + + + Environment + Development + + + Remarks + Moscow + + + January + U.S.S.R. + + + Moscow + January + + + January + 17 + + + January + 1990 + + + gathered + We + + + gathered + are + + + AcTION + gathered + + + gathered + here + + + gathered + at + + + time + an + + + time + extraordinary + + + at + time + + + time + in + + + history + human + + + in + history + + + + + ROOT + being + + + being + There + + + objection + no + + + being + objection + + + objection + the + + + objection + re + + + being + marks + + + marks + werc + + + marks + ordered + + + printed + to + + + printed + be + + + ordered + printed + + + RECORD + the + + + printed + RECORD + + + follows + as + + + ordered + follows + + + DIVIDEND + AN + + + DIVIDEND + ENVIRONMENTAL + + + CAPITALIZING + DIVIDEND + + + follows + CAPITALIZING + + + OPPORTUNITIES + NEw + + + marks + OPPORTUNITIES + + + AcTION + INTERNATIONAL + + + OPPORTUNITIES + AcTION + + + AcTION + Remarks + + + Pell + Senator + + + Pell + Claiborne + + + Remarks + Pell + + + Forum + Global + + + Remarks + Forum + + + Forum + Environment + + + Environment + Development + + + Remarks + Moscow + + + January + U.S.S.R. 
+ + + Moscow + January + + + January + 17 + + + January + 1990 + + + gathered + We + + + gathered + are + + + AcTION + gathered + + + gathered + here + + + time + an + + + time + extraordinary + + + gathered + time + + + history + human + + + time + history + + + + + ROOT + being + + + being + There + + + objection + no + + + being + objection + + + objection + the + + + objection + re + + + being + marks + + + marks + werc + + + marks + ordered + + + printed + to + + + printed + be + + + ordered + printed + + + RECORD + the + + + printed + RECORD + + + follows + as + + + ordered + follows + + + DIVIDEND + AN + + + DIVIDEND + ENVIRONMENTAL + + + CAPITALIZING + DIVIDEND + + + follows + CAPITALIZING + + + OPPORTUNITIES + NEw + + + marks + OPPORTUNITIES + + + AcTION + INTERNATIONAL + + + OPPORTUNITIES + AcTION + + + AcTION + Remarks + + + Pell + Senator + + + Pell + Claiborne + + + Remarks + Pell + + + Forum + Global + + + Remarks + Forum + + + Forum + Environment + + + Forum + Development + + + Environment + Development + + + Remarks + Moscow + + + January + U.S.S.R. + + + Moscow + January + + + January + 17 + + + January + 1990 + + + gathered + We + + + gathered + are + + + AcTION + gathered + + + gathered + here + + + time + an + + + time + extraordinary + + + gathered + time + + + history + human + + + time + history + + + + + + + In + in + 14535 + 14537 + IN + O + + + a + a + 14538 + 14539 + DT + O + + + matter + matter + 14540 + 14546 + NN + O + + + of + of + 14547 + 14549 + IN + O + + + months + month + 14551 + 14557 + NNS + DURATION + PXM + + + a + a + 14558 + 14559 + DT + O + + + series + series + 14560 + 14566 + NN + O + + + of + of + 14567 + 14569 + IN + O + + + popular + popular + 14570 + 14577 + JJ + O + + + movements + movement + 14578 + 14587 + NNS + O + + + have + have + 14588 + 14592 + VBP + O + + + transformed + transform + 14594 + 14605 + VBN + O + + + Europe + Europe + 14606 + 14612 + NNP + LOCATION + + + . + . + 14612 + 14613 + . 
+ O + + + (ROOT (S (PP (IN In) (NP (NP (DT a) (NN matter)) (PP (IN of) (NP (NNS months))))) (NP (NP (DT a) (NN series)) (PP (IN of) (NP (JJ popular) (NNS movements)))) (VP (VBP have) (VP (VBN transformed) (NP (NNP Europe)))) (. .))) + + + ROOT + transformed + + + transformed + In + + + matter + a + + + In + matter + + + matter + of + + + of + months + + + series + a + + + transformed + series + + + series + of + + + movements + popular + + + of + movements + + + transformed + have + + + transformed + Europe + + + + + ROOT + transformed + + + matter + a + + + transformed + matter + + + matter + months + + + series + a + + + transformed + series + + + movements + popular + + + series + movements + + + transformed + have + + + transformed + Europe + + + + + ROOT + transformed + + + matter + a + + + transformed + matter + + + matter + months + + + series + a + + + transformed + series + + + movements + popular + + + series + movements + + + transformed + have + + + transformed + Europe + + + + + + + The + the + 14614 + 14617 + DT + O + + + Iron + Iron + 14618 + 14622 + NNP + MISC + + + Curtain + Curtain + 14623 + 14630 + NNP + MISC + + + has + have + 14631 + 14634 + VBZ + O + + + ceased + cease + 14636 + 14642 + VBN + O + + + to + to + 14643 + 14645 + TO + O + + + be + be + 14646 + 14648 + VB + O + + + a + a + 14649 + 14650 + DT + O + + + barrier + barrier + 14651 + 14658 + NN + O + + + between + between + 14659 + 14666 + IN + O + + + East + East + 14667 + 14671 + NNP + O + + + and + and + 14672 + 14675 + CC + O + + + West + West + 14677 + 14681 + NNP + O + + + . + . + 14681 + 14682 + . + O + + + (ROOT (S (NP (DT The) (NNP Iron) (NNP Curtain)) (VP (VBZ has) (VP (VBN ceased) (S (VP (TO to) (VP (VB be) (NP (NP (DT a) (NN barrier)) (PP (IN between) (NP (NNP East) (CC and) (NNP West))))))))) (. 
.))) + + + ROOT + ceased + + + Curtain + The + + + Curtain + Iron + + + ceased + Curtain + + + ceased + has + + + barrier + to + + + barrier + be + + + barrier + a + + + ceased + barrier + + + barrier + between + + + between + East + + + East + and + + + East + West + + + + + ROOT + ceased + + + Curtain + The + + + Curtain + Iron + + + ceased + Curtain + + + ceased + has + + + barrier + to + + + barrier + be + + + barrier + a + + + ceased + barrier + + + barrier + East + + + East + West + + + + + ROOT + ceased + + + Curtain + The + + + Curtain + Iron + + + ceased + Curtain + + + ceased + has + + + barrier + to + + + barrier + be + + + barrier + a + + + ceased + barrier + + + barrier + East + + + barrier + West + + + East + West + + + + + + + A + a + 14683 + 14684 + DT + O + + + democratically + democratically + 14685 + 14699 + RB + O + + + elected + elect + 14700 + 14707 + VBN + O + + + government + government + 14708 + 14718 + NN + O + + + has + have + 14720 + 14723 + VBZ + O + + + taken + take + 14724 + 14729 + VBN + O + + + power + power + 14730 + 14735 + NN + O + + + in + in + 14736 + 14738 + IN + O + + + Poland + Poland + 14739 + 14745 + NNP + LOCATION + + + , + , + 14745 + 14746 + , + O + + + and + and + 14747 + 14750 + CC + O + + + in + in + 14751 + 14753 + IN + O + + + the + the + 14754 + 14757 + DT + DURATION + PXM + + + next + next + 14758 + 14762 + JJ + DURATION + PXM + + + few + few + 14764 + 14767 + JJ + DURATION + PXM + + + months + month + 14768 + 14774 + NNS + DURATION + PXM + + + free + free + 14775 + 14779 + JJ + O + + + elcdtions + elcdtion + 14780 + 14789 + NNS + O + + + will + will + 14790 + 14794 + MD + O + + + be + be + 14795 + 14797 + VB + O + + + held + hold + 14798 + 14802 + VBN + O + + + in + in + 14803 + 14805 + IN + O + + + East + East + 14807 + 14811 + NNP + LOCATION + + + Germany + Germany + 14812 + 14819 + NNP + LOCATION + + + , + , + 14819 + 14820 + , + O + + + Czechoslovakia + Czechoslovakia + 14821 + 14835 + NNP + LOCATION + + + , 
+ , + 14835 + 14836 + , + O + + + Bulgaria + Bulgaria + 14837 + 14845 + NNP + LOCATION + + + , + , + 14845 + 14846 + , + O + + + Hungary + Hungary + 14848 + 14855 + NNP + LOCATION + + + , + , + 14855 + 14856 + , + O + + + and + and + 14857 + 14860 + CC + O + + + Romania + Romania + 14861 + 14868 + NNP + LOCATION + + + . + . + 14868 + 14869 + . + O + + + (ROOT (S (S (NP (DT A) (ADJP (RB democratically) (VBN elected)) (NN government)) (VP (VBZ has) (VP (VBN taken) (NP (NP (NN power)) (PP (IN in) (NP (NNP Poland))))))) (, ,) (CC and) (S (PP (IN in) (NP (DT the) (JJ next) (JJ few) (NNS months))) (NP (JJ free) (NNS elcdtions)) (VP (MD will) (VP (VB be) (VP (VBN held) (PP (IN in) (NP (NP (NNP East) (NNP Germany) (, ,) (NNP Czechoslovakia) (, ,) (NNP Bulgaria) (, ,) (NNP Hungary) (, ,)) (CC and) (NP (NNP Romania)))))))) (. .))) + + + ROOT + taken + + + government + A + + + elected + democratically + + + government + elected + + + taken + government + + + taken + has + + + taken + power + + + power + in + + + in + Poland + + + taken + and + + + held + in + + + months + the + + + months + next + + + months + few + + + in + months + + + elcdtions + free + + + held + elcdtions + + + held + will + + + held + be + + + taken + held + + + held + in + + + Hungary + East + + + Hungary + Germany + + + Hungary + Czechoslovakia + + + Hungary + Bulgaria + + + in + Hungary + + + Hungary + and + + + Hungary + Romania + + + + + ROOT + taken + + + government + A + + + elected + democratically + + + government + elected + + + taken + government + + + taken + has + + + taken + power + + + power + Poland + + + months + the + + + months + next + + + months + few + + + held + months + + + elcdtions + free + + + held + elcdtions + + + held + will + + + held + be + + + taken + held + + + Hungary + East + + + Hungary + Germany + + + Hungary + Czechoslovakia + + + Hungary + Bulgaria + + + held + Hungary + + + Hungary + Romania + + + + + ROOT + taken + + + government + A + + + elected + 
democratically + + + government + elected + + + taken + government + + + taken + has + + + taken + power + + + power + Poland + + + months + the + + + months + next + + + months + few + + + held + months + + + elcdtions + free + + + held + elcdtions + + + held + will + + + held + be + + + taken + held + + + Hungary + East + + + Hungary + Germany + + + Hungary + Czechoslovakia + + + Hungary + Bulgaria + + + held + Hungary + + + held + Romania + + + Hungary + Romania + + + + + + + In + in + 14870 + 14872 + IN + O + + + addition + addition + 14873 + 14881 + NN + O + + + , + , + 14881 + 14882 + , + O + + + the + the + 14883 + 14886 + DT + O + + + Soviet + Soviet + 14888 + 14894 + NNP + LOCATION + + + Union + Union + 14895 + 14900 + NNP + LOCATION + + + is + be + 14901 + 14903 + VBZ + O + + + well + well + 14904 + 14908 + RB + O + + + along + along + 14909 + 14914 + IN + O + + + on + on + 14915 + 14917 + IN + O + + + a + a + 14918 + 14919 + DT + O + + + path + path + 14920 + 14924 + NN + O + + + to + to + 14925 + 14927 + TO + O + + + free + free + 14928 + 14932 + JJ + O + + + - + - + 14932 + 14933 + : + O + + + dom + dom + 14935 + 14938 + NN + O + + + , + , + 14938 + 14939 + , + O + + + openness + openness + 14940 + 14948 + NN + O + + + and + and + 14949 + 14952 + CC + O + + + democratic + democratic + 14953 + 14963 + JJ + O + + + renewal + renewal + 14964 + 14971 + NN + O + + + . + . + 14971 + 14972 + . + O + + + (ROOT (S (PP (IN In) (NP (NN addition))) (, ,) (NP (DT the) (NNP Soviet) (NNP Union)) (VP (VBZ is) (ADVP (RB well) (IN along)) (PP (IN on) (NP (NP (NP (DT a) (NN path)) (PP (TO to) (NP (JJ free)))) (: -) (NP (NP (NN dom)) (, ,) (NP (NN openness)) (CC and) (NP (JJ democratic) (NN renewal)))))) (. 
.))) + + + ROOT + is + + + is + In + + + In + addition + + + Union + the + + + Union + Soviet + + + is + Union + + + along + well + + + is + along + + + is + on + + + path + a + + + on + path + + + path + to + + + to + free + + + path + dom + + + dom + openness + + + dom + and + + + renewal + democratic + + + dom + renewal + + + + + ROOT + is + + + is + addition + + + Union + the + + + Union + Soviet + + + is + Union + + + along + well + + + is + along + + + path + a + + + is + path + + + path + free + + + path + dom + + + dom + openness + + + renewal + democratic + + + dom + renewal + + + + + ROOT + is + + + is + addition + + + Union + the + + + Union + Soviet + + + is + Union + + + along + well + + + is + along + + + path + a + + + is + path + + + path + free + + + path + dom + + + path + openness + + + dom + openness + + + renewal + democratic + + + path + renewal + + + dom + renewal + + + + + + + With + with + 14976 + 14980 + IN + O + + + the + the + 14981 + 14984 + DT + O + + + changes + change + 14985 + 14992 + NNS + O + + + in + in + 14993 + 14995 + IN + O + + + Eastern + Eastern + 14996 + 15003 + NNP + LOCATION + + + Europe + Europe + 15004 + 15010 + NNP + LOCATION + + + and + and + 15011 + 15014 + CC + O + + + the + the + 15016 + 15019 + DT + O + + + Soviet + soviet + 15020 + 15026 + JJ + MISC + + + U + U + 15027 + 15028 + NNP + O + + + nion + nion + 15029 + 15033 + NNP + O + + + , + , + 15033 + 15034 + , + O + + + East + East + 15035 + 15039 + NNP + O + + + and + and + 15040 + 15043 + CC + O + + + West + West + 15044 + 15048 + NNP + O + + + will + will + 15049 + 15053 + MD + O + + + begin + begin + 15054 + 15059 + VB + O + + + to + to + 15061 + 15063 + TO + O + + + share + share + 15064 + 15069 + VB + O + + + common + common + 15070 + 15076 + JJ + O + + + values + value + 15077 + 15083 + NNS + O + + + of + of + 15084 + 15086 + IN + O + + + a + a + 15087 + 15088 + DT + O + + + belief + belief + 15089 + 15095 + NN + O + + + in + in + 15096 + 15098 + IN + O 
+ + + indi + indus + 15099 + 15103 + NN + O + + + - + - + 15103 + 15104 + : + O + + + vidual + vidual + 15106 + 15112 + JJ + O + + + rights + rights + 15113 + 15119 + NNS + O + + + and + and + 15120 + 15123 + CC + O + + + democratic + democratic + 15124 + 15134 + JJ + O + + + institutions + institution + 15135 + 15147 + NNS + O + + + . + . + 15147 + 15148 + . + O + + + (ROOT (S (PP (IN With) (NP (NP (DT the) (NNS changes)) (PP (IN in) (NP (NP (NNP Eastern) (NNP Europe)) (CC and) (NP (DT the) (JJ Soviet) (NNP U) (NNP nion)))))) (, ,) (NP (NNP East) (CC and) (NNP West)) (VP (MD will) (VP (VB begin) (S (VP (TO to) (VP (VB share) (NP (NP (NP (JJ common) (NNS values)) (PP (IN of) (NP (NP (DT a) (NN belief)) (PP (IN in) (NP (NN indi)))))) (: -) (NP (NP (JJ vidual) (NNS rights)) (CC and) (NP (JJ democratic) (NNS institutions))))))))) (. .))) + + + ROOT + begin + + + begin + With + + + changes + the + + + With + changes + + + changes + in + + + Europe + Eastern + + + in + Europe + + + Europe + and + + + nion + the + + + nion + Soviet + + + nion + U + + + Europe + nion + + + begin + East + + + East + and + + + East + West + + + begin + will + + + share + to + + + begin + share + + + values + common + + + share + values + + + values + of + + + belief + a + + + of + belief + + + belief + in + + + in + indi + + + rights + vidual + + + values + rights + + + rights + and + + + institutions + democratic + + + rights + institutions + + + + + ROOT + begin + + + changes + the + + + begin + changes + + + Europe + Eastern + + + changes + Europe + + + nion + the + + + nion + Soviet + + + nion + U + + + Europe + nion + + + begin + East + + + East + West + + + begin + will + + + share + to + + + begin + share + + + values + common + + + share + values + + + belief + a + + + values + belief + + + belief + indi + + + rights + vidual + + + values + rights + + + institutions + democratic + + + rights + institutions + + + + + ROOT + begin + + + changes + the + + + begin + changes + + + Europe 
+ Eastern + + + changes + Europe + + + nion + the + + + nion + Soviet + + + nion + U + + + changes + nion + + + Europe + nion + + + begin + East + + + East + West + + + begin + West + + + begin + will + + + share + to + + + begin + share + + + values + common + + + share + values + + + belief + a + + + values + belief + + + belief + indi + + + rights + vidual + + + values + rights + + + institutions + democratic + + + values + institutions + + + rights + institutions + + + + + + + The + the + 15150 + 15153 + DT + O + + + wave + wave + 15154 + 15158 + NN + O + + + of + of + 15159 + 15161 + IN + O + + + democracy + democracy + 15162 + 15171 + NN + O + + + is + be + 15172 + 15174 + VBZ + O + + + also + also + 15175 + 15179 + RB + O + + + spreading + spread + 15180 + 15189 + VBG + O + + + to + to + 15190 + 15192 + TO + O + + + the + the + 15194 + 15197 + DT + O + + + developing + develop + 15198 + 15208 + VBG + O + + + world + world + 15209 + 15214 + NN + O + + + . + . + 15214 + 15215 + . + O + + + (ROOT (S (NP (NP (DT The) (NN wave)) (PP (IN of) (NP (NN democracy)))) (VP (VBZ is) (ADVP (RB also)) (VP (VBG spreading) (PP (TO to) (NP (DT the) (VBG developing) (NN world))))) (. 
.))) + + + ROOT + spreading + + + wave + The + + + spreading + wave + + + wave + of + + + of + democracy + + + spreading + is + + + spreading + also + + + spreading + to + + + world + the + + + world + developing + + + to + world + + + + + ROOT + spreading + + + wave + The + + + spreading + wave + + + wave + democracy + + + spreading + is + + + spreading + also + + + world + the + + + world + developing + + + spreading + world + + + + + ROOT + spreading + + + wave + The + + + spreading + wave + + + wave + democracy + + + spreading + is + + + spreading + also + + + world + the + + + world + developing + + + spreading + world + + + + + + + With + with + 15216 + 15220 + IN + O + + + the + the + 15221 + 15224 + DT + O + + + recent + recent + 15225 + 15231 + JJ + O + + + elec + elec + 15232 + 15236 + NN + O + + + - + - + 15236 + 15237 + : + O + + + tions + tion + 15239 + 15244 + NNS + O + + + in + in + 15245 + 15247 + IN + O + + + Chile + Chile + 15248 + 15253 + NNP + LOCATION + + + , + , + 15253 + 15254 + , + O + + + every + every + 15255 + 15260 + DT + O + + + government + government + 15261 + 15271 + NN + O + + + in + in + 15272 + 15274 + IN + O + + + South + South + 15275 + 15280 + NNP + LOCATION + + + America + America + 15282 + 15289 + NNP + LOCATION + + + will + will + 15290 + 15294 + MD + O + + + be + be + 15295 + 15297 + VB + O + + + a + a + 15298 + 15299 + DT + O + + + democracy + democracy + 15300 + 15309 + NN + O + + + . + . + 15309 + 15310 + . + O + + + (ROOT (S (PP (IN With) (NP (NP (DT the) (JJ recent) (NN elec)) (: -) (NP (NP (NNS tions)) (PP (IN in) (NP (NNP Chile)))))) (, ,) (NP (NP (DT every) (NN government)) (PP (IN in) (NP (NNP South) (NNP America)))) (VP (MD will) (VP (VB be) (NP (DT a) (NN democracy)))) (. 
.))) + + + ROOT + democracy + + + democracy + With + + + elec + the + + + elec + recent + + + With + elec + + + elec + tions + + + tions + in + + + in + Chile + + + government + every + + + democracy + government + + + government + in + + + America + South + + + in + America + + + democracy + will + + + democracy + be + + + democracy + a + + + + + ROOT + democracy + + + elec + the + + + elec + recent + + + democracy + elec + + + elec + tions + + + tions + Chile + + + government + every + + + democracy + government + + + America + South + + + government + America + + + democracy + will + + + democracy + be + + + democracy + a + + + + + ROOT + democracy + + + elec + the + + + elec + recent + + + democracy + elec + + + elec + tions + + + tions + Chile + + + government + every + + + democracy + government + + + America + South + + + government + America + + + democracy + will + + + democracy + be + + + democracy + a + + + + + + + Elsewhere + elsewhere + 15311 + 15320 + RB + O + + + In + in + 15321 + 15323 + IN + O + + + the + the + 15325 + 15328 + DT + O + + + just + just + 15329 + 15333 + RB + O + + + concluded + conclude + 15334 + 15343 + VBN + O + + + decade + decade + 15344 + 15350 + NN + DURATION + P10Y + + + , + , + 15350 + 15351 + , + O + + + dictatorships + dictatorship + 15352 + 15365 + NNS + O + + + in + in + 15366 + 15368 + IN + O + + + the + the + 15370 + 15373 + DT + O + + + Philippines + Philippines + 15374 + 15385 + NNPS + LOCATION + + + and + and + 15386 + 15389 + CC + O + + + Pakistan + Pakistan + 15390 + 15398 + NNP + LOCATION + + + have + have + 15399 + 15403 + VBP + O + + + disap + disap + 15404 + 15409 + NN + O + + + - + - + 15409 + 15410 + : + O + + + peared + pear + 15412 + 15418 + VBN + O + + + and + and + 15419 + 15422 + CC + O + + + India + India + 15423 + 15428 + NNP + LOCATION + + + 's + 's + 15428 + 15430 + POS + O + + + recent + recent + 15431 + 15437 + JJ + O + + + elections + election + 15438 + 15447 + NNS + O + + + , + , + 15447 + 15448 
+ , + O + + + the + the + 15449 + 15452 + DT + O + + + larg + larg + 15453 + 15457 + NN + O + + + - + - + 15457 + 15458 + : + O + + + est + est + 15460 + 15463 + NN + O + + + exercise + exercise + 15464 + 15472 + NN + O + + + of + of + 15473 + 15475 + IN + O + + + popular + popular + 15476 + 15483 + JJ + O + + + choice + choice + 15484 + 15490 + NN + O + + + in + in + 15491 + 15493 + IN + O + + + human + human + 15494 + 15499 + JJ + O + + + his + he + 15500 + 15503 + PRP$ + O + + + - + - + 15503 + 15504 + : + O + + + tory + tory + 15506 + 15510 + JJ + O + + + , + , + 15510 + 15511 + , + O + + + reminds + remind + 15512 + 15519 + VBZ + O + + + us + we + 15520 + 15522 + PRP + O + + + of + of + 15523 + 15525 + IN + O + + + the + the + 15526 + 15529 + DT + O + + + appeal + appeal + 15530 + 15536 + NN + O + + + of + of + 15537 + 15539 + IN + O + + + democracy + democracy + 15540 + 15549 + NN + O + + + to + to + 15551 + 15553 + TO + O + + + even + even + 15554 + 15558 + RB + O + + + the + the + 15559 + 15562 + DT + O + + + world + world + 15563 + 15568 + NN + O + + + 's + 's + 15568 + 15570 + POS + O + + + poorest + poorest + 15571 + 15578 + JJS + O + + + people + people + 15579 + 15585 + NNS + O + + + . + . + 15585 + 15586 + . 
+ O + + + (ROOT (S (PP (ADVP (RB Elsewhere)) (IN In) (NP (DT the) (RB just) (VBN concluded) (NN decade))) (, ,) (NP (NP (NNS dictatorships)) (PP (IN in) (NP (NP (DT the) (NNPS Philippines)) (CC and) (NP (NNP Pakistan))))) (VP (VBP have) (NP (NP (NN disap)) (: -) (S (S (VP (VBN peared))) (CC and) (S (NP (NP (NP (NNP India) (POS 's)) (JJ recent) (NNS elections)) (, ,) (NP (NP (DT the) (NN larg)) (PRN (: -) (S (NP (NP (NN est) (NN exercise)) (PP (IN of) (NP (NP (JJ popular) (NN choice)) (PP (IN in) (NP (JJ human)))))) (NP (PRP$ his))) (: -))) (ADJP (JJ tory)) (, ,)) (VP (VBZ reminds) (NP (PRP us)) (PP (PP (IN of) (NP (NP (DT the) (NN appeal)) (PP (IN of) (NP (NN democracy))))) (PP (TO to) (RB even)))))))) (NP (NP (DT the) (NN world) (POS 's)) (JJS poorest) (NNS people)) (. .))) + + + ROOT + have + + + In + Elsewhere + + + have + In + + + decade + the + + + decade + just + + + decade + concluded + + + In + decade + + + have + dictatorships + + + dictatorships + in + + + Philippines + the + + + in + Philippines + + + Philippines + and + + + Philippines + Pakistan + + + have + disap + + + disap + peared + + + peared + and + + + elections + India + + + India + 's + + + elections + recent + + + reminds + elections + + + larg + the + + + elections + larg + + + exercise + est + + + his + exercise + + + exercise + of + + + choice + popular + + + of + choice + + + choice + in + + + in + human + + + larg + his + + + elections + tory + + + peared + reminds + + + reminds + us + + + reminds + of + + + appeal + the + + + of + appeal + + + appeal + of + + + of + democracy + + + of + to + + + to + even + + + world + the + + + people + world + + + world + 's + + + people + poorest + + + have + people + + + + + ROOT + have + + + have + Elsewhere + + + decade + the + + + decade + just + + + decade + concluded + + + have + decade + + + have + dictatorships + + + Philippines + the + + + dictatorships + Philippines + + + Philippines + Pakistan + + + have + disap + + + disap + peared + + + 
elections + India + + + elections + recent + + + reminds + elections + + + larg + the + + + elections + larg + + + exercise + est + + + his + exercise + + + choice + popular + + + exercise + choice + + + choice + human + + + larg + his + + + elections + tory + + + peared + reminds + + + reminds + us + + + appeal + the + + + reminds + appeal + + + appeal + democracy + + + reminds + to + + + to + even + + + world + the + + + people + world + + + people + poorest + + + have + people + + + + + ROOT + have + + + have + Elsewhere + + + decade + the + + + decade + just + + + decade + concluded + + + have + decade + + + have + dictatorships + + + Philippines + the + + + dictatorships + Philippines + + + dictatorships + Pakistan + + + Philippines + Pakistan + + + have + disap + + + disap + peared + + + elections + India + + + elections + recent + + + reminds + elections + + + larg + the + + + elections + larg + + + exercise + est + + + his + exercise + + + choice + popular + + + exercise + choice + + + choice + human + + + larg + his + + + elections + tory + + + disap + reminds + + + peared + reminds + + + reminds + us + + + appeal + the + + + reminds + appeal + + + appeal + democracy + + + reminds + to + + + to + even + + + world + the + + + people + world + + + people + poorest + + + have + people + + + + + + + Of + of + 15587 + 15589 + IN + O + + + course + course + 15591 + 15597 + NN + O + + + , + , + 15597 + 15598 + , + O + + + there + there + 15599 + 15604 + EX + O + + + are + be + 15605 + 15608 + VBP + O + + + setbacks + setback + 15609 + 15617 + NNS + O + + + , + , + 15617 + 15618 + , + O + + + as + as + 15619 + 15621 + IN + O + + + last + last + 15622 + 15626 + JJ + DATE + XXXX-06 OFFSET P-1Y + + + + June + June + 15627 + 15631 + NNP + DATE + XXXX-06 OFFSET P-1Y + + + + 's + 's + 15631 + 15633 + POS + O + + + events + event + 15635 + 15641 + NNS + O + + + in + in + 15642 + 15644 + IN + O + + + Tiananmen + Tiananmen + 15645 + 15654 + NNP + O + + + Square + Square + 
15655 + 15661 + NNP + O + + + remind + remind + 15662 + 15668 + VB + O + + + us + we + 15669 + 15671 + PRP + O + + + , + , + 15671 + 15672 + , + O + + + and + and + 15673 + 15676 + CC + O + + + democracy + democracy + 15678 + 15687 + NN + O + + + can + can + 15688 + 15691 + MD + O + + + be + be + 15692 + 15694 + VB + O + + + fragile + fragile + 15695 + 15702 + JJ + O + + + as + as + 15703 + 15705 + IN + O + + + witnessed + witness + 15706 + 15715 + VBN + O + + + by + by + 15716 + 15718 + IN + O + + + recent + recent + 15720 + 15726 + JJ + O + + + events + event + 15727 + 15733 + NNS + O + + + in + in + 15734 + 15736 + IN + O + + + Manila + Manila + 15737 + 15743 + NNP + LOCATION + + + . + . + 15743 + 15744 + . + O + + + (ROOT (S (PP (IN Of) (NP (NN course))) (, ,) (S (NP (EX there)) (VP (VBP are) (NP (NNS setbacks)) (, ,) (SBAR (IN as) (S (NP (NP (NP (JJ last) (NNP June) (POS 's)) (NNS events)) (PP (IN in) (NP (NNP Tiananmen) (NNP Square)))) (VP (VB remind) (NP (PRP us))))))) (, ,) (CC and) (S (NP (NN democracy)) (VP (MD can) (VP (VB be) (ADJP (JJ fragile)) (SBAR (IN as) (S (VP (VBN witnessed) (PP (IN by) (NP (NP (JJ recent) (NNS events)) (PP (IN in) (NP (NNP Manila))))))))))) (. 
.))) + + + ROOT + are + + + are + Of + + + Of + course + + + are + there + + + are + setbacks + + + remind + as + + + June + last + + + events + June + + + June + 's + + + remind + events + + + events + in + + + Square + Tiananmen + + + in + Square + + + are + remind + + + remind + us + + + are + and + + + fragile + democracy + + + fragile + can + + + fragile + be + + + are + fragile + + + witnessed + as + + + fragile + witnessed + + + witnessed + by + + + events + recent + + + by + events + + + events + in + + + in + Manila + + + + + ROOT + are + + + are + course + + + are + there + + + are + setbacks + + + remind + as + + + June + last + + + events + June + + + remind + events + + + Square + Tiananmen + + + events + Square + + + are + remind + + + remind + us + + + fragile + democracy + + + fragile + can + + + fragile + be + + + are + fragile + + + witnessed + as + + + fragile + witnessed + + + events + recent + + + witnessed + events + + + events + Manila + + + + + ROOT + are + + + are + course + + + are + there + + + are + setbacks + + + remind + as + + + June + last + + + events + June + + + remind + events + + + Square + Tiananmen + + + events + Square + + + are + remind + + + remind + us + + + fragile + democracy + + + fragile + can + + + fragile + be + + + are + fragile + + + witnessed + as + + + fragile + witnessed + + + events + recent + + + witnessed + events + + + events + Manila + + + + + + + Of + of + 15745 + 15747 + IN + O + + + the + the + 15748 + 15751 + DT + O + + + overall + overall + 15752 + 15759 + JJ + O + + + trend + trend + 15763 + 15768 + NN + O + + + , + , + 15768 + 15769 + , + O + + + however + however + 15770 + 15777 + RB + O + + + , + , + 15777 + 15778 + , + O + + + we + we + 15779 + 15781 + PRP + O + + + can + can + 15782 + 15785 + MD + O + + + be + be + 15786 + 15788 + VB + O + + + optimistic + optimistic + 15789 + 15799 + JJ + O + + + : + : + 15799 + 15800 + : + O + + + democ + democ + 15801 + 15806 + NN + O + + + - + - + 15806 + 
15807 + : + O + + + racy + racy + 15809 + 15813 + JJ + O + + + Is + be + 15814 + 15816 + VBZ + O + + + indeed + indeed + 15817 + 15823 + RB + O + + + on + on + 15824 + 15826 + IN + O + + + the + the + 15827 + 15830 + DT + O + + + march + march + 15831 + 15836 + NN + DATE + XXXX-03 + XXXX-03 + + + . + . + 15836 + 15837 + . + O + + + (ROOT (S (PP (IN Of) (NP (DT the) (JJ overall) (NN trend))) (, ,) (ADVP (RB however)) (, ,) (NP (PRP we)) (VP (MD can) (VP (VB be) (ADJP (JJ optimistic)) (: :) (NP (NP (NN democ)) (: -) (S (NP (JJ racy)) (VP (VBZ Is) (ADVP (RB indeed)) (PP (IN on) (NP (DT the) (NN march)))))))) (. .))) + + + ROOT + optimistic + + + optimistic + Of + + + trend + the + + + trend + overall + + + Of + trend + + + optimistic + however + + + optimistic + we + + + optimistic + can + + + optimistic + be + + + optimistic + democ + + + Is + racy + + + democ + Is + + + Is + indeed + + + Is + on + + + march + the + + + on + march + + + + + ROOT + optimistic + + + trend + the + + + trend + overall + + + optimistic + trend + + + optimistic + however + + + optimistic + we + + + optimistic + can + + + optimistic + be + + + optimistic + democ + + + Is + racy + + + democ + Is + + + Is + indeed + + + march + the + + + Is + march + + + + + ROOT + optimistic + + + trend + the + + + trend + overall + + + optimistic + trend + + + optimistic + however + + + optimistic + we + + + optimistic + can + + + optimistic + be + + + optimistic + democ + + + Is + racy + + + democ + Is + + + Is + indeed + + + march + the + + + Is + march + + + + + + + It + it + 15841 + 15843 + PRP + O + + + is + be + 15844 + 15846 + VBZ + O + + + an + a + 15847 + 15849 + DT + O + + + interesting + interesting + 15850 + 15861 + JJ + O + + + fact + fact + 15862 + 15866 + NN + O + + + that + that + 15867 + 15871 + IN + O + + + modern + modern + 15872 + 15878 + JJ + O + + + histo + histo + 15879 + 15884 + NN + O + + + - + - + 15884 + 15885 + : + O + + + ry + ry + 15887 + 15889 + NN + O + + + has + have + 15890 
+ 15893 + VBZ + O + + + never + never + 15894 + 15899 + RB + O + + + known + know + 15900 + 15905 + VBN + O + + + a + a + 15906 + 15907 + DT + O + + + war + war + 15908 + 15911 + NN + O + + + between + between + 15912 + 15919 + IN + O + + + demo + demo + 15920 + 15924 + NN + O + + + - + - + 15924 + 15925 + : + O + + + cratic + cratic + 15927 + 15933 + JJ + O + + + states + state + 15934 + 15940 + NNS + O + + + . + . + 15940 + 15941 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (DT an) (JJ interesting) (NN fact)) (SBAR (IN that) (S (NP (NP (JJ modern) (NN histo)) (: -) (NP (NN ry))) (VP (VBZ has) (ADVP (RB never)) (VP (VBN known) (NP (NP (NP (DT a) (NN war)) (PP (IN between) (NP (NN demo)))) (: -) (NP (JJ cratic) (NNS states)))))))) (. .))) + + + ROOT + fact + + + fact + It + + + fact + is + + + fact + an + + + fact + interesting + + + known + that + + + histo + modern + + + known + histo + + + histo + ry + + + known + has + + + known + never + + + fact + known + + + war + a + + + known + war + + + war + between + + + between + demo + + + states + cratic + + + war + states + + + + + ROOT + fact + + + fact + It + + + fact + is + + + fact + an + + + fact + interesting + + + known + that + + + histo + modern + + + known + histo + + + histo + ry + + + known + has + + + known + never + + + fact + known + + + war + a + + + known + war + + + war + demo + + + states + cratic + + + war + states + + + + + ROOT + fact + + + fact + It + + + fact + is + + + fact + an + + + fact + interesting + + + known + that + + + histo + modern + + + known + histo + + + histo + ry + + + known + has + + + known + never + + + fact + known + + + war + a + + + known + war + + + war + demo + + + states + cratic + + + war + states + + + + + + + And + and + 15942 + 15945 + CC + O + + + , + , + 15945 + 15946 + , + O + + + Indeed + indeed + 15947 + 15953 + RB + O + + + , + , + 15953 + 15954 + , + O + + + the + the + 15955 + 15958 + DT + O + + + spread + spread + 15959 + 15965 + NN + O + + + of 
+ of + 15966 + 15968 + IN + O + + + de + de + 15969 + 15971 + FW + O + + + - + - + 15971 + 15972 + : + O + + + mocracy + mocracy + 15974 + 15981 + NN + O + + + and + and + 15982 + 15985 + CC + O + + + freedom + freedom + 15986 + 15993 + NN + O + + + across + across + 15994 + 16000 + IN + O + + + Europe + Europe + 16001 + 16007 + NNP + LOCATION + + + has + have + 16008 + 16011 + VBZ + O + + + re + re + 16012 + 16014 + SYM + O + + + - + - + 16014 + 16015 + : + O + + + sulted + sulted + 16017 + 16023 + JJ + O + + + In + in + 16024 + 16026 + IN + O + + + a + a + 16027 + 16028 + DT + O + + + dramatic + dramatic + 16029 + 16037 + JJ + O + + + reduction + reduction + 16038 + 16047 + NN + O + + + of + of + 16048 + 16050 + IN + O + + + tensions + tension + 16051 + 16059 + NNS + O + + + . + . + 16059 + 16060 + . + O + + + (ROOT (S (CC And) (, ,) (ADVP (RB Indeed)) (, ,) (NP (NP (DT the) (NN spread)) (PP (IN of) (NP (NP (FW de)) (: -) (NP (NP (NN mocracy) (CC and) (NN freedom)) (PP (IN across) (NP (NNP Europe))))))) (VP (VBZ has) (FRAG (PP (X (SYM re)) (: -) (PP (NP (JJ sulted)) (IN In) (NP (NP (DT a) (JJ dramatic) (NN reduction)) (PP (IN of) (NP (NNS tensions))))) (. 
.)))))) + + + ROOT + has + + + has + And + + + has + Indeed + + + spread + the + + + has + spread + + + spread + of + + + of + de + + + de + mocracy + + + mocracy + and + + + mocracy + freedom + + + mocracy + across + + + across + Europe + + + In + re + + + In + sulted + + + has + In + + + reduction + a + + + reduction + dramatic + + + In + reduction + + + reduction + of + + + of + tensions + + + + + ROOT + has + + + has + And + + + has + Indeed + + + spread + the + + + has + spread + + + spread + de + + + de + mocracy + + + mocracy + freedom + + + mocracy + Europe + + + In + re + + + In + sulted + + + has + In + + + reduction + a + + + reduction + dramatic + + + In + reduction + + + reduction + tensions + + + + + ROOT + has + + + has + And + + + has + Indeed + + + spread + the + + + has + spread + + + spread + de + + + de + mocracy + + + de + freedom + + + mocracy + freedom + + + mocracy + Europe + + + In + re + + + In + sulted + + + has + In + + + reduction + a + + + reduction + dramatic + + + In + reduction + + + reduction + tensions + + + + + + + In + in + 16062 + 16064 + IN + O + + + 1981 + 1981 + 16065 + 16069 + CD + DATE + 1981 + 1981 + + + , + , + 16069 + 16070 + , + O + + + the + the + 16071 + 16074 + DT + O + + + Bulletin + Bulletin + 16075 + 16083 + NNP + O + + + of + of + 16084 + 16086 + IN + O + + + Atomic + atomic + 16087 + 16093 + JJ + O + + + Scientists + scientist + 16094 + 16104 + NNS + O + + + advanced + advance + 16106 + 16114 + VBD + O + + + the + the + 16115 + 16118 + DT + O + + + clock + clock + 16119 + 16124 + NN + O + + + of + of + 16125 + 16127 + IN + O + + + global + global + 16128 + 16134 + JJ + O + + + survival + survival + 16135 + 16143 + NN + O + + + to + to + 16144 + 16146 + TO + O + + + three + three + 16148 + 16153 + CD + DURATION + PT3M + + + minutes + minute + 16154 + 16161 + NNS + NUMBER + 0.0 + PT3M + + + before + before + 16162 + 16168 + IN + O + + + the + the + 16169 + 16172 + DT + O + + + midnight + midnight + 16173 + 16181 
+ NN + TIME + T00:00 + T00:00 + + + of + of + 16182 + 16184 + IN + O + + + nucle + nucle + 16185 + 16190 + NN + O + + + - + - + 16190 + 16191 + : + O + + + ar + ar + 16193 + 16195 + NN + O + + + Armageddon + Armageddon + 16196 + 16206 + NNP + O + + + . + . + 16206 + 16207 + . + O + + + (ROOT (S (PP (IN In) (NP (CD 1981))) (, ,) (NP (NP (DT the) (NNP Bulletin)) (PP (IN of) (NP (JJ Atomic) (NNS Scientists)))) (VP (VBD advanced) (NP (NP (DT the) (NN clock)) (PP (IN of) (NP (JJ global) (NN survival)))) (PP (TO to) (NP (CD three) (NNS minutes))) (PP (IN before) (NP (NP (NP (DT the) (NN midnight)) (PP (IN of) (NP (NN nucle)))) (: -) (NP (NP (NN ar)) (NP (NNP Armageddon)))))) (. .))) + + + ROOT + advanced + + + advanced + In + + + In + 1981 + + + Bulletin + the + + + advanced + Bulletin + + + Bulletin + of + + + Scientists + Atomic + + + of + Scientists + + + clock + the + + + advanced + clock + + + clock + of + + + survival + global + + + of + survival + + + advanced + to + + + minutes + three + + + to + minutes + + + advanced + before + + + midnight + the + + + before + midnight + + + midnight + of + + + of + nucle + + + midnight + ar + + + ar + Armageddon + + + + + ROOT + advanced + + + advanced + 1981 + + + Bulletin + the + + + advanced + Bulletin + + + Scientists + Atomic + + + Bulletin + Scientists + + + clock + the + + + advanced + clock + + + survival + global + + + clock + survival + + + minutes + three + + + advanced + minutes + + + midnight + the + + + advanced + midnight + + + midnight + nucle + + + midnight + ar + + + ar + Armageddon + + + + + ROOT + advanced + + + advanced + 1981 + + + Bulletin + the + + + advanced + Bulletin + + + Scientists + Atomic + + + Bulletin + Scientists + + + clock + the + + + advanced + clock + + + survival + global + + + clock + survival + + + minutes + three + + + advanced + minutes + + + midnight + the + + + advanced + midnight + + + midnight + nucle + + + midnight + ar + + + ar + Armageddon + + + + + + + Today + today + 16208 + 
16213 + NN + DATE + THIS P1D + + + + with + with + 16214 + 16218 + IN + O + + + a + a + 16219 + 16220 + DT + O + + + treaty + treaty + 16221 + 16227 + NN + O + + + on + on + 16228 + 16230 + IN + O + + + In + in + 16231 + 16233 + IN + O + + + - + - + 16233 + 16234 + : + O + + + termediate + termediate + 16236 + 16246 + JJ + O + + + Nuclear + nuclear + 16247 + 16254 + JJ + O + + + Forces + force + 16255 + 16261 + NNS + O + + + In + in + 16262 + 16264 + IN + O + + + place + place + 16265 + 16270 + NN + O + + + and + and + 16271 + 16274 + CC + O + + + agreements + agreement + 16276 + 16286 + NNS + O + + + for + for + 16287 + 16290 + IN + O + + + drastic + drastic + 16291 + 16298 + JJ + O + + + reductions + reduction + 16299 + 16309 + NNS + O + + + in + in + 16310 + 16312 + IN + O + + + strate + strate + 16313 + 16319 + NN + O + + + - + - + 16319 + 16320 + : + O + + + gic + gic + 16322 + 16325 + JJ + O + + + and + and + 16326 + 16329 + CC + O + + + conventional + conventional + 16330 + 16342 + JJ + O + + + forces + force + 16343 + 16349 + NNS + O + + + in + in + 16350 + 16352 + IN + O + + + the + the + 16353 + 16356 + DT + O + + + offing + offing + 16357 + 16363 + NN + O + + + , + , + 16363 + 16364 + , + O + + + we + we + 16365 + 16367 + PRP + O + + + can + can + 16369 + 16372 + MD + O + + + see + see + 16373 + 16376 + VB + O + + + the + the + 16377 + 16380 + DT + O + + + hands + hand + 16381 + 16386 + NNS + O + + + of + of + 16387 + 16389 + IN + O + + + that + that + 16390 + 16394 + DT + O + + + clock + clock + 16395 + 16400 + NN + O + + + being + be + 16401 + 16406 + VBG + O + + + set + set + 16407 + 16410 + VBN + O + + + further + further + 16412 + 16419 + RB + O + + + back + back + 16420 + 16424 + RB + O + + + . + . + 16424 + 16425 + . 
+ O + + + (ROOT (S (NP (NP (NN Today)) (PP (IN with) (NP (DT a) (NN treaty)))) (PP (IN on) (IN In) (NP (NP (: -) (NP (NP (JJ termediate) (JJ Nuclear) (NNS Forces)) (PP (IN In) (NP (NN place)))) (CC and) (NP (NP (NNS agreements)) (PP (IN for) (NP (NP (JJ drastic) (NNS reductions)) (PP (IN in) (NP (NN strate))))))) (: -) (NP (NP (ADJP (JJ gic) (CC and) (JJ conventional)) (NNS forces)) (PP (IN in) (NP (DT the) (NN offing)))))) (, ,) (NP (PRP we)) (VP (MD can) (VP (VB see) (NP (NP (DT the) (NNS hands)) (PP (IN of) (NP (NP (DT that) (NN clock)) (VP (VBG being) (VP (VBN set) (ADVP (RB further) (RB back))))))))) (. .))) + + + ROOT + see + + + see + Today + + + Today + with + + + treaty + a + + + with + treaty + + + In + on + + + see + In + + + Forces + termediate + + + Forces + Nuclear + + + In + Forces + + + Forces + In + + + In + place + + + Forces + and + + + Forces + agreements + + + agreements + for + + + reductions + drastic + + + for + reductions + + + reductions + in + + + in + strate + + + forces + gic + + + gic + and + + + gic + conventional + + + Forces + forces + + + forces + in + + + offing + the + + + in + offing + + + see + we + + + see + can + + + hands + the + + + see + hands + + + hands + of + + + clock + that + + + of + clock + + + set + being + + + clock + set + + + back + further + + + set + back + + + + + ROOT + see + + + see + Today + + + treaty + a + + + Today + treaty + + + see + on + + + Forces + termediate + + + Forces + Nuclear + + + see + Forces + + + Forces + place + + + Forces + agreements + + + reductions + drastic + + + agreements + reductions + + + reductions + strate + + + forces + gic + + + gic + conventional + + + Forces + forces + + + offing + the + + + forces + offing + + + see + we + + + see + can + + + hands + the + + + see + hands + + + clock + that + + + hands + clock + + + set + being + + + clock + set + + + back + further + + + set + back + + + + + ROOT + see + + + see + Today + + + treaty + a + + + Today + treaty + + + see + 
on + + + Forces + termediate + + + Forces + Nuclear + + + see + Forces + + + Forces + place + + + Forces + agreements + + + see + agreements + + + reductions + drastic + + + agreements + reductions + + + reductions + strate + + + forces + gic + + + gic + conventional + + + forces + conventional + + + Forces + forces + + + offing + the + + + forces + offing + + + see + we + + + see + can + + + hands + the + + + see + hands + + + clock + that + + + hands + clock + + + set + being + + + clock + set + + + back + further + + + set + back + + + + + + + We + we + 16429 + 16431 + PRP + O + + + in + in + 16432 + 16434 + IN + O + + + the + the + 16435 + 16438 + DT + O + + + United + United + 16439 + 16445 + NNP + LOCATION + + + States + States + 16446 + 16452 + NNPS + LOCATION + + + consider + consider + 16453 + 16461 + VB + O + + + the + the + 16462 + 16465 + DT + O + + + Eu + eu + 16466 + 16468 + NN + ORGANIZATION + + + - + - + 16468 + 16469 + : + O + + + ropean + ropean + 16471 + 16477 + NN + O + + + democracies + democracy + 16478 + 16489 + NNS + O + + + to + to + 16490 + 16492 + TO + O + + + be + be + 16493 + 16495 + VB + O + + + our + we + 16496 + 16499 + PRP$ + O + + + friends + friend + 16500 + 16507 + NNS + O + + + and + and + 16508 + 16511 + CC + O + + + allies + ally + 16513 + 16519 + NNS + O + + + . + . + 16519 + 16520 + . + O + + + (ROOT (S (NP (NP (PRP We)) (PP (IN in) (NP (DT the) (NNP United) (NNPS States)))) (VP (VB consider) (NP (NP (DT the) (NN Eu)) (: -) (NP (NN ropean) (NNS democracies) (S (VP (TO to) (VP (VB be) (NP (PRP$ our) (NNS friends) (CC and) (NNS allies)))))))) (. 
.))) + + + ROOT + consider + + + consider + We + + + We + in + + + States + the + + + States + United + + + in + States + + + Eu + the + + + consider + Eu + + + democracies + ropean + + + Eu + democracies + + + friends + to + + + friends + be + + + friends + our + + + democracies + friends + + + friends + and + + + friends + allies + + + + + ROOT + consider + + + consider + We + + + States + the + + + States + United + + + We + States + + + Eu + the + + + consider + Eu + + + democracies + ropean + + + Eu + democracies + + + friends + to + + + friends + be + + + friends + our + + + democracies + friends + + + friends + allies + + + + + ROOT + consider + + + consider + We + + + States + the + + + States + United + + + We + States + + + Eu + the + + + consider + Eu + + + democracies + ropean + + + Eu + democracies + + + friends + to + + + friends + be + + + friends + our + + + democracies + friends + + + democracies + allies + + + friends + allies + + + + + + + Looking + look + 16521 + 16528 + VBG + O + + + ahead + ahead + 16529 + 16534 + RB + O + + + we + we + 16535 + 16537 + PRP + O + + + might + might + 16538 + 16543 + MD + O + + + ask + ask + 16544 + 16547 + VB + O + + + whether + whether + 16548 + 16555 + IN + O + + + democratic + democratic + 16557 + 16567 + JJ + O + + + nations + nation + 16568 + 16575 + NNS + O + + + in + in + 16576 + 16578 + IN + O + + + Eastern + Eastern + 16579 + 16586 + NNP + LOCATION + + + Europe + Europe + 16587 + 16593 + NNP + LOCATION + + + and + and + 16594 + 16597 + CC + O + + + the + the + 16599 + 16602 + DT + O + + + Soviet + Soviet + 16603 + 16609 + NNP + LOCATION + + + Union + Union + 16610 + 16615 + NNP + LOCATION + + + might + might + 16616 + 16621 + MD + O + + + also + also + 16622 + 16626 + RB + O + + + be + be + 16627 + 16629 + VB + O + + + our + we + 16630 + 16633 + PRP$ + O + + + friends + friend + 16634 + 16641 + NNS + O + + + and + and + 16643 + 16646 + CC + O + + + allies + ally + 16647 + 16653 + NNS + O + + + , + , + 
16653 + 16654 + , + O + + + and + and + 16655 + 16658 + CC + O + + + if + if + 16659 + 16661 + IN + O + + + so + so + 16662 + 16664 + RB + O + + + , + , + 16664 + 16665 + , + O + + + for + for + 16666 + 16669 + IN + O + + + what + what + 16670 + 16674 + WP + O + + + threat + threat + 16675 + 16681 + NN + O + + + do + do + 16682 + 16684 + VBP + O + + + we + we + 16685 + 16687 + PRP + O + + + need + need + 16689 + 16693 + VB + O + + + anything + anything + 16694 + 16702 + NN + O + + + comparable + comparable + 16703 + 16713 + JJ + O + + + to + to + 16714 + 16716 + TO + O + + + our + we + 16717 + 16720 + PRP$ + O + + + existing + exist + 16721 + 16729 + VBG + O + + + level + level + 16731 + 16736 + NN + O + + + of + of + 16737 + 16739 + IN + O + + + armaments + armament + 16740 + 16749 + NNS + O + + + ? + ? + 16749 + 16750 + . + O + + + (ROOT (S (VP (VBG Looking) (S (ADVP (RB ahead)) (NP (PRP we)) (VP (MD might) (VP (VB ask) (NP (SBAR (IN whether) (S (NP (NP (JJ democratic) (NNS nations)) (PP (IN in) (NP (NP (NNP Eastern) (NNP Europe)) (CC and) (NP (DT the) (NNP Soviet) (NNP Union))))) (VP (MD might) (ADVP (RB also)) (VP (VB be) (NP (PRP$ our) (NNS friends) (CC and) (NNS allies)))))) (, ,) (CC and) (SBAR (IN if) (FRAG (ADVP (RB so))))) (, ,) (PP (IN for) (SBARQ (WHNP (WP what) (NN threat)) (SQ (VBP do) (NP (PRP we)) (VP (VB need) (S (NP (NN anything)) (ADJP (JJ comparable) (PP (TO to) (NP (NP (PRP$ our) (VBG existing) (NN level)) (PP (IN of) (NP (NNS armaments))))))))) (. 
?))))))))) + + + ROOT + Looking + + + ask + ahead + + + ask + we + + + ask + might + + + Looking + ask + + + friends + whether + + + nations + democratic + + + friends + nations + + + nations + in + + + Europe + Eastern + + + in + Europe + + + Europe + and + + + Union + the + + + Union + Soviet + + + Europe + Union + + + friends + might + + + friends + also + + + friends + be + + + friends + our + + + ask + friends + + + friends + and + + + friends + allies + + + friends + and + + + so + if + + + friends + so + + + ask + for + + + threat + what + + + need + threat + + + need + do + + + need + we + + + for + need + + + comparable + anything + + + need + comparable + + + comparable + to + + + level + our + + + level + existing + + + to + level + + + level + of + + + of + armaments + + + + + ROOT + Looking + + + ask + ahead + + + ask + we + + + ask + might + + + Looking + ask + + + friends + whether + + + nations + democratic + + + friends + nations + + + Europe + Eastern + + + nations + Europe + + + Union + the + + + Union + Soviet + + + Europe + Union + + + friends + might + + + friends + also + + + friends + be + + + friends + our + + + ask + friends + + + friends + allies + + + so + if + + + friends + so + + + ask + for + + + threat + what + + + need + threat + + + need + do + + + need + we + + + for + need + + + comparable + anything + + + need + comparable + + + level + our + + + level + existing + + + comparable + level + + + level + armaments + + + + + ROOT + Looking + + + ask + ahead + + + ask + we + + + ask + might + + + Looking + ask + + + friends + whether + + + nations + democratic + + + friends + nations + + + Europe + Eastern + + + nations + Europe + + + Union + the + + + Union + Soviet + + + nations + Union + + + Europe + Union + + + friends + might + + + friends + also + + + friends + be + + + friends + our + + + ask + friends + + + ask + allies + + + friends + allies + + + so + if + + + ask + so + + + friends + so + + + ask + for + + + threat + what 
+ + + need + threat + + + need + do + + + need + we + + + for + need + + + comparable + anything + + + need + comparable + + + level + our + + + level + existing + + + comparable + level + + + level + armaments + + + + + + + A + a + 16754 + 16755 + DT + O + + + more + more + 16756 + 16760 + JJR + O + + + peaceful + peaceful + 16761 + 16769 + JJ + O + + + world + world + 16770 + 16775 + NN + O + + + does + do + 16776 + 16780 + VBZ + O + + + not + not + 16781 + 16784 + RB + O + + + translate + translate + 16785 + 16794 + VB + O + + + into + into + 16796 + 16800 + IN + O + + + a + a + 16801 + 16802 + DT + O + + + problem-free + problem-free + 16803 + 16815 + JJ + O + + + world + world + 16816 + 16821 + NN + O + + + . + . + 16821 + 16822 + . + O + + + (ROOT (S (NP (DT A) (JJR more) (JJ peaceful) (NN world)) (VP (VBZ does) (RB not) (VP (VB translate) (PP (IN into) (NP (DT a) (JJ problem-free) (NN world))))) (. .))) + + + ROOT + translate + + + world + A + + + world + more + + + world + peaceful + + + translate + world + + + translate + does + + + translate + not + + + translate + into + + + world + a + + + world + problem-free + + + into + world + + + + + ROOT + translate + + + world + A + + + world + more + + + world + peaceful + + + translate + world + + + translate + does + + + translate + not + + + world + a + + + world + problem-free + + + translate + world + + + + + ROOT + translate + + + world + A + + + world + more + + + world + peaceful + + + translate + world + + + translate + does + + + translate + not + + + world + a + + + world + problem-free + + + translate + world + + + + + + + Indeed + indeed + 16823 + 16829 + RB + O + + + , + , + 16829 + 16830 + , + O + + + as + as + 16831 + 16833 + IN + O + + + the + the + 16834 + 16837 + DT + O + + + threat + threat + 16839 + 16845 + NN + O + + + of + of + 16846 + 16848 + IN + O + + + nuclear + nuclear + 16849 + 16856 + JJ + O + + + incineration + incineration + 16857 + 16869 + NN + O + + + recedes + recede + 16870 + 
16877 + VBZ + O + + + , + , + 16877 + 16878 + , + O + + + we + we + 16879 + 16881 + PRP + O + + + can + can + 16883 + 16886 + MD + O + + + see + see + 16887 + 16890 + VB + O + + + more + more + 16891 + 16895 + JJR + O + + + clearly + clearly + 16896 + 16903 + RB + O + + + the + the + 16904 + 16907 + DT + O + + + danger + danger + 16908 + 16914 + NN + O + + + posed + pose + 16915 + 16920 + VBN + O + + + by + by + 16921 + 16923 + IN + O + + + environmental + environmental + 16925 + 16938 + JJ + O + + + degradation + degradation + 16939 + 16950 + NN + O + + + and + and + 16951 + 16954 + CC + O + + + global + global + 16955 + 16961 + JJ + O + + + cli + cli + 16962 + 16965 + SYM + O + + + - + - + 16965 + 16966 + : + O + + + mate + mate + 16968 + 16972 + NN + O + + + change + change + 16973 + 16979 + NN + O + + + . + . + 16979 + 16980 + . + O + + + (ROOT (S (ADVP (RB Indeed)) (, ,) (SBAR (IN as) (S (NP (NP (DT the) (NN threat)) (PP (IN of) (NP (JJ nuclear) (NN incineration)))) (VP (VBZ recedes)))) (, ,) (NP (PRP we)) (VP (MD can) (VP (VB see) (NP (JJR more)) (S (VP (ADVP (RB clearly) (NP (DT the) (NN danger))) (VBN posed) (PP (IN by)) (NP (NP (NP (JJ environmental) (NN degradation)) (CC and) (NP (JJ global))) (X (X (SYM cli)) (: -) (NP (NN mate) (NN change)) (. 
.))))))))) + + + ROOT + see + + + see + Indeed + + + recedes + as + + + threat + the + + + recedes + threat + + + threat + of + + + incineration + nuclear + + + of + incineration + + + see + recedes + + + see + we + + + see + can + + + see + more + + + posed + clearly + + + danger + the + + + clearly + danger + + + see + posed + + + posed + by + + + degradation + environmental + + + posed + degradation + + + degradation + and + + + degradation + global + + + change + cli + + + change + mate + + + degradation + change + + + + + ROOT + see + + + see + Indeed + + + recedes + as + + + threat + the + + + recedes + threat + + + incineration + nuclear + + + threat + incineration + + + see + recedes + + + see + we + + + see + can + + + see + more + + + posed + clearly + + + danger + the + + + clearly + danger + + + see + posed + + + posed + by + + + degradation + environmental + + + posed + degradation + + + degradation + global + + + change + cli + + + change + mate + + + degradation + change + + + + + ROOT + see + + + see + Indeed + + + recedes + as + + + threat + the + + + recedes + threat + + + incineration + nuclear + + + threat + incineration + + + see + recedes + + + see + we + + + see + can + + + see + more + + + posed + clearly + + + danger + the + + + clearly + danger + + + see + posed + + + posed + by + + + degradation + environmental + + + posed + degradation + + + posed + global + + + degradation + global + + + change + cli + + + change + mate + + + degradation + change + + + + + + + As + as + 16981 + 16983 + IN + O + + + nuclear + nuclear + 16984 + 16991 + JJ + O + + + winter + winter + 16992 + 16998 + NN + DATE + XXXX-WI + XXXX-WI + + + would + would + 16999 + 17004 + MD + O + + + sud + sud + 17005 + 17008 + SYM + O + + + - + - + 17008 + 17009 + : + O + + + denly + denly + 17011 + 17016 + NN + O + + + alter + alter + 17017 + 17022 + VBP + O + + + man + man + 17023 + 17026 + NN + O + + + 's + 's + 17026 + 17028 + POS + O + + + climate + climate + 17029 + 
17036 + NN + O + + + and + and + 17037 + 17040 + CC + O + + + prospects + prospects + 17041 + 17050 + NNS + O + + + for + for + 17051 + 17054 + IN + O + + + survival + survival + 17056 + 17064 + NN + O + + + , + , + 17064 + 17065 + , + O + + + so + so + 17066 + 17068 + RB + O + + + might + might + 17069 + 17074 + MD + O + + + global + global + 17075 + 17081 + JJ + O + + + warming + warming + 17082 + 17089 + NN + O + + + albeit + albeit + 17090 + 17096 + IN + O + + + more + more + 17098 + 17102 + JJR + O + + + gradually + gradually + 17103 + 17112 + RB + O + + + . + . + 17112 + 17113 + . + O + + + (ROOT (PP (IN As) (NP (NP (JJ nuclear) (NN winter)) (SBAR (S (VP (MD would)))) (SBAR (FRAG (X (SYM sud)) (: -) (NP (NP (NN denly)) (SBAR (S (VP (VBP alter) (SBAR (S (NP (NP (NP (NN man) (POS 's)) (NN climate)) (CC and) (NP (NP (NNS prospects)) (PP (IN for) (NP (NN survival)))) (, ,)) (ADVP (RB so)) (VP (MD might) (VP (NP (JJ global) (NN warming)) (ADVP (IN albeit) (NP (NP (JJR more)) (ADVP (RB gradually)))))))))))) (. 
.)))))) + + + ROOT + As + + + winter + nuclear + + + As + winter + + + winter + would + + + denly + sud + + + winter + denly + + + denly + alter + + + climate + man + + + man + 's + + + warming + climate + + + climate + and + + + climate + prospects + + + prospects + for + + + for + survival + + + warming + so + + + warming + might + + + warming + global + + + alter + warming + + + warming + albeit + + + albeit + more + + + more + gradually + + + + + ROOT + As + + + winter + nuclear + + + As + winter + + + winter + would + + + denly + sud + + + winter + denly + + + denly + alter + + + climate + man + + + warming + climate + + + climate + prospects + + + prospects + survival + + + warming + so + + + warming + might + + + warming + global + + + alter + warming + + + warming + albeit + + + albeit + more + + + more + gradually + + + + + ROOT + As + + + winter + nuclear + + + As + winter + + + winter + would + + + denly + sud + + + winter + denly + + + denly + alter + + + climate + man + + + warming + climate + + + climate + prospects + + + warming + prospects + + + prospects + survival + + + warming + so + + + warming + might + + + warming + global + + + alter + warming + + + warming + albeit + + + albeit + more + + + more + gradually + + + + + + + If + if + 17114 + 17116 + IN + O + + + we + we + 17117 + 17119 + PRP + O + + + do + do + 17120 + 17122 + VBP + O + + + nothing + nothing + 17123 + 17130 + NN + O + + + we + we + 17131 + 17133 + PRP + O + + + may + may + 17134 + 17137 + MD + O + + + be + be + 17138 + 17140 + VB + O + + + trading + trade + 17142 + 17149 + VBG + O + + + the + the + 17150 + 17153 + DT + O + + + risk + risk + 17154 + 17158 + NN + O + + + of + of + 17159 + 17161 + IN + O + + + a + a + 17162 + 17163 + DT + O + + + flash + flash + 17164 + 17169 + NN + O + + + fry + fry + 17170 + 17173 + NN + O + + + for + for + 17174 + 17177 + IN + O + + + the + the + 17178 + 17181 + DT + O + + + cer + cer + 17182 + 17185 + NN + O + + + - + - + 17185 + 17186 + : + O 
+ + + tainty + tainty + 17188 + 17194 + NN + O + + + of + of + 17195 + 17197 + IN + O + + + a + a + 17198 + 17199 + DT + O + + + slow + slow + 17200 + 17204 + JJ + O + + + roasting + roasting + 17205 + 17213 + NN + O + + + . + . + 17213 + 17214 + . + O + + + (ROOT (S (SBAR (IN If) (S (NP (PRP we)) (VP (VBP do) (NP (NN nothing))))) (NP (PRP we)) (VP (MD may) (VP (VB be) (VP (VBG trading) (NP (NP (NP (DT the) (NN risk)) (PP (IN of) (NP (NP (DT a) (NN flash) (NN fry)) (PP (IN for) (NP (DT the) (NN cer)))))) (: -) (NP (NP (NN tainty)) (PP (IN of) (NP (DT a) (JJ slow) (NN roasting)))))))) (. .))) + + + ROOT + trading + + + do + If + + + do + we + + + trading + do + + + do + nothing + + + trading + we + + + trading + may + + + trading + be + + + risk + the + + + trading + risk + + + risk + of + + + fry + a + + + fry + flash + + + of + fry + + + fry + for + + + cer + the + + + for + cer + + + risk + tainty + + + tainty + of + + + roasting + a + + + roasting + slow + + + of + roasting + + + + + ROOT + trading + + + do + If + + + do + we + + + trading + do + + + do + nothing + + + trading + we + + + trading + may + + + trading + be + + + risk + the + + + trading + risk + + + fry + a + + + fry + flash + + + risk + fry + + + cer + the + + + fry + cer + + + risk + tainty + + + roasting + a + + + roasting + slow + + + tainty + roasting + + + + + ROOT + trading + + + do + If + + + do + we + + + trading + do + + + do + nothing + + + trading + we + + + trading + may + + + trading + be + + + risk + the + + + trading + risk + + + fry + a + + + fry + flash + + + risk + fry + + + cer + the + + + fry + cer + + + risk + tainty + + + roasting + a + + + roasting + slow + + + tainty + roasting + + + + + + + In + in + 17215 + 17217 + IN + O + + + the + the + 17218 + 17221 + DT + O + + + end + end + 17222 + 17225 + NN + O + + + , + , + 17225 + 17226 + , + O + + + howev + howev + 17227 + 17232 + NN + O + + + - + - + 17232 + 17233 + : + O + + + er + er + 17235 + 17237 + NN + O + + + , + , + 
17237 + 17238 + , + O + + + the + the + 17239 + 17242 + DT + O + + + results + result + 17243 + 17250 + NNS + O + + + can + can + 17251 + 17254 + MD + O + + + be + be + 17255 + 17257 + VB + O + + + comparably + comparably + 17258 + 17268 + RB + O + + + cata + cata + 17269 + 17273 + FW + O + + + - + - + 17273 + 17274 + : + O + + + strophic + strophic + 17276 + 17284 + JJ + O + + + . + . + 17284 + 17285 + . + O + + + (ROOT (UCP (S (PP (IN In) (NP (DT the) (NN end) (, ,) (NN howev) (: -) (NN er))) (, ,) (NP (DT the) (NNS results)) (VP (MD can) (VP (VB be) (ADJP (RB comparably) (FW cata))))) (: -) (NP (JJ strophic)) (. .))) + + + ROOT + comparably + + + comparably + In + + + er + the + + + er + end + + + er + howev + + + In + er + + + results + the + + + comparably + results + + + comparably + can + + + comparably + be + + + comparably + cata + + + comparably + strophic + + + + + ROOT + comparably + + + er + the + + + er + end + + + er + howev + + + comparably + er + + + results + the + + + comparably + results + + + comparably + can + + + comparably + be + + + comparably + cata + + + comparably + strophic + + + + + ROOT + comparably + + + er + the + + + er + end + + + er + howev + + + comparably + er + + + results + the + + + comparably + results + + + comparably + can + + + comparably + be + + + comparably + cata + + + comparably + strophic + + + + + + + If + if + 17289 + 17291 + IN + O + + + a + a + 17292 + 17293 + DT + O + + + deteriorating + deteriorate + 17294 + 17307 + VBG + O + + + environment + environment + 17308 + 17319 + NN + O + + + is + be + 17320 + 17322 + VBZ + O + + + compara + compara + 17323 + 17330 + NN + O + + + - + - + 17330 + 17331 + : + O + + + ble + ble + 17333 + 17336 + NN + O + + + in + in + 17337 + 17339 + IN + O + + + consequence + consequence + 17340 + 17351 + NN + O + + + , + , + 17351 + 17352 + , + O + + + if + if + 17353 + 17355 + IN + O + + + not + not + 17356 + 17359 + RB + O + + + immediacy + immediacy + 17360 + 17369 + NN + O + + + 
, + , + 17369 + 17370 + , + O + + + to + to + 17371 + 17373 + TO + O + + + global + global + 17375 + 17381 + JJ + O + + + war + war + 17382 + 17385 + NN + O + + + , + , + 17385 + 17386 + , + O + + + then + then + 17387 + 17391 + RB + O + + + logically + logically + 17392 + 17401 + RB + O + + + it + it + 17402 + 17404 + PRP + O + + + requires + require + 17405 + 17413 + VBZ + O + + + a + a + 17414 + 17415 + DT + O + + + com + com + 17416 + 17419 + NN + O + + + - + - + 17419 + 17420 + : + O + + + parable + parable + 17422 + 17429 + JJ + O + + + response + response + 17430 + 17438 + NN + O + + + . + . + 17438 + 17439 + . + O + + + (ROOT (S (SBAR (IN If) (S (NP (DT a) (VBG deteriorating) (NN environment)) (VP (VBZ is) (NP (NP (NN compara)) (: -) (NP (NP (NN ble)) (PP (IN in) (NP (NN consequence))))) (, ,) (SBAR (IN if) (RB not) (FRAG (NP (NN immediacy)))) (, ,) (PP (TO to) (NP (JJ global) (NN war)))))) (, ,) (ADVP (RB then) (RB logically)) (NP (PRP it)) (VP (VBZ requires) (NP (DT a) (NN com) (: -) (JJ parable) (NN response))) (. 
.))) + + + ROOT + requires + + + compara + If + + + environment + a + + + environment + deteriorating + + + compara + environment + + + compara + is + + + requires + compara + + + compara + ble + + + ble + in + + + in + consequence + + + immediacy + if + + + immediacy + not + + + compara + immediacy + + + compara + to + + + war + global + + + to + war + + + logically + then + + + requires + logically + + + requires + it + + + response + a + + + response + com + + + response + parable + + + requires + response + + + + + ROOT + requires + + + compara + If + + + environment + a + + + environment + deteriorating + + + compara + environment + + + compara + is + + + requires + compara + + + compara + ble + + + ble + consequence + + + immediacy + if + + + immediacy + not + + + compara + immediacy + + + war + global + + + compara + war + + + logically + then + + + requires + logically + + + requires + it + + + response + a + + + response + com + + + response + parable + + + requires + response + + + + + ROOT + requires + + + compara + If + + + environment + a + + + environment + deteriorating + + + compara + environment + + + compara + is + + + requires + compara + + + compara + ble + + + ble + consequence + + + immediacy + if + + + immediacy + not + + + compara + immediacy + + + war + global + + + compara + war + + + logically + then + + + requires + logically + + + requires + it + + + response + a + + + response + com + + + response + parable + + + requires + response + + + + + + + Put + put + 17440 + 17443 + VB + O + + + simply + simply + 17444 + 17450 + RB + O + + + , + , + 17450 + 17451 + , + O + + + we + we + 17452 + 17454 + PRP + O + + + must + must + 17455 + 17459 + MD + O + + + be + be + 17460 + 17462 + VB + O + + + prepared + prepare + 17464 + 17472 + VBN + O + + + to + to + 17473 + 17475 + TO + O + + + come + come + 17476 + 17480 + VB + O + + + forward + forward + 17481 + 17488 + RB + O + + + with + with + 17489 + 17493 + IN + O + + + the + the + 17494 + 17497 + 
DT + O + + + re + re + 17498 + 17500 + NN + O + + + - + - + 17500 + 17501 + : + O + + + sources + source + 17503 + 17510 + NNS + O + + + to + to + 17511 + 17513 + TO + O + + + protect + protect + 17514 + 17521 + VB + O + + + our + we + 17522 + 17525 + PRP$ + O + + + environment + environment + 17526 + 17537 + NN + O + + + . + . + 17537 + 17538 + . + O + + + (ROOT (S (S (VP (VB Put) (ADVP (RB simply)))) (, ,) (NP (PRP we)) (VP (MD must) (VP (VB be) (VP (VBN prepared) (S (VP (TO to) (VP (VB come) (ADVP (RB forward)) (PP (IN with) (NP (DT the) (NN re))) (: -) (S (NP (NNS sources)) (VP (TO to) (VP (VB protect) (NP (PRP$ our) (NN environment))))))))))) (. .))) + + + ROOT + prepared + + + prepared + Put + + + Put + simply + + + prepared + we + + + prepared + must + + + prepared + be + + + come + to + + + prepared + come + + + come + forward + + + come + with + + + re + the + + + with + re + + + protect + sources + + + protect + to + + + come + protect + + + environment + our + + + protect + environment + + + + + ROOT + prepared + + + prepared + Put + + + Put + simply + + + prepared + we + + + prepared + must + + + prepared + be + + + come + to + + + prepared + come + + + come + forward + + + re + the + + + come + re + + + protect + sources + + + protect + to + + + come + protect + + + environment + our + + + protect + environment + + + + + ROOT + prepared + + + prepared + Put + + + Put + simply + + + prepared + we + + + prepared + must + + + prepared + be + + + come + to + + + prepared + come + + + come + forward + + + re + the + + + come + re + + + protect + sources + + + protect + to + + + come + protect + + + environment + our + + + protect + environment + + + + + + + Today + today + 17539 + 17544 + NN + DATE + THIS P1D + + + + my + my + 17546 + 17548 + PRP$ + O + + + country + country + 17549 + 17556 + NN + O + + + spends + spend + 17557 + 17563 + VBZ + O + + + $ + $ + 17564 + 17565 + $ + MONEY + $2.956E11 + + + 295.6 + 295.6 + 17565 + 17570 + CD + MONEY + $2.956E11 
+ + + billion + billion + 17571 + 17578 + CD + MONEY + $2.956E11 + + + on + on + 17579 + 17581 + IN + O + + + defense + defense + 17582 + 17589 + NN + O + + + and + and + 17591 + 17594 + CC + O + + + $ + $ + 17595 + 17596 + $ + MONEY + $5.6E9 + + + 5.6 + 5.6 + 17596 + 17599 + CD + MONEY + $5.6E9 + + + billion + billion + 17600 + 17607 + CD + MONEY + $5.6E9 + + + at + at + 17608 + 17610 + IN + O + + + the + the + 17611 + 17614 + DT + O + + + federal + federal + 17615 + 17622 + JJ + O + + + level + level + 17623 + 17628 + NN + O + + + , + , + 17628 + 17629 + , + O + + + or + or + 17630 + 17632 + CC + O + + + about + about + 17633 + 17638 + IN + O + + + one-fifteenth + one-fifteenth + 17640 + 17653 + JJ + ORDINAL + + + as + as + 17654 + 17656 + RB + O + + + much + much + 17657 + 17661 + RB + O + + + , + , + 17661 + 17662 + , + O + + + on + on + 17663 + 17665 + IN + O + + + protecting + protect + 17666 + 17676 + VBG + O + + + our + we + 17677 + 17680 + PRP$ + O + + + en + en + 17681 + 17683 + IN + O + + + - + - + 17683 + 17684 + : + O + + + vironment + vironment + 17686 + 17695 + NN + O + + + . + . + 17695 + 17696 + . + O + + + (ROOT (S (NP-TMP (NN Today)) (NP (PRP$ my) (NN country)) (VP (VBZ spends) (NP (NP (NP (QP ($ $) (CD 295.6) (CD billion))) (PP (IN on) (NP (NN defense)))) (CC and) (NP (NP (QP ($ $) (CD 5.6) (CD billion))) (PP (PP (IN at) (NP (DT the) (JJ federal) (NN level))) (, ,) (CC or) (PP (IN about) (NP (NP (JJ one-fifteenth)) (ADVP (RB as) (RB much)) (, ,) (PP (IN on) (S (VP (VBG protecting) (NP (PRP$ our)) (ADVP (IN en))))) (: -) (NP (NN vironment)))))))) (. 
.))) + + + ROOT + spends + + + spends + Today + + + country + my + + + spends + country + + + spends + $ + + + $ + 295.6 + + + $ + billion + + + $ + on + + + on + defense + + + $ + and + + + $ + $ + + + $ + 5.6 + + + $ + billion + + + $ + at + + + level + the + + + level + federal + + + at + level + + + at + or + + + at + about + + + about + one-fifteenth + + + much + as + + + one-fifteenth + much + + + one-fifteenth + on + + + on + protecting + + + protecting + our + + + protecting + en + + + one-fifteenth + vironment + + + + + ROOT + spends + + + spends + Today + + + country + my + + + spends + country + + + spends + $ + + + $ + 295.6 + + + $ + billion + + + $ + defense + + + $ + $ + + + $ + $ + + + $ + 5.6 + + + $ + billion + + + level + the + + + level + federal + + + $ + level + + + $ + one-fifteenth + + + much + as + + + one-fifteenth + much + + + one-fifteenth + protecting + + + protecting + our + + + protecting + en + + + one-fifteenth + vironment + + + + + ROOT + spends + + + spends + Today + + + country + my + + + spends + country + + + spends + $ + + + $ + 295.6 + + + $ + billion + + + $ + defense + + + spends + $ + + + $ + $ + + + $ + $ + + + $ + $ + + + $ + 5.6 + + + $ + billion + + + level + the + + + level + federal + + + $ + level + + + $ + one-fifteenth + + + much + as + + + one-fifteenth + much + + + one-fifteenth + protecting + + + protecting + our + + + protecting + en + + + one-fifteenth + vironment + + + + + + + It + it + 17697 + 17699 + PRP + O + + + is + be + 17700 + 17702 + VBZ + O + + + not + not + 17703 + 17706 + RB + O + + + realistic + realistic + 17707 + 17716 + JJ + O + + + to + to + 17717 + 17719 + TO + O + + + expect + expect + 17720 + 17726 + VB + O + + + these + these + 17727 + 17732 + DT + O + + + proportions + proportion + 17734 + 17745 + NNS + O + + + to + to + 17746 + 17748 + TO + O + + + be + be + 17749 + 17751 + VB + O + + + reversed + reverse + 17752 + 17760 + VBN + O + + + , + , + 17760 + 17761 + , + O + + + but + but + 
17762 + 17765 + CC + O + + + they + they + 17766 + 17770 + PRP + O + + + must + must + 17771 + 17775 + MD + O + + + be + be + 17776 + 17778 + VB + O + + + changed + change + 17780 + 17787 + VBN + O + + + . + . + 17787 + 17788 + . + O + + + (ROOT (S (S (NP (PRP It)) (VP (VBZ is) (RB not) (ADJP (JJ realistic) (S (VP (TO to) (VP (VB expect) (S (NP (DT these) (NNS proportions)) (VP (TO to) (VP (VB be) (VP (VBN reversed))))))))))) (, ,) (CC but) (S (NP (PRP they)) (VP (MD must) (VP (VB be) (VP (VBN changed))))) (. .))) + + + ROOT + realistic + + + realistic + It + + + realistic + is + + + realistic + not + + + expect + to + + + realistic + expect + + + proportions + these + + + reversed + proportions + + + reversed + to + + + reversed + be + + + expect + reversed + + + realistic + but + + + changed + they + + + changed + must + + + changed + be + + + realistic + changed + + + + + ROOT + realistic + + + realistic + It + + + realistic + is + + + realistic + not + + + expect + to + + + realistic + expect + + + proportions + these + + + reversed + proportions + + + reversed + to + + + reversed + be + + + expect + reversed + + + changed + they + + + changed + must + + + changed + be + + + realistic + changed + + + + + ROOT + realistic + + + realistic + It + + + realistic + is + + + realistic + not + + + expect + to + + + realistic + expect + + + proportions + these + + + reversed + proportions + + + reversed + to + + + reversed + be + + + expect + reversed + + + changed + they + + + changed + must + + + changed + be + + + realistic + changed + + + + + + + The + the + 17792 + 17795 + DT + O + + + end + end + 17796 + 17799 + NN + O + + + of + of + 17800 + 17802 + IN + O + + + the + the + 17803 + 17806 + DT + O + + + Cold + Cold + 17807 + 17811 + NNP + O + + + War + War + 17812 + 17815 + NNP + O + + + is + be + 17816 + 17818 + VBZ + O + + + already + already + 17819 + 17826 + RB + O + + + lead + lead + 17827 + 17831 + VB + O + + + - + - + 17831 + 17832 + : + O + + + ing + ing + 
17834 + 17837 + NN + O + + + to + to + 17838 + 17840 + TO + O + + + cuts + cut + 17841 + 17845 + NNS + O + + + in + in + 17846 + 17848 + IN + O + + + military + military + 17849 + 17857 + JJ + O + + + spending + spending + 17858 + 17866 + NN + O + + + . + . + 17866 + 17867 + . + O + + + (ROOT (S (NP (NP (DT The) (NN end)) (PP (IN of) (NP (DT the) (NNP Cold) (NNP War)))) (VP (VBZ is) (ADVP (RB already)) (S (VP (VP (VB lead)) (: -) (VP (NP (NN ing)) (PP (TO to) (NP (NP (NNS cuts)) (PP (IN in) (NP (JJ military) (NN spending))))))))) (. .))) + + + ROOT + is + + + end + The + + + is + end + + + end + of + + + War + the + + + War + Cold + + + of + War + + + is + already + + + is + lead + + + lead + ing + + + ing + to + + + to + cuts + + + cuts + in + + + spending + military + + + in + spending + + + + + ROOT + is + + + end + The + + + is + end + + + War + the + + + War + Cold + + + end + War + + + is + already + + + is + lead + + + lead + ing + + + ing + cuts + + + spending + military + + + cuts + spending + + + + + ROOT + is + + + end + The + + + is + end + + + War + the + + + War + Cold + + + end + War + + + is + already + + + is + lead + + + lead + ing + + + ing + cuts + + + spending + military + + + cuts + spending + + + + + + + This + this + 17868 + 17872 + DT + O + + + trend + trend + 17873 + 17878 + NN + O + + + should + should + 17880 + 17886 + MD + O + + + be + be + 17887 + 17889 + VB + O + + + accelerated + accelerate + 17890 + 17901 + VBN + O + + + as + as + 17902 + 17904 + IN + O + + + we + we + 17905 + 17907 + PRP + O + + + conclude + conclude + 17908 + 17916 + VBP + O + + + agree + agree + 17917 + 17922 + VBP + O + + + - + - + 17922 + 17923 + : + O + + + ments + ment + 17925 + 17930 + NNS + O + + + to + to + 17931 + 17933 + TO + O + + + reduce + reduce + 17934 + 17940 + VB + O + + + strategic + strategic + 17941 + 17950 + JJ + O + + + and + and + 17951 + 17954 + CC + O + + + conventional + conventional + 17955 + 17967 + JJ + O + + + arms + arm + 17969 + 
17973 + NNS + O + + + . + . + 17973 + 17974 + . + O + + + (ROOT (S (NP (DT This) (NN trend)) (VP (MD should) (VP (VB be) (VP (VBN accelerated) (SBAR (IN as) (S (NP (PRP we)) (VP (VBP conclude) (SBAR (S (VP (VBP agree) (: -) (NP (NNS ments) (S (VP (TO to) (VP (VB reduce) (NP (ADJP (JJ strategic) (CC and) (JJ conventional)) (NNS arms))))))))))))))) (. .))) + + + ROOT + accelerated + + + trend + This + + + accelerated + trend + + + accelerated + should + + + accelerated + be + + + conclude + as + + + conclude + we + + + accelerated + conclude + + + conclude + agree + + + agree + ments + + + reduce + to + + + ments + reduce + + + arms + strategic + + + strategic + and + + + strategic + conventional + + + reduce + arms + + + + + ROOT + accelerated + + + trend + This + + + accelerated + trend + + + accelerated + should + + + accelerated + be + + + conclude + as + + + conclude + we + + + accelerated + conclude + + + conclude + agree + + + agree + ments + + + reduce + to + + + ments + reduce + + + arms + strategic + + + strategic + conventional + + + reduce + arms + + + + + ROOT + accelerated + + + trend + This + + + accelerated + trend + + + accelerated + should + + + accelerated + be + + + conclude + as + + + conclude + we + + + accelerated + conclude + + + conclude + agree + + + agree + ments + + + reduce + to + + + ments + reduce + + + arms + strategic + + + strategic + conventional + + + arms + conventional + + + reduce + arms + + + + + + + These + these + 17975 + 17980 + DT + O + + + will + will + 17981 + 17985 + MD + O + + + jave + jave + 17986 + 17990 + VB + O + + + substantial + substantial + 17991 + 18002 + JJ + O + + + money + money + 18003 + 18008 + NN + O + + + for + for + 18009 + 18012 + IN + O + + + both + both + 18014 + 18018 + CC + O + + + the + the + 18019 + 18022 + DT + O + + + countries + country + 18023 + 18032 + NNS + O + + + of + of + 18033 + 18035 + IN + O + + + NATO + NATO + 18036 + 18040 + NNP + ORGANIZATION + + + and + and + 18041 + 18044 + CC + 
O + + + those + those + 18045 + 18050 + DT + O + + + linked + link + 18052 + 18058 + VBN + O + + + to + to + 18059 + 18061 + TO + O + + + the + the + 18062 + 18065 + DT + O + + + Warsaw + Warsaw + 18066 + 18072 + NNP + LOCATION + + + Pact + Pact + 18073 + 18077 + NNP + O + + + . + . + 18077 + 18078 + . + O + + + (ROOT (S (NP (DT These)) (VP (MD will) (VP (VB jave) (NP (NP (JJ substantial) (NN money)) (PP (IN for) (NP (CC both) (NP (NP (DT the) (NNS countries)) (PP (IN of) (NP (NNP NATO)))) (CC and) (NP (DT those)))) (VP (VBN linked) (PP (TO to) (NP (DT the) (NNP Warsaw) (NNP Pact))))))) (. .))) + + + ROOT + jave + + + jave + These + + + jave + will + + + money + substantial + + + jave + money + + + money + for + + + countries + both + + + countries + the + + + for + countries + + + countries + of + + + of + NATO + + + countries + and + + + countries + those + + + money + linked + + + linked + to + + + Pact + the + + + Pact + Warsaw + + + to + Pact + + + + + ROOT + jave + + + jave + These + + + jave + will + + + money + substantial + + + jave + money + + + countries + both + + + countries + the + + + money + countries + + + countries + NATO + + + countries + those + + + money + linked + + + Pact + the + + + Pact + Warsaw + + + linked + Pact + + + + + ROOT + jave + + + jave + These + + + jave + will + + + money + substantial + + + jave + money + + + countries + both + + + countries + the + + + money + countries + + + countries + NATO + + + money + those + + + countries + those + + + money + linked + + + Pact + the + + + Pact + Warsaw + + + linked + Pact + + + + + + + We + we + 18079 + 18081 + PRP + O + + + have + have + 18082 + 18086 + VBP + O + + + suc + suc + 18087 + 18090 + SYM + O + + + - + - + 18090 + 18091 + : + O + + + cessfully + cessfully + 18093 + 18102 + NN + O + + + met + meet + 18103 + 18106 + VBD + O + + + the + the + 18107 + 18110 + DT + O + + + challenge + challenge + 18111 + 18120 + NN + O + + + of + of + 18121 + 18123 + IN + O + + + the + the + 
18124 + 18127 + DT + O + + + Cold + Cold + 18128 + 18132 + NNP + MISC + + + War + War + 18133 + 18136 + NNP + MISC + + + . + . + 18136 + 18137 + . + O + + + (ROOT (S (NP (PRP We)) (VP (VBP have) (FRAG (X (SYM suc)) (: -) (NP (NP (NN cessfully)) (SBAR (S (VP (VBD met) (NP (NP (DT the) (NN challenge)) (PP (IN of) (NP (DT the) (NNP Cold) (NNP War)))))))) (. .))))) + + + ROOT + have + + + have + We + + + cessfully + suc + + + have + cessfully + + + cessfully + met + + + challenge + the + + + met + challenge + + + challenge + of + + + War + the + + + War + Cold + + + of + War + + + + + ROOT + have + + + have + We + + + cessfully + suc + + + have + cessfully + + + cessfully + met + + + challenge + the + + + met + challenge + + + War + the + + + War + Cold + + + challenge + War + + + + + ROOT + have + + + have + We + + + cessfully + suc + + + have + cessfully + + + cessfully + met + + + challenge + the + + + met + challenge + + + War + the + + + War + Cold + + + challenge + War + + + + + + + The + the + 18139 + 18142 + DT + O + + + question + question + 18143 + 18151 + NN + O + + + now + now + 18152 + 18155 + RB + DATE + PRESENT_REF + PRESENT_REF + + + is + be + 18156 + 18158 + VBZ + O + + + how + how + 18159 + 18162 + WRB + O + + + will + will + 18163 + 18167 + MD + O + + + we + we + 18168 + 18170 + PRP + O + + + meet + meet + 18171 + 18175 + VB + O + + + the + the + 18176 + 18179 + DT + O + + + challenge + challenge + 18181 + 18190 + NN + O + + + of + of + 18191 + 18193 + IN + O + + + peace + peace + 18194 + 18199 + NN + O + + + ? + ? + 18199 + 18200 + . + O + + + (ROOT (S (NP (DT The) (NN question)) (ADVP (RB now)) (VP (VBZ is) (SBARQ (WHADVP (WRB how)) (SQ (MD will) (NP (PRP we)) (VP (VB meet) (NP (NP (DT the) (NN challenge)) (PP (IN of) (NP (NN peace)))))))) (. 
?))) + + + ROOT + is + + + question + The + + + is + question + + + is + now + + + meet + how + + + meet + will + + + meet + we + + + is + meet + + + challenge + the + + + meet + challenge + + + challenge + of + + + of + peace + + + + + ROOT + is + + + question + The + + + is + question + + + is + now + + + meet + how + + + meet + will + + + meet + we + + + is + meet + + + challenge + the + + + meet + challenge + + + challenge + peace + + + + + ROOT + is + + + question + The + + + is + question + + + is + now + + + meet + how + + + meet + will + + + meet + we + + + is + meet + + + challenge + the + + + meet + challenge + + + challenge + peace + + + + + + + Here + here + 18204 + 18208 + RB + O + + + I + I + 18209 + 18210 + PRP + O + + + would + would + 18211 + 18216 + MD + O + + + suggest + suggest + 18217 + 18224 + VB + O + + + that + that + 18225 + 18229 + IN + O + + + a + a + 18230 + 18231 + DT + O + + + meaningful + meaningful + 18232 + 18242 + JJ + O + + + percentage + percentage + 18244 + 18254 + NN + O + + + , + , + 18254 + 18255 + , + O + + + perhaps + perhaps + 18256 + 18263 + RB + O + + + 15 + 15 + 18264 + 18266 + CD + PERCENT + %15.0 + + + percent + percent + 18267 + 18274 + NN + PERCENT + %15.0 + + + , + , + 18274 + 18275 + , + O + + + of + of + 18276 + 18278 + IN + O + + + our + we + 18279 + 18282 + PRP$ + O + + + pro- + pro- + 18283 + 18287 + JJ + O + + + spective + spective + 18289 + 18297 + JJ + O + + + peace + peace + 18298 + 18303 + NN + O + + + dividend + dividend + 18304 + 18312 + NN + O + + + be + be + 18313 + 18315 + VB + O + + + dedicated + dedicate + 18316 + 18325 + VBN + O + + + to + to + 18326 + 18328 + TO + O + + + the + the + 18329 + 18332 + DT + O + + + environment + environment + 18334 + 18345 + NN + O + + + . + . + 18345 + 18346 + . 
+ O + + + (ROOT (S (ADVP (RB Here)) (NP (PRP I)) (VP (MD would) (VP (VB suggest) (SBAR (IN that) (S (NP (NP (NP (DT a) (JJ meaningful) (NN percentage)) (, ,) (NP (RB perhaps) (CD 15) (NN percent)) (, ,)) (PP (IN of) (NP (PRP$ our) (JJ pro-) (JJ spective) (NN peace) (NN dividend)))) (VP (VB be) (VP (VBN dedicated) (PP (TO to) (NP (DT the) (NN environment))))))))) (. .))) + + + ROOT + suggest + + + suggest + Here + + + suggest + I + + + suggest + would + + + dedicated + that + + + percentage + a + + + percentage + meaningful + + + dedicated + percentage + + + percent + perhaps + + + percent + 15 + + + percentage + percent + + + percentage + of + + + dividend + our + + + dividend + pro- + + + dividend + spective + + + dividend + peace + + + of + dividend + + + dedicated + be + + + suggest + dedicated + + + dedicated + to + + + environment + the + + + to + environment + + + + + ROOT + suggest + + + suggest + Here + + + suggest + I + + + suggest + would + + + dedicated + that + + + percentage + a + + + percentage + meaningful + + + dedicated + percentage + + + percent + perhaps + + + percent + 15 + + + percentage + percent + + + dividend + our + + + dividend + pro- + + + dividend + spective + + + dividend + peace + + + percentage + dividend + + + dedicated + be + + + suggest + dedicated + + + environment + the + + + dedicated + environment + + + + + ROOT + suggest + + + suggest + Here + + + suggest + I + + + suggest + would + + + dedicated + that + + + percentage + a + + + percentage + meaningful + + + dedicated + percentage + + + percent + perhaps + + + percent + 15 + + + percentage + percent + + + dividend + our + + + dividend + pro- + + + dividend + spective + + + dividend + peace + + + percentage + dividend + + + dedicated + be + + + suggest + dedicated + + + environment + the + + + dedicated + environment + + + + + + + And + and + 18347 + 18350 + CC + O + + + I + I + 18351 + 18352 + PRP + O + + + would + would + 18353 + 18358 + MD + O + + + propose + propose + 
18359 + 18366 + VB + O + + + that + that + 18367 + 18371 + IN + O + + + the + the + 18372 + 18375 + DT + O + + + upcoming + upcoming + 18377 + 18385 + JJ + O + + + agreements + agreement + 18386 + 18396 + NNS + O + + + on + on + 18397 + 18399 + IN + O + + + strategic + strategic + 18400 + 18409 + JJ + O + + + and + and + 18410 + 18413 + CC + O + + + con + con + 18414 + 18417 + JJ + O + + + - + - + 18417 + 18418 + : + O + + + ventional + ventional + 18420 + 18429 + JJ + O + + + forces + force + 18430 + 18436 + NNS + O + + + explicitly + explicitly + 18437 + 18447 + RB + O + + + earmark + earmark + 18448 + 18455 + VBP + O + + + 15 + 15 + 18456 + 18458 + CD + NUMBER + 15.0 + + + per + per + 18459 + 18462 + IN + O + + + - + - + 18462 + 18463 + : + O + + + cent + cent + 18465 + 18469 + NN + O + + + of + of + 18470 + 18472 + IN + O + + + the + the + 18473 + 18476 + DT + O + + + resultant + resultant + 18477 + 18486 + JJ + O + + + savings + savings + 18487 + 18494 + NNS + O + + + for + for + 18495 + 18498 + IN + O + + + additional + additional + 18499 + 18509 + JJ + O + + + environmental + environmental + 18511 + 18524 + JJ + O + + + protection + protection + 18525 + 18535 + NN + O + + + to + to + 18536 + 18538 + TO + O + + + be + be + 18539 + 18541 + VB + O + + + expended + expend + 18542 + 18550 + VBN + O + + + either + either + 18552 + 18558 + CC + O + + + within + within + 18559 + 18565 + IN + O + + + the + the + 18566 + 18569 + DT + O + + + country + country + 18570 + 18577 + NN + O + + + where + where + 18578 + 18583 + WRB + O + + + the + the + 18584 + 18587 + DT + O + + + savings + savings + 18588 + 18595 + NNS + O + + + are + be + 18597 + 18600 + VBP + O + + + made + make + 18601 + 18605 + VBN + O + + + or + or + 18606 + 18608 + CC + O + + + internationally + internationally + 18609 + 18624 + RB + O + + + . + . + 18624 + 18625 + . 
+ O + + + (ROOT (S (CC And) (NP (PRP I)) (VP (MD would) (VP (VB propose) (SBAR (IN that) (S (NP (NP (DT the) (JJ upcoming) (NNS agreements)) (PP (IN on) (NP (ADJP (ADJP (JJ strategic)) (CC and) (ADJP (JJ con)) (: -)) (JJ ventional) (NNS forces)))) (ADVP (RB explicitly)) (VP (VBP earmark) (NP (NP (CD 15)) (PP (IN per) (: -) (NP (NP (NN cent)) (PP (IN of) (NP (NP (DT the) (JJ resultant) (NNS savings)) (PP (IN for) (NP (JJ additional) (JJ environmental) (NN protection))))))) (UCP (S (VP (TO to) (VP (VB be) (VP (VBN expended) (PP (CC either) (PP (IN within) (NP (NP (DT the) (NN country)) (SBAR (WHADVP (WRB where)) (S (NP (DT the) (NNS savings)) (VP (VBP are) (VP (VBN made)))))))))))) (CC or) (ADVP (RB internationally))))))))) (. .))) + + + ROOT + propose + + + propose + And + + + propose + I + + + propose + would + + + earmark + that + + + agreements + the + + + agreements + upcoming + + + earmark + agreements + + + agreements + on + + + forces + strategic + + + strategic + and + + + strategic + con + + + forces + ventional + + + on + forces + + + earmark + explicitly + + + propose + earmark + + + earmark + 15 + + + 15 + per + + + per + cent + + + cent + of + + + savings + the + + + savings + resultant + + + of + savings + + + savings + for + + + protection + additional + + + protection + environmental + + + for + protection + + + expended + to + + + expended + be + + + 15 + expended + + + within + either + + + expended + within + + + country + the + + + within + country + + + made + where + + + savings + the + + + made + savings + + + made + are + + + country + made + + + expended + or + + + expended + internationally + + + + + ROOT + propose + + + propose + And + + + propose + I + + + propose + would + + + earmark + that + + + agreements + the + + + agreements + upcoming + + + earmark + agreements + + + forces + strategic + + + strategic + con + + + forces + ventional + + + agreements + forces + + + earmark + explicitly + + + propose + earmark + + + earmark + 15 + + 
+ 15 + cent + + + savings + the + + + savings + resultant + + + cent + savings + + + protection + additional + + + protection + environmental + + + savings + protection + + + expended + to + + + expended + be + + + 15 + expended + + + expended + either + + + country + the + + + expended + country + + + made + where + + + savings + the + + + made + savings + + + made + are + + + country + made + + + expended + internationally + + + + + ROOT + propose + + + propose + And + + + propose + I + + + propose + would + + + earmark + that + + + agreements + the + + + agreements + upcoming + + + earmark + agreements + + + forces + strategic + + + strategic + con + + + forces + con + + + forces + ventional + + + agreements + forces + + + earmark + explicitly + + + propose + earmark + + + earmark + 15 + + + 15 + cent + + + savings + the + + + savings + resultant + + + cent + savings + + + protection + additional + + + protection + environmental + + + savings + protection + + + expended + to + + + expended + be + + + 15 + expended + + + expended + either + + + country + the + + + expended + country + + + made + where + + + savings + the + + + made + savings + + + made + are + + + country + made + + + 15 + internationally + + + expended + internationally + + + + + + + Under + under + 18629 + 18634 + IN + O + + + the + the + 18635 + 18638 + DT + O + + + domestic + domestic + 18639 + 18647 + JJ + O + + + law + law + 18648 + 18651 + NN + O + + + of + of + 18652 + 18654 + IN + O + + + the + the + 18655 + 18658 + DT + O + + + United + United + 18659 + 18665 + NNP + LOCATION + + + States + States + 18667 + 18673 + NNPS + LOCATION + + + such + such + 18674 + 18678 + JJ + O + + + funds + fund + 18679 + 18684 + NNS + O + + + would + would + 18685 + 18690 + MD + O + + + have + have + 18691 + 18695 + VB + O + + + to + to + 18696 + 18698 + TO + O + + + be + be + 18699 + 18701 + VB + O + + + appro + appro + 18702 + 18707 + NN + O + + + - + - + 18707 + 18708 + : + O + + + priated + priated + 
18710 + 18717 + JJ + O + + + pursuant + pursuant + 18718 + 18726 + JJ + O + + + to + to + 18727 + 18729 + TO + O + + + our + we + 18730 + 18733 + PRP$ + O + + + constitutional + constitutional + 18734 + 18748 + JJ + O + + + proc + proc + 18749 + 18753 + NN + O + + + - + - + 18753 + 18754 + : + O + + + esses + ess + 18756 + 18761 + NNS + O + + + . + . + 18761 + 18762 + . + O + + + (ROOT (S (PP (IN Under) (NP (NP (DT the) (JJ domestic) (NN law)) (PP (IN of) (NP (DT the) (NNP United) (NNPS States))))) (NP (JJ such) (NNS funds)) (VP (MD would) (VP (VB have) (S (VP (TO to) (VP (VB be) (NP (NP (NN appro)) (: -) (ADJP (JJ priated) (JJ pursuant))) (PP (TO to) (NP (NP (PRP$ our) (JJ constitutional) (NN proc)) (: -) (NP (NNS esses))))))))) (. .))) + + + ROOT + have + + + have + Under + + + law + the + + + law + domestic + + + Under + law + + + law + of + + + States + the + + + States + United + + + of + States + + + funds + such + + + have + funds + + + have + would + + + appro + to + + + appro + be + + + have + appro + + + pursuant + priated + + + appro + pursuant + + + appro + to + + + proc + our + + + proc + constitutional + + + to + proc + + + proc + esses + + + + + ROOT + have + + + law + the + + + law + domestic + + + have + law + + + States + the + + + States + United + + + law + States + + + funds + such + + + have + funds + + + have + would + + + appro + to + + + appro + be + + + have + appro + + + pursuant + priated + + + appro + pursuant + + + proc + our + + + proc + constitutional + + + appro + proc + + + proc + esses + + + + + ROOT + have + + + law + the + + + law + domestic + + + have + law + + + States + the + + + States + United + + + law + States + + + funds + such + + + have + funds + + + have + would + + + appro + to + + + appro + be + + + have + appro + + + pursuant + priated + + + appro + pursuant + + + proc + our + + + proc + constitutional + + + appro + proc + + + proc + esses + + + + + + + I + I + 18763 + 18764 + PRP + O + + + am + be + 18765 + 18767 
+ VBP + O + + + sure + sure + 18768 + 18772 + JJ + O + + + the + the + 18773 + 18776 + DT + O + + + same + same + 18777 + 18781 + JJ + O + + + would + would + 18782 + 18787 + MD + O + + + be + be + 18788 + 18790 + VB + O + + + true + true + 18791 + 18795 + JJ + O + + + for + for + 18796 + 18799 + IN + O + + + other + other + 18801 + 18806 + JJ + O + + + countries + country + 18807 + 18816 + NNS + O + + + that + that + 18817 + 18821 + WDT + O + + + would + would + 18822 + 18827 + MD + O + + + participate + participate + 18828 + 18839 + VB + O + + + in + in + 18840 + 18842 + IN + O + + + such + such + 18844 + 18848 + PDT + O + + + an + a + 18849 + 18851 + DT + O + + + agreement + agreement + 18852 + 18861 + NN + O + + + . + . + 18861 + 18862 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ sure) (SBAR (S (NP (DT the) (JJ same)) (VP (MD would) (VP (VB be) (ADJP (JJ true) (PP (IN for) (NP (NP (JJ other) (NNS countries)) (SBAR (WHNP (WDT that)) (S (VP (MD would) (VP (VB participate) (PP (IN in) (NP (PDT such) (DT an) (NN agreement)))))))))))))))) (. 
.))) + + + ROOT + sure + + + sure + I + + + sure + am + + + same + the + + + true + same + + + true + would + + + true + be + + + sure + true + + + true + for + + + countries + other + + + for + countries + + + participate + that + + + participate + would + + + countries + participate + + + participate + in + + + agreement + such + + + agreement + an + + + in + agreement + + + + + ROOT + sure + + + sure + I + + + sure + am + + + same + the + + + true + same + + + true + would + + + true + be + + + sure + true + + + countries + other + + + true + countries + + + participate + that + + + participate + would + + + countries + participate + + + agreement + such + + + agreement + an + + + participate + agreement + + + + + ROOT + sure + + + sure + I + + + sure + am + + + same + the + + + true + same + + + true + would + + + true + be + + + sure + true + + + countries + other + + + true + countries + + + participate + that + + + participate + would + + + countries + participate + + + agreement + such + + + agreement + an + + + participate + agreement + + + + + + + However + however + 18863 + 18870 + RB + O + + + , + , + 18870 + 18871 + , + O + + + the + the + 18872 + 18875 + DT + O + + + inclusion + inclusion + 18876 + 18885 + NN + O + + + of + of + 18887 + 18889 + IN + O + + + an + a + 18890 + 18892 + DT + O + + + environmental + environmental + 18893 + 18906 + JJ + O + + + peace + peace + 18907 + 18912 + NN + O + + + dividend + dividend + 18913 + 18921 + NN + O + + + in + in + 18922 + 18924 + IN + O + + + an + a + 18925 + 18927 + DT + O + + + arms + arm + 18929 + 18933 + NNS + O + + + control + control + 18934 + 18941 + NN + O + + + treaty + treaty + 18942 + 18948 + NN + O + + + will + will + 18949 + 18953 + MD + O + + + create + create + 18954 + 18960 + VB + O + + + an + a + 18961 + 18963 + DT + O + + + obligation + obligation + 18964 + 18974 + NN + O + + + and + and + 18976 + 18979 + CC + O + + + a + a + 18980 + 18981 + DT + O + + + goal + goal + 18982 + 18986 + NN + 
O + + + for + for + 18987 + 18990 + IN + O + + + both + both + 18991 + 18995 + DT + O + + + West + West + 18996 + 19000 + NNP + O + + + and + and + 19001 + 19004 + CC + O + + + East + East + 19005 + 19009 + NNP + O + + + . + . + 19009 + 19010 + . + O + + + (ROOT (S (ADVP (RB However)) (, ,) (NP (NP (DT the) (NN inclusion)) (PP (IN of) (NP (NP (DT an) (JJ environmental) (NN peace) (NN dividend)) (PP (IN in) (NP (DT an) (NNS arms) (NN control) (NN treaty)))))) (VP (MD will) (VP (VB create) (NP (NP (DT an) (NN obligation)) (CC and) (NP (DT a) (NN goal))) (PP (IN for) (NP (DT both) (NNP West) (CC and) (NNP East))))) (. .))) + + + ROOT + create + + + create + However + + + inclusion + the + + + create + inclusion + + + inclusion + of + + + dividend + an + + + dividend + environmental + + + dividend + peace + + + of + dividend + + + dividend + in + + + treaty + an + + + treaty + arms + + + treaty + control + + + in + treaty + + + create + will + + + obligation + an + + + create + obligation + + + obligation + and + + + goal + a + + + obligation + goal + + + create + for + + + West + both + + + for + West + + + West + and + + + West + East + + + + + ROOT + create + + + create + However + + + inclusion + the + + + create + inclusion + + + dividend + an + + + dividend + environmental + + + dividend + peace + + + inclusion + dividend + + + treaty + an + + + treaty + arms + + + treaty + control + + + dividend + treaty + + + create + will + + + obligation + an + + + create + obligation + + + goal + a + + + obligation + goal + + + West + both + + + create + West + + + West + East + + + + + ROOT + create + + + create + However + + + inclusion + the + + + create + inclusion + + + dividend + an + + + dividend + environmental + + + dividend + peace + + + inclusion + dividend + + + treaty + an + + + treaty + arms + + + treaty + control + + + dividend + treaty + + + create + will + + + obligation + an + + + create + obligation + + + goal + a + + + create + goal + + + obligation + 
goal + + + West + both + + + create + West + + + create + East + + + West + East + + + + + + + It + it + 19011 + 19013 + PRP + O + + + would + would + 19014 + 19019 + MD + O + + + also + also + 19021 + 19025 + RB + O + + + set + set + 19026 + 19029 + VB + O + + + an + a + 19030 + 19032 + DT + O + + + important + important + 19033 + 19042 + JJ + O + + + precedent + precedent + 19043 + 19052 + NN + O + + + for + for + 19053 + 19056 + IN + O + + + future + future + 19057 + 19063 + JJ + DATE + FUTURE_REF + FUTURE_REF + + + East-West + east-west + 19065 + 19074 + JJ + MISC + + + agreements + agreement + 19075 + 19085 + NNS + O + + + , + , + 19085 + 19086 + , + O + + + one + one + 19087 + 19090 + CD + NUMBER + 1.0 + + + where + where + 19091 + 19096 + WRB + O + + + we + we + 19097 + 19099 + PRP + O + + + agree + agree + 19100 + 19105 + VBP + O + + + not + not + 19107 + 19110 + RB + O + + + only + only + 19111 + 19115 + RB + O + + + on + on + 19116 + 19118 + IN + O + + + measures + measure + 19119 + 19127 + NNS + O + + + to + to + 19128 + 19130 + TO + O + + + reduce + reduce + 19131 + 19137 + VB + O + + + the + the + 19138 + 19141 + DT + O + + + risk + risk + 19142 + 19146 + NN + O + + + of + of + 19147 + 19149 + IN + O + + + mutual + mutual + 19151 + 19157 + JJ + O + + + destruction + destruction + 19158 + 19169 + NN + O + + + but + but + 19170 + 19173 + CC + O + + + also + also + 19174 + 19178 + RB + O + + + on + on + 19179 + 19181 + IN + O + + + major + major + 19182 + 19187 + JJ + O + + + meas + mea + 19188 + 19192 + NNS + O + + + - + - + 19192 + 19193 + : + O + + + ures + ure + 19195 + 19199 + NNS + O + + + of + of + 19200 + 19202 + IN + O + + + mutual + mutual + 19203 + 19209 + JJ + O + + + cooperation + cooperation + 19210 + 19221 + NN + O + + + . + . + 19221 + 19222 + . 
+ O + + + (ROOT (S (NP (PRP It)) (VP (MD would) (ADVP (RB also)) (VP (VB set) (NP (DT an) (JJ important) (NN precedent)) (PP (IN for) (NP (NP (JJ future) (JJ East-West) (NNS agreements)) (, ,) (NP (NP (NP (CD one)) (SBAR (WHADVP (WRB where)) (S (NP (PRP we)) (VP (VBP agree) (NP (RB not) (RB only)) (PP (PP (IN on) (NP (NNS measures) (S (VP (TO to) (VP (VB reduce) (NP (NP (DT the) (NN risk)) (PP (IN of) (NP (JJ mutual) (NN destruction))))))))) (CONJP (CC but) (RB also)) (PP (IN on) (NP (JJ major) (NNS meas)))))))) (: -) (NP (NP (NNS ures)) (PP (IN of) (NP (JJ mutual) (NN cooperation))))))))) (. .))) + + + ROOT + set + + + set + It + + + set + would + + + set + also + + + precedent + an + + + precedent + important + + + set + precedent + + + set + for + + + agreements + future + + + agreements + East-West + + + for + agreements + + + agreements + one + + + agree + where + + + agree + we + + + one + agree + + + only + not + + + agree + only + + + agree + on + + + on + measures + + + reduce + to + + + measures + reduce + + + risk + the + + + reduce + risk + + + risk + of + + + destruction + mutual + + + of + destruction + + + also + but + + + on + also + + + on + on + + + meas + major + + + on + meas + + + one + ures + + + ures + of + + + cooperation + mutual + + + of + cooperation + + + + + ROOT + set + + + set + It + + + set + would + + + set + also + + + precedent + an + + + precedent + important + + + set + precedent + + + agreements + future + + + agreements + East-West + + + set + agreements + + + agreements + one + + + agree + where + + + agree + we + + + one + agree + + + only + not + + + agree + only + + + agree + measures + + + reduce + to + + + measures + reduce + + + risk + the + + + reduce + risk + + + destruction + mutual + + + risk + destruction + + + meas + major + + + measures + meas + + + one + ures + + + cooperation + mutual + + + ures + cooperation + + + + + ROOT + set + + + set + It + + + set + would + + + set + also + + + precedent + an + + + 
precedent + important + + + set + precedent + + + agreements + future + + + agreements + East-West + + + set + agreements + + + agreements + one + + + agree + where + + + agree + we + + + one + agree + + + only + not + + + agree + only + + + agree + measures + + + reduce + to + + + measures + reduce + + + risk + the + + + reduce + risk + + + destruction + mutual + + + risk + destruction + + + meas + major + + + agree + meas + + + measures + meas + + + one + ures + + + cooperation + mutual + + + ures + cooperation + + + + + + + CONGRESSIONAL + congressional + 19230 + 19243 + JJ + O + + + RECORD-SENATE + record-senate + 19244 + 19257 + NN + O + + + January + January + 19261 + 19268 + NNP + DATE + 1990-01-23 + 1990-01-23 + + + 23 + 23 + 19269 + 19271 + CD + DATE + 1990-01-23 + 1990-01-23 + + + , + , + 19271 + 19272 + , + DATE + 1990-01-23 + 1990-01-23 + + + 1990 + 1990 + 19273 + 19277 + CD + DATE + 1990-01-23 + 1990-01-23 + + + I + I + 19283 + 19284 + PRP + O + + + would + would + 19285 + 19290 + MD + O + + + further + further + 19291 + 19298 + RB + O + + + propose + propose + 19299 + 19306 + VB + O + + + that + that + 19307 + 19311 + IN + O + + + we + we + 19312 + 19314 + PRP + O + + + direct + direct + 19315 + 19321 + VBP + O + + + the + the + 19322 + 19325 + DT + O + + + earmarked + earmark + 19327 + 19336 + VBN + O + + + environmental + environmental + 19340 + 19353 + JJ + O + + + expenditure + expenditure + 19356 + 19367 + NN + O + + + largely + largely + 19369 + 19376 + RB + O + + + to + to + 19377 + 19379 + TO + O + + + those + those + 19380 + 19385 + DT + O + + + problems + problem + 19386 + 19394 + NNS + O + + + which + which + 19395 + 19400 + WDT + O + + + are + be + 19401 + 19404 + VBP + O + + + interna + interna + 19405 + 19412 + NN + O + + + - + - + 19412 + 19413 + : + O + + + tional + tional + 19415 + 19421 + JJ + O + + + or + or + 19422 + 19424 + CC + O + + + global + global + 19425 + 19431 + JJ + O + + + in + in + 19432 + 19434 + IN + O + + + nature + 
nature + 19435 + 19441 + NN + O + + + . + . + 19441 + 19442 + . + O + + + (ROOT (S (NP (NP (JJ CONGRESSIONAL) (NN RECORD-SENATE)) (NP-TMP (NNP January) (CD 23) (, ,) (CD 1990))) (NP (PRP I)) (VP (MD would) (ADVP (RB further)) (VP (VB propose) (SBAR (IN that) (S (NP (PRP we)) (VP (VBP direct) (NP (DT the) (VBN earmarked) (JJ environmental) (NN expenditure)) (PP (ADVP (RB largely)) (TO to) (NP (NP (DT those) (NNS problems)) (SBAR (WHNP (WDT which)) (S (VP (VBP are) (NP (NP (NP (NN interna)) (: -) (ADJP (JJ tional) (CC or) (JJ global))) (PP (IN in) (NP (NN nature)))))))))))))) (. .))) + + + ROOT + propose + + + RECORD-SENATE + CONGRESSIONAL + + + propose + RECORD-SENATE + + + RECORD-SENATE + January + + + January + 23 + + + January + 1990 + + + propose + I + + + propose + would + + + propose + further + + + direct + that + + + direct + we + + + propose + direct + + + expenditure + the + + + expenditure + earmarked + + + expenditure + environmental + + + direct + expenditure + + + to + largely + + + direct + to + + + problems + those + + + to + problems + + + interna + which + + + interna + are + + + problems + interna + + + interna + tional + + + tional + or + + + tional + global + + + interna + in + + + in + nature + + + + + ROOT + propose + + + RECORD-SENATE + CONGRESSIONAL + + + propose + RECORD-SENATE + + + RECORD-SENATE + January + + + January + 23 + + + January + 1990 + + + propose + I + + + propose + would + + + propose + further + + + direct + that + + + direct + we + + + propose + direct + + + expenditure + the + + + expenditure + earmarked + + + expenditure + environmental + + + direct + expenditure + + + direct + largely + + + problems + those + + + direct + problems + + + interna + which + + + interna + are + + + problems + interna + + + interna + tional + + + tional + global + + + interna + nature + + + + + ROOT + propose + + + RECORD-SENATE + CONGRESSIONAL + + + propose + RECORD-SENATE + + + RECORD-SENATE + January + + + January + 23 + + + January + 1990 
+ + + propose + I + + + propose + would + + + propose + further + + + direct + that + + + direct + we + + + propose + direct + + + expenditure + the + + + expenditure + earmarked + + + expenditure + environmental + + + direct + expenditure + + + direct + largely + + + problems + those + + + direct + problems + + + interna + which + + + interna + are + + + problems + interna + + + interna + tional + + + interna + global + + + tional + global + + + interna + nature + + + + + + + In + in + 19443 + 19445 + IN + O + + + Europe + Europe + 19446 + 19452 + NNP + LOCATION + + + this + this + 19453 + 19457 + DT + O + + + would + would + 19459 + 19464 + MD + O + + + mean + mean + 19465 + 19469 + VB + O + + + spending + spending + 19470 + 19478 + NN + O + + + to + to + 19479 + 19481 + TO + O + + + clean + clean + 19482 + 19487 + VB + O + + + up + up + 19488 + 19490 + RP + O + + + shared + shared + 19491 + 19497 + JJ + O + + + rivers + river + 19499 + 19505 + NNS + O + + + , + , + 19505 + 19506 + , + O + + + to + to + 19507 + 19509 + TO + O + + + prevent + prevent + 19510 + 19517 + VB + O + + + air + air + 19518 + 19521 + NN + O + + + pollution + pollution + 19522 + 19531 + NN + O + + + which + which + 19532 + 19537 + WDT + O + + + in + in + 19538 + 19540 + IN + O + + + Europe + Europe + 19542 + 19548 + NNP + LOCATION + + + has + have + 19549 + 19552 + VBZ + O + + + no + no + 19553 + 19555 + DT + O + + + boundaries + boundary + 19556 + 19566 + NNS + O + + + , + , + 19566 + 19567 + , + O + + + and + and + 19568 + 19571 + CC + O + + + to + to + 19572 + 19574 + TO + O + + + neutralize + neutralize + 19575 + 19585 + VB + O + + + acid + acid + 19587 + 19591 + JJ + O + + + rain + rain + 19592 + 19596 + NN + O + + + which + which + 19597 + 19602 + WDT + O + + + is + be + 19603 + 19605 + VBZ + O + + + destroying + destroy + 19606 + 19616 + VBG + O + + + the + the + 19617 + 19620 + DT + O + + + forests + forest + 19621 + 19628 + NNS + O + + + , + , + 19628 + 19629 + , + O + + + lakes + 
lake + 19631 + 19636 + NNS + O + + + , + , + 19636 + 19637 + , + O + + + and + and + 19638 + 19641 + CC + O + + + monuments + monument + 19642 + 19651 + NNS + O + + + of + of + 19652 + 19654 + IN + O + + + Europe + Europe + 19655 + 19661 + NNP + LOCATION + + + . + . + 19661 + 19662 + . + O + + + (ROOT (S (PP (IN In) (NP (NNP Europe))) (NP (DT this)) (VP (MD would) (VP (VB mean) (NP (NN spending)) (S (VP (VP (TO to) (VP (VB clean) (PRT (RP up)) (NP (JJ shared) (NNS rivers)))) (, ,) (VP (TO to) (VP (VB prevent) (NP (NP (NN air) (NN pollution)) (SBAR (WHNP (WDT which)) (S (PP (IN in) (NP (NNP Europe))) (VP (VBZ has) (NP (DT no) (NNS boundaries)))))))) (, ,) (CC and) (VP (TO to) (VP (VB neutralize) (NP (NP (JJ acid) (NN rain)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBG destroying) (NP (NP (DT the) (NNS forests)) (, ,) (NP (NNS lakes)) (, ,) (CC and) (NP (NP (NNS monuments)) (PP (IN of) (NP (NNP Europe)))))))))))))))) (. .))) + + + ROOT + mean + + + mean + In + + + In + Europe + + + mean + this + + + mean + would + + + mean + spending + + + clean + to + + + mean + clean + + + clean + up + + + rivers + shared + + + clean + rivers + + + prevent + to + + + clean + prevent + + + pollution + air + + + prevent + pollution + + + has + which + + + has + in + + + in + Europe + + + pollution + has + + + boundaries + no + + + has + boundaries + + + clean + and + + + neutralize + to + + + clean + neutralize + + + rain + acid + + + neutralize + rain + + + destroying + which + + + destroying + is + + + rain + destroying + + + forests + the + + + destroying + forests + + + forests + lakes + + + forests + and + + + forests + monuments + + + monuments + of + + + of + Europe + + + + + ROOT + mean + + + mean + Europe + + + mean + this + + + mean + would + + + mean + spending + + + clean + to + + + mean + clean + + + clean + up + + + rivers + shared + + + clean + rivers + + + prevent + to + + + clean + prevent + + + pollution + air + + + prevent + pollution + + + has + which + + + 
has + Europe + + + pollution + has + + + boundaries + no + + + has + boundaries + + + neutralize + to + + + clean + neutralize + + + rain + acid + + + neutralize + rain + + + destroying + which + + + destroying + is + + + rain + destroying + + + forests + the + + + destroying + forests + + + forests + lakes + + + forests + monuments + + + monuments + Europe + + + + + ROOT + mean + + + mean + Europe + + + mean + this + + + mean + would + + + mean + spending + + + clean + to + + + mean + clean + + + clean + up + + + rivers + shared + + + clean + rivers + + + prevent + to + + + mean + prevent + + + clean + prevent + + + pollution + air + + + prevent + pollution + + + has + which + + + has + Europe + + + pollution + has + + + boundaries + no + + + has + boundaries + + + neutralize + to + + + mean + neutralize + + + clean + neutralize + + + rain + acid + + + neutralize + rain + + + destroying + which + + + destroying + is + + + rain + destroying + + + forests + the + + + destroying + forests + + + destroying + lakes + + + forests + lakes + + + destroying + monuments + + + forests + monuments + + + monuments + Europe + + + + + + + It + it + 19666 + 19668 + PRP + O + + + is + be + 19669 + 19671 + VBZ + O + + + no + no + 19672 + 19674 + DT + O + + + secret + secret + 19675 + 19681 + NN + O + + + that + that + 19682 + 19686 + IN + O + + + the + the + 19687 + 19690 + DT + O + + + countries + country + 19691 + 19700 + NNS + O + + + of + of + 19701 + 19703 + IN + O + + + East + East + 19704 + 19708 + NNP + O + + + - + - + 19708 + 19709 + : + O + + + ern + ern + 19711 + 19714 + NN + O + + + Europe + Europe + 19715 + 19721 + NNP + LOCATION + + + have + have + 19722 + 19726 + VBP + O + + + lagged + lag + 19727 + 19733 + VBN + O + + + far + far + 19734 + 19737 + RB + O + + + behind + behind + 19738 + 19744 + IN + O + + + Western + Western + 19745 + 19752 + NNP + LOCATION + + + Europe + Europe + 19754 + 19760 + NNP + LOCATION + + + In + in + 19761 + 19763 + IN + O + + + utilizing + 
utilize + 19764 + 19773 + VBG + O + + + pollution + pollution + 19774 + 19783 + NN + O + + + control + control + 19784 + 19791 + NN + O + + + tech + tech + 19792 + 19796 + NN + O + + + . + . + 19796 + 19797 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (DT no) (NN secret)) (SBAR (IN that) (S (NP (NP (DT the) (NNS countries)) (PP (IN of) (NP (NP (NNP East)) (: -) (NP (NP (NN ern)) (NP (NNP Europe)))))) (VP (VBP have) (VP (VBN lagged) (ADVP (RB far) (PP (IN behind) (NP (NNP Western) (NNP Europe)))))) (PP (IN In) (S (VP (VBG utilizing) (NP (NN pollution) (NN control) (NN tech)))))))) (. .))) + + + ROOT + secret + + + secret + It + + + secret + is + + + secret + no + + + lagged + that + + + countries + the + + + lagged + countries + + + countries + of + + + of + East + + + East + ern + + + ern + Europe + + + lagged + have + + + secret + lagged + + + lagged + far + + + far + behind + + + Europe + Western + + + behind + Europe + + + lagged + In + + + In + utilizing + + + tech + pollution + + + tech + control + + + utilizing + tech + + + + + ROOT + secret + + + secret + It + + + secret + is + + + secret + no + + + lagged + that + + + countries + the + + + lagged + countries + + + countries + East + + + East + ern + + + ern + Europe + + + lagged + have + + + secret + lagged + + + lagged + far + + + Europe + Western + + + far + Europe + + + lagged + utilizing + + + tech + pollution + + + tech + control + + + utilizing + tech + + + + + ROOT + secret + + + secret + It + + + secret + is + + + secret + no + + + lagged + that + + + countries + the + + + lagged + countries + + + countries + East + + + East + ern + + + ern + Europe + + + lagged + have + + + secret + lagged + + + lagged + far + + + Europe + Western + + + far + Europe + + + lagged + utilizing + + + tech + pollution + + + tech + control + + + utilizing + tech + + + + + + + nologies + nology + 19799 + 19807 + NNS + O + + + in + in + 19808 + 19810 + IN + O + + + their + they + 19811 + 19816 + PRP$ + O + + + 
manufacturing + manufacturing + 19817 + 19830 + NN + O + + + and + and + 19831 + 19834 + CC + O + + + power + power + 19835 + 19840 + NN + O + + + generation + generation + 19842 + 19852 + NN + O + + + processes + process + 19853 + 19862 + NNS + O + + + . + . + 19862 + 19863 + . + O + + + (ROOT (NP (NP (NNS nologies)) (PP (IN in) (NP (NP (PRP$ their) (NN manufacturing)) (CC and) (NP (NN power) (NN generation) (NNS processes)))) (. .))) + + + ROOT + nologies + + + nologies + in + + + manufacturing + their + + + in + manufacturing + + + manufacturing + and + + + processes + power + + + processes + generation + + + manufacturing + processes + + + + + ROOT + nologies + + + manufacturing + their + + + nologies + manufacturing + + + processes + power + + + processes + generation + + + manufacturing + processes + + + + + ROOT + nologies + + + manufacturing + their + + + nologies + manufacturing + + + processes + power + + + processes + generation + + + nologies + processes + + + manufacturing + processes + + + + + + + Partly + partly + 19864 + 19870 + RB + O + + + this + this + 19871 + 19875 + DT + O + + + results + result + 19876 + 19883 + VBZ + O + + + from + from + 19885 + 19889 + IN + O + + + antiquated + antiquated + 19890 + 19900 + JJ + O + + + plants + plant + 19901 + 19907 + NNS + O + + + , + , + 19907 + 19908 + , + O + + + partly + partly + 19909 + 19915 + RB + O + + + from + from + 19916 + 19920 + IN + O + + + eco + eco + 19921 + 19924 + SYM + O + + + - + - + 19924 + 19925 + : + O + + + nomic + nomic + 19927 + 19932 + JJ + O + + + distress + distress + 19933 + 19941 + NN + O + + + that + that + 19942 + 19946 + WDT + O + + + necessitates + necessitate + 19947 + 19959 + VBZ + O + + + use + use + 19960 + 19963 + NN + O + + + of + of + 19964 + 19966 + IN + O + + + pol + pol + 19967 + 19970 + NN + O + + + - + - + 19970 + 19971 + : + O + + + luting + lute + 19973 + 19979 + VBG + O + + + technologies + technology + 19980 + 19992 + NNS + O + + + and + and + 19993 + 
19996 + CC + O + + + fuels + fuel + 19997 + 20002 + NNS + O + + + , + , + 20002 + 20003 + , + O + + + such + such + 20004 + 20008 + JJ + O + + + as + as + 20009 + 20011 + IN + O + + + high + high + 20012 + 20016 + JJ + O + + + sulfur + sulfur + 20018 + 20024 + NN + O + + + coal + coal + 20025 + 20029 + NN + O + + + , + , + 20029 + 20030 + , + O + + + and + and + 20031 + 20034 + CC + O + + + partly + partly + 20035 + 20041 + RB + O + + + it + it + 20042 + 20044 + PRP + O + + + is + be + 20045 + 20047 + VBZ + O + + + the + the + 20048 + 20051 + DT + O + + + product + product + 20052 + 20059 + NN + O + + + of + of + 20060 + 20062 + IN + O + + + a + a + 20063 + 20064 + DT + O + + + political + political + 20066 + 20075 + JJ + O + + + system + system + 20076 + 20082 + NN + O + + + in + in + 20083 + 20085 + IN + O + + + which + which + 20086 + 20091 + WDT + O + + + the + the + 20092 + 20095 + DT + O + + + ruling + ruling + 20096 + 20102 + NN + O + + + elite + elite + 20103 + 20108 + NN + O + + + was + be + 20110 + 20113 + VBD + O + + + not + not + 20114 + 20117 + RB + O + + + responsive + responsive + 20118 + 20128 + JJ + O + + + to + to + 20129 + 20131 + TO + O + + + the + the + 20132 + 20135 + DT + O + + + concerns + concern + 20136 + 20144 + NNS + O + + + of + of + 20145 + 20147 + IN + O + + + the + the + 20148 + 20151 + DT + O + + + population + population + 20153 + 20163 + NN + O + + + . + . + 20163 + 20164 + . 
+ O + + + (ROOT (S (S (ADVP (RB Partly)) (NP (DT this)) (VP (VBZ results) (PP (PP (IN from) (NP (JJ antiquated) (NNS plants))) (, ,) (RB partly) (PP (IN from) (FRAG (X (SYM eco)) (: -) (NP (JJ nomic) (NN distress))))) (SBAR (WHNP (WDT that)) (S (VP (VBZ necessitates) (NP (NP (NN use)) (PP (IN of) (NP (NP (NN pol)) (: -) (NP (NP (VBG luting) (NNS technologies)) (CC and) (NP (NNS fuels))))) (, ,) (PP (JJ such) (IN as) (NP (JJ high) (NN sulfur) (NN coal))))))))) (, ,) (CC and) (S (ADVP (RB partly)) (NP (PRP it)) (VP (VBZ is) (NP (NP (DT the) (NN product)) (PP (IN of) (NP (NP (DT a) (JJ political) (NN system)) (SBAR (WHPP (IN in) (WHNP (WDT which))) (S (NP (DT the) (NN ruling) (NN elite)) (VP (VBD was) (RB not) (ADJP (JJ responsive) (PP (TO to) (NP (NP (DT the) (NNS concerns)) (PP (IN of) (NP (DT the) (NN population)))))))))))))) (. .))) + + + ROOT + results + + + results + Partly + + + results + this + + + results + from + + + plants + antiquated + + + from + plants + + + from + partly + + + from + from + + + distress + eco + + + distress + nomic + + + from + distress + + + necessitates + that + + + results + necessitates + + + necessitates + use + + + use + of + + + of + pol + + + technologies + luting + + + pol + technologies + + + technologies + and + + + technologies + fuels + + + as + such + + + use + as + + + coal + high + + + coal + sulfur + + + as + coal + + + results + and + + + product + partly + + + product + it + + + product + is + + + product + the + + + results + product + + + product + of + + + system + a + + + system + political + + + of + system + + + responsive + in + + + in + which + + + elite + the + + + elite + ruling + + + responsive + elite + + + responsive + was + + + responsive + not + + + system + responsive + + + responsive + to + + + concerns + the + + + to + concerns + + + concerns + of + + + population + the + + + of + population + + + + + ROOT + results + + + results + Partly + + + results + this + + + plants + antiquated + + + results + 
plants + + + results + partly + + + results + from + + + distress + eco + + + distress + nomic + + + from + distress + + + necessitates + that + + + results + necessitates + + + necessitates + use + + + use + pol + + + technologies + luting + + + pol + technologies + + + technologies + fuels + + + coal + high + + + coal + sulfur + + + use + coal + + + product + partly + + + product + it + + + product + is + + + product + the + + + results + product + + + system + a + + + system + political + + + product + system + + + responsive + which + + + elite + the + + + elite + ruling + + + responsive + elite + + + responsive + was + + + responsive + not + + + system + responsive + + + concerns + the + + + responsive + concerns + + + population + the + + + concerns + population + + + + + ROOT + results + + + results + Partly + + + results + this + + + plants + antiquated + + + results + plants + + + results + partly + + + results + from + + + distress + eco + + + distress + nomic + + + from + distress + + + necessitates + that + + + results + necessitates + + + necessitates + use + + + use + pol + + + technologies + luting + + + pol + technologies + + + pol + fuels + + + technologies + fuels + + + coal + high + + + coal + sulfur + + + use + coal + + + product + partly + + + product + it + + + product + is + + + product + the + + + results + product + + + system + a + + + system + political + + + product + system + + + responsive + which + + + elite + the + + + elite + ruling + + + responsive + elite + + + responsive + was + + + responsive + not + + + system + responsive + + + concerns + the + + + responsive + concerns + + + population + the + + + concerns + population + + + + + + + Whatever + whatever + 20165 + 20173 + WDT + O + + + the + the + 20174 + 20177 + DT + O + + + reason + reason + 20178 + 20184 + NN + O + + + , + , + 20184 + 20185 + , + O + + + however + however + 20186 + 20193 + RB + O + + + , + , + 20193 + 20194 + , + O + + + the + the + 20196 + 20199 + DT + O + 
+ + victims + victim + 20200 + 20207 + NNS + O + + + of + of + 20208 + 20210 + IN + O + + + East + east + 20211 + 20215 + JJ + MISC + + + European + european + 20216 + 20224 + JJ + MISC + + + pollution + pollution + 20225 + 20234 + NN + O + + + live + live + 20235 + 20239 + VB + O + + + in + in + 20241 + 20243 + IN + O + + + both + both + 20244 + 20248 + CC + O + + + the + the + 20249 + 20252 + DT + O + + + Eastern + Eastern + 20253 + 20260 + NNP + MISC + + + and + and + 20261 + 20264 + CC + O + + + Western + Western + 20265 + 20272 + NNP + MISC + + + wings + wing + 20273 + 20278 + NNS + O + + + of + of + 20279 + 20281 + IN + O + + + the + the + 20283 + 20286 + DT + O + + + common + common + 20287 + 20293 + JJ + O + + + European + european + 20294 + 20302 + JJ + MISC + + + home + home + 20303 + 20307 + NN + O + + + . + . + 20307 + 20308 + . + O + + + (ROOT (FRAG (WHNP (WDT Whatever)) (SBAR (WHNP (NP (DT the) (NN reason)) (PRN (, ,) (ADVP (RB however)) (, ,))) (S (NP (NP (DT the) (NNS victims)) (PP (IN of) (NP (JJ East) (JJ European) (NN pollution)))) (VP (VB live) (PP (IN in) (NP (NP (CC both) (NP (DT the) (NNP Eastern)) (CC and) (NP (NNP Western) (NNS wings))) (PP (IN of) (NP (DT the) (JJ common) (JJ European) (NN home)))))))) (. 
.))) + + + ROOT + live + + + live + Whatever + + + reason + the + + + live + reason + + + reason + however + + + victims + the + + + live + victims + + + victims + of + + + pollution + East + + + pollution + European + + + of + pollution + + + live + in + + + Eastern + both + + + Eastern + the + + + in + Eastern + + + Eastern + and + + + wings + Western + + + Eastern + wings + + + Eastern + of + + + home + the + + + home + common + + + home + European + + + of + home + + + + + ROOT + live + + + live + Whatever + + + reason + the + + + live + reason + + + reason + however + + + victims + the + + + live + victims + + + pollution + East + + + pollution + European + + + victims + pollution + + + Eastern + both + + + Eastern + the + + + live + Eastern + + + wings + Western + + + Eastern + wings + + + home + the + + + home + common + + + home + European + + + Eastern + home + + + + + ROOT + live + + + live + Whatever + + + reason + the + + + live + reason + + + reason + however + + + victims + the + + + live + victims + + + pollution + East + + + pollution + European + + + victims + pollution + + + Eastern + both + + + Eastern + the + + + live + Eastern + + + wings + Western + + + live + wings + + + Eastern + wings + + + home + the + + + home + common + + + home + European + + + Eastern + home + + + + + + + Both + both + 20309 + 20313 + DT + O + + + wings + wing + 20314 + 20319 + NNS + O + + + will + will + 20321 + 20325 + MD + O + + + benefit + benefit + 20326 + 20333 + VB + O + + + from + from + 20334 + 20338 + IN + O + + + cleaning + clean + 20339 + 20347 + VBG + O + + + up + up + 20348 + 20350 + RP + O + + + the + the + 20351 + 20354 + DT + O + + + environ + environ + 20355 + 20362 + NN + O + + + - + - + 20362 + 20363 + : + O + + + ment + ment + 20365 + 20369 + NN + O + + + . + . + 20369 + 20370 + . 
+ O + + + (ROOT (S (NP (DT Both) (NNS wings)) (VP (MD will) (VP (VB benefit) (PP (IN from) (S (VP (VBG cleaning) (PRT (RP up)) (NP (DT the) (NN environ) (: -) (NN ment))))))) (. .))) + + + ROOT + benefit + + + wings + Both + + + benefit + wings + + + benefit + will + + + benefit + from + + + from + cleaning + + + cleaning + up + + + ment + the + + + ment + environ + + + cleaning + ment + + + + + ROOT + benefit + + + wings + Both + + + benefit + wings + + + benefit + will + + + benefit + cleaning + + + cleaning + up + + + ment + the + + + ment + environ + + + cleaning + ment + + + + + ROOT + benefit + + + wings + Both + + + benefit + wings + + + benefit + will + + + benefit + cleaning + + + cleaning + up + + + ment + the + + + ment + environ + + + cleaning + ment + + + + + + + However + however + 20371 + 20378 + RB + O + + + , + , + 20378 + 20379 + , + O + + + it + it + 20380 + 20382 + PRP + O + + + follows + follow + 20383 + 20390 + VBZ + O + + + that + that + 20391 + 20395 + IN + O + + + a + a + 20396 + 20397 + DT + O + + + very + very + 20398 + 20402 + RB + O + + + large + large + 20403 + 20408 + JJ + O + + + part + part + 20410 + 20414 + NN + O + + + of + of + 20415 + 20417 + IN + O + + + the + the + 20418 + 20421 + DT + O + + + European + european + 20422 + 20430 + JJ + MISC + + + generated + generate + 20431 + 20440 + VBN + O + + + component + component + 20441 + 20450 + NN + O + + + of + of + 20452 + 20454 + IN + O + + + my + my + 20455 + 20457 + PRP$ + O + + + proposed + propose + 20458 + 20466 + VBN + O + + + environmental + environmental + 20467 + 20480 + JJ + O + + + peace + peace + 20481 + 20486 + NN + O + + + divi + divi + 20487 + 20491 + SYM + O + + + - + - + 20491 + 20492 + : + O + + + dend + dend + 20494 + 20498 + NN + O + + + should + should + 20499 + 20505 + MD + O + + + be + be + 20506 + 20508 + VB + O + + + channeled + channel + 20509 + 20518 + VBN + O + + + to + to + 20519 + 20521 + TO + O + + + Eastern + Eastern + 20522 + 20529 + NNP + LOCATION 
+ + + Europe + Europe + 20531 + 20537 + NNP + LOCATION + + + . + . + 20537 + 20538 + . + O + + + (ROOT (S (ADVP (RB However)) (, ,) (NP (PRP it)) (VP (VBZ follows) (SBAR (IN that) (S (NP (NP (DT a) (ADJP (RB very) (JJ large)) (NN part)) (PP (IN of) (NP (DT the) (JJ European)))) (VP (VBN generated) (NP (NP (NN component)) (PP (IN of) (NP (NP (PRP$ my) (VBN proposed) (JJ environmental) (NN peace)) (SBAR (FRAG (X (SYM divi)) (: -) (NP (NP (NN dend)) (SBAR (S (VP (MD should) (VP (VB be) (VP (VBN channeled) (PP (TO to) (NP (NNP Eastern) (NNP Europe))))))))) (. .)))))))))))) + + + ROOT + follows + + + follows + However + + + follows + it + + + generated + that + + + part + a + + + large + very + + + part + large + + + generated + part + + + part + of + + + European + the + + + of + European + + + follows + generated + + + generated + component + + + component + of + + + peace + my + + + peace + proposed + + + peace + environmental + + + of + peace + + + dend + divi + + + peace + dend + + + channeled + should + + + channeled + be + + + dend + channeled + + + channeled + to + + + Europe + Eastern + + + to + Europe + + + + + ROOT + follows + + + follows + However + + + follows + it + + + generated + that + + + part + a + + + large + very + + + part + large + + + generated + part + + + European + the + + + part + European + + + follows + generated + + + generated + component + + + peace + my + + + peace + proposed + + + peace + environmental + + + component + peace + + + dend + divi + + + peace + dend + + + channeled + should + + + channeled + be + + + dend + channeled + + + Europe + Eastern + + + channeled + Europe + + + + + ROOT + follows + + + follows + However + + + follows + it + + + generated + that + + + part + a + + + large + very + + + part + large + + + generated + part + + + European + the + + + part + European + + + follows + generated + + + generated + component + + + peace + my + + + peace + proposed + + + peace + environmental + + + component + peace + + + 
dend + divi + + + peace + dend + + + channeled + should + + + channeled + be + + + dend + channeled + + + Europe + Eastern + + + channeled + Europe + + + + + + + In + in + 20542 + 20544 + IN + O + + + the + the + 20545 + 20548 + DT + O + + + case + case + 20549 + 20553 + NN + O + + + of + of + 20554 + 20556 + IN + O + + + the + the + 20557 + 20560 + DT + O + + + United + United + 20561 + 20567 + NNP + LOCATION + + + States + States + 20568 + 20574 + NNPS + LOCATION + + + and + and + 20575 + 20578 + CC + O + + + Canada + Canada + 20580 + 20586 + NNP + LOCATION + + + , + , + 20586 + 20587 + , + O + + + our + we + 20588 + 20591 + PRP$ + O + + + people + people + 20592 + 20598 + NNS + O + + + will + will + 20599 + 20603 + MD + O + + + undoubtedly + undoubtedly + 20604 + 20615 + RB + O + + + expect + expect + 20616 + 20622 + VB + O + + + that + that + 20624 + 20628 + IN + O + + + the + the + 20629 + 20632 + DT + O + + + greater + greater + 20633 + 20640 + JJR + O + + + part + part + 20641 + 20645 + NN + O + + + of + of + 20646 + 20648 + IN + O + + + our + we + 20649 + 20652 + PRP$ + O + + + peace + peace + 20653 + 20658 + NN + O + + + dividend + dividend + 20659 + 20667 + NN + O + + + be + be + 20669 + 20671 + VB + O + + + spent + spend + 20672 + 20677 + VBN + O + + + in + in + 20678 + 20680 + IN + O + + + a + a + 20681 + 20682 + DT + O + + + manner + manner + 20683 + 20689 + NN + O + + + that + that + 20690 + 20694 + IN + O + + + visibly + visibly + 20695 + 20702 + RB + O + + + benefits + benefit + 20703 + 20711 + VBZ + O + + + our + we + 20713 + 20716 + PRP$ + O + + + own + own + 20717 + 20720 + JJ + O + + + people + people + 20721 + 20727 + NNS + O + + + . + . + 20727 + 20728 + . 
+ O + + + (ROOT (S (PP (IN In) (NP (NP (DT the) (NN case)) (PP (IN of) (NP (DT the) (NNP United) (NNPS States) (CC and) (NNP Canada))))) (, ,) (NP (PRP$ our) (NNS people)) (VP (MD will) (ADVP (RB undoubtedly)) (VP (VB expect) (SBAR (IN that) (S (NP (NP (DT the) (JJR greater) (NN part)) (PP (IN of) (NP (PRP$ our) (NN peace) (NN dividend)))) (VP (VB be) (VP (VBN spent) (PP (IN in) (NP (NP (DT a) (NN manner)) (SBAR (WHNP (IN that)) (S (ADVP (RB visibly)) (VP (VBZ benefits) (NP (PRP$ our) (JJ own) (NNS people))))))))))))) (. .))) + + + ROOT + expect + + + expect + In + + + case + the + + + In + case + + + case + of + + + States + the + + + States + United + + + of + States + + + States + and + + + States + Canada + + + people + our + + + expect + people + + + expect + will + + + expect + undoubtedly + + + spent + that + + + part + the + + + part + greater + + + spent + part + + + part + of + + + dividend + our + + + dividend + peace + + + of + dividend + + + spent + be + + + expect + spent + + + spent + in + + + manner + a + + + in + manner + + + benefits + that + + + benefits + visibly + + + manner + benefits + + + people + our + + + people + own + + + benefits + people + + + + + ROOT + expect + + + case + the + + + expect + case + + + States + the + + + States + United + + + case + States + + + States + Canada + + + people + our + + + expect + people + + + expect + will + + + expect + undoubtedly + + + spent + that + + + part + the + + + part + greater + + + spent + part + + + dividend + our + + + dividend + peace + + + part + dividend + + + spent + be + + + expect + spent + + + manner + a + + + spent + manner + + + benefits + that + + + benefits + visibly + + + manner + benefits + + + people + our + + + people + own + + + benefits + people + + + + + ROOT + expect + + + case + the + + + expect + case + + + States + the + + + States + United + + + case + States + + + case + Canada + + + States + Canada + + + people + our + + + expect + people + + + expect + will + + + 
expect + undoubtedly + + + spent + that + + + part + the + + + part + greater + + + spent + part + + + dividend + our + + + dividend + peace + + + part + dividend + + + spent + be + + + expect + spent + + + manner + a + + + spent + manner + + + benefits + that + + + benefits + visibly + + + manner + benefits + + + people + our + + + people + own + + + benefits + people + + + + + + + Thus + thus + 20729 + 20733 + RB + O + + + , + , + 20733 + 20734 + , + O + + + most + most + 20735 + 20739 + JJS + O + + + of + of + 20740 + 20742 + IN + O + + + our + we + 20743 + 20746 + PRP$ + O + + + two + two + 20747 + 20750 + CD + NUMBER + 2.0 + + + coun + coun + 20751 + 20755 + NN + O + + + - + - + 20755 + 20756 + : + O + + + tries + try + 20758 + 20763 + VBZ + O + + + ' + ' + 20763 + 20764 + '' + O + + + new + new + 20765 + 20768 + JJ + O + + + environmental + environmental + 20772 + 20785 + JJ + O + + + expenditure + expenditure + 20788 + 20799 + NN + O + + + should + should + 20801 + 20807 + MD + O + + + occur + occur + 20808 + 20813 + VB + O + + + on + on + 20814 + 20816 + IN + O + + + our + we + 20817 + 20820 + PRP$ + O + + + North + north + 20821 + 20826 + JJ + LOCATION + + + American + american + 20827 + 20835 + JJ + LOCATION + + + conti + contus + 20836 + 20841 + NN + O + + + - + - + 20841 + 20842 + : + O + + + nent + nent + 20844 + 20848 + NN + O + + + . + . + 20848 + 20849 + . + O + + + (ROOT (S (ADVP (RB Thus)) (, ,) (NP (NP (JJS most)) (PP (IN of) (NP (NP (PRP$ our) (CD two) (NN coun)) (: -) (S (VP (VBZ tries) (VP ('' ') (NP-TMP (JJ new) (JJ environmental) (NN expenditure)))))))) (VP (MD should) (VP (VB occur) (PP (IN on) (NP (PRP$ our) (JJ North) (JJ American) (NN conti) (: -) (NN nent))))) (. 
.))) + + + ROOT + occur + + + occur + Thus + + + occur + most + + + most + of + + + coun + our + + + coun + two + + + of + coun + + + coun + tries + + + expenditure + new + + + expenditure + environmental + + + tries + expenditure + + + occur + should + + + occur + on + + + nent + our + + + nent + North + + + nent + American + + + nent + conti + + + on + nent + + + + + ROOT + occur + + + occur + Thus + + + occur + most + + + coun + our + + + coun + two + + + most + coun + + + coun + tries + + + expenditure + new + + + expenditure + environmental + + + tries + expenditure + + + occur + should + + + nent + our + + + nent + North + + + nent + American + + + nent + conti + + + occur + nent + + + + + ROOT + occur + + + occur + Thus + + + occur + most + + + coun + our + + + coun + two + + + most + coun + + + coun + tries + + + expenditure + new + + + expenditure + environmental + + + tries + expenditure + + + occur + should + + + nent + our + + + nent + North + + + nent + American + + + nent + conti + + + occur + nent + + + + + + + This + this + 20850 + 20854 + DT + O + + + expenditure + expenditure + 20855 + 20866 + NN + O + + + should + should + 20867 + 20873 + MD + O + + + nonetheless + nonetheless + 20874 + 20885 + RB + O + + + be + be + 20887 + 20889 + VB + O + + + made + make + 20890 + 20894 + VBN + O + + + in + in + 20895 + 20897 + IN + O + + + a + a + 20898 + 20899 + DT + O + + + way + way + 20900 + 20903 + NN + O + + + that + that + 20904 + 20908 + IN + O + + + benefits + benefit + 20909 + 20917 + NNS + O + + + the + the + 20918 + 20921 + DT + O + + + global + global + 20922 + 20928 + JJ + O + + + environment + environment + 20930 + 20941 + NN + O + + + . + . + 20941 + 20942 + . + O + + + (ROOT (S (NP (DT This) (NN expenditure)) (VP (MD should) (ADVP (RB nonetheless)) (VP (VB be) (VP (VBN made) (PP (IN in) (NP (NP (DT a) (NN way)) (SBAR (IN that) (FRAG (NP-TMP (NP (NNS benefits)) (NP (DT the) (JJ global) (NN environment)))))))))) (. 
.))) + + + ROOT + made + + + expenditure + This + + + made + expenditure + + + made + should + + + made + nonetheless + + + made + be + + + made + in + + + way + a + + + in + way + + + benefits + that + + + way + benefits + + + environment + the + + + environment + global + + + benefits + environment + + + + + ROOT + made + + + expenditure + This + + + made + expenditure + + + made + should + + + made + nonetheless + + + made + be + + + way + a + + + made + way + + + benefits + that + + + way + benefits + + + environment + the + + + environment + global + + + benefits + environment + + + + + ROOT + made + + + expenditure + This + + + made + expenditure + + + made + should + + + made + nonetheless + + + made + be + + + way + a + + + made + way + + + benefits + that + + + way + benefits + + + environment + the + + + environment + global + + + benefits + environment + + + + + + + North + North + 20943 + 20948 + NNP + MISC + + + Americans + Americans + 20949 + 20958 + NNPS + MISC + + + are + be + 20959 + 20962 + VBP + O + + + both + both + 20963 + 20967 + DT + O + + + in + in + 20968 + 20970 + IN + O + + + the + the + 20972 + 20975 + DT + O + + + aggregate + aggregate + 20976 + 20985 + NN + O + + + and + and + 20986 + 20989 + CC + O + + + on + on + 20990 + 20992 + IN + O + + + a + a + 20993 + 20994 + DT + O + + + per + per + 20995 + 20998 + FW + O + + + capita + capita + 20999 + 21005 + NN + O + + + basis + basis + 21006 + 21011 + NN + O + + + the + the + 21012 + 21015 + DT + O + + + biggest + biggest + 21017 + 21024 + JJS + O + + + producers + producer + 21025 + 21034 + NNS + O + + + of + of + 21035 + 21037 + IN + O + + + the + the + 21038 + 21041 + DT + O + + + greenhouse + greenhouse + 21042 + 21052 + NN + O + + + gases + gas + 21053 + 21058 + NNS + O + + + , + , + 21058 + 21059 + , + O + + + and + and + 21061 + 21064 + CC + O + + + In + in + 21065 + 21067 + IN + O + + + particular + particular + 21068 + 21078 + JJ + O + + + of + of + 21079 + 21081 + IN + O + + + 
carbon + carbon + 21082 + 21088 + NN + O + + + dioxide + dioxide + 21089 + 21096 + NN + O + + + . + . + 21096 + 21097 + . + O + + + (ROOT (S (NP (NNP North) (NNPS Americans)) (VP (VBP are) (PP (DT both) (PP (IN in) (NP (DT the) (NN aggregate))) (CC and) (PP (IN on) (NP (NP (DT a) (FW per) (NN capita) (NN basis)) (NP (NP (NP (DT the) (JJS biggest) (NNS producers)) (PP (IN of) (NP (DT the) (NN greenhouse) (NNS gases)))) (, ,) (CC and) (PP (IN In) (NP (NP (JJ particular)) (PP (IN of) (NP (NN carbon) (NN dioxide)))))))))) (. .))) + + + ROOT + are + + + Americans + North + + + are + Americans + + + in + both + + + are + in + + + aggregate + the + + + in + aggregate + + + in + and + + + in + on + + + basis + a + + + basis + per + + + basis + capita + + + on + basis + + + producers + the + + + producers + biggest + + + basis + producers + + + producers + of + + + gases + the + + + gases + greenhouse + + + of + gases + + + producers + and + + + producers + In + + + In + particular + + + particular + of + + + dioxide + carbon + + + of + dioxide + + + + + ROOT + are + + + Americans + North + + + are + Americans + + + are + are + + + are + both + + + aggregate + the + + + are + aggregate + + + basis + a + + + basis + per + + + basis + capita + + + are + basis + + + producers + the + + + producers + biggest + + + basis + producers + + + gases + the + + + gases + greenhouse + + + producers + gases + + + producers + In + + + In + particular + + + dioxide + carbon + + + particular + dioxide + + + + + ROOT + are + + + Americans + North + + + are + Americans + + + are + Americans + + + are + are + + + are + both + + + aggregate + the + + + are + aggregate + + + basis + a + + + basis + per + + + basis + capita + + + are + basis + + + producers + the + + + producers + biggest + + + basis + producers + + + gases + the + + + gases + greenhouse + + + producers + gases + + + basis + In + + + producers + In + + + In + particular + + + dioxide + carbon + + + particular + dioxide + + + + + 
+ + Logical + logical + 21098 + 21105 + JJ + O + + + - + - + 21105 + 21106 + : + O + + + ly + ly + 21108 + 21110 + RB + O + + + , + , + 21110 + 21111 + , + O + + + the + the + 21112 + 21115 + DT + O + + + effort + effort + 21116 + 21122 + NN + O + + + to + to + 21123 + 21125 + TO + O + + + begin + begin + 21126 + 21131 + VB + O + + + to + to + 21132 + 21134 + TO + O + + + control + control + 21135 + 21142 + VB + O + + + global + global + 21143 + 21149 + JJ + O + + + warming + warming + 21151 + 21158 + NN + O + + + must + must + 21159 + 21163 + MD + O + + + start + start + 21164 + 21169 + VB + O + + + In + in + 21170 + 21172 + IN + O + + + North + North + 21173 + 21178 + NNP + LOCATION + + + America + America + 21179 + 21186 + NNP + LOCATION + + + . + . + 21186 + 21187 + . + O + + + (ROOT (NP (NP (JJ Logical)) (: -) (ADVP (RB ly)) (, ,) (S (NP (DT the) (NN effort) (S (VP (TO to) (VP (VB begin) (S (VP (TO to) (VP (VB control) (NP (JJ global) (NN warming))))))))) (VP (MD must) (VP (VB start) (PP (IN In) (NP (NNP North) (NNP America)))))) (. 
.))) + + + ROOT + Logical + + + Logical + ly + + + effort + the + + + start + effort + + + begin + to + + + effort + begin + + + control + to + + + begin + control + + + warming + global + + + control + warming + + + start + must + + + Logical + start + + + start + In + + + America + North + + + In + America + + + + + ROOT + Logical + + + Logical + ly + + + effort + the + + + start + effort + + + begin + to + + + effort + begin + + + control + to + + + begin + control + + + warming + global + + + control + warming + + + start + must + + + Logical + start + + + America + North + + + start + America + + + + + ROOT + Logical + + + Logical + ly + + + effort + the + + + start + effort + + + begin + to + + + effort + begin + + + control + to + + + begin + control + + + warming + global + + + control + warming + + + start + must + + + Logical + start + + + America + North + + + start + America + + + + + + + Under + under + 21189 + 21194 + IN + O + + + my + my + 21195 + 21197 + PRP$ + O + + + proposed + propose + 21198 + 21206 + VBN + O + + + scheme + scheme + 21207 + 21213 + NN + O + + + I + I + 21214 + 21215 + PRP + O + + + would + would + 21216 + 21221 + MD + O + + + recom + recom + 21222 + 21227 + VB + O + + + - + - + 21227 + 21228 + : + O + + + mend + mend + 21230 + 21234 + VB + O + + + that + that + 21235 + 21239 + IN + O + + + a + a + 21240 + 21241 + DT + O + + + great + great + 21242 + 21247 + JJ + O + + + part + part + 21248 + 21252 + NN + O + + + of + of + 21253 + 21255 + IN + O + + + our + we + 21256 + 21259 + PRP$ + O + + + environmen + environman + 21260 + 21270 + NN + O + + + - + - + 21270 + 21271 + : + O + + + tal + tal + 21273 + 21276 + JJ + O + + + dividend + dividend + 21277 + 21285 + NN + O + + + be + be + 21286 + 21288 + VB + O + + + used + use + 21289 + 21293 + VBN + O + + + to + to + 21294 + 21296 + TO + O + + + develop + develop + 21297 + 21304 + VB + O + + + energy + energy + 21305 + 21311 + NN + O + + + con + con + 21312 + 21315 + NN + O + + + - + 
- + 21315 + 21316 + : + O + + + servation + servation + 21318 + 21327 + NN + O + + + technologies + technology + 21328 + 21340 + NNS + O + + + as + as + 21341 + 21343 + RB + O + + + well + well + 21344 + 21348 + RB + O + + + as + as + 21349 + 21351 + IN + O + + + alterna + alterna + 21352 + 21359 + NN + O + + + - + - + 21359 + 21360 + : + O + + + tives + tive + 21362 + 21367 + NNS + O + + + to + to + 21368 + 21370 + TO + O + + + fossil + fossil + 21371 + 21377 + JJ + O + + + fuels + fuel + 21378 + 21383 + NNS + O + + + . + . + 21383 + 21384 + . + O + + + (ROOT (S (PP (IN Under) (NP (PRP$ my) (VBN proposed) (NN scheme))) (NP (PRP I)) (VP (MD would) (VP (VB recom) (PRN (: -) (S (VP (VB mend) (SBAR (IN that) (S (NP (NP (DT a) (JJ great) (NN part)) (PP (IN of) (NP (NP (PRP$ our) (NN environmen)) (: -) (NP (JJ tal) (NN dividend))))) (VP (VB be) (VP (VBN used) (S (VP (TO to) (VP (VB develop) (NP (NP (NP (NN energy) (NN con)) (: -) (NP (NN servation) (NNS technologies))) (CONJP (RB as) (RB well) (IN as)) (NP (NN alterna)))))))))))) (: -)) (NP (NNS tives)) (PP (TO to) (NP (JJ fossil) (NNS fuels))))) (. 
.))) + + + ROOT + recom + + + recom + Under + + + scheme + my + + + scheme + proposed + + + Under + scheme + + + recom + I + + + recom + would + + + recom + mend + + + used + that + + + part + a + + + part + great + + + used + part + + + part + of + + + environmen + our + + + of + environmen + + + dividend + tal + + + environmen + dividend + + + used + be + + + mend + used + + + develop + to + + + used + develop + + + con + energy + + + develop + con + + + technologies + servation + + + con + technologies + + + well + as + + + con + well + + + well + as + + + con + alterna + + + recom + tives + + + recom + to + + + fuels + fossil + + + to + fuels + + + + + ROOT + recom + + + scheme + my + + + scheme + proposed + + + recom + scheme + + + recom + I + + + recom + would + + + recom + mend + + + used + that + + + part + a + + + part + great + + + used + part + + + environmen + our + + + part + environmen + + + dividend + tal + + + environmen + dividend + + + used + be + + + mend + used + + + develop + to + + + used + develop + + + con + energy + + + develop + con + + + technologies + servation + + + con + technologies + + + con + alterna + + + recom + tives + + + fuels + fossil + + + recom + fuels + + + + + ROOT + recom + + + scheme + my + + + scheme + proposed + + + recom + scheme + + + recom + I + + + recom + would + + + recom + mend + + + used + that + + + part + a + + + part + great + + + used + part + + + environmen + our + + + part + environmen + + + dividend + tal + + + environmen + dividend + + + used + be + + + mend + used + + + develop + to + + + used + develop + + + con + energy + + + develop + con + + + technologies + servation + + + con + technologies + + + develop + alterna + + + con + alterna + + + recom + tives + + + fuels + fossil + + + recom + fuels + + + + + + + As + as + 21385 + 21387 + IN + O + + + a + a + 21388 + 21389 + DT + O + + + bonus + bonus + 21390 + 21395 + NN + O + + + , + , + 21395 + 21396 + , + O + + + this + this + 21397 + 21401 + DT + 
O + + + effort + effort + 21402 + 21408 + NN + O + + + will + will + 21410 + 21414 + MD + O + + + help + help + 21415 + 21419 + VB + O + + + ameliorate + ameliorate + 21420 + 21430 + VB + O + + + the + the + 21431 + 21434 + DT + O + + + problem + problem + 21435 + 21442 + NN + O + + + of + of + 21443 + 21445 + IN + O + + + acid + acid + 21446 + 21450 + JJ + O + + + rain + rain + 21452 + 21456 + NN + O + + + , + , + 21456 + 21457 + , + O + + + which + which + 21458 + 21463 + WDT + O + + + has + have + 21464 + 21467 + VBZ + O + + + become + become + 21468 + 21474 + VBN + O + + + a + a + 21475 + 21476 + DT + O + + + major + major + 21477 + 21482 + JJ + O + + + bilateral + bilateral + 21483 + 21492 + JJ + O + + + issue + issue + 21494 + 21499 + NN + O + + + in + in + 21500 + 21502 + IN + O + + + U.S.-Canada + u.s.-canada + 21503 + 21514 + JJ + MISC + + + relations + relation + 21515 + 21524 + NNS + O + + + and + and + 21525 + 21528 + CC + O + + + has + have + 21529 + 21532 + VBZ + O + + + in + in + 21533 + 21535 + IN + O + + + - + - + 21535 + 21536 + : + O + + + flicted + flicted + 21538 + 21545 + JJ + O + + + damage + damage + 21546 + 21552 + NN + O + + + on + on + 21553 + 21555 + IN + O + + + my + my + 21556 + 21558 + PRP$ + O + + + home + home + 21559 + 21563 + NN + O + + + region + region + 21564 + 21570 + NN + O + + + of + of + 21571 + 21573 + IN + O + + + New + New + 21574 + 21577 + NNP + LOCATION + + + England + England + 21579 + 21586 + NNP + LOCATION + + + . + . + 21586 + 21587 + . 
+ O + + + (ROOT (S (PP (IN As) (NP (DT a) (NN bonus))) (, ,) (NP (DT this) (NN effort)) (VP (MD will) (VP (VB help) (VP (VB ameliorate) (NP (NP (DT the) (NN problem)) (PP (IN of) (NP (NP (JJ acid) (NN rain)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VP (VBZ has) (VP (VBN become) (NP (NP (DT a) (JJ major) (JJ bilateral) (NN issue)) (PP (IN in) (NP (JJ U.S.-Canada) (NNS relations)))))) (CC and) (VP (VBZ has))))))) (PP (IN in) (: -) (NP (NP (JJ flicted) (NN damage)) (PP (IN on) (NP (NP (PRP$ my) (NN home) (NN region)) (PP (IN of) (NP (NNP New) (NNP England))))))))))) (. .))) + + + ROOT + help + + + help + As + + + bonus + a + + + As + bonus + + + effort + this + + + help + effort + + + help + will + + + help + ameliorate + + + problem + the + + + ameliorate + problem + + + problem + of + + + rain + acid + + + of + rain + + + issue + which + + + issue + has + + + issue + become + + + issue + a + + + issue + major + + + issue + bilateral + + + rain + issue + + + issue + in + + + relations + U.S.-Canada + + + in + relations + + + issue + and + + + issue + has + + + problem + in + + + damage + flicted + + + in + damage + + + damage + on + + + region + my + + + region + home + + + on + region + + + region + of + + + England + New + + + of + England + + + + + ROOT + help + + + bonus + a + + + help + bonus + + + effort + this + + + help + effort + + + help + will + + + help + ameliorate + + + problem + the + + + ameliorate + problem + + + rain + acid + + + problem + rain + + + issue + which + + + issue + has + + + issue + become + + + issue + a + + + issue + major + + + issue + bilateral + + + rain + issue + + + relations + U.S.-Canada + + + issue + relations + + + issue + has + + + damage + flicted + + + problem + damage + + + region + my + + + region + home + + + damage + region + + + England + New + + + region + England + + + + + ROOT + help + + + bonus + a + + + help + bonus + + + effort + this + + + help + effort + + + help + will + + + help + ameliorate + + + problem + 
the + + + ameliorate + problem + + + rain + acid + + + problem + rain + + + issue + which + + + has + which + + + issue + has + + + issue + become + + + issue + a + + + issue + major + + + issue + bilateral + + + rain + issue + + + relations + U.S.-Canada + + + issue + relations + + + rain + has + + + issue + has + + + damage + flicted + + + problem + damage + + + region + my + + + region + home + + + damage + region + + + England + New + + + region + England + + + + + + + At + at + 21591 + 21593 + IN + O + + + this + this + 21594 + 21598 + DT + O + + + time + time + 21599 + 21603 + NN + O + + + I + I + 21604 + 21605 + PRP + O + + + can + can + 21606 + 21609 + MD + O + + + not + not + 21609 + 21612 + RB + O + + + state + state + 21613 + 21618 + VB + O + + + the + the + 21619 + 21622 + DT + O + + + amount + amount + 21623 + 21629 + NN + O + + + of + of + 21630 + 21632 + IN + O + + + new + new + 21634 + 21637 + JJ + O + + + environmental + environmental + 21638 + 21651 + JJ + O + + + expenditure + expenditure + 21652 + 21663 + NN + O + + + to + to + 21664 + 21666 + TO + O + + + be + be + 21667 + 21669 + VB + O + + + gener + gener + 21670 + 21675 + SYM + O + + + - + - + 21675 + 21676 + : + O + + + ated + ate + 21678 + 21682 + VBN + O + + + by + by + 21683 + 21685 + IN + O + + + my + my + 21686 + 21688 + PRP$ + O + + + proposal + proposal + 21689 + 21697 + NN + O + + + . + . + 21697 + 21698 + . + O + + + (ROOT (S (PP (IN At) (NP (DT this) (NN time))) (NP (PRP I)) (VP (MD can) (RB not) (VP (VB state) (NP (NP (DT the) (NN amount)) (PP (IN of) (NP (JJ new) (JJ environmental) (NN expenditure)))) (S (VP (TO to) (VP (VB be) (S (VP (X (SYM gener)) (: -) (VP (VBN ated) (PP (IN by) (NP (PRP$ my) (NN proposal))))))))))) (. 
.))) + + + ROOT + state + + + state + At + + + time + this + + + At + time + + + state + I + + + state + can + + + state + not + + + amount + the + + + state + amount + + + amount + of + + + expenditure + new + + + expenditure + environmental + + + of + expenditure + + + be + to + + + state + be + + + ated + gener + + + be + ated + + + ated + by + + + proposal + my + + + by + proposal + + + + + ROOT + state + + + time + this + + + state + time + + + state + I + + + state + can + + + state + not + + + amount + the + + + state + amount + + + expenditure + new + + + expenditure + environmental + + + amount + expenditure + + + be + to + + + state + be + + + ated + gener + + + be + ated + + + proposal + my + + + ated + proposal + + + + + ROOT + state + + + time + this + + + state + time + + + state + I + + + state + can + + + state + not + + + amount + the + + + state + amount + + + expenditure + new + + + expenditure + environmental + + + amount + expenditure + + + be + to + + + state + be + + + ated + gener + + + be + ated + + + proposal + my + + + ated + proposal + + + + + + + However + however + 21699 + 21706 + RB + O + + + , + , + 21706 + 21707 + , + O + + + some + some + 21708 + 21712 + DT + O + + + project + project + 21713 + 21720 + NN + O + + + that + that + 21722 + 21726 + IN + O + + + the + the + 21727 + 21730 + DT + O + + + end + end + 21731 + 21734 + NN + O + + + of + of + 21735 + 21737 + IN + O + + + the + the + 21738 + 21741 + DT + O + + + Cold + Cold + 21742 + 21746 + NNP + MISC + + + War + War + 21747 + 21750 + NNP + MISC + + + might + might + 21751 + 21756 + MD + O + + + lead + lead + 21757 + 21761 + VB + O + + + to + to + 21762 + 21764 + TO + O + + + a + a + 21766 + 21767 + DT + O + + + 50 + 50 + 21768 + 21770 + CD + PERCENT + %50.0 + + + percent + percent + 21771 + 21778 + NN + PERCENT + %50.0 + + + reduction + reduction + 21779 + 21788 + NN + O + + + in + in + 21789 + 21791 + IN + O + + + U.S. + U.S. 
+ 21792 + 21796 + NNP + LOCATION + + + defense + defense + 21797 + 21804 + NN + O + + + spending + spending + 21806 + 21814 + NN + O + + + by + by + 21815 + 21817 + IN + O + + + the + the + 21818 + 21821 + DT + DATE + THIS P100Y + + + + end + end + 21822 + 21825 + NN + DATE + THIS P100Y + + + + of + of + 21826 + 21828 + IN + DATE + THIS P100Y + + + + this + this + 21829 + 21833 + DT + DATE + THIS P100Y + + + + century + century + 21834 + 21841 + NN + DATE + THIS P100Y + + + + . + . + 21841 + 21842 + . + O + + + (ROOT (S (ADVP (RB However)) (, ,) (NP (NP (DT some) (NN project)) (PP (IN that) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (NNP Cold) (NNP War)))))) (VP (MD might) (VP (VB lead) (PP (TO to) (NP (NP (DT a) (CD 50) (NN percent) (NN reduction)) (PP (IN in) (NP (NNP U.S.) (NN defense) (NN spending))))) (PP (IN by) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT this) (NN century))))))) (. .))) + + + ROOT + lead + + + lead + However + + + project + some + + + lead + project + + + project + that + + + end + the + + + that + end + + + end + of + + + War + the + + + War + Cold + + + of + War + + + lead + might + + + lead + to + + + reduction + a + + + reduction + 50 + + + reduction + percent + + + to + reduction + + + reduction + in + + + spending + U.S. + + + spending + defense + + + in + spending + + + lead + by + + + end + the + + + by + end + + + end + of + + + century + this + + + of + century + + + + + ROOT + lead + + + lead + However + + + project + some + + + lead + project + + + end + the + + + project + end + + + War + the + + + War + Cold + + + end + War + + + lead + might + + + reduction + a + + + reduction + 50 + + + reduction + percent + + + lead + reduction + + + spending + U.S. 
+ + + spending + defense + + + reduction + spending + + + end + the + + + lead + end + + + century + this + + + end + century + + + + + ROOT + lead + + + lead + However + + + project + some + + + lead + project + + + end + the + + + project + end + + + War + the + + + War + Cold + + + end + War + + + lead + might + + + reduction + a + + + reduction + 50 + + + reduction + percent + + + lead + reduction + + + spending + U.S. + + + spending + defense + + + reduction + spending + + + end + the + + + lead + end + + + century + this + + + end + century + + + + + + + If + if + 21843 + 21845 + IN + O + + + 15 + 15 + 21846 + 21848 + CD + PERCENT + %15.0 + + + percent + percent + 21850 + 21857 + NN + PERCENT + %15.0 + + + of + of + 21858 + 21860 + IN + O + + + this + this + 21861 + 21865 + DT + O + + + saving + saving + 21866 + 21872 + NN + O + + + went + go + 21873 + 21877 + VBD + O + + + to + to + 21878 + 21880 + TO + O + + + the + the + 21881 + 21884 + DT + O + + + environ + environ + 21885 + 21892 + NN + O + + + - + - + 21892 + 21893 + : + O + + + mental + mental + 21895 + 21901 + JJ + O + + + peace + peace + 21902 + 21907 + NN + O + + + dividend + dividend + 21908 + 21916 + NN + O + + + , + , + 21916 + 21917 + , + O + + + the + the + 21918 + 21921 + DT + O + + + annual + annual + 21922 + 21928 + JJ + SET + P1Y + + + new + new + 21929 + 21932 + JJ + O + + + envi + envus + 21933 + 21937 + NNS + O + + + - + - + 21937 + 21938 + : + O + + + ronmental + ronmental + 21940 + 21949 + JJ + O + + + expenditure + expenditure + 21950 + 21961 + NN + O + + + in + in + 21962 + 21964 + IN + O + + + the + the + 21965 + 21968 + DT + O + + + United + United + 21969 + 21975 + NNP + LOCATION + + + States + States + 21976 + 21982 + NNPS + LOCATION + + + would + would + 21984 + 21989 + MD + O + + + equal + equal + 21990 + 21995 + VB + O + + + $ + $ + 21996 + 21997 + $ + MONEY + $2.2E10 + + + 22 + 22 + 21997 + 21999 + CD + MONEY + $2.2E10 + + + billion + billion + 22000 + 22007 + CD + MONEY + 
$2.2E10 + + + , + , + 22007 + 22008 + , + O + + + or + or + 22009 + 22011 + CC + O + + + four + four + 22012 + 22016 + CD + NUMBER + 4.0 + + + times + times + 22017 + 22022 + CC + O + + + our + we + 22023 + 22026 + PRP$ + O + + + present + present + 22028 + 22035 + JJ + DATE + PRESENT_REF + PRESENT_REF + + + federal + federal + 22036 + 22043 + JJ + O + + + effort + effort + 22044 + 22050 + NN + O + + + . + . + 22050 + 22051 + . + O + + + (ROOT (S (SBAR (IN If) (S (NP (NP (CD 15) (NN percent)) (PP (IN of) (NP (DT this) (NN saving)))) (VP (VBD went) (PP (TO to) (NP (NP (DT the) (NN environ)) (PRN (: -) (NP (NP (JJ mental) (NN peace) (NN dividend)) (, ,) (NP (DT the) (JJ annual) (JJ new) (NNS envi))) (: -))))))) (NP (NP (JJ ronmental) (NN expenditure)) (PP (IN in) (NP (DT the) (NNP United) (NNPS States)))) (VP (MD would) (VP (VB equal) (NP (NP (QP ($ $) (CD 22) (CD billion))) (, ,) (CC or) (NP (NP (CD four)) (PP (CC times) (NP (PRP$ our) (JJ present) (JJ federal) (NN effort))))))) (. .))) + + + ROOT + equal + + + went + If + + + percent + 15 + + + went + percent + + + percent + of + + + saving + this + + + of + saving + + + equal + went + + + went + to + + + environ + the + + + to + environ + + + dividend + mental + + + dividend + peace + + + environ + dividend + + + envi + the + + + envi + annual + + + envi + new + + + dividend + envi + + + expenditure + ronmental + + + equal + expenditure + + + expenditure + in + + + States + the + + + States + United + + + in + States + + + equal + would + + + equal + $ + + + $ + 22 + + + $ + billion + + + $ + or + + + $ + four + + + four + times + + + effort + our + + + effort + present + + + effort + federal + + + times + effort + + + + + ROOT + equal + + + went + If + + + percent + 15 + + + went + percent + + + saving + this + + + percent + saving + + + equal + went + + + environ + the + + + went + environ + + + dividend + mental + + + dividend + peace + + + environ + dividend + + + envi + the + + + envi + annual + + + envi + 
new + + + dividend + envi + + + expenditure + ronmental + + + equal + expenditure + + + States + the + + + States + United + + + expenditure + States + + + equal + would + + + equal + $ + + + $ + 22 + + + $ + billion + + + $ + four + + + four + times + + + effort + our + + + effort + present + + + effort + federal + + + times + effort + + + + + ROOT + equal + + + went + If + + + percent + 15 + + + went + percent + + + saving + this + + + percent + saving + + + equal + went + + + environ + the + + + went + environ + + + dividend + mental + + + dividend + peace + + + environ + dividend + + + envi + the + + + envi + annual + + + envi + new + + + dividend + envi + + + expenditure + ronmental + + + equal + expenditure + + + States + the + + + States + United + + + expenditure + States + + + equal + would + + + equal + $ + + + $ + 22 + + + $ + billion + + + equal + four + + + $ + four + + + four + times + + + effort + our + + + effort + present + + + effort + federal + + + times + effort + + + + + + + Comparable + comparable + 22052 + 22062 + JJ + O + + + sums + sum + 22063 + 22067 + NNS + O + + + should + should + 22069 + 22075 + MD + O + + + be + be + 22076 + 22078 + VB + O + + + generated + generate + 22079 + 22088 + VBN + O + + + by + by + 22089 + 22091 + IN + O + + + reductions + reduction + 22092 + 22102 + NNS + O + + + in + in + 22103 + 22105 + IN + O + + + Euro + Euro + 22106 + 22110 + NNP + MISC + + + - + - + 22110 + 22111 + : + O + + + pean + pean + 22113 + 22117 + NN + O + + + and + and + 22118 + 22121 + CC + O + + + Soviet + soviet + 22122 + 22128 + JJ + MISC + + + defense + defense + 22129 + 22136 + NN + O + + + expenditure + expenditure + 22137 + 22148 + NN + O + + + . + . + 22148 + 22149 + . + O + + + (ROOT (S (NP (JJ Comparable) (NNS sums)) (VP (MD should) (VP (VB be) (VP (VBN generated) (PP (IN by) (NP (NP (NP (NNS reductions)) (PP (IN in) (NP (NNP Euro)))) (: -) (NP (NP (NN pean)) (CC and) (NP (JJ Soviet) (NN defense) (NN expenditure)))))))) (. 
.))) + + + ROOT + generated + + + sums + Comparable + + + generated + sums + + + generated + should + + + generated + be + + + generated + by + + + by + reductions + + + reductions + in + + + in + Euro + + + reductions + pean + + + pean + and + + + expenditure + Soviet + + + expenditure + defense + + + pean + expenditure + + + + + ROOT + generated + + + sums + Comparable + + + generated + sums + + + generated + should + + + generated + be + + + generated + reductions + + + reductions + Euro + + + reductions + pean + + + expenditure + Soviet + + + expenditure + defense + + + pean + expenditure + + + + + ROOT + generated + + + sums + Comparable + + + generated + sums + + + generated + should + + + generated + be + + + generated + reductions + + + reductions + Euro + + + reductions + pean + + + expenditure + Soviet + + + expenditure + defense + + + reductions + expenditure + + + pean + expenditure + + + + + + + With + with + 22150 + 22154 + IN + O + + + this + this + 22156 + 22160 + DT + O + + + level + level + 22161 + 22166 + NN + O + + + of + of + 22167 + 22169 + IN + O + + + resource + resource + 22170 + 22178 + NN + O + + + commitment + commitment + 22179 + 22189 + NN + O + + + we + we + 22190 + 22192 + PRP + O + + + might + might + 22193 + 22198 + MD + O + + + truly + truly + 22200 + 22205 + RB + O + + + begin + begin + 22206 + 22211 + VB + O + + + to + to + 22212 + 22214 + TO + O + + + have + have + 22215 + 22219 + VB + O + + + an + a + 22220 + 22222 + DT + O + + + impact + impact + 22223 + 22229 + NN + O + + + on + on + 22230 + 22232 + IN + O + + + the + the + 22233 + 22236 + DT + O + + + mam + mam + 22237 + 22240 + NN + O + + + - + - + 22240 + 22241 + : + O + + + moth + moth + 22243 + 22247 + NN + O + + + environmental + environmental + 22248 + 22261 + JJ + O + + + problems + problem + 22262 + 22270 + NNS + O + + + facing + face + 22271 + 22277 + VBG + O + + + us + we + 22278 + 22280 + PRP + O + + + . + . + 22280 + 22281 + . 
+ O + + + (ROOT (S (PP (IN With) (NP (NP (DT this) (NN level)) (PP (IN of) (NP (NN resource) (NN commitment))))) (NP (PRP we)) (VP (MD might) (ADVP (RB truly)) (VP (VB begin) (S (VP (TO to) (VP (VB have) (NP (NP (NP (DT an) (NN impact)) (PP (IN on) (NP (DT the) (NN mam)))) (: -) (NP (NP (NN moth) (JJ environmental) (NNS problems)) (VP (VBG facing) (NP (PRP us)))))))))) (. .))) + + + ROOT + begin + + + begin + With + + + level + this + + + With + level + + + level + of + + + commitment + resource + + + of + commitment + + + begin + we + + + begin + might + + + begin + truly + + + have + to + + + begin + have + + + impact + an + + + have + impact + + + impact + on + + + mam + the + + + on + mam + + + problems + moth + + + problems + environmental + + + impact + problems + + + problems + facing + + + facing + us + + + + + ROOT + begin + + + level + this + + + begin + level + + + commitment + resource + + + level + commitment + + + begin + we + + + begin + might + + + begin + truly + + + have + to + + + begin + have + + + impact + an + + + have + impact + + + mam + the + + + impact + mam + + + problems + moth + + + problems + environmental + + + impact + problems + + + problems + facing + + + facing + us + + + + + ROOT + begin + + + level + this + + + begin + level + + + commitment + resource + + + level + commitment + + + begin + we + + + begin + might + + + begin + truly + + + have + to + + + begin + have + + + impact + an + + + have + impact + + + mam + the + + + impact + mam + + + problems + moth + + + problems + environmental + + + impact + problems + + + problems + facing + + + facing + us + + + + + + + So + so + 22285 + 22287 + RB + O + + + far + far + 22288 + 22291 + RB + O + + + I + I + 22292 + 22293 + PRP + O + + + have + have + 22294 + 22298 + VBP + O + + + discussed + discuss + 22299 + 22308 + VBN + O + + + how + how + 22309 + 22312 + WRB + O + + + the + the + 22313 + 22316 + DT + O + + + peace + peace + 22317 + 22322 + NN + O + + + divi + divi + 22323 + 
22327 + SYM + O + + + - + - + 22327 + 22328 + : + O + + + dend + dend + 22332 + 22336 + NN + O + + + generated + generate + 22337 + 22346 + VBN + O + + + by + by + 22347 + 22349 + IN + O + + + the + the + 22350 + 22353 + DT + O + + + end + end + 22354 + 22357 + NN + O + + + of + of + 22358 + 22360 + IN + O + + + the + the + 22361 + 22364 + DT + O + + + Cold + Cold + 22365 + 22369 + NNP + MISC + + + War + War + 22370 + 22373 + NNP + MISC + + + might + might + 22377 + 22382 + MD + O + + + be + be + 22383 + 22385 + VB + O + + + used + use + 22386 + 22390 + VBN + O + + + to + to + 22391 + 22393 + TO + O + + + enhance + enhance + 22394 + 22401 + VB + O + + + the + the + 22402 + 22405 + DT + O + + + environment + environment + 22406 + 22417 + NN + O + + + of + of + 22421 + 22423 + IN + O + + + the + the + 22424 + 22427 + DT + O + + + Cold + Cold + 22428 + 22432 + NNP + O + + + Warring + war + 22433 + 22440 + VBG + O + + + nations + nation + 22441 + 22448 + NNS + O + + + , + , + 22448 + 22449 + , + O + + + that + that + 22450 + 22454 + IN + O + + + Is + be + 22455 + 22457 + VBZ + O + + + , + , + 22457 + 22458 + , + O + + + of + of + 22459 + 22461 + IN + O + + + Europe + Europe + 22465 + 22471 + NNP + LOCATION + + + and + and + 22472 + 22475 + CC + O + + + North + North + 22476 + 22481 + NNP + LOCATION + + + America + America + 22482 + 22489 + NNP + LOCATION + + + . + . + 22489 + 22490 + . 
+ O + + + (ROOT (S (ADVP (RB So) (RB far)) (NP (PRP I)) (VP (VBP have) (VP (VBN discussed) (SBAR (WHADVP (WRB how)) (S (NP (NP (DT the) (NN peace)) (SBAR (S (VP (X (SYM divi)) (: -) (VP (NP (NN dend)) (VBN generated) (PP (IN by) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (NNP Cold) (NNP War)))))))))) (VP (MD might) (VP (VB be) (VP (VBN used) (S (VP (TO to) (VP (VB enhance) (NP (NP (DT the) (NN environment)) (PP (IN of) (NP (DT the) (NNP Cold)))) (S (VP (VBG Warring) (NP (NP (NP (NNS nations)) (, ,) (SBAR (WHNP (IN that)) (S (VP (VBZ Is)))) (, ,)) (PP (IN of) (NP (NP (NNP Europe) (CC and) (NNP North)) (NNP America)))))))))))))))) (. .))) + + + ROOT + discussed + + + far + So + + + discussed + far + + + discussed + I + + + discussed + have + + + used + how + + + peace + the + + + used + peace + + + generated + divi + + + generated + dend + + + peace + generated + + + generated + by + + + end + the + + + by + end + + + end + of + + + War + the + + + War + Cold + + + of + War + + + used + might + + + used + be + + + discussed + used + + + enhance + to + + + used + enhance + + + environment + the + + + enhance + environment + + + environment + of + + + Cold + the + + + of + Cold + + + enhance + Warring + + + Warring + nations + + + Is + that + + + nations + Is + + + nations + of + + + America + Europe + + + Europe + and + + + Europe + North + + + of + America + + + + + ROOT + discussed + + + far + So + + + discussed + far + + + discussed + I + + + discussed + have + + + used + how + + + peace + the + + + used + peace + + + generated + divi + + + generated + dend + + + peace + generated + + + end + the + + + generated + end + + + War + the + + + War + Cold + + + end + War + + + used + might + + + used + be + + + discussed + used + + + enhance + to + + + used + enhance + + + environment + the + + + enhance + environment + + + Cold + the + + + environment + Cold + + + enhance + Warring + + + Warring + nations + + + Is + that + + + nations + Is + + + America + 
Europe + + + Europe + North + + + nations + America + + + + + ROOT + discussed + + + far + So + + + discussed + far + + + discussed + I + + + discussed + have + + + used + how + + + peace + the + + + used + peace + + + generated + divi + + + generated + dend + + + peace + generated + + + end + the + + + generated + end + + + War + the + + + War + Cold + + + end + War + + + used + might + + + used + be + + + discussed + used + + + enhance + to + + + used + enhance + + + environment + the + + + enhance + environment + + + Cold + the + + + environment + Cold + + + enhance + Warring + + + Warring + nations + + + Is + that + + + nations + Is + + + America + Europe + + + Europe + North + + + America + North + + + nations + America + + + + + + + We + we + 22491 + 22493 + PRP + O + + + live + live + 22494 + 22498 + VBP + O + + + in + in + 22499 + 22501 + IN + O + + + a + a + 22502 + 22503 + DT + O + + + single + single + 22507 + 22513 + JJ + O + + + global + global + 22514 + 22520 + JJ + O + + + community + community + 22521 + 22530 + NN + O + + + . + . + 22530 + 22531 + . + O + + + (ROOT (S (NP (PRP We)) (VP (VBP live) (PP (IN in) (NP (DT a) (JJ single) (JJ global) (NN community)))) (. 
.))) + + + ROOT + live + + + live + We + + + live + in + + + community + a + + + community + single + + + community + global + + + in + community + + + + + ROOT + live + + + live + We + + + community + a + + + community + single + + + community + global + + + live + community + + + + + ROOT + live + + + live + We + + + community + a + + + community + single + + + community + global + + + live + community + + + + + + + The + the + 22532 + 22535 + DT + O + + + Spring + spring + 22536 + 22542 + NN + DATE + XXXX-SP + XXXX-SP + + + clean + clean + 22543 + 22548 + JJ + O + + + - + - + 22548 + 22549 + : + O + + + ing + ing + 22553 + 22556 + NN + O + + + made + make + 22557 + 22561 + VBD + O + + + possible + possible + 22562 + 22570 + JJ + O + + + by + by + 22571 + 22573 + IN + O + + + the + the + 22574 + 22577 + DT + O + + + thaw + thaw + 22578 + 22582 + NN + O + + + In + in + 22583 + 22585 + IN + O + + + the + the + 22586 + 22589 + DT + O + + + Cold + Cold + 22590 + 22594 + NNP + MISC + + + War + War + 22598 + 22601 + NNP + MISC + + + will + will + 22602 + 22606 + MD + O + + + benefit + benefit + 22607 + 22614 + VB + O + + + not + not + 22615 + 22618 + RB + O + + + only + only + 22619 + 22623 + RB + O + + + our + we + 22624 + 22627 + PRP$ + O + + + house + house + 22628 + 22633 + NN + O + + + but + but + 22634 + 22637 + CC + O + + + also + also + 22638 + 22642 + RB + O + + + our + we + 22646 + 22649 + PRP$ + O + + + global + global + 22650 + 22656 + JJ + O + + + community + community + 22657 + 22666 + NN + O + + + . + . + 22666 + 22667 + . + O + + + (ROOT (S (NP (NP (DT The) (NN Spring)) (ADJP (ADJP (JJ clean)) (: -) (NP (NP (NN ing)) (SBAR (S (VP (VBD made) (ADJP (JJ possible) (PP (IN by) (NP (DT the) (NN thaw)))) (PP (IN In) (NP (DT the) (NNP Cold) (NNP War))))))))) (VP (MD will) (VP (VB benefit) (NP (CONJP (RB not) (RB only)) (NP (PRP$ our) (NN house)) (CONJP (CC but) (RB also)) (NP (PRP$ our) (JJ global) (NN community))))) (. 
.))) + + + ROOT + benefit + + + Spring + The + + + benefit + Spring + + + Spring + clean + + + clean + ing + + + ing + made + + + made + possible + + + possible + by + + + thaw + the + + + by + thaw + + + made + In + + + War + the + + + War + Cold + + + In + War + + + benefit + will + + + only + not + + + house + only + + + house + our + + + benefit + house + + + also + but + + + house + also + + + community + our + + + community + global + + + house + community + + + + + ROOT + benefit + + + Spring + The + + + benefit + Spring + + + Spring + clean + + + clean + ing + + + ing + made + + + made + possible + + + thaw + the + + + possible + thaw + + + War + the + + + War + Cold + + + made + War + + + benefit + will + + + only + not + + + house + only + + + house + our + + + benefit + house + + + community + our + + + community + global + + + house + community + + + + + ROOT + benefit + + + Spring + The + + + benefit + Spring + + + Spring + clean + + + clean + ing + + + ing + made + + + made + possible + + + thaw + the + + + possible + thaw + + + War + the + + + War + Cold + + + made + War + + + benefit + will + + + only + not + + + house + only + + + house + our + + + benefit + house + + + community + our + + + community + global + + + benefit + community + + + house + community + + + + + + + However + however + 22668 + 22675 + RB + O + + + , + , + 22675 + 22676 + , + O + + + we + we + 22677 + 22679 + PRP + O + + + can + can + 22680 + 22683 + MD + O + + + not + not + 22683 + 22686 + RB + O + + + be + be + 22690 + 22692 + VB + O + + + indifferent + indifferent + 22693 + 22704 + JJ + O + + + to + to + 22705 + 22707 + TO + O + + + an + a + 22708 + 22710 + DT + O + + + environmental + environmental + 22711 + 22724 + JJ + O + + + deterio + deterio + 22725 + 22732 + NN + O + + + - + - + 22732 + 22733 + : + O + + + ration + ration + 22737 + 22743 + NN + O + + + in + in + 22744 + 22746 + IN + O + + + that + that + 22747 + 22751 + DT + O + + + part + part + 22752 + 22756 + NN 
+ O + + + of + of + 22757 + 22759 + IN + O + + + the + the + 22760 + 22763 + DT + O + + + world + world + 22764 + 22769 + NN + O + + + which + which + 22770 + 22775 + WDT + O + + + is + be + 22776 + 22778 + VBZ + O + + + nei + nei + 22779 + 22782 + SYM + O + + + - + - + 22782 + 22783 + : + O + + + ther + ther + 22787 + 22791 + IN + O + + + East + East + 22792 + 22796 + NNP + O + + + nor + nor + 22797 + 22800 + CC + O + + + West + West + 22801 + 22805 + NNP + O + + + , + , + 22805 + 22806 + , + O + + + that + that + 22807 + 22811 + DT + O + + + is + be + 22812 + 22814 + VBZ + O + + + , + , + 22814 + 22815 + , + O + + + the + the + 22816 + 22819 + DT + O + + + Third + Third + 22820 + 22825 + NNP + ORDINAL + 3.0 + + + World + World + 22829 + 22834 + NNP + MISC + + + , + , + 22834 + 22835 + , + O + + + the + the + 22836 + 22839 + DT + O + + + developing + develop + 22840 + 22850 + VBG + O + + + world + world + 22851 + 22856 + NN + O + + + which + which + 22857 + 22862 + WDT + O + + + is + be + 22863 + 22865 + VBZ + O + + + home + home + 22866 + 22870 + NN + O + + + to + to + 22874 + 22876 + TO + O + + + 70 + 70 + 22877 + 22879 + CD + PERCENT + %70.0 + + + percent + percent + 22880 + 22887 + NN + PERCENT + %70.0 + + + of + of + 22888 + 22890 + IN + O + + + the + the + 22891 + 22894 + DT + O + + + world + world + 22895 + 22900 + NN + O + + + 's + 's + 22900 + 22902 + POS + O + + + population + population + 22903 + 22913 + NN + O + + + . + . + 22913 + 22914 + . 
+ O + + + (ROOT (S (ADVP (RB However)) (, ,) (NP (PRP we)) (VP (MD can) (RB not) (VP (VB be) (ADJP (ADJP (JJ indifferent) (PP (TO to) (NP (NP (DT an) (JJ environmental) (NN deterio)) (: -) (NP (NP (NN ration)) (PP (IN in) (NP (NP (DT that) (NN part)) (PP (IN of) (NP (NP (DT the) (NN world)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is)))))))))))) (SBAR (X (SYM nei)) (S (: -) (PP (IN ther) (NP (NNP East) (CC nor) (NNP West)) (, ,)) (NP (DT that)) (VP (VBZ is) (, ,) (FRAG (NP (NP (DT the) (NNP Third) (NNP World)) (, ,) (NP (NP (DT the) (VBG developing) (NN world)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (ADVP (NN home)) (PP (TO to) (NP (NP (CD 70) (NN percent)) (PP (IN of) (NP (NP (DT the) (NN world) (POS 's)) (NN population))))))))))))))))) (. .))) + + + ROOT + indifferent + + + indifferent + However + + + indifferent + we + + + indifferent + can + + + indifferent + not + + + indifferent + be + + + indifferent + to + + + deterio + an + + + deterio + environmental + + + to + deterio + + + deterio + ration + + + ration + in + + + part + that + + + in + part + + + part + of + + + world + the + + + of + world + + + is + which + + + world + is + + + is + nei + + + is + ther + + + ther + East + + + East + nor + + + East + West + + + is + that + + + indifferent + is + + + World + the + + + World + Third + + + is + World + + + world + the + + + world + developing + + + World + world + + + is + which + + + world + is + + + is + home + + + is + to + + + percent + 70 + + + to + percent + + + percent + of + + + world + the + + + population + world + + + world + 's + + + of + population + + + + + ROOT + indifferent + + + indifferent + However + + + indifferent + we + + + indifferent + can + + + indifferent + not + + + indifferent + be + + + deterio + an + + + deterio + environmental + + + indifferent + deterio + + + deterio + ration + + + part + that + + + ration + part + + + world + the + + + part + world + + + is + which + + + world + is + + + is + nei + + + is + East + + + East + 
West + + + is + that + + + indifferent + is + + + World + the + + + World + Third + + + is + World + + + world + the + + + world + developing + + + World + world + + + is + which + + + world + is + + + is + home + + + percent + 70 + + + is + percent + + + world + the + + + population + world + + + percent + population + + + + + ROOT + indifferent + + + indifferent + However + + + indifferent + we + + + indifferent + can + + + indifferent + not + + + indifferent + be + + + deterio + an + + + deterio + environmental + + + indifferent + deterio + + + deterio + ration + + + part + that + + + ration + part + + + world + the + + + part + world + + + is + which + + + world + is + + + is + nei + + + is + East + + + East + West + + + is + West + + + is + that + + + indifferent + is + + + World + the + + + World + Third + + + is + World + + + world + the + + + world + developing + + + World + world + + + is + which + + + world + is + + + is + home + + + percent + 70 + + + is + percent + + + world + the + + + population + world + + + percent + population + + + + + + + On + on + 22920 + 22922 + IN + O + + + an + a + 22923 + 22925 + DT + O + + + environmental + environmental + 22926 + 22939 + JJ + O + + + level + level + 22940 + 22945 + NN + O + + + , + , + 22945 + 22946 + , + O + + + we + we + 22947 + 22949 + PRP + O + + + will + will + 22950 + 22954 + MD + O + + + accom + accom + 22955 + 22960 + VB + O + + + - + - + 22960 + 22961 + : + O + + + plish + plish + 22963 + 22968 + NN + O + + + little + little + 22969 + 22975 + RB + O + + + if + if + 22976 + 22978 + IN + O + + + the + the + 22979 + 22982 + DT + O + + + savings + savings + 22983 + 22990 + NNS + O + + + in + in + 22991 + 22993 + IN + O + + + greenhouse + greenhouse + 22994 + 23004 + NN + O + + + gases + gas + 23006 + 23011 + NNS + O + + + made + make + 23012 + 23016 + VBN + O + + + by + by + 23017 + 23019 + IN + O + + + conservation + conservation + 23020 + 23032 + NN + O + + + and + and + 23033 + 23036 + CC + O + + + 
new + new + 23037 + 23040 + JJ + O + + + tech + tech + 23041 + 23045 + NN + O + + + - + - + 23045 + 23046 + : + O + + + nology + nology + 23048 + 23054 + NN + O + + + use + use + 23055 + 23058 + NN + O + + + In + in + 23059 + 23061 + IN + O + + + the + the + 23062 + 23065 + DT + O + + + developed + developed + 23066 + 23075 + JJ + O + + + countries + country + 23076 + 23085 + NNS + O + + + are + be + 23086 + 23089 + VBP + O + + + offset + offset + 23091 + 23097 + VBN + O + + + by + by + 23098 + 23100 + IN + O + + + the + the + 23101 + 23104 + DT + O + + + ecologically + ecologically + 23105 + 23117 + RB + O + + + unsound + unsound + 23118 + 23125 + JJ + O + + + Industri + Industri + 23126 + 23134 + NNP + ORGANIZATION + + + - + - + 23134 + 23135 + : + O + + + alization + alization + 23137 + 23146 + NN + O + + + of + of + 23147 + 23149 + IN + O + + + the + the + 23150 + 23153 + DT + O + + + developing + develop + 23154 + 23164 + VBG + O + + + world + world + 23165 + 23170 + NN + O + + + and + and + 23171 + 23174 + CC + O + + + by + by + 23175 + 23177 + IN + O + + + the + the + 23178 + 23181 + DT + O + + + destruction + destruction + 23183 + 23194 + NN + O + + + of + of + 23195 + 23197 + IN + O + + + the + the + 23198 + 23201 + DT + O + + + tropical + tropical + 23202 + 23210 + JJ + O + + + forests + forest + 23211 + 23218 + NNS + O + + + which + which + 23219 + 23224 + WDT + O + + + are + be + 23225 + 23228 + VBP + O + + + quite + quite + 23230 + 23235 + RB + O + + + literally + literally + 23236 + 23245 + RB + O + + + the + the + 23246 + 23249 + DT + O + + + lungs + lung + 23250 + 23255 + NNS + O + + + of + of + 23256 + 23258 + IN + O + + + our + we + 23259 + 23262 + PRP$ + O + + + planet + planet + 23263 + 23269 + NN + O + + + . + . + 23269 + 23270 + . 
+ O + + + (ROOT (S (PP (IN On) (NP (DT an) (JJ environmental) (NN level))) (, ,) (NP (PRP we)) (VP (MD will) (VP (VB accom) (: -) (S (NP (NP (NN plish) (RB little)) (SBAR (IN if) (S (NP (NP (DT the) (NNS savings)) (PP (IN in) (NP (NN greenhouse) (NNS gases)))) (VP (VBN made) (PP (IN by) (NP (NP (NN conservation)) (CC and) (NP (JJ new) (NN tech)))))))) (: -) (NP (NN nology) (NN use)) (PP (IN In) (NP (DT the) (JJ developed) (NNS countries))) (VP (VBP are) (VP (VBN offset) (PP (PP (IN by) (NP (NP (DT the) (ADJP (RB ecologically) (JJ unsound)) (NNP Industri)) (: -) (NP (NP (NN alization)) (PP (IN of) (NP (DT the) (VBG developing) (NN world)))))) (CC and) (PP (IN by) (NP (NP (DT the) (NN destruction)) (PP (IN of) (NP (NP (DT the) (JJ tropical) (NNS forests)) (SBAR (WHNP (WDT which)) (S (VP (VBP are) (ADVP (RB quite) (RB literally)) (NP (NP (DT the) (NNS lungs)) (PP (IN of) (NP (PRP$ our) (NN planet))))))))))))))))) (. .))) + + + ROOT + accom + + + accom + On + + + level + an + + + level + environmental + + + On + level + + + accom + we + + + accom + will + + + offset + plish + + + plish + little + + + made + if + + + savings + the + + + made + savings + + + savings + in + + + gases + greenhouse + + + in + gases + + + plish + made + + + made + by + + + by + conservation + + + conservation + and + + + tech + new + + + conservation + tech + + + use + nology + + + offset + use + + + offset + In + + + countries + the + + + countries + developed + + + In + countries + + + offset + are + + + accom + offset + + + offset + by + + + Industri + the + + + unsound + ecologically + + + Industri + unsound + + + by + Industri + + + Industri + alization + + + alization + of + + + world + the + + + world + developing + + + of + world + + + by + and + + + by + by + + + destruction + the + + + by + destruction + + + destruction + of + + + forests + the + + + forests + tropical + + + of + forests + + + lungs + which + + + lungs + are + + + literally + quite + + + lungs + literally + + + 
lungs + the + + + forests + lungs + + + lungs + of + + + planet + our + + + of + planet + + + + + ROOT + accom + + + level + an + + + level + environmental + + + accom + level + + + accom + we + + + accom + will + + + offset + plish + + + plish + little + + + made + if + + + savings + the + + + made + savings + + + gases + greenhouse + + + savings + gases + + + plish + made + + + made + conservation + + + tech + new + + + conservation + tech + + + use + nology + + + offset + use + + + countries + the + + + countries + developed + + + offset + countries + + + offset + are + + + accom + offset + + + Industri + the + + + unsound + ecologically + + + Industri + unsound + + + offset + Industri + + + Industri + alization + + + world + the + + + world + developing + + + alization + world + + + destruction + the + + + Industri + destruction + + + forests + the + + + forests + tropical + + + destruction + forests + + + lungs + which + + + lungs + are + + + literally + quite + + + lungs + literally + + + lungs + the + + + forests + lungs + + + planet + our + + + lungs + planet + + + + + ROOT + accom + + + level + an + + + level + environmental + + + accom + level + + + accom + we + + + accom + will + + + offset + plish + + + plish + little + + + made + if + + + savings + the + + + made + savings + + + gases + greenhouse + + + savings + gases + + + plish + made + + + made + conservation + + + tech + new + + + made + tech + + + conservation + tech + + + use + nology + + + offset + use + + + countries + the + + + countries + developed + + + offset + countries + + + offset + are + + + accom + offset + + + Industri + the + + + unsound + ecologically + + + Industri + unsound + + + offset + Industri + + + Industri + alization + + + world + the + + + world + developing + + + alization + world + + + destruction + the + + + offset + destruction + + + Industri + destruction + + + forests + the + + + forests + tropical + + + destruction + forests + + + lungs + which + + + lungs + are + 
+ + literally + quite + + + lungs + literally + + + lungs + the + + + forests + lungs + + + planet + our + + + lungs + planet + + + + + + + Worse + worse + 23272 + 23277 + RBR + O + + + , + , + 23277 + 23278 + , + O + + + environmental + environmental + 23279 + 23292 + JJ + O + + + degradation + degradation + 23293 + 23304 + NN + O + + + in + in + 23305 + 23307 + IN + O + + + the + the + 23308 + 23311 + DT + O + + + third + third + 23313 + 23318 + JJ + ORDINAL + 3.0 + + + world + world + 23319 + 23324 + NN + O + + + is + be + 23325 + 23327 + VBZ + O + + + the + the + 23328 + 23331 + DT + O + + + product + product + 23332 + 23339 + NN + O + + + of + of + 23340 + 23342 + IN + O + + + , + , + 23342 + 23343 + , + O + + + the + the + 23344 + 23347 + DT + O + + + compan + compan + 23348 + 23354 + NN + O + + + - + - + 23354 + 23355 + : + O + + + ion + ion + 23357 + 23360 + NN + O + + + of + of + 23361 + 23363 + IN + O + + + , + , + 23363 + 23364 + , + O + + + and + and + 23365 + 23368 + CC + O + + + the + the + 23369 + 23372 + DT + O + + + cause + cause + 23373 + 23378 + NN + O + + + of + of + 23379 + 23381 + IN + O + + + increased + increase + 23382 + 23391 + VBN + O + + + poverty + poverty + 23392 + 23399 + NN + O + + + and + and + 23401 + 23404 + CC + O + + + human + human + 23405 + 23410 + JJ + O + + + misery + misery + 23411 + 23417 + NN + O + + + . + . + 23417 + 23418 + . + O + + + (ROOT (S (ADVP (RBR Worse)) (, ,) (NP (NP (JJ environmental) (NN degradation)) (PP (IN in) (NP (DT the) (JJ third) (NN world)))) (VP (VBZ is) (NP (NP (NP (DT the) (NN product)) (PP (IN of))) (, ,) (NP (NP (DT the) (NN compan)) (: -) (NP (NP (NN ion)) (PP (IN of)))) (, ,) (CC and) (NP (NP (DT the) (NN cause)) (PP (IN of) (NP (NP (VBN increased) (NN poverty)) (CC and) (NP (JJ human) (NN misery))))))) (. 
.))) + + + ROOT + product + + + product + Worse + + + degradation + environmental + + + product + degradation + + + degradation + in + + + world + the + + + world + third + + + in + world + + + product + is + + + product + the + + + product + of + + + compan + the + + + product + compan + + + compan + ion + + + ion + of + + + product + and + + + cause + the + + + product + cause + + + cause + of + + + poverty + increased + + + of + poverty + + + poverty + and + + + misery + human + + + poverty + misery + + + + + ROOT + product + + + product + Worse + + + degradation + environmental + + + product + degradation + + + world + the + + + world + third + + + degradation + world + + + product + is + + + product + the + + + product + of + + + compan + the + + + product + compan + + + compan + ion + + + ion + of + + + cause + the + + + product + cause + + + poverty + increased + + + cause + poverty + + + misery + human + + + poverty + misery + + + + + ROOT + product + + + product + Worse + + + degradation + environmental + + + product + degradation + + + world + the + + + world + third + + + degradation + world + + + product + is + + + product + the + + + product + of + + + compan + the + + + product + compan + + + compan + ion + + + ion + of + + + cause + the + + + product + cause + + + poverty + increased + + + cause + poverty + + + misery + human + + + cause + misery + + + poverty + misery + + + + + + + This + this + 23419 + 23423 + DT + O + + + misery + misery + 23424 + 23430 + NN + O + + + can + can + 23431 + 23434 + MD + O + + + only + only + 23435 + 23439 + RB + O + + + breed + breed + 23441 + 23446 + VB + O + + + popular + popular + 23447 + 23454 + JJ + O + + + anger + anger + 23455 + 23460 + NN + O + + + and + and + 23461 + 23464 + CC + O + + + governmental + governmental + 23465 + 23477 + JJ + O + + + in + in + 23478 + 23480 + IN + O + + + - + - + 23480 + 23481 + : + O + + + stability + stability + 23483 + 23492 + NN + O + + + . + . + 23492 + 23493 + . 
+ O + + + (ROOT (S (NP (DT This) (NN misery)) (VP (MD can) (ADVP (RB only)) (VP (VB breed) (NP (ADJP (ADJP (JJ popular) (NN anger)) (CC and) (ADJP (JJ governmental) (PP (IN in))) (: -)) (NN stability)))) (. .))) + + + ROOT + breed + + + misery + This + + + breed + misery + + + breed + can + + + breed + only + + + anger + popular + + + stability + anger + + + anger + and + + + anger + governmental + + + governmental + in + + + breed + stability + + + + + ROOT + breed + + + misery + This + + + breed + misery + + + breed + can + + + breed + only + + + anger + popular + + + stability + anger + + + anger + governmental + + + governmental + in + + + breed + stability + + + + + ROOT + breed + + + misery + This + + + breed + misery + + + breed + can + + + breed + only + + + anger + popular + + + stability + anger + + + anger + governmental + + + stability + governmental + + + governmental + in + + + breed + stability + + + + + + + It + it + 23494 + 23496 + PRP + O + + + could + could + 23497 + 23502 + MD + O + + + harm + harm + 23503 + 23507 + VB + O + + + the + the + 23508 + 23511 + DT + O + + + process + process + 23512 + 23519 + NN + O + + + of + of + 23520 + 23522 + IN + O + + + de + de + 23523 + 23525 + FW + O + + + - + - + 23525 + 23526 + : + O + + + mocratization + mocratization + 23528 + 23541 + NN + O + + + in + in + 23542 + 23544 + IN + O + + + the + the + 23545 + 23548 + DT + O + + + third + third + 23549 + 23554 + JJ + ORDINAL + 3.0 + + + world + world + 23555 + 23560 + NN + O + + + and + and + 23561 + 23564 + CC + O + + + lead + lead + 23565 + 23569 + VB + O + + + to + to + 23570 + 23572 + TO + O + + + the + the + 23574 + 23577 + DT + O + + + emergence + emergence + 23578 + 23587 + NN + O + + + of + of + 23588 + 23590 + IN + O + + + aggressive + aggressive + 23591 + 23601 + JJ + O + + + regimes + regime + 23602 + 23609 + NNS + O + + + . + . + 23609 + 23610 + . 
+ O + + + (ROOT (S (NP (PRP It)) (VP (MD could) (VP (VP (VB harm) (NP (NP (NP (DT the) (NN process)) (PP (IN of) (NP (FW de)))) (: -) (NP (NP (NN mocratization)) (PP (IN in) (NP (DT the) (JJ third) (NN world)))))) (CC and) (VP (VB lead) (PP (TO to) (NP (NP (DT the) (NN emergence)) (PP (IN of) (NP (JJ aggressive) (NNS regimes)))))))) (. .))) + + + ROOT + harm + + + harm + It + + + harm + could + + + process + the + + + harm + process + + + process + of + + + of + de + + + process + mocratization + + + mocratization + in + + + world + the + + + world + third + + + in + world + + + harm + and + + + harm + lead + + + lead + to + + + emergence + the + + + to + emergence + + + emergence + of + + + regimes + aggressive + + + of + regimes + + + + + ROOT + harm + + + harm + It + + + harm + could + + + process + the + + + harm + process + + + process + de + + + process + mocratization + + + world + the + + + world + third + + + mocratization + world + + + harm + lead + + + emergence + the + + + lead + emergence + + + regimes + aggressive + + + emergence + regimes + + + + + ROOT + harm + + + harm + It + + + lead + It + + + harm + could + + + process + the + + + harm + process + + + process + de + + + process + mocratization + + + world + the + + + world + third + + + mocratization + world + + + harm + lead + + + emergence + the + + + lead + emergence + + + regimes + aggressive + + + emergence + regimes + + + + + + + It + it + 23611 + 23613 + PRP + O + + + would + would + 23615 + 23620 + MD + O + + + be + be + 23621 + 23623 + VB + O + + + truly + truly + 23624 + 23629 + RB + O + + + tragic + tragic + 23630 + 23636 + JJ + O + + + if + if + 23637 + 23639 + IN + O + + + the + the + 23640 + 23643 + DT + O + + + end + end + 23644 + 23647 + NN + O + + + of + of + 23648 + 23650 + IN + O + + + the + the + 23651 + 23654 + DT + O + + + Cold + Cold + 23655 + 23659 + NNP + MISC + + + War + War + 23661 + 23664 + NNP + MISC + + + were + be + 23665 + 23669 + VBD + O + + + followed + follow + 
23670 + 23678 + VBN + O + + + by + by + 23679 + 23681 + IN + O + + + new + new + 23682 + 23685 + JJ + O + + + wars + war + 23686 + 23690 + NNS + O + + + in + in + 23691 + 23693 + IN + O + + + the + the + 23694 + 23697 + DT + O + + + devel + devel + 23698 + 23703 + NN + O + + + - + - + 23703 + 23704 + : + O + + + oping + oping + 23706 + 23711 + NN + O + + + world + world + 23712 + 23717 + NN + O + + + or + or + 23718 + 23720 + CC + O + + + growing + grow + 23721 + 23728 + VBG + O + + + conflict + conflict + 23729 + 23737 + NN + O + + + along + along + 23738 + 23743 + IN + O + + + north + north + 23744 + 23749 + NN + O + + + - + - + 23749 + 23750 + : + O + + + south + south + 23752 + 23757 + RB + O + + + , + , + 23757 + 23758 + , + O + + + rich-poor + rich-poor + 23759 + 23768 + JJ + O + + + lines + line + 23769 + 23774 + NNS + O + + + . + . + 23774 + 23775 + . + O + + + (ROOT (S (NP (PRP It)) (VP (MD would) (VP (VB be) (ADJP (RB truly) (JJ tragic)) (SBAR (IN if) (S (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (NNP Cold) (NNP War)))) (VP (VBD were) (VP (VBN followed) (PP (IN by) (NP (JJ new) (NNS wars))) (PP (IN in) (NP (NP (DT the) (NN devel)) (: -) (NP (NP (NN oping) (NN world)) (CC or) (NP (NP (VBG growing) (NN conflict)) (PP (IN along) (NP (NP (NN north)) (: -) (ADVP (RB south)) (, ,) (NP (JJ rich-poor) (NNS lines)))))))))))))) (. 
.))) + + + ROOT + tragic + + + tragic + It + + + tragic + would + + + tragic + be + + + tragic + truly + + + followed + if + + + end + the + + + followed + end + + + end + of + + + War + the + + + War + Cold + + + of + War + + + followed + were + + + tragic + followed + + + followed + by + + + wars + new + + + by + wars + + + followed + in + + + devel + the + + + in + devel + + + world + oping + + + devel + world + + + world + or + + + conflict + growing + + + world + conflict + + + conflict + along + + + along + north + + + north + south + + + lines + rich-poor + + + north + lines + + + + + ROOT + tragic + + + tragic + It + + + tragic + would + + + tragic + be + + + tragic + truly + + + followed + if + + + end + the + + + followed + end + + + War + the + + + War + Cold + + + end + War + + + followed + were + + + tragic + followed + + + wars + new + + + followed + wars + + + devel + the + + + followed + devel + + + world + oping + + + devel + world + + + conflict + growing + + + world + conflict + + + conflict + north + + + north + south + + + lines + rich-poor + + + north + lines + + + + + ROOT + tragic + + + tragic + It + + + tragic + would + + + tragic + be + + + tragic + truly + + + followed + if + + + end + the + + + followed + end + + + War + the + + + War + Cold + + + end + War + + + followed + were + + + tragic + followed + + + wars + new + + + followed + wars + + + devel + the + + + followed + devel + + + world + oping + + + devel + world + + + conflict + growing + + + devel + conflict + + + world + conflict + + + conflict + north + + + north + south + + + lines + rich-poor + + + north + lines + + + + + + + Given + give + 23779 + 23784 + VBN + O + + + the + the + 23785 + 23788 + DT + O + + + consequences + consequence + 23789 + 23801 + NNS + O + + + , + , + 23801 + 23802 + , + O + + + our + we + 23803 + 23806 + PRP$ + O + + + response + response + 23807 + 23815 + NN + O + + + to + to + 23816 + 23818 + TO + O + + + environmental + environmental + 23820 + 
23833 + JJ + O + + + deterioration + deterioration + 23834 + 23847 + NN + O + + + in + in + 23848 + 23850 + IN + O + + + the + the + 23851 + 23854 + DT + O + + + third + third + 23855 + 23860 + JJ + ORDINAL + 3.0 + + + world + world + 23862 + 23867 + NN + O + + + is + be + 23868 + 23870 + VBZ + O + + + woefully + woefully + 23871 + 23879 + RB + O + + + inadequate + inadequate + 23880 + 23890 + JJ + O + + + . + . + 23890 + 23891 + . + O + + + (ROOT (S (PP (VBN Given) (NP (DT the) (NNS consequences))) (, ,) (NP (NP (PRP$ our) (NN response)) (PP (TO to) (NP (NP (JJ environmental) (NN deterioration)) (PP (IN in) (NP (DT the) (JJ third) (NN world)))))) (VP (VBZ is) (ADJP (RB woefully) (JJ inadequate))) (. .))) + + + ROOT + inadequate + + + inadequate + Given + + + consequences + the + + + Given + consequences + + + response + our + + + inadequate + response + + + response + to + + + deterioration + environmental + + + to + deterioration + + + deterioration + in + + + world + the + + + world + third + + + in + world + + + inadequate + is + + + inadequate + woefully + + + + + ROOT + inadequate + + + inadequate + Given + + + consequences + the + + + Given + consequences + + + response + our + + + inadequate + response + + + deterioration + environmental + + + response + deterioration + + + world + the + + + world + third + + + deterioration + world + + + inadequate + is + + + inadequate + woefully + + + + + ROOT + inadequate + + + inadequate + Given + + + consequences + the + + + Given + consequences + + + response + our + + + inadequate + response + + + deterioration + environmental + + + response + deterioration + + + world + the + + + world + third + + + deterioration + world + + + inadequate + is + + + inadequate + woefully + + + + + + + Until + until + 23892 + 23897 + IN + O + + + recently + recently + 23898 + 23906 + RB + DATE + PAST_REF + PAST_REF + + + , + , + 23906 + 23907 + , + O + + + the + the + 23909 + 23912 + DT + O + + + principal + principal + 23913 + 23922 
+ JJ + O + + + development + development + 23923 + 23934 + NN + O + + + banks + bank + 23935 + 23940 + NNS + O + + + and + and + 23941 + 23944 + CC + O + + + major + major + 23945 + 23950 + JJ + O + + + donors + donor + 23952 + 23958 + NNS + O + + + did + do + 23959 + 23962 + VBD + O + + + not + not + 23963 + 23966 + RB + O + + + include + include + 23967 + 23974 + VB + O + + + the + the + 23975 + 23978 + DT + O + + + environment + environment + 23979 + 23990 + NN + O + + + as + as + 23991 + 23993 + IN + O + + + priority + priority + 23995 + 24003 + NN + O + + + in + in + 24004 + 24006 + IN + O + + + the + the + 24007 + 24010 + DT + O + + + development + development + 24011 + 24022 + NN + O + + + process + process + 24023 + 24030 + NN + O + + + . + . + 24030 + 24031 + . + O + + + (ROOT (S (PP (IN Until) (ADVP (RB recently))) (, ,) (NP (NP (DT the) (JJ principal) (NN development) (NNS banks)) (CC and) (NP (JJ major) (NNS donors))) (VP (VBD did) (RB not) (VP (VB include) (NP (DT the) (NN environment)) (PP (IN as) (NP (NP (NN priority)) (PP (IN in) (NP (DT the) (NN development) (NN process))))))) (. 
.))) + + + ROOT + include + + + include + Until + + + Until + recently + + + banks + the + + + banks + principal + + + banks + development + + + include + banks + + + banks + and + + + donors + major + + + banks + donors + + + include + did + + + include + not + + + environment + the + + + include + environment + + + include + as + + + as + priority + + + priority + in + + + process + the + + + process + development + + + in + process + + + + + ROOT + include + + + include + Until + + + Until + recently + + + banks + the + + + banks + principal + + + banks + development + + + include + banks + + + donors + major + + + banks + donors + + + include + did + + + include + not + + + environment + the + + + include + environment + + + include + priority + + + process + the + + + process + development + + + priority + process + + + + + ROOT + include + + + include + Until + + + Until + recently + + + banks + the + + + banks + principal + + + banks + development + + + include + banks + + + donors + major + + + banks + donors + + + include + donors + + + include + did + + + include + not + + + environment + the + + + include + environment + + + include + priority + + + process + the + + + process + development + + + priority + process + + + + + + + Indeed + indeed + 24032 + 24038 + RB + O + + + , + , + 24038 + 24039 + , + O + + + many + many + 24041 + 24045 + JJ + O + + + donor-financed + donor-financed + 24046 + 24060 + JJ + O + + + projects + project + 24061 + 24069 + NNS + O + + + went + go + 24070 + 24074 + VBD + O + + + forward + forward + 24075 + 24082 + RB + O + + + without + without + 24084 + 24091 + IN + O + + + regard + regard + 24092 + 24098 + NN + O + + + to + to + 24099 + 24101 + TO + O + + + the + the + 24102 + 24105 + DT + O + + + environmental + environmental + 24106 + 24119 + JJ + O + + + conse + conse + 24120 + 24125 + NN + O + + + - + - + 24125 + 24126 + : + O + + + quences + quence + 24128 + 24135 + NNS + O + + + with + with + 24136 + 24140 + IN + O + + 
+ sometimes + sometimes + 24141 + 24150 + RB + O + + + disastrous + disastrous + 24151 + 24161 + JJ + O + + + conse + conse + 24162 + 24167 + NN + O + + + - + - + 24167 + 24168 + : + O + + + quences + quence + 24170 + 24177 + NNS + O + + + . + . + 24177 + 24178 + . + O + + + (ROOT (S (ADVP (RB Indeed)) (, ,) (NP (JJ many) (JJ donor-financed) (NNS projects)) (VP (VBD went) (ADVP (RB forward)) (PP (IN without) (NP (NN regard))) (PP (TO to) (NP (NP (DT the) (JJ environmental) (NN conse)) (: -) (NP (NP (NNS quences)) (PP (IN with) (NP (ADJP (RB sometimes) (JJ disastrous)) (NN conse)))) (: -) (NP (NNS quences))))) (. .))) + + + ROOT + went + + + went + Indeed + + + projects + many + + + projects + donor-financed + + + went + projects + + + went + forward + + + went + without + + + without + regard + + + went + to + + + conse + the + + + conse + environmental + + + to + conse + + + conse + quences + + + quences + with + + + disastrous + sometimes + + + conse + disastrous + + + with + conse + + + conse + quences + + + + + ROOT + went + + + went + Indeed + + + projects + many + + + projects + donor-financed + + + went + projects + + + went + forward + + + went + regard + + + conse + the + + + conse + environmental + + + went + conse + + + conse + quences + + + disastrous + sometimes + + + conse + disastrous + + + quences + conse + + + conse + quences + + + + + ROOT + went + + + went + Indeed + + + projects + many + + + projects + donor-financed + + + went + projects + + + went + forward + + + went + regard + + + conse + the + + + conse + environmental + + + went + conse + + + conse + quences + + + disastrous + sometimes + + + conse + disastrous + + + quences + conse + + + conse + quences + + + + + + + It + it + 24182 + 24184 + PRP + O + + + was + be + 24185 + 24188 + VBD + O + + + only + only + 24189 + 24193 + RB + O + + + in + in + 24194 + 24196 + IN + O + + + 1972 + 1972 + 24197 + 24201 + CD + DATE + 1972 + 1972 + + + that + that + 24202 + 24206 + IN + O + + + the + the 
+ 24207 + 24210 + DT + O + + + international + international + 24211 + 24224 + JJ + O + + + community + community + 24226 + 24235 + NN + O + + + established + establish + 24236 + 24247 + VBD + O + + + an + a + 24248 + 24250 + DT + O + + + organization + organization + 24251 + 24263 + NN + O + + + spe + spe + 24264 + 24267 + NN + O + + + - + - + 24267 + 24268 + : + O + + + cifically + cifically + 24270 + 24279 + RB + O + + + concerned + concern + 24280 + 24289 + VBN + O + + + with + with + 24290 + 24294 + IN + O + + + the + the + 24295 + 24298 + DT + O + + + global + global + 24299 + 24305 + JJ + O + + + environ + environ + 24306 + 24313 + NN + O + + + - + - + 24313 + 24314 + : + O + + + ment + ment + 24316 + 24320 + NN + O + + + . + . + 24320 + 24321 + . + O + + + (ROOT (S (NP (PRP It)) (VP (VBD was) (ADVP (RB only)) (PP (IN in) (NP (CD 1972))) (SBAR (IN that) (S (NP (DT the) (JJ international) (NN community)) (VP (VBD established) (NP (DT an) (NN organization)) (NP (NP (NN spe)) (: -) (ADJP (RB cifically) (VBN concerned) (PP (IN with) (NP (DT the) (JJ global) (NN environ) (: -) (NN ment))))))))) (. 
.))) + + + ROOT + was + + + was + It + + + was + only + + + was + in + + + in + 1972 + + + established + that + + + community + the + + + community + international + + + established + community + + + was + established + + + organization + an + + + established + organization + + + established + spe + + + concerned + cifically + + + spe + concerned + + + concerned + with + + + ment + the + + + ment + global + + + ment + environ + + + with + ment + + + + + ROOT + was + + + was + It + + + was + only + + + was + 1972 + + + established + that + + + community + the + + + community + international + + + established + community + + + was + established + + + organization + an + + + established + organization + + + established + spe + + + concerned + cifically + + + spe + concerned + + + ment + the + + + ment + global + + + ment + environ + + + concerned + ment + + + + + ROOT + was + + + was + It + + + was + only + + + was + 1972 + + + established + that + + + community + the + + + community + international + + + established + community + + + was + established + + + organization + an + + + established + organization + + + established + spe + + + concerned + cifically + + + spe + concerned + + + ment + the + + + ment + global + + + ment + environ + + + concerned + ment + + + + + + + That + that + 24322 + 24326 + DT + O + + + organization + organization + 24327 + 24339 + NN + O + + + , + , + 24339 + 24340 + , + O + + + the + the + 24341 + 24344 + DT + O + + + United + United + 24345 + 24351 + NNP + ORGANIZATION + + + Na + Na + 24352 + 24354 + NNP + ORGANIZATION + + + - + - + 24354 + 24355 + : + O + + + tions + tion + 24357 + 24362 + NNS + O + + + Environment + Environment + 24363 + 24374 + NNP + O + + + Program + Program + 24375 + 24382 + NNP + O + + + , + , + 24382 + 24383 + , + O + + + remains + remain + 24384 + 24391 + VBZ + O + + + stun + stun + 24392 + 24396 + SYM + O + + + - + - + 24396 + 24397 + : + O + + + ningly + ningly + 24399 + 24405 + RB + O + + + underfunded + 
underfund + 24406 + 24417 + VBN + O + + + . + . + 24417 + 24418 + . + O + + + (ROOT (S (NP (NP (DT That) (NN organization)) (, ,) (NP (NP (DT the) (NNP United) (NNP Na)) (: -) (NP (NP (NNS tions)) (NP (NNP Environment) (NNP Program)))) (, ,)) (VP (VBZ remains) (S (VP (X (SYM stun)) (: -) (VP (ADVP (RB ningly)) (VBN underfunded))))) (. .))) + + + ROOT + remains + + + organization + That + + + remains + organization + + + Na + the + + + Na + United + + + organization + Na + + + Na + tions + + + Program + Environment + + + tions + Program + + + underfunded + stun + + + underfunded + ningly + + + remains + underfunded + + + + + ROOT + remains + + + organization + That + + + remains + organization + + + Na + the + + + Na + United + + + organization + Na + + + Na + tions + + + Program + Environment + + + tions + Program + + + underfunded + stun + + + underfunded + ningly + + + remains + underfunded + + + + + ROOT + remains + + + organization + That + + + remains + organization + + + Na + the + + + Na + United + + + organization + Na + + + Na + tions + + + Program + Environment + + + tions + Program + + + underfunded + stun + + + underfunded + ningly + + + remains + underfunded + + + + + + + In + in + 24419 + 24421 + IN + O + + + 1989 + 1989 + 24422 + 24426 + CD + DATE + 1989 + 1989 + + + the + the + 24427 + 24430 + DT + O + + + UNEP + unep + 24431 + 24435 + NN + ORGANIZATION + + + budget + budget + 24437 + 24443 + NN + O + + + was + be + 24444 + 24447 + VBD + O + + + a + a + 24448 + 24449 + DT + O + + + mere + mere + 24450 + 24454 + JJ + O + + + $ + $ + 24455 + 24456 + $ + MONEY + $3.0E7 + + + 30 + 30 + 24456 + 24458 + CD + MONEY + $3.0E7 + + + million + million + 24459 + 24466 + CD + MONEY + $3.0E7 + + + , + , + 24466 + 24467 + , + O + + + not + not + 24468 + 24471 + RB + O + + + even + even + 24472 + 24476 + RB + O + + + one + one + 24477 + 24480 + CD + PERCENT + %1.0 + + + percent + percent + 24482 + 24489 + NN + PERCENT + %1.0 + + + of + of + 24490 + 24492 + IN + O + 
+ + U.S. + U.S. + 24493 + 24497 + NNP + LOCATION + + + environmental + environmental + 24498 + 24511 + JJ + O + + + expenditure + expenditure + 24512 + 24523 + NN + O + + + at + at + 24525 + 24527 + IN + O + + + the + the + 24528 + 24531 + DT + O + + + federal + federal + 24532 + 24539 + JJ + O + + + level + level + 24540 + 24545 + NN + O + + + . + . + 24545 + 24546 + . + O + + + (ROOT (S (PP (IN In) (NP (CD 1989))) (NP (DT the) (NN UNEP) (NN budget)) (VP (VBD was) (NP (NP (DT a) (JJ mere) (ADJP (QP ($ $) (CD 30) (CD million))) (PRN (, ,) (RB not) (NP (NP (RB even) (CD one) (NN percent)) (PP (IN of) (NP (NNP U.S.))))) (JJ environmental) (NN expenditure)) (PP (IN at) (NP (DT the) (JJ federal) (NN level))))) (. .))) + + + ROOT + expenditure + + + expenditure + In + + + In + 1989 + + + budget + the + + + budget + UNEP + + + expenditure + budget + + + expenditure + was + + + expenditure + a + + + expenditure + mere + + + expenditure + $ + + + $ + 30 + + + $ + million + + + percent + not + + + percent + even + + + percent + one + + + expenditure + percent + + + percent + of + + + of + U.S. + + + expenditure + environmental + + + expenditure + at + + + level + the + + + level + federal + + + at + level + + + + + ROOT + expenditure + + + expenditure + 1989 + + + budget + the + + + budget + UNEP + + + expenditure + budget + + + expenditure + was + + + expenditure + a + + + expenditure + mere + + + expenditure + $ + + + $ + 30 + + + $ + million + + + percent + not + + + percent + even + + + percent + one + + + expenditure + percent + + + percent + U.S. 
+ + + expenditure + environmental + + + level + the + + + level + federal + + + expenditure + level + + + + + ROOT + expenditure + + + expenditure + 1989 + + + budget + the + + + budget + UNEP + + + expenditure + budget + + + expenditure + was + + + expenditure + a + + + expenditure + mere + + + expenditure + $ + + + $ + 30 + + + $ + million + + + percent + not + + + percent + even + + + percent + one + + + expenditure + percent + + + percent + U.S. + + + expenditure + environmental + + + level + the + + + level + federal + + + expenditure + level + + + + + + + In + in + 24547 + 24549 + IN + O + + + its + its + 24550 + 24553 + PRP$ + O + + + 17 + 17 + 24554 + 24556 + CD + DURATION + 17.0 + P17Y + + + years + year + 24557 + 24562 + NNS + NUMBER + 0.0 + P17Y + + + UNEP + UNEP + 24563 + 24567 + NNP + ORGANIZATION + + + has + have + 24569 + 24572 + VBZ + O + + + had + have + 24573 + 24576 + VBD + O + + + an + a + 24577 + 24579 + DT + O + + + extraordinary + extraordinary + 24580 + 24593 + JJ + O + + + catalytic + catalytic + 24594 + 24603 + JJ + O + + + role + role + 24604 + 24608 + NN + O + + + in + in + 24609 + 24611 + IN + O + + + developing + develop + 24613 + 24623 + VBG + O + + + international + international + 24624 + 24637 + JJ + O + + + environmental + environmental + 24638 + 24651 + JJ + O + + + law + law + 24652 + 24655 + NN + O + + + , + , + 24655 + 24656 + , + O + + + in + in + 24658 + 24660 + IN + O + + + assisting + assist + 24661 + 24670 + VBG + O + + + developing + develop + 24671 + 24681 + VBG + O + + + countries + country + 24682 + 24691 + NNS + O + + + build + build + 24692 + 24697 + VB + O + + + envi + envi + 24698 + 24702 + SYM + O + + + - + - + 24702 + 24703 + : + O + + + ronmental + ronmental + 24705 + 24714 + JJ + O + + + institutions + institution + 24715 + 24727 + NNS + O + + + , + , + 24727 + 24728 + , + O + + + and + and + 24729 + 24732 + CC + O + + + in + in + 24733 + 24735 + IN + O + + + enhancing + enhance + 24736 + 24745 + VBG + O + + + 
an + a + 24746 + 24748 + DT + O + + + awareness + awareness + 24750 + 24759 + NN + O + + + of + of + 24760 + 24762 + IN + O + + + the + the + 24763 + 24766 + DT + O + + + close + close + 24767 + 24772 + JJ + O + + + link + link + 24773 + 24777 + NN + O + + + between + between + 24778 + 24785 + IN + O + + + the + the + 24786 + 24789 + DT + O + + + en + en + 24790 + 24792 + FW + O + + + - + - + 24792 + 24793 + : + O + + + vironment + vironment + 24795 + 24804 + NN + O + + + and + and + 24808 + 24811 + CC + O + + + development + development + 24814 + 24825 + NN + O + + + . + . + 24825 + 24826 + . + O + + + (ROOT (S (PP (IN In) (NP (PRP$ its) (CD 17) (NNS years))) (NP (NNP UNEP)) (VP (VBZ has) (VP (VBD had) (NP (NP (DT an) (JJ extraordinary) (JJ catalytic) (NN role)) (PP (IN in) (S (VP (VBG developing) (NP (JJ international) (JJ environmental) (NN law)))))))) (, ,) (PP (PP (IN in) (S (VP (VBG assisting) (S (VP (VBG developing) (NP (NNS countries)) (VP (VB build) (FRAG (X (SYM envi)) (: -) (NP (JJ ronmental) (NNS institutions))))))))) (, ,) (CC and) (PP (IN in) (S (VP (VBG enhancing) (NP (NP (NP (DT an) (NN awareness)) (PP (IN of) (NP (NP (DT the) (JJ close) (NN link)) (PP (IN between) (NP (DT the) (FW en)))))) (: -) (NP (NN vironment) (CC and) (NN development))))))) (. 
.))) + + + ROOT + had + + + had + In + + + years + its + + + years + 17 + + + In + years + + + had + UNEP + + + had + has + + + role + an + + + role + extraordinary + + + role + catalytic + + + had + role + + + role + in + + + in + developing + + + law + international + + + law + environmental + + + developing + law + + + had + in + + + in + assisting + + + assisting + developing + + + developing + countries + + + developing + build + + + institutions + envi + + + institutions + ronmental + + + build + institutions + + + in + and + + + in + in + + + in + enhancing + + + awareness + an + + + enhancing + awareness + + + awareness + of + + + link + the + + + link + close + + + of + link + + + link + between + + + en + the + + + between + en + + + awareness + vironment + + + vironment + and + + + vironment + development + + + + + ROOT + had + + + years + its + + + years + 17 + + + had + years + + + had + UNEP + + + had + has + + + role + an + + + role + extraordinary + + + role + catalytic + + + had + role + + + role + developing + + + law + international + + + law + environmental + + + developing + law + + + had + assisting + + + assisting + developing + + + developing + countries + + + developing + build + + + institutions + envi + + + institutions + ronmental + + + build + institutions + + + assisting + enhancing + + + awareness + an + + + enhancing + awareness + + + link + the + + + link + close + + + awareness + link + + + en + the + + + link + en + + + awareness + vironment + + + vironment + development + + + + + ROOT + had + + + years + its + + + years + 17 + + + had + years + + + had + UNEP + + + had + has + + + role + an + + + role + extraordinary + + + role + catalytic + + + had + role + + + role + developing + + + law + international + + + law + environmental + + + developing + law + + + had + assisting + + + assisting + developing + + + developing + countries + + + developing + build + + + institutions + envi + + + institutions + ronmental + + + build + 
institutions + + + had + enhancing + + + assisting + enhancing + + + awareness + an + + + enhancing + awareness + + + link + the + + + link + close + + + awareness + link + + + en + the + + + link + en + + + awareness + vironment + + + awareness + development + + + vironment + development + + + + + + + Among + among + 24829 + 24834 + IN + O + + + UNEP + UNEP + 24836 + 24840 + NNP + ORGANIZATION + + + 's + 's + 24840 + 24842 + POS + O + + + recent + recent + 24843 + 24849 + JJ + O + + + achievements + achievement + 24850 + 24862 + NNS + O + + + is + be + 24863 + 24865 + VBZ + O + + + the + the + 24866 + 24869 + DT + O + + + Montre + Montre + 24870 + 24876 + NNP + LOCATION + + + - + - + 24876 + 24877 + : + O + + + al + al + 24879 + 24881 + NNP + O + + + Protocol + Protocol + 24882 + 24890 + NNP + O + + + on + on + 24891 + 24893 + IN + O + + + the + the + 24894 + 24897 + DT + O + + + ozone + ozone + 24898 + 24903 + NN + O + + + layer + layer + 24904 + 24909 + NN + O + + + , + , + 24909 + 24910 + , + O + + + the + the + 24911 + 24914 + DT + O + + + major + major + 24915 + 24920 + JJ + O + + + international + international + 24922 + 24935 + JJ + O + + + environmental + environmental + 24936 + 24949 + JJ + O + + + agreement + agreement + 24950 + 24959 + NN + O + + + of + of + 24960 + 24962 + IN + O + + + the + the + 24964 + 24967 + DT + DATE + THIS P10Y + + + + decade + decade + 24968 + 24974 + NN + DATE + THIS P10Y + + + + and + and + 24975 + 24978 + CC + O + + + the + the + 24979 + 24982 + DT + O + + + first + first + 24983 + 24988 + JJ + ORDINAL + 1.0 + + + serious + serious + 24989 + 24996 + JJ + O + + + effort + effort + 24997 + 25003 + NN + O + + + to + to + 25004 + 25006 + TO + O + + + ad + ad + 25007 + 25009 + NN + O + + + - + - + 25009 + 25010 + : + O + + + dress + dress + 25012 + 25017 + NN + O + + + the + the + 25018 + 25021 + DT + O + + + global + global + 25022 + 25028 + JJ + O + + + warming + warming + 25029 + 25036 + NN + O + + + problem + problem + 25037 
+ 25044 + NN + O + + + . + . + 25044 + 25045 + . + O + + + (ROOT (FRAG (S (PP (IN Among) (NP (NP (NNP UNEP) (POS 's)) (JJ recent) (NNS achievements))) (VP (VBZ is) (NP (DT the) (NNP Montre)))) (: -) (NP (NP (NP (NNP al) (NNP Protocol)) (PP (IN on) (NP (DT the) (NN ozone) (NN layer)))) (, ,) (NP (NP (DT the) (JJ major) (JJ international) (JJ environmental) (NN agreement)) (PP (IN of) (NP (NP (DT the) (NN decade)) (CC and) (NP (DT the) (JJ first) (JJ serious) (NN effort)))) (PP (TO to) (NP (NN ad))) (: -) (NP (NP (NN dress)) (NP (DT the) (JJ global) (NN warming) (NN problem))))) (. .))) + + + ROOT + Protocol + + + Montre + Among + + + achievements + UNEP + + + UNEP + 's + + + achievements + recent + + + Among + achievements + + + Montre + is + + + Montre + the + + + Protocol + Montre + + + Protocol + al + + + Protocol + on + + + layer + the + + + layer + ozone + + + on + layer + + + agreement + the + + + agreement + major + + + agreement + international + + + agreement + environmental + + + Protocol + agreement + + + agreement + of + + + decade + the + + + of + decade + + + decade + and + + + effort + the + + + effort + first + + + effort + serious + + + decade + effort + + + agreement + to + + + to + ad + + + agreement + dress + + + problem + the + + + problem + global + + + problem + warming + + + dress + problem + + + + + ROOT + Protocol + + + achievements + UNEP + + + achievements + recent + + + Montre + achievements + + + Montre + is + + + Montre + the + + + Protocol + Montre + + + Protocol + al + + + layer + the + + + layer + ozone + + + Protocol + layer + + + agreement + the + + + agreement + major + + + agreement + international + + + agreement + environmental + + + Protocol + agreement + + + decade + the + + + agreement + decade + + + effort + the + + + effort + first + + + effort + serious + + + decade + effort + + + agreement + ad + + + agreement + dress + + + problem + the + + + problem + global + + + problem + warming + + + dress + problem + + + + + ROOT 
+ Protocol + + + achievements + UNEP + + + achievements + recent + + + Montre + achievements + + + Montre + is + + + Montre + the + + + Protocol + Montre + + + Protocol + al + + + layer + the + + + layer + ozone + + + Protocol + layer + + + agreement + the + + + agreement + major + + + agreement + international + + + agreement + environmental + + + Protocol + agreement + + + decade + the + + + agreement + decade + + + effort + the + + + effort + first + + + effort + serious + + + agreement + effort + + + decade + effort + + + agreement + ad + + + agreement + dress + + + problem + the + + + problem + global + + + problem + warming + + + dress + problem + + + + + + + This + this + 25046 + 25050 + DT + O + + + alone + alone + 25052 + 25057 + RB + O + + + would + would + 25058 + 25063 + MD + O + + + , + , + 25063 + 25064 + , + O + + + In + in + 25065 + 25067 + IN + O + + + my + my + 25068 + 25070 + PRP$ + O + + + view + view + 25071 + 25075 + NN + O + + + , + , + 25075 + 25076 + , + O + + + justify + justify + 25077 + 25084 + VB + O + + + the + the + 25085 + 25088 + DT + O + + + paltry + paltry + 25089 + 25095 + JJ + O + + + sums + sum + 25097 + 25101 + NNS + O + + + our + we + 25102 + 25105 + PRP$ + O + + + world + world + 25106 + 25111 + NN + O + + + community + community + 25112 + 25121 + NN + O + + + has + have + 25122 + 25125 + VBZ + O + + + expended + expend + 25126 + 25134 + VBN + O + + + on + on + 25135 + 25137 + IN + O + + + UNEP + UNEP + 25139 + 25143 + NNP + ORGANIZATION + + + . + . + 25143 + 25144 + . + O + + + (ROOT (S (NP (DT This)) (ADVP (RB alone)) (VP (MD would) (PRN (, ,) (PP (IN In) (NP (PRP$ my) (NN view))) (, ,)) (VP (VB justify) (NP (NP (DT the) (JJ paltry) (NNS sums)) (SBAR (S (NP (PRP$ our) (NN world) (NN community)) (VP (VBZ has) (VP (VBN expended) (PP (IN on) (NP (NNP UNEP)))))))))) (. 
.))) + + + ROOT + justify + + + justify + This + + + justify + alone + + + justify + would + + + justify + In + + + view + my + + + In + view + + + sums + the + + + sums + paltry + + + justify + sums + + + community + our + + + community + world + + + expended + community + + + expended + has + + + sums + expended + + + expended + on + + + on + UNEP + + + + + ROOT + justify + + + justify + This + + + justify + alone + + + justify + would + + + justify + In + + + view + my + + + In + view + + + sums + the + + + sums + paltry + + + justify + sums + + + community + our + + + community + world + + + expended + community + + + expended + has + + + sums + expended + + + expended + UNEP + + + + + ROOT + justify + + + justify + This + + + justify + alone + + + justify + would + + + justify + In + + + view + my + + + In + view + + + sums + the + + + sums + paltry + + + justify + sums + + + community + our + + + community + world + + + expended + community + + + expended + has + + + sums + expended + + + expended + UNEP + + + + + + + I + I + 25148 + 25149 + PRP + O + + + believe + believe + 25150 + 25157 + VBP + O + + + we + we + 25158 + 25160 + PRP + O + + + should + should + 25161 + 25167 + MD + O + + + in + in + 25168 + 25170 + IN + O + + + this + this + 25171 + 25175 + DT + DATE + THIS P10Y + + + + decade + decade + 25176 + 25182 + NN + DATE + THIS P10Y + + + + resolve + resolve + 25183 + 25190 + NN + O + + + to + to + 25192 + 25194 + TO + O + + + support + support + 25195 + 25202 + VB + O + + + a + a + 25203 + 25204 + DT + O + + + rapid + rapid + 25205 + 25210 + JJ + O + + + increase + increase + 25211 + 25219 + NN + O + + + in + in + 25220 + 25222 + IN + O + + + the + the + 25223 + 25226 + DT + O + + + size + size + 25227 + 25231 + NN + O + + + and + and + 25232 + 25235 + CC + O + + + scope + scope + 25237 + 25242 + NN + O + + + of + of + 25243 + 25245 + IN + O + + + UNEP + unep + 25246 + 25250 + NN + ORGANIZATION + + + activities + activity + 25251 + 25261 + NNS + O + 
+ + . + . + 25261 + 25262 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP believe) (SBAR (S (NP (PRP we)) (VP (MD should) (S (PP (IN in) (NP (DT this) (NN decade) (NN resolve))) (VP (TO to) (VP (VB support) (NP (DT a) (JJ rapid) (NN increase)) (PP (IN in) (NP (NP (DT the) (NN size) (CC and) (NN scope)) (PP (IN of) (NP (NN UNEP) (NNS activities)))))))))))) (. .))) + + + ROOT + believe + + + believe + I + + + should + we + + + believe + should + + + support + in + + + resolve + this + + + resolve + decade + + + in + resolve + + + support + to + + + should + support + + + increase + a + + + increase + rapid + + + support + increase + + + support + in + + + size + the + + + in + size + + + size + and + + + size + scope + + + size + of + + + activities + UNEP + + + of + activities + + + + + ROOT + believe + + + believe + I + + + should + we + + + believe + should + + + resolve + this + + + resolve + decade + + + support + resolve + + + support + to + + + should + support + + + increase + a + + + increase + rapid + + + support + increase + + + size + the + + + support + size + + + size + scope + + + activities + UNEP + + + size + activities + + + + + ROOT + believe + + + believe + I + + + should + we + + + believe + should + + + resolve + this + + + resolve + decade + + + support + resolve + + + support + to + + + should + support + + + increase + a + + + increase + rapid + + + support + increase + + + size + the + + + support + size + + + support + scope + + + size + scope + + + activities + UNEP + + + size + activities + + + + + + + I + I + 25263 + 25264 + PRP + O + + + would + would + 25265 + 25270 + MD + O + + + urge + urge + 25271 + 25275 + VB + O + + + a + a + 25276 + 25277 + DT + O + + + tenfold + tenfold + 25279 + 25286 + JJ + O + + + expansion + expansion + 25287 + 25296 + NN + O + + + in + in + 25297 + 25299 + IN + O + + + the + the + 25300 + 25303 + DT + O + + + UNEP + UNEP + 25304 + 25308 + NNP + ORGANIZATION + + + budget + budget + 25309 + 25315 + NN + O + + + 
over + over + 25316 + 25320 + IN + O + + + the + the + 25322 + 25325 + DT + DURATION + P3Y + + + next + next + 25326 + 25330 + JJ + DURATION + P3Y + + + three + three + 25331 + 25336 + CD + DURATION + P3Y + + + years + year + 25337 + 25342 + NNS + NUMBER + 0.0 + P3Y + + + . + . + 25342 + 25343 + . + O + + + (ROOT (S (NP (PRP I)) (VP (MD would) (VP (VB urge) (NP (NP (DT a) (JJ tenfold) (NN expansion)) (PP (IN in) (NP (DT the) (NNP UNEP) (NN budget)))) (PP (IN over) (NP (DT the) (JJ next) (CD three) (NNS years))))) (. .))) + + + ROOT + urge + + + urge + I + + + urge + would + + + expansion + a + + + expansion + tenfold + + + urge + expansion + + + expansion + in + + + budget + the + + + budget + UNEP + + + in + budget + + + urge + over + + + years + the + + + years + next + + + years + three + + + over + years + + + + + ROOT + urge + + + urge + I + + + urge + would + + + expansion + a + + + expansion + tenfold + + + urge + expansion + + + budget + the + + + budget + UNEP + + + expansion + budget + + + years + the + + + years + next + + + years + three + + + urge + years + + + + + ROOT + urge + + + urge + I + + + urge + would + + + expansion + a + + + expansion + tenfold + + + urge + expansion + + + budget + the + + + budget + UNEP + + + expansion + budget + + + years + the + + + years + next + + + years + three + + + urge + years + + + + + + + This + this + 25344 + 25348 + DT + O + + + , + , + 25348 + 25349 + , + O + + + of + of + 25350 + 25352 + IN + O + + + course + course + 25353 + 25359 + NN + O + + + , + , + 25359 + 25360 + , + O + + + will + will + 25361 + 25365 + MD + O + + + re + re + 25366 + 25368 + VB + O + + + - + - + 25368 + 25369 + : + O + + + quire + quire + 25371 + 25376 + NN + O + + + leadership + leadership + 25377 + 25387 + NN + O + + + from + from + 25388 + 25392 + IN + O + + + the + the + 25393 + 25396 + DT + O + + + parliamentarians + parliamentarian + 25397 + 25413 + NNS + O + + + amongst + amongst + 25415 + 25422 + IN + O + + + us + we + 25423 
+ 25425 + PRP + O + + + to + to + 25426 + 25428 + TO + O + + + Increase + increase + 25429 + 25437 + VB + O + + + our + we + 25438 + 25441 + PRP$ + O + + + own + own + 25442 + 25445 + JJ + O + + + countries + country + 25446 + 25455 + NNS + O + + + ' + ' + 25455 + 25456 + POS + O + + + contributions + contribution + 25458 + 25471 + NNS + O + + + . + . + 25471 + 25472 + . + O + + + (ROOT (S (NP (NP (DT This)) (, ,) (PP (IN of) (NP (NN course))) (, ,)) (VP (MD will) (VP (VB re) (: -) (S (NP (NP (NN quire) (NN leadership)) (PP (PP (IN from) (NP (DT the) (NNS parliamentarians))) (PP (IN amongst) (NP (PRP us))))) (VP (TO to) (VP (VB Increase) (NP (NP (PRP$ our) (JJ own) (NNS countries) (POS ')) (NNS contributions))))))) (. .))) + + + ROOT + re + + + re + This + + + This + of + + + of + course + + + re + will + + + leadership + quire + + + Increase + leadership + + + leadership + from + + + parliamentarians + the + + + from + parliamentarians + + + from + amongst + + + amongst + us + + + Increase + to + + + re + Increase + + + countries + our + + + countries + own + + + contributions + countries + + + Increase + contributions + + + + + ROOT + re + + + re + This + + + This + course + + + re + will + + + leadership + quire + + + Increase + leadership + + + parliamentarians + the + + + leadership + parliamentarians + + + leadership + us + + + Increase + to + + + re + Increase + + + countries + our + + + countries + own + + + contributions + countries + + + Increase + contributions + + + + + ROOT + re + + + re + This + + + This + course + + + re + will + + + leadership + quire + + + Increase + leadership + + + parliamentarians + the + + + leadership + parliamentarians + + + leadership + us + + + Increase + to + + + re + Increase + + + countries + our + + + countries + own + + + contributions + countries + + + Increase + contributions + + + + + + + However + however + 25473 + 25480 + RB + O + + + , + , + 25480 + 25481 + , + O + + + even + even + 25482 + 25486 + RB + O + + + 
at + at + 25487 + 25489 + IN + O + + + the + the + 25490 + 25493 + DT + O + + + $ + $ + 25494 + 25495 + $ + MONEY + $3.0E8 + + + 300 + 300 + 25495 + 25498 + CD + MONEY + $3.0E8 + + + million + million + 25500 + 25507 + CD + MONEY + $3.0E8 + + + level + level + 25508 + 25513 + NN + O + + + , + , + 25513 + 25514 + , + O + + + UNEP + unep + 25515 + 25519 + NN + ORGANIZATION + + + would + would + 25520 + 25525 + MD + O + + + still + still + 25526 + 25531 + RB + O + + + be + be + 25532 + 25534 + VB + O + + + a + a + 25535 + 25536 + DT + O + + + modest + modest + 25537 + 25543 + JJ + O + + + sized + size + 25545 + 25550 + VBN + O + + + U.N. + U.N. + 25551 + 25555 + NNP + ORGANIZATION + + + agency + agency + 25556 + 25562 + NN + O + + + , + , + 25562 + 25563 + , + O + + + and + and + 25564 + 25567 + CC + O + + + the + the + 25568 + 25571 + DT + O + + + overall + overall + 25572 + 25579 + JJ + O + + + effort + effort + 25580 + 25586 + NN + O + + + would + would + 25588 + 25593 + MD + O + + + be + be + 25594 + 25596 + VB + O + + + still + still + 25597 + 25602 + RB + O + + + small + small + 25603 + 25608 + JJ + O + + + as + as + 25609 + 25611 + IN + O + + + compared + compare + 25612 + 25620 + VBN + O + + + to + to + 25621 + 25623 + TO + O + + + the + the + 25624 + 25627 + DT + O + + + envi + envi + 25628 + 25632 + SYM + O + + + - + - + 25632 + 25633 + : + O + + + ronmental + ronmental + 25635 + 25644 + JJ + O + + + needs + need + 25645 + 25650 + NNS + O + + + of + of + 25651 + 25653 + IN + O + + + the + the + 25654 + 25657 + DT + O + + + developing + develop + 25658 + 25668 + VBG + O + + + world + world + 25669 + 25674 + NN + O + + + or + or + 25675 + 25677 + CC + O + + + the + the + 25679 + 25682 + DT + O + + + scale + scale + 25683 + 25688 + NN + O + + + of + of + 25689 + 25691 + IN + O + + + the + the + 25692 + 25695 + DT + O + + + global + global + 25696 + 25702 + JJ + O + + + environmental + environmental + 25703 + 25716 + JJ + O + + + prob + prob + 25717 + 25721 + NN 
+ O + + + - + - + 25721 + 25722 + : + O + + + lem + lem + 25724 + 25727 + NN + O + + + . + . + 25727 + 25728 + . + O + + + (ROOT (S (ADVP (RB However)) (, ,) (PP (RB even) (PP (IN at) (NP (DT the) (ADJP (QP ($ $) (CD 300) (CD million))) (NN level)))) (, ,) (NP (NN UNEP)) (VP (MD would) (ADVP (RB still)) (VP (VB be) (NP (NP (NP (DT a) (JJ modest) (VBN sized) (NNP U.N.) (NN agency)) (, ,) (CC and) (NP (NP (DT the) (JJ overall) (NN effort)) (SBAR (S (VP (MD would) (VP (VB be) (ADJP (RB still) (JJ small) (PP (IN as))) (PP (VBN compared) (PP (TO to) (NP (NP (DT the)) (X (SYM envi))))))))))) (PRN (: -) (NP (NP (NP (JJ ronmental) (NNS needs)) (PP (IN of) (NP (DT the) (VBG developing) (NN world)))) (CC or) (NP (NP (DT the) (NN scale)) (PP (IN of) (NP (DT the) (JJ global) (JJ environmental) (NN prob))))) (: -)) (NP (NN lem))))) (. .))) + + + ROOT + agency + + + agency + However + + + at + even + + + agency + at + + + level + the + + + level + $ + + + $ + 300 + + + $ + million + + + at + level + + + agency + UNEP + + + agency + would + + + agency + still + + + agency + be + + + agency + a + + + agency + modest + + + agency + sized + + + agency + U.N. 
+ + + agency + and + + + effort + the + + + effort + overall + + + agency + effort + + + small + would + + + small + be + + + small + still + + + effort + small + + + small + as + + + small + compared + + + compared + to + + + to + the + + + the + envi + + + needs + ronmental + + + agency + needs + + + needs + of + + + world + the + + + world + developing + + + of + world + + + needs + or + + + scale + the + + + needs + scale + + + scale + of + + + prob + the + + + prob + global + + + prob + environmental + + + of + prob + + + agency + lem + + + + + ROOT + agency + + + agency + However + + + agency + even + + + level + the + + + level + $ + + + $ + 300 + + + $ + million + + + agency + level + + + agency + UNEP + + + agency + would + + + agency + still + + + agency + be + + + agency + a + + + agency + modest + + + agency + sized + + + agency + U.N. + + + effort + the + + + effort + overall + + + agency + effort + + + small + would + + + small + be + + + small + still + + + effort + small + + + small + as + + + small + to + + + small + the + + + the + envi + + + needs + ronmental + + + agency + needs + + + world + the + + + world + developing + + + needs + world + + + scale + the + + + needs + scale + + + prob + the + + + prob + global + + + prob + environmental + + + scale + prob + + + agency + lem + + + + + ROOT + agency + + + agency + However + + + agency + even + + + level + the + + + level + $ + + + $ + 300 + + + $ + million + + + agency + level + + + agency + UNEP + + + agency + would + + + agency + still + + + agency + be + + + agency + a + + + agency + modest + + + agency + sized + + + agency + U.N. 
+ + + effort + the + + + effort + overall + + + agency + effort + + + small + would + + + small + be + + + small + still + + + effort + small + + + small + as + + + small + to + + + small + the + + + the + envi + + + needs + ronmental + + + agency + needs + + + world + the + + + world + developing + + + needs + world + + + scale + the + + + agency + scale + + + needs + scale + + + prob + the + + + prob + global + + + prob + environmental + + + scale + prob + + + agency + lem + + + + + + + As + as + 25732 + 25734 + IN + O + + + you + you + 25735 + 25738 + PRP + O + + + will + will + 25739 + 25743 + MD + O + + + have + have + 25744 + 25748 + VB + O + + + noticed + notice + 25749 + 25756 + VBN + O + + + my + my + 25757 + 25759 + PRP$ + O + + + remarks + remark + 25760 + 25767 + NNS + O + + + have + have + 25768 + 25772 + VBP + O + + + focused + focus + 25776 + 25783 + VBN + O + + + heavily + heavily + 25784 + 25791 + RB + O + + + on + on + 25792 + 25794 + IN + O + + + the + the + 25795 + 25798 + DT + O + + + Issue + issue + 25799 + 25804 + NN + O + + + of + of + 25805 + 25807 + IN + O + + + resources + resource + 25808 + 25817 + NNS + O + + + . + . + 25817 + 25818 + . + O + + + (ROOT (S (SBAR (IN As) (S (NP (PRP you)) (VP (MD will) (VP (VB have) (VP (VBN noticed) (NP (PRP$ my) (NNS remarks))))))) (VP (VBP have) (VP (VBN focused) (ADVP (RB heavily)) (PP (IN on) (NP (NP (DT the) (NN Issue)) (PP (IN of) (NP (NNS resources))))))) (. 
.))) + + + ROOT + focused + + + noticed + As + + + noticed + you + + + noticed + will + + + noticed + have + + + focused + noticed + + + remarks + my + + + noticed + remarks + + + focused + have + + + focused + heavily + + + focused + on + + + Issue + the + + + on + Issue + + + Issue + of + + + of + resources + + + + + ROOT + focused + + + noticed + As + + + noticed + you + + + noticed + will + + + noticed + have + + + focused + noticed + + + remarks + my + + + noticed + remarks + + + focused + have + + + focused + heavily + + + Issue + the + + + focused + Issue + + + Issue + resources + + + + + ROOT + focused + + + noticed + As + + + noticed + you + + + noticed + will + + + noticed + have + + + focused + noticed + + + remarks + my + + + noticed + remarks + + + focused + have + + + focused + heavily + + + Issue + the + + + focused + Issue + + + Issue + resources + + + + + + + After + after + 25822 + 25827 + IN + O + + + a + a + 25828 + 25829 + DT + DURATION + P10Y + + + decade + decade + 25830 + 25836 + NN + DURATION + P10Y + + + of + of + 25837 + 25839 + IN + O + + + borrowing + borrowing + 25840 + 25849 + NN + O + + + and + and + 25850 + 25853 + CC + O + + + spending + spending + 25854 + 25862 + NN + O + + + , + , + 25862 + 25863 + , + O + + + it + it + 25864 + 25866 + PRP + O + + + has + have + 25870 + 25873 + VBZ + O + + + become + become + 25874 + 25880 + VBN + O + + + fashionable + fashionable + 25881 + 25892 + JJ + O + + + in + in + 25893 + 25895 + IN + O + + + the + the + 25896 + 25899 + DT + O + + + United + United + 25900 + 25906 + NNP + LOCATION + + + States + States + 25908 + 25914 + NNPS + LOCATION + + + to + to + 25915 + 25917 + TO + O + + + talk + talk + 25918 + 25922 + VB + O + + + of + of + 25923 + 25925 + IN + O + + + actions + action + 25926 + 25933 + NNS + O + + + that + that + 25934 + 25938 + WDT + O + + + do + do + 25939 + 25941 + VBP + O + + + not + not + 25942 + 25945 + RB + O + + + cost + cost + 25946 + 25950 + VB + O + + + money + money + 
25952 + 25957 + NN + O + + + . + . + 25957 + 25958 + . + O + + + (ROOT (S (PP (IN After) (NP (NP (DT a) (NN decade)) (PP (IN of) (NP (NN borrowing) (CC and) (NN spending))))) (, ,) (NP (PRP it)) (VP (VBZ has) (VP (VBN become) (S (ADJP (JJ fashionable))) (PP (IN in) (NP (DT the) (NNP United) (NNPS States))) (VP (TO to) (VP (VB talk) (PP (IN of) (NP (NP (NNS actions)) (SBAR (WHNP (WDT that)) (S (VP (VBP do) (RB not) (VP (VB cost) (NP (NN money)))))))))))) (. .))) + + + ROOT + talk + + + talk + After + + + decade + a + + + After + decade + + + decade + of + + + of + borrowing + + + borrowing + and + + + borrowing + spending + + + talk + it + + + talk + has + + + talk + become + + + talk + fashionable + + + talk + in + + + States + the + + + States + United + + + in + States + + + talk + to + + + talk + of + + + of + actions + + + cost + that + + + cost + do + + + cost + not + + + actions + cost + + + cost + money + + + + + ROOT + talk + + + decade + a + + + talk + decade + + + decade + borrowing + + + borrowing + spending + + + talk + it + + + talk + has + + + talk + become + + + talk + fashionable + + + States + the + + + States + United + + + talk + States + + + talk + to + + + talk + actions + + + cost + that + + + cost + do + + + cost + not + + + actions + cost + + + cost + money + + + + + ROOT + talk + + + decade + a + + + talk + decade + + + decade + borrowing + + + decade + spending + + + borrowing + spending + + + talk + it + + + talk + has + + + talk + become + + + talk + fashionable + + + States + the + + + States + United + + + talk + States + + + talk + to + + + talk + actions + + + cost + that + + + cost + do + + + cost + not + + + actions + cost + + + cost + money + + + + + + + Given + give + 25959 + 25964 + VBN + O + + + the + the + 25965 + 25968 + DT + O + + + economic + economic + 25969 + 25977 + JJ + O + + + crisis + crisis + 25978 + 25984 + NN + O + + + of + of + 25985 + 25987 + IN + O + + + the + the + 25988 + 25991 + DT + O + + + East + East + 
25993 + 25997 + NNP + LOCATION + + + , + , + 25997 + 25998 + , + O + + + they + they + 25999 + 26003 + PRP + O + + + too + too + 26004 + 26007 + RB + O + + + may + may + 26008 + 26011 + MD + O + + + be + be + 26012 + 26014 + VB + O + + + subject + subject + 26015 + 26022 + JJ + O + + + to + to + 26023 + 26025 + TO + O + + + the + the + 26026 + 26029 + DT + O + + + same + same + 26030 + 26034 + JJ + O + + + tendency + tendency + 26036 + 26044 + NN + O + + + . + . + 26044 + 26045 + . + O + + + (ROOT (S (PP (VBN Given) (NP (NP (DT the) (JJ economic) (NN crisis)) (PP (IN of) (NP (DT the) (NNP East))))) (, ,) (NP (PRP they)) (ADVP (RB too)) (VP (MD may) (VP (VB be) (ADJP (JJ subject) (PP (TO to) (NP (DT the) (JJ same) (NN tendency)))))) (. .))) + + + ROOT + subject + + + subject + Given + + + crisis + the + + + crisis + economic + + + Given + crisis + + + crisis + of + + + East + the + + + of + East + + + subject + they + + + subject + too + + + subject + may + + + subject + be + + + subject + to + + + tendency + the + + + tendency + same + + + to + tendency + + + + + ROOT + subject + + + subject + Given + + + crisis + the + + + crisis + economic + + + Given + crisis + + + East + the + + + crisis + East + + + subject + they + + + subject + too + + + subject + may + + + subject + be + + + tendency + the + + + tendency + same + + + subject + tendency + + + + + ROOT + subject + + + subject + Given + + + crisis + the + + + crisis + economic + + + Given + crisis + + + East + the + + + crisis + East + + + subject + they + + + subject + too + + + subject + may + + + subject + be + + + tendency + the + + + tendency + same + + + subject + tendency + + + + + + + And + and + 26046 + 26049 + CC + O + + + there + there + 26050 + 26055 + EX + O + + + is + be + 26056 + 26058 + VBZ + O + + + , + , + 26058 + 26059 + , + O + + + of + of + 26060 + 26062 + IN + O + + + course + course + 26063 + 26069 + NN + O + + + , + , + 26069 + 26070 + , + O + + + much + much + 26071 + 26075 + RB + O + 
+ + that + that + 26076 + 26080 + IN + O + + + can + can + 26082 + 26085 + MD + O + + + be + be + 26086 + 26088 + VB + O + + + done + do + 26089 + 26093 + VBN + O + + + to + to + 26094 + 26096 + TO + O + + + protect + protect + 26097 + 26104 + VB + O + + + the + the + 26105 + 26108 + DT + O + + + environment + environment + 26109 + 26120 + NN + O + + + without + without + 26122 + 26129 + IN + O + + + costing + cost + 26130 + 26137 + VBG + O + + + a + a + 26138 + 26139 + DT + O + + + lot + lot + 26140 + 26143 + NN + O + + + of + of + 26144 + 26146 + IN + O + + + money + money + 26147 + 26152 + NN + O + + + . + . + 26152 + 26153 + . + O + + + (ROOT (S (CC And) (NP (EX there)) (VP (VBZ is) (, ,) (PP (IN of) (NP (NN course))) (, ,) (SBAR (RB much) (IN that) (S (VP (MD can) (VP (VB be) (VP (VBN done) (S (VP (TO to) (VP (VB protect) (NP (DT the) (NN environment)) (PP (IN without) (S (VP (VBG costing) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN money)))))))))))))))) (. .))) + + + ROOT + is + + + is + And + + + is + there + + + is + of + + + of + course + + + done + much + + + done + that + + + done + can + + + done + be + + + is + done + + + protect + to + + + done + protect + + + environment + the + + + protect + environment + + + protect + without + + + without + costing + + + lot + a + + + costing + lot + + + lot + of + + + of + money + + + + + ROOT + is + + + is + And + + + is + there + + + is + course + + + done + much + + + done + that + + + done + can + + + done + be + + + is + done + + + protect + to + + + done + protect + + + environment + the + + + protect + environment + + + protect + costing + + + lot + a + + + costing + lot + + + lot + money + + + + + ROOT + is + + + is + And + + + is + there + + + is + course + + + done + much + + + done + that + + + done + can + + + done + be + + + is + done + + + protect + to + + + done + protect + + + environment + the + + + protect + environment + + + protect + costing + + + lot + a + + + costing + lot + + + lot + money 
+ + + + + + + However + however + 26154 + 26161 + RB + O + + + , + , + 26161 + 26162 + , + O + + + we + we + 26163 + 26165 + PRP + O + + + can + can + 26167 + 26170 + MD + O + + + not + not + 26170 + 26173 + RB + O + + + seriously + seriously + 26174 + 26183 + RB + O + + + address + address + 26184 + 26191 + VB + O + + + our + we + 26192 + 26195 + PRP$ + O + + + environmental + environmental + 26196 + 26209 + JJ + O + + + crisis + crisis + 26211 + 26217 + NN + O + + + unless + unless + 26218 + 26224 + IN + O + + + we + we + 26225 + 26227 + PRP + O + + + are + be + 26228 + 26231 + VBP + O + + + also + also + 26232 + 26236 + RB + O + + + prepared + prepare + 26237 + 26245 + VBN + O + + + to + to + 26246 + 26248 + TO + O + + + address + address + 26249 + 26256 + VB + O + + + the + the + 26258 + 26261 + DT + O + + + need + need + 26262 + 26266 + NN + O + + + for + for + 26267 + 26270 + IN + O + + + major + major + 26271 + 26276 + JJ + O + + + new + new + 26277 + 26280 + JJ + O + + + resources + resource + 26281 + 26290 + NNS + O + + + . + . + 26290 + 26291 + . + O + + + (ROOT (S (ADVP (RB However)) (, ,) (NP (PRP we)) (VP (MD can) (RB not) (ADVP (RB seriously)) (VP (VB address) (NP (PRP$ our) (JJ environmental) (NN crisis)) (SBAR (IN unless) (S (NP (PRP we)) (VP (VBP are) (ADVP (RB also)) (VP (VBN prepared) (S (VP (TO to) (VP (VB address) (NP (NP (DT the) (NN need)) (PP (IN for) (NP (JJ major) (JJ new) (NNS resources))))))))))))) (. 
.))) + + + ROOT + address + + + address + However + + + address + we + + + address + can + + + address + not + + + address + seriously + + + crisis + our + + + crisis + environmental + + + address + crisis + + + prepared + unless + + + prepared + we + + + prepared + are + + + prepared + also + + + address + prepared + + + address + to + + + prepared + address + + + need + the + + + address + need + + + need + for + + + resources + major + + + resources + new + + + for + resources + + + + + ROOT + address + + + address + However + + + address + we + + + address + can + + + address + not + + + address + seriously + + + crisis + our + + + crisis + environmental + + + address + crisis + + + prepared + unless + + + prepared + we + + + prepared + are + + + prepared + also + + + address + prepared + + + address + to + + + prepared + address + + + need + the + + + address + need + + + resources + major + + + resources + new + + + need + resources + + + + + ROOT + address + + + address + However + + + address + we + + + address + can + + + address + not + + + address + seriously + + + crisis + our + + + crisis + environmental + + + address + crisis + + + prepared + unless + + + prepared + we + + + prepared + are + + + prepared + also + + + address + prepared + + + address + to + + + prepared + address + + + need + the + + + address + need + + + resources + major + + + resources + new + + + need + resources + + + + + + + Hence + hence + 26292 + 26297 + RB + O + + + the + the + 26299 + 26302 + DT + O + + + Importance + importance + 26303 + 26313 + NN + O + + + I + I + 26314 + 26315 + PRP + O + + + have + have + 26316 + 26320 + VBP + O + + + given + give + 26321 + 26326 + VBN + O + + + to + to + 26327 + 26329 + TO + O + + + means + means + 26330 + 26335 + NNS + O + + + for + for + 26336 + 26339 + IN + O + + + finding + find + 26341 + 26348 + VBG + O + + + such + such + 26349 + 26353 + JJ + O + + + resources + resource + 26354 + 26363 + NNS + O + + + . + . + 26363 + 26364 + . 
+ O + + + (ROOT (FRAG (ADVP (RB Hence)) (NP (DT the) (NN Importance) (SBAR (S (NP (PRP I)) (VP (VBP have) (VP (VBN given) (PP (TO to) (NP (NNS means))) (PP (IN for) (S (VP (VBG finding) (NP (JJ such) (NNS resources)))))))))) (. .))) + + + ROOT + Importance + + + Importance + Hence + + + Importance + the + + + given + I + + + given + have + + + Importance + given + + + given + to + + + to + means + + + given + for + + + for + finding + + + resources + such + + + finding + resources + + + + + ROOT + Importance + + + Importance + Hence + + + Importance + the + + + given + I + + + given + have + + + Importance + given + + + given + means + + + given + finding + + + resources + such + + + finding + resources + + + + + ROOT + Importance + + + Importance + Hence + + + Importance + the + + + given + I + + + given + have + + + Importance + given + + + given + means + + + given + finding + + + resources + such + + + finding + resources + + + + + + + As + as + 26368 + 26370 + IN + O + + + a + a + 26371 + 26372 + DT + O + + + planet + planet + 26373 + 26379 + NN + O + + + we + we + 26380 + 26382 + PRP + O + + + face + face + 26383 + 26387 + VBP + O + + + a + a + 26388 + 26389 + DT + O + + + threat + threat + 26390 + 26396 + NN + O + + + to + to + 26397 + 26399 + TO + O + + + our + we + 26400 + 26403 + PRP$ + O + + + surviv + surviv + 26404 + 26410 + NN + O + + + - + - + 26410 + 26411 + : + O + + + al + al + 26415 + 26417 + NNP + O + + + comparable + comparable + 26418 + 26428 + JJ + O + + + to + to + 26429 + 26431 + TO + O + + + the + the + 26432 + 26435 + DT + O + + + threat + threat + 26436 + 26442 + NN + O + + + a + a + 26443 + 26444 + DT + O + + + foreign + foreign + 26445 + 26452 + JJ + O + + + enemy + enemy + 26453 + 26458 + NN + O + + + can + can + 26462 + 26465 + MD + O + + + pose + pose + 26466 + 26470 + VB + O + + + to + to + 26471 + 26473 + TO + O + + + national + national + 26474 + 26482 + JJ + O + + + survival + survival + 26483 + 26491 + NN + O + + + . + . 
+ 26491 + 26492 + . + O + + + (ROOT (S (PP (IN As) (NP (DT a) (NN planet))) (NP (PRP we)) (VP (VBP face) (NP (DT a) (NN threat)) (PP (TO to) (NP (PRP$ our) (NN surviv))) (: -) (S (NP (NNP al)) (ADJP (JJ comparable) (PP (TO to) (NP (NP (DT the) (NN threat)) (SBAR (S (NP (DT a) (JJ foreign) (NN enemy)) (VP (MD can) (VP (VB pose) (PP (TO to) (NP (JJ national) (NN survival)))))))))))) (. .))) + + + ROOT + face + + + face + As + + + planet + a + + + As + planet + + + face + we + + + threat + a + + + face + threat + + + face + to + + + surviv + our + + + to + surviv + + + comparable + al + + + face + comparable + + + comparable + to + + + threat + the + + + to + threat + + + enemy + a + + + enemy + foreign + + + pose + enemy + + + pose + can + + + threat + pose + + + pose + to + + + survival + national + + + to + survival + + + + + ROOT + face + + + planet + a + + + face + planet + + + face + we + + + threat + a + + + face + threat + + + surviv + our + + + face + surviv + + + comparable + al + + + face + comparable + + + threat + the + + + comparable + threat + + + enemy + a + + + enemy + foreign + + + pose + enemy + + + pose + can + + + threat + pose + + + survival + national + + + pose + survival + + + + + ROOT + face + + + planet + a + + + face + planet + + + face + we + + + threat + a + + + face + threat + + + surviv + our + + + face + surviv + + + comparable + al + + + face + comparable + + + threat + the + + + comparable + threat + + + enemy + a + + + enemy + foreign + + + pose + enemy + + + pose + can + + + threat + pose + + + survival + national + + + pose + survival + + + + + + + New + New + 26493 + 26496 + NNP + O + + + ideas + idea + 26497 + 26502 + NNS + O + + + and + and + 26503 + 26506 + CC + O + + + cost-free + cost-free + 26508 + 26517 + JJ + O + + + measures + measure + 26518 + 26526 + NNS + O + + + have + have + 26527 + 26531 + VBP + O + + + their + they + 26532 + 26537 + PRP$ + O + + + place + place + 26538 + 26543 + NN + O + + + . + . 
+ 26543 + 26544 + . + O + + + (ROOT (S (NP (NP (NNP New) (NNS ideas)) (CC and) (NP (JJ cost-free) (NNS measures))) (VP (VBP have) (NP (PRP$ their) (NN place))) (. .))) + + + ROOT + have + + + ideas + New + + + have + ideas + + + ideas + and + + + measures + cost-free + + + ideas + measures + + + place + their + + + have + place + + + + + ROOT + have + + + ideas + New + + + have + ideas + + + measures + cost-free + + + ideas + measures + + + place + their + + + have + place + + + + + ROOT + have + + + ideas + New + + + have + ideas + + + measures + cost-free + + + ideas + measures + + + have + measures + + + place + their + + + have + place + + + + + + + There + there + 26545 + 26550 + EX + O + + + is + be + 26552 + 26554 + VBZ + O + + + , + , + 26554 + 26555 + , + O + + + however + however + 26556 + 26563 + RB + O + + + , + , + 26563 + 26564 + , + O + + + no + no + 26565 + 26567 + DT + O + + + substitute + substitute + 26568 + 26578 + NN + O + + + for + for + 26579 + 26582 + IN + O + + + cold + cold + 26583 + 26587 + JJ + O + + + , + , + 26587 + 26588 + , + O + + + hard + hard + 26589 + 26593 + JJ + O + + + cash + cash + 26595 + 26599 + NN + O + + + . + . + 26599 + 26600 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBZ is) (, ,) (ADVP (RB however)) (, ,) (NP (NP (DT no) (NN substitute)) (PP (IN for) (NP (JJ cold) (, ,) (JJ hard) (NN cash))))) (. 
.))) + + + ROOT + is + + + is + There + + + is + however + + + substitute + no + + + is + substitute + + + substitute + for + + + cash + cold + + + cash + hard + + + for + cash + + + + + ROOT + is + + + is + There + + + is + however + + + substitute + no + + + is + substitute + + + cash + cold + + + cash + hard + + + substitute + cash + + + + + ROOT + is + + + is + There + + + is + however + + + substitute + no + + + is + substitute + + + cash + cold + + + cash + hard + + + substitute + cash + + + + + + + Fortunately + fortunately + 26601 + 26612 + RB + O + + + , + , + 26612 + 26613 + , + O + + + the + the + 26614 + 26617 + DT + O + + + prospective + prospective + 26618 + 26629 + JJ + O + + + peace + peace + 26630 + 26635 + NN + O + + + dividend + dividend + 26637 + 26645 + NN + O + + + provides + provide + 26646 + 26654 + VBZ + O + + + a + a + 26655 + 26656 + DT + O + + + source + source + 26657 + 26663 + NN + O + + + for + for + 26664 + 26667 + IN + O + + + such + such + 26668 + 26672 + JJ + O + + + cash + cash + 26673 + 26677 + NN + O + + + . + . + 26677 + 26678 + . + O + + + (ROOT (S (ADVP (RB Fortunately)) (, ,) (NP (DT the) (JJ prospective) (NN peace) (NN dividend)) (VP (VBZ provides) (NP (NP (DT a) (NN source)) (PP (IN for) (NP (JJ such) (NN cash))))) (. 
.))) + + + ROOT + provides + + + provides + Fortunately + + + dividend + the + + + dividend + prospective + + + dividend + peace + + + provides + dividend + + + source + a + + + provides + source + + + source + for + + + cash + such + + + for + cash + + + + + ROOT + provides + + + provides + Fortunately + + + dividend + the + + + dividend + prospective + + + dividend + peace + + + provides + dividend + + + source + a + + + provides + source + + + cash + such + + + source + cash + + + + + ROOT + provides + + + provides + Fortunately + + + dividend + the + + + dividend + prospective + + + dividend + peace + + + provides + dividend + + + source + a + + + provides + source + + + cash + such + + + source + cash + + + + + + + This + this + 26682 + 26686 + DT + O + + + said + say + 26687 + 26691 + VBD + O + + + , + , + 26691 + 26692 + , + O + + + I + I + 26693 + 26694 + PRP + O + + + would + would + 26695 + 26700 + MD + O + + + like + like + 26701 + 26705 + VB + O + + + to + to + 26706 + 26708 + TO + O + + + put + put + 26709 + 26712 + VB + O + + + in + in + 26713 + 26715 + RP + O + + + a + a + 26716 + 26717 + DT + O + + + word + word + 26718 + 26722 + NN + O + + + on + on + 26723 + 26725 + IN + O + + + behalf + behalf + 26727 + 26733 + NN + O + + + of + of + 26734 + 26736 + IN + O + + + several + several + 26737 + 26744 + JJ + O + + + relatively + relatively + 26745 + 26755 + RB + O + + + low + low + 26756 + 26759 + JJ + O + + + cost + cost + 26760 + 26764 + NN + O + + + environ + environ + 26765 + 26772 + NN + O + + + - + - + 26772 + 26773 + : + O + + + mental + mental + 26775 + 26781 + JJ + O + + + Initiatives + initiative + 26782 + 26793 + NNS + O + + + with + with + 26794 + 26798 + IN + O + + + which + which + 26799 + 26804 + WDT + O + + + I + I + 26805 + 26806 + PRP + O + + + personally + personally + 26807 + 26817 + RB + O + + + have + have + 26819 + 26823 + VBP + O + + + long + long + 26824 + 26828 + RB + O + + + been + be + 26829 + 26833 + VBN + O + + + 
associated + associate + 26834 + 26844 + VBN + O + + + . + . + 26844 + 26845 + . + O + + + (ROOT (S (S (NP (DT This)) (VP (VBD said))) (, ,) (NP (PRP I)) (VP (MD would) (VP (VB like) (S (VP (TO to) (VP (VB put) (PRT (RP in)) (NP (NP (DT a) (NN word)) (PP (IN on) (NP (NP (NN behalf)) (PP (IN of) (NP (NP (JJ several) (ADJP (RB relatively) (JJ low) (NN cost)) (NN environ)) (: -) (NP (NP (JJ mental) (NNS Initiatives)) (SBAR (WHPP (IN with) (WHNP (WDT which))) (S (NP (PRP I)) (ADVP (RB personally)) (VP (VBP have) (ADVP (RB long)) (VP (VBN been) (VP (VBN associated))))))))))))))))) (. .))) + + + ROOT + like + + + said + This + + + like + said + + + like + I + + + like + would + + + put + to + + + like + put + + + put + in + + + word + a + + + put + word + + + word + on + + + on + behalf + + + behalf + of + + + environ + several + + + cost + relatively + + + cost + low + + + environ + cost + + + of + environ + + + Initiatives + mental + + + environ + Initiatives + + + associated + with + + + with + which + + + associated + I + + + associated + personally + + + associated + have + + + associated + long + + + associated + been + + + Initiatives + associated + + + + + ROOT + like + + + said + This + + + like + said + + + like + I + + + like + would + + + put + to + + + like + put + + + put + in + + + word + a + + + put + word + + + environ + several + + + cost + relatively + + + cost + low + + + environ + cost + + + word + environ + + + Initiatives + mental + + + environ + Initiatives + + + associated + which + + + associated + I + + + associated + personally + + + associated + have + + + associated + long + + + associated + been + + + Initiatives + associated + + + + + ROOT + like + + + said + This + + + like + said + + + like + I + + + like + would + + + put + to + + + like + put + + + put + in + + + word + a + + + put + word + + + environ + several + + + cost + relatively + + + cost + low + + + environ + cost + + + word + environ + + + Initiatives + mental + + + environ + 
Initiatives + + + associated + which + + + associated + I + + + associated + personally + + + associated + have + + + associated + long + + + associated + been + + + Initiatives + associated + + + + + + + On + on + 26846 + 26848 + IN + O + + + several + several + 26849 + 26856 + JJ + O + + + occa + occa + 26857 + 26861 + NN + O + + + - + - + 26861 + 26862 + : + O + + + sions + sion + 26864 + 26869 + NNS + O + + + I + I + 26870 + 26871 + PRP + O + + + have + have + 26872 + 26876 + VBP + O + + + persuaded + persuade + 26877 + 26886 + VBN + O + + + my + my + 26887 + 26889 + PRP$ + O + + + Senate + Senate + 26890 + 26896 + NNP + ORGANIZATION + + + col + col + 26897 + 26900 + NN + O + + + - + - + 26900 + 26901 + : + O + + + leagues + league + 26903 + 26910 + NNS + O + + + to + to + 26911 + 26913 + TO + O + + + endorse + endorse + 26914 + 26921 + VB + O + + + resolutions + resolution + 26922 + 26933 + NNS + O + + + containing + contain + 26934 + 26944 + VBG + O + + + draft + draft + 26946 + 26951 + NN + O + + + treaty + treaty + 26952 + 26958 + NN + O + + + language + language + 26959 + 26967 + NN + O + + + . + . + 26967 + 26968 + . + O + + + (ROOT (FRAG (PP (IN On) (NP (JJ several) (NN occa))) (: -) (NP (NP (NNS sions)) (SBAR (S (NP (PRP I)) (VP (VBP have) (VP (VBN persuaded) (NP (PRP$ my) (NNP Senate) (NN col)) (: -) (S (NP (NNS leagues)) (VP (TO to) (VP (VB endorse) (NP (NP (NNS resolutions)) (VP (VBG containing) (NP (NN draft) (NN treaty) (NN language)))))))))))) (. 
.))) + + + ROOT + sions + + + sions + On + + + occa + several + + + On + occa + + + persuaded + I + + + persuaded + have + + + sions + persuaded + + + col + my + + + col + Senate + + + persuaded + col + + + endorse + leagues + + + endorse + to + + + persuaded + endorse + + + endorse + resolutions + + + resolutions + containing + + + language + draft + + + language + treaty + + + containing + language + + + + + ROOT + sions + + + occa + several + + + sions + occa + + + persuaded + I + + + persuaded + have + + + sions + persuaded + + + col + my + + + col + Senate + + + persuaded + col + + + endorse + leagues + + + endorse + to + + + persuaded + endorse + + + endorse + resolutions + + + resolutions + containing + + + language + draft + + + language + treaty + + + containing + language + + + + + ROOT + sions + + + occa + several + + + sions + occa + + + persuaded + I + + + persuaded + have + + + sions + persuaded + + + col + my + + + col + Senate + + + persuaded + col + + + endorse + leagues + + + endorse + to + + + persuaded + endorse + + + endorse + resolutions + + + resolutions + containing + + + language + draft + + + language + treaty + + + containing + language + + + + + + + I + I + 26969 + 26970 + PRP + O + + + am + be + 26971 + 26973 + VBP + O + + + pleased + pleased + 26974 + 26981 + JJ + O + + + to + to + 26982 + 26984 + TO + O + + + say + say + 26985 + 26988 + VB + O + + + that + that + 26990 + 26994 + IN + O + + + two + two + 26995 + 26998 + CD + NUMBER + 2.0 + + + of + of + 26999 + 27001 + IN + O + + + these + these + 27002 + 27007 + DT + O + + + efforts + effort + 27008 + 27015 + NNS + O + + + were + be + 27016 + 27020 + VBD + O + + + , + , + 27020 + 27021 + , + O + + + In + in + 27022 + 27024 + IN + O + + + fact + fact + 27025 + 27029 + NN + O + + + , + , + 27029 + 27030 + , + O + + + con + con + 27031 + 27034 + NN + O + + + - + - + 27034 + 27035 + : + O + + + verted + vert + 27037 + 27043 + VBN + O + + + from + from + 27044 + 27048 + IN + O + + + Senate 
+ Senate + 27049 + 27055 + NNP + ORGANIZATION + + + resolution + resolution + 27056 + 27066 + NN + O + + + into + into + 27067 + 27071 + IN + O + + + an + a + 27072 + 27074 + DT + O + + + actual + actual + 27075 + 27081 + JJ + O + + + treaty + treaty + 27083 + 27089 + NN + O + + + now + now + 27090 + 27093 + RB + DATE + PRESENT_REF + PRESENT_REF + + + In + in + 27094 + 27096 + IN + O + + + force + force + 27097 + 27102 + NN + O + + + . + . + 27102 + 27103 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ pleased) (S (VP (TO to) (VP (VB say) (SBAR (IN that) (S (NP (NP (CD two)) (PP (IN of) (NP (DT these) (NNS efforts)))) (VP (VP (VBD were)) (, ,) (VP (PP (IN In) (NP (NN fact))) (, ,) (NP (NN con))) (: -) (VP (VBN verted) (PP (IN from) (NP (NNP Senate) (NN resolution)))) (PP (IN into) (NP (DT an) (JJ actual) (NN treaty)))))))))) (PRN (ADVP (RB now)) (PP (IN In) (NP (NN force))))) (. .))) + + + ROOT + pleased + + + pleased + I + + + pleased + am + + + say + to + + + pleased + say + + + were + that + + + were + two + + + two + of + + + efforts + these + + + of + efforts + + + say + were + + + con + In + + + In + fact + + + were + con + + + were + verted + + + verted + from + + + resolution + Senate + + + from + resolution + + + were + into + + + treaty + an + + + treaty + actual + + + into + treaty + + + In + now + + + pleased + In + + + In + force + + + + + ROOT + pleased + + + pleased + I + + + pleased + am + + + say + to + + + pleased + say + + + were + that + + + were + two + + + efforts + these + + + two + efforts + + + say + were + + + con + fact + + + were + con + + + were + verted + + + resolution + Senate + + + verted + resolution + + + treaty + an + + + treaty + actual + + + were + treaty + + + In + now + + + pleased + In + + + In + force + + + + + ROOT + pleased + + + pleased + I + + + pleased + am + + + say + to + + + pleased + say + + + were + that + + + were + two + + + efforts + these + + + two + efforts + + + say + were + + + con + fact + + + 
were + con + + + were + verted + + + resolution + Senate + + + verted + resolution + + + treaty + an + + + treaty + actual + + + were + treaty + + + In + now + + + pleased + In + + + In + force + + + + + + + These + these + 27104 + 27109 + DT + O + + + are + be + 27110 + 27113 + VBP + O + + + a + a + 27114 + 27115 + DT + O + + + treaty + treaty + 27116 + 27122 + NN + O + + + ban + ban + 27123 + 27126 + NN + O + + + - + - + 27126 + 27127 + : + O + + + ning + ning + 27129 + 27133 + JJ + O + + + tile + tile + 27134 + 27138 + NN + O + + + emplacement + emplacement + 27139 + 27150 + NN + O + + + of + of + 27151 + 27153 + IN + O + + + weapons + weapon + 27154 + 27161 + NNS + O + + + of + of + 27162 + 27164 + IN + O + + + mass + mass + 27165 + 27169 + NN + O + + + destruction + destruction + 27171 + 27182 + NN + O + + + on + on + 27183 + 27185 + IN + O + + + the + the + 27186 + 27189 + DT + O + + + seabed + seab + 27190 + 27196 + VBN + O + + + floor + floor + 27197 + 27202 + NN + O + + + and + and + 27203 + 27206 + CC + O + + + a + a + 27207 + 27208 + DT + O + + + treaty + treaty + 27209 + 27215 + NN + O + + + banning + ban + 27217 + 27224 + VBG + O + + + the + the + 27225 + 27228 + DT + O + + + use + use + 27229 + 27232 + NN + O + + + of + of + 27233 + 27235 + IN + O + + + environmental + environmental + 27236 + 27249 + JJ + O + + + modifica + modifica + 27250 + 27258 + NN + O + + + - + - + 27258 + 27259 + : + O + + + tion + tion + 27261 + 27265 + NN + O + + + techinques + techinque + 27266 + 27276 + NNS + O + + + In + in + 27277 + 27279 + IN + O + + + warfare + warfare + 27280 + 27287 + NN + O + + + . + . + 27287 + 27288 + . 
+ O + + + (ROOT (S (S (NP (DT These)) (VP (VBP are) (NP (NP (NP (DT a) (NN treaty) (NN ban)) (: -) (NP (NP (JJ ning) (NN tile) (NN emplacement)) (PP (IN of) (NP (NP (NNS weapons)) (PP (IN of) (NP (NP (NN mass) (NN destruction)) (PP (IN on) (NP (DT the) (S (VP (VBN seabed))) (NN floor))))))))) (CC and) (NP (NP (DT a) (NN treaty)) (VP (VBG banning) (NP (NP (DT the) (NN use)) (PP (IN of) (NP (JJ environmental) (NN modifica))))))))) (: -) (S (NP (NN tion) (NNS techinques)) (PP (IN In) (NP (NN warfare)))) (. .))) + + + ROOT + ban + + + ban + These + + + ban + are + + + ban + a + + + ban + treaty + + + emplacement + ning + + + emplacement + tile + + + ban + emplacement + + + emplacement + of + + + of + weapons + + + weapons + of + + + destruction + mass + + + of + destruction + + + destruction + on + + + floor + the + + + floor + seabed + + + on + floor + + + ban + and + + + treaty + a + + + ban + treaty + + + treaty + banning + + + use + the + + + banning + use + + + use + of + + + modifica + environmental + + + of + modifica + + + techinques + tion + + + ban + techinques + + + techinques + In + + + In + warfare + + + + + ROOT + ban + + + ban + These + + + ban + are + + + ban + a + + + ban + treaty + + + emplacement + ning + + + emplacement + tile + + + ban + emplacement + + + emplacement + weapons + + + destruction + mass + + + weapons + destruction + + + floor + the + + + floor + seabed + + + destruction + floor + + + treaty + a + + + ban + treaty + + + treaty + banning + + + use + the + + + banning + use + + + modifica + environmental + + + use + modifica + + + techinques + tion + + + ban + techinques + + + techinques + In + + + In + warfare + + + + + ROOT + ban + + + ban + These + + + ban + are + + + ban + a + + + ban + treaty + + + emplacement + ning + + + emplacement + tile + + + ban + emplacement + + + emplacement + weapons + + + destruction + mass + + + weapons + destruction + + + floor + the + + + floor + seabed + + + destruction + floor + + + treaty + a + + + 
ban + treaty + + + treaty + banning + + + use + the + + + banning + use + + + modifica + environmental + + + use + modifica + + + techinques + tion + + + ban + techinques + + + techinques + In + + + In + warfare + + + + + + + In + in + 27292 + 27294 + IN + O + + + 1977 + 1977 + 27295 + 27299 + CD + DATE + 1977 + 1977 + + + I + I + 27300 + 27301 + PRP + O + + + put + put + 27302 + 27305 + VBD + O + + + forward + forward + 27306 + 27313 + RB + O + + + draft + draft + 27314 + 27319 + NN + O + + + language + language + 27320 + 27328 + NN + O + + + for + for + 27329 + 27332 + IN + O + + + a + a + 27333 + 27334 + DT + O + + + third + third + 27336 + 27341 + JJ + ORDINAL + 3.0 + + + treaty + treaty + 27342 + 27348 + NN + O + + + , + , + 27348 + 27349 + , + O + + + an + a + 27350 + 27352 + DT + O + + + international + international + 27353 + 27366 + JJ + O + + + agreement + agreement + 27367 + 27376 + NN + O + + + mandating + mandate + 27378 + 27387 + VBG + O + + + the + the + 27388 + 27391 + DT + O + + + preparation + preparation + 27392 + 27403 + NN + O + + + of + of + 27404 + 27406 + IN + O + + + an + a + 27407 + 27409 + DT + O + + + environ + environ + 27410 + 27417 + NN + O + + + - + - + 27417 + 27418 + : + O + + + mental + mental + 27420 + 27426 + JJ + O + + + impact + impact + 27427 + 27433 + NN + O + + + assessment + assessment + 27434 + 27444 + NN + O + + + for + for + 27445 + 27448 + IN + O + + + all + all + 27449 + 27452 + DT + O + + + projects + project + 27453 + 27461 + NNS + O + + + , + , + 27461 + 27462 + , + O + + + public + public + 27464 + 27470 + JJ + O + + + and + and + 27471 + 27474 + CC + O + + + private + private + 27475 + 27482 + JJ + O + + + , + , + 27482 + 27483 + , + O + + + that + that + 27484 + 27488 + WDT + O + + + would + would + 27489 + 27494 + MD + O + + + impact + impact + 27495 + 27501 + VB + O + + + on + on + 27502 + 27504 + IN + O + + + the + the + 27506 + 27509 + DT + O + + + territory + territory + 27510 + 27519 + NN + O + + + of + of 
+ 27520 + 27522 + IN + O + + + another + another + 27523 + 27530 + DT + O + + + state + state + 27531 + 27536 + NN + O + + + or + or + 27537 + 27539 + CC + O + + + on + on + 27540 + 27542 + IN + O + + + the + the + 27543 + 27546 + DT + O + + + global + global + 27548 + 27554 + JJ + O + + + commons + common + 27555 + 27562 + NNS + O + + + . + . + 27562 + 27563 + . + O + + + (ROOT (S (PP (IN In) (NP (CD 1977))) (NP (PRP I)) (VP (VBD put) (ADVP (RB forward)) (NP (NP (NN draft) (NN language)) (PP (IN for) (NP (NP (DT a) (JJ third) (NN treaty)) (, ,) (NP (NP (NP (DT an) (JJ international) (NN agreement)) (VP (VBG mandating) (NP (NP (DT the) (NN preparation)) (PP (IN of) (NP (DT an) (NN environ)))))) (: -) (NP (NP (JJ mental) (NN impact) (NN assessment)) (PP (IN for) (NP (DT all) (NNS projects))))) (, ,) (ADJP (JJ public) (CC and) (JJ private)) (, ,) (SBAR (WHNP (WDT that)) (S (VP (MD would) (VP (VB impact) (PP (PP (IN on) (NP (NP (DT the) (NN territory)) (PP (IN of) (NP (DT another) (NN state))))) (CC or) (PP (IN on) (NP (DT the) (JJ global) (NNS commons)))))))))))) (. 
.))) + + + ROOT + put + + + put + In + + + In + 1977 + + + put + I + + + put + forward + + + language + draft + + + put + language + + + language + for + + + treaty + a + + + treaty + third + + + for + treaty + + + agreement + an + + + agreement + international + + + treaty + agreement + + + agreement + mandating + + + preparation + the + + + mandating + preparation + + + preparation + of + + + environ + an + + + of + environ + + + assessment + mental + + + assessment + impact + + + agreement + assessment + + + assessment + for + + + projects + all + + + for + projects + + + treaty + public + + + public + and + + + public + private + + + impact + that + + + impact + would + + + treaty + impact + + + impact + on + + + territory + the + + + on + territory + + + territory + of + + + state + another + + + of + state + + + on + or + + + on + on + + + commons + the + + + commons + global + + + on + commons + + + + + ROOT + put + + + put + 1977 + + + put + I + + + put + forward + + + language + draft + + + put + language + + + treaty + a + + + treaty + third + + + language + treaty + + + agreement + an + + + agreement + international + + + treaty + agreement + + + agreement + mandating + + + preparation + the + + + mandating + preparation + + + environ + an + + + preparation + environ + + + assessment + mental + + + assessment + impact + + + agreement + assessment + + + projects + all + + + assessment + projects + + + treaty + public + + + public + private + + + impact + that + + + impact + would + + + treaty + impact + + + territory + the + + + impact + territory + + + state + another + + + territory + state + + + commons + the + + + commons + global + + + territory + commons + + + + + ROOT + put + + + put + 1977 + + + put + I + + + put + forward + + + language + draft + + + put + language + + + treaty + a + + + treaty + third + + + language + treaty + + + agreement + an + + + agreement + international + + + treaty + agreement + + + agreement + mandating + + + 
preparation + the + + + mandating + preparation + + + environ + an + + + preparation + environ + + + assessment + mental + + + assessment + impact + + + agreement + assessment + + + projects + all + + + assessment + projects + + + treaty + public + + + treaty + private + + + public + private + + + impact + that + + + impact + would + + + treaty + impact + + + territory + the + + + impact + territory + + + state + another + + + territory + state + + + commons + the + + + commons + global + + + impact + commons + + + territory + commons + + + + + + + My + my + 27564 + 27566 + PRP$ + O + + + proposed + propose + 27567 + 27575 + VBN + O + + + Environmen + Environmen + 27576 + 27586 + NNP + O + + + - + - + 27586 + 27587 + : + O + + + tal + tal + 27589 + 27592 + JJ + O + + + Impact + Impact + 27593 + 27599 + NNP + O + + + Assessment + Assessment + 27600 + 27610 + NNP + O + + + Treaty + Treaty + 27611 + 27617 + NNP + O + + + would + would + 27618 + 27623 + MD + O + + + not + not + 27624 + 27627 + RB + O + + + prohibit + prohibit + 27629 + 27637 + VB + O + + + a + a + 27638 + 27639 + DT + O + + + state + state + 27640 + 27645 + NN + O + + + from + from + 27646 + 27650 + IN + O + + + carrying + carry + 27651 + 27659 + VBG + O + + + out + out + 27660 + 27663 + RP + O + + + the + the + 27664 + 27667 + DT + O + + + activi + activus + 27668 + 27674 + NN + O + + + - + - + 27674 + 27675 + : + O + + + ty + ty + 27677 + 27679 + NN + O + + + . + . + 27679 + 27680 + . + O + + + (ROOT (S (S (NP (PRP$ My)) (VP (VBN proposed) (NP (NNP Environmen)))) (: -) (S (NP (JJ tal) (NNP Impact) (NNP Assessment) (NNP Treaty)) (VP (MD would) (RB not) (VP (VB prohibit) (NP (DT a) (NN state)) (PP (IN from) (S (VP (VBG carrying) (PRT (RP out)) (NP (DT the) (NN activi) (: -) (NN ty)))))))) (. 
.))) + + + ROOT + proposed + + + proposed + My + + + proposed + Environmen + + + Treaty + tal + + + Treaty + Impact + + + Treaty + Assessment + + + prohibit + Treaty + + + prohibit + would + + + prohibit + not + + + proposed + prohibit + + + state + a + + + prohibit + state + + + prohibit + from + + + from + carrying + + + carrying + out + + + ty + the + + + ty + activi + + + carrying + ty + + + + + ROOT + proposed + + + proposed + My + + + proposed + Environmen + + + Treaty + tal + + + Treaty + Impact + + + Treaty + Assessment + + + prohibit + Treaty + + + prohibit + would + + + prohibit + not + + + proposed + prohibit + + + state + a + + + prohibit + state + + + prohibit + carrying + + + carrying + out + + + ty + the + + + ty + activi + + + carrying + ty + + + + + ROOT + proposed + + + proposed + My + + + proposed + Environmen + + + Treaty + tal + + + Treaty + Impact + + + Treaty + Assessment + + + prohibit + Treaty + + + prohibit + would + + + prohibit + not + + + proposed + prohibit + + + state + a + + + prohibit + state + + + prohibit + carrying + + + carrying + out + + + ty + the + + + ty + activi + + + carrying + ty + + + + + + + It + it + 27681 + 27683 + PRP + O + + + would + would + 27684 + 27689 + MD + O + + + , + , + 27689 + 27690 + , + O + + + however + however + 27691 + 27698 + RB + O + + + , + , + 27698 + 27699 + , + O + + + be + be + 27700 + 27702 + VB + O + + + required + require + 27703 + 27711 + VBN + O + + + to + to + 27712 + 27714 + TO + O + + + make + make + 27715 + 27719 + VB + O + + + a + a + 27720 + 27721 + DT + O + + + detailed + detailed + 27723 + 27731 + JJ + O + + + assessment + assessment + 27732 + 27742 + NN + O + + + of + of + 27743 + 27745 + IN + O + + + the + the + 27746 + 27749 + DT + O + + + impact + impact + 27750 + 27756 + NN + O + + + of + of + 27757 + 27759 + IN + O + + + the + the + 27760 + 27763 + DT + O + + + ac + ac + 27764 + 27766 + NN + O + + + - + - + 27766 + 27767 + : + O + + + tivity + tivity + 27769 + 27775 + NN + O 
+ + + and + and + 27776 + 27779 + CC + O + + + to + to + 27780 + 27782 + TO + O + + + communicate + communicate + 27783 + 27794 + VB + O + + + this + this + 27795 + 27799 + DT + O + + + information + information + 27800 + 27811 + NN + O + + + to + to + 27813 + 27815 + TO + O + + + the + the + 27816 + 27819 + DT + O + + + affected + affected + 27820 + 27828 + JJ + O + + + countries + country + 27829 + 27838 + NNS + O + + + or + or + 27839 + 27841 + CC + O + + + , + , + 27841 + 27842 + , + O + + + in + in + 27843 + 27845 + IN + O + + + the + the + 27846 + 27849 + DT + O + + + case + case + 27850 + 27854 + NN + O + + + of + of + 27855 + 27857 + IN + O + + + the + the + 27859 + 27862 + DT + O + + + global + global + 27863 + 27869 + JJ + O + + + commons + common + 27870 + 27877 + NNS + O + + + , + , + 27877 + 27878 + , + O + + + to + to + 27879 + 27881 + TO + O + + + the + the + 27882 + 27885 + DT + O + + + United + United + 27886 + 27892 + NNP + ORGANIZATION + + + Nations + Nations + 27893 + 27900 + NNP + ORGANIZATION + + + Environment + Environment + 27902 + 27913 + NNP + ORGANIZATION + + + Program + Program + 27914 + 27921 + NNP + ORGANIZATION + + + . + . + 27921 + 27922 + . + O + + + (ROOT (S (NP (PRP It)) (VP (MD would) (, ,) (ADVP (RB however)) (, ,) (VP (VB be) (VP (VBN required) (S (VP (VP (TO to) (VP (VB make) (NP (NP (DT a) (JJ detailed) (NN assessment)) (PP (IN of) (NP (NP (DT the) (NN impact)) (PP (IN of) (NP (DT the) (NN ac) (: -) (NN tivity)))))))) (CC and) (VP (TO to) (VP (VB communicate) (NP (DT this) (NN information)) (PP (PP (TO to) (NP (DT the) (JJ affected) (NNS countries))) (CC or) (PRN (, ,) (PP (IN in) (NP (NP (DT the) (NN case)) (PP (IN of) (NP (DT the) (JJ global) (NNS commons))))) (, ,)) (PP (TO to) (NP (DT the) (NNP United) (NNP Nations) (NNP Environment) (NNP Program))))))))))) (. 
.))) + + + ROOT + required + + + required + It + + + required + would + + + required + however + + + required + be + + + make + to + + + required + make + + + assessment + a + + + assessment + detailed + + + make + assessment + + + assessment + of + + + impact + the + + + of + impact + + + impact + of + + + tivity + the + + + tivity + ac + + + of + tivity + + + make + and + + + communicate + to + + + make + communicate + + + information + this + + + communicate + information + + + communicate + to + + + countries + the + + + countries + affected + + + to + countries + + + to + or + + + to + in + + + case + the + + + in + case + + + case + of + + + commons + the + + + commons + global + + + of + commons + + + to + to + + + Program + the + + + Program + United + + + Program + Nations + + + Program + Environment + + + to + Program + + + + + ROOT + required + + + required + It + + + required + would + + + required + however + + + required + be + + + make + to + + + required + make + + + assessment + a + + + assessment + detailed + + + make + assessment + + + impact + the + + + assessment + impact + + + tivity + the + + + tivity + ac + + + impact + tivity + + + communicate + to + + + make + communicate + + + information + this + + + communicate + information + + + countries + the + + + countries + affected + + + communicate + countries + + + communicate + in + + + case + the + + + in + case + + + commons + the + + + commons + global + + + case + commons + + + Program + the + + + Program + United + + + Program + Nations + + + Program + Environment + + + countries + Program + + + + + ROOT + required + + + required + It + + + required + would + + + required + however + + + required + be + + + make + to + + + required + make + + + assessment + a + + + assessment + detailed + + + make + assessment + + + impact + the + + + assessment + impact + + + tivity + the + + + tivity + ac + + + impact + tivity + + + communicate + to + + + required + communicate + + + make + communicate 
+ + + information + this + + + communicate + information + + + countries + the + + + countries + affected + + + communicate + countries + + + communicate + in + + + case + the + + + in + case + + + commons + the + + + commons + global + + + case + commons + + + Program + the + + + Program + United + + + Program + Nations + + + Program + Environment + + + communicate + Program + + + countries + Program + + + + + + + This + this + 27926 + 27930 + DT + O + + + idea + idea + 27931 + 27935 + NN + O + + + was + be + 27936 + 27939 + VBD + O + + + endorsed + endorse + 27940 + 27948 + VBN + O + + + unanimously + unanimously + 27949 + 27960 + RB + O + + + by + by + 27961 + 27963 + IN + O + + + the + the + 27965 + 27968 + DT + O + + + U.S. + U.S. + 27969 + 27973 + NNP + ORGANIZATION + + + Senate + Senate + 27974 + 27980 + NNP + ORGANIZATION + + + in + in + 27981 + 27983 + IN + O + + + 1078 + 1078 + 27984 + 27988 + CD + DATE + 1078 + 1078 + + + . + . + 27988 + 27989 + . + O + + + (ROOT (S (NP (DT This) (NN idea)) (VP (VBD was) (VP (VBN endorsed) (ADVP (RB unanimously)) (PP (IN by) (NP (NP (DT the) (NNP U.S.) (NNP Senate)) (PP (IN in) (NP (CD 1078))))))) (. .))) + + + ROOT + endorsed + + + idea + This + + + endorsed + idea + + + endorsed + was + + + endorsed + unanimously + + + endorsed + by + + + Senate + the + + + Senate + U.S. + + + by + Senate + + + Senate + in + + + in + 1078 + + + + + ROOT + endorsed + + + idea + This + + + endorsed + idea + + + endorsed + was + + + endorsed + unanimously + + + Senate + the + + + Senate + U.S. + + + endorsed + Senate + + + Senate + 1078 + + + + + ROOT + endorsed + + + idea + This + + + endorsed + idea + + + endorsed + was + + + endorsed + unanimously + + + Senate + the + + + Senate + U.S. 
+ + + endorsed + Senate + + + Senate + 1078 + + + + + + + Since + since + 27990 + 27995 + IN + O + + + then + then + 27996 + 28000 + RB + O + + + it + it + 28001 + 28003 + PRP + O + + + has + have + 28004 + 28007 + VBZ + O + + + been + be + 28009 + 28013 + VBN + O + + + on + on + 28014 + 28016 + IN + O + + + the + the + 28017 + 28020 + DT + O + + + agenda + agenda + 28021 + 28027 + NN + O + + + of + of + 28028 + 28030 + IN + O + + + the + the + 28031 + 28034 + DT + O + + + UNEP + UNEP + 28035 + 28039 + NNP + ORGANIZATION + + + Governing + Governing + 28040 + 28049 + NNP + O + + + Council + Council + 28051 + 28058 + NNP + O + + + and + and + 28059 + 28062 + CC + O + + + , + , + 28062 + 28063 + , + O + + + as + as + 28064 + 28066 + IN + O + + + principles + principle + 28067 + 28077 + NNS + O + + + to + to + 28078 + 28080 + TO + O + + + be + be + 28081 + 28083 + VB + O + + + followed + follow + 28084 + 28092 + VBN + O + + + by + by + 28093 + 28095 + IN + O + + + member + member + 28097 + 28103 + NN + O + + + states + state + 28104 + 28110 + NNS + O + + + , + , + 28110 + 28111 + , + O + + + has + have + 28112 + 28115 + VBZ + O + + + received + receive + 28116 + 28124 + VBN + O + + + the + the + 28125 + 28128 + DT + O + + + endorse + endorse + 28129 + 28136 + VB + O + + + - + - + 28136 + 28137 + : + O + + + ment + ment + 28139 + 28143 + NN + O + + + of + of + 28144 + 28146 + IN + O + + + that + that + 28147 + 28151 + DT + O + + + Governing + Governing + 28152 + 28161 + NNP + O + + + Council + Council + 28162 + 28169 + NNP + O + + + . + . + 28169 + 28170 + . 
+ O + + + (ROOT (S (PP (IN Since) (NP (RB then))) (NP (PRP it)) (VP (VP (VBZ has) (VP (VBN been) (PP (IN on) (NP (NP (DT the) (NN agenda)) (PP (IN of) (NP (DT the) (NNP UNEP) (NNP Governing) (NNP Council))))))) (CC and) (PRN (, ,) (SBAR (IN as) (S (NP (NNS principles)) (VP (TO to) (VP (VB be) (VP (VBN followed) (PP (IN by) (NP (NN member) (NNS states)))))))) (, ,)) (VP (VP (VBZ has) (VP (VBN received) (S (NP (DT the)) (VP (VB endorse))))) (: -) (NP (NP (NN ment)) (PP (IN of) (NP (DT that) (NNP Governing) (NNP Council)))))) (. .))) + + + ROOT + been + + + been + Since + + + Since + then + + + been + it + + + been + has + + + been + on + + + agenda + the + + + on + agenda + + + agenda + of + + + Council + the + + + Council + UNEP + + + Council + Governing + + + of + Council + + + been + and + + + followed + as + + + followed + principles + + + followed + to + + + followed + be + + + been + followed + + + followed + by + + + states + member + + + by + states + + + received + has + + + been + received + + + endorse + the + + + received + endorse + + + received + ment + + + ment + of + + + Council + that + + + Council + Governing + + + of + Council + + + + + ROOT + been + + + been + Since + + + Since + then + + + been + it + + + been + has + + + agenda + the + + + been + agenda + + + Council + the + + + Council + UNEP + + + Council + Governing + + + agenda + Council + + + followed + as + + + followed + principles + + + followed + to + + + followed + be + + + been + followed + + + states + member + + + followed + states + + + received + has + + + been + received + + + endorse + the + + + received + endorse + + + received + ment + + + Council + that + + + Council + Governing + + + ment + Council + + + + + ROOT + been + + + been + Since + + + Since + then + + + been + it + + + received + it + + + been + has + + + agenda + the + + + been + agenda + + + Council + the + + + Council + UNEP + + + Council + Governing + + + agenda + Council + + + followed + as + + + followed + 
principles + + + followed + to + + + followed + be + + + been + followed + + + states + member + + + followed + states + + + received + has + + + been + received + + + endorse + the + + + received + endorse + + + received + ment + + + Council + that + + + Council + Governing + + + ment + Council + + + + + + + Further + further + 28171 + 28178 + RB + O + + + , + , + 28178 + 28179 + , + O + + + UNEP + UNEP + 28181 + 28185 + NNP + ORGANIZATION + + + 's + 's + 28185 + 28187 + POS + O + + + international + international + 28188 + 28201 + JJ + O + + + law + law + 28202 + 28205 + NN + O + + + unit + unit + 28206 + 28210 + NN + O + + + has + have + 28211 + 28214 + VBZ + O + + + made + make + 28215 + 28219 + VBN + O + + + substantial + substantial + 28221 + 28232 + JJ + O + + + progress + progress + 28233 + 28241 + NN + O + + + toward + toward + 28242 + 28248 + IN + O + + + drafting + draft + 28249 + 28257 + VBG + O + + + a + a + 28258 + 28259 + DT + O + + + treaty + treaty + 28261 + 28267 + NN + O + + + . + . + 28267 + 28268 + . + O + + + (ROOT (S (ADVP (RB Further)) (, ,) (NP (NP (NNP UNEP) (POS 's)) (JJ international) (NN law) (NN unit)) (VP (VBZ has) (VP (VBN made) (NP (JJ substantial) (NN progress)) (PP (IN toward) (S (VP (VBG drafting) (NP (DT a) (NN treaty))))))) (. 
.))) + + + ROOT + made + + + made + Further + + + unit + UNEP + + + UNEP + 's + + + unit + international + + + unit + law + + + made + unit + + + made + has + + + progress + substantial + + + made + progress + + + made + toward + + + toward + drafting + + + treaty + a + + + drafting + treaty + + + + + ROOT + made + + + made + Further + + + unit + UNEP + + + unit + international + + + unit + law + + + made + unit + + + made + has + + + progress + substantial + + + made + progress + + + made + drafting + + + treaty + a + + + drafting + treaty + + + + + ROOT + made + + + made + Further + + + unit + UNEP + + + unit + international + + + unit + law + + + made + unit + + + made + has + + + progress + substantial + + + made + progress + + + made + drafting + + + treaty + a + + + drafting + treaty + + + + + + + I + I + 28269 + 28270 + PRP + O + + + realize + realize + 28271 + 28278 + VBP + O + + + many + many + 28279 + 28283 + JJ + O + + + European + european + 28284 + 28292 + JJ + MISC + + + agreements + agreement + 28293 + 28303 + NNS + O + + + go + go + 28305 + 28307 + VBP + O + + + far + far + 28308 + 28311 + RB + O + + + beyond + beyond + 28312 + 28318 + IN + O + + + this + this + 28319 + 28323 + DT + O + + + treaty + treaty + 28324 + 28330 + NN + O + + + . + . + 28330 + 28331 + . + O + + + (ROOT (S (NP (PRP I)) (VP (VBP realize) (SBAR (S (NP (JJ many) (JJ European) (NNS agreements)) (VP (VBP go) (ADVP (RB far) (PP (IN beyond) (NP (DT this) (NN treaty)))))))) (. 
.))) + + + ROOT + realize + + + realize + I + + + agreements + many + + + agreements + European + + + go + agreements + + + realize + go + + + go + far + + + far + beyond + + + treaty + this + + + beyond + treaty + + + + + ROOT + realize + + + realize + I + + + agreements + many + + + agreements + European + + + go + agreements + + + realize + go + + + go + far + + + treaty + this + + + far + treaty + + + + + ROOT + realize + + + realize + I + + + agreements + many + + + agreements + European + + + go + agreements + + + realize + go + + + go + far + + + treaty + this + + + far + treaty + + + + + + + However + however + 28332 + 28339 + RB + O + + + , + , + 28339 + 28340 + , + O + + + where + where + 28341 + 28346 + WRB + O + + + no + no + 28348 + 28350 + DT + O + + + such + such + 28351 + 28355 + JJ + O + + + agreements + agreement + 28356 + 28366 + NNS + O + + + are + be + 28367 + 28370 + VBP + O + + + in + in + 28371 + 28373 + IN + O + + + place + place + 28374 + 28379 + NN + O + + + , + , + 28379 + 28380 + , + O + + + I + I + 28381 + 28382 + PRP + O + + + believe + believe + 28383 + 28390 + VBP + O + + + this + this + 28392 + 28396 + DT + O + + + Environmental + environmental + 28398 + 28411 + JJ + O + + + Impact + impact + 28412 + 28418 + NN + O + + + Assessment + Assessment + 28419 + 28429 + NNP + O + + + Treaty + Treaty + 28431 + 28437 + NNP + O + + + represents + represent + 28438 + 28448 + VBZ + O + + + an + a + 28449 + 28451 + DT + O + + + important + important + 28452 + 28461 + JJ + O + + + step + step + 28462 + 28466 + NN + O + + + toward + toward + 28467 + 28473 + IN + O + + + greater + greater + 28475 + 28482 + JJR + O + + + environmental + environmental + 28483 + 28496 + JJ + O + + + responsibility + responsibility + 28497 + 28511 + NN + O + + + . + . + 28511 + 28512 + . 
+ O + + + (ROOT (S (ADVP (RB However)) (, ,) (SBAR (WHADVP (WRB where)) (S (NP (DT no) (JJ such) (NNS agreements)) (VP (VBP are) (PP (IN in) (NP (NN place)))))) (, ,) (NP (PRP I)) (VP (VBP believe) (NP (NP (DT this) (JJ Environmental) (NN Impact)) (SBAR (S (NP (NNP Assessment) (NNP Treaty)) (VP (VBZ represents) (NP (NP (DT an) (JJ important) (NN step)) (PP (IN toward) (NP (JJR greater) (JJ environmental) (NN responsibility))))))))) (. .))) + + + ROOT + believe + + + believe + However + + + are + where + + + agreements + no + + + agreements + such + + + are + agreements + + + believe + are + + + are + in + + + in + place + + + believe + I + + + Impact + this + + + Impact + Environmental + + + believe + Impact + + + Treaty + Assessment + + + represents + Treaty + + + Impact + represents + + + step + an + + + step + important + + + represents + step + + + step + toward + + + responsibility + greater + + + responsibility + environmental + + + toward + responsibility + + + + + ROOT + believe + + + believe + However + + + are + where + + + agreements + no + + + agreements + such + + + are + agreements + + + believe + are + + + are + place + + + believe + I + + + Impact + this + + + Impact + Environmental + + + believe + Impact + + + Treaty + Assessment + + + represents + Treaty + + + Impact + represents + + + step + an + + + step + important + + + represents + step + + + responsibility + greater + + + responsibility + environmental + + + step + responsibility + + + + + ROOT + believe + + + believe + However + + + are + where + + + agreements + no + + + agreements + such + + + are + agreements + + + believe + are + + + are + place + + + believe + I + + + Impact + this + + + Impact + Environmental + + + believe + Impact + + + Treaty + Assessment + + + represents + Treaty + + + Impact + represents + + + step + an + + + step + important + + + represents + step + + + responsibility + greater + + + responsibility + environmental + + + step + responsibility + + + + + + + Second 
+ second + 28516 + 28522 + RB + ORDINAL + 2.0 + + + , + , + 28522 + 28523 + , + O + + + I + I + 28524 + 28525 + PRP + O + + + would + would + 28526 + 28531 + MD + O + + + urge + urge + 28532 + 28536 + VB + O + + + we + we + 28537 + 28539 + PRP + O + + + move + move + 28540 + 28544 + VB + O + + + forward + forward + 28545 + 28552 + RB + O + + + quickly + quickly + 28554 + 28561 + RB + O + + + with + with + 28562 + 28566 + IN + O + + + proposals + proposal + 28567 + 28576 + NNS + O + + + to + to + 28577 + 28579 + TO + O + + + draft + draft + 28580 + 28585 + VB + O + + + and + and + 28586 + 28589 + CC + O + + + enact + enact + 28590 + 28595 + VB + O + + + an + a + 28596 + 28598 + DT + O + + + International + International + 28600 + 28613 + NNP + MISC + + + convention + convention + 28614 + 28624 + NN + O + + + to + to + 28625 + 28627 + TO + O + + + protect + protect + 28628 + 28635 + VB + O + + + biologi + biologus + 28636 + 28643 + NN + O + + + - + - + 28643 + 28644 + : + O + + + cal + cal + 28646 + 28649 + JJ + O + + + diversity + diversity + 28650 + 28659 + NN + O + + + . + . + 28659 + 28660 + . + O + + + (ROOT (S (ADVP (RB Second)) (, ,) (NP (PRP I)) (VP (MD would) (VP (VB urge) (S (NP (PRP we)) (VP (VB move) (ADVP (RB forward) (RB quickly)) (PP (IN with) (NP (NNS proposals) (S (VP (TO to) (VP (VB draft) (CC and) (VB enact) (NP (DT an) (NNP International) (NN convention)) (S (VP (TO to) (VP (VB protect) (NP (NP (NN biologi)) (: -) (NP (JJ cal) (NN diversity))))))))))))))) (. 
.))) + + + ROOT + urge + + + urge + Second + + + urge + I + + + urge + would + + + move + we + + + urge + move + + + quickly + forward + + + move + quickly + + + move + with + + + with + proposals + + + draft + to + + + proposals + draft + + + draft + and + + + draft + enact + + + convention + an + + + convention + International + + + draft + convention + + + protect + to + + + draft + protect + + + protect + biologi + + + diversity + cal + + + biologi + diversity + + + + + ROOT + urge + + + urge + Second + + + urge + I + + + urge + would + + + move + we + + + urge + move + + + quickly + forward + + + move + quickly + + + move + proposals + + + draft + to + + + proposals + draft + + + draft + enact + + + convention + an + + + convention + International + + + draft + convention + + + protect + to + + + draft + protect + + + protect + biologi + + + diversity + cal + + + biologi + diversity + + + + + ROOT + urge + + + urge + Second + + + urge + I + + + urge + would + + + move + we + + + urge + move + + + quickly + forward + + + move + quickly + + + move + proposals + + + draft + to + + + proposals + draft + + + proposals + enact + + + draft + enact + + + convention + an + + + convention + International + + + draft + convention + + + protect + to + + + draft + protect + + + protect + biologi + + + diversity + cal + + + biologi + diversity + + + + + + + This + this + 28661 + 28665 + DT + O + + + , + , + 28665 + 28666 + , + O + + + too + too + 28667 + 28670 + RB + O + + + , + , + 28670 + 28671 + , + O + + + is + be + 28672 + 28674 + VBZ + O + + + an + a + 28675 + 28677 + DT + O + + + issue + issue + 28678 + 28683 + NN + O + + + of + of + 28684 + 28686 + IN + O + + + person + person + 28687 + 28693 + NN + O + + + - + - + 28693 + 28694 + : + O + + + al + al + 28696 + 28698 + NNP + O + + + concern + concern + 28699 + 28706 + NN + O + + + and + and + 28707 + 28710 + CC + O + + + I + I + 28711 + 28712 + PRP + O + + + am + be + 28713 + 28715 + VBP + O + + + proud + proud + 
28716 + 28721 + JJ + O + + + to + to + 28722 + 28724 + TO + O + + + be + be + 28725 + 28727 + VB + O + + + the + the + 28728 + 28731 + DT + O + + + author + author + 28732 + 28738 + NN + O + + + of + of + 28740 + 28742 + IN + O + + + a + a + 28743 + 28744 + DT + O + + + provision + provision + 28745 + 28754 + NN + O + + + of + of + 28755 + 28757 + IN + O + + + U.S. + U.S. + 28758 + 28762 + NNP + LOCATION + + + law + law + 28763 + 28766 + NN + O + + + establishing + establish + 28767 + 28779 + VBG + O + + + a + a + 28780 + 28781 + DT + O + + + pro- + pro- + 28782 + 28786 + JJ + O + + + gram + gram + 28788 + 28792 + NN + O + + + , + , + 28792 + 28793 + , + O + + + under + under + 28794 + 28799 + IN + O + + + the + the + 28800 + 28803 + DT + O + + + auspices + auspices + 28804 + 28812 + NNS + O + + + of + of + 28813 + 28815 + IN + O + + + our + we + 28816 + 28819 + PRP$ + O + + + Agency + Agency + 28820 + 28826 + NNP + ORGANIZATION + + + for + for + 28827 + 28830 + IN + ORGANIZATION + + + International + International + 28832 + 28845 + NNP + ORGANIZATION + + + Development + Development + 28846 + 28857 + NNP + ORGANIZATION + + + , + , + 28857 + 28858 + , + O + + + to + to + 28859 + 28861 + TO + O + + + assist + assist + 28862 + 28868 + VB + O + + + coun + coun + 28869 + 28873 + SYM + O + + + - + - + 28873 + 28874 + : + O + + + tries + try + 28876 + 28881 + VBZ + O + + + in + in + 28882 + 28884 + IN + O + + + the + the + 28885 + 28888 + DT + O + + + protection + protection + 28889 + 28899 + NN + O + + + of + of + 28900 + 28902 + IN + O + + + biological + biological + 28903 + 28913 + JJ + O + + + diversi + diversus + 28914 + 28921 + NN + O + + + - + - + 28921 + 28922 + : + O + + + ty + ty + 28924 + 28926 + NN + O + + + . + . + 28926 + 28927 + . 
+ O + + + (ROOT (S (S (NP (DT This)) (, ,) (ADVP (RB too)) (, ,) (VP (VBZ is) (NP (NP (NP (DT an) (NN issue)) (PP (IN of) (NP (NN person)))) (: -) (NP (NNP al) (NN concern))))) (CC and) (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ proud) (S (VP (TO to) (VP (VB be) (NP (NP (DT the) (NN author)) (PP (IN of) (NP (NP (DT a) (NN provision)) (PP (IN of) (NP (NNP U.S.) (NN law))))) (VP (VBG establishing) (NP (DT a) (JJ pro-) (NN gram)))))))))) (, ,) (S (PP (IN under) (NP (NP (DT the) (NNS auspices)) (PP (IN of) (NP (NP (PRP$ our) (NNP Agency)) (PP (IN for) (NP (NNP International) (NNP Development))))))) (, ,) (S (VP (TO to) (VP (VB assist) (X (SYM coun))))) (: -) (VP (VBZ tries) (PP (IN in) (NP (NP (DT the) (NN protection)) (PP (IN of) (NP (JJ biological) (NN diversi) (: -) (NN ty))))))) (. .))) + + + ROOT + issue + + + issue + This + + + issue + too + + + issue + is + + + issue + an + + + issue + of + + + of + person + + + concern + al + + + issue + concern + + + issue + and + + + proud + I + + + proud + am + + + issue + proud + + + author + to + + + author + be + + + author + the + + + proud + author + + + author + of + + + provision + a + + + of + provision + + + provision + of + + + law + U.S. 
+ + + of + law + + + author + establishing + + + gram + a + + + gram + pro- + + + establishing + gram + + + tries + under + + + auspices + the + + + under + auspices + + + auspices + of + + + Agency + our + + + of + Agency + + + Agency + for + + + Development + International + + + for + Development + + + assist + to + + + tries + assist + + + assist + coun + + + issue + tries + + + tries + in + + + protection + the + + + in + protection + + + protection + of + + + ty + biological + + + ty + diversi + + + of + ty + + + + + ROOT + issue + + + issue + This + + + issue + too + + + issue + is + + + issue + an + + + issue + person + + + concern + al + + + issue + concern + + + proud + I + + + proud + am + + + issue + proud + + + author + to + + + author + be + + + author + the + + + proud + author + + + provision + a + + + author + provision + + + law + U.S. + + + provision + law + + + author + establishing + + + gram + a + + + gram + pro- + + + establishing + gram + + + auspices + the + + + tries + auspices + + + Agency + our + + + auspices + Agency + + + Development + International + + + Agency + Development + + + assist + to + + + tries + assist + + + assist + coun + + + issue + tries + + + protection + the + + + tries + protection + + + ty + biological + + + ty + diversi + + + protection + ty + + + + + ROOT + issue + + + issue + This + + + issue + too + + + issue + is + + + issue + an + + + issue + person + + + concern + al + + + issue + concern + + + proud + I + + + proud + am + + + issue + proud + + + author + to + + + author + be + + + author + the + + + proud + author + + + provision + a + + + author + provision + + + law + U.S. 
+ + + provision + law + + + author + establishing + + + gram + a + + + gram + pro- + + + establishing + gram + + + auspices + the + + + tries + auspices + + + Agency + our + + + auspices + Agency + + + Development + International + + + Agency + Development + + + assist + to + + + tries + assist + + + assist + coun + + + issue + tries + + + protection + the + + + tries + protection + + + ty + biological + + + ty + diversi + + + protection + ty + + + + + + + With + with + 28928 + 28932 + IN + O + + + the + the + 28933 + 28936 + DT + O + + + rate + rate + 28937 + 28941 + NN + O + + + of + of + 28942 + 28944 + IN + O + + + extinctions + extinction + 28945 + 28956 + NNS + O + + + rapidly + rapidly + 28957 + 28964 + RB + O + + + ac + ac + 28965 + 28967 + SYM + O + + + - + - + 28967 + 28968 + : + O + + + celerating + celerate + 28970 + 28980 + VBG + O + + + there + there + 28981 + 28986 + RB + O + + + can + can + 28987 + 28990 + MD + O + + + be + be + 28991 + 28993 + VB + O + + + no + no + 28994 + 28996 + DT + O + + + doubt + doubt + 28997 + 29002 + NN + O + + + of + of + 29003 + 29005 + IN + O + + + the + the + 29006 + 29009 + DT + O + + + seri + serus + 29010 + 29014 + NN + O + + + - + - + 29014 + 29015 + : + O + + + ousness + ousness + 29017 + 29024 + NN + O + + + of + of + 29025 + 29027 + IN + O + + + the + the + 29028 + 29031 + DT + O + + + problem + problem + 29032 + 29039 + NN + O + + + . + . + 29039 + 29040 + . + O + + + (ROOT (FRAG (PP (IN With) (NP (NP (NP (DT the) (NN rate)) (PP (IN of) (NP (NP (NNS extinctions)) (ADVP (RB rapidly)))) (X (SYM ac))) (: -) (S (S (VP (VBG celerating) (ADVP (RB there)))) (VP (MD can) (VP (VB be) (NP (NP (NP (DT no) (NN doubt)) (PP (IN of) (NP (DT the) (NN seri)))) (: -) (NP (NP (NN ousness)) (PP (IN of) (NP (DT the) (NN problem)))))))))) (. 
.))) + + + ROOT + With + + + rate + the + + + With + rate + + + rate + of + + + of + extinctions + + + extinctions + rapidly + + + rate + ac + + + doubt + celerating + + + celerating + there + + + doubt + can + + + doubt + be + + + doubt + no + + + rate + doubt + + + doubt + of + + + seri + the + + + of + seri + + + doubt + ousness + + + ousness + of + + + problem + the + + + of + problem + + + + + ROOT + With + + + rate + the + + + With + rate + + + rate + extinctions + + + extinctions + rapidly + + + rate + ac + + + doubt + celerating + + + celerating + there + + + doubt + can + + + doubt + be + + + doubt + no + + + rate + doubt + + + seri + the + + + doubt + seri + + + doubt + ousness + + + problem + the + + + ousness + problem + + + + + ROOT + With + + + rate + the + + + With + rate + + + rate + extinctions + + + extinctions + rapidly + + + rate + ac + + + doubt + celerating + + + celerating + there + + + doubt + can + + + doubt + be + + + doubt + no + + + rate + doubt + + + seri + the + + + doubt + seri + + + doubt + ousness + + + problem + the + + + ousness + problem + + + + + + + Here + here + 29041 + 29045 + RB + O + + + in + in + 29046 + 29048 + IN + O + + + the + the + 29049 + 29052 + DT + O + + + pres + pre + 29053 + 29057 + NNS + O + + + - + - + 29057 + 29058 + : + O + + + ence + ence + 29060 + 29064 + NN + O + + + of + of + 29065 + 29067 + IN + O + + + so + so + 29068 + 29070 + RB + O + + + many + many + 29071 + 29075 + JJ + O + + + spiritual + spiritual + 29076 + 29085 + JJ + O + + + leaders + leader + 29086 + 29093 + NNS + O + + + I + I + 29094 + 29095 + PRP + O + + + can + can + 29096 + 29099 + MD + O + + + only + only + 29100 + 29104 + RB + O + + + wonder + wonder + 29106 + 29112 + VB + O + + + how + how + 29113 + 29116 + WRB + O + + + the + the + 29117 + 29120 + DT + O + + + divine + divine + 29121 + 29127 + NN + O + + + must + must + 29128 + 29132 + MD + O + + + view + view + 29133 + 29137 + VB + O + + + the + the + 29138 + 29141 + DT + O + + + 
de + de + 29142 + 29144 + FW + O + + + - + - + 29144 + 29145 + : + O + + + struction + struction + 29147 + 29156 + NN + O + + + of + of + 29157 + 29159 + IN + O + + + so + so + 29160 + 29162 + RB + O + + + many + many + 29163 + 29167 + JJ + O + + + of + of + 29168 + 29170 + IN + O + + + His + he + 29171 + 29174 + PRP$ + O + + + creations + creation + 29175 + 29184 + NNS + O + + + . + . + 29184 + 29185 + . + O + + + (ROOT (S (ADVP (RB Here)) (PP (IN in) (NP (NP (DT the) (NNS pres)) (: -) (NP (NP (NN ence)) (PP (IN of) (NP (ADJP (RB so) (JJ many)) (JJ spiritual) (NNS leaders)))))) (NP (PRP I)) (VP (MD can) (ADVP (RB only)) (VP (VB wonder) (SBAR (WHADVP (WRB how)) (S (NP (DT the) (NN divine)) (VP (MD must) (VP (VB view) (NP (NP (DT the) (FW de)) (: -) (NP (NP (NN struction)) (PP (IN of) (NP (NP (RB so) (JJ many)) (PP (IN of) (NP (PRP$ His) (NNS creations))))))))))))) (. .))) + + + ROOT + wonder + + + wonder + Here + + + wonder + in + + + pres + the + + + in + pres + + + pres + ence + + + ence + of + + + many + so + + + leaders + many + + + leaders + spiritual + + + of + leaders + + + wonder + I + + + wonder + can + + + wonder + only + + + view + how + + + divine + the + + + view + divine + + + view + must + + + wonder + view + + + de + the + + + view + de + + + de + struction + + + struction + of + + + many + so + + + of + many + + + many + of + + + creations + His + + + of + creations + + + + + ROOT + wonder + + + wonder + Here + + + pres + the + + + wonder + pres + + + pres + ence + + + many + so + + + leaders + many + + + leaders + spiritual + + + ence + leaders + + + wonder + I + + + wonder + can + + + wonder + only + + + view + how + + + divine + the + + + view + divine + + + view + must + + + wonder + view + + + de + the + + + view + de + + + de + struction + + + many + so + + + struction + many + + + creations + His + + + many + creations + + + + + ROOT + wonder + + + wonder + Here + + + pres + the + + + wonder + pres + + + pres + ence + + + many + so + + + 
leaders + many + + + leaders + spiritual + + + ence + leaders + + + wonder + I + + + wonder + can + + + wonder + only + + + view + how + + + divine + the + + + view + divine + + + view + must + + + wonder + view + + + de + the + + + view + de + + + de + struction + + + many + so + + + struction + many + + + creations + His + + + many + creations + + + + + + + And + and + 29186 + 29189 + CC + O + + + I + I + 29190 + 29191 + PRP + O + + + wonder + wonder + 29193 + 29199 + VBP + O + + + what + what + 29200 + 29204 + WP + O + + + He + he + 29205 + 29207 + PRP + O + + + must + must + 29208 + 29212 + MD + O + + + think + think + 29213 + 29218 + VB + O + + + of + of + 29219 + 29221 + IN + O + + + the + the + 29222 + 29225 + DT + O + + + cavalier + cavalier + 29226 + 29234 + JJ + O + + + manner + manner + 29236 + 29242 + NN + O + + + In + in + 29243 + 29245 + IN + O + + + which + which + 29246 + 29251 + WDT + O + + + these + these + 29252 + 29257 + DT + O + + + extinctions + extinction + 29258 + 29269 + NNS + O + + + are + be + 29270 + 29273 + VBP + O + + + being + be + 29274 + 29279 + VBG + O + + + carried + carry + 29281 + 29288 + VBD + O + + + out-elephants + out-elephants + 29289 + 29302 + JJ + O + + + and + and + 29303 + 29306 + CC + O + + + rhinos + rhino + 29307 + 29313 + NNS + O + + + destroyed + destroy + 29314 + 29323 + VBN + O + + + for + for + 29325 + 29328 + IN + O + + + ivory + ivory + 29329 + 29334 + NN + O + + + trinkets + trinket + 29335 + 29343 + NNS + O + + + and + and + 29344 + 29347 + CC + O + + + aphrodisiac + aphrodisiac + 29348 + 29359 + NN + O + + + powder + powder + 29360 + 29366 + NN + O + + + , + , + 29366 + 29367 + , + O + + + or + or + 29369 + 29371 + CC + O + + + perhaps + perhaps + 29372 + 29379 + RB + O + + + worse + worse + 29380 + 29385 + JJR + O + + + , + , + 29385 + 29386 + , + O + + + entire + entire + 29387 + 29393 + JJ + O + + + species + species + 29394 + 29401 + NNS + O + + + obliterated + obliterate + 29402 + 29413 + VBD + O + + + 
without + without + 29415 + 29422 + IN + O + + + man + man + 29423 + 29426 + NN + O + + + even + even + 29427 + 29431 + RB + O + + + knowing + know + 29432 + 29439 + VBG + O + + + what + what + 29440 + 29444 + WP + O + + + was + be + 29445 + 29448 + VBD + O + + + once + once + 29449 + 29453 + RB + DATE + PAST_REF + PAST_REF + + + there + there + 29455 + 29460 + RB + O + + + . + . + 29460 + 29461 + . + O + + + (ROOT (S (CC And) (S (NP (PRP I)) (VP (VBP wonder) (SBAR (WHNP (WP what)) (S (NP (PRP He)) (VP (MD must) (VP (VB think) (PP (IN of) (NP (NP (NP (DT the) (JJ cavalier) (NN manner)) (SBAR (WHPP (IN In) (WHNP (WDT which))) (S (NP (DT these) (NNS extinctions)) (VP (VBP are) (VP (VBG being) (VP (VBD carried) (S (ADJP (JJ out-elephants))))))))) (CC and) (NP (NP (NNS rhinos)) (VP (VBN destroyed) (PP (IN for) (NP (NN ivory) (NNS trinkets))))))))))))) (CC and) (S (NP (NP (NP (NN aphrodisiac) (NN powder))) (, ,) (CC or) (NP (ADVP (RB perhaps)) (ADJP (JJR worse)))) (, ,) (NP (JJ entire) (NNS species)) (VP (VBD obliterated) (PP (IN without) (NP (NN man))) (S (VP (ADVP (RB even)) (VBG knowing) (SBAR (WHNP (WP what)) (S (VP (VBD was) (ADVP (RB once) (RB there))))))))) (. 
.))) + + + ROOT + wonder + + + wonder + And + + + wonder + I + + + think + what + + + think + He + + + think + must + + + wonder + think + + + think + of + + + manner + the + + + manner + cavalier + + + of + manner + + + carried + In + + + In + which + + + extinctions + these + + + carried + extinctions + + + carried + are + + + carried + being + + + manner + carried + + + carried + out-elephants + + + manner + and + + + manner + rhinos + + + rhinos + destroyed + + + destroyed + for + + + trinkets + ivory + + + for + trinkets + + + wonder + and + + + powder + aphrodisiac + + + obliterated + powder + + + powder + or + + + worse + perhaps + + + powder + worse + + + species + entire + + + obliterated + species + + + wonder + obliterated + + + obliterated + without + + + without + man + + + knowing + even + + + obliterated + knowing + + + was + what + + + knowing + was + + + there + once + + + was + there + + + + + ROOT + wonder + + + wonder + I + + + think + what + + + think + He + + + think + must + + + wonder + think + + + manner + the + + + manner + cavalier + + + think + manner + + + carried + which + + + extinctions + these + + + carried + extinctions + + + carried + are + + + carried + being + + + manner + carried + + + carried + out-elephants + + + manner + rhinos + + + rhinos + destroyed + + + trinkets + ivory + + + destroyed + trinkets + + + powder + aphrodisiac + + + obliterated + powder + + + worse + perhaps + + + powder + worse + + + species + entire + + + obliterated + species + + + wonder + obliterated + + + obliterated + man + + + knowing + even + + + obliterated + knowing + + + was + what + + + knowing + was + + + there + once + + + was + there + + + + + ROOT + wonder + + + wonder + I + + + think + what + + + think + He + + + think + must + + + wonder + think + + + manner + the + + + manner + cavalier + + + think + manner + + + carried + which + + + extinctions + these + + + carried + extinctions + + + carried + are + + + carried + being + + + manner + 
carried + + + carried + out-elephants + + + think + rhinos + + + manner + rhinos + + + rhinos + destroyed + + + trinkets + ivory + + + destroyed + trinkets + + + powder + aphrodisiac + + + obliterated + powder + + + worse + perhaps + + + powder + worse + + + obliterated + worse + + + species + entire + + + obliterated + species + + + wonder + obliterated + + + obliterated + man + + + knowing + even + + + obliterated + knowing + + + was + what + + + knowing + was + + + there + once + + + was + there + + + + + + + A + a + 29465 + 29466 + DT + O + + + treaty + treaty + 29467 + 29473 + NN + O + + + to + to + 29474 + 29476 + TO + O + + + conserve + conserve + 29477 + 29485 + VB + O + + + biological + biological + 29486 + 29496 + JJ + O + + + diversity + diversity + 29497 + 29506 + NN + O + + + should + should + 29508 + 29514 + MD + O + + + include + include + 29515 + 29522 + VB + O + + + provisions + provision + 29523 + 29533 + NNS + O + + + under + under + 29534 + 29539 + IN + O + + + which + which + 29540 + 29545 + WDT + O + + + coun + coun + 29546 + 29550 + SYM + O + + + - + - + 29550 + 29551 + : + O + + + tries + try + 29553 + 29558 + VBZ + O + + + would + would + 29559 + 29564 + MD + O + + + register + register + 29565 + 29573 + VB + O + + + species-rich + species-rich + 29574 + 29586 + JJ + O + + + habitats + habitat + 29587 + 29595 + NNS + O + + + , + , + 29595 + 29596 + , + O + + + and + and + 29598 + 29601 + CC + O + + + in + in + 29602 + 29604 + IN + O + + + particular + particular + 29605 + 29615 + JJ + O + + + , + , + 29615 + 29616 + , + O + + + the + the + 29617 + 29620 + DT + O + + + habitats + habitat + 29621 + 29629 + NNS + O + + + of + of + 29630 + 29632 + IN + O + + + endan + endan + 29633 + 29638 + NN + O + + + - + - + 29638 + 29639 + : + O + + + gered + gered + 29641 + 29646 + JJ + O + + + species + species + 29647 + 29654 + NNS + O + + + . + . + 29654 + 29655 + . 
+ O + + + (ROOT (S (NP (DT A) (NN treaty) (S (VP (TO to) (VP (VB conserve) (NP (JJ biological) (NN diversity)))))) (VP (MD should) (VP (VB include) (NP (NP (NNS provisions)) (SBAR (WHPP (IN under) (WHNP (WDT which))) (FRAG (X (SYM coun)) (: -) (VP (VBZ tries) (SBAR (S (VP (VP (MD would) (VP (VB register) (NP (JJ species-rich) (NNS habitats)))) (, ,) (CC and) (VP (PP (IN in) (ADJP (JJ particular))) (, ,) (NP (NP (NP (DT the) (NNS habitats)) (PP (IN of) (NP (NN endan)))) (: -) (NP (JJ gered) (NNS species)))))))) (. .)))))))) + + + ROOT + include + + + treaty + A + + + include + treaty + + + conserve + to + + + treaty + conserve + + + diversity + biological + + + conserve + diversity + + + include + should + + + include + provisions + + + tries + under + + + under + which + + + tries + coun + + + provisions + tries + + + register + would + + + tries + register + + + habitats + species-rich + + + register + habitats + + + register + and + + + habitats + in + + + in + particular + + + habitats + the + + + register + habitats + + + habitats + of + + + of + endan + + + species + gered + + + habitats + species + + + + + ROOT + include + + + treaty + A + + + include + treaty + + + conserve + to + + + treaty + conserve + + + diversity + biological + + + conserve + diversity + + + include + should + + + include + provisions + + + tries + which + + + tries + coun + + + provisions + tries + + + register + would + + + tries + register + + + habitats + species-rich + + + register + habitats + + + habitats + particular + + + habitats + the + + + register + habitats + + + habitats + endan + + + species + gered + + + habitats + species + + + + + ROOT + include + + + treaty + A + + + include + treaty + + + conserve + to + + + treaty + conserve + + + diversity + biological + + + conserve + diversity + + + include + should + + + include + provisions + + + tries + which + + + tries + coun + + + provisions + tries + + + register + would + + + tries + register + + + habitats + 
species-rich + + + register + habitats + + + habitats + particular + + + habitats + the + + + tries + habitats + + + register + habitats + + + habitats + endan + + + species + gered + + + habitats + species + + + + + + + Registration + registration + 29656 + 29668 + NN + O + + + of + of + 29669 + 29671 + IN + O + + + the + the + 29672 + 29675 + DT + O + + + habitat + habitat + 29676 + 29683 + NN + O + + + would + would + 29685 + 29690 + MD + O + + + include + include + 29691 + 29698 + VB + O + + + an + a + 29699 + 29701 + DT + O + + + obligation + obligation + 29702 + 29712 + NN + O + + + to + to + 29713 + 29715 + TO + O + + + protect + protect + 29716 + 29723 + VB + O + + + the + the + 29724 + 29727 + DT + O + + + habitat + habitat + 29729 + 29736 + NN + O + + + , + , + 29736 + 29737 + , + O + + + and + and + 29738 + 29741 + CC + O + + + the + the + 29742 + 29745 + DT + O + + + species + species + 29746 + 29753 + NNS + O + + + contained + contain + 29754 + 29763 + VBD + O + + + therein + therein + 29764 + 29771 + RB + O + + + . + . + 29771 + 29772 + . + O + + + (ROOT (S (S (NP (NP (NN Registration)) (PP (IN of) (NP (DT the) (NN habitat)))) (VP (MD would) (VP (VB include) (NP (DT an) (NN obligation) (S (VP (TO to) (VP (VB protect) (NP (DT the) (NN habitat))))))))) (, ,) (CC and) (S (NP (DT the) (NNS species)) (VP (VBD contained) (ADVP (RB therein)))) (. 
.))) + + + ROOT + include + + + include + Registration + + + Registration + of + + + habitat + the + + + of + habitat + + + include + would + + + obligation + an + + + include + obligation + + + protect + to + + + obligation + protect + + + habitat + the + + + protect + habitat + + + include + and + + + species + the + + + contained + species + + + include + contained + + + contained + therein + + + + + ROOT + include + + + include + Registration + + + habitat + the + + + Registration + habitat + + + include + would + + + obligation + an + + + include + obligation + + + protect + to + + + obligation + protect + + + habitat + the + + + protect + habitat + + + species + the + + + contained + species + + + include + contained + + + contained + therein + + + + + ROOT + include + + + include + Registration + + + habitat + the + + + Registration + habitat + + + include + would + + + obligation + an + + + include + obligation + + + protect + to + + + obligation + protect + + + habitat + the + + + protect + habitat + + + species + the + + + contained + species + + + include + contained + + + contained + therein + + + + + + + In + in + 29774 + 29776 + IN + O + + + my + my + 29777 + 29779 + PRP$ + O + + + view + view + 29780 + 29784 + NN + O + + + , + , + 29784 + 29785 + , + O + + + a + a + 29786 + 29787 + DT + O + + + treaty + treaty + 29788 + 29794 + NN + O + + + should + should + 29795 + 29801 + MD + O + + + spell + spell + 29802 + 29807 + VB + O + + + out + out + 29808 + 29811 + RP + O + + + mini + mini + 29812 + 29816 + SYM + O + + + - + - + 29816 + 29817 + : + O + + + mum + mum + 29819 + 29822 + NN + O + + + standards + standard + 29823 + 29832 + NNS + O + + + for + for + 29833 + 29836 + IN + O + + + habitat + habitat + 29837 + 29844 + NN + O + + + and + and + 29845 + 29848 + CC + O + + + species + species + 29849 + 29856 + NNS + O + + + pro- + pro- + 29857 + 29861 + JJ + O + + + tection + tection + 29863 + 29870 + NN + O + + + . + . + 29870 + 29871 + . 
+ O + + + (ROOT (S (PP (IN In) (NP (PRP$ my) (NN view))) (, ,) (NP (DT a) (NN treaty)) (VP (MD should) (VP (VB spell) (PRT (RP out)) (FRAG (PP (X (SYM mini)) (: -) (PP (NP (NN mum) (NNS standards)) (IN for) (NP (NP (NN habitat)) (CC and) (NP (NP (NNS species)) (NP (JJ pro-) (NN tection))))) (. .))))))) + + + ROOT + spell + + + spell + In + + + view + my + + + In + view + + + treaty + a + + + spell + treaty + + + spell + should + + + spell + out + + + for + mini + + + standards + mum + + + for + standards + + + spell + for + + + for + habitat + + + habitat + and + + + habitat + species + + + tection + pro- + + + species + tection + + + + + ROOT + spell + + + view + my + + + spell + view + + + treaty + a + + + spell + treaty + + + spell + should + + + spell + out + + + for + mini + + + standards + mum + + + for + standards + + + spell + for + + + for + habitat + + + habitat + species + + + tection + pro- + + + species + tection + + + + + ROOT + spell + + + view + my + + + spell + view + + + treaty + a + + + spell + treaty + + + spell + should + + + spell + out + + + for + mini + + + standards + mum + + + for + standards + + + spell + for + + + for + habitat + + + for + species + + + habitat + species + + + tection + pro- + + + species + tection + + + + + + + In + in + 29872 + 29874 + IN + O + + + return + return + 29875 + 29881 + NN + O + + + for + for + 29882 + 29885 + IN + O + + + protecting + protect + 29886 + 29896 + VBG + O + + + these + these + 29897 + 29902 + DT + O + + + habi + habus + 29903 + 29907 + NNS + O + + + - + - + 29907 + 29908 + : + O + + + tats + tat + 29910 + 29914 + NNS + O + + + , + , + 29914 + 29915 + , + O + + + the + the + 29916 + 29919 + DT + O + + + registering + register + 29920 + 29931 + VBG + O + + + countries + country + 29932 + 29941 + NNS + O + + + should + should + 29942 + 29948 + MD + O + + + receive + receive + 29949 + 29956 + VB + O + + + technical + technical + 29958 + 29967 + JJ + O + + + assistance + assistance + 29968 + 29978 
+ NN + O + + + for + for + 29979 + 29982 + IN + O + + + their + they + 29983 + 29988 + PRP$ + O + + + protective + protective + 29989 + 29999 + JJ + O + + + ac + ac + 30000 + 30002 + NN + O + + + - + - + 30002 + 30003 + : + O + + + tivities + tivity + 30005 + 30013 + NNS + O + + + and + and + 30014 + 30017 + CC + O + + + perhaps + perhaps + 30018 + 30025 + RB + O + + + a + a + 30026 + 30027 + DT + O + + + priority + priority + 30028 + 30036 + NN + O + + + for + for + 30037 + 30040 + IN + O + + + other + other + 30041 + 30046 + JJ + O + + + kinds + kind + 30048 + 30053 + NNS + O + + + of + of + 30054 + 30056 + IN + O + + + assistance + assistance + 30057 + 30067 + NN + O + + + Intended + intend + 30068 + 30076 + VBN + O + + + to + to + 30077 + 30079 + TO + O + + + encourage + encourage + 30080 + 30089 + VB + O + + + January + January + 30097 + 30104 + NNP + DATE + 1990-01-23 + 1990-01-23 + + + 23 + 23 + 30105 + 30107 + CD + DATE + 1990-01-23 + 1990-01-23 + + + , + , + 30107 + 30108 + , + DATE + 1990-01-23 + 1990-01-23 + + + 1990 + 1990 + 30109 + 30113 + CD + DATE + 1990-01-23 + 1990-01-23 + + + CO + CO + 30131 + 30133 + NNP + O + + + local + local + 30135 + 30140 + JJ + O + + + peoples + people + 30141 + 30148 + NNS + O + + + to + to + 30149 + 30151 + TO + O + + + value + value + 30152 + 30157 + VB + O + + + the + the + 30158 + 30161 + DT + O + + + preserved + preserved + 30162 + 30171 + JJ + O + + + life + life + 30172 + 30176 + NN + O + + + re + re + 30177 + 30179 + SYM + O + + + - + - + 30179 + 30180 + : + O + + + sources + source + 30182 + 30189 + NNS + O + + + . + . + 30189 + 30190 + . 
+ O + + + (ROOT (S (PP (IN In) (NP (NP (NN return)) (PP (IN for) (S (VP (VBG protecting) (NP (NP (DT these) (NNS habi)) (: -) (NP (NNS tats)))))))) (, ,) (NP (DT the) (VBG registering) (NNS countries)) (VP (MD should) (VP (VB receive) (NP (NP (NP (JJ technical) (NN assistance)) (PP (IN for) (NP (PRP$ their) (JJ protective) (NN ac)))) (: -) (NP (NP (NNS tivities)) (CC and) (NP (RB perhaps) (DT a) (NN priority)))) (PP (IN for) (NP (NP (JJ other) (NNS kinds)) (PP (IN of) (NP (NP (NN assistance)) (VP (VBN Intended) (S (VP (TO to) (VP (VB encourage) (NP-TMP (NNP January) (CD 23) (, ,) (CD 1990)) (NP (NP (NNP CO) (JJ local) (NNS peoples)) (SBAR (S (VP (TO to) (VP (VB value) (NP (DT the) (JJ preserved) (NN life)) (FRAG (X (SYM re)) (: -) (NP (NNS sources)))))))))))))))))) (. .))) + + + ROOT + receive + + + receive + In + + + In + return + + + return + for + + + for + protecting + + + habi + these + + + protecting + habi + + + habi + tats + + + countries + the + + + countries + registering + + + receive + countries + + + receive + should + + + assistance + technical + + + receive + assistance + + + assistance + for + + + ac + their + + + ac + protective + + + for + ac + + + assistance + tivities + + + tivities + and + + + priority + perhaps + + + priority + a + + + tivities + priority + + + receive + for + + + kinds + other + + + for + kinds + + + kinds + of + + + of + assistance + + + assistance + Intended + + + encourage + to + + + Intended + encourage + + + encourage + January + + + January + 23 + + + January + 1990 + + + peoples + CO + + + peoples + local + + + encourage + peoples + + + value + to + + + peoples + value + + + life + the + + + life + preserved + + + value + life + + + sources + re + + + value + sources + + + + + ROOT + receive + + + receive + return + + + return + protecting + + + habi + these + + + protecting + habi + + + habi + tats + + + countries + the + + + countries + registering + + + receive + countries + + + receive + should + + + assistance + 
technical + + + receive + assistance + + + ac + their + + + ac + protective + + + assistance + ac + + + assistance + tivities + + + priority + perhaps + + + priority + a + + + tivities + priority + + + kinds + other + + + receive + kinds + + + kinds + assistance + + + assistance + Intended + + + encourage + to + + + Intended + encourage + + + encourage + January + + + January + 23 + + + January + 1990 + + + peoples + CO + + + peoples + local + + + encourage + peoples + + + value + to + + + peoples + value + + + life + the + + + life + preserved + + + value + life + + + sources + re + + + value + sources + + + + + ROOT + receive + + + receive + return + + + return + protecting + + + habi + these + + + protecting + habi + + + habi + tats + + + countries + the + + + countries + registering + + + receive + countries + + + receive + should + + + assistance + technical + + + receive + assistance + + + ac + their + + + ac + protective + + + assistance + ac + + + assistance + tivities + + + priority + perhaps + + + priority + a + + + assistance + priority + + + tivities + priority + + + kinds + other + + + receive + kinds + + + kinds + assistance + + + assistance + Intended + + + encourage + to + + + Intended + encourage + + + encourage + January + + + January + 23 + + + January + 1990 + + + peoples + CO + + + peoples + local + + + encourage + peoples + + + value + to + + + peoples + value + + + life + the + + + life + preserved + + + value + life + + + sources + re + + + value + sources + + + + + + + Finally + finally + 30194 + 30201 + RB + O + + + , + , + 30201 + 30202 + , + O + + + I + I + 30203 + 30204 + PRP + O + + + would + would + 30205 + 30210 + MD + O + + + note + note + 30211 + 30215 + VB + O + + + that + that + 30216 + 30220 + IN + O + + + the + the + 30221 + 30224 + DT + DURATION + P15Y + + + last + last + 30225 + 30229 + JJ + DATE + P15Y + P15Y + + + 15 + 15 + 30230 + 30232 + CD + DATE + P15Y + P15Y + + + years + year + 30236 + 30241 + NNS + DATE + P15Y + P15Y 
+ + + has + have + 30242 + 30245 + VBZ + O + + + seen + see + 30246 + 30250 + VBN + O + + + an + a + 30251 + 30253 + DT + O + + + enormous + enormous + 30254 + 30262 + JJ + O + + + explosion + explosion + 30263 + 30272 + NN + O + + + in + in + 30273 + 30275 + IN + O + + + the + the + 30276 + 30279 + DT + O + + + number + number + 30281 + 30287 + NN + O + + + and + and + 30288 + 30291 + CC + O + + + scope + scope + 30292 + 30297 + NN + O + + + of + of + 30298 + 30300 + IN + O + + + international + international + 30301 + 30314 + JJ + O + + + legal + legal + 30315 + 30320 + JJ + O + + + agreements + agreement + 30322 + 30332 + NNS + O + + + relating + relate + 30333 + 30341 + VBG + O + + + to + to + 30342 + 30344 + TO + O + + + the + the + 30345 + 30348 + DT + O + + + environment + environment + 30349 + 30360 + NN + O + + + . + . + 30360 + 30361 + . + O + + + (ROOT (S (ADVP (RB Finally)) (, ,) (NP (PRP I)) (VP (MD would) (VP (VB note) (SBAR (IN that) (S (NP (DT the) (JJ last) (CD 15) (NNS years)) (VP (VBZ has) (VP (VBN seen) (NP (DT an) (JJ enormous) (NN explosion)) (PP (IN in) (NP (NP (DT the) (NN number) (CC and) (NN scope)) (PP (IN of) (NP (JJ international) (JJ legal) (NNS agreements))))) (VP (VBG relating) (PP (TO to) (NP (DT the) (NN environment)))))))))) (. 
.))) + + + ROOT + note + + + note + Finally + + + note + I + + + note + would + + + seen + that + + + years + the + + + years + last + + + years + 15 + + + seen + years + + + seen + has + + + note + seen + + + explosion + an + + + explosion + enormous + + + seen + explosion + + + seen + in + + + number + the + + + in + number + + + number + and + + + number + scope + + + number + of + + + agreements + international + + + agreements + legal + + + of + agreements + + + seen + relating + + + relating + to + + + environment + the + + + to + environment + + + + + ROOT + note + + + note + Finally + + + note + I + + + note + would + + + seen + that + + + years + the + + + years + last + + + years + 15 + + + seen + years + + + seen + has + + + note + seen + + + explosion + an + + + explosion + enormous + + + seen + explosion + + + number + the + + + seen + number + + + number + scope + + + agreements + international + + + agreements + legal + + + number + agreements + + + seen + relating + + + environment + the + + + relating + environment + + + + + ROOT + note + + + note + Finally + + + note + I + + + note + would + + + seen + that + + + years + the + + + years + last + + + years + 15 + + + seen + years + + + seen + has + + + note + seen + + + explosion + an + + + explosion + enormous + + + seen + explosion + + + number + the + + + seen + number + + + seen + scope + + + number + scope + + + agreements + international + + + agreements + legal + + + number + agreements + + + seen + relating + + + environment + the + + + relating + environment + + + + + + + The + the + 30363 + 30366 + DT + O + + + development + development + 30367 + 30378 + NN + O + + + of + of + 30379 + 30381 + IN + O + + + international + international + 30382 + 30395 + JJ + O + + + environ + environ + 30396 + 30403 + NN + O + + + - + - + 30403 + 30404 + : + O + + + mental + mental + 30406 + 30412 + JJ + O + + + law + law + 30413 + 30416 + NN + O + + + is + be + 30417 + 30419 + VBZ + O + + + a + a + 30420 
+ 30421 + DT + O + + + low + low + 30422 + 30425 + JJ + O + + + cost + cost + 30426 + 30430 + NN + O + + + and + and + 30431 + 30434 + CC + O + + + highly + highly + 30435 + 30441 + RB + O + + + benefi + benefi + 30442 + 30448 + FW + O + + + - + - + 30448 + 30449 + : + O + + + cial + cial + 30451 + 30455 + JJ + O + + + way + way + 30456 + 30459 + NN + O + + + of + of + 30460 + 30462 + IN + O + + + protecting + protect + 30463 + 30473 + VBG + O + + + global + global + 30474 + 30480 + JJ + O + + + environment + environment + 30481 + 30492 + NN + O + + + and + and + 30494 + 30497 + CC + O + + + of + of + 30498 + 30500 + IN + O + + + enhancing + enhance + 30501 + 30510 + VBG + O + + + global + global + 30511 + 30517 + JJ + O + + + environmental + environmental + 30518 + 30531 + JJ + O + + + co + co + 30532 + 30534 + NN + O + + + - + - + 30534 + 30535 + : + O + + + operation + operation + 30537 + 30546 + NN + O + + + . + . + 30546 + 30547 + . + O + + + (ROOT (FRAG (NP (NP (DT The) (NN development)) (PP (IN of) (NP (JJ international) (NN environ)))) (: -) (S (NP (JJ mental) (NN law)) (VP (VBZ is) (NP (NP (DT a) (JJ low) (NN cost)) (CC and) (RB highly) (NP (FW benefi)) (: -) (NP (NP (JJ cial) (NN way)) (PP (PP (IN of) (S (VP (VBG protecting) (NP (JJ global) (NN environment))))) (CC and) (PP (IN of) (S (VP (VBG enhancing) (NP (NP (JJ global) (JJ environmental) (NN co)) (: -) (NP (NN operation))))))))))) (. 
.))) + + + ROOT + development + + + development + The + + + development + of + + + environ + international + + + of + environ + + + law + mental + + + cost + law + + + cost + is + + + cost + a + + + cost + low + + + development + cost + + + cost + and + + + cost + highly + + + cost + benefi + + + way + cial + + + cost + way + + + way + of + + + of + protecting + + + environment + global + + + protecting + environment + + + of + and + + + of + of + + + of + enhancing + + + co + global + + + co + environmental + + + enhancing + co + + + co + operation + + + + + ROOT + development + + + development + The + + + environ + international + + + development + environ + + + law + mental + + + cost + law + + + cost + is + + + cost + a + + + cost + low + + + development + cost + + + cost + highly + + + cost + benefi + + + way + cial + + + cost + way + + + way + protecting + + + environment + global + + + protecting + environment + + + protecting + enhancing + + + co + global + + + co + environmental + + + enhancing + co + + + co + operation + + + + + ROOT + development + + + development + The + + + environ + international + + + development + environ + + + law + mental + + + cost + law + + + cost + is + + + cost + a + + + cost + low + + + development + cost + + + cost + highly + + + cost + benefi + + + way + cial + + + development + way + + + cost + way + + + way + protecting + + + environment + global + + + protecting + environment + + + way + enhancing + + + protecting + enhancing + + + co + global + + + co + environmental + + + enhancing + co + + + co + operation + + + + + + + This + this + 30548 + 30552 + DT + O + + + Is + be + 30553 + 30555 + VBZ + O + + + a + a + 30556 + 30557 + DT + O + + + trend + trend + 30558 + 30563 + NN + O + + + we + we + 30564 + 30566 + PRP + O + + + must + must + 30567 + 30571 + MD + O + + + encor + encor + 30572 + 30578 + VB + O + + + - + - + 30578 + 30579 + : + O + + + age + age + 30581 + 30584 + NN + O + + + . + . + 30584 + 30585 + . 
+ O + + + (ROOT (S (NP (DT This)) (VP (VBZ Is) (NP (NP (DT a) (NN trend)) (SBAR (S (NP (PRP we)) (VP (MD must) (VP (VB encor) (: -) (NP (NN age)))))))) (. .))) + + + ROOT + trend + + + trend + This + + + trend + Is + + + trend + a + + + encor + we + + + encor + must + + + trend + encor + + + encor + age + + + + + ROOT + trend + + + trend + This + + + trend + Is + + + trend + a + + + encor + we + + + encor + must + + + trend + encor + + + encor + age + + + + + ROOT + trend + + + trend + This + + + trend + Is + + + trend + a + + + encor + we + + + encor + must + + + trend + encor + + + encor + age + + + + + + + I + I + 30586 + 30587 + PRP + O + + + would + would + 30588 + 30593 + MD + O + + + hope + hope + 30594 + 30598 + VB + O + + + that + that + 30599 + 30603 + IN + O + + + UNEP + UNEP + 30604 + 30608 + NNP + ORGANIZATION + + + 's + 's + 30608 + 30610 + POS + O + + + environmen + environman + 30611 + 30621 + NN + O + + + - + - + 30621 + 30622 + : + O + + + tal + tal + 30624 + 30627 + JJ + O + + + law + law + 30628 + 30631 + NN + O + + + unit + unit + 30632 + 30636 + NN + O + + + might + might + 30637 + 30642 + MD + O + + + become + become + 30643 + 30649 + VB + O + + + the + the + 30650 + 30653 + DT + O + + + nucleus + nucleus + 30654 + 30661 + NN + O + + + of + of + 30662 + 30664 + IN + O + + + a + a + 30665 + 30666 + DT + O + + + new + new + 30668 + 30671 + JJ + O + + + International + International + 30672 + 30685 + NNP + O + + + environmental + environmental + 30686 + 30699 + JJ + O + + + law + law + 30700 + 30703 + NN + O + + + insti + instus + 30704 + 30709 + NN + O + + + - + - + 30709 + 30710 + : + O + + + tute + tute + 30712 + 30716 + NN + O + + + . + . + 30716 + 30717 + . 
+ O + + + (ROOT (S (NP (PRP I)) (VP (MD would) (VP (VB hope) (SBAR (IN that) (S (NP (NP (NNP UNEP) (POS 's)) (NN environmen)) (: -) (NP (JJ tal) (NN law) (NN unit)) (VP (MD might) (VP (VB become) (NP (NP (NP (DT the) (NN nucleus)) (PP (IN of) (NP (DT a) (JJ new) (NNP International) (JJ environmental) (NN law) (NN insti)))) (: -) (NP (NN tute))))))))) (. .))) + + + ROOT + hope + + + hope + I + + + hope + would + + + nucleus + that + + + environmen + UNEP + + + UNEP + 's + + + nucleus + environmen + + + unit + tal + + + unit + law + + + nucleus + unit + + + nucleus + might + + + nucleus + become + + + nucleus + the + + + hope + nucleus + + + nucleus + of + + + insti + a + + + insti + new + + + insti + International + + + insti + environmental + + + insti + law + + + of + insti + + + nucleus + tute + + + + + ROOT + hope + + + hope + I + + + hope + would + + + nucleus + that + + + environmen + UNEP + + + nucleus + environmen + + + unit + tal + + + unit + law + + + nucleus + unit + + + nucleus + might + + + nucleus + become + + + nucleus + the + + + hope + nucleus + + + insti + a + + + insti + new + + + insti + International + + + insti + environmental + + + insti + law + + + nucleus + insti + + + nucleus + tute + + + + + ROOT + hope + + + hope + I + + + hope + would + + + nucleus + that + + + environmen + UNEP + + + nucleus + environmen + + + unit + tal + + + unit + law + + + nucleus + unit + + + nucleus + might + + + nucleus + become + + + nucleus + the + + + hope + nucleus + + + insti + a + + + insti + new + + + insti + International + + + insti + environmental + + + insti + law + + + nucleus + insti + + + nucleus + tute + + + + + + + Such + such + 30718 + 30722 + PDT + O + + + an + a + 30723 + 30725 + DT + O + + + institute + institute + 30726 + 30735 + NN + O + + + should + should + 30736 + 30742 + MD + O + + + draw + draw + 30743 + 30747 + VB + O + + + on + on + 30748 + 30750 + IN + O + + + the + the + 30751 + 30754 + DT + O + + + resources + resource + 30756 + 
30765 + NNS + O + + + of + of + 30766 + 30768 + IN + O + + + UNEP + UNEP + 30769 + 30773 + NNP + ORGANIZATION + + + members + member + 30774 + 30781 + NNS + O + + + , + , + 30781 + 30782 + , + O + + + and + and + 30783 + 30786 + CC + O + + + In + in + 30787 + 30789 + IN + O + + + particu + particu + 30790 + 30797 + NN + O + + + - + - + 30797 + 30798 + : + O + + + lar + lar + 30800 + 30803 + NN + O + + + those + those + 30804 + 30809 + DT + O + + + with + with + 30810 + 30814 + IN + O + + + more + more + 30815 + 30819 + JJR + O + + + developed + developed + 30820 + 30829 + JJ + O + + + domestic + domestic + 30830 + 30838 + JJ + O + + + en + en + 30839 + 30841 + IN + O + + + - + - + 30841 + 30842 + : + O + + + vironmental + vironmental + 30844 + 30855 + JJ + O + + + law + law + 30856 + 30859 + NN + O + + + . + . + 30859 + 30860 + . + O + + + (ROOT (S (NP (PDT Such) (DT an) (NN institute)) (VP (MD should) (VP (VB draw) (PP (IN on) (NP (DT the) (NP (NP (NP (NNS resources)) (PP (IN of) (NP (NNP UNEP) (NNS members)))) (, ,) (CC and) (PP (IN In) (NP (NN particu)) (: -))) (NN lar))) (NP-TMP (NP (DT those)) (PP (IN with) (NP (NP (JJR more)) (RRC (ADJP (JJ developed) (JJ domestic)) (ADVP (IN en))))) (: -) (NP (JJ vironmental) (NN law))))) (. 
.))) + + + ROOT + draw + + + institute + Such + + + institute + an + + + draw + institute + + + draw + should + + + draw + on + + + lar + the + + + lar + resources + + + resources + of + + + members + UNEP + + + of + members + + + resources + and + + + resources + In + + + In + particu + + + on + lar + + + draw + those + + + those + with + + + with + more + + + domestic + developed + + + en + domestic + + + more + en + + + law + vironmental + + + those + law + + + + + ROOT + draw + + + institute + Such + + + institute + an + + + draw + institute + + + draw + should + + + lar + the + + + lar + resources + + + members + UNEP + + + resources + members + + + resources + In + + + In + particu + + + draw + lar + + + draw + those + + + those + more + + + domestic + developed + + + en + domestic + + + more + en + + + law + vironmental + + + those + law + + + + + ROOT + draw + + + institute + Such + + + institute + an + + + draw + institute + + + draw + should + + + lar + the + + + lar + resources + + + members + UNEP + + + resources + members + + + resources + In + + + lar + In + + + In + particu + + + draw + lar + + + draw + those + + + those + more + + + domestic + developed + + + en + domestic + + + more + en + + + law + vironmental + + + those + law + + + + + + + I + I + 30861 + 30862 + PRP + O + + + would + would + 30863 + 30868 + MD + O + + + hope + hope + 30869 + 30873 + VB + O + + + these + these + 30874 + 30879 + DT + O + + + states + state + 30880 + 30886 + NNS + O + + + might + might + 30888 + 30893 + MD + O + + + secund + secund + 30894 + 30900 + VB + O + + + lawyers + lawyer + 30901 + 30908 + NNS + O + + + to + to + 30909 + 30911 + TO + O + + + the + the + 30912 + 30915 + DT + O + + + international + international + 30916 + 30929 + JJ + O + + + environmental + environmental + 30931 + 30944 + JJ + O + + + law + law + 30945 + 30948 + NN + O + + + institute + institute + 30949 + 30958 + NN + O + + + both + both + 30959 + 30963 + CC + O + + + for + for + 30964 + 
30967 + IN + O + + + the + the + 30968 + 30971 + DT + O + + + purpose + purpose + 30973 + 30980 + NN + O + + + of + of + 30981 + 30983 + IN + O + + + developing + develop + 30984 + 30994 + VBG + O + + + further + further + 30995 + 31002 + JJ + O + + + international + international + 31003 + 31016 + JJ + O + + + environmental + environmental + 31018 + 31031 + JJ + O + + + law + law + 31032 + 31035 + NN + O + + + and + and + 31036 + 31039 + CC + O + + + to + to + 31040 + 31042 + TO + O + + + assist + assist + 31043 + 31049 + VB + O + + + countries + country + 31050 + 31059 + NNS + O + + + in + in + 31060 + 31062 + IN + O + + + the + the + 31064 + 31067 + DT + O + + + development + development + 31068 + 31079 + NN + O + + + of + of + 31080 + 31082 + IN + O + + + domestic + domestic + 31083 + 31091 + JJ + O + + + environmental + environmental + 31092 + 31105 + JJ + O + + + law + law + 31107 + 31110 + NN + O + + + . + . + 31110 + 31111 + . + O + + + (ROOT (S (NP (PRP I)) (VP (MD would) (VP (VB hope) (SBAR (S (NP (DT these) (NNS states)) (VP (MD might) (VP (VB secund) (NP (NP (NNS lawyers)) (PP (PP (TO to) (NP (DT the) (JJ international) (JJ environmental) (NN law) (NN institute))) (CC both) (PP (IN for) (NP (DT the) (NN purpose)))) (PP (IN of) (S (VP (VBG developing) (NP (NP (JJ further) (JJ international) (JJ environmental) (NN law)) (CC and) (S (VP (TO to) (VP (VB assist) (NP (NNS countries)) (PP (IN in) (NP (NP (DT the) (NN development)) (PP (IN of) (NP (JJ domestic) (JJ environmental) (NN law))))))))))))))))))) (. 
.))) + + + ROOT + hope + + + hope + I + + + hope + would + + + states + these + + + secund + states + + + secund + might + + + hope + secund + + + secund + lawyers + + + lawyers + to + + + institute + the + + + institute + international + + + institute + environmental + + + institute + law + + + to + institute + + + to + both + + + to + for + + + purpose + the + + + for + purpose + + + lawyers + of + + + of + developing + + + law + further + + + law + international + + + law + environmental + + + developing + law + + + law + and + + + assist + to + + + law + assist + + + assist + countries + + + assist + in + + + development + the + + + in + development + + + development + of + + + law + domestic + + + law + environmental + + + of + law + + + + + ROOT + hope + + + hope + I + + + hope + would + + + states + these + + + secund + states + + + secund + might + + + hope + secund + + + secund + lawyers + + + institute + the + + + institute + international + + + institute + environmental + + + institute + law + + + lawyers + institute + + + lawyers + both + + + lawyers + for + + + purpose + the + + + for + purpose + + + lawyers + developing + + + law + further + + + law + international + + + law + environmental + + + developing + law + + + assist + to + + + law + assist + + + assist + countries + + + development + the + + + assist + development + + + law + domestic + + + law + environmental + + + development + law + + + + + ROOT + hope + + + hope + I + + + hope + would + + + states + these + + + secund + states + + + secund + might + + + hope + secund + + + secund + lawyers + + + institute + the + + + institute + international + + + institute + environmental + + + institute + law + + + lawyers + institute + + + lawyers + both + + + lawyers + for + + + purpose + the + + + for + purpose + + + lawyers + developing + + + law + further + + + law + international + + + law + environmental + + + developing + law + + + assist + to + + + developing + assist + + + law + assist + + + 
assist + countries + + + development + the + + + assist + development + + + law + domestic + + + law + environmental + + + development + law + + + + + + + Twenty-five + twenty-five + 31115 + 31126 + CD + DATE + OFFSET P-25Y + + + + years + year + 31127 + 31132 + NNS + DATE + OFFSET P-25Y + + + + ago + ago + 31133 + 31136 + RB + DATE + OFFSET P-25Y + + + + , + , + 31136 + 31137 + , + O + + + in + in + 31138 + 31140 + IN + O + + + his + he + 31141 + 31144 + PRP$ + O + + + last + last + 31145 + 31149 + JJ + O + + + speech + speech + 31150 + 31156 + NN + O + + + to + to + 31158 + 31160 + TO + O + + + the + the + 31161 + 31164 + DT + O + + + United + United + 31165 + 31171 + NNP + ORGANIZATION + + + Nations + Nations + 31172 + 31179 + NNPS + ORGANIZATION + + + , + , + 31179 + 31180 + , + O + + + U.S. + U.S. + 31181 + 31185 + NNP + LOCATION + + + Ambassador + Ambassador + 31186 + 31196 + NNP + O + + + Adlai + Adlai + 31198 + 31203 + NNP + PERSON + + + Stevenson + Stevenson + 31204 + 31213 + NNP + PERSON + + + made + make + 31214 + 31218 + VBD + O + + + reference + reference + 31219 + 31228 + NN + O + + + to + to + 31229 + 31231 + TO + O + + + the + the + 31232 + 31235 + DT + O + + + pho + pho + 31236 + 31239 + NN + O + + + - + - + 31239 + 31240 + : + O + + + tographs + tograph + 31242 + 31250 + NNS + O + + + of + of + 31251 + 31253 + IN + O + + + Earth + Earth + 31254 + 31259 + NNP + MISC + + + taken + take + 31260 + 31265 + VBN + O + + + from + from + 31266 + 31270 + IN + O + + + an + a + 31271 + 31273 + DT + O + + + early + early + 31274 + 31279 + JJ + O + + + space + space + 31280 + 31285 + NN + O + + + mission + mission + 31287 + 31294 + NN + O + + + . + . + 31294 + 31295 + . + O + + + (ROOT (S (ADVP (NP (CD Twenty-five) (NNS years)) (RB ago)) (, ,) (PP (IN in) (NP (NP (PRP$ his) (JJ last) (NN speech)) (PP (TO to) (NP (DT the) (NNP United) (NNPS Nations))))) (, ,) (NP (NNP U.S.) 
(NNP Ambassador) (NNP Adlai) (NNP Stevenson)) (VP (VBD made) (NP (NN reference)) (PP (TO to) (NP (DT the) (NN pho))) (: -) (NP (NP (NNS tographs)) (PP (IN of) (NP (NP (NNP Earth)) (VP (VBN taken) (PP (IN from) (NP (DT an) (JJ early) (NN space) (NN mission)))))))) (. .))) + + + ROOT + made + + + years + Twenty-five + + + ago + years + + + made + ago + + + made + in + + + speech + his + + + speech + last + + + in + speech + + + speech + to + + + Nations + the + + + Nations + United + + + to + Nations + + + Stevenson + U.S. + + + Stevenson + Ambassador + + + Stevenson + Adlai + + + made + Stevenson + + + made + reference + + + made + to + + + pho + the + + + to + pho + + + made + tographs + + + tographs + of + + + of + Earth + + + Earth + taken + + + taken + from + + + mission + an + + + mission + early + + + mission + space + + + from + mission + + + + + ROOT + made + + + years + Twenty-five + + + ago + years + + + made + ago + + + speech + his + + + speech + last + + + made + speech + + + Nations + the + + + Nations + United + + + speech + Nations + + + Stevenson + U.S. + + + Stevenson + Ambassador + + + Stevenson + Adlai + + + made + Stevenson + + + made + reference + + + pho + the + + + made + pho + + + made + tographs + + + tographs + Earth + + + Earth + taken + + + mission + an + + + mission + early + + + mission + space + + + taken + mission + + + + + ROOT + made + + + years + Twenty-five + + + ago + years + + + made + ago + + + speech + his + + + speech + last + + + made + speech + + + Nations + the + + + Nations + United + + + speech + Nations + + + Stevenson + U.S. 
+ + + Stevenson + Ambassador + + + Stevenson + Adlai + + + made + Stevenson + + + made + reference + + + pho + the + + + made + pho + + + made + tographs + + + tographs + Earth + + + Earth + taken + + + mission + an + + + mission + early + + + mission + space + + + taken + mission + + + + + + + Today + today + 31296 + 31301 + NN + DATE + THIS P1D + + + + these + these + 31302 + 31307 + DT + O + + + images + image + 31308 + 31314 + NNS + O + + + have + have + 31315 + 31319 + VBP + O + + + become + become + 31320 + 31326 + VBN + O + + + commonplace + commonplace + 31328 + 31339 + JJ + O + + + but + but + 31340 + 31343 + CC + O + + + at + at + 31344 + 31346 + IN + O + + + that + that + 31347 + 31351 + DT + O + + + time + time + 31352 + 31356 + NN + O + + + they + they + 31357 + 31361 + PRP + O + + + were + be + 31362 + 31366 + VBD + O + + + strikingly + strikingly + 31368 + 31378 + RB + O + + + new + new + 31379 + 31382 + JJ + O + + + and + and + 31383 + 31386 + CC + O + + + they + they + 31387 + 31391 + PRP + O + + + led + lead + 31392 + 31395 + VBD + O + + + Ambassador + Ambassador + 31396 + 31406 + NNP + O + + + Stevenson + Stevenson + 31408 + 31417 + NNP + PERSON + + + to + to + 31418 + 31420 + TO + O + + + reflect + reflect + 31421 + 31428 + VB + O + + + on + on + 31429 + 31431 + IN + O + + + the + the + 31432 + 31435 + DT + O + + + fragility + fragility + 31436 + 31445 + NN + O + + + of + of + 31446 + 31448 + IN + O + + + our + we + 31449 + 31452 + PRP$ + O + + + human + human + 31454 + 31459 + JJ + O + + + environment + environment + 31460 + 31471 + NN + O + + + . + . + 31471 + 31472 + . 
+ O + + + (ROOT (S (S (NP-TMP (NN Today)) (NP (DT these) (NNS images)) (VP (VBP have) (VP (VBN become) (S (ADJP (JJ commonplace)))))) (CC but) (S (PP (IN at) (NP (DT that) (NN time))) (NP (PRP they)) (VP (VBD were) (ADJP (RB strikingly) (JJ new)))) (CC and) (S (NP (PRP they)) (VP (VBD led) (S (NP (NNP Ambassador) (NNP Stevenson)) (VP (TO to) (VP (VB reflect) (PP (IN on) (NP (NP (DT the) (NN fragility)) (PP (IN of) (NP (PRP$ our) (JJ human) (NN environment)))))))))) (. .))) + + + ROOT + become + + + become + Today + + + images + these + + + become + images + + + become + have + + + become + commonplace + + + become + but + + + new + at + + + time + that + + + at + time + + + new + they + + + new + were + + + new + strikingly + + + become + new + + + become + and + + + led + they + + + become + led + + + Stevenson + Ambassador + + + reflect + Stevenson + + + reflect + to + + + led + reflect + + + reflect + on + + + fragility + the + + + on + fragility + + + fragility + of + + + environment + our + + + environment + human + + + of + environment + + + + + ROOT + become + + + become + Today + + + images + these + + + become + images + + + become + have + + + become + commonplace + + + time + that + + + new + time + + + new + they + + + new + were + + + new + strikingly + + + become + new + + + led + they + + + become + led + + + Stevenson + Ambassador + + + reflect + Stevenson + + + reflect + to + + + led + reflect + + + fragility + the + + + reflect + fragility + + + environment + our + + + environment + human + + + fragility + environment + + + + + ROOT + become + + + become + Today + + + images + these + + + become + images + + + become + have + + + become + commonplace + + + time + that + + + new + time + + + new + they + + + new + were + + + new + strikingly + + + become + new + + + led + they + + + become + led + + + Stevenson + Ambassador + + + reflect + Stevenson + + + reflect + to + + + led + reflect + + + fragility + the + + + reflect + fragility + + + 
environment + our + + + environment + human + + + fragility + environment + + + + + + + `` + `` + 31476 + 31477 + `` + O + + + We + we + 31477 + 31479 + PRP + O + + + travel + travel + 31480 + 31486 + VBP + O + + + together + together + 31487 + 31495 + RB + O + + + , + , + 31495 + 31496 + , + O + + + '' + '' + 31496 + 31497 + '' + O + + + he + he + 31498 + 31500 + PRP + O + + + said + say + 31501 + 31505 + VBD + O + + + , + , + 31505 + 31506 + , + O + + + `` + `` + 31507 + 31508 + `` + O + + + passengers + passenger + 31508 + 31518 + NNS + O + + + on + on + 31520 + 31522 + IN + O + + + a + a + 31523 + 31524 + DT + O + + + little + little + 31525 + 31531 + JJ + O + + + space + space + 31532 + 31537 + NN + O + + + ship + ship + 31538 + 31542 + NN + O + + + , + , + 31542 + 31543 + , + O + + + dependent + dependent + 31544 + 31553 + JJ + O + + + on + on + 31554 + 31556 + IN + O + + + its + its + 31557 + 31560 + PRP$ + O + + + vul + vul + 31561 + 31564 + NN + O + + + - + - + 31564 + 31565 + : + O + + + nerable + nerable + 31567 + 31574 + JJ + O + + + reserves + reserve + 31575 + 31583 + NNS + O + + + of + of + 31584 + 31586 + IN + O + + + air + air + 31587 + 31590 + NN + O + + + and + and + 31591 + 31594 + CC + O + + + soil + soil + 31595 + 31599 + NN + O + + + , + , + 31599 + 31600 + , + O + + + all + all + 31601 + 31604 + DT + O + + + commit + commit + 31605 + 31611 + VB + O + + + - + - + 31611 + 31612 + : + O + + + ted + ted + 31614 + 31617 + VBN + O + + + for + for + 31618 + 31621 + IN + O + + + our + we + 31622 + 31625 + PRP$ + O + + + safety + safety + 31626 + 31632 + NN + O + + + to + to + 31633 + 31635 + TO + O + + + its + its + 31636 + 31639 + PRP$ + O + + + security + security + 31640 + 31648 + NN + O + + + and + and + 31649 + 31652 + CC + O + + + peace + peace + 31653 + 31658 + NN + O + + + ; + ; + 31658 + 31659 + : + O + + + preserved + preserve + 31661 + 31670 + VBN + O + + + from + from + 31671 + 31675 + IN + O + + + annihilation + annihilation + 31676 + 
31688 + NN + O + + + only + only + 31689 + 31693 + RB + O + + + by + by + 31694 + 31696 + IN + O + + + the + the + 31697 + 31700 + DT + O + + + care + care + 31702 + 31706 + NN + O + + + , + , + 31706 + 31707 + , + O + + + the + the + 31708 + 31711 + DT + O + + + work + work + 31712 + 31716 + NN + O + + + , + , + 31716 + 31717 + , + O + + + and + and + 31718 + 31721 + CC + O + + + I + I + 31722 + 31723 + PRP + O + + + will + will + 31724 + 31728 + MD + O + + + say + say + 31729 + 31732 + VB + O + + + , + , + 31732 + 31733 + , + O + + + the + the + 31734 + 31737 + DT + O + + + love + love + 31738 + 31742 + NN + O + + + we + we + 31743 + 31745 + PRP + O + + + give + give + 31747 + 31751 + VBP + O + + + our + we + 31752 + 31755 + PRP$ + O + + + fragile + fragile + 31756 + 31763 + JJ + O + + + craft + craft + 31764 + 31769 + NN + O + + + . + . + 31769 + 31770 + . + O + + + '' + '' + 31770 + 31771 + '' + O + + + (ROOT (S (`` ``) (S (NP (PRP We)) (VP (VBP travel) (ADVP (RB together)))) (, ,) ('' '') (NP (PRP he)) (VP (VP (VBD said) (, ,) (`` ``) (S (S (NP (NP (NNS passengers)) (PP (IN on) (NP (NP (DT a) (JJ little) (NN space) (NN ship)) (, ,) (ADJP (ADJP (JJ dependent) (PP (IN on) (NP (PRP$ its) (NN vul)))) (: -) (NP (NP (JJ nerable) (NNS reserves)) (PP (IN of) (NP (NP (NN air) (CC and) (NN soil)) (, ,) (SBAR (WHNP (DT all)) (S (VP (VB commit)))))))))) (: -) (VP (VBN ted) (PP (IN for) (NP (PRP$ our) (NN safety))) (PP (TO to) (NP (PRP$ its) (NN security) (CC and) (NN peace)))) (: ;)) (VP (VBN preserved) (PP (IN from) (NP (NN annihilation))) (PP (RB only) (IN by) (NP (NP (DT the) (NN care)) (, ,) (NP (DT the) (NN work)))))) (, ,) (CC and) (S (NP (PRP I)) (VP (MD will) (VP (VB say)))))) (, ,) (NP (NP (DT the) (NN love)) (SBAR (S (NP (PRP we)) (VP (VBP give) (NP (PRP$ our) (JJ fragile) (NN craft))))))) (. .) 
('' ''))) + + + ROOT + said + + + travel + We + + + said + travel + + + travel + together + + + said + he + + + preserved + passengers + + + passengers + on + + + ship + a + + + ship + little + + + ship + space + + + on + ship + + + ship + dependent + + + dependent + on + + + vul + its + + + on + vul + + + reserves + nerable + + + dependent + reserves + + + reserves + of + + + of + air + + + air + and + + + air + soil + + + commit + all + + + air + commit + + + passengers + ted + + + ted + for + + + safety + our + + + for + safety + + + ted + to + + + security + its + + + to + security + + + security + and + + + security + peace + + + said + preserved + + + preserved + from + + + from + annihilation + + + by + only + + + preserved + by + + + care + the + + + by + care + + + work + the + + + care + work + + + preserved + and + + + say + I + + + say + will + + + preserved + say + + + love + the + + + said + love + + + give + we + + + love + give + + + craft + our + + + craft + fragile + + + give + craft + + + + + ROOT + said + + + travel + We + + + said + travel + + + travel + together + + + said + he + + + preserved + passengers + + + ship + a + + + ship + little + + + ship + space + + + passengers + ship + + + ship + dependent + + + vul + its + + + dependent + vul + + + reserves + nerable + + + dependent + reserves + + + reserves + air + + + air + soil + + + commit + all + + + air + commit + + + passengers + ted + + + safety + our + + + ted + safety + + + security + its + + + ted + security + + + security + peace + + + said + preserved + + + preserved + annihilation + + + preserved + only + + + care + the + + + preserved + care + + + work + the + + + care + work + + + say + I + + + say + will + + + preserved + say + + + love + the + + + said + love + + + give + we + + + love + give + + + craft + our + + + craft + fragile + + + give + craft + + + + + ROOT + said + + + travel + We + + + said + travel + + + travel + together + + + said + he + + + preserved + 
passengers + + + ship + a + + + ship + little + + + ship + space + + + passengers + ship + + + ship + dependent + + + vul + its + + + dependent + vul + + + reserves + nerable + + + dependent + reserves + + + reserves + air + + + reserves + soil + + + air + soil + + + commit + all + + + air + commit + + + passengers + ted + + + safety + our + + + ted + safety + + + security + its + + + ted + security + + + ted + peace + + + security + peace + + + said + preserved + + + preserved + annihilation + + + preserved + only + + + care + the + + + preserved + care + + + work + the + + + care + work + + + say + I + + + say + will + + + said + say + + + preserved + say + + + love + the + + + said + love + + + give + we + + + love + give + + + craft + our + + + craft + fragile + + + give + craft + + + + + + + The + the + 31775 + 31778 + DT + O + + + rapid + rapid + 31779 + 31784 + JJ + O + + + political + political + 31785 + 31794 + JJ + O + + + changes + change + 31795 + 31802 + NNS + O + + + of + of + 31803 + 31805 + IN + O + + + the + the + 31806 + 31809 + DT + DATE + PREV_IMMEDIATE P1Y + + + + last + last + 31810 + 31814 + JJ + DATE + PREV_IMMEDIATE P1Y + + + + year + year + 31816 + 31820 + NN + DATE + PREV_IMMEDIATE P1Y + + + + now + now + 31821 + 31824 + RB + DATE + PREV_IMMEDIATE P1Y + PRESENT_REF + + + provide + provide + 31825 + 31832 + VBP + O + + + an + a + 31833 + 31835 + DT + O + + + extraordinary + extraordinary + 31836 + 31849 + JJ + O + + + opportu + opportu + 31850 + 31857 + NN + O + + + - + - + 31857 + 31858 + : + O + + + nity-an + nity-an + 31860 + 31867 + JJ + O + + + opportunity + opportunity + 31869 + 31880 + NN + O + + + for + for + 31881 + 31884 + IN + O + + + unprecedented + unprecedented + 31885 + 31898 + JJ + O + + + global + global + 31900 + 31906 + JJ + O + + + cooperation + cooperation + 31907 + 31918 + NN + O + + + and + and + 31919 + 31922 + CC + O + + + an + a + 31923 + 31925 + DT + O + + + opportunity + opportunity + 31926 + 31937 + NN + O + + 
+ to + to + 31938 + 31940 + TO + O + + + mobilize + mobilize + 31942 + 31950 + VB + O + + + significant + significant + 31951 + 31962 + JJ + O + + + new + new + 31963 + 31966 + JJ + O + + + resources-to + resources-to + 31967 + 31979 + NN + O + + + the + the + 31980 + 31983 + DT + O + + + task + task + 31985 + 31989 + NN + O + + + of + of + 31990 + 31992 + IN + O + + + protecting + protect + 31993 + 32003 + VBG + O + + + our + we + 32004 + 32007 + PRP$ + O + + + fragile + fragile + 32008 + 32015 + JJ + O + + + craft + craft + 32016 + 32021 + NN + O + + + . + . + 32021 + 32022 + . + O + + + (ROOT (S (NP (NP (DT The) (JJ rapid) (JJ political) (NNS changes)) (PP (IN of) (NP (DT the) (JJ last) (NN year)))) (ADVP (RB now)) (VP (VBP provide) (NP (DT an) (JJ extraordinary) (NN opportu)) (: -) (S (NP (NP (JJ nity-an) (NN opportunity)) (PP (IN for) (NP (NP (JJ unprecedented) (JJ global) (NN cooperation)) (CC and) (NP (DT an) (NN opportunity))))) (VP (TO to) (VP (VB mobilize) (S (NP (JJ significant) (JJ new) (NN resources-to)) (NP (NP (DT the) (NN task)) (PP (IN of) (S (VP (VBG protecting) (NP (PRP$ our) (JJ fragile) (NN craft))))))))))) (. 
.))) + + + ROOT + provide + + + changes + The + + + changes + rapid + + + changes + political + + + provide + changes + + + changes + of + + + year + the + + + year + last + + + of + year + + + provide + now + + + opportu + an + + + opportu + extraordinary + + + provide + opportu + + + opportunity + nity-an + + + mobilize + opportunity + + + opportunity + for + + + cooperation + unprecedented + + + cooperation + global + + + for + cooperation + + + cooperation + and + + + opportunity + an + + + cooperation + opportunity + + + mobilize + to + + + provide + mobilize + + + resources-to + significant + + + resources-to + new + + + task + resources-to + + + task + the + + + mobilize + task + + + task + of + + + of + protecting + + + craft + our + + + craft + fragile + + + protecting + craft + + + + + ROOT + provide + + + changes + The + + + changes + rapid + + + changes + political + + + provide + changes + + + year + the + + + year + last + + + changes + year + + + provide + now + + + opportu + an + + + opportu + extraordinary + + + provide + opportu + + + opportunity + nity-an + + + mobilize + opportunity + + + cooperation + unprecedented + + + cooperation + global + + + opportunity + cooperation + + + opportunity + an + + + cooperation + opportunity + + + mobilize + to + + + provide + mobilize + + + resources-to + significant + + + resources-to + new + + + task + resources-to + + + task + the + + + mobilize + task + + + task + protecting + + + craft + our + + + craft + fragile + + + protecting + craft + + + + + ROOT + provide + + + changes + The + + + changes + rapid + + + changes + political + + + provide + changes + + + year + the + + + year + last + + + changes + year + + + provide + now + + + opportu + an + + + opportu + extraordinary + + + provide + opportu + + + opportunity + nity-an + + + mobilize + opportunity + + + cooperation + unprecedented + + + cooperation + global + + + opportunity + cooperation + + + opportunity + an + + + opportunity + opportunity + + 
+ cooperation + opportunity + + + mobilize + to + + + provide + mobilize + + + resources-to + significant + + + resources-to + new + + + task + resources-to + + + task + the + + + mobilize + task + + + task + protecting + + + craft + our + + + craft + fragile + + + protecting + craft + + + + + + + We + we + 32023 + 32025 + PRP + O + + + must + must + 32026 + 32030 + MD + O + + + go + go + 32032 + 32034 + VB + O + + + forward + forward + 32035 + 32042 + RB + O + + + from + from + 32043 + 32047 + IN + O + + + here + here + 32048 + 32052 + RB + O + + + reaffirming + reaffirm + 32053 + 32064 + VBG + O + + + our + we + 32065 + 32068 + PRP$ + O + + + love + love + 32069 + 32073 + NN + O + + + for + for + 32075 + 32078 + IN + O + + + this + this + 32079 + 32083 + DT + O + + + planet + planet + 32084 + 32090 + NN + O + + + and + and + 32091 + 32094 + CC + O + + + rededicating + rededicate + 32095 + 32107 + VBG + O + + + ourselves + ourselves + 32108 + 32117 + PRP + O + + + to + to + 32118 + 32120 + TO + O + + + its + its + 32122 + 32125 + PRP$ + O + + + protection + protection + 32126 + 32136 + NN + O + + + . + . + 32136 + 32137 + . + O + + + (ROOT (S (NP (PRP We)) (VP (MD must) (VP (VB go) (ADVP (RB forward)) (PP (IN from) (S (VP (VP (ADVP (RB here)) (VBG reaffirming) (NP (PRP$ our) (NN love)) (PP (IN for) (NP (DT this) (NN planet)))) (CC and) (VP (VBG rededicating) (NP (PRP ourselves)) (PP (TO to) (NP (PRP$ its) (NN protection))))))))) (. 
.))) + + + ROOT + go + + + go + We + + + go + must + + + go + forward + + + go + from + + + reaffirming + here + + + from + reaffirming + + + love + our + + + reaffirming + love + + + reaffirming + for + + + planet + this + + + for + planet + + + reaffirming + and + + + reaffirming + rededicating + + + rededicating + ourselves + + + rededicating + to + + + protection + its + + + to + protection + + + + + ROOT + go + + + go + We + + + go + must + + + go + forward + + + reaffirming + here + + + go + reaffirming + + + love + our + + + reaffirming + love + + + planet + this + + + reaffirming + planet + + + reaffirming + rededicating + + + rededicating + ourselves + + + protection + its + + + rededicating + protection + + + + + ROOT + go + + + go + We + + + go + must + + + go + forward + + + reaffirming + here + + + go + reaffirming + + + love + our + + + reaffirming + love + + + planet + this + + + reaffirming + planet + + + go + rededicating + + + reaffirming + rededicating + + + rededicating + ourselves + + + protection + its + + + rededicating + protection + + + + + + + Mr. + Mr. + 32141 + 32144 + NNP + O + + + MITCHELL + MITCHELL + 32145 + 32153 + NNP + PERSON + + + . + . + 32153 + 32154 + . + O + + + (ROOT (NP (NNP Mr.) (NNP MITCHELL) (. .))) + + + ROOT + MITCHELL + + + MITCHELL + Mr. + + + + + ROOT + MITCHELL + + + MITCHELL + Mr. + + + + + ROOT + MITCHELL + + + MITCHELL + Mr. + + + + + + + Mr. + Mr. 
+ 32155 + 32158 + NNP + O + + + President + President + 32159 + 32168 + NNP + O + + + , + , + 32168 + 32169 + , + O + + + I + I + 32170 + 32171 + PRP + O + + + would + would + 32173 + 32178 + MD + O + + + like + like + 32179 + 32183 + VB + O + + + now + now + 32184 + 32187 + RB + DATE + PRESENT_REF + PRESENT_REF + + + to + to + 32188 + 32190 + TO + O + + + yield + yield + 32191 + 32196 + VB + O + + + to + to + 32197 + 32199 + TO + O + + + my + my + 32200 + 32202 + PRP$ + O + + + distin + distin + 32203 + 32209 + NN + O + + + - + - + 32209 + 32210 + : + O + + + guished + guish + 32212 + 32219 + VBN + O + + + friend + friend + 32220 + 32226 + NN + O + + + and + and + 32227 + 32230 + CC + O + + + colleague + colleague + 32231 + 32240 + NN + O + + + , + , + 32240 + 32241 + , + O + + + the + the + 32242 + 32245 + DT + O + + + Re + Re + 32246 + 32248 + NNP + O + + + - + - + 32248 + 32249 + : + O + + + publican + publican + 32251 + 32259 + JJ + O + + + leader + leader + 32260 + 32266 + NN + O + + + and + and + 32267 + 32270 + CC + O + + + to + to + 32271 + 32273 + TO + O + + + say + say + 32274 + 32277 + VB + O + + + that + that + 32278 + 32282 + IN + O + + + it + it + 32283 + 32285 + PRP + O + + + is + be + 32286 + 32288 + VBZ + O + + + as + as + 32289 + 32291 + IN + O + + + always + always + 32293 + 32299 + RB + O + + + a + a + 32300 + 32301 + DT + O + + + pleasure + pleasure + 32302 + 32310 + NN + O + + + to + to + 32311 + 32313 + TO + O + + + be + be + 32314 + 32316 + VB + O + + + here + here + 32317 + 32321 + RB + O + + + on + on + 32322 + 32324 + IN + O + + + the + the + 32325 + 32328 + DT + O + + + Senate + Senate + 32330 + 32336 + NNP + ORGANIZATION + + + floor + floor + 32337 + 32342 + NN + O + + + with + with + 32343 + 32347 + IN + O + + + the + the + 32348 + 32351 + DT + O + + + distinguished + distinguished + 32352 + 32365 + JJ + O + + + Republican + republican + 32367 + 32377 + JJ + MISC + + + leader + leader + 32378 + 32384 + NN + O + + + . + . 
+ 32384 + 32385 + . + O + + + (ROOT (S (NP (NNP Mr.) (NNP President)) (, ,) (NP (PRP I)) (VP (MD would) (VP (VB like) (ADVP (RB now)) (S (VP (VP (TO to) (VP (VB yield) (PP (TO to) (NP (NP (PRP$ my) (NN distin)) (: -) (VP (VBN guished) (NP (NP (NP (NN friend) (CC and) (NN colleague)) (, ,) (NP (DT the) (NNP Re))) (: -) (NP (JJ publican) (NN leader)))))))) (CC and) (VP (TO to) (VP (VB say) (SBAR (IN that) (S (NP (PRP it)) (VP (VBZ is) (ADVP (IN as) (RB always)) (NP (DT a) (NN pleasure) (S (VP (TO to) (VP (VB be) (ADVP (RB here)) (PP (IN on) (NP (NP (DT the) (NNP Senate) (NN floor)) (PP (IN with) (NP (DT the) (JJ distinguished) (JJ Republican) (NN leader)))))))))))))))))) (. .))) + + + ROOT + like + + + President + Mr. + + + like + President + + + like + I + + + like + would + + + like + now + + + yield + to + + + like + yield + + + yield + to + + + distin + my + + + to + distin + + + distin + guished + + + guished + friend + + + friend + and + + + friend + colleague + + + Re + the + + + friend + Re + + + leader + publican + + + friend + leader + + + yield + and + + + say + to + + + yield + say + + + pleasure + that + + + pleasure + it + + + pleasure + is + + + pleasure + as + + + as + always + + + pleasure + a + + + say + pleasure + + + be + to + + + pleasure + be + + + be + here + + + be + on + + + floor + the + + + floor + Senate + + + on + floor + + + floor + with + + + leader + the + + + leader + distinguished + + + leader + Republican + + + with + leader + + + + + ROOT + like + + + President + Mr. 
+ + + like + President + + + like + I + + + like + would + + + like + now + + + yield + to + + + like + yield + + + distin + my + + + yield + distin + + + distin + guished + + + guished + friend + + + friend + colleague + + + Re + the + + + friend + Re + + + leader + publican + + + friend + leader + + + say + to + + + yield + say + + + pleasure + that + + + pleasure + it + + + pleasure + is + + + pleasure + as + + + as + always + + + pleasure + a + + + say + pleasure + + + be + to + + + pleasure + be + + + be + here + + + floor + the + + + floor + Senate + + + be + floor + + + leader + the + + + leader + distinguished + + + leader + Republican + + + floor + leader + + + + + ROOT + like + + + President + Mr. + + + like + President + + + like + I + + + like + would + + + like + now + + + yield + to + + + like + yield + + + distin + my + + + yield + distin + + + distin + guished + + + guished + friend + + + guished + colleague + + + friend + colleague + + + Re + the + + + friend + Re + + + leader + publican + + + friend + leader + + + say + to + + + like + say + + + yield + say + + + pleasure + that + + + pleasure + it + + + pleasure + is + + + pleasure + as + + + as + always + + + pleasure + a + + + say + pleasure + + + be + to + + + pleasure + be + + + be + here + + + floor + the + + + floor + Senate + + + be + floor + + + leader + the + + + leader + distinguished + + + leader + Republican + + + floor + leader + + + + + + + I + I + 32386 + 32387 + PRP + O + + + look + look + 32388 + 32392 + VBP + O + + + forward + forward + 32393 + 32400 + RB + O + + + to + to + 32401 + 32403 + TO + O + + + what + what + 32405 + 32409 + WP + O + + + I + I + 32410 + 32411 + PRP + O + + + know + know + 32412 + 32416 + VBP + O + + + will + will + 32417 + 32421 + MD + O + + + be + be + 32422 + 32424 + VB + O + + + a + a + 32425 + 32426 + DT + O + + + busy + busy + 32427 + 32431 + JJ + O + + + and + and + 32432 + 32435 + CC + O + + + I + I + 32436 + 32437 + PRP + O + + + hope + hope + 
32438 + 32442 + VBP + O + + + will + will + 32444 + 32448 + MD + O + + + also + also + 32449 + 32453 + RB + O + + + be + be + 32454 + 32456 + VB + O + + + a + a + 32457 + 32458 + DT + O + + + very + very + 32459 + 32463 + RB + O + + + productive + productive + 32464 + 32474 + JJ + O + + + session + session + 32475 + 32482 + NN + O + + + for + for + 32484 + 32487 + IN + O + + + the + the + 32488 + 32491 + DT + O + + + Senate + Senate + 32492 + 32498 + NNP + ORGANIZATION + + + and + and + 32499 + 32502 + CC + O + + + the + the + 32503 + 32506 + DT + O + + + Nation + nation + 32507 + 32513 + NN + O + + + . + . + 32513 + 32514 + . + O + + + (ROOT (S (S (NP (PRP I)) (VP (VBP look) (ADVP (RB forward)) (PP (TO to) (SBAR (WHNP (WP what)) (S (NP (PRP I)) (VP (VBP know) (SBAR (S (VP (MD will) (VP (VB be) (ADJP (DT a) (JJ busy)))))))))))) (CC and) (S (NP (PRP I)) (VP (VBP hope) (SBAR (S (VP (MD will) (ADVP (RB also)) (VP (VB be) (NP (NP (DT a) (ADJP (RB very) (JJ productive)) (NN session)) (PP (IN for) (NP (NP (DT the) (NNP Senate)) (CC and) (NP (DT the) (NN Nation))))))))))) (. 
.))) + + + ROOT + look + + + look + I + + + look + forward + + + look + to + + + know + what + + + know + I + + + to + know + + + busy + will + + + busy + be + + + busy + a + + + know + busy + + + look + and + + + hope + I + + + look + hope + + + session + will + + + session + also + + + session + be + + + session + a + + + productive + very + + + session + productive + + + hope + session + + + session + for + + + Senate + the + + + for + Senate + + + Senate + and + + + Nation + the + + + Senate + Nation + + + + + ROOT + look + + + look + I + + + look + forward + + + know + what + + + know + I + + + look + know + + + busy + will + + + busy + be + + + busy + a + + + know + busy + + + hope + I + + + look + hope + + + session + will + + + session + also + + + session + be + + + session + a + + + productive + very + + + session + productive + + + hope + session + + + Senate + the + + + session + Senate + + + Nation + the + + + Senate + Nation + + + + + ROOT + look + + + look + I + + + look + forward + + + know + what + + + know + I + + + look + know + + + busy + will + + + busy + be + + + busy + a + + + know + busy + + + hope + I + + + look + hope + + + session + will + + + session + also + + + session + be + + + session + a + + + productive + very + + + session + productive + + + hope + session + + + Senate + the + + + session + Senate + + + Nation + the + + + session + Nation + + + Senate + Nation + + + + + + + + 107 + 4 + 8 + 7 + + + 1 + 1 + 4 + 2 + + + 3 + 4 + 6 + 5 + + + 119 + 16 + 18 + 17 + + + 247 + 1 + 4 + 2 + + + + + 2 + 1 + 48 + 2 + + + 2 + 1 + 3 + 2 + + + 4 + 1 + 3 + 2 + + + 13 + 3 + 6 + 4 + + + 14 + 5 + 6 + 5 + + + 15 + 9 + 10 + 9 + + + 41 + 7 + 10 + 8 + + + 43 + 3 + 6 + 4 + + + 44 + 1 + 2 + 1 + + + 44 + 4 + 5 + 4 + + + 45 + 1 + 4 + 2 + + + 47 + 6 + 12 + 7 + + + 47 + 6 + 8 + 7 + + + 53 + 1 + 3 + 2 + + + 53 + 4 + 5 + 4 + + + 64 + 3 + 6 + 4 + + + 94 + 28 + 30 + 29 + + + 108 + 1 + 3 + 2 + + + 120 + 1 + 3 + 2 + + + 248 + 1 + 3 + 2 + + + + + 223 + 22 + 24 + 23 + 
+ + 241 + 4 + 6 + 5 + + + + + 2 + 18 + 19 + 18 + + + 13 + 1 + 2 + 1 + + + 14 + 1 + 2 + 1 + + + 15 + 3 + 4 + 3 + + + 20 + 5 + 6 + 5 + + + 29 + 1 + 2 + 1 + + + 39 + 1 + 2 + 1 + + + 42 + 3 + 4 + 3 + + + 43 + 1 + 2 + 1 + + + 57 + 1 + 2 + 1 + + + 57 + 3 + 4 + 3 + + + 59 + 3 + 4 + 3 + + + 61 + 1 + 2 + 1 + + + 64 + 1 + 2 + 1 + + + 73 + 3 + 4 + 3 + + + 78 + 1 + 2 + 1 + + + 92 + 13 + 14 + 13 + + + 94 + 1 + 2 + 1 + + + 105 + 7 + 8 + 7 + + + 106 + 1 + 2 + 1 + + + 106 + 12 + 13 + 12 + + + 108 + 4 + 5 + 4 + + + 112 + 1 + 2 + 1 + + + 112 + 11 + 12 + 11 + + + 114 + 1 + 2 + 1 + + + 115 + 5 + 6 + 5 + + + 115 + 10 + 11 + 10 + + + 117 + 1 + 2 + 1 + + + 118 + 1 + 2 + 1 + + + 120 + 4 + 5 + 4 + + + 123 + 1 + 2 + 1 + + + 123 + 13 + 14 + 13 + + + 125 + 1 + 2 + 1 + + + 150 + 2 + 3 + 2 + + + 157 + 2 + 3 + 2 + + + 158 + 2 + 3 + 2 + + + 160 + 1 + 2 + 1 + + + 163 + 7 + 8 + 7 + + + 170 + 16 + 17 + 16 + + + 176 + 2 + 3 + 2 + + + 176 + 5 + 6 + 5 + + + 177 + 33 + 34 + 33 + + + 178 + 4 + 5 + 4 + + + 178 + 20 + 21 + 20 + + + 183 + 3 + 4 + 3 + + + 200 + 6 + 7 + 6 + + + 201 + 1 + 2 + 1 + + + 202 + 1 + 2 + 1 + + + 205 + 6 + 7 + 6 + + + 210 + 4 + 5 + 4 + + + 215 + 4 + 5 + 4 + + + 215 + 25 + 26 + 25 + + + 216 + 6 + 7 + 6 + + + 216 + 9 + 10 + 9 + + + 217 + 1 + 2 + 1 + + + 219 + 3 + 4 + 3 + + + 220 + 1 + 2 + 1 + + + 225 + 1 + 2 + 1 + + + 226 + 11 + 12 + 11 + + + 227 + 3 + 4 + 3 + + + 228 + 14 + 15 + 14 + + + 230 + 12 + 13 + 12 + + + 231 + 2 + 3 + 2 + + + 234 + 2 + 3 + 2 + + + 236 + 3 + 4 + 3 + + + 239 + 1 + 2 + 1 + + + 241 + 1 + 2 + 1 + + + 248 + 4 + 5 + 4 + + + 248 + 11 + 12 + 11 + + + 249 + 1 + 2 + 1 + + + 249 + 6 + 7 + 6 + + + 249 + 13 + 14 + 13 + + + + + 2 + 36 + 47 + 36 + + + 2 + 36 + 37 + 36 + + + + + 2 + 40 + 47 + 40 + + + 2 + 40 + 42 + 40 + + + + + 3 + 1 + 7 + 3 + + + 98 + 6 + 8 + 7 + + + + + 223 + 10 + 14 + 13 + + + 223 + 32 + 35 + 34 + + + + + 4 + 4 + 5 + 4 + + + 7 + 4 + 5 + 4 + + + 8 + 5 + 6 + 5 + + + + + 5 + 1 + 2 + 1 + + + 6 + 1 + 2 + 1 + + + + + 228 + 48 + 55 + 49 + + + 246 + 17 + 19 + 18 + + 
+ + + 4 + 9 + 13 + 12 + + + 6 + 8 + 12 + 11 + + + 37 + 2 + 6 + 5 + + + + + 6 + 16 + 18 + 17 + + + 15 + 6 + 8 + 7 + + + 23 + 20 + 22 + 21 + + + 42 + 7 + 9 + 8 + + + 108 + 28 + 29 + 28 + + + 110 + 31 + 32 + 31 + + + 216 + 10 + 11 + 10 + + + 217 + 20 + 21 + 20 + + + 248 + 39 + 40 + 39 + + + 249 + 23 + 25 + 24 + + + 249 + 24 + 25 + 24 + + + + + 7 + 9 + 41 + 10 + + + 7 + 9 + 12 + 10 + + + 7 + 14 + 41 + 15 + + + 7 + 30 + 32 + 31 + + + + + 228 + 6 + 13 + 7 + + + 228 + 1 + 2 + 1 + + + + + 7 + 4 + 6 + 5 + + + 167 + 53 + 55 + 54 + + + + + 7 + 2 + 3 + 2 + + + 223 + 2 + 3 + 2 + + + 223 + 3 + 4 + 3 + + + + + 227 + 6 + 7 + 6 + + + 228 + 36 + 37 + 36 + + + + + 7 + 28 + 37 + 28 + + + 7 + 28 + 29 + 28 + + + + + 231 + 9 + 26 + 11 + + + 231 + 9 + 20 + 11 + + + + + 56 + 27 + 30 + 29 + + + 8 + 18 + 21 + 18 + + + 33 + 25 + 26 + 25 + + + 56 + 25 + 26 + 25 + + + + + 8 + 9 + 11 + 10 + + + 12 + 39 + 41 + 40 + + + + + 9 + 3 + 9 + 8 + + + 100 + 27 + 30 + 29 + + + + + 229 + 16 + 18 + 17 + + + 230 + 29 + 30 + 29 + + + 231 + 5 + 6 + 5 + + + + + 10 + 4 + 16 + 6 + + + 10 + 1 + 2 + 1 + + + 10 + 4 + 7 + 6 + + + 33 + 30 + 32 + 31 + + + 34 + 12 + 14 + 13 + + + + + 231 + 14 + 16 + 15 + + + 229 + 5 + 7 + 5 + + + + + 233 + 3 + 5 + 4 + + + 233 + 11 + 13 + 12 + + + + + 232 + 29 + 31 + 30 + + + 233 + 15 + 17 + 16 + + + + + 14 + 1 + 9 + 2 + + + 15 + 1 + 2 + 1 + + + 15 + 3 + 5 + 4 + + + 73 + 3 + 5 + 4 + + + + + 13 + 3 + 15 + 7 + + + 14 + 7 + 8 + 7 + + + + + 231 + 27 + 33 + 28 + + + 231 + 27 + 29 + 28 + + + 231 + 31 + 33 + 32 + + + + + 18 + 1 + 2 + 1 + + + 19 + 1 + 2 + 1 + + + 19 + 5 + 6 + 5 + + + + + 15 + 10 + 15 + 14 + + + 37 + 3 + 6 + 5 + + + + + 209 + 3 + 4 + 3 + + + 209 + 8 + 9 + 8 + + + 209 + 12 + 13 + 12 + + + 211 + 4 + 5 + 4 + + + 211 + 9 + 10 + 9 + + + + + 21 + 1 + 2 + 1 + + + 22 + 1 + 2 + 1 + + + 22 + 14 + 15 + 14 + + + 38 + 7 + 8 + 7 + + + 39 + 10 + 11 + 10 + + + 69 + 27 + 28 + 27 + + + 112 + 12 + 13 + 12 + + + 199 + 27 + 28 + 27 + + + + + 21 + 7 + 10 + 7 + + + 25 + 13 + 16 + 13 + + + + + 19 + 12 + 
14 + 13 + + + 21 + 11 + 13 + 12 + + + 25 + 3 + 5 + 4 + + + 43 + 8 + 10 + 9 + + + 44 + 16 + 18 + 17 + + + 57 + 11 + 13 + 12 + + + 94 + 19 + 21 + 20 + + + + + 211 + 2 + 4 + 3 + + + 246 + 11 + 13 + 12 + + + + + 20 + 9 + 10 + 9 + + + 172 + 6 + 7 + 6 + + + + + 22 + 3 + 6 + 5 + + + 62 + 12 + 13 + 12 + + + 141 + 3 + 6 + 5 + + + 159 + 6 + 9 + 8 + + + 171 + 6 + 8 + 7 + + + 180 + 24 + 27 + 26 + + + 206 + 14 + 17 + 16 + + + + + 164 + 15 + 23 + 16 + + + 22 + 19 + 21 + 20 + + + 27 + 10 + 12 + 11 + + + + + 25 + 1 + 2 + 1 + + + 26 + 1 + 2 + 1 + + + 227 + 1 + 2 + 1 + + + + + 219 + 9 + 48 + 11 + + + 219 + 9 + 12 + 11 + + + 219 + 13 + 29 + 15 + + + + + 32 + 5 + 7 + 6 + + + 32 + 3 + 4 + 3 + + + 40 + 1 + 2 + 1 + + + 154 + 1 + 2 + 1 + + + 218 + 1 + 2 + 1 + + + + + 30 + 7 + 11 + 10 + + + 31 + 1 + 5 + 4 + + + 33 + 12 + 16 + 15 + + + + + 30 + 7 + 10 + 8 + + + 31 + 1 + 4 + 2 + + + 33 + 12 + 15 + 13 + + + 90 + 46 + 49 + 47 + + + + + 28 + 6 + 7 + 6 + + + 33 + 28 + 29 + 28 + + + + + 34 + 2 + 4 + 3 + + + 34 + 5 + 6 + 5 + + + + + 33 + 4 + 5 + 4 + + + 33 + 19 + 20 + 19 + + + 35 + 1 + 2 + 1 + + + 35 + 13 + 14 + 13 + + + 36 + 12 + 13 + 12 + + + 38 + 1 + 2 + 1 + + + 38 + 27 + 28 + 27 + + + 40 + 7 + 8 + 7 + + + + + 101 + 24 + 26 + 25 + + + 37 + 13 + 14 + 13 + + + 50 + 17 + 18 + 17 + + + 103 + 11 + 12 + 11 + + + 104 + 9 + 11 + 9 + + + 111 + 12 + 14 + 13 + + + 115 + 8 + 9 + 8 + + + + + 220 + 5 + 9 + 8 + + + 226 + 16 + 18 + 17 + + + + + 219 + 45 + 48 + 47 + + + 221 + 36 + 39 + 38 + + + + + 35 + 10 + 19 + 11 + + + 36 + 1 + 21 + 1 + + + + + 244 + 63 + 66 + 65 + + + 245 + 33 + 36 + 35 + + + + + 248 + 19 + 21 + 20 + + + 38 + 38 + 40 + 39 + + + 149 + 12 + 14 + 13 + + + + + 73 + 11 + 14 + 12 + + + 38 + 28 + 30 + 28 + + + 78 + 18 + 20 + 19 + + + 249 + 26 + 28 + 27 + + + + + 244 + 48 + 53 + 49 + + + 244 + 48 + 50 + 49 + + + 244 + 51 + 53 + 52 + + + + + 244 + 59 + 66 + 60 + + + 246 + 8 + 10 + 9 + + + 246 + 17 + 18 + 17 + + + + + 38 + 47 + 48 + 47 + + + 64 + 8 + 9 + 8 + + + + + 38 + 6 + 16 + 8 + + + 39 + 9 + 12 
+ 11 + + + + + 38 + 27 + 41 + 31 + + + 38 + 27 + 34 + 31 + + + 38 + 28 + 34 + 31 + + + + + 248 + 15 + 24 + 15 + + + 248 + 15 + 18 + 15 + + + + + 42 + 13 + 15 + 14 + + + 178 + 20 + 22 + 21 + + + + + 43 + 3 + 10 + 6 + + + 51 + 8 + 10 + 9 + + + 52 + 4 + 6 + 5 + + + 57 + 22 + 24 + 23 + + + + + 41 + 3 + 5 + 4 + + + 185 + 19 + 21 + 20 + + + + + 41 + 13 + 14 + 13 + + + 77 + 6 + 7 + 6 + + + + + 46 + 8 + 11 + 10 + + + 47 + 13 + 14 + 13 + + + 56 + 12 + 15 + 14 + + + 75 + 2 + 5 + 4 + + + + + 48 + 18 + 19 + 18 + + + 48 + 21 + 22 + 21 + + + 48 + 24 + 25 + 24 + + + 48 + 30 + 31 + 30 + + + + + 44 + 21 + 23 + 22 + + + 50 + 23 + 25 + 24 + + + + + 45 + 1 + 15 + 4 + + + 46 + 1 + 2 + 1 + + + 47 + 1 + 2 + 1 + + + + + 45 + 11 + 14 + 13 + + + 48 + 1 + 4 + 3 + + + 49 + 1 + 2 + 1 + + + 49 + 3 + 6 + 5 + + + 50 + 6 + 8 + 7 + + + 156 + 1 + 3 + 2 + + + + + 51 + 6 + 7 + 6 + + + 114 + 12 + 13 + 12 + + + 156 + 3 + 4 + 3 + + + 217 + 26 + 27 + 26 + + + 248 + 7 + 8 + 7 + + + + + 53 + 10 + 14 + 11 + + + 54 + 13 + 17 + 14 + + + + + 53 + 13 + 14 + 13 + + + 54 + 16 + 17 + 16 + + + + + 245 + 6 + 10 + 8 + + + 50 + 19 + 21 + 20 + + + 57 + 13 + 15 + 14 + + + 59 + 24 + 26 + 25 + + + 63 + 12 + 14 + 13 + + + + + 50 + 1 + 13 + 3 + + + 51 + 1 + 2 + 1 + + + 86 + 18 + 21 + 20 + + + + + 235 + 5 + 9 + 6 + + + 235 + 18 + 19 + 18 + + + + + 55 + 18 + 21 + 20 + + + 55 + 25 + 28 + 26 + + + + + 171 + 32 + 35 + 34 + + + 55 + 6 + 8 + 7 + + + 56 + 1 + 30 + 1 + + + + + 54 + 6 + 12 + 7 + + + 54 + 18 + 19 + 18 + + + + + 54 + 1 + 2 + 1 + + + 54 + 9 + 10 + 9 + + + 55 + 1 + 2 + 1 + + + 55 + 3 + 4 + 3 + + + 55 + 5 + 6 + 5 + + + 55 + 18 + 19 + 18 + + + + + 239 + 5 + 8 + 7 + + + 242 + 6 + 7 + 6 + + + + + 57 + 3 + 5 + 4 + + + 57 + 7 + 8 + 7 + + + + + 56 + 16 + 18 + 17 + + + 85 + 24 + 26 + 25 + + + 150 + 2 + 4 + 3 + + + + + 56 + 16 + 17 + 16 + + + 59 + 22 + 23 + 22 + + + + + 60 + 51 + 56 + 55 + + + 60 + 48 + 76 + 49 + + + 60 + 48 + 71 + 49 + + + 60 + 48 + 50 + 49 + + + + + 240 + 8 + 17 + 8 + + + 240 + 8 + 12 + 8 + + + + + 241 + 8 + 36 
+ 8 + + + 243 + 11 + 12 + 11 + + + 243 + 16 + 17 + 16 + + + 244 + 2 + 3 + 2 + + + 244 + 35 + 36 + 35 + + + 244 + 61 + 62 + 61 + + + 244 + 63 + 64 + 63 + + + + + 242 + 14 + 18 + 17 + + + 243 + 18 + 20 + 19 + + + 244 + 7 + 8 + 7 + + + 244 + 55 + 56 + 55 + + + + + 60 + 18 + 19 + 18 + + + 139 + 16 + 17 + 16 + + + + + 60 + 1 + 11 + 2 + + + 60 + 1 + 3 + 2 + + + 60 + 2 + 3 + 2 + + + 60 + 4 + 10 + 5 + + + + + 62 + 24 + 31 + 25 + + + 62 + 24 + 26 + 25 + + + + + 62 + 14 + 20 + 15 + + + 241 + 17 + 19 + 18 + + + + + 62 + 21 + 31 + 22 + + + 89 + 11 + 12 + 11 + + + 205 + 15 + 16 + 15 + + + + + 62 + 1 + 20 + 5 + + + 63 + 10 + 12 + 11 + + + + + 244 + 13 + 17 + 16 + + + 244 + 20 + 21 + 20 + + + 244 + 38 + 39 + 38 + + + + + 244 + 11 + 43 + 11 + + + 245 + 33 + 34 + 33 + + + 246 + 1 + 2 + 1 + + + 246 + 8 + 9 + 8 + + + 246 + 15 + 16 + 15 + + + + + 244 + 26 + 32 + 26 + + + 244 + 26 + 29 + 26 + + + + + 67 + 1 + 3 + 2 + + + 68 + 1 + 2 + 1 + + + 69 + 1 + 2 + 1 + + + + + 67 + 5 + 8 + 7 + + + 67 + 12 + 13 + 12 + + + 69 + 4 + 5 + 4 + + + 69 + 22 + 23 + 22 + + + 69 + 26 + 27 + 26 + + + 70 + 9 + 10 + 9 + + + 171 + 11 + 13 + 12 + + + + + 69 + 14 + 17 + 16 + + + 83 + 9 + 11 + 10 + + + 84 + 11 + 13 + 12 + + + 243 + 9 + 11 + 10 + + + + + 70 + 2 + 26 + 5 + + + 70 + 24 + 25 + 24 + + + 70 + 35 + 36 + 35 + + + 70 + 39 + 40 + 39 + + + 71 + 8 + 10 + 9 + + + 71 + 17 + 18 + 17 + + + 71 + 20 + 21 + 20 + + + 71 + 23 + 24 + 23 + + + 71 + 28 + 29 + 28 + + + 71 + 38 + 39 + 38 + + + + + 69 + 22 + 29 + 24 + + + 72 + 32 + 35 + 32 + + + + + 63 + 1 + 2 + 1 + + + 65 + 10 + 11 + 10 + + + 67 + 5 + 6 + 5 + + + + + 72 + 3 + 4 + 3 + + + 74 + 1 + 2 + 1 + + + 75 + 3 + 4 + 3 + + + 77 + 1 + 2 + 1 + + + 79 + 8 + 9 + 8 + + + 80 + 22 + 23 + 22 + + + + + 72 + 7 + 29 + 10 + + + 72 + 7 + 11 + 10 + + + 72 + 12 + 17 + 13 + + + 72 + 15 + 17 + 16 + + + 72 + 19 + 29 + 21 + + + + + 72 + 23 + 29 + 25 + + + 222 + 1 + 3 + 2 + + + + + 74 + 1 + 5 + 3 + + + 75 + 10 + 12 + 11 + + + + + 70 + 24 + 26 + 25 + + + 95 + 2 + 3 + 2 + + + 95 + 9 + 10 + 
9 + + + + + 71 + 17 + 25 + 18 + + + 71 + 17 + 19 + 18 + + + 71 + 20 + 22 + 21 + + + 71 + 23 + 25 + 24 + + + + + 71 + 28 + 30 + 29 + + + 80 + 22 + 24 + 23 + + + + + 79 + 9 + 10 + 9 + + + 162 + 9 + 10 + 9 + + + + + 78 + 9 + 24 + 9 + + + 78 + 9 + 10 + 9 + + + + + 78 + 21 + 24 + 22 + + + 91 + 1 + 3 + 2 + + + 97 + 22 + 24 + 23 + + + 106 + 15 + 17 + 16 + + + + + 82 + 2 + 3 + 2 + + + 81 + 5 + 8 + 6 + + + + + 80 + 14 + 24 + 16 + + + 80 + 10 + 12 + 11 + + + + + 80 + 2 + 9 + 3 + + + 80 + 2 + 7 + 3 + + + 179 + 24 + 29 + 25 + + + + + 76 + 1 + 4 + 3 + + + 76 + 11 + 12 + 11 + + + + + 186 + 34 + 35 + 34 + + + 74 + 10 + 12 + 11 + + + 134 + 47 + 50 + 48 + + + 186 + 18 + 22 + 19 + + + 186 + 29 + 30 + 29 + + + 186 + 46 + 49 + 47 + + + + + 77 + 8 + 29 + 9 + + + 77 + 8 + 10 + 9 + + + + + 76 + 29 + 35 + 30 + + + 76 + 29 + 31 + 30 + + + 76 + 34 + 35 + 34 + + + + + 87 + 7 + 12 + 8 + + + 87 + 49 + 52 + 49 + + + + + 87 + 4 + 12 + 5 + + + 186 + 33 + 34 + 33 + + + 188 + 7 + 8 + 7 + + + 190 + 12 + 13 + 12 + + + 192 + 12 + 13 + 12 + + + 219 + 10 + 11 + 10 + + + + + 83 + 13 + 19 + 13 + + + 83 + 13 + 14 + 13 + + + 83 + 15 + 16 + 15 + + + 83 + 18 + 19 + 18 + + + + + 84 + 2 + 3 + 2 + + + 84 + 7 + 8 + 7 + + + 84 + 14 + 15 + 14 + + + 85 + 6 + 7 + 6 + + + 85 + 24 + 25 + 24 + + + 86 + 8 + 9 + 8 + + + 87 + 1 + 2 + 1 + + + 87 + 7 + 8 + 7 + + + 87 + 23 + 24 + 23 + + + 87 + 25 + 26 + 25 + + + 87 + 41 + 42 + 41 + + + 88 + 1 + 2 + 1 + + + 89 + 1 + 2 + 1 + + + 90 + 1 + 2 + 1 + + + 90 + 33 + 34 + 33 + + + 91 + 10 + 11 + 10 + + + 91 + 17 + 18 + 17 + + + 91 + 26 + 27 + 26 + + + 92 + 15 + 16 + 15 + + + + + 83 + 5 + 11 + 7 + + + 86 + 23 + 24 + 23 + + + + + 85 + 18 + 26 + 18 + + + 86 + 16 + 17 + 16 + + + + + 92 + 2 + 8 + 3 + + + 92 + 18 + 19 + 18 + + + 134 + 6 + 7 + 6 + + + 199 + 23 + 25 + 24 + + + 201 + 6 + 8 + 7 + + + + + 95 + 9 + 13 + 10 + + + 87 + 43 + 45 + 44 + + + 95 + 9 + 11 + 10 + + + 95 + 12 + 13 + 12 + + + + + 90 + 22 + 30 + 22 + + + 163 + 3 + 7 + 3 + + + 235 + 35 + 39 + 35 + + + + + 100 + 10 + 20 + 11 + 
+ + 101 + 8 + 9 + 8 + + + + + 97 + 20 + 21 + 20 + + + 98 + 6 + 7 + 6 + + + 100 + 6 + 7 + 6 + + + 100 + 21 + 22 + 21 + + + + + 99 + 11 + 14 + 13 + + + 99 + 1 + 10 + 2 + + + 105 + 1 + 4 + 3 + + + 105 + 5 + 13 + 6 + + + 106 + 9 + 10 + 9 + + + + + 96 + 11 + 12 + 11 + + + 98 + 2 + 3 + 2 + + + + + 97 + 1 + 16 + 4 + + + 97 + 1 + 5 + 4 + + + 97 + 9 + 11 + 10 + + + 97 + 13 + 16 + 15 + + + + + 110 + 23 + 24 + 23 + + + 193 + 2 + 3 + 2 + + + + + 110 + 37 + 38 + 37 + + + 110 + 39 + 40 + 39 + + + 198 + 3 + 4 + 3 + + + + + 109 + 12 + 14 + 13 + + + 110 + 2 + 4 + 3 + + + 110 + 3 + 4 + 3 + + + + + 113 + 6 + 8 + 6 + + + 109 + 1 + 2 + 1 + + + 110 + 5 + 6 + 5 + + + 112 + 6 + 7 + 6 + + + 113 + 10 + 11 + 10 + + + 114 + 4 + 11 + 4 + + + 114 + 4 + 8 + 4 + + + 114 + 4 + 5 + 4 + + + 114 + 6 + 7 + 6 + + + + + 108 + 13 + 33 + 15 + + + 113 + 10 + 12 + 11 + + + + + 108 + 17 + 33 + 18 + + + 108 + 17 + 19 + 18 + + + + + 236 + 7 + 8 + 7 + + + 104 + 1 + 15 + 1 + + + 126 + 6 + 7 + 6 + + + 204 + 32 + 34 + 32 + + + 223 + 27 + 28 + 27 + + + + + 117 + 11 + 27 + 12 + + + 117 + 11 + 13 + 12 + + + 117 + 24 + 25 + 24 + + + 118 + 3 + 4 + 3 + + + + + 115 + 12 + 21 + 12 + + + 116 + 1 + 2 + 1 + + + 117 + 11 + 12 + 11 + + + + + 114 + 22 + 25 + 24 + + + 127 + 5 + 6 + 5 + + + + + 113 + 3 + 8 + 4 + + + 113 + 3 + 5 + 4 + + + + + 198 + 2 + 5 + 4 + + + 110 + 37 + 39 + 38 + + + + + 123 + 16 + 17 + 16 + + + 216 + 13 + 14 + 13 + + + + + 123 + 6 + 9 + 8 + + + 124 + 4 + 5 + 4 + + + 124 + 17 + 18 + 17 + + + + + 124 + 14 + 15 + 14 + + + 126 + 56 + 57 + 56 + + + + + 126 + 3 + 8 + 4 + + + 126 + 3 + 5 + 4 + + + + + 126 + 61 + 67 + 63 + + + 178 + 2 + 4 + 3 + + + + + 205 + 6 + 8 + 7 + + + 126 + 34 + 55 + 34 + + + 126 + 34 + 39 + 34 + + + + + 121 + 1 + 9 + 3 + + + 122 + 1 + 2 + 1 + + + + + 126 + 47 + 55 + 47 + + + 121 + 27 + 28 + 27 + + + 126 + 47 + 48 + 47 + + + + + 120 + 17 + 22 + 21 + + + 121 + 9 + 11 + 10 + + + + + 120 + 25 + 31 + 27 + + + 120 + 23 + 31 + 23 + + + 125 + 11 + 14 + 12 + + + 126 + 36 + 39 + 38 + + + 126 + 37 + 39 
+ 38 + + + + + 126 + 16 + 18 + 17 + + + 120 + 35 + 37 + 36 + + + + + 134 + 10 + 12 + 11 + + + 134 + 11 + 12 + 11 + + + + + 134 + 19 + 38 + 22 + + + 134 + 19 + 23 + 22 + + + + + 134 + 33 + 34 + 33 + + + 134 + 34 + 35 + 34 + + + + + 134 + 39 + 40 + 39 + + + 135 + 17 + 18 + 17 + + + 136 + 8 + 9 + 8 + + + + + 136 + 2 + 5 + 4 + + + 137 + 1 + 2 + 1 + + + 137 + 3 + 6 + 5 + + + 153 + 1 + 3 + 2 + + + + + 138 + 8 + 15 + 8 + + + 190 + 7 + 8 + 7 + + + + + 170 + 11 + 13 + 12 + + + 127 + 13 + 14 + 13 + + + 138 + 14 + 15 + 14 + + + 164 + 2 + 3 + 2 + + + 164 + 19 + 20 + 19 + + + 164 + 40 + 41 + 40 + + + 165 + 1 + 2 + 1 + + + 165 + 3 + 5 + 4 + + + 165 + 12 + 13 + 12 + + + 181 + 9 + 10 + 9 + + + 183 + 37 + 38 + 37 + + + + + 128 + 13 + 14 + 13 + + + 131 + 15 + 16 + 15 + + + 161 + 24 + 25 + 24 + + + 186 + 27 + 28 + 27 + + + + + 207 + 6 + 8 + 7 + + + 128 + 11 + 14 + 11 + + + 128 + 11 + 12 + 11 + + + 131 + 13 + 16 + 13 + + + 131 + 13 + 14 + 13 + + + 161 + 26 + 27 + 26 + + + 186 + 25 + 26 + 25 + + + 207 + 7 + 8 + 7 + + + + + 130 + 4 + 7 + 6 + + + 142 + 13 + 16 + 15 + + + + + 131 + 9 + 10 + 9 + + + 181 + 13 + 14 + 13 + + + + + 131 + 5 + 7 + 6 + + + 142 + 10 + 12 + 11 + + + 170 + 27 + 29 + 28 + + + + + 186 + 32 + 50 + 34 + + + 132 + 9 + 12 + 11 + + + 186 + 32 + 35 + 34 + + + 186 + 36 + 50 + 38 + + + 187 + 41 + 44 + 43 + + + 188 + 6 + 9 + 8 + + + 190 + 11 + 14 + 13 + + + 192 + 11 + 14 + 13 + + + 204 + 38 + 41 + 40 + + + + + 149 + 15 + 16 + 15 + + + 235 + 49 + 50 + 49 + + + + + 148 + 2 + 5 + 4 + + + 148 + 6 + 11 + 6 + + + 148 + 22 + 23 + 22 + + + 149 + 18 + 20 + 19 + + + 157 + 23 + 25 + 24 + + + 193 + 14 + 16 + 15 + + + 208 + 15 + 17 + 16 + + + 236 + 27 + 29 + 28 + + + + + 140 + 3 + 5 + 4 + + + 224 + 14 + 16 + 15 + + + 225 + 9 + 11 + 10 + + + 234 + 5 + 7 + 6 + + + + + 141 + 14 + 18 + 15 + + + 142 + 19 + 23 + 20 + + + + + 141 + 1 + 6 + 1 + + + 140 + 29 + 30 + 29 + + + 141 + 14 + 15 + 14 + + + 142 + 3 + 4 + 3 + + + 142 + 19 + 20 + 19 + + + 142 + 32 + 33 + 32 + + + 142 + 37 + 38 + 37 + + + 144 
+ 11 + 12 + 11 + + + 146 + 2 + 3 + 2 + + + 146 + 5 + 6 + 5 + + + 149 + 4 + 5 + 4 + + + 149 + 18 + 19 + 18 + + + 150 + 27 + 28 + 27 + + + 153 + 7 + 8 + 7 + + + 155 + 1 + 2 + 1 + + + 156 + 7 + 8 + 7 + + + 157 + 15 + 16 + 15 + + + 159 + 20 + 21 + 20 + + + 162 + 15 + 16 + 15 + + + 163 + 12 + 13 + 12 + + + + + 161 + 3 + 15 + 4 + + + 162 + 1 + 2 + 1 + + + + + 158 + 23 + 30 + 25 + + + 158 + 38 + 40 + 39 + + + + + 163 + 14 + 18 + 17 + + + 173 + 1 + 3 + 2 + + + + + 162 + 9 + 38 + 11 + + + 162 + 9 + 12 + 11 + + + + + 152 + 1 + 7 + 2 + + + 179 + 6 + 12 + 7 + + + 183 + 14 + 20 + 15 + + + 191 + 7 + 13 + 8 + + + + + 151 + 7 + 9 + 8 + + + 151 + 14 + 15 + 14 + + + + + 152 + 4 + 7 + 6 + + + 155 + 10 + 13 + 12 + + + 179 + 9 + 12 + 11 + + + 183 + 17 + 20 + 19 + + + 185 + 12 + 15 + 14 + + + 191 + 10 + 13 + 12 + + + + + 150 + 15 + 18 + 17 + + + 197 + 22 + 25 + 24 + + + + + 157 + 6 + 20 + 8 + + + 157 + 6 + 9 + 8 + + + + + 157 + 15 + 20 + 19 + + + 171 + 21 + 24 + 23 + + + + + 174 + 18 + 21 + 20 + + + 187 + 16 + 18 + 17 + + + + + 174 + 14 + 28 + 16 + + + 174 + 14 + 21 + 16 + + + + + 171 + 11 + 12 + 11 + + + 171 + 21 + 22 + 21 + + + 171 + 32 + 33 + 32 + + + 172 + 5 + 6 + 5 + + + 172 + 17 + 18 + 17 + + + + + 168 + 16 + 18 + 17 + + + 168 + 17 + 18 + 17 + + + + + 164 + 32 + 41 + 33 + + + 164 + 32 + 34 + 33 + + + 164 + 35 + 36 + 35 + + + 164 + 38 + 41 + 38 + + + + + 167 + 35 + 55 + 36 + + + 164 + 3 + 4 + 3 + + + 167 + 2 + 3 + 2 + + + 167 + 33 + 34 + 33 + + + + + 183 + 31 + 41 + 31 + + + 183 + 31 + 32 + 31 + + + + + 184 + 4 + 8 + 7 + + + 185 + 23 + 26 + 25 + + + + + 180 + 36 + 37 + 36 + + + 182 + 7 + 8 + 7 + + + 182 + 23 + 24 + 23 + + + 184 + 1 + 2 + 1 + + + 185 + 19 + 20 + 19 + + + 185 + 23 + 24 + 23 + + + 186 + 3 + 4 + 3 + + + 187 + 6 + 7 + 6 + + + 187 + 59 + 60 + 59 + + + + + 175 + 5 + 13 + 6 + + + 177 + 5 + 7 + 6 + + + + + 220 + 3 + 4 + 3 + + + 176 + 15 + 20 + 16 + + + + + 221 + 41 + 46 + 45 + + + 196 + 1 + 12 + 2 + + + 196 + 1 + 3 + 2 + + + 196 + 4 + 11 + 6 + + + 196 + 9 + 11 + 10 + + + 
198 + 5 + 6 + 5 + + + 199 + 2 + 4 + 2 + + + 200 + 19 + 20 + 19 + + + 202 + 9 + 10 + 9 + + + 223 + 11 + 12 + 11 + + + 224 + 3 + 5 + 3 + + + 239 + 5 + 7 + 5 + + + 240 + 10 + 11 + 10 + + + + + 194 + 15 + 20 + 15 + + + 194 + 21 + 22 + 21 + + + + + 235 + 24 + 27 + 26 + + + 193 + 17 + 22 + 17 + + + + + 194 + 11 + 22 + 13 + + + 195 + 1 + 2 + 1 + + + + + 189 + 1 + 3 + 2 + + + 190 + 1 + 2 + 1 + + + + + 188 + 10 + 29 + 11 + + + 188 + 10 + 13 + 11 + + + 188 + 14 + 19 + 15 + + + 188 + 21 + 29 + 22 + + + + + 187 + 28 + 31 + 30 + + + 198 + 21 + 22 + 21 + + + 241 + 28 + 29 + 28 + + + + + 206 + 2 + 8 + 3 + + + 206 + 9 + 10 + 9 + + + + + 204 + 15 + 51 + 19 + + + 204 + 15 + 20 + 19 + + + 204 + 22 + 34 + 24 + + + + + 199 + 34 + 38 + 37 + + + 229 + 21 + 23 + 22 + + + + + 203 + 1 + 6 + 1 + + + 200 + 1 + 2 + 1 + + + 203 + 1 + 2 + 1 + + + + + 200 + 6 + 8 + 7 + + + 234 + 2 + 4 + 3 + + + + + 200 + 13 + 14 + 13 + + + 201 + 3 + 4 + 3 + + + 203 + 15 + 16 + 15 + + + 203 + 18 + 19 + 18 + + + + + 199 + 10 + 38 + 11 + + + 199 + 10 + 16 + 11 + + + 199 + 17 + 38 + 21 + + + + + 197 + 2 + 3 + 2 + + + 198 + 2 + 3 + 2 + + + + + 197 + 7 + 25 + 20 + + + 197 + 3 + 6 + 5 + + + 202 + 8 + 11 + 10 + + + + + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/pom.xml b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/pom.xml new file mode 100644 index 0000000..0a7d6e8 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/pom.xml @@ -0,0 +1,82 @@ + + 4.0.0 + edu.stanford.nlp + stanford-corenlp + 1.3.5 + jar + Stanford CoreNLP + Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. 
It provides the foundational building blocks for higher level text understanding applications. + http://nlp.stanford.edu/software/corenlp.shtml + + + GNU General Public License Version 2 + http://www.gnu.org/licenses/gpl-2.0.txt + + + + http://nlp.stanford.edu/software/stanford-corenlp-2013-04-04.tgz + http://nlp.stanford.edu/software/stanford-corenlp-2013-04-04.tgz + + + + christopher.manning + Christopher Manning + manning@stanford.edu + + + john.bauer + John Bauer + horatio@gmail.com + + + + 1.6 + 1.6 + UTF-8 + + + + xom + xom + 1.2.5 + + + joda-time + joda-time + 2.1 + + + de.jollyday + jollyday + 0.4.7 + + + + src + + + org.codehaus.mojo + build-helper-maven-plugin + 1.7 + + + attach-models + package + + attach-artifact + + + + + ${project.basedir}/stanford-corenlp-1.3.5-models.jar + jar + models + + + + + + + + + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/META-INF/MANIFEST.MF b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/META-INF/MANIFEST.MF new file mode 100644 index 0000000..0a188a0 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/META-INF/MANIFEST.MF @@ -0,0 +1,6 @@ +Manifest-Version: 1.0 +Implementation-Version: 1.3.5 +Created-By: Stanford JavaNLP (horatio) +Main-class: edu.stanford.nlp.pipeline.StanfordCoreNLP +Built-Date: 2013-04-04 + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/AbstractLinearClassifierFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/AbstractLinearClassifierFactory.java new file mode 100644 index 0000000..dd00e2f --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/AbstractLinearClassifierFactory.java @@ -0,0 +1,91 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.HashIndex; + +import java.lang.ref.Reference; 
+import java.util.Collection; +import java.util.List; + +/** + * Shared methods for training a {@link LinearClassifier}. + * Inheriting classes need to implement the + * trainWeights method. + * + * @author Dan Klein + * + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * + * @param The type of the labels in the Dataset and Datum + * @param The type of the features in the Dataset and Datum + */ + +public abstract class AbstractLinearClassifierFactory implements ClassifierFactory> { + + private static final long serialVersionUID = 1L; + + Index labelIndex = new HashIndex(); + Index featureIndex = new HashIndex(); + + public AbstractLinearClassifierFactory() { + } + + int numFeatures() { + return featureIndex.size(); + } + + int numClasses() { + return labelIndex.size(); + } + + public Classifier trainClassifier(List> examples) { + Dataset dataset = new Dataset(); + dataset.addAll(examples); + return trainClassifier(dataset); + } + + protected abstract double[][] trainWeights(GeneralDataset dataset) ; + + /** + * Takes a {@link Collection} of {@link Datum} objects and gives you back a + * {@link Classifier} trained on it. + * + * @param examples {@link Collection} of {@link Datum} objects to train the + * classifier on + * @return A {@link Classifier} trained on it. 
+ */ + public LinearClassifier trainClassifier(Collection> examples) { + Dataset dataset = new Dataset(); + dataset.addAll(examples); + return trainClassifier(dataset); + } + + /** + * Takes a {@link Reference} to a {@link Collection} of {@link Datum} + * objects and gives you back a {@link Classifier} trained on them + * + * @param ref {@link Reference} to a {@link Collection} of {@link + * Datum} objects to train the classifier on + * @return A Classifier trained on a collection of Datum + */ + public LinearClassifier trainClassifier(Reference>> ref) { + Collection> examples = ref.get(); + return trainClassifier(examples); + } + + + /** + * Trains a {@link Classifier} on a {@link Dataset}. + * + * @return A {@link Classifier} trained on the data. + */ + public LinearClassifier trainClassifier(GeneralDataset data) { + labelIndex = data.labelIndex(); + featureIndex = data.featureIndex(); + double[][] weights = trainWeights(data); + return new LinearClassifier(weights, featureIndex, labelIndex); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/AdaptedGaussianPriorObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/AdaptedGaussianPriorObjectiveFunction.java new file mode 100644 index 0000000..16521e1 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/AdaptedGaussianPriorObjectiveFunction.java @@ -0,0 +1,118 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.math.ArrayMath; +import java.util.Arrays; + + +/** + * Adapt the mean of the Gaussian Prior by shifting the mean to the previously trained weights + * @author Pi-Chuan Chang + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * + * @param The type of the labels in the Dataset (one can be passed in to the constructor) + * @param The type of the features in the Dataset + */ + +public class AdaptedGaussianPriorObjectiveFunction 
extends LogConditionalObjectiveFunction { + + double[] weights; + + /** + * Calculate the conditional likelihood. + */ + @Override + protected void calculate(double[] x) { + if (useSummedConditionalLikelihood) { + calculateSCL(x); + } else { + calculateCL(x); + } + } + + + /** + */ + private void calculateSCL(double[] x) { + throw new UnsupportedOperationException(); + } + + /** + */ + private void calculateCL(double[] x) { + value = 0.0; + if (derivativeNumerator == null) { + derivativeNumerator = new double[x.length]; + for (int d = 0; d < data.length; d++) { + int[] features = data[d]; + for (int f = 0; f < features.length; f++) { + int i = indexOf(features[f], labels[d]); + if (dataweights == null) { + derivativeNumerator[i] -= 1; + } else { + derivativeNumerator[i] -= dataweights[d]; + } + } + } + } + copy(derivative, derivativeNumerator); + + double[] sums = new double[numClasses]; + double[] probs = new double[numClasses]; + + for (int d = 0; d < data.length; d++) { + int[] features = data[d]; + // activation + Arrays.fill(sums, 0.0); + + for (int c = 0; c < numClasses; c++) { + for (int f = 0; f < features.length; f++) { + int i = indexOf(features[f], c); + sums[c] += x[i]; + } + } + double total = ArrayMath.logSum(sums); + for (int c = 0; c < numClasses; c++) { + probs[c] = Math.exp(sums[c] - total); + if (dataweights != null) { + probs[c] *= dataweights[d]; + } + for (int f = 0; f < features.length; f++) { + int i = indexOf(features[f], c); + derivative[i] += probs[c]; + } + } + + double dV = sums[labels[d]] - total; + if (dataweights != null) { + dV *= dataweights[d]; + } + value -= dV; + } + //System.err.println("x length="+x.length); + //System.err.println("weights length="+weights.length); + double[] newX = ArrayMath.pairwiseSubtract(x, weights); + value += prior.compute(newX, derivative); + } + + /** + */ + @Override + protected void rvfcalculate(double[] x) { + throw new UnsupportedOperationException(); + } + + public 
AdaptedGaussianPriorObjectiveFunction(GeneralDataset dataset, LogPrior prior, double weights[][]) { + super(dataset, prior); + this.weights = to1D(weights); + } + + public double[] to1D(double[][] x2) { + double[] x = new double[numFeatures*numClasses]; + for (int i = 0; i < numFeatures; i++) { + for (int j = 0; j < numClasses; j++) { + x[indexOf(i, j)] = x2[i][j]; + } + } + return x; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/BiasedLogConditionalObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/BiasedLogConditionalObjectiveFunction.java new file mode 100644 index 0000000..5378d14 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/BiasedLogConditionalObjectiveFunction.java @@ -0,0 +1,137 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; + +import java.util.Arrays; + + +/** + * Maximizes the conditional likelihood with a given prior. 
+ * + * @author Jenny Finkel + */ + +public class BiasedLogConditionalObjectiveFunction extends AbstractCachingDiffFunction { + + public void setPrior(LogPrior prior) { + this.prior = prior; + } + + protected LogPrior prior; + + protected int numFeatures = 0; + protected int numClasses = 0; + + protected int[][] data = null; + protected int[] labels = null; + + private double[][] confusionMatrix; + + @Override + public int domainDimension() { + return numFeatures * numClasses; + } + + int classOf(int index) { + return index % numClasses; + } + + int featureOf(int index) { + return index / numClasses; + } + + protected int indexOf(int f, int c) { + return f * numClasses + c; + } + + public double[][] to2D(double[] x) { + double[][] x2 = new double[numFeatures][numClasses]; + for (int i = 0; i < numFeatures; i++) { + for (int j = 0; j < numClasses; j++) { + x2[i][j] = x[indexOf(i, j)]; + } + } + return x2; + } + + @Override + protected void calculate(double[] x) { + + if (derivative == null) { + derivative = new double[x.length]; + } else { + Arrays.fill(derivative, 0.0); + } + + value = 0.0; + + double[] sums = new double[numClasses]; + double[] probs = new double[numClasses]; + double[] weightedProbs = new double[numClasses]; + + for (int d = 0; d < data.length; d++) { + int[] features = data[d]; + int observedLabel = labels[d]; + // activation + Arrays.fill(sums, 0.0); + + for (int c = 0; c < numClasses; c++) { + for (int f = 0; f < features.length; f++) { + int i = indexOf(features[f], c); + sums[c] += x[i]; + } + } + + double total = ArrayMath.logSum(sums); + + double[] weightedSums = new double[numClasses]; + for (int trueLabel = 0; trueLabel < numClasses; trueLabel++) { + weightedSums[trueLabel] = Math.log(confusionMatrix[observedLabel][trueLabel]) + sums[trueLabel]; + } + + double weightedTotal = ArrayMath.logSum(weightedSums); + + for (int c = 0; c < numClasses; c++) { + probs[c] = Math.exp(sums[c] - total); + weightedProbs[c] = Math.exp(weightedSums[c] - 
weightedTotal); + for (int f = 0; f < features.length; f++) { + int i = indexOf(features[f], c); + derivative[i] += probs[c] - weightedProbs[c]; + } + } + + double tmpValue = 0.0; + for (int c = 0; c < numClasses; c++) { + tmpValue += confusionMatrix[observedLabel][c] * Math.exp(sums[c] - total); + } + value -= Math.log(tmpValue); + } + + value += prior.compute(x, derivative); + + } + + + + public BiasedLogConditionalObjectiveFunction(GeneralDataset dataset, double[][] confusionMatrix) { + this(dataset, confusionMatrix, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public BiasedLogConditionalObjectiveFunction(GeneralDataset dataset, double[][] confusionMatrix, LogPrior prior) { + this(dataset.numFeatures(), dataset.numClasses(), dataset.getDataArray(), dataset.getLabelsArray(), confusionMatrix, prior); + } + + public BiasedLogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, double[][] confusionMatrix) { + this(numFeatures, numClasses, data, labels, confusionMatrix, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public BiasedLogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, double[][] confusionMatrix, LogPrior prior) { + this.numFeatures = numFeatures; + this.numClasses = numClasses; + this.data = data; + this.labels = labels; + this.prior = prior; + this.confusionMatrix = confusionMatrix; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/BiasedLogisticObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/BiasedLogisticObjectiveFunction.java new file mode 100644 index 0000000..6ef7fc5 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/BiasedLogisticObjectiveFunction.java @@ -0,0 +1,147 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; + +import java.util.Arrays; + 
+ +/** + * @author jrfinkel + */ +public class BiasedLogisticObjectiveFunction extends AbstractCachingDiffFunction { + + private final int numFeatures; + private final int[][] data; + private final double[][] dataValues; + private final int[] labels; + protected float[] dataweights = null; + private final LogPrior prior; + double probCorrect = 0.7; + + @Override + public int domainDimension() { + return numFeatures; + } + + @Override + protected void calculate(double[] x) { + + if (dataValues != null) { + throw new RuntimeException(); + } + + value = 0.0; + Arrays.fill(derivative, 0.0); + + for (int d = 0; d < data.length; d++) { + int[] features = data[d]; + double sum = 0; + + for (int f = 0; f < features.length; f++) { + sum += x[features[f]]; + } + + double expSum, derivativeIncrement; + + if (dataweights != null) { + throw new RuntimeException(); + } + + if (labels[d] == 1) { + expSum = Math.exp(-sum); + double g = (1 / (1 + expSum)); + value -= Math.log(g); + derivativeIncrement = (g-1); + } else { +// expSum = Math.exp(-sum); +// double g = (1 / (1 + expSum)); +// value -= Math.log(1-g); +// derivativeIncrement = (g); +// } + expSum = Math.exp(-sum); + double g = (1 / (1 + expSum)); + double e = (1-probCorrect) * g + (probCorrect)*(1 - g); + value -= Math.log(e); + derivativeIncrement = -(g*(1-g)*(1-2*probCorrect)) / (e); + } + + for (int f = 0; f < features.length; f++) { + derivative[features[f]] += derivativeIncrement; + } + } + + value += prior.compute(x, derivative); + } + + protected void calculateRVF(double[] x) { + + value = 0.0; + Arrays.fill(derivative, 0.0); + + for (int d = 0; d < data.length; d++) { + int[] features = data[d]; + double[] values = dataValues[d]; + double sum = 0; + + for (int f = 0; f < features.length; f++) { + sum += x[features[f]]*values[features[f]]; + } + + double expSum, derivativeIncrement; + + if (labels[d] == 0) { + expSum = Math.exp(sum); + derivativeIncrement = 1.0 / (1.0 + (1.0 / expSum)); + } else { + expSum = 
Math.exp(-sum); + derivativeIncrement = -1.0 / (1.0 + (1.0 / expSum)); + } + + if (dataweights == null) { + value += Math.log(1.0 + expSum); + } else { + value += Math.log(1.0 + expSum) * dataweights[d]; + derivativeIncrement *= dataweights[d]; + } + + for (int f = 0; f < features.length; f++) { + derivative[features[f]] += values[features[f]]*derivativeIncrement; + } + } + + value += prior.compute(x, derivative); + } + + + public BiasedLogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels) { + this(numFeatures, data, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public BiasedLogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels, LogPrior prior) { + this(numFeatures, data, labels, prior, null); + } + + public BiasedLogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels, float[] dataweights) { + this(numFeatures, data, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC), dataweights); + } + public BiasedLogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels, LogPrior prior, float[] dataweights) { + this(numFeatures, data, null, labels, prior, dataweights); + } + + public BiasedLogisticObjectiveFunction(int numFeatures, int[][] data, double[][] values, int[] labels) { + this(numFeatures, data, values, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public BiasedLogisticObjectiveFunction(int numFeatures, int[][] data, double[][] values, int[] labels, LogPrior prior) { + this(numFeatures, data, values, labels, prior, null); + } + + public BiasedLogisticObjectiveFunction(int numFeatures, int[][] data, double[][] values, int[] labels, LogPrior prior, float[] dataweights) { + this.numFeatures = numFeatures; + this.data = data; + this.labels = labels; + this.prior = prior; + this.dataweights = dataweights; + this.dataValues = values; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/Classifier.java 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/Classifier.java new file mode 100644 index 0000000..4998323 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/Classifier.java @@ -0,0 +1,28 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.stats.Counter; + +import java.io.Serializable; +import java.util.Collection; + +/** + * A simple interface for classifying and scoring data points, implemented + * by most of the classifiers in this package. A basic Classifier + * works over a List of categorical features. For classifiers over + * real-valued features, see {@link RVFClassifier}. + * + * @author Dan Klein + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * + * @param The type of the label(s) in each Datum + * @param The type of the features in each Datum + */ + +public interface Classifier extends Serializable { + public L classOf(Datum example); + + public Counter scoresOf(Datum example); + + public Collection labels(); +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ClassifierCreator.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ClassifierCreator.java new file mode 100644 index 0000000..d58905b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ClassifierCreator.java @@ -0,0 +1,10 @@ +package edu.stanford.nlp.classify; + +/** + * Creates a classifier with given weights + * + * @author Angel Chang + */ +public interface ClassifierCreator { + public Classifier createClassifier(double[] weights); +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ClassifierFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ClassifierFactory.java new file mode 100644 index 0000000..4d4cb38 --- /dev/null +++ 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ClassifierFactory.java @@ -0,0 +1,24 @@ +package edu.stanford.nlp.classify; + +import java.io.Serializable; +import java.util.List; + +import edu.stanford.nlp.ling.RVFDatum; + +/** + * A simple interface for training a Classifier from a Dataset of training + * examples. + * + * @author Dan Klein + * + * Templatized by Sarah Spikes (sdspikes@cs.stanford.edu) + */ + +public interface ClassifierFactory> extends Serializable { + + @Deprecated //ClassifierFactory should implement trainClassifier(GeneralDataset) instead. + public C trainClassifier(List> examples); + + public C trainClassifier(GeneralDataset dataset); + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/CrossValidator.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/CrossValidator.java new file mode 100644 index 0000000..110d98a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/CrossValidator.java @@ -0,0 +1,94 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Triple; +import edu.stanford.nlp.util.Function; + +import java.util.Iterator; + +/** + * This class is meant to simplify performing cross validation of + * classifiers for hyper-parameters. It has the ability to save + * state for each fold (for instance, the weights for a MaxEnt + * classifier, and the alphas for an SVM). 
+ * + * @author Aria Haghighi + * @author Jenny Finkel + * @author Sarah Spikes (Templatization) + */ +public class CrossValidator { + private final GeneralDataset originalTrainData; + private final int kFold; + private final SavedState[] savedStates; + + public CrossValidator(GeneralDataset trainData) { + this(trainData, 10); + } + + public CrossValidator(GeneralDataset trainData, int kFold) { + originalTrainData = trainData; + this.kFold = kFold; + savedStates = new SavedState[kFold]; + for (int i = 0; i < savedStates.length; i++) { + savedStates[i] = new SavedState(); + } + } + + /** + * Returns an Iterator over train/test/saved states. + * + * @return An Iterator over train/test/saved states + */ + private Iterator,GeneralDataset,SavedState>> iterator() { return new CrossValidationIterator(); } + + /** + * This computes the average over all folds of the function we're trying to optimize. + * The input triple contains, in order, the train set, the test set, and the saved state. + * You don't have to use the saved state if you don't want to. 
+ */ + public double computeAverage (Function,GeneralDataset,SavedState>,Double> function) + { + double sum = 0; + Iterator,GeneralDataset,SavedState>> foldIt = iterator(); + while (foldIt.hasNext()) { + sum += function.apply(foldIt.next()); + } + return sum / kFold; + } + + class CrossValidationIterator implements Iterator,GeneralDataset,SavedState>> + { + int iter = 0; + public boolean hasNext() { return iter < kFold; } + + public void remove() + { + throw new RuntimeException("CrossValidationIterator doesn't support remove()"); + } + + public Triple,GeneralDataset,SavedState> next() + { + if (iter == kFold) return null; + int start = originalTrainData.size() * iter / kFold; + int end = originalTrainData.size() * (iter + 1) / kFold; + //System.err.println("##train data size: " + originalTrainData.size() + " start " + start + " end " + end); + Pair, GeneralDataset> split = originalTrainData.split(start, end); + + return new Triple,GeneralDataset,SavedState>(split.first(),split.second(),savedStates[iter++]); + } + } + + public static class SavedState { + public Object state; + } + + public static void main(String[] args) { + Dataset d = Dataset.readSVMLightFormat(args[0]); + Iterator,GeneralDataset,SavedState>> it = (new CrossValidator(d)).iterator(); + while (it.hasNext()) + { + it.next(); + break; + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/Dataset.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/Dataset.java new file mode 100644 index 0000000..3786a54 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/Dataset.java @@ -0,0 +1,758 @@ +package edu.stanford.nlp.classify; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import 
java.util.Random; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.ling.BasicDatum; +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.stats.TwoDimensionalCounter; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.HashIndex; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.ScoredComparator; +import edu.stanford.nlp.util.ScoredObject; + + +/** + * An interfacing class for {@link ClassifierFactory} that incrementally + * builds a more memory-efficient representation of a {@link List} of + * {@link Datum} objects for the purposes of training a {@link Classifier} + * with a {@link ClassifierFactory}. + * + * @author Roger Levy (rog@stanford.edu) + * @author Anna Rafferty (various refactoring with GeneralDataset/RVFDataset) + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (templatization) + * @author nmramesh@cs.stanford.edu {@link #getL1NormalizedTFIDFDatum(Datum, Counter) and #getL1NormalizedTFIDFDataset()} + * + * @param Label type + * @param Feature type + */ +public class Dataset extends GeneralDataset { + + private static final long serialVersionUID = -3883164942879961091L; + + public Dataset() { + this(10); + } + + public Dataset(int numDatums) { + initialize(numDatums); + } + + public Dataset(int numDatums, Index featureIndex, Index labelIndex) { + initialize(numDatums); + this.featureIndex = featureIndex; + this.labelIndex = labelIndex; + } + + public Dataset(Index featureIndex, Index labelIndex) { + this(10, featureIndex, labelIndex); + } + + + /** + * Constructor that fully specifies a Dataset. Needed this for MulticlassDataset. 
+ */ + public Dataset(Index labelIndex, int[] labels, Index featureIndex, int[][] data) { + this (labelIndex, labels, featureIndex, data, data.length); + } + + /** + * Constructor that fully specifies a Dataset. Needed this for MulticlassDataset. + */ + public Dataset(Index labelIndex, int[] labels, Index featureIndex, int[][] data, int size) { + this.labelIndex = labelIndex; + this.labels = labels; + this.featureIndex = featureIndex; + this.data = data; + this.size = size; + } + + @Override + public Pair, GeneralDataset> split(double percentDev) { + int devSize = (int)(percentDev * size()); + int trainSize = size() - devSize; + + int[][] devData = new int[devSize][]; + int[] devLabels = new int[devSize]; + + int[][] trainData = new int[trainSize][]; + int[] trainLabels = new int[trainSize]; + + System.arraycopy(data, 0, devData, 0, devSize); + System.arraycopy(labels, 0, devLabels, 0, devSize); + + System.arraycopy(data, devSize, trainData, 0, trainSize); + System.arraycopy(labels, devSize, trainLabels, 0, trainSize); + + if (this instanceof WeightedDataset) { + float[] trainWeights = new float[trainSize]; + float[] devWeights = new float[devSize]; + + WeightedDataset w = (WeightedDataset)this; + + System.arraycopy(w.weights, 0, devWeights, 0, devSize); + System.arraycopy(w.weights, devSize, trainWeights, 0, trainSize); + + WeightedDataset dev = new WeightedDataset(labelIndex, devLabels, featureIndex, devData, devSize, devWeights); + WeightedDataset train = new WeightedDataset(labelIndex, trainLabels, featureIndex, trainData, trainSize, trainWeights); + + return new Pair,GeneralDataset>(train, dev); + } + Dataset dev = new Dataset(labelIndex, devLabels, featureIndex, devData, devSize); + Dataset train = new Dataset(labelIndex, trainLabels, featureIndex, trainData, trainSize); + + return new Pair,GeneralDataset>(train, dev); + } + + @Override + public Pair,GeneralDataset> split(int start, int end) { + int devSize = end - start; + int trainSize = size() - devSize; + 
+ int[][] devData = new int[devSize][]; + int[] devLabels = new int[devSize]; + + int[][] trainData = new int[trainSize][]; + int[] trainLabels = new int[trainSize]; + + System.arraycopy(data, start, devData, 0, devSize); + System.arraycopy(labels, start, devLabels, 0, devSize); + + System.arraycopy(data, 0, trainData, 0, start); + System.arraycopy(data, end, trainData, start, size()-end); + System.arraycopy(labels, 0, trainLabels, 0, start); + System.arraycopy(labels, end, trainLabels, start, size()-end); + + if (this instanceof WeightedDataset) { + float[] trainWeights = new float[trainSize]; + float[] devWeights = new float[devSize]; + + WeightedDataset w = (WeightedDataset)this; + + System.arraycopy(w.weights, start, devWeights, 0, devSize); + System.arraycopy(w.weights, 0, trainWeights, 0, start); + System.arraycopy(w.weights, end, trainWeights, start, size()-end); + + WeightedDataset dev = new WeightedDataset(labelIndex, devLabels, featureIndex, devData, devSize, devWeights); + WeightedDataset train = new WeightedDataset(labelIndex, trainLabels, featureIndex, trainData, trainSize, trainWeights); + + return new Pair,GeneralDataset>(train, dev); + } + Dataset dev = new Dataset(labelIndex, devLabels, featureIndex, devData, devSize); + Dataset train = new Dataset(labelIndex, trainLabels, featureIndex, trainData, trainSize); + + return new Pair,GeneralDataset>(train, dev); + } + + + public Dataset getRandomSubDataset(double p, int seed) { + int newSize = (int)(p * size()); + Set indicesToKeep = Generics.newHashSet(); + Random r = new Random(seed); + int s = size(); + while (indicesToKeep.size() < newSize) { + indicesToKeep.add(r.nextInt(s)); + } + + int[][] newData = new int[newSize][]; + int[] newLabels = new int[newSize]; + + int i = 0; + for (int j : indicesToKeep) { + newData[i] = data[j]; + newLabels[i] = labels[j]; + i++; + } + + return new Dataset(labelIndex, newLabels, featureIndex, newData); + } + + @Override + public double[][] getValuesArray() { + 
return null; + } + + /** + * Constructs a Dataset by reading in a file in SVM light format. + */ + public static Dataset readSVMLightFormat(String filename) { + return readSVMLightFormat(filename, new HashIndex(), new HashIndex()); + } + + /** + * Constructs a Dataset by reading in a file in SVM light format. + * The lines parameter is filled with the lines of the file for further processing + * (if lines is null, it is assumed no line information is desired) + */ + public static Dataset readSVMLightFormat(String filename, List lines) { + return readSVMLightFormat(filename, new HashIndex(), new HashIndex(), lines); + } + + /** + * Constructs a Dataset by reading in a file in SVM light format. + * the created dataset has the same feature and label index as given + */ + public static Dataset readSVMLightFormat(String filename, Index featureIndex, Index labelIndex) { + return readSVMLightFormat(filename, featureIndex, labelIndex, null); + } + /** + * Constructs a Dataset by reading in a file in SVM light format. 
+ * the created dataset has the same feature and label index as given + */ + public static Dataset readSVMLightFormat(String filename, Index featureIndex, Index labelIndex, List lines) { + Dataset dataset; + try { + dataset = new Dataset(10, featureIndex, labelIndex); + for (String line : ObjectBank.getLineIterator(new File(filename))) { + if(lines != null) + lines.add(line); + dataset.add(svmLightLineToDatum(line)); + } + + } catch (Exception e) { + throw new RuntimeException(e); + } + return dataset; + } + + private static int line1 = 0; + + public static Datum svmLightLineToDatum(String l) { + line1++; + l = l.replaceAll("#.*", ""); // remove any trailing comments + String[] line = l.split("\\s+"); + Collection features = new ArrayList(); + for (int i = 1; i < line.length; i++) { + String[] f = line[i].split(":"); + if (f.length != 2) { + System.err.println("Dataset error: line " + line1); + } + int val = (int) Double.parseDouble(f[1]); + for (int j = 0; j < val; j++) { + features.add(f[0]); + } + } + features.add(String.valueOf(Integer.MAX_VALUE)); // a constant feature for a class + Datum d = new BasicDatum(features, line[0]); + return d; + } + + /** + * Get Number of datums a given feature appears in. + */ + public Counter getFeatureCounter() + { + Counter featureCounts = new ClassicCounter(); + for (int i=0; i < this.size(); i++) + { + BasicDatum datum = (BasicDatum) getDatum(i); + Set featureSet = Generics.newHashSet(datum.asFeatures()); + for (F key : featureSet) { + featureCounts.incrementCount(key, 1.0); + } + } + return featureCounts; + } + + /** + * Method to convert features from counts to L1-normalized TFIDF based features + * @param datum with a collection of features. + * @param featureDocCounts a counter of doc-count for each feature. + * @return RVFDatum with l1-normalized tf-idf features. 
+ */ + public RVFDatum getL1NormalizedTFIDFDatum(Datum datum,Counter featureDocCounts){ + Counter tfidfFeatures = new ClassicCounter(); + for(F feature : datum.asFeatures()){ + if(featureDocCounts.containsKey(feature)) + tfidfFeatures.incrementCount(feature,1.0); + } + double l1norm = 0; + for(F feature: tfidfFeatures.keySet()){ + double idf = Math.log(((double)(this.size()+1))/(featureDocCounts.getCount(feature)+0.5)); + double tf = tfidfFeatures.getCount(feature); + tfidfFeatures.setCount(feature, tf*idf); + l1norm += tf*idf; + } + for(F feature: tfidfFeatures.keySet()){ + double tfidf = tfidfFeatures.getCount(feature); + tfidfFeatures.setCount(feature, tfidf/l1norm); + } + RVFDatum rvfDatum = new RVFDatum(tfidfFeatures,datum.label()); + return rvfDatum; + } + + /** + * Method to convert this dataset to RVFDataset using L1-normalized TF-IDF features + * @return RVFDataset + */ + public RVFDataset getL1NormalizedTFIDFDataset(){ + RVFDataset rvfDataset = new RVFDataset(this.size(),this.featureIndex,this.labelIndex); + Counter featureDocCounts = getFeatureCounter(); + for(int i = 0; i < this.size(); i++){ + Datum datum = this.getDatum(i); + RVFDatum rvfDatum = getL1NormalizedTFIDFDatum(datum,featureDocCounts); + rvfDataset.add(rvfDatum); + } + return rvfDataset; + } + + @Override + public void add(Datum d) { + add(d.asFeatures(), d.label()); + } + + public void add(Collection features, L label) { + add(features, label, true); + } + + public void add(Collection features, L label, boolean addNewFeatures) { + ensureSize(); + addLabel(label); + addFeatures(features, addNewFeatures); + size++; + } + + /** + * Adds a datums defined by feature indices and label index + * Careful with this one! Make sure that all indices are valid! 
+ * @param features + * @param label + */ + public void add(int [] features, int label) { + ensureSize(); + addLabelIndex(label); + addFeatureIndices(features); + size++; + } + + protected void ensureSize() { + if (labels.length == size) { + int[] newLabels = new int[size * 2]; + System.arraycopy(labels, 0, newLabels, 0, size); + labels = newLabels; + int[][] newData = new int[size * 2][]; + System.arraycopy(data, 0, newData, 0, size); + data = newData; + } + } + + protected void addLabel(L label) { + labelIndex.add(label); + labels[size] = labelIndex.indexOf(label); + } + + protected void addLabelIndex(int label) { + labels[size] = label; + } + + protected void addFeatures(Collection features) { + addFeatures(features, true); + } + + protected void addFeatures(Collection features, boolean addNewFeatures) { + int[] intFeatures = new int[features.size()]; + int j = 0; + for (F feature : features) { + if(addNewFeatures) featureIndex.add(feature); + int index = featureIndex.indexOf(feature); + if (index >= 0) { + intFeatures[j] = featureIndex.indexOf(feature); + j++; + } + } + data[size] = new int[j]; + System.arraycopy(intFeatures, 0, data[size], 0, j); + } + + protected void addFeatureIndices(int [] features) { + data[size] = features; + } + + @Override + protected final void initialize(int numDatums) { + labelIndex = new HashIndex(); + featureIndex = new HashIndex(); + labels = new int[numDatums]; + data = new int[numDatums][]; + size = 0; + } + + /** + * @return the index-ed datum + */ + @Override + public Datum getDatum(int index) { + return new BasicDatum(featureIndex.objects(data[index]), labelIndex.get(labels[index])); + } + + /** + * @return the index-ed datum + */ + @Override + public RVFDatum getRVFDatum(int index) { + ClassicCounter c = new ClassicCounter(); + for (F key : featureIndex.objects(data[index])) { + c.incrementCount(key); + } + return new RVFDatum(c, labelIndex.get(labels[index])); + } + + /** + * Prints some summary statistics to stderr for 
the Dataset. + */ + @Override + public void summaryStatistics() { + System.err.println(toSummaryStatistics()); + } + + public String toSummaryStatistics() { + StringBuilder sb = new StringBuilder(); + sb.append("numDatums: ").append(size).append('\n'); + sb.append("numLabels: ").append(labelIndex.size()).append(" ["); + Iterator iter = labelIndex.iterator(); + while (iter.hasNext()) { + sb.append(iter.next()); + if (iter.hasNext()) { + sb.append(", "); + } + } + sb.append("]\n"); + sb.append("numFeatures (Phi(X) types): ").append(featureIndex.size()).append('\n'); + // List l = new ArrayList(featureIndex); +// Collections.sort(l); +// sb.append(l); + return sb.toString(); + } + + + /** + * Applies feature count thresholds to the Dataset. + * Only features that match pattern_i and occur at + * least threshold_i times (for some i) are kept. + * + * @param thresholds a list of pattern, threshold pairs + */ + public void applyFeatureCountThreshold(List> thresholds) { + + // get feature counts + float[] counts = getFeatureCounts(); + + // build a new featureIndex + Index newFeatureIndex = new HashIndex(); + LOOP: + for (F f : featureIndex) { + for (Pair threshold : thresholds) { + Pattern p = threshold.first(); + Matcher m = p.matcher(f.toString()); + if (m.matches()) { + if (counts[featureIndex.indexOf(f)] >= threshold.second) { + newFeatureIndex.add(f); + } + continue LOOP; + } + } + // we only get here if it didn't match anything on the list + newFeatureIndex.add(f); + } + + counts = null; + + int[] featMap = new int[featureIndex.size()]; + for (int i = 0; i < featMap.length; i++) { + featMap[i] = newFeatureIndex.indexOf(featureIndex.get(i)); + } + + featureIndex = null; + + for (int i = 0; i < size; i++) { + List featList = new ArrayList(data[i].length); + for (int j = 0; j < data[i].length; j++) { + if (featMap[data[i][j]] >= 0) { + featList.add(featMap[data[i][j]]); + } + } + data[i] = new int[featList.size()]; + for (int j = 0; j < data[i].length; j++) { + 
data[i][j] = featList.get(j); + } + } + + featureIndex = newFeatureIndex; + } + + + /** + * prints the full feature matrix in tab-delimited form. These can be BIG + * matrices, so be careful! + */ + public void printFullFeatureMatrix(PrintWriter pw) { + String sep = "\t"; + for (int i = 0; i < featureIndex.size(); i++) { + pw.print(sep + featureIndex.get(i)); + } + pw.println(); + for (int i = 0; i < labels.length; i++) { + pw.print(labelIndex.get(i)); + Set feats = Generics.newHashSet(); + for (int j = 0; j < data[i].length; j++) { + int feature = data[i][j]; + feats.add(Integer.valueOf(feature)); + } + for (int j = 0; j < featureIndex.size(); j++) { + if (feats.contains(Integer.valueOf(j))) { + pw.print(sep + '1'); + } else { + pw.print(sep + '0'); + } + } + } + } + + /** + * prints the sparse feature matrix using {@link #printSparseFeatureMatrix()} + * to {@link System#out System.out}. + */ + public void printSparseFeatureMatrix() { + printSparseFeatureMatrix(new PrintWriter(System.out, true)); + } + + /** + * prints a sparse feature matrix representation of the Dataset. Prints the actual + * {@link Object#toString()} representations of features. 
+ */ + public void printSparseFeatureMatrix(PrintWriter pw) { + String sep = "\t"; + for (int i = 0; i < size; i++) { + pw.print(labelIndex.get(labels[i])); + int[] datum = data[i]; + for (int j : datum) { + pw.print(sep + featureIndex.get(j)); + } + pw.println(); + } + } + + + public void changeLabelIndex(Index newLabelIndex) { + + labels = trimToSize(labels); + + for (int i = 0; i < labels.length; i++) { + labels[i] = newLabelIndex.indexOf(labelIndex.get(labels[i])); + } + labelIndex = newLabelIndex; + } + + public void changeFeatureIndex(Index newFeatureIndex) { + + data = trimToSize(data); + labels = trimToSize(labels); + + int[][] newData = new int[data.length][]; + for (int i = 0; i < data.length; i++) { + int[] newD = new int[data[i].length]; + int k = 0; + for (int j = 0; j < data[i].length; j++) { + int newIndex = newFeatureIndex.indexOf(featureIndex.get(data[i][j])); + if (newIndex >= 0) { + newD[k++] = newIndex; + } + } + newData[i] = new int[k]; + System.arraycopy(newD, 0, newData[i], 0, k); + } + data = newData; + featureIndex = newFeatureIndex; + } + + public void selectFeaturesBinaryInformationGain(int numFeatures) { + double[] scores = getInformationGains(); + selectFeatures(numFeatures,scores); + } + + /** + * Generic method to select features based on the feature scores vector provided as an argument. + * @param numFeatures number of features to be selected. + * @param scores a vector of size total number of features in the data. 
+ */ + public void selectFeatures(int numFeatures, double[] scores) { + + List> scoredFeatures = new ArrayList>(); + + for (int i = 0; i < scores.length; i++) { + scoredFeatures.add(new ScoredObject(featureIndex.get(i), scores[i])); + } + + Collections.sort(scoredFeatures, ScoredComparator.DESCENDING_COMPARATOR); + Index newFeatureIndex = new HashIndex(); + for (int i = 0; i < scoredFeatures.size() && i < numFeatures; i++) { + newFeatureIndex.add(scoredFeatures.get(i).object()); + //System.err.println(scoredFeatures.get(i)); + } + + for (int i = 0; i < size; i++) { + int[] newData = new int[data[i].length]; + int curIndex = 0; + for (int j = 0; j < data[i].length; j++) { + int index; + if ((index = newFeatureIndex.indexOf(featureIndex.get(data[i][j]))) != -1) { + newData[curIndex++] = index; + } + } + int[] newDataTrimmed = new int[curIndex]; + System.arraycopy(newData, 0, newDataTrimmed, 0, curIndex); + data[i] = newDataTrimmed; + } + featureIndex = newFeatureIndex; + } + + + public double[] getInformationGains() { + + data = trimToSize(data); + labels = trimToSize(labels); + + // counts the number of times word X is present + ClassicCounter featureCounter = new ClassicCounter(); + + // counts the number of time a document has label Y + ClassicCounter labelCounter = new ClassicCounter(); + + // counts the number of times the document has label Y given word X is present + TwoDimensionalCounter condCounter = new TwoDimensionalCounter(); + + for (int i = 0; i < labels.length; i++) { + labelCounter.incrementCount(labelIndex.get(labels[i])); + + // convert the document to binary feature representation + boolean[] doc = new boolean[featureIndex.size()]; + //System.err.println(i); + for (int j = 0; j < data[i].length; j++) { + doc[data[i][j]] = true; + } + + for (int j = 0; j < doc.length; j++) { + if (doc[j]) { + featureCounter.incrementCount(featureIndex.get(j)); + condCounter.incrementCount(featureIndex.get(j), labelIndex.get(labels[i]), 1.0); + } + } + } + + double 
entropy = 0.0; + for (int i = 0; i < labelIndex.size(); i++) { + double labelCount = labelCounter.getCount(labelIndex.get(i)); + double p = labelCount / size(); + entropy -= p * (Math.log(p) / Math.log(2)); + } + + double[] ig = new double[featureIndex.size()]; + Arrays.fill(ig, entropy); + + for (int i = 0; i < featureIndex.size(); i++) { + F feature = featureIndex.get(i); + + double featureCount = featureCounter.getCount(feature); + double notFeatureCount = size() - featureCount; + + double pFeature = featureCount / size(); + double pNotFeature = (1.0 - pFeature); + + if (featureCount == 0) { ig[i] = 0; continue; } + if (notFeatureCount == 0) { ig[i] = 0; continue; } + + double sumFeature = 0.0; + double sumNotFeature = 0.0; + + for (int j = 0; j < labelIndex.size(); j++) { + L label = labelIndex.get(j); + + double featureLabelCount = condCounter.getCount(feature, label); + double notFeatureLabelCount = size() - featureLabelCount; + + // yes, these dont sum to 1. that is correct. + // one is the prob of the label, given that the + // feature is present, and the other is the prob + // of the label given that the feature is absent + double p = featureLabelCount / featureCount; + double pNot = notFeatureLabelCount / notFeatureCount; + + if (featureLabelCount != 0) { + sumFeature += p * (Math.log(p) / Math.log(2)); + } + + if (notFeatureLabelCount != 0) { + sumNotFeature += pNot * (Math.log(pNot) / Math.log(2)); + } + //System.out.println(pNot+" "+(Math.log(pNot)/Math.log(2))); + + } + + //System.err.println(pFeature+" * "+sumFeature+" = +"+); + //System.err.println("^ "+pNotFeature+" "+sumNotFeature); + + ig[i] += pFeature*sumFeature + pNotFeature*sumNotFeature; + /* earlier the line above used to be: ig[i] = pFeature*sumFeature + pNotFeature*sumNotFeature; + * This completely ignored the entropy term computed above. So added the "+=" to take that into account. 
+ * -Ramesh (nmramesh@cs.stanford.edu) + */ + } + return ig; + } + + public void updateLabels(int[] labels) { + if (labels.length != size()) + throw new IllegalArgumentException( + "size of labels array does not match dataset size"); + + this.labels = labels; + } + + @Override + public String toString() { + return "Dataset of size " + size; + } + + public String toSummaryString() { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + pw.println("Number of data points: " + size()); + pw.println("Number of active feature tokens: " + numFeatureTokens()); + pw.println("Number of active feature types:" + numFeatureTypes()); + return pw.toString(); + } + + /** + * Need to sort the counter by feature keys and dump it + * + */ + public static void printSVMLightFormat(PrintWriter pw, ClassicCounter c, int classNo) { + Integer[] features = c.keySet().toArray(new Integer[c.keySet().size()]); + Arrays.sort(features); + StringBuilder sb = new StringBuilder(); + sb.append(classNo); + sb.append(' '); + for (int f: features) { + sb.append(f + 1).append(':').append(c.getCount(f)).append(' '); + } + pw.println(sb.toString()); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/GeneralDataset.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/GeneralDataset.java new file mode 100644 index 0000000..1d368ce --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/GeneralDataset.java @@ -0,0 +1,490 @@ +package edu.stanford.nlp.classify; + +import java.io.PrintWriter; +import java.io.Serializable; +import java.util.*; + +import edu.stanford.nlp.ling.BasicDatum; +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.HashIndex; +import 
edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.Pair; + +/** + * The purpose of this interface is to unify {@link Dataset} and {@link RVFDataset}. + * + * @author Kristina Toutanova (kristina@cs.stanford.edu) + * @author Anna Rafferty (various refactoring with subclasses) + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * @author Ramesh Nallapati (nmramesh@cs.stanford.edu) + * (added an abstract method getDatum, July 17th, 2008) + * + * @param The type of the labels in the Dataset + * @param The type of the features in the Dataset + */ +public abstract class GeneralDataset implements Serializable, Iterable> { + + private static final long serialVersionUID = 19157757130054829L; + + public Index labelIndex; + public Index featureIndex; + + protected int[] labels; + protected int[][] data; + + protected int size; + + public GeneralDataset() { } + + public Index labelIndex() { return labelIndex; } + + public Index featureIndex() { return featureIndex; } + + public int numFeatures() { return featureIndex.size(); } + + public int numClasses() { return labelIndex.size(); } + + public int[] getLabelsArray() { + labels = trimToSize(labels); + return labels; + } + + public int[][] getDataArray() { + data = trimToSize(data); + return data; + } + + public abstract double[][] getValuesArray(); + + /** + * Resets the Dataset so that it is empty and ready to collect data. + */ + public void clear() { + clear(10); + } + + /** + * Resets the Dataset so that it is empty and ready to collect data. + * @param numDatums initial capacity of dataset + */ + public void clear(int numDatums) { + initialize(numDatums); + } + + /** + * This method takes care of resetting values of the dataset + * such that it is empty with an initial capacity of numDatums. + * Should be accessed only by appropriate methods within the class, + * such as clear(), which take care of other parts of the emptying of data. 
+ * + * @param numDatums initial capacity of dataset + */ + protected abstract void initialize(int numDatums); + + + public abstract RVFDatum getRVFDatum(int index); + + public abstract Datum getDatum(int index); + + + public abstract void add(Datum d); + + /** + * Get the total count (over all data instances) of each feature + * + * @return an array containing the counts (indexed by index) + */ + public float[] getFeatureCounts() { + float[] counts = new float[featureIndex.size()]; + for (int i = 0, m = size; i < m; i++) { + for (int j = 0, n = data[i].length; j < n; j++) { + counts[data[i][j]] += 1.0; + } + } + return counts; + } + + /** + * Applies a feature count threshold to the Dataset. All features that + * occur fewer than k times are expunged. + */ + public void applyFeatureCountThreshold(int k) { + float[] counts = getFeatureCounts(); + Index newFeatureIndex = new HashIndex(); + + int[] featMap = new int[featureIndex.size()]; + for (int i = 0; i < featMap.length; i++) { + F feat = featureIndex.get(i); + if (counts[i] >= k) { + int newIndex = newFeatureIndex.size(); + newFeatureIndex.add(feat); + featMap[i] = newIndex; + } else { + featMap[i] = -1; + } + // featureIndex.remove(feat); + } + + featureIndex = newFeatureIndex; + // counts = null; // This is unnecessary; JVM can clean it up + + for (int i = 0; i < size; i++) { + List featList = new ArrayList(data[i].length); + for (int j = 0; j < data[i].length; j++) { + if (featMap[data[i][j]] >= 0) { + featList.add(featMap[data[i][j]]); + } + } + data[i] = new int[featList.size()]; + for (int j = 0; j < data[i].length; j++) { + data[i][j] = featList.get(j); + } + } + } + + + /** + * Applies a max feature count threshold to the Dataset. All features that + * occur greater than k times are expunged. 
+ */ + public void applyFeatureMaxCountThreshold(int k) { + float[] counts = getFeatureCounts(); + HashIndex newFeatureIndex = new HashIndex(); + + int[] featMap = new int[featureIndex.size()]; + for (int i = 0; i < featMap.length; i++) { + F feat = featureIndex.get(i); + if (counts[i] <= k) { + int newIndex = newFeatureIndex.size(); + newFeatureIndex.add(feat); + featMap[i] = newIndex; + } else { + featMap[i] = -1; + } + // featureIndex.remove(feat); + } + + featureIndex = newFeatureIndex; + // counts = null; // This is unnecessary; JVM can clean it up + + for (int i = 0; i < size; i++) { + List featList = new ArrayList(data[i].length); + for (int j = 0; j < data[i].length; j++) { + if (featMap[data[i][j]] >= 0) { + featList.add(featMap[data[i][j]]); + } + } + data[i] = new int[featList.size()]; + for (int j = 0; j < data[i].length; j++) { + data[i][j] = featList.get(j); + } + } + } + + + /** + * returns the number of feature tokens in the Dataset. + */ + public int numFeatureTokens() { + int x = 0; + for (int i = 0, m = size; i < m; i++) { + x += data[i].length; + } + return x; + } + + /** + * returns the number of distinct feature types in the Dataset. + */ + public int numFeatureTypes() { + return featureIndex.size(); + } + + + + /** + * Adds all Datums in the given collection of data to this dataset + * @param data collection of datums you would like to add to the dataset + */ + public void addAll(Iterable> data) { + for (Datum d : data) { + add(d); + } + } + + public abstract Pair, GeneralDataset> split (int start, int end) ; + public abstract Pair, GeneralDataset> split (double p) ; + + /** + * Returns the number of examples ({@link Datum}s) in the Dataset. 
+ */ + public int size() { return size; } + + protected void trimData() { + data = trimToSize(data); + } + + protected void trimLabels() { + labels = trimToSize(labels); + } + + protected int[] trimToSize(int[] i) { + int[] newI = new int[size]; + System.arraycopy(i, 0, newI, 0, size); + return newI; + } + + protected int[][] trimToSize(int[][] i) { + int[][] newI = new int[size][]; + System.arraycopy(i, 0, newI, 0, size); + return newI; + } + + protected double[][] trimToSize(double[][] i) { + double[][] newI = new double[size][]; + System.arraycopy(i, 0, newI, 0, size); + return newI; + } + + /** + * Randomizes the data array in place. + * Note: this cannot change the values array or the datum weights, + * so redefine this for RVFDataset and WeightedDataset! + * @param randomSeed + */ + public void randomize(int randomSeed) { + Random rand = new Random(randomSeed); + for(int j = size - 1; j > 0; j --){ + int randIndex = rand.nextInt(j); + + int [] tmp = data[randIndex]; + data[randIndex] = data[j]; + data[j] = tmp; + + int tmpl = labels[randIndex]; + labels[randIndex] = labels[j]; + labels[j] = tmpl; + } + } + + public GeneralDataset sampleDataset(int randomSeed, double sampleFrac, boolean sampleWithReplacement) { + int sampleSize = (int)(this.size()*sampleFrac); + Random rand = new Random(randomSeed); + GeneralDataset subset; + if(this instanceof RVFDataset) + subset = new RVFDataset(); + else if (this instanceof Dataset) { + subset = new Dataset(); + } + else { + throw new RuntimeException("Can't handle this type of GeneralDataset."); + } + if (sampleWithReplacement) { + for(int i = 0; i < sampleSize; i++){ + int datumNum = rand.nextInt(this.size()); + subset.add(this.getDatum(datumNum)); + } + } else { + Set indicedSampled = Generics.newHashSet(); + while (subset.size() < sampleSize) { + int datumNum = rand.nextInt(this.size()); + if (!indicedSampled.contains(datumNum)) { + subset.add(this.getDatum(datumNum)); + indicedSampled.add(datumNum); + } + } + } + 
return subset; + } + + /** + * Print some statistics summarizing the dataset + * + */ + public abstract void summaryStatistics(); + + /** + * Returns an iterator over the class labels of the Dataset + * + * @return An iterator over the class labels of the Dataset + */ + public Iterator labelIterator() { + return labelIndex.iterator(); + } + + + /** + * + * @param dataset + * @return a new GeneralDataset whose features and ids map exactly to those of this GeneralDataset. + * Useful when two Datasets are created independently and one wants to train a model on one dataset and test on the other. -Ramesh. + */ + public GeneralDataset mapDataset(GeneralDataset dataset){ + GeneralDataset newDataset; + if(dataset instanceof RVFDataset) + newDataset = new RVFDataset(this.featureIndex,this.labelIndex); + else newDataset = new Dataset(this.featureIndex,this.labelIndex); + this.featureIndex.lock(); + this.labelIndex.lock(); + //System.out.println("inside mapDataset: dataset size:"+dataset.size()); + for(int i = 0; i < dataset.size(); i++) + //System.out.println("inside mapDataset: adding datum number"+i); + newDataset.add(dataset.getDatum(i)); + + //System.out.println("old Dataset stats: numData:"+dataset.size()+" numfeatures:"+dataset.featureIndex().size()+" numlabels:"+dataset.labelIndex.size()); + //System.out.println("new Dataset stats: numData:"+newDataset.size()+" numfeatures:"+newDataset.featureIndex().size()+" numlabels:"+newDataset.labelIndex.size()); + //System.out.println("this dataset stats: numData:"+size()+" numfeatures:"+featureIndex().size()+" numlabels:"+labelIndex.size()); + + this.featureIndex.unlock(); + this.labelIndex.unlock(); + return newDataset; + } + + public static Datum mapDatum(Datum d, Map labelMapping, L2 defaultLabel) { + // TODO: How to copy datum? 
+ L2 newLabel = labelMapping.get(d.label()); + if (newLabel == null) { + newLabel = defaultLabel; + } + + if (d instanceof RVFDatum) { + return new RVFDatum( ((RVFDatum) d).asFeaturesCounter(), newLabel ); + } else { + return new BasicDatum( d.asFeatures(), newLabel ); + } + } + + + /** + * + * @param dataset + * @return a new GeneralDataset whose features and ids map exactly to those of this GeneralDataset. But labels are converted to be another set of labels + */ + public GeneralDataset mapDataset(GeneralDataset dataset, Index newLabelIndex, Map labelMapping, L2 defaultLabel) + { + GeneralDataset newDataset; + if(dataset instanceof RVFDataset) + newDataset = new RVFDataset(this.featureIndex, newLabelIndex); + else newDataset = new Dataset(this.featureIndex, newLabelIndex); + this.featureIndex.lock(); + this.labelIndex.lock(); + //System.out.println("inside mapDataset: dataset size:"+dataset.size()); + for(int i = 0; i < dataset.size(); i++) { + //System.out.println("inside mapDataset: adding datum number"+i); + Datum d = dataset.getDatum(i); + Datum d2 = mapDatum(d, labelMapping, defaultLabel); + newDataset.add(d2); + } + //System.out.println("old Dataset stats: numData:"+dataset.size()+" numfeatures:"+dataset.featureIndex().size()+" numlabels:"+dataset.labelIndex.size()); + //System.out.println("new Dataset stats: numData:"+newDataset.size()+" numfeatures:"+newDataset.featureIndex().size()+" numlabels:"+newDataset.labelIndex.size()); + //System.out.println("this dataset stats: numData:"+size()+" numfeatures:"+featureIndex().size()+" numlabels:"+labelIndex.size()); + + this.featureIndex.unlock(); + this.labelIndex.unlock(); + return newDataset; + } + + /** + * Dumps the Dataset as a training/test file for SVMLight.
    + * class [fno:val]+ + * The features must occur in consecutive order. + */ + public void printSVMLightFormat() { + printSVMLightFormat(new PrintWriter(System.out)); + } + + /** + * Maps our labels to labels that are compatible with svm_light + * @return array of strings + */ + public String[] makeSvmLabelMap() { + String[] labelMap = new String[numClasses()]; + if (numClasses() > 2) { + for (int i = 0; i < labelMap.length; i++) { + labelMap[i] = String.valueOf((i + 1)); + } + } else { + labelMap = new String[]{"+1", "-1"}; + } + return labelMap; + } + + // todo: Fix javadoc, have unit tested + /** + * Print SVM Light Format file. + * + * The following comments are no longer applicable because I am + * now printing out the exact labelID for each example. -Ramesh (nmramesh@cs.stanford.edu) 12/17/2009. + * + * If the Dataset has more than 2 classes, then it + * prints using the label index (+1) (for svm_struct). If it is 2 classes, then the labelIndex.get(0) + * is mapped to +1 and labelIndex.get(1) is mapped to -1 (for svm_light). + */ + + public void printSVMLightFormat(PrintWriter pw) { + //assumes each data item has a few features on, and sorts the feature keys while collecting the values in a counter + + // old comment: + // the following code commented out by Ramesh (nmramesh@cs.stanford.edu) 12/17/2009. + // why not simply print the exact id of the label instead of mapping to some values?? 
+ // new comment: + // mihai: we NEED this, because svm_light has special conventions not supported by default by our labels, + // e.g., in a multiclass setting it assumes that labels start at 1 whereas our labels start at 0 (08/31/2010) + String[] labelMap = makeSvmLabelMap(); + + for (int i = 0; i < size; i++) { + RVFDatum d = getRVFDatum(i); + Counter c = d.asFeaturesCounter(); + ClassicCounter printC = new ClassicCounter(); + for (F f : c.keySet()) { + printC.setCount(featureIndex.indexOf(f), c.getCount(f)); + } + Integer[] features = printC.keySet().toArray(new Integer[printC.keySet().size()]); + Arrays.sort(features); + StringBuilder sb = new StringBuilder(); + sb.append(labelMap[labels[i]]).append(' '); + // sb.append(labels[i]).append(' '); // commented out by mihai: labels[i] breaks svm_light conventions! + + /* Old code: assumes that F is Integer.... + * + for (int f: features) { + sb.append((f + 1)).append(":").append(c.getCount(f)).append(" "); + } + */ + //I think this is what was meant (using printC rather than c), but not sure + // ~Sarah Spikes (sdspikes@cs.stanford.edu) + for (int f: features) { + sb.append((f + 1)).append(':').append(printC.getCount(f)).append(' '); + } + pw.println(sb.toString()); + } + } + + + public Iterator> iterator() { + return new Iterator>() { + private int id; // = 0; + + public boolean hasNext() { + return id < size(); + } + + public RVFDatum next() { + if (id >= size()) { + throw new NoSuchElementException(); + } + return getRVFDatum(id++); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + }; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/GeneralizedExpectationObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/GeneralizedExpectationObjectiveFunction.java new file mode 100644 index 0000000..d5e4fc1 --- /dev/null +++ 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/GeneralizedExpectationObjectiveFunction.java @@ -0,0 +1,225 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Triple; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +/** + * Implementation of Generalized Expectation Objective function for + * an I.I.D. log-linear model. See Mann and McCallum, ACL 2008. + * IMPORTANT: the current implementation is only correct as long as + * the labeled features passed to GE are binary. + * However, other features are allowed to be real valued. + * The original paper also discusses GE only for binary features. + * + * @author Ramesh Nallapati (nmramesh@cs.stanford.edu) + */ + +public class GeneralizedExpectationObjectiveFunction extends AbstractCachingDiffFunction { + + private final GeneralDataset labeledDataset; + private final List> unlabeledDataList; + private final List geFeatures; + private final LinearClassifier classifier; + private double[][] geFeature2EmpiricalDist; //empirical label distributions of each feature. Really final but java won't let us. + private List> geFeature2DatumList; //an inverted list of active unlabeled documents for each feature. Really final but java won't let us. 
+ + private final int numFeatures; + private final int numClasses; + + + @Override + public int domainDimension() { + return numFeatures * numClasses; + } + + int classOf(int index) { + return index % numClasses; + } + + int featureOf(int index) { + return index / numClasses; + } + + protected int indexOf(int f, int c) { + return f * numClasses + c; + } + + public double[][] to2D(double[] x) { + double[][] x2 = new double[numFeatures][numClasses]; + for (int i = 0; i < numFeatures; i++) { + for (int j = 0; j < numClasses; j++) { + x2[i][j] = x[indexOf(i, j)]; + } + } + return x2; + } + + @Override + protected void calculate(double[] x) { + classifier.setWeights(to2D(x)); + if (derivative == null) { + derivative = new double[x.length]; + } else { + Arrays.fill(derivative, 0.0); + } + Counter> feature2classPairDerivatives = new ClassicCounter>(); + + value = 0.0; + for(int n = 0; n < geFeatures.size(); n++){ + //F feature = geFeatures.get(n); + double[] modelDist = new double[numClasses]; + Arrays.fill(modelDist,0); + + //go over the unlabeled active data to compute expectations + List activeData = geFeature2DatumList.get(n); + for (Integer activeDatum : activeData) { + Datum datum = unlabeledDataList.get(activeDatum); + double[] probs = getModelProbs(datum); + for (int c = 0; c < numClasses; c++) { + modelDist[c] += probs[c]; + } + updateDerivative(datum, probs, feature2classPairDerivatives); //computes p(y_d)*(1-p(y_d))*f_d for all active features. + } + + //now compute the value (KL-divergence) and the final value of the derivative. 
+ if (activeData.size()>0) { + for (int c = 0; c < numClasses; c++) { + modelDist[c]/= activeData.size(); + } + smoothDistribution(modelDist); + + for(int c = 0; c < numClasses; c++) + value += -geFeature2EmpiricalDist[n][c]*Math.log(modelDist[c]); + + for(int f = 0; f < labeledDataset.featureIndex().size(); f++) { + for(int c = 0; c < numClasses; c++) { + int wtIndex = indexOf(f,c); + for(int cPrime = 0; cPrime < numClasses; cPrime++){ + derivative[wtIndex] += feature2classPairDerivatives.getCount(new Triple(f,c,cPrime))*geFeature2EmpiricalDist[n][cPrime]/modelDist[cPrime]; + } + derivative[wtIndex] /= activeData.size(); + } + } // loop over each feature for derivative computation + } //end of if condition + } //loop over each GE feature + } + + + private void updateDerivative(Datum datum, double[] probs,Counter> feature2classPairDerivatives){ + for (F feature : datum.asFeatures()) { + int fID = labeledDataset.featureIndex.indexOf(feature); + if (fID >= 0) { + for (int c = 0; c < numClasses; c++) { + for (int cPrime = 0; cPrime < numClasses; cPrime++) { + if (cPrime == c) { + feature2classPairDerivatives.incrementCount(new Triple(fID,c,cPrime), - probs[c]*(1-probs[c])*valueOfFeature(feature,datum)); + } else { + feature2classPairDerivatives.incrementCount(new Triple(fID,c,cPrime), probs[c]*probs[cPrime]*valueOfFeature(feature,datum)); + } + } + } + } + } + } + + /* + * This method assumes the feature already exists in the datum. 
+ */ + private double valueOfFeature(F feature, Datum datum){ + if(datum instanceof RVFDatum) + return ((RVFDatum)datum).asFeaturesCounter().getCount(feature); + else return 1.0; + } + + private void computeEmpiricalStatistics(List geFeatures){ + //allocate memory to the containers and initialize them + geFeature2EmpiricalDist = new double[geFeatures.size()][labeledDataset.labelIndex.size()]; + geFeature2DatumList = new ArrayList>(geFeatures.size()); + Map geFeatureMap = Generics.newHashMap(); + Set activeUnlabeledExamples = Generics.newHashSet(); + for(int n = 0; n < geFeatures.size(); n++){ + F geFeature = geFeatures.get(n); + geFeature2DatumList.add(new ArrayList()); + Arrays.fill(geFeature2EmpiricalDist[n], 0); + geFeatureMap.put(geFeature,n); + } + + //compute the empirical label distribution for each GE feature + for(int i = 0; i < labeledDataset.size(); i++){ + Datum datum = labeledDataset.getDatum(i); + int labelID = labeledDataset.labelIndex.indexOf(datum.label()); + for(F feature : datum.asFeatures()){ + if(geFeatureMap.containsKey(feature)){ + int geFnum = geFeatureMap.get(feature); + geFeature2EmpiricalDist[geFnum][labelID]++; + } + } + } + //now normalize and smooth the label distribution for each feature. + for(int n = 0; n < geFeatures.size(); n++){ + ArrayMath.normalize(geFeature2EmpiricalDist[n]); + smoothDistribution(geFeature2EmpiricalDist[n]); + } + + //now build the inverted index from each GE feature to unlabeled datums that contain it. 
+ for (int i = 0; i < unlabeledDataList.size(); i++) { + Datum datum = unlabeledDataList.get(i); + for (F feature : datum.asFeatures()) { + if (geFeatureMap.containsKey(feature)) { + int geFnum = geFeatureMap.get(feature); + geFeature2DatumList.get(geFnum).add(i); + activeUnlabeledExamples.add(i); + } + } + } + System.out.println("Number of active unlabeled examples:"+activeUnlabeledExamples.size()); + } + + private static void smoothDistribution(double [] dist) { + //perform Laplace smoothing + double epsilon = 1e-6; + for(int i = 0; i < dist.length; i++) + dist[i] += epsilon; + ArrayMath.normalize(dist); + } + + private double[] getModelProbs(Datum datum){ + double[] condDist = new double[labeledDataset.numClasses()]; + Counter probCounter = classifier.probabilityOf(datum); + for(L label : probCounter.keySet()){ + int labelID = labeledDataset.labelIndex.indexOf(label); + condDist[labelID] = probCounter.getCount(label); + } + return condDist; + } + + public GeneralizedExpectationObjectiveFunction(GeneralDataset labeledDataset, List> unlabeledDataList,List geFeatures) { + System.out.println("Number of labeled examples:"+labeledDataset.size+"\nNumber of unlabeled examples:"+unlabeledDataList.size()); + System.out.println("Number of GE features:"+geFeatures.size()); + this.numFeatures = labeledDataset.numFeatures(); + this.numClasses = labeledDataset.numClasses(); + this.labeledDataset = labeledDataset; + this.unlabeledDataList = unlabeledDataList; + this.geFeatures = geFeatures; + this.classifier = new LinearClassifier(null,labeledDataset.featureIndex,labeledDataset.labelIndex); + computeEmpiricalStatistics(geFeatures); + //empirical distributions don't change with iterations, so compute them only once. + //model distributions will have to be recomputed every iteration though. 
+ } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LinearClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LinearClassifier.java new file mode 100644 index 0000000..2519dd0 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LinearClassifier.java @@ -0,0 +1,1433 @@ +// Stanford Classifier - a multiclass maxent classifier +// LinearClassifier +// Copyright (c) 2003-2007 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// Support/Questions: java-nlp-user@lists.stanford.edu +// Licensing: java-nlp-support@lists.stanford.edu +// http://www-nlp.stanford.edu/software/classifier.shtml + +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.BasicDatum; +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.util.*; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.stats.Distribution; +import edu.stanford.nlp.stats.Counters; + +import java.io.*; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.*; + + +/** + * Implements a multiclass linear classifier. At classification time this + * can be any generalized linear model classifier (such as a perceptron, + * naive logistic regression, SVM). + * + * @author Dan Klein + * @author Jenny Finkel + * @author Galen Andrew (converted to arrays and indices) + * @author Christopher Manning (most of the printing options) + * @author Eric Yeh (save to text file, new constructor w/thresholds) + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * @author (nmramesh@cs.stanford.edu) {@link #weightsAsMapOfCounters()} + * @author Angel Chang (Add functions to get top features, and number of features with weights above a certain threshold) + * + * @param The type of the labels in the Classifier + * @param The type of the features in the Classifier + */ +public class LinearClassifier implements ProbabilisticClassifier, RVFClassifier { + + /** Classifier weights. First index is the featureIndex value and second + * index is the labelIndex value. 
+ */ + private double[][] weights; + private Index labelIndex; + private Index featureIndex; + public boolean intern = false; // variable should be deleted when breaking serialization anyway.... + private double[] thresholds = null; + + private static final long serialVersionUID = 8499574525453275255L; + + private static final int MAX_FEATURE_ALIGN_WIDTH = 50; + + public static final String TEXT_SERIALIZATION_DELIMITER = "\t"; + + @Override + public Collection labels() { + return labelIndex.objectsList(); + } + + public Collection features() { + return featureIndex.objectsList(); + } + + public Index labelIndex() { + return labelIndex; + } + + public Index featureIndex() { + return featureIndex; + } + + private double weight(int iFeature, int iLabel) { + if (iFeature < 0) { + //System.err.println("feature not seen "); + return 0.0; + } + return weights[iFeature][iLabel]; + } + + private double weight(F feature, int iLabel) { + int f = featureIndex.indexOf(feature); + return weight(f, iLabel); + } + + public double weight(F feature, L label) { + int f = featureIndex.indexOf(feature); + int iLabel = labelIndex.indexOf(label); + return weight(f, iLabel); + } + + /* --- obsolete method from before this class was rewritten using arrays + public Counter scoresOf(Datum example) { + Counter scores = new Counter(); + for (Object l : labels()) { + scores.setCount(l, scoreOf(example, l)); + } + return scores; + } + --- */ + + /** Construct a counter with keys the labels of the classifier and + * values the score (unnormalized log probability) of each class. 
+ */ + public Counter scoresOf(Datum example) { + if(example instanceof RVFDatum)return scoresOfRVFDatum((RVFDatum)example); + Collection feats = example.asFeatures(); + int[] features = new int[feats.size()]; + int i = 0; + for (F f : feats) { + int index = featureIndex.indexOf(f); + if (index >= 0) { + features[i++] = index; + } else { + //System.err.println("FEATURE LESS THAN ZERO: " + f); + } + } + int[] activeFeatures = new int[i]; + System.arraycopy(features, 0, activeFeatures, 0, i); + Counter scores = new ClassicCounter(); + for (L lab : labels()) { + scores.setCount(lab, scoreOf(activeFeatures, lab)); + } + return scores; + } + + /** Given a datum's features, construct a counter with keys + * the labels and values the score (unnormalized log probability) + * for each class. + */ + public Counter scoresOf(int[] features) { + Counter scores = new ClassicCounter(); + for (L label : labels()) + scores.setCount(label, scoreOf(features, label)); + return scores; + } + + /** Returns of the score of the Datum for the specified label. + * Ignores the true label of the Datum. + */ + public double scoreOf(Datum example, L label) { + if(example instanceof RVFDatum)return scoreOfRVFDatum((RVFDatum)example, label); + int iLabel = labelIndex.indexOf(label); + double score = 0.0; + for (F f : example.asFeatures()) { + score += weight(f, iLabel); + } + return score + thresholds[iLabel]; + } + + /** Construct a counter with keys the labels of the classifier and + * values the score (unnormalized log probability) of each class + * for an RVFDatum. 
+ */ + @Deprecated + public Counter scoresOf(RVFDatum example) { + Counter scores = new ClassicCounter(); + for (L l : labels()) { + scores.setCount(l, scoreOf(example, l)); + } + //System.out.println("Scores are: " + scores + " (gold: " + example.label() + ")"); + return scores; + } + + /** Construct a counter with keys the labels of the classifier and + * values the score (unnormalized log probability) of each class + * for an RVFDatum. + */ + private Counter scoresOfRVFDatum(RVFDatum example) { + Counter scores = new ClassicCounter(); + for (L l : labels()) { + scores.setCount(l, scoreOfRVFDatum(example, l)); + } + //System.out.println("Scores are: " + scores + " (gold: " + example.label() + ")"); + return scores; + } + + /** Returns the score of the RVFDatum for the specified label. + * Ignores the true label of the RVFDatum. + */ + @Deprecated + public double scoreOf(RVFDatum example, L label) { + int iLabel = labelIndex.indexOf(label); + double score = 0.0; + Counter features = example.asFeaturesCounter(); + for (F f : features.keySet()) { + score += weight(f, iLabel) * features.getCount(f); + } + return score + thresholds[iLabel]; + } + + /** Returns the score of the RVFDatum for the specified label. + * Ignores the true label of the RVFDatum. + */ + private double scoreOfRVFDatum(RVFDatum example, L label) { + int iLabel = labelIndex.indexOf(label); + double score = 0.0; + Counter features = example.asFeaturesCounter(); + for (F f : features.keySet()) { + score += weight(f, iLabel) * features.getCount(f); + } + return score + thresholds[iLabel]; + } + + + /** Returns of the score of the Datum as internalized features for the + * specified label. Ignores the true label of the Datum. + * Doesn't consider a value for each feature. 
+ */ + private double scoreOf(int[] feats, L label) { + int iLabel = labelIndex.indexOf(label); + double score = 0.0; + for (int feat : feats) { + score += weight(feat, iLabel); + } + return score + thresholds[iLabel]; + } + + + /** + * Returns a counter mapping from each class name to the probability of + * that class for a certain example. + * Looking at the the sum of each count v, should be 1.0. + */ + public Counter probabilityOf(Datum example) { + if(example instanceof RVFDatum)return probabilityOfRVFDatum((RVFDatum)example); + Counter scores = logProbabilityOf(example); + for (L label : scores.keySet()) { + scores.setCount(label, Math.exp(scores.getCount(label))); + } + return scores; + } + + /** + * Returns a counter mapping from each class name to the probability of + * that class for a certain example. + * Looking at the the sum of each count v, should be 1.0. + */ + private Counter probabilityOfRVFDatum(RVFDatum example) { + // NB: this duplicate method is needed so it calls the scoresOf method + // with a RVFDatum signature + Counter scores = logProbabilityOfRVFDatum(example); + for (L label : scores.keySet()) { + scores.setCount(label, Math.exp(scores.getCount(label))); + } + return scores; + } + + /** + * Returns a counter mapping from each class name to the probability of + * that class for a certain example. + * Looking at the the sum of each count v, should be 1.0. + */ + @Deprecated + public Counter probabilityOf(RVFDatum example) { + // NB: this duplicate method is needed so it calls the scoresOf method + // with a RVFDatum signature + Counter scores = logProbabilityOf(example); + for (L label : scores.keySet()) { + scores.setCount(label, Math.exp(scores.getCount(label))); + } + return scores; + } + + /** + * Returns a counter mapping from each class name to the log probability of + * that class for a certain example. + * Looking at the the sum of e^v for each count v, should be 1.0. 
+ */ + public Counter logProbabilityOf(Datum example) { + if(example instanceof RVFDatum)return logProbabilityOfRVFDatum((RVFDatum)example); + Counter scores = scoresOf(example); + Counters.logNormalizeInPlace(scores); + return scores; + } + + /** + * Given a datum's features, returns a counter mapping from each + * class name to the log probability of that class. + * Looking at the the sum of e^v for each count v, should be 1. + */ + public Counter logProbabilityOf(int[] features) { + Counter scores = scoresOf(features); + Counters.logNormalizeInPlace(scores); + return scores; + } + + public Counter probabilityOf(int [] features) { + Counter scores = logProbabilityOf(features); + for (L label : scores.keySet()) { + scores.setCount(label, Math.exp(scores.getCount(label))); + } + return scores; + } + + /** + * Returns a counter for the log probability of each of the classes + * looking at the the sum of e^v for each count v, should be 1 + */ + private Counter logProbabilityOfRVFDatum(RVFDatum example) { + // NB: this duplicate method is needed so it calls the scoresOf method + // with an RVFDatum signature!! Don't remove it! + // JLS: type resolution of method parameters is static + Counter scores = scoresOfRVFDatum(example); + Counters.logNormalizeInPlace(scores); + return scores; + } + + /** + * Returns a counter for the log probability of each of the classes + * looking at the the sum of e^v for each count v, should be 1 + */ + @Deprecated + public Counter logProbabilityOf(RVFDatum example) { + // NB: this duplicate method is needed so it calls the scoresOf method + // with an RVFDatum signature!! Don't remove it! 
+ // JLS: type resolution of method parameters is static + Counter scores = scoresOf(example); + Counters.logNormalizeInPlace(scores); + return scores; + } + + /** + * Returns indices of labels + * @param labels - Set of labels to get indicies + * @return Set of indicies + */ + protected Set getLabelIndices(Set labels) { + Set iLabels = Generics.newHashSet(); + for (L label:labels) { + int iLabel = labelIndex.indexOf(label); + iLabels.add(iLabel); + if (iLabel < 0) throw new IllegalArgumentException("Unknown label " + label); + } + return iLabels; + } + + /** + * Returns number of features with weight above a certain threshold + * (across all labels) + * @param threshold Threshold above which we will count the feature + * @param useMagnitude Whether the notion of "large" should ignore + * the sign of the feature weight. + * @return number of features satisfying the specified conditions + */ + public int getFeatureCount(double threshold, boolean useMagnitude) + { + int n = 0; + for (int feat = 0; feat < weights.length; feat++) { + for (int lab = 0; lab < weights[feat].length; lab++) { + double thisWeight = (useMagnitude)? Math.abs(weights[feat][lab]):weights[feat][lab]; + if (thisWeight > threshold) { + n++; + } + } + } + return n; + } + + /** + * Returns number of features with weight above a certain threshold + * @param labels Set of labels we care about when counting features + * Use null to get counts across all labels + * @param threshold Threshold above which we will count the feature + * @param useMagnitude Whether the notion of "large" should ignore + * the sign of the feature weight. 
+ * @return number of features satisfying the specified conditions + */ + public int getFeatureCount(Set labels, double threshold, boolean useMagnitude) + { + if (labels != null) { + Set iLabels = getLabelIndices(labels); + return getFeatureCountLabelIndices(iLabels, threshold, useMagnitude); + } else { + return getFeatureCount(threshold, useMagnitude); + } + } + + /** + * Returns number of features with weight above a certain threshold + * @param iLabels Set of label indices we care about when counting features + * Use null to get counts across all labels + * @param threshold Threshold above which we will count the feature + * @param useMagnitude Whether the notion of "large" should ignore + * the sign of the feature weight. + * @return number of features satisfying the specified conditions + */ + protected int getFeatureCountLabelIndices(Set iLabels, double threshold, boolean useMagnitude) + { + int n = 0; + for (int feat = 0; feat < weights.length; feat++) { + for (int labIndex:iLabels) { + double thisWeight = (useMagnitude)? Math.abs(weights[feat][labIndex]):weights[feat][labIndex]; + if (thisWeight > threshold) { + n++; + } + } + } + return n; + } + + /** + * Returns list of top features with weight above a certain threshold + * (list is descending and across all labels) + * @param threshold Threshold above which we will count the feature + * @param useMagnitude Whether the notion of "large" should ignore + * the sign of the feature weight. 
+ * @param numFeatures How many top features to return (-1 for unlimited) + * @return List of triples indicating feature, label, weight + */ + public List> getTopFeatures(double threshold, boolean useMagnitude, int numFeatures) + { + return getTopFeatures(null, threshold, useMagnitude, numFeatures, true); + } + + /** + * Returns list of top features with weight above a certain threshold + * @param labels Set of labels we care about when getting features + * Use null to get features across all labels + * @param threshold Threshold above which we will count the feature + * @param useMagnitude Whether the notion of "large" should ignore + * the sign of the feature weight. + * @param numFeatures How many top features to return (-1 for unlimited) + * @param descending Return weights in descending order + * @return List of triples indicating feature, label, weight + */ + public List> getTopFeatures(Set labels, + double threshold, boolean useMagnitude, int numFeatures, + boolean descending) + { + if (labels != null) { + Set iLabels = getLabelIndices(labels); + return getTopFeaturesLabelIndices(iLabels, threshold, useMagnitude, numFeatures, descending); + } else { + return getTopFeaturesLabelIndices(null, threshold, useMagnitude, numFeatures, descending); + } + } + + /** + * Returns list of top features with weight above a certain threshold + * @param iLabels Set of label indices we care about when getting features + * Use null to get features across all labels + * @param threshold Threshold above which we will count the feature + * @param useMagnitude Whether the notion of "large" should ignore + * the sign of the feature weight. 
+ * @param numFeatures How many top features to return (-1 for unlimited) + * @param descending Return weights in descending order + * @return List of triples indicating feature, label, weight + */ + protected List> getTopFeaturesLabelIndices(Set iLabels, + double threshold, boolean useMagnitude, int numFeatures, + boolean descending) + { + edu.stanford.nlp.util.PriorityQueue> biggestKeys = + new FixedPrioritiesPriorityQueue>(); + + // locate biggest keys + for (int feat = 0; feat < weights.length; feat++) { + for (int lab = 0; lab < weights[feat].length; lab++) { + if (iLabels != null && !iLabels.contains(lab)) { + continue; + } + double thisWeight; + if (useMagnitude) { + thisWeight = Math.abs(weights[feat][lab]); + } else { + thisWeight = weights[feat][lab]; + } + + if (thisWeight > threshold) { + // reverse the weight, so get smallest first + thisWeight = -thisWeight; + if (biggestKeys.size() == numFeatures) { + // have enough features, add only if bigger + double lowest = biggestKeys.getPriority(); + if (thisWeight < lowest) { + // remove smallest + biggestKeys.removeFirst(); + biggestKeys.add(new Pair(feat, lab), thisWeight); + } + } else { + // always add it if don't have enough features yet + biggestKeys.add(new Pair(feat, lab), thisWeight); + } + } + } + } + + List> topFeatures = new ArrayList>(biggestKeys.size()); + while (!biggestKeys.isEmpty()) { + Pair p = biggestKeys.removeFirst(); + double weight = weights[p.first()][p.second()]; + F feat = featureIndex.get(p.first()); + L label = labelIndex.get(p.second()); + topFeatures.add(new Triple(feat, label, weight)); + } + if (descending) { + Collections.reverse(topFeatures); + } + return topFeatures; + } + + /** + * Returns string representation of a list of top features + * @param topFeatures List of triples indicating feature, label, weight + * @return String representation of the list of features + */ + public String topFeaturesToString(List> topFeatures) + { + // find longest key length (for pretty 
printing) with a limit + int maxLeng = 0; + for (Triple t : topFeatures) { + String key = "(" + t.first + "," + t.second + ")"; + int leng = key.length(); + if (leng > maxLeng) { + maxLeng = leng; + } + } + maxLeng = Math.min(64, maxLeng); + + // set up pretty printing of weights + NumberFormat nf = NumberFormat.getNumberInstance(); + nf.setMinimumFractionDigits(4); + nf.setMaximumFractionDigits(4); + if (nf instanceof DecimalFormat) { + ((DecimalFormat) nf).setPositivePrefix(" "); + } + + //print high weight features to a String + StringBuilder sb = new StringBuilder(); + for (Triple t : topFeatures) { + String key = "(" + t.first + "," + t.second + ")"; + sb.append(StringUtils.pad(key, maxLeng)); + sb.append(" "); + double cnt = t.third(); + if (Double.isInfinite(cnt)) { + sb.append(cnt); + } else { + sb.append(nf.format(cnt)); + } + sb.append("\n"); + } + return sb.toString(); + } + + /** Return a String that prints features with large weights. + * + * @param useMagnitude Whether the notion of "large" should ignore + * the sign of the feature weight. + * @param numFeatures How many top features to print + * @param printDescending Print weights in descending order + * @return The String representation of features with large weights + */ + public String toBiggestWeightFeaturesString(boolean useMagnitude, + int numFeatures, + boolean printDescending) { + // this used to try to use a treeset, but that was WRONG.... 
+ edu.stanford.nlp.util.PriorityQueue> biggestKeys = + new FixedPrioritiesPriorityQueue>(); + + // locate biggest keys + for (int feat = 0; feat < weights.length; feat++) { + for (int lab = 0; lab < weights[feat].length; lab++) { + double thisWeight; + // reverse the weight, so get smallest first + if (useMagnitude) { + thisWeight = -Math.abs(weights[feat][lab]); + } else { + thisWeight = -weights[feat][lab]; + } + if (biggestKeys.size() == numFeatures) { + // have enough features, add only if bigger + double lowest = biggestKeys.getPriority(); + if (thisWeight < lowest) { + // remove smallest + biggestKeys.removeFirst(); + biggestKeys.add(new Pair(feat, lab), thisWeight); + } + } else { + // always add it if don't have enough features yet + biggestKeys.add(new Pair(feat, lab), thisWeight); + } + } + } + + // Put in List either reversed or not + // (Note: can't repeatedly iterate over PriorityQueue.) + int actualSize = biggestKeys.size(); + Pair[] bigArray = ErasureUtils.>mkTArray(Pair.class,actualSize); + // System.err.println("biggestKeys is " + biggestKeys); + if (printDescending) { + for (int j = actualSize - 1; j >= 0; j--) { + bigArray[j] = biggestKeys.removeFirst(); + } + } else { + for (int j = 0; j < actualSize; j--) { + bigArray[j] = biggestKeys.removeFirst(); + } + } + List> bigColl = Arrays.asList(bigArray); + // System.err.println("bigColl is " + bigColl); + + // find longest key length (for pretty printing) with a limit + int maxLeng = 0; + for (Pair p : bigColl) { + String key = "(" + featureIndex.get(p.first) + "," + labelIndex.get(p.second) + ")"; + int leng = key.length(); + if (leng > maxLeng) { + maxLeng = leng; + } + } + maxLeng = Math.min(64, maxLeng); + + // set up pretty printing of weights + NumberFormat nf = NumberFormat.getNumberInstance(); + nf.setMinimumFractionDigits(4); + nf.setMaximumFractionDigits(4); + if (nf instanceof DecimalFormat) { + ((DecimalFormat) nf).setPositivePrefix(" "); + } + + //print high weight features to a String 
+ StringBuilder sb = new StringBuilder("LinearClassifier [printing top " + numFeatures + " features]\n"); + for (Pair p : bigColl) { + String key = "(" + featureIndex.get(p.first) + "," + labelIndex.get(p.second) + ")"; + sb.append(StringUtils.pad(key, maxLeng)); + sb.append(" "); + double cnt = weights[p.first][p.second]; + if (Double.isInfinite(cnt)) { + sb.append(cnt); + } else { + sb.append(nf.format(cnt)); + } + sb.append("\n"); + } + return sb.toString(); + } + + /** + * Similar to histogram but exact values of the weights + * to see whether there are many equal weights. + * + * @return A human readable string about the classifier distribution. + */ + public String toDistributionString(int treshold) { + Counter weightCounts = new ClassicCounter(); + StringBuilder s = new StringBuilder(); + s.append("Total number of weights: ").append(totalSize()); + for (int f = 0; f < weights.length; f++) { + for (int l = 0; l < weights[f].length; l++) { + weightCounts.incrementCount(weights[f][l]); + } + } + + s.append("Counts of weights\n"); + Set keys = Counters.keysAbove(weightCounts, treshold); + s.append(keys.size()).append(" keys occur more than ").append(treshold).append(" times "); + return s.toString(); + } + + public int totalSize() { + return labelIndex.size() * featureIndex.size(); + } + + public String toHistogramString() { + // big classifiers + double[][] hist = new double[3][202]; + Object[][] histEg = new Object[3][202]; + int num = 0; + int pos = 0; + int neg = 0; + int zero = 0; + double total = 0.0; + double x2total = 0.0; + double max = 0.0, min = 0.0; + for (int f = 0; f < weights.length; f++) { + for (int l = 0; l < weights[f].length; l++) { + Pair feat = new Pair(featureIndex.get(f), labelIndex.get(l)); + num++; + double wt = weights[f][l]; + total += wt; + x2total += wt * wt; + if (wt > max) { + max = wt; + } + if (wt < min) { + min = wt; + } + if (wt < 0.0) { + neg++; + } else if (wt > 0.0) { + pos++; + } else { + zero++; + } + int index; + index = 
bucketizeValue(wt); + hist[0][index]++; + if (histEg[0][index] == null) { + histEg[0][index] = feat; + } + if (wt < 0.1 && wt >= -0.1) { + index = bucketizeValue(wt * 100.0); + hist[1][index]++; + if (histEg[1][index] == null) { + histEg[1][index] = feat; + } + if (wt < 0.001 && wt >= -0.001) { + index = bucketizeValue(wt * 10000.0); + hist[2][index]++; + if (histEg[2][index] == null) { + histEg[2][index] = feat; + } + } + } + } + } + double ave = total / num; + double stddev = (x2total / num) - ave * ave; + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + + pw.println("Linear classifier with " + num + " f(x,y) features"); + pw.println("Average weight: " + ave + "; std dev: " + stddev); + pw.println("Max weight: " + max + " min weight: " + min); + pw.println("Weights: " + neg + " negative; " + pos + " positive; " + zero + " zero."); + + printHistCounts(0, "Counts of lambda parameters between [-10, 10)", pw, hist, histEg); + printHistCounts(1, "Closeup view of [-0.1, 0.1) depicted * 10^2", pw, hist, histEg); + printHistCounts(2, "Closeup view of [-0.001, 0.001) depicted * 10^4", pw, hist, histEg); + pw.close(); + return sw.toString(); + } + + /** Print out a partial representation of a linear classifier. + * This just calls toString("WeightHistogram", 0) + */ + @Override + public String toString() { + return toString("WeightHistogram", 0); + } + + + /** + * Print out a partial representation of a linear classifier in one of + * several ways. 
+ * + * @param style Options are: + * HighWeight: print out the param parameters with largest weights; + * HighMagnitude: print out the param parameters for which the absolute + * value of their weight is largest; + * AllWeights: print out the weights of all features; + * WeightHistogram: print out a particular hard-coded textual histogram + * representation of a classifier; + * WeightDistribution; + * + * @param param Determines the number of things printed in certain styles + * @throws IllegalArgumentException if the style name is unrecognized + */ + public String toString(String style, int param) { + if (style == null || "".equals(style)) { + return "LinearClassifier with " + featureIndex.size() + " features, " + + labelIndex.size() + " classes, and " + + labelIndex.size() * featureIndex.size() + " parameters.\n"; + } else if (style.equalsIgnoreCase("HighWeight")) { + return toBiggestWeightFeaturesString(false, param, true); + } else if (style.equalsIgnoreCase("HighMagnitude")) { + return toBiggestWeightFeaturesString(true, param, true); + } else if (style.equalsIgnoreCase("AllWeights")) { + return toAllWeightsString(); + } else if (style.equalsIgnoreCase("WeightHistogram")) { + return toHistogramString(); + } else if (style.equalsIgnoreCase("WeightDistribution")) { + return toDistributionString(param); + } else { + throw new IllegalArgumentException("Unknown style: " + style); + } + } + + + /** + * Convert parameter value into number between 0 and 201 + */ + private static int bucketizeValue(double wt) { + int index; + if (wt >= 0.0) { + index = ((int) (wt * 10.0)) + 100; + } else { + index = ((int) (Math.floor(wt * 10.0))) + 100; + } + if (index < 0) { + index = 201; + } else if (index > 200) { + index = 200; + } + return index; + } + + /** + * Print histogram counts from hist and examples over a certain range + */ + private static void printHistCounts(int ind, String title, PrintWriter pw, double[][] hist, Object[][] histEg) { + pw.println(title); + for (int 
i = 0; i < 200; i++) { + int intpart, fracpart; + if (i < 100) { + intpart = 10 - ((i + 9) / 10); + fracpart = (10 - (i % 10)) % 10; + } else { + intpart = (i / 10) - 10; + fracpart = i % 10; + } + pw.print("[" + ((i < 100) ? "-" : "") + intpart + "." + fracpart + ", " + ((i < 100) ? "-" : "") + intpart + "." + fracpart + "+0.1): " + hist[ind][i]); + if (histEg[ind][i] != null) { + pw.print(" [" + histEg[ind][i] + ((hist[ind][i] > 1) ? ", ..." : "") + "]"); + } + pw.println(); + } + } + + + //TODO: Sort of assumes that Labels are Strings... + public String toAllWeightsString() { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + pw.println("Linear classifier with the following weights"); + Datum allFeatures = new BasicDatum(features(), (L)null); + justificationOf(allFeatures, pw); + return sw.toString(); + } + + + /** + * Print all features in the classifier and the weight that they assign + * to each class. + */ + public void dump() { + Datum allFeatures = new BasicDatum(features(), (L)null); + justificationOf(allFeatures); + } + + public void dump(PrintWriter pw) { + Datum allFeatures = new BasicDatum(features(), (L)null); + justificationOf(allFeatures, pw); + } + + + + @Deprecated + public void justificationOf(RVFDatum example) { + PrintWriter pw = new PrintWriter(System.err, true); + justificationOf(example, pw); + } + + /** + * Print all features active for a particular datum and the weight that + * the classifier assigns to each class for those features. 
+ */ + private void justificationOfRVFDatum(RVFDatum example, PrintWriter pw) { + int featureLength = 0; + int labelLength = 6; + NumberFormat nf = NumberFormat.getNumberInstance(); + nf.setMinimumFractionDigits(2); + nf.setMaximumFractionDigits(2); + if (nf instanceof DecimalFormat) { + ((DecimalFormat) nf).setPositivePrefix(" "); + } + Counter features = example.asFeaturesCounter(); + for (F f : features.keySet()) { + featureLength = Math.max(featureLength, f.toString().length() + 2 + + nf.format(features.getCount(f)).length()); + } + // make as wide as total printout + featureLength = Math.max(featureLength, "Total:".length()); + // don't make it ridiculously wide + featureLength = Math.min(featureLength, MAX_FEATURE_ALIGN_WIDTH); + + for (Object l : labels()) { + labelLength = Math.max(labelLength, l.toString().length()); + } + + StringBuilder header = new StringBuilder(""); + for (int s = 0; s < featureLength; s++) { + header.append(' '); + } + for (L l : labels()) { + header.append(' '); + header.append(StringUtils.pad(l, labelLength)); + } + pw.println(header); + for (F f : features.keySet()) { + String fStr = f.toString(); + StringBuilder line = new StringBuilder(fStr); + line.append("[").append(nf.format(features.getCount(f))).append("]"); + fStr = line.toString(); + for (int s = fStr.length(); s < featureLength; s++) { + line.append(' '); + } + for (L l : labels()) { + String lStr = nf.format(weight(f, l)); + line.append(' '); + line.append(lStr); + for (int s = lStr.length(); s < labelLength; s++) { + line.append(' '); + } + } + pw.println(line); + } + Counter scores = scoresOfRVFDatum(example); + StringBuilder footer = new StringBuilder("Total:"); + for (int s = footer.length(); s < featureLength; s++) { + footer.append(' '); + } + for (L l : labels()) { + footer.append(' '); + String str = nf.format(scores.getCount(l)); + footer.append(str); + for (int s = str.length(); s < labelLength; s++) { + footer.append(' '); + } + } + pw.println(footer); + 
Distribution distr = Distribution.distributionFromLogisticCounter(scores); + footer = new StringBuilder("Prob:"); + for (int s = footer.length(); s < featureLength; s++) { + footer.append(' '); + } + for (L l : labels()) { + footer.append(' '); + String str = nf.format(distr.getCount(l)); + footer.append(str); + for (int s = str.length(); s < labelLength; s++) { + footer.append(' '); + } + } + pw.println(footer); + } + + + /** + * Print all features active for a particular datum and the weight that + * the classifier assigns to each class for those features. + */ + @Deprecated + public void justificationOf(RVFDatum example, PrintWriter pw) { + int featureLength = 0; + int labelLength = 6; + NumberFormat nf = NumberFormat.getNumberInstance(); + nf.setMinimumFractionDigits(2); + nf.setMaximumFractionDigits(2); + if (nf instanceof DecimalFormat) { + ((DecimalFormat) nf).setPositivePrefix(" "); + } + Counter features = example.asFeaturesCounter(); + for (F f : features.keySet()) { + featureLength = Math.max(featureLength, f.toString().length() + 2 + + nf.format(features.getCount(f)).length()); + } + // make as wide as total printout + featureLength = Math.max(featureLength, "Total:".length()); + // don't make it ridiculously wide + featureLength = Math.min(featureLength, MAX_FEATURE_ALIGN_WIDTH); + + for (Object l : labels()) { + labelLength = Math.max(labelLength, l.toString().length()); + } + + StringBuilder header = new StringBuilder(""); + for (int s = 0; s < featureLength; s++) { + header.append(' '); + } + for (L l : labels()) { + header.append(' '); + header.append(StringUtils.pad(l, labelLength)); + } + pw.println(header); + for (F f : features.keySet()) { + String fStr = f.toString(); + StringBuilder line = new StringBuilder(fStr); + line.append("[").append(nf.format(features.getCount(f))).append("]"); + fStr = line.toString(); + for (int s = fStr.length(); s < featureLength; s++) { + line.append(' '); + } + for (L l : labels()) { + String lStr = 
nf.format(weight(f, l)); + line.append(' '); + line.append(lStr); + for (int s = lStr.length(); s < labelLength; s++) { + line.append(' '); + } + } + pw.println(line); + } + Counter scores = scoresOf(example); + StringBuilder footer = new StringBuilder("Total:"); + for (int s = footer.length(); s < featureLength; s++) { + footer.append(' '); + } + for (L l : labels()) { + footer.append(' '); + String str = nf.format(scores.getCount(l)); + footer.append(str); + for (int s = str.length(); s < labelLength; s++) { + footer.append(' '); + } + } + pw.println(footer); + Distribution distr = Distribution.distributionFromLogisticCounter(scores); + footer = new StringBuilder("Prob:"); + for (int s = footer.length(); s < featureLength; s++) { + footer.append(' '); + } + for (L l : labels()) { + footer.append(' '); + String str = nf.format(distr.getCount(l)); + footer.append(str); + for (int s = str.length(); s < labelLength; s++) { + footer.append(' '); + } + } + pw.println(footer); + } + + + public void justificationOf(Datum example) { + PrintWriter pw = new PrintWriter(System.err, true); + justificationOf(example, pw); + } + + public void justificationOf(Datum example, PrintWriter pw, Function printer) { + justificationOf(example, pw, printer, false); + } + + /** Print all features active for a particular datum and the weight that + * the classifier assigns to each class for those features. 
+ * + * @param example The datum for which features are to be printed + * @param pw Where to print it to + * @param printer If this is non-null, then it is applied to each + * feature to convert it to a more readable form + * @param sortedByFeature Whether to sort by feature names + */ + public void justificationOf(Datum example, PrintWriter pw, + Function printer, boolean sortedByFeature) { + + if(example instanceof RVFDatum) { + justificationOfRVFDatum((RVFDatum)example,pw); + return; + } + NumberFormat nf = NumberFormat.getNumberInstance(); + nf.setMinimumFractionDigits(2); + nf.setMaximumFractionDigits(2); + if (nf instanceof DecimalFormat) { + ((DecimalFormat) nf).setPositivePrefix(" "); + } + + // determine width for features, making it at least total's width + int featureLength = 0; + //TODO: not really sure what this Printer is supposed to spit out... + for (F f : example.asFeatures()) { + int length = f.toString().length(); + if (printer != null) { + length = printer.apply(f).toString().length(); + } + featureLength = Math.max(featureLength, length); + } + // make as wide as total printout + featureLength = Math.max(featureLength, "Total:".length()); + // don't make it ridiculously wide + featureLength = Math.min(featureLength, MAX_FEATURE_ALIGN_WIDTH); + + // determine width for labels + int labelLength = 6; + for (L l : labels()) { + labelLength = Math.max(labelLength, l.toString().length()); + } + + // print header row of output listing classes + StringBuilder header = new StringBuilder(""); + for (int s = 0; s < featureLength; s++) { + header.append(' '); + } + for (L l : labels()) { + header.append(' '); + header.append(StringUtils.pad(l, labelLength)); + } + pw.println(header); + + // print active features and weights per class + Collection featColl = example.asFeatures(); + if (sortedByFeature){ + featColl = ErasureUtils.sortedIfPossible(featColl); + } + for (F f : featColl) { + String fStr; + if (printer != null) { + fStr = 
printer.apply(f).toString(); + } else { + fStr = f.toString(); + } + StringBuilder line = new StringBuilder(fStr); + for (int s = fStr.length(); s < featureLength; s++) { + line.append(' '); + } + for (L l : labels()) { + String lStr = nf.format(weight(f, l)); + line.append(' '); + line.append(lStr); + for (int s = lStr.length(); s < labelLength; s++) { + line.append(' '); + } + } + pw.println(line); + } + + // Print totals, probs, etc. + Counter scores = scoresOf(example); + StringBuilder footer = new StringBuilder("Total:"); + for (int s = footer.length(); s < featureLength; s++) { + footer.append(' '); + } + for (L l : labels()) { + footer.append(' '); + String str = nf.format(scores.getCount(l)); + footer.append(str); + for (int s = str.length(); s < labelLength; s++) { + footer.append(' '); + } + } + pw.println(footer); + Distribution distr = Distribution.distributionFromLogisticCounter(scores); + footer = new StringBuilder("Prob:"); + for (int s = footer.length(); s < featureLength; s++) { + footer.append(' '); + } + for (L l : labels()) { + footer.append(' '); + String str = nf.format(distr.getCount(l)); + footer.append(str); + for (int s = str.length(); s < labelLength; s++) { + footer.append(' '); + } + } + pw.println(footer); + } + +/** + * This method returns a map from each label to a counter of feature weights for that label. + * Useful for feature analysis. + * @return a map of counters + */ + + public Map> weightsAsMapOfCounters() { + Map> mapOfCounters = Generics.newHashMap(); + for(L label : labelIndex){ + int labelID = labelIndex.indexOf(label); + Counter c = new ClassicCounter(); + mapOfCounters.put(label, c); + for (F f : featureIndex) { + c.incrementCount(f, weights[featureIndex.indexOf(f)][labelID]); + } + } + return mapOfCounters; + } + + /** + * Print all features active for a particular datum and the weight that + * the classifier assigns to each class for those features. 
+ */ + public void justificationOf(Datum example, PrintWriter pw) { + justificationOf(example, pw, null); + } + + + /** + * Print all features in the classifier and the weight that they assign + * to each class. The feature names are printed in sorted order. + */ + public void dumpSorted() { + Datum allFeatures = new BasicDatum(features(), (L)null); + justificationOf(allFeatures, new PrintWriter(System.err, true), true); + } + + /** + * Print all features active for a particular datum and the weight that + * the classifier assigns to each class for those features. Sorts by feature + * name if 'sorted' is true. + */ + public void justificationOf(Datum example, PrintWriter pw, boolean sorted) { + if(example instanceof RVFDatum) + justificationOf(example, pw, null, sorted); + } + + + public Counter scoresOf(Datum example, Collection possibleLabels) { + Counter scores = new ClassicCounter(); + for (L l : possibleLabels) { + if (labelIndex.indexOf(l) == -1) { + continue; + } + double score = scoreOf(example, l); + scores.setCount(l, score); + } + return scores; + } + + + public L experimentalClassOf(Datum example) { + if(example instanceof RVFDatum) { + throw new UnsupportedOperationException(); + } + + int labelCount = weights[0].length; + //System.out.printf("labelCount: %d\n", labelCount); + Collection features = example.asFeatures(); + + int[] featureInts = new int[features.size()]; + int fI = 0; + for (F feature : features) { + featureInts[fI++] = featureIndex.indexOf(feature); + } + //System.out.println("Features: "+features); + double bestScore = Double.NEGATIVE_INFINITY; + int bestI = 0; + for (int i = 0; i < labelCount; i++) { + double score = 0; + for (int j = 0; j < featureInts.length; j++) { + if (featureInts[j] < 0) continue; + score += weights[featureInts[j]][i]; + } + if (score > bestScore) { + bestI = i; + bestScore = score; + } + //System.out.printf("Score: %s(%d): %e\n", labelIndex.get(i), i, score); + } + //System.out.printf("label(%d): %s\n", bestI, 
labelIndex.get(bestI));; + return labelIndex.get(bestI); + } + + public L classOf(Datum example) { + if(example instanceof RVFDatum)return classOfRVFDatum((RVFDatum)example); + Counter scores = scoresOf(example); + return Counters.argmax(scores); + } + + + private L classOfRVFDatum(RVFDatum example) { + Counter scores = scoresOfRVFDatum(example); + return Counters.argmax(scores); + } + + @Deprecated + public L classOf(RVFDatum example) { + Counter scores = scoresOf(example); + return Counters.argmax(scores); + } + + public LinearClassifier(double[][] weights, Index featureIndex, Index labelIndex) { + this.featureIndex = featureIndex; + this.labelIndex = labelIndex; + this.weights = weights; + thresholds = new double[labelIndex.size()]; + Arrays.fill(thresholds, 0.0); + } + + public LinearClassifier(double[][] weights, Index featureIndex, Index labelIndex, + double[] thresholds) throws Exception { + this.featureIndex = featureIndex; + this.labelIndex = labelIndex; + this.weights = weights; + if (thresholds.length != labelIndex.size()) + throw new Exception("Number of thresholds and number of labels do not match."); + thresholds = new double[thresholds.length]; + int curr = 0; + for (double tval : thresholds) { + thresholds[curr++] = tval; + } + Arrays.fill(thresholds, 0.0); + } + + public LinearClassifier(double[] weights, Index> weightIndex) { + Counter> weightCounter = new ClassicCounter>(); + for (int i = 0; i < weightIndex.size(); i++) { + if (weights[i] == 0) { + continue; // no need to save 0 weights + } + weightCounter.setCount(weightIndex.get(i), weights[i]); + } + init(weightCounter, new ClassicCounter()); + } + + public LinearClassifier(Counter> weightCounter) { + this(weightCounter, new ClassicCounter()); + } + + public LinearClassifier(Counter> weightCounter, Counter thresholdsC) { + init(weightCounter,thresholdsC); + } + + private void init(Counter> weightCounter, Counter thresholdsC) { + Collection> keys = weightCounter.keySet(); + featureIndex = new 
HashIndex(); + labelIndex = new HashIndex(); + for (Pair p : keys) { + featureIndex.add(p.first()); + labelIndex.add(p.second()); + } + thresholds = new double[labelIndex.size()]; + for (L label : labelIndex) { + thresholds[labelIndex.indexOf(label)] = thresholdsC.getCount(label); + } + weights = new double[featureIndex.size()][labelIndex.size()]; + Pair tempPair = new Pair(); + for (int f = 0; f < weights.length; f++) { + for (int l = 0; l < weights[f].length; l++) { + tempPair.first = featureIndex.get(f); + tempPair.second = labelIndex.get(l); + weights[f][l] = weightCounter.getCount(tempPair); + } + } + } + + + public void adaptWeights(Dataset adapt,LinearClassifierFactory lcf) { + System.err.println("before adapting, weights size="+weights.length); + weights = lcf.adaptWeights(weights,adapt); + System.err.println("after adapting, weights size="+weights.length); + } + + public double[][] weights() { + return weights; + } + + public void setWeights(double[][] newWeights) { + weights = newWeights; + } + + /** + * Loads a classifier from a file. + * Simple convenience wrapper for IOUtils.readFromString. + */ + public static LinearClassifier readClassifier(String loadPath) { + System.err.print("Deserializing classifier from " + loadPath + "..."); + + try { + ObjectInputStream ois = IOUtils.readStreamFromString(loadPath); + LinearClassifier classifier = ErasureUtils.>uncheckedCast(ois.readObject()); + ois.close(); + return classifier; + } catch (Exception e) { + e.printStackTrace(); + throw new RuntimeException("Deserialization failed: "+e.getMessage()); + } + } + + /** + * Convenience wrapper for IOUtils.writeObjectToFile + */ + public static void writeClassifier(LinearClassifier classifier, String writePath) { + try { + IOUtils.writeObjectToFile(classifier, writePath); + } catch (Exception e) { + throw new RuntimeException("Serialization failed: "+e.getMessage(), e); + } + } + + /** + * Saves this out to a standard text file, instead of as a serialized Java object. 
+ * NOTE: this currently assumes feature and weights are represented as Strings. + * @param file String filepath to write out to. + */ + public void saveToFilename(String file) { + try { + File tgtFile = new File(file); + BufferedWriter out = new BufferedWriter(new FileWriter(tgtFile)); + // output index first, blank delimiter, outline feature index, then weights + labelIndex.saveToWriter(out); + featureIndex.saveToWriter(out); + int numLabels = labelIndex.size(); + int numFeatures = featureIndex.size(); + for (int featIndex=0; featIndex extends AbstractLinearClassifierFactory { + + private static final long serialVersionUID = 7893768984379107397L; + private double TOL; + //public double sigma; + private int mem = 15; + private boolean verbose = false; + //private int prior; + //private double epsilon = 0.0; + private LogPrior logPrior; + //private Minimizer minimizer; + //private boolean useSum = false; + private boolean tuneSigmaHeldOut = false; + private boolean tuneSigmaCV = false; + //private boolean resetWeight = true; + private int folds; + private double min = 0.1; + private double max = 10.0; + private boolean retrainFromScratchAfterSigmaTuning = false; + + private Factory> minimizerCreator = null; + private int evalIters = -1; + private Evaluator[] evaluators = null; + + private Minimizer getMinimizer() { + // Create a new minimizer + Minimizer minimizer = minimizerCreator.create(); + if (minimizer instanceof HasEvaluators) { + ((HasEvaluators) minimizer).setEvaluators(evalIters, evaluators); + } + return minimizer; + } + + + /** + * Adapt classifier (adjust the mean of Gaussian prior) + * under construction -pichuan + * @param origWeights the original weights trained from the training data + * @param adaptDataset the Dataset used to adapt the trained weights + * @return adapted weights + */ + public double[][] adaptWeights(double[][] origWeights, GeneralDataset adaptDataset) { + Minimizer minimizer = getMinimizer(); + System.err.println("adaptWeights in 
LinearClassifierFactory. increase weight dim only"); + double[][] newWeights = new double[adaptDataset.featureIndex.size()][adaptDataset.labelIndex.size()]; + + System.arraycopy(origWeights,0,newWeights,0,origWeights.length); + + AdaptedGaussianPriorObjectiveFunction objective = new AdaptedGaussianPriorObjectiveFunction(adaptDataset, logPrior,newWeights); + + double[] initial = objective.initial(); + + double[] weights = minimizer.minimize(objective, TOL, initial); + return objective.to2D(weights); + + //Question: maybe the adaptWeights can be done just in LinearClassifier ?? (pichuan) + } + + @Override + public double[][] trainWeights(GeneralDataset dataset) { + return trainWeights(dataset, null); + } + + public double[][] trainWeights(GeneralDataset dataset, double[] initial) { + return trainWeights(dataset, initial, false); + } + + public double[][] trainWeights(GeneralDataset dataset, double[] initial, boolean bypassTuneSigma) { + return trainWeights(dataset, initial, bypassTuneSigma, null); + } + + public double[][] trainWeights(GeneralDataset dataset, double[] initial, boolean bypassTuneSigma, Minimizer minimizer) { + if (minimizer == null) minimizer = minimizerCreator.create(); + if(dataset instanceof RVFDataset) + ((RVFDataset)dataset).ensureRealValues(); + double[] interimWeights = null; + if(! bypassTuneSigma) { + if (tuneSigmaHeldOut) { + interimWeights = heldOutSetSigma(dataset); // the optimum interim weights from held-out training data have already been found. + } else if (tuneSigmaCV) { + crossValidateSetSigma(dataset,folds); // TODO: assign optimum interim weights as part of this process. + } + } + LogConditionalObjectiveFunction objective = new LogConditionalObjectiveFunction(dataset, logPrior); + if(initial == null && interimWeights != null && ! 
retrainFromScratchAfterSigmaTuning) { + //System.err.println("## taking advantage of interim weights as starting point."); + initial = interimWeights; + } + if (initial == null) { + initial = objective.initial(); + } + + double[] weights = minimizer.minimize(objective, TOL, initial); + return objective.to2D(weights); + } + + /** + * IMPORTANT: dataset and biasedDataset must have same featureIndex, labelIndex + */ + public Classifier trainClassifierSemiSup(GeneralDataset data, GeneralDataset biasedData, double[][] confusionMatrix, double[] initial) { + double[][] weights = trainWeightsSemiSup(data, biasedData, confusionMatrix, initial); + LinearClassifier classifier = new LinearClassifier(weights, data.featureIndex(), data.labelIndex()); + return classifier; + } + + public double[][] trainWeightsSemiSup(GeneralDataset data, GeneralDataset biasedData, double[][] confusionMatrix, double[] initial) { + Minimizer minimizer = minimizerCreator.create(); + LogConditionalObjectiveFunction objective = new LogConditionalObjectiveFunction(data, new LogPrior(LogPrior.LogPriorType.NULL)); + BiasedLogConditionalObjectiveFunction biasedObjective = new BiasedLogConditionalObjectiveFunction(biasedData, confusionMatrix, new LogPrior(LogPrior.LogPriorType.NULL)); + SemiSupervisedLogConditionalObjectiveFunction semiSupObjective = new SemiSupervisedLogConditionalObjectiveFunction(objective, biasedObjective, logPrior); + if (initial == null) { + initial = objective.initial(); + } + double[] weights = minimizer.minimize(semiSupObjective, TOL, initial); + return objective.to2D(weights); + } + + /** + * Trains the linear classifier using Generalized Expectation criteria as described in + * Generalized Expectation Criteria for Semi Supervised Learning of Conditional Random Fields, Mann and McCallum, ACL 2008. + * The original algorithm is proposed for CRFs but has been adopted to LinearClassifier (which is a simpler special case of a CRF). 
+ * IMPORTANT: the labeled features that are passed as an argument are assumed to be binary valued, although + * other features are allowed to be real valued. + */ + public LinearClassifier trainSemiSupGE(GeneralDataset labeledDataset, List> unlabeledDataList, List GEFeatures, double convexComboCoeff) { + Minimizer minimizer = minimizerCreator.create(); + LogConditionalObjectiveFunction objective = new LogConditionalObjectiveFunction(labeledDataset, new LogPrior(LogPrior.LogPriorType.NULL)); + GeneralizedExpectationObjectiveFunction geObjective = new GeneralizedExpectationObjectiveFunction(labeledDataset, unlabeledDataList, GEFeatures); + SemiSupervisedLogConditionalObjectiveFunction semiSupObjective = new SemiSupervisedLogConditionalObjectiveFunction(objective, geObjective, null,convexComboCoeff); + double[] initial = objective.initial(); + double[] weights = minimizer.minimize(semiSupObjective, TOL, initial); + return new LinearClassifier(objective.to2D(weights), labeledDataset.featureIndex(), labeledDataset.labelIndex()); + } + + + /** + * Trains the linear classifier using Generalized Expectation criteria as described in + * Generalized Expectation Criteria for Semi Supervised Learning of Conditional Random Fields, Mann and McCallum, ACL 2008. + * The original algorithm is proposed for CRFs but has been adopted to LinearClassifier (which is a simpler, special case of a CRF). + * Automatically discovers high precision, high frequency labeled features to be used as GE constraints. + * IMPORTANT: the current feature selector assumes the features are binary. The GE constraints assume the constraining features are binary anyway, although + * it doesn't make such assumptions about other features. 
+ */ + public LinearClassifier trainSemiSupGE(GeneralDataset labeledDataset, List> unlabeledDataList) { + List GEFeatures = getHighPrecisionFeatures(labeledDataset,0.9,10); + return trainSemiSupGE(labeledDataset, unlabeledDataList, GEFeatures,0.5); + } + + public LinearClassifier trainSemiSupGE(GeneralDataset labeledDataset, List> unlabeledDataList, double convexComboCoeff) { + List GEFeatures = getHighPrecisionFeatures(labeledDataset,0.9,10); + return trainSemiSupGE(labeledDataset, unlabeledDataList, GEFeatures,convexComboCoeff); + } + + + /** + * Returns a list of featured thresholded by minPrecision and sorted by their frequency of occurrence. + * precision in this case, is defined as the frequency of majority label over total frequency for that feature. + * @return list of high precision features. + */ + private List getHighPrecisionFeatures(GeneralDataset dataset, double minPrecision, int maxNumFeatures){ + int[][] feature2label = new int[dataset.numFeatures()][dataset.numClasses()]; + for(int f = 0; f < dataset.numFeatures(); f++) + Arrays.fill(feature2label[f],0); + + int[][] data = dataset.data; + int[] labels = dataset.labels; + for(int d = 0; d < data.length; d++){ + int label = labels[d]; + //System.out.println("datum id:"+d+" label id: "+label); + if(data[d] != null){ + //System.out.println(" number of features:"+data[d].length); + for(int n = 0; n < data[d].length; n++){ + feature2label[data[d][n]][label]++; + } + } + } + Counter feature2freq = new ClassicCounter(); + for(int f = 0; f < dataset.numFeatures(); f++){ + int maxF = ArrayMath.max(feature2label[f]); + int total = ArrayMath.sum(feature2label[f]); + double precision = ((double)maxF)/total; + F feature = dataset.featureIndex.get(f); + if(precision >= minPrecision){ + feature2freq.incrementCount(feature, total); + } + } + if(feature2freq.size() > maxNumFeatures){ + Counters.retainTop(feature2freq, maxNumFeatures); + } + //for(F feature : feature2freq.keySet()) + //System.out.println(feature+" 
"+feature2freq.getCount(feature)); + //System.exit(0); + return Counters.toSortedList(feature2freq); + } + + /** + * Train a classifier with a sigma tuned on a validation set. + * + * @return The constructed classifier + */ + public LinearClassifier trainClassifierV(GeneralDataset train, GeneralDataset validation, double min, double max, boolean accuracy) { + labelIndex = train.labelIndex(); + featureIndex = train.featureIndex(); + this.min = min; + this.max = max; + heldOutSetSigma(train, validation); + double[][] weights = trainWeights(train); + return new LinearClassifier(weights, train.featureIndex(), train.labelIndex()); + } + + /** + * Train a classifier with a sigma tuned on a validation set. + * In this case we are fitting on the last 30% of the training data. + * + * @param train The data to train (and validate) on. + * @return The constructed classifier + */ + public LinearClassifier trainClassifierV(GeneralDataset train, double min, double max, boolean accuracy) { + labelIndex = train.labelIndex(); + featureIndex = train.featureIndex(); + tuneSigmaHeldOut = true; + this.min = min; + this.max = max; + heldOutSetSigma(train); + double[][] weights = trainWeights(train); + return new LinearClassifier(weights, train.featureIndex(), train.labelIndex()); + } + + /** NOTE: Constructors that takes in a Minimizer creates a LinearClassifierFactory that will reuse the minimizer + * and will not be threadsafe (unless the Minimzer itself is ThreadSafe which is probably not the case) + */ + + + public LinearClassifierFactory() { + this(new QNMinimizer(15)); + this.mem = 15; + this.useQuasiNewton(); + } + + public LinearClassifierFactory(Minimizer min) { + this(min, false); + } + + public LinearClassifierFactory(boolean useSum) { + this(new QNMinimizer(15), useSum); + this.mem = 15; + this.useQuasiNewton(); + } + + public LinearClassifierFactory(double tol) { + this(new QNMinimizer(15), tol, false); + this.mem = 15; + this.useQuasiNewton(); + } + public 
LinearClassifierFactory(Minimizer min, boolean useSum) { + this(min, 1e-4, useSum); + } + public LinearClassifierFactory(Minimizer min, double tol, boolean useSum) { + this(min, tol, useSum, 1.0); + } + public LinearClassifierFactory(double tol, boolean useSum, double sigma) { + this(new QNMinimizer(15), tol, useSum, sigma); + this.mem = 15; + this.useQuasiNewton(); + } + public LinearClassifierFactory(Minimizer min, double tol, boolean useSum, double sigma) { + this(min, tol, useSum, LogPrior.LogPriorType.QUADRATIC.ordinal(), sigma); + } + public LinearClassifierFactory(Minimizer min, double tol, boolean useSum, int prior, double sigma) { + this(min, tol, useSum, prior, sigma, 0.0); + } + public LinearClassifierFactory(double tol, boolean useSum, int prior, double sigma, double epsilon) { + this(new QNMinimizer(15), tol, useSum, new LogPrior(prior, sigma, epsilon)); + this.mem = 15; + this.useQuasiNewton(); + } + + public LinearClassifierFactory(double tol, boolean useSum, int prior, double sigma, double epsilon, int mem) { + this(new QNMinimizer(mem), tol, useSum, new LogPrior(prior, sigma, epsilon)); + this.useQuasiNewton(); + } + + /** + * Create a factory that builds linear classifiers from training data. + * + * @param min The method to be used for optimization (minimization) (default: {@link QNMinimizer}) + * @param tol The convergence threshold for the minimization (default: 1e-4) + * @param useSum Asks to the optimizer to minimize the sum of the + * likelihoods of individual data items rather than their product (default: false) + * NOTE: this is currently ignored!!! 
+ * @param prior What kind of prior to use, as an enum constant from class + * LogPrior + * @param sigma The strength of the prior (smaller is stronger for most + * standard priors) (default: 1.0) + * @param epsilon A second parameter to the prior (currently only used + * by the Huber prior) + */ + public LinearClassifierFactory(Minimizer min, double tol, boolean useSum, int prior, double sigma, double epsilon) { + this(min, tol, useSum, new LogPrior(prior, sigma, epsilon)); + } + + public LinearClassifierFactory(final Minimizer min, double tol, boolean useSum, LogPrior logPrior) { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return min; + } + }; + this.TOL = tol; + //this.useSum = useSum; + this.logPrior = logPrior; + } + + public LinearClassifierFactory(Factory> minimizerCreator, double tol, boolean useSum, LogPrior logPrior) { + this.minimizerCreator = minimizerCreator; + this.TOL = tol; + //this.useSum = useSum; + this.logPrior = logPrior; + } + + /** + * Set the tolerance. 1e-4 is the default. + */ + public void setTol(double tol) { + this.TOL = tol; + } + + /** + * Set the prior. + * + * @param logPrior One of the priors defined in + * LogConditionalObjectiveFunction. + * LogPrior.QUADRATIC is the default. + */ + public void setPrior(LogPrior logPrior) { + this.logPrior = logPrior; + } + + /** + * Set the verbose flag for {@link CGMinimizer}. + * Only used with conjugate-gradient minimization. + * false is the default. + */ + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + + /** + * Sets the minimizer. {@link QNMinimizer} is the default. + */ + public void setMinimizerCreator(Factory> minimizerCreator) { + this.minimizerCreator = minimizerCreator; + } + + /** + * Sets the epsilon value for {@link LogConditionalObjectiveFunction}. 
+ */ + public void setEpsilon(double eps) { + logPrior.setEpsilon(eps); + } + + public void setSigma(double sigma) { + logPrior.setSigma(sigma); + } + + public double getSigma() { + return logPrior.getSigma(); + } + + /** + * Sets the minimizer to QuasiNewton. {@link QNMinimizer} is the default. + */ + public void useQuasiNewton() { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new QNMinimizer(LinearClassifierFactory.this.mem); + } + }; + } + + public void useQuasiNewton(final boolean useRobust) { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new QNMinimizer(LinearClassifierFactory.this.mem, useRobust); + } + }; + } + + public void useStochasticQN(final double initialSMDGain, final int stochasticBatchSize){ + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new SQNMinimizer(LinearClassifierFactory.this.mem, initialSMDGain, stochasticBatchSize, false); + } + }; + } + + public void useStochasticMetaDescent(){ + useStochasticMetaDescent(0.1,15,StochasticCalculateMethods.ExternalFiniteDifference,20); + } + + public void useStochasticMetaDescent(final double initialSMDGain, final int stochasticBatchSize, + final StochasticCalculateMethods stochasticMethod,final int passes) { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new SMDMinimizer(initialSMDGain,stochasticBatchSize,stochasticMethod,passes); + } + }; + } + + public void useStochasticGradientDescent(){ + useStochasticGradientDescent(0.1,15); + } + + public void useStochasticGradientDescent(final double gainSGD, final int stochasticBatchSize){ + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new SGDMinimizer(gainSGD,stochasticBatchSize); + } + }; + } + + public void useInPlaceStochasticGradientDescent() { + useInPlaceStochasticGradientDescent(-1, -1, 1.0); + } + + public void useInPlaceStochasticGradientDescent(final int SGDPasses, final int 
tuneSampleSize, final double sigma) { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new StochasticInPlaceMinimizer(sigma, SGDPasses, tuneSampleSize); + } + }; + } + + public void useHybridMinimizerWithInPlaceSGD(final int SGDPasses, final int tuneSampleSize, final double sigma) { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + Minimizer firstMinimizer = new StochasticInPlaceMinimizer(sigma, SGDPasses, tuneSampleSize); + Minimizer secondMinimizer = new QNMinimizer(mem); + return new HybridMinimizer(firstMinimizer, secondMinimizer, SGDPasses); + } + }; + } + + public void useStochasticGradientDescentToQuasiNewton(final double SGDGain, final int batchSize, final int sgdPasses, + final int qnPasses, final int hessSamples, final int QNMem, + final boolean outputToFile) { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new SGDToQNMinimizer(SGDGain, batchSize, sgdPasses, + qnPasses, hessSamples, QNMem, outputToFile); + } + }; + } + + public void useHybridMinimizer() { + useHybridMinimizer(0.1,15,StochasticCalculateMethods.ExternalFiniteDifference , 0); + } + + public void useHybridMinimizer(final double initialSMDGain, final int stochasticBatchSize, + final StochasticCalculateMethods stochasticMethod, final int cutoffIteration){ + this.minimizerCreator = new Factory>() { + public Minimizer create() { + Minimizer firstMinimizer = new SMDMinimizer(initialSMDGain, stochasticBatchSize,stochasticMethod,cutoffIteration); + Minimizer secondMinimizer = new QNMinimizer(mem); + return new HybridMinimizer(firstMinimizer,secondMinimizer,cutoffIteration); + } + }; + } + + /** + * Set the mem value for {@link QNMinimizer}. + * Only used with quasi-newton minimization. 15 is the default. + * + * @param mem Number of previous function/derivative evaluations to store + * to estimate second derivative. Storing more previous evaluations + * improves training convergence speed. 
This number can be very + * small, if memory conservation is the priority. For large + * optimization systems (of 100,000-1,000,000 dimensions), setting this + * to 15 produces quite good results, but setting it to 50 can + * decrease the iteration count by about 20% over a value of 15. + */ + public void setMem(int mem) { + this.mem = mem; + } + + /** + * Sets the minimizer to {@link CGMinimizer}, with the passed verbose flag. + */ + public void useConjugateGradientAscent(boolean verbose) { + this.verbose = verbose; + useConjugateGradientAscent(); + } + + /** + * Sets the minimizer to {@link CGMinimizer}. + */ + public void useConjugateGradientAscent() { + this.minimizerCreator = new Factory>() { + public Minimizer create() { + return new CGMinimizer(!LinearClassifierFactory.this.verbose); + } + }; + } + + /** + * NOTE: nothing is actually done with this value! + * + * SetUseSum sets the useSum flag: when turned on, + * the Summed Conditional Objective Function is used. Otherwise, the + * LogConditionalObjectiveFunction is used. The default is false. + */ + public void setUseSum(boolean useSum) { + //this.useSum = useSum; + } + + /** + * setTuneSigmaHeldOut sets the tuneSigmaHeldOut flag: when turned on, + * the sigma is tuned by means of held-out (70%-30%). Otherwise no tuning on sigma is done. + * The default is false. + */ + public void setTuneSigmaHeldOut() { + tuneSigmaHeldOut = true; + tuneSigmaCV = false; + } + + /** + * setTuneSigmaCV sets the tuneSigmaCV flag: when turned on, + * the sigma is tuned by cross-validation. The number of folds is the parameter. + * If there is less data than the number of folds, leave-one-out is used. + * The default is false. + */ + public void setTuneSigmaCV(int folds) { + tuneSigmaCV = true; + tuneSigmaHeldOut = false; + this.folds = folds; + } + + /** + * NOTE: Nothing is actually done with this value. + * + * resetWeight sets the restWeight flag. 
This flag makes sense only if sigma is tuned: + * when turned on, the weights output by the tuneSigma method will be reset to zero when training the + * classifier. + * The default is false. + */ + public void resetWeight() { + //resetWeight = true; + } + + protected static double[] sigmasToTry = {0.5,1.0,2.0,4.0,10.0, 20.0, 100.0}; + + /** + * Calls the method {@link #crossValidateSetSigma(GeneralDataset, int)} with 5-fold cross-validation. + * @param dataset the data set to optimize sigma on. + */ + public void crossValidateSetSigma(GeneralDataset dataset) { + crossValidateSetSigma(dataset, 5); + } + + /** + * callls the method {@link #crossValidateSetSigma(GeneralDataset, int, Scorer, LineSearcher)} with + * multi-class log-likelihood scoring (see {@link MultiClassAccuracyStats}) and golden-section line search + * (see {@link GoldenSectionLineSearch}). + * @param dataset the data set to optimize sigma on. + */ + public void crossValidateSetSigma(GeneralDataset dataset,int kfold) { + System.err.println("##you are here."); + crossValidateSetSigma(dataset, kfold, new MultiClassAccuracyStats(MultiClassAccuracyStats.USE_LOGLIKELIHOOD), new GoldenSectionLineSearch(true, 1e-2, min, max)); + } + + public void crossValidateSetSigma(GeneralDataset dataset,int kfold, final Scorer scorer) { + crossValidateSetSigma(dataset, kfold, scorer, new GoldenSectionLineSearch(true, 1e-2, min, max)); + } + public void crossValidateSetSigma(GeneralDataset dataset,int kfold, LineSearcher minimizer) { + crossValidateSetSigma(dataset, kfold, new MultiClassAccuracyStats(MultiClassAccuracyStats.USE_LOGLIKELIHOOD), minimizer); + } + /** + * Sets the sigma parameter to a value that optimizes the cross-validation score given by scorer. Search for an optimal value + * is carried out by minimizer + * @param dataset the data set to optimize sigma on. 
+ */ + public void crossValidateSetSigma(GeneralDataset dataset,int kfold, final Scorer scorer, LineSearcher minimizer) { + System.err.println("##in Cross Validate, folds = " + kfold); + System.err.println("##Scorer is " + scorer); + + featureIndex = dataset.featureIndex; + labelIndex = dataset.labelIndex; + + final CrossValidator crossValidator = new CrossValidator(dataset,kfold); + final Function,GeneralDataset,CrossValidator.SavedState>,Double> score = + new Function,GeneralDataset,CrossValidator.SavedState>,Double> () + { + public Double apply (Triple,GeneralDataset,CrossValidator.SavedState> fold) { + GeneralDataset trainSet = fold.first(); + GeneralDataset devSet = fold.second(); + + double[] weights = (double[])fold.third().state; + double[][] weights2D; + + weights2D = trainWeights(trainSet, weights,true); // must of course bypass sigma tuning here. + + fold.third().state = ArrayUtils.flatten(weights2D); + + LinearClassifier classifier = new LinearClassifier(weights2D, trainSet.featureIndex, trainSet.labelIndex); + + double score = scorer.score(classifier, devSet); + //System.out.println("score: "+score); + System.out.print("."); + return score; + } + }; + + Function negativeScorer = + new Function () + { + public Double apply(Double sigmaToTry) { + //sigma = sigmaToTry; + setSigma(sigmaToTry); + Double averageScore = crossValidator.computeAverage(score); + System.err.print("##sigma = "+getSigma()+" "); + System.err.println("-> average Score: "+averageScore); + return -averageScore; + } + }; + + double bestSigma = minimizer.minimize(negativeScorer); + System.err.println("##best sigma: " + bestSigma); + setSigma(bestSigma); + } + + /** + * Set the {@link LineSearcher} to be used in {@link #heldOutSetSigma(GeneralDataset, GeneralDataset)}. 
+ */ + public void setHeldOutSearcher(LineSearcher heldOutSearcher) { + this.heldOutSearcher = heldOutSearcher; + } + + private LineSearcher heldOutSearcher = null; + public double[] heldOutSetSigma(GeneralDataset train) { + Pair, GeneralDataset> data = train.split(0.3); + return heldOutSetSigma(data.first(), data.second()); + } + + public double[] heldOutSetSigma(GeneralDataset train, Scorer scorer) { + Pair, GeneralDataset> data = train.split(0.3); + return heldOutSetSigma(data.first(), data.second(), scorer); + } + + public double[] heldOutSetSigma(GeneralDataset train, GeneralDataset dev) { + return heldOutSetSigma(train, dev, new MultiClassAccuracyStats(MultiClassAccuracyStats.USE_LOGLIKELIHOOD), heldOutSearcher == null ? new GoldenSectionLineSearch(true, 1e-2, min, max) : heldOutSearcher); + } + + public double[] heldOutSetSigma(GeneralDataset train, GeneralDataset dev, final Scorer scorer) { + return heldOutSetSigma(train, dev, scorer, new GoldenSectionLineSearch(true, 1e-2, min, max)); + } + public double[] heldOutSetSigma(GeneralDataset train, GeneralDataset dev, LineSearcher minimizer) { + return heldOutSetSigma(train, dev, new MultiClassAccuracyStats(MultiClassAccuracyStats.USE_LOGLIKELIHOOD), minimizer); + } + + /** + * Sets the sigma parameter to a value that optimizes the held-out score given by scorer. Search for an optimal value + * is carried out by minimizer + * dataset the data set to optimize sigma on. 
+ * kfold + * @return an interim set of optimal weights: the weights + */ + public double[] heldOutSetSigma(final GeneralDataset trainSet, final GeneralDataset devSet, final Scorer scorer, LineSearcher minimizer) { + + featureIndex = trainSet.featureIndex; + labelIndex = trainSet.labelIndex; + //double[] resultWeights = null; + Timing timer = new Timing(); + + NegativeScorer negativeScorer = new NegativeScorer(trainSet,devSet,scorer,timer); + + timer.start(); + double bestSigma = minimizer.minimize(negativeScorer); + System.err.println("##best sigma: " + bestSigma); + setSigma(bestSigma); + + return ArrayUtils.flatten(trainWeights(trainSet,negativeScorer.weights,true)); // make sure it's actually the interim weights from best sigma + } + + class NegativeScorer implements Function { + public double[] weights = null; + GeneralDataset trainSet; + GeneralDataset devSet; + Scorer scorer; + Timing timer; + + public NegativeScorer(GeneralDataset trainSet, GeneralDataset devSet, Scorer scorer,Timing timer) { + super(); + this.trainSet = trainSet; + this.devSet = devSet; + this.scorer = scorer; + this.timer = timer; + } + + public Double apply(Double sigmaToTry) { + double[][] weights2D; + setSigma(sigmaToTry); + + weights2D = trainWeights(trainSet, weights,true); //bypass. + + weights = ArrayUtils.flatten(weights2D); + + LinearClassifier classifier = new LinearClassifier(weights2D, trainSet.featureIndex, trainSet.labelIndex); + + double score = scorer.score(classifier, devSet); + //System.out.println("score: "+score); + //System.out.print("."); + System.err.print("##sigma = "+getSigma()+" "); + System.err.println("-> average Score: "+ score); + System.err.println("##time elapsed: " + timer.stop() + " milliseconds."); + timer.restart(); + return -score; + } + } + + /** If set to true, then when training a classifier, after an optimal sigma is chosen a model is relearned from + * scratch. 
If set to false (the default), then the model is updated from wherever it wound up in the sigma-tuning process. + * The latter is likely to be faster, but it's not clear which model will wind up better. */ + public void setRetrainFromScratchAfterSigmaTuning( boolean retrainFromScratchAfterSigmaTuning) { + this.retrainFromScratchAfterSigmaTuning = retrainFromScratchAfterSigmaTuning; + } + + + public Classifier trainClassifier(Iterable> dataIterable) { + Minimizer minimizer = getMinimizer(); + Index featureIndex = Generics.newIndex(); + Index labelIndex = Generics.newIndex(); + for (Datum d : dataIterable) { + labelIndex.add(d.label()); + featureIndex.addAll(d.asFeatures());//If there are duplicates, it doesn't add them again. + } + System.err.println(String.format("Training linear classifier with %d features and %d labels", featureIndex.size(), labelIndex.size())); + + LogConditionalObjectiveFunction objective = new LogConditionalObjectiveFunction(dataIterable, logPrior, featureIndex, labelIndex); + objective.setPrior(new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + + double[] initial = objective.initial(); + double[] weights = minimizer.minimize(objective, TOL, initial); + + LinearClassifier classifier = new LinearClassifier(objective.to2D(weights), featureIndex, labelIndex); + return classifier; + } + + public Classifier trainClassifier(GeneralDataset dataset, float[] dataWeights, LogPrior prior) { + Minimizer minimizer = getMinimizer(); + if(dataset instanceof RVFDataset) + ((RVFDataset)dataset).ensureRealValues(); + LogConditionalObjectiveFunction objective = new LogConditionalObjectiveFunction(dataset, dataWeights, logPrior); + + double[] initial = objective.initial(); + double[] weights = minimizer.minimize(objective, TOL, initial); + + LinearClassifier classifier = new LinearClassifier(objective.to2D(weights), dataset.featureIndex(), dataset.labelIndex()); + return classifier; + } + + + @Override + public LinearClassifier trainClassifier(GeneralDataset 
dataset) { + return trainClassifier(dataset, null); + } + public LinearClassifier trainClassifier(GeneralDataset dataset, double[] initial) { + if(dataset instanceof RVFDataset) + ((RVFDataset)dataset).ensureRealValues(); + double[][] weights = trainWeights(dataset, initial, false); + LinearClassifier classifier = new LinearClassifier(weights, dataset.featureIndex(), dataset.labelIndex()); + return classifier; + } + public LinearClassifier trainClassifierWithInitialWeights(GeneralDataset dataset, double[][] initialWeights2D) { + double[] initialWeights = (initialWeights2D != null)? ArrayUtils.flatten(initialWeights2D):null; + return trainClassifier(dataset, initialWeights); + } + public LinearClassifier trainClassifierWithInitialWeights(GeneralDataset dataset, LinearClassifier initialClassifier) { + double[][] initialWeights2D = (initialClassifier != null)? initialClassifier.weights():null; + return trainClassifierWithInitialWeights(dataset, initialWeights2D); + } + + + /** + * Given the path to a file representing the text based serialization of a + * Linear Classifier, reconstitutes and returns that LinearClassifier. 
+ * + * TODO: Leverage Index + */ + public Classifier loadFromFilename(String file) { + try { + File tgtFile = new File(file); + BufferedReader in = new BufferedReader(new FileReader(tgtFile)); + + // Format: read indicies first, weights, then thresholds + Index labelIndex = HashIndex.loadFromReader(in); + Index featureIndex = HashIndex.loadFromReader(in); + double[][] weights = new double[featureIndex.size()][labelIndex.size()]; + String line = in.readLine(); + int currLine = 1; + while (line != null && line.length()>0) { + String[] tuples = line.split(LinearClassifier.TEXT_SERIALIZATION_DELIMITER); + if (tuples.length != 3) { + throw new Exception("Error: incorrect number of tokens in weight specifier, line=" + +currLine+" in file "+tgtFile.getAbsolutePath()); + } + currLine++; + int feature = Integer.valueOf(tuples[0]); + int label = Integer.valueOf(tuples[1]); + double value = Double.valueOf(tuples[2]); + weights[feature][label] = value; + line = in.readLine(); + } + + // First line in thresholds is the number of thresholds + int numThresholds = Integer.valueOf(in.readLine()); + double[] thresholds = new double[numThresholds]; + int curr = 0; + while ((line = in.readLine()) != null) { + double tval = Double.valueOf(line.trim()); + thresholds[curr++] = tval; + } + in.close(); + LinearClassifier classifier = new LinearClassifier(weights, featureIndex, labelIndex); + return classifier; + } catch (Exception e) { + System.err.println("Error in LinearClassifierFactory, loading from file="+file); + e.printStackTrace(); + return null; + } + } + + @Deprecated + @Override + public LinearClassifier trainClassifier(List> examples) { + // TODO Auto-generated method stub + return null; + } + + public void setEvaluators(int iters, Evaluator[] evaluators) + { + this.evalIters = iters; + this.evaluators = evaluators; + } + + public LinearClassifierCreator getClassifierCreator(GeneralDataset dataset) { +// LogConditionalObjectiveFunction objective = new 
LogConditionalObjectiveFunction(dataset, logPrior); + return new LinearClassifierCreator(dataset.featureIndex, dataset.labelIndex); + } + + public static class LinearClassifierCreator implements ClassifierCreator, ProbabilisticClassifierCreator + { + LogConditionalObjectiveFunction objective; + Index featureIndex; + Index labelIndex; + + public LinearClassifierCreator(LogConditionalObjectiveFunction objective, Index featureIndex, Index labelIndex) + { + this.objective = objective; + this.featureIndex = featureIndex; + this.labelIndex = labelIndex; + } + + public LinearClassifierCreator(Index featureIndex, Index labelIndex) + { + this.featureIndex = featureIndex; + this.labelIndex = labelIndex; + } + + public LinearClassifier createLinearClassifier(double[] weights) { + double[][] weights2D; + if (objective != null) { + weights2D = objective.to2D(weights); + } else { + weights2D = ArrayUtils.to2D(weights, featureIndex.size(), labelIndex.size()); + } + return new LinearClassifier(weights2D, featureIndex, labelIndex); + } + + public Classifier createClassifier(double[] weights) { + return createLinearClassifier(weights); + } + + public ProbabilisticClassifier createProbabilisticClassifier(double[] weights) { + return createLinearClassifier(weights); + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogConditionalObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogConditionalObjectiveFunction.java new file mode 100644 index 0000000..e2c8793 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogConditionalObjectiveFunction.java @@ -0,0 +1,901 @@ +package edu.stanford.nlp.classify; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; + +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.math.ADMath; +import edu.stanford.nlp.math.ArrayMath; +import 
edu.stanford.nlp.math.DoubleAD; +import edu.stanford.nlp.optimization.AbstractStochasticCachingDiffUpdateFunction; +import edu.stanford.nlp.optimization.StochasticCalculateMethods; +import edu.stanford.nlp.util.Index; + + +/** + * Maximizes the conditional likelihood with a given prior. + * + * @author Dan Klein + * @author Galen Andrew + * @author Chris Cox (merged w/ SumConditionalObjectiveFunction, 2/16/05) + * @author Sarah Spikes (Templatization, allowing an Iterable> to be passed in instead of a GeneralDataset) + * @author Angel Chang (support in place SGD - extend AbstractStochasticCachingDiffUpdateFunction) + */ + +public class LogConditionalObjectiveFunction extends AbstractStochasticCachingDiffUpdateFunction { + + public void setPrior(LogPrior prior) { + this.prior = prior; + clearCache(); + } + + protected LogPrior prior; + + protected int numFeatures = 0; + protected int numClasses = 0; + + protected int[][] data = null; + protected Iterable> dataIterable = null; + protected double[][] values = null; + protected int[] labels = null; + protected float[] dataweights = null; + protected double[] derivativeNumerator = null; + + protected DoubleAD[] xAD = null; + protected double [] priorDerivative = null; //The only reason this is around is because the Prior Functions don't handle stochastic calculations yet. 
+ protected DoubleAD[] derivativeAD = null; + protected DoubleAD[] sums = null; + protected DoubleAD[] probs = null; + + protected Index labelIndex = null; + protected Index featureIndex = null; + protected boolean useIterable = false; + + protected boolean useSummedConditionalLikelihood = false; //whether to use sumConditional or logConditional + + @Override + public int domainDimension() { + return numFeatures * numClasses; + } + + @Override + public int dataDimension(){ + return data.length; + } + + int classOf(int index) { + return index % numClasses; + } + + int featureOf(int index) { + return index / numClasses; + } + + protected int indexOf(int f, int c) { + return f * numClasses + c; + } + + public double[][] to2D(double[] x) { + double[][] x2 = new double[numFeatures][numClasses]; + for (int i = 0; i < numFeatures; i++) { + for (int j = 0; j < numClasses; j++) { + x2[i][j] = x[indexOf(i, j)]; + } + } + return x2; + } + + /** + * Calculate the conditional likelihood. + * If useSummedConditionalLikelihood is false (the default), + * this calculates standard(product) CL, otherwise this calculates summed CL. + * What's the difference? See Klein and Manning's 2002 EMNLP paper. + */ + @Override + protected void calculate(double[] x) { + //If the batchSize is 0 then use the regular calculate methods + if (useSummedConditionalLikelihood) { + calculateSCL(x); + } else { + calculateCL(x); + } + + } + + + + /* + * This function is used to comme up with an estimate of the value / gradient based on only a small + * portion of the data (refered to as the batchSize for lack of a better term. In this case batch does + * not mean All!! It should be thought of in the sense of "a small batch of the data". 
+ */ + + + @Override + public void calculateStochastic(double[] x, double[] v, int[] batch){ + + if(method.calculatesHessianVectorProduct() && v != null){ + // This is used for Stochastic Methods that involve second order information (SMD for example) + if(method.equals(StochasticCalculateMethods.AlgorithmicDifferentiation)){ + calculateStochasticAlgorithmicDifferentiation(x,v,batch); + }else if(method.equals(StochasticCalculateMethods.IncorporatedFiniteDifference)){ + calculateStochasticFiniteDifference(x,v,finiteDifferenceStepSize,batch); + } + } else{ + //This is used for Stochastic Methods that don't need anything but the gradient (SGD) + calculateStochasticGradientOnly(x,batch); + } + + } + + + + + /** + * Calculate the summed conditional likelihood of this data by summing + * conditional estimates. + * + */ + private void calculateSCL(double[] x) { + //System.out.println("Checking at: "+x[0]+" "+x[1]+" "+x[2]); + value = 0.0; + Arrays.fill(derivative, 0.0); + double[] sums = new double[numClasses]; + double[] probs = new double[numClasses]; + double[] counts = new double[numClasses]; + Arrays.fill(counts, 0.0); + for (int d = 0; d < data.length; d++) { + // if (d == testMin) { + // d = testMax - 1; + // continue; + // } + int[] features = data[d]; + // activation + Arrays.fill(sums, 0.0); + for (int c = 0; c < numClasses; c++) { + for (int f = 0; f < features.length; f++) { + int i = indexOf(features[f], c); + sums[c] += x[i]; + } + } + // expectation (slower routine replaced by fast way) + // double total = Double.NEGATIVE_INFINITY; + // for (int c=0; c datum : dataIterable) { + // if (d == testMin) { + // d = testMax - 1; + // continue; + // } + Collection features = datum.asFeatures(); + for (F feature : features) { + int i = indexOf(featureIndex.indexOf(feature), labelIndex.indexOf(datum.label())); + if (dataweights == null) { + derivativeNumerator[i] -= 1; + } /*else { + derivativeNumerator[i] -= dataweights[index]; + }*/ + } + } + } + else { + 
System.err.println("Both were null! Couldn't calculate."); + System.exit(-1); + } + } + copy(derivative, derivativeNumerator); + // Arrays.fill(derivative, 0.0); + double[] sums = new double[numClasses]; + double[] probs = new double[numClasses]; + // double[] counts = new double[numClasses]; + // Arrays.fill(counts, 0.0); + + Iterator> iter = null; + int d = -1; + if(useIterable) + iter = dataIterable.iterator(); + Datum datum = null; + while(true){ + if(useIterable) { + if(!iter.hasNext()) break; + datum = iter.next(); + } else { + d++; + if(d >= data.length) break; + } + + // if (d == testMin) { + // d = testMax - 1; + // continue; + // } + + // activation + Arrays.fill(sums, 0.0); + double total = 0; + if(!useIterable) { + int[] featuresArr = data[d]; + + for (int c = 0; c < numClasses; c++) { + for (int f = 0; f < featuresArr.length; f++) { + int i = indexOf(featuresArr[f], c); + sums[c] += x[i]; + } + } + // expectation (slower routine replaced by fast way) + // double total = Double.NEGATIVE_INFINITY; + // for (int c=0; c features = datum.asFeatures(); + for (int c = 0; c < numClasses; c++) { + for (F feature : features) { + int i = indexOf(featureIndex.indexOf(feature), c); + sums[c] += x[i]; + } + } + // expectation (slower routine replaced by fast way) + // double total = Double.NEGATIVE_INFINITY; + // for (int c=0; c dataset) { + this(dataset, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public LogConditionalObjectiveFunction(GeneralDataset dataset, LogPrior prior) { + this(dataset, prior, false); + } + + public LogConditionalObjectiveFunction(GeneralDataset dataset, float[] dataWeights, LogPrior prior) { + this(dataset, prior, false); + this.dataweights = dataWeights; + System.err.println("correct constructor"); + } + + public LogConditionalObjectiveFunction(GeneralDataset dataset, LogPrior prior, boolean useSumCondObjFun) { + setPrior(prior); + setUseSumCondObjFun(useSumCondObjFun); + this.numFeatures = dataset.numFeatures(); + 
this.numClasses = dataset.numClasses(); + this.data = dataset.getDataArray(); + this.labels = dataset.getLabelsArray(); + this.values = dataset.getValuesArray(); + if (dataset instanceof WeightedDataset) { + this.dataweights = ((WeightedDataset)dataset).getWeights(); + } + } + + //TODO: test this + public LogConditionalObjectiveFunction(Iterable> dataIterable, LogPrior logPrior, Index featureIndex, Index labelIndex) { + setPrior(logPrior); + setUseSumCondObjFun(false); + this.useIterable = true; + this.numFeatures = featureIndex.size(); + this.numClasses = labelIndex.size(); + this.data = null; + this.dataIterable = dataIterable; + + this.labelIndex = labelIndex; + this.featureIndex = featureIndex; + this.labels = null;//dataset.getLabelsArray(); + this.values = null;//dataset.getValuesArray(); + //this.dataweights //leave it null? + } + + public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, boolean useSumCondObjFun) { + this(numFeatures, numClasses, data, labels); + this.useSummedConditionalLikelihood = useSumCondObjFun; + } + + public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels) { + this(numFeatures, numClasses, data, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, LogPrior prior) { + this(numFeatures, numClasses, data, labels, null, prior); + } + + public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, float[] dataweights) { + this(numFeatures, numClasses, data, labels, dataweights, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, float[] dataweights, LogPrior prior) { + this.numFeatures = numFeatures; + this.numClasses = numClasses; + this.data = data; + this.labels = labels; + this.prior = prior; + 
this.dataweights = dataweights; + // this.testMin = data.length; + // this.testMax = data.length; + } + + public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, int intPrior, double sigma, double epsilon) { + this(numFeatures, numClasses, data, null, labels, intPrior, sigma, epsilon); + } + + public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, double[][] values, int[] labels, int intPrior, double sigma, double epsilon) { + this.numFeatures = numFeatures; + this.numClasses = numClasses; + this.data = data; + this.values = values; + this.labels = labels; + this.prior = new LogPrior(intPrior, sigma, epsilon); + // this.testMin = data.length; + // this.testMax = data.length; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogPrior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogPrior.java new file mode 100644 index 0000000..19257c1 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogPrior.java @@ -0,0 +1,334 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.util.ArrayUtils; + +import java.io.Serializable; + + +/** + * A Prior for functions. Immutable. 
+ * + * @author Galen Andrew + */ +public class LogPrior implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 7826853908892790965L; + + public enum LogPriorType { NULL, QUADRATIC, HUBER, QUARTIC, COSH, ADAPT, MULTIPLE_QUADRATIC } + + public static LogPriorType getType(String name) { + if (name.equalsIgnoreCase("null")) { return LogPriorType.NULL; } + else if (name.equalsIgnoreCase("quadratic")) { return LogPriorType.QUADRATIC; } + else if (name.equalsIgnoreCase("huber")) { return LogPriorType.HUBER; } + else if (name.equalsIgnoreCase("quartic")) { return LogPriorType.QUARTIC; } + else if (name.equalsIgnoreCase("cosh")) { return LogPriorType.COSH; } +// else if (name.equalsIgnoreCase("multiple")) { return LogPriorType.MULTIPLE; } + else { throw new RuntimeException("Unknown LogPriorType: "+name); } + } + + // these fields are just for the ADAPT prior - + // is there a better way to do this? + private double[] means = null; + private LogPrior otherPrior = null; + + public static LogPrior getAdaptationPrior(double[] means, LogPrior otherPrior) { + LogPrior lp = new LogPrior(LogPriorType.ADAPT); + lp.means = means; + lp.otherPrior = otherPrior; + return lp; + } + + public LogPriorType getType() { + return type; + } + + public final LogPriorType type; + + public LogPrior() { + this(LogPriorType.QUADRATIC); + } + + public LogPrior(int intPrior) { + this(intPrior, 1.0, 0.1); + } + + public LogPrior(LogPriorType type) { + this(type, 1.0, 0.1); + } + + // why isn't this functionality in enum? 
+ private static LogPriorType intToType(int intPrior) { + LogPriorType[] values = LogPriorType.values(); + for (LogPriorType val : values) { + if (val.ordinal() == intPrior) { + return val; + } + } + throw new IllegalArgumentException(intPrior + " is not a legal LogPrior."); + } + + public LogPrior(int intPrior, double sigma, double epsilon) { + this(intToType(intPrior), sigma, epsilon); + } + + public LogPrior(LogPriorType type, double sigma, double epsilon) { + this.type = type; + if (type != LogPriorType.ADAPT) { + setSigma(sigma); + setEpsilon(epsilon); + } + } + + + // this is the C variable in CSFoo's MM paper C = 1/\sigma^2 +// private double[] regularizationHyperparameters = null; + + private double[] sigmaSqM = null; + private double[] sigmaQuM = null; + + +// public double[] getRegularizationHyperparameters() { +// return regularizationHyperparameters; +// } +// +// public void setRegularizationHyperparameters( +// double[] regularizationHyperparameters) { +// this.regularizationHyperparameters = regularizationHyperparameters; +// } + + /** + * IMPORTANT NOTE: This constructor allows non-uniform regularization, but it + * transforms the inputs C (like the machine learning people like) to sigma + * (like we NLP folks like). 
C = 1/\sigma^2 + */ + public LogPrior(double[] C) { + this.type = LogPriorType.MULTIPLE_QUADRATIC; + double[] sigmaSqM = new double[C.length]; + for (int i=0;i 30.0) { + val = norm - Math.log(2); + d = 1.0 / sigmaSq; + } else { + val = Math.log(Math.cosh(norm)); + d = (2 * (1 / (Math.exp(-2.0 * norm) + 1)) - 1.0) / sigmaSq; + } + for (int i=0; i < x.length; i++) { + grad[i] += Math.signum(x[i]) * d; + } + return val; + case MULTIPLE_QUADRATIC: +// for (int i = 0; i < x.length; i++) { +// val += x[i] * x[i]* 1/2 * regularizationHyperparameters[i]; +// grad[i] += x[i] * regularizationHyperparameters[i]; +// } + + for (int i = 0; i < x.length; i++) { + val += x[i] * x[i] / 2.0 / sigmaSqM[i]; + grad[i] += x[i] / sigmaSqM[i]; + } + + + return val; + default: + throw new RuntimeException("LogPrior.valueAt is undefined for prior of type " + this); + } + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticClassifier.java new file mode 100644 index 0000000..ecc2e49 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticClassifier.java @@ -0,0 +1,386 @@ +// Stanford Classifier - a multiclass maxent classifier +// LogisticClassifier +// Copyright (c) 2003-2007 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// Support/Questions: java-nlp-user@lists.stanford.edu +// Licensing: java-nlp-support@lists.stanford.edu +// http://www-nlp.stanford.edu/software/classifier.shtml + +package edu.stanford.nlp.classify; + +import java.io.File; +import java.io.Serializable; +import java.util.*; + +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.optimization.DiffFunction; +import edu.stanford.nlp.optimization.Minimizer; +import edu.stanford.nlp.optimization.QNMinimizer; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.util.ErasureUtils; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.ReflectionLoading; +import edu.stanford.nlp.util.StringUtils; + +/** + * A classifier for binary logistic regression problems. + * This uses the standard statistics textbook formulation of binary + * logistic regression, which is more efficient than using the + * LinearClassifier class. + * + * @author Galen Andrew + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * @author Ramesh Nallapati nmramesh@cs.stanford.edu {@link #justificationOf(Collection)} + * + * @param The type of the labels in the Dataset + * @param The type of the features in the Dataset + */ +public class LogisticClassifier implements Classifier, Serializable, RVFClassifier { + + //TODO make it implement ProbabilisticClassifier as well. --Ramesh 12/03/2009. 
+ /** + * + */ + private static final long serialVersionUID = 6672245467246897192L; + private double[] weights; + private Index featureIndex; + private L[] classes = ErasureUtils.mkTArray(Object.class,2); + @Deprecated + private LogPrior prior; + @Deprecated + private boolean biased = false; + + @Override + public String toString() { + if (featureIndex == null) { + return ""; + } + + StringBuilder sb = new StringBuilder(); + for (F f : featureIndex) { + sb.append(classes[1]).append(" / ").append(f).append(" = ").append(weights[featureIndex.indexOf(f)]); + } + + return sb.toString(); + } + + public L getLabelForInternalPositiveClass(){ + return classes[1]; + } + + public L getLabelForInternalNegativeClass(){ + return classes[0]; + } + + // todo [cdm]: This method should be removed, and weightsAsGenericCounter renamed as weightsAsCounter! + public Counter weightsAsCounter() { + Counter c = new ClassicCounter(); + for (F f : featureIndex) { + c.incrementCount(classes[1]+" / "+f, weights[featureIndex.indexOf(f)]); + } + + return c; + } + + public Counter weightsAsGenericCounter() { + Counter c = new ClassicCounter(); + for (F f : featureIndex) { + double w = weights[featureIndex.indexOf(f)]; + if(w != 0.0) + c.setCount(f, w); + } + return c; + } + + public Index getFeatureIndex() { + return featureIndex; + } + + public double[] getWeights() { + return weights; + } + + + public LogisticClassifier(double[] weights, Index featureIndex, L[] classes){ + this.weights = weights; + this.featureIndex = featureIndex; + this.classes = classes; + } + + + @Deprecated //use LogisticClassifierFactory instead + public LogisticClassifier(boolean biased) { + this(new LogPrior(LogPrior.LogPriorType.QUADRATIC), biased); + } + + @Deprecated //use in LogisticClassifierFactory instead. 
+ public LogisticClassifier(LogPrior prior) { + this.prior = prior; + } + + + @Deprecated //use in LogisticClassifierFactory instead + public LogisticClassifier(LogPrior prior, boolean biased) { + this.prior = prior; + this.biased = biased; + } + + public Collection labels() { + Collection l = new LinkedList(); + l.add(classes[0]); + l.add(classes[1]); + return l; + } + + public L classOf(Datum datum) { + if(datum instanceof RVFDatum){ + return classOfRVFDatum((RVFDatum) datum); + } + return classOf(datum.asFeatures()); + } + + @Deprecated //use classOf(Datum) instead. + public L classOf(RVFDatum example) { + return classOf(example.asFeaturesCounter()); + } + + private L classOfRVFDatum(RVFDatum example) { + return classOf(example.asFeaturesCounter()); + } + + public L classOf(Counter features) { + if (scoreOf(features) > 0) { + return classes[1]; + } + return classes[0]; + } + + public L classOf(Collection features) { + if (scoreOf(features) > 0) { + return classes[1]; + } + return classes[0]; + } + + + public double scoreOf(Collection features) { + double sum = 0; + for (F feature : features) { + int f = featureIndex.indexOf(feature); + if (f >= 0) { + sum += weights[f]; + } + } + return sum; + } + + public double scoreOf(Counter features) { + double sum = 0; + for (F feature : features.keySet()) { + int f = featureIndex.indexOf(feature); + if (f >= 0) { + sum += weights[f]*features.getCount(feature); + } + } + return sum; + } + /* + * returns the weights to each feature assigned by the classifier + * nmramesh@cs.stanford.edu + */ + public Counter justificationOf(Counter features){ + Counter fWts = new ClassicCounter(); + for (F feature : features.keySet()) { + int f = featureIndex.indexOf(feature); + if (f >= 0) { + fWts.incrementCount(feature,weights[f]*features.getCount(feature)); + } + } + return fWts; + } + /** + * returns the weights assigned by the classifier to each feature + */ + public Counter justificationOf(Collection features){ + Counter fWts = new 
ClassicCounter(); + for (F feature : features) { + int f = featureIndex.indexOf(feature); + if (f >= 0) { + fWts.incrementCount(feature,weights[f]); + } + } + return fWts; + } + + /** + * returns the scores for both the classes + */ + public Counter scoresOf(Datum datum) { + if(datum instanceof RVFDatum)return scoresOfRVFDatum((RVFDatum)datum); + Collection features = datum.asFeatures(); + double sum = scoreOf(features); + Counter c = new ClassicCounter(); + c.setCount(classes[0], -sum); + c.setCount(classes[1], sum); + return c; + } + + + @Deprecated //use scoresOfDatum(Datum) instead. + public Counter scoresOf(RVFDatum example) { + return scoresOfRVFDatum(example); + } + + + private Counter scoresOfRVFDatum(RVFDatum example) { + Counter features = example.asFeaturesCounter(); + double sum = scoreOf(features); + Counter c = new ClassicCounter(); + c.setCount(classes[0], -sum); + c.setCount(classes[1], sum); + return c; + } + + public double probabilityOf(Datum example) { + if(example instanceof RVFDatum)return probabilityOfRVFDatum((RVFDatum)example); + return probabilityOf(example.asFeatures(), example.label()); + } + + public double probabilityOf(Collection features, L label) { + short sign = (short)(label.equals(classes[0]) ? 1 : -1); + return 1.0 / (1.0 + Math.exp(sign * scoreOf(features))); + } + + @Deprecated //use probabilityOf(Datum) instead. + public double probabilityOf(RVFDatum example) { + return probabilityOf(example.asFeaturesCounter(), example.label()); + } + + private double probabilityOfRVFDatum(RVFDatum example) { + return probabilityOf(example.asFeaturesCounter(), example.label()); + } + + public double probabilityOf(Counter features, L label) { + short sign = (short)(label.equals(classes[0]) ? 1 : -1); + return 1.0 / (1.0 + Math.exp(sign * scoreOf(features))); + } + + /** + * Trains on weighted dataset. + * @param dataWeights weights of the data. + */ + @Deprecated //Use LogisticClassifierFactory to train instead. 
+ public void trainWeightedData(GeneralDataset data, float[] dataWeights){ + if (data.labelIndex.size() != 2) { + throw new RuntimeException("LogisticClassifier is only for binary classification!"); + } + + Minimizer minim; + LogisticObjectiveFunction lof = null; + if(data instanceof Dataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getLabelsArray(), prior,dataWeights); + else if(data instanceof RVFDataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getValuesArray(), data.getLabelsArray(), prior,dataWeights); + minim = new QNMinimizer(lof); + weights = minim.minimize(lof, 1e-4, new double[data.numFeatureTypes()]); + + featureIndex = data.featureIndex; + classes[0] = data.labelIndex.get(0); + classes[1] = data.labelIndex.get(1); + } + + @Deprecated //Use LogisticClassifierFactory to train instead. + public void train(GeneralDataset data) { + train(data, 0.0, 1e-4); + } + + @Deprecated //Use LogisticClassifierFactory to train instead. 
+ public void train(GeneralDataset data, double l1reg, double tol) { + if (data.labelIndex.size() != 2) { + throw new RuntimeException("LogisticClassifier is only for binary classification!"); + } + + Minimizer minim; + if (!biased) { + LogisticObjectiveFunction lof = null; + if(data instanceof Dataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getLabelsArray(), prior); + else if(data instanceof RVFDataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getValuesArray(), data.getLabelsArray(), prior); + if (l1reg > 0.0) { + minim = ReflectionLoading.loadByReflection("edu.stanford.nlp.optimization.OWLQNMinimizer", l1reg); + } else { + minim = new QNMinimizer(lof); + } + weights = minim.minimize(lof, tol, new double[data.numFeatureTypes()]); + } else { + BiasedLogisticObjectiveFunction lof = new BiasedLogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getLabelsArray(), prior); + if (l1reg > 0.0) { + minim = ReflectionLoading.loadByReflection("edu.stanford.nlp.optimization.OWLQNMinimizer", l1reg); + } else { + minim = new QNMinimizer(lof); + } + weights = minim.minimize(lof, tol, new double[data.numFeatureTypes()]); + } + + featureIndex = data.featureIndex; + classes[0] = data.labelIndex.get(0); + classes[1] = data.labelIndex.get(1); + } + + + public static void main(String[] args) throws Exception { + Properties prop = StringUtils.argsToProperties(args); + + double l1reg = Double.parseDouble(prop.getProperty("l1reg","0.0")); + + Dataset ds = new Dataset(); + for (String line : ObjectBank.getLineIterator(new File(prop.getProperty("trainFile")))) { + String[] bits = line.split("\\s+"); + Collection f = new LinkedList(Arrays.asList(bits).subList(1, bits.length)); + String l = bits[0]; + ds.add(f, l); + } + + ds.summaryStatistics(); + + boolean biased = prop.getProperty("biased", "false").equals("true"); + LogisticClassifierFactory factory = new 
LogisticClassifierFactory(); + LogisticClassifier lc = factory.trainClassifier(ds, l1reg, 1e-4, biased); + + + for (String line : ObjectBank.getLineIterator(new File(prop.getProperty("testFile")))) { + String[] bits = line.split("\\s+"); + Collection f = new LinkedList(Arrays.asList(bits).subList(1, bits.length)); + //String l = bits[0]; + String g = lc.classOf(f); + System.out.println(g + '\t' + line); + } + + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticClassifierFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticClassifierFactory.java new file mode 100644 index 0000000..437319e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticClassifierFactory.java @@ -0,0 +1,117 @@ +package edu.stanford.nlp.classify; + +import java.util.List; + +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.optimization.DiffFunction; +import edu.stanford.nlp.optimization.Minimizer; +import edu.stanford.nlp.optimization.QNMinimizer; +import edu.stanford.nlp.util.ErasureUtils; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.ReflectionLoading; + +/** + * Builds a classifier for binary logistic regression problems. + * This uses the standard statistics textbook formulation of binary + * logistic regression, which is more efficient than using the + * LinearClassifier class. 
+ * + * @author Ramesh Nallapati nmramesh@cs.stanford.edu + * + */ +public class LogisticClassifierFactory implements ClassifierFactory> { + private static final long serialVersionUID = 1L; + private double[] weights; + private Index featureIndex; + private L[] classes = ErasureUtils.mkTArray(Object.class,2); + + + public LogisticClassifier trainWeightedData(GeneralDataset data, float[] dataWeights){ + if(data instanceof RVFDataset) + ((RVFDataset)data).ensureRealValues(); + if (data.labelIndex.size() != 2) { + throw new RuntimeException("LogisticClassifier is only for binary classification!"); + } + + Minimizer minim; + LogisticObjectiveFunction lof = null; + if(data instanceof Dataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getLabelsArray(), new LogPrior(LogPrior.LogPriorType.QUADRATIC),dataWeights); + else if(data instanceof RVFDataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getValuesArray(), data.getLabelsArray(), new LogPrior(LogPrior.LogPriorType.QUADRATIC),dataWeights); + minim = new QNMinimizer(lof); + weights = minim.minimize(lof, 1e-4, new double[data.numFeatureTypes()]); + + featureIndex = data.featureIndex; + classes[0] = data.labelIndex.get(0); + classes[1] = data.labelIndex.get(1); + return new LogisticClassifier(weights,featureIndex,classes); + } + + public LogisticClassifier trainClassifier(GeneralDataset data) { + return trainClassifier(data, 0.0); + } + + public LogisticClassifier trainClassifier(GeneralDataset data, LogPrior prior, boolean biased) { + return trainClassifier(data, 0.0, 1e-4, prior, biased); + } + + public LogisticClassifier trainClassifier(GeneralDataset data, double l1reg) { + return trainClassifier(data, l1reg, 1e-4); + } + + public LogisticClassifier trainClassifier(GeneralDataset data, double l1reg, double tol) { + return trainClassifier(data, l1reg, tol, new LogPrior(LogPrior.LogPriorType.QUADRATIC), false); + } + + public 
LogisticClassifier trainClassifier(GeneralDataset data, double l1reg, double tol, LogPrior prior) { + return trainClassifier(data, l1reg, tol, prior, false); + } + + public LogisticClassifier trainClassifier(GeneralDataset data, double l1reg, double tol, boolean biased) { + return trainClassifier(data, l1reg, tol, new LogPrior(LogPrior.LogPriorType.QUADRATIC), biased); + } + + public LogisticClassifier trainClassifier(GeneralDataset data, double l1reg, double tol, LogPrior prior, boolean biased) { + if(data instanceof RVFDataset) + ((RVFDataset)data).ensureRealValues(); + if (data.labelIndex.size() != 2) { + throw new RuntimeException("LogisticClassifier is only for binary classification!"); + } + + Minimizer minim; + if (!biased) { + LogisticObjectiveFunction lof = null; + if(data instanceof Dataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getLabelsArray(), prior); + else if(data instanceof RVFDataset) + lof = new LogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getValuesArray(), data.getLabelsArray(), prior); + if (l1reg > 0.0) { + minim = ReflectionLoading.loadByReflection("edu.stanford.nlp.optimization.OWLQNMinimizer", l1reg); + } else { + minim = new QNMinimizer(lof); + } + weights = minim.minimize(lof, tol, new double[data.numFeatureTypes()]); + } else { + BiasedLogisticObjectiveFunction lof = new BiasedLogisticObjectiveFunction(data.numFeatureTypes(), data.getDataArray(), data.getLabelsArray(), prior); + if (l1reg > 0.0) { + minim = ReflectionLoading.loadByReflection("edu.stanford.nlp.optimization.OWLQNMinimizer", l1reg); + } else { + minim = new QNMinimizer(lof); + } + weights = minim.minimize(lof, tol, new double[data.numFeatureTypes()]); + } + + featureIndex = data.featureIndex; + classes[0] = data.labelIndex.get(0); + classes[1] = data.labelIndex.get(1); + return new LogisticClassifier(weights,featureIndex,classes); + } + + @Deprecated //this method no longer required by the 
ClassifierFactory Interface. + public LogisticClassifier trainClassifier(List> examples) { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticObjectiveFunction.java new file mode 100644 index 0000000..2df1e2f --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/LogisticObjectiveFunction.java @@ -0,0 +1,150 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; + +import java.util.Arrays; + + +/** + * Maximizes the conditional likelihood with a given prior. + * Because the problem is binary, optimizations are possible that + * cannot be done in LogConditionalObjectiveFunction. + * + * @author Galen Andrew + */ + +public class LogisticObjectiveFunction extends AbstractCachingDiffFunction { + + private final int numFeatures; + private final int[][] data; + private final double[][] dataValues; + private final int[] labels; + protected float[] dataweights = null; + private final LogPrior prior; + + + @Override + public int domainDimension() { + return numFeatures; + } + + @Override + protected void calculate(double[] x) { + + if (dataValues != null) { + calculateRVF(x); + return; + } + + value = 0.0; + Arrays.fill(derivative, 0.0); + + for (int d = 0; d < data.length; d++) { + int[] features = data[d]; + double sum = 0; + + for (int f = 0; f < features.length; f++) { + sum += x[features[f]]; + } + + double expSum, derivativeIncrement; + + if (labels[d] == 0) { + expSum = Math.exp(sum); + derivativeIncrement = 1.0 / (1.0 + (1.0 / expSum)); + } else { + expSum = Math.exp(-sum); + derivativeIncrement = -1.0 / (1.0 + (1.0 / expSum)); + } + + if (dataweights == null) { + value += Math.log(1.0 + expSum); + } else { + value += Math.log(1.0 + expSum) * 
dataweights[d]; + derivativeIncrement *= dataweights[d]; + } + + for (int f = 0; f < features.length; f++) { + derivative[features[f]] += derivativeIncrement; + } + } + + value += prior.compute(x, derivative); + } + + protected void calculateRVF(double[] x) { + + value = 0.0; + Arrays.fill(derivative, 0.0); + + for (int d = 0; d < data.length; d++) { + int[] features = data[d]; + double[] values = dataValues[d]; + double sum = 0; + + for (int f = 0; f < features.length; f++) { + sum += x[features[f]]*values[f]; + } + + double expSum, derivativeIncrement; + + if (labels[d] == 0) { + expSum = Math.exp(sum); + derivativeIncrement = 1.0 / (1.0 + (1.0 / expSum)); + } else { + expSum = Math.exp(-sum); + derivativeIncrement = -1.0 / (1.0 + (1.0 / expSum)); + } + + if (dataweights == null) { + value += Math.log(1.0 + expSum); + } else { + value += Math.log(1.0 + expSum) * dataweights[d]; + derivativeIncrement *= dataweights[d]; + } + + for (int f = 0; f < features.length; f++) { + derivative[features[f]] += values[f]*derivativeIncrement; + } + } + + value += prior.compute(x, derivative); + } + + + public LogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels) { + this(numFeatures, data, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public LogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels, LogPrior prior) { + this(numFeatures, data, labels, prior, null); + } + + public LogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels, float[] dataweights) { + this(numFeatures, data, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC), dataweights); + } + public LogisticObjectiveFunction(int numFeatures, int[][] data, int[] labels, LogPrior prior, float[] dataweights) { + this(numFeatures, data, null, labels, prior, dataweights); + } + + public LogisticObjectiveFunction(int numFeatures, int[][] data, double[][] values, int[] labels) { + this(numFeatures, data, values, labels, new 
LogPrior(LogPrior.LogPriorType.QUADRATIC)); + } + + public LogisticObjectiveFunction(int numFeatures, int[][] data, double[][] values, int[] labels, LogPrior prior) { + this(numFeatures, data, values, labels, prior, null); + } + + public LogisticObjectiveFunction(int numFeatures, int[][] data, double[][] values, int[] labels, float[] dataweights) { + this(numFeatures, data, values, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC), dataweights); + } + + public LogisticObjectiveFunction(int numFeatures, int[][] data, double[][] values, int[] labels, LogPrior prior, float[] dataweights) { + this.numFeatures = numFeatures; + this.data = data; + this.labels = labels; + this.prior = prior; + this.dataweights = dataweights; + this.dataValues = values; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/NBLinearClassifierFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/NBLinearClassifierFactory.java new file mode 100644 index 0000000..f1d5c68 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/NBLinearClassifierFactory.java @@ -0,0 +1,240 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.ling.BasicDatum; +import edu.stanford.nlp.optimization.GoldenSectionLineSearch; +import edu.stanford.nlp.util.Function; + +/** + * Provides a medium-weight implementation of Bernoulli (or binary) + * Naive Bayes via a linear classifier. It's medium weight in that + * it uses dense arrays for counts and calculation (but, hey, NB is + * efficient to estimate). Each feature is treated as an independent + * binary variable. + *

    + * CDM Jun 2003: I added a dirty trick so that if there is a feature + * that is always on in input examples, then its weight is turned into + * a prior feature! (This will work well iff it is also always on at + * test time.) In fact, this is done for each such feature, so by + * having several such features, one can even get an integral prior + * boost out of this. + * + * @author Dan Klein + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * + * @param The type of the labels in the Classifier + * @param The type of the features in the Classifier + */ +public class NBLinearClassifierFactory extends AbstractLinearClassifierFactory { + + private static final boolean VERBOSE = false; + + private double sigma; // amount of add-k smoothing of evidence + private final boolean interpretAlwaysOnFeatureAsPrior; + private static final double epsilon = 1e-30; // fudge to keep nonzero + private boolean tuneSigma = false; + private int folds; + + + @Override + protected double[][] trainWeights(GeneralDataset data) { + return trainWeights(data.getDataArray(), data.getLabelsArray()); + } + + /** + * Train weights. + * If tuneSigma is true, the optimal sigma value is found using cross-validation: + * the number of folds is determined by the folds variable, + * if there are less training examples than folds, + * leave-one-out is used. 
+ */ + double[][] trainWeights(int[][] data, int[] labels) { + if (tuneSigma) { + tuneSigma(data, labels); + } + if (VERBOSE) { + System.err.println("NB CF: " + data.length + " data items "); + for (int i = 0; i < data.length; i++) { + System.err.print("Datum " + i + ": " + labels[i] + ":"); + for (int j = 0; j < data[i].length; j++) { + System.err.print(" " + data[i][j]); + } + System.err.println(); + } + } + int numFeatures = numFeatures(); + int numClasses = numClasses(); + double[][] weights = new double[numFeatures][numClasses]; + // find P(C|F)/P(C) + int num = 0; + double[] numc = new double[numClasses]; + double n = 0; // num active features in whole dataset + double[] n_c = new double[numClasses]; // num active features in class c items + double[] n_f = new double[numFeatures]; // num data items for which feature is active + double[][] n_fc = new double[numFeatures][numClasses]; // num times feature active in class c + for (int d = 0; d < data.length; d++) { + num++; + numc[labels[d]]++; + for (int i = 0; i < data[d].length; i++) { + n++; + n_c[labels[d]]++; + n_f[data[d][i]]++; + n_fc[data[d][i]][labels[d]]++; + } + } + for (int c = 0; c < numClasses; c++) { + for (int f = 0; f < numFeatures; f++) { + if (interpretAlwaysOnFeatureAsPrior && n_f[f] == data.length) { + // interpret always on feature as prior! 
+ weights[f][c] = Math.log(numc[c] / num); + } else { + // p_c_f = (N(f,c)+k)/(N(f)+|C|k) = Paddk(c|f) + // set lambda = log (P()/P()) + double p_c = (n_c[c] + epsilon) / (n + numClasses * epsilon); + double p_c_f = (n_fc[f][c] + sigma) / (n_f[f] + sigma * numClasses); + if (VERBOSE) { + System.err.println("Prob ratio(f=" + f + ",c=" + c + ") = " + p_c_f / p_c + " (nc=" + n_c[c] + ", nf=" + n_f[f] + ", nfc=" + n_fc[f][c] + ")"); + } + weights[f][c] = Math.log(p_c_f / p_c); + } + } + } + return weights; + } + + double[][] weights(int[][] data, int[] labels, int testMin, int testMax, double trialSigma, int foldSize) { + int numFeatures = numFeatures(); + int numClasses = numClasses(); + double[][] weights = new double[numFeatures][numClasses]; + // find P(C|F)/P(C) + int num = 0; + double[] numc = new double[numClasses]; + double n = 0; // num active features in whole dataset + double[] n_c = new double[numClasses]; // num active features in class c items + double[] n_f = new double[numFeatures]; // num data items for which feature is active + double[][] n_fc = new double[numFeatures][numClasses]; // num times feature active in class c + for (int d = 0; d < data.length; d++) { + if (d == testMin) { + d = testMax - 1; + continue; + } + num++; + numc[labels[d]]++; + for (int i = 0; i < data[d].length; i++) { + if (i == testMin) { + i = testMax - 1; + continue; + } + n++; + n_c[labels[d]]++; + n_f[data[d][i]]++; + n_fc[data[d][i]][labels[d]]++; + } + } + for (int c = 0; c < numClasses; c++) { + for (int f = 0; f < numFeatures; f++) { + if (interpretAlwaysOnFeatureAsPrior && n_f[f] == data.length - foldSize) { + // interpret always on feature as prior! 
+ weights[f][c] = Math.log(numc[c] / num); + } else { + // p_c_f = (N(f,c)+k)/(N(f)+|C|k) = Paddk(c|f) + // set lambda = log (P()/P()) + double p_c = (n_c[c] + epsilon) / (n + numClasses * epsilon); + double p_c_f = (n_fc[f][c] + trialSigma) / (n_f[f] + trialSigma * numClasses); + weights[f][c] = Math.log(p_c_f / p_c); + } + } + } + return weights; + } + + + private void tuneSigma(final int[][] data, final int[] labels) { + + Function CVSigmaToPerplexity = new Function() { + @Override + public Double apply(Double trialSigma) { + double score = 0.0; + double sumScore = 0.0; + int foldSize, nbCV; + System.err.println("Trying sigma = " + trialSigma); + //test if enough training data + if (data.length >= folds) { + foldSize = data.length / folds; + nbCV = folds; + } else { //leave-one-out + foldSize = 1; + nbCV = data.length; + } + + for (int j = 0; j < nbCV; j++) { + //System.out.println("CV j: "+ j); + int testMin = j * foldSize; + int testMax = testMin + foldSize; + + LinearClassifier c = new LinearClassifier(weights(data, labels, testMin, testMax, trialSigma, foldSize), featureIndex, labelIndex); + for (int i = testMin; i < testMax; i++) { + //System.out.println("test i: "+ i + " "+ new BasicDatum(featureIndex.objects(data[i]))); + score -= c.logProbabilityOf(new BasicDatum(featureIndex.objects(data[i]))).getCount(labelIndex.get(labels[i])); + } + //System.err.printf("%d: %8g\n", j, score); + sumScore += score; + } + System.err.printf(": %8g\n", sumScore); + return sumScore; + } + }; + + GoldenSectionLineSearch gsls = new GoldenSectionLineSearch(true); + sigma = gsls.minimize(CVSigmaToPerplexity, 0.01, 0.0001, 2.0); + System.out.println("Sigma used: " + sigma); + } + + /** + * Create a ClassifierFactory. + */ + public NBLinearClassifierFactory() { + this(1.0); + } + + /** + * Create a ClassifierFactory. 
+ * + * @param sigma The amount of add-sigma smoothing of evidence + */ + public NBLinearClassifierFactory(double sigma) { + this(sigma, false); + } + + /** + * Create a ClassifierFactory. + * + * @param sigma The amount of add-sigma smoothing of evidence + * @param interpretAlwaysOnFeatureAsPrior If true, a feature that is in every + * data item is interpreted as an indication to include a prior + * factor over classes. (If there are multiple such features, an + * integral "prior boost" will occur.) If false, an always on + * feature is interpreted as an evidence feature (and, following + * the standard math) will have no effect on the model. + + */ + public NBLinearClassifierFactory(double sigma, boolean interpretAlwaysOnFeatureAsPrior) { + this.sigma = sigma; + this.interpretAlwaysOnFeatureAsPrior = interpretAlwaysOnFeatureAsPrior; + } + + /** + * setTuneSigmaCV sets the tuneSigma flag: when turned on, + * the sigma is tuned by cross-validation. + * If there is less data than the number of folds, leave-one-out is used. + * The default for tuneSigma is false. 
+ * + * @param folds Number of folds for cross validation + */ + public void setTuneSigmaCV(int folds) { + tuneSigma = true; + this.folds = folds; + } + + private static final long serialVersionUID = 1; + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/PRCurve.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/PRCurve.java new file mode 100644 index 0000000..16c1295 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/PRCurve.java @@ -0,0 +1,367 @@ +package edu.stanford.nlp.classify; + +import java.util.ArrayList; +import java.util.List; +import java.io.File; + +import edu.stanford.nlp.util.BinaryHeapPriorityQueue; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.PriorityQueue; +import edu.stanford.nlp.util.StringUtils; +import edu.stanford.nlp.util.Triple; + +/** + * @author Kristina Toutanova + * May 23, 2005 + * A class to create recall-precision curves given scores + * used to fit the best monotonic function for logistic regression and svms + */ +public class PRCurve { + double[] scores; //sorted scores + int[] classes; // the class of example i + int[] guesses; // the guess of example i according to the argmax + int[] numpositive; // number positive in the i-th highest scores + int[] numnegative; // number negative in the i-th lowest scores + + /** + * reads scores with classes from a file, sorts by score and creates the arrays + * + */ + public PRCurve(String filename) { + try { + ArrayList> dataScores = new ArrayList>(); + for(String line : ObjectBank.getLineIterator(new File(filename))) { + List elems = StringUtils.split(line); + Pair p = new Pair(new Double(elems.get(0).toString()), Integer.valueOf(elems.get(1).toString())); + dataScores.add(p); + } + init(dataScores); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + + /** + * reads scores with classes 
from a file, sorts by score and creates the arrays + * + */ + public PRCurve(String filename, boolean svm) { + try { + + ArrayList> dataScores = new ArrayList>(); + for(String line : ObjectBank.getLineIterator(new File(filename))) { + List elems = StringUtils.split(line); + int cls = (new Double(elems.get(0).toString())).intValue(); + if (cls == -1) { + cls = 0; + } + double score = Double.parseDouble(elems.get(1).toString()) + 0.5; + Pair p = new Pair(new Double(score), Integer.valueOf(cls)); + dataScores.add(p); + } + init(dataScores); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + public double optimalAccuracy() { + return precision(numSamples()) / (double) numSamples(); + } + + public double accuracy() { + return logPrecision(numSamples()) / (double) numSamples(); + } + + + public PRCurve(List> dataScores) { + init(dataScores); + } + + public void init(List> dataScores) { + PriorityQueue>> q = new BinaryHeapPriorityQueue>>(); + for (int i = 0; i < dataScores.size(); i++) { + q.add(new Pair>(Integer.valueOf(i), dataScores.get(i)), -dataScores.get(i).first().doubleValue()); + } + List>> sorted = q.toSortedList(); + scores = new double[sorted.size()]; + classes = new int[sorted.size()]; + System.err.println("incoming size " + dataScores.size() + " resulting " + sorted.size()); + + for (int i = 0; i < sorted.size(); i++) { + Pair next = sorted.get(i).second(); + scores[i] = next.first().doubleValue(); + classes[i] = next.second().intValue(); + } + init(); + } + + + public void initMC(ArrayList> dataScores) { + PriorityQueue>> q = new BinaryHeapPriorityQueue>>(); + for (int i = 0; i < dataScores.size(); i++) { + q.add(new Pair>(Integer.valueOf(i), dataScores.get(i)), -dataScores.get(i).first().doubleValue()); + } + List>> sorted = q.toSortedList(); + scores = new double[sorted.size()]; + classes = new int[sorted.size()]; + guesses = new int[sorted.size()]; + System.err.println("incoming size " + dataScores.size() + " resulting " + sorted.size()); + + 
for (int i = 0; i < sorted.size(); i++) { + Triple next = sorted.get(i).second(); + scores[i] = next.first().doubleValue(); + classes[i] = next.second().intValue(); + guesses[i] = next.third().intValue(); + } + init(); + } + + + /** + * initialize the numpositive and the numnegative arrays + */ + void init() { + numnegative = new int[numSamples() + 1]; + numpositive = new int[numSamples() + 1]; + numnegative[0] = 0; + numpositive[0] = 0; + int num = numSamples(); + for (int i = 1; i <= num; i++) { + numnegative[i] = numnegative[i - 1] + (classes[i - 1] == 0 ? 1 : 0); + } + for (int i = 1; i <= num; i++) { + numpositive[i] = numpositive[i - 1] + (classes[num - i] == 0 ? 0 : 1); + } + System.err.println("total positive " + numpositive[num] + " total negative " + numnegative[num] + " total " + num); + for (int i = 1; i < numpositive.length; i++) { + //System.out.println(i + " positive " + numpositive[i] + " negative " + numnegative[i] + " classes " + classes[i - 1] + " " + classes[num - i]); + } + } + + int numSamples() { + return scores.length; + } + + /** + * what is the best precision at the given recall + * + */ + public int precision(int recall) { + int optimum = 0; + for (int right = 0; right <= recall; right++) { + int candidate = numpositive[right] + numnegative[recall - right]; + if (candidate > optimum) { + optimum = candidate; + } + } + return optimum; + } + + public static double f1(int tp, int fp, int fn) { + double prec = 1; + double recall = 1; + if (tp + fp > 0) { + prec = tp / (double) (tp + fp); + } + if (tp + fn > 0) { + recall = tp / (double) (tp + fn); + } + return 2 * prec * recall / (prec + recall); + } + + /** + * the f-measure if we just guess as negativ the first numleft and guess as poitive the last numright + * + */ + public double fmeasure(int numleft, int numright) { + int tp = 0, fp = 0, fn = 0; + tp = numpositive[numright]; + fp = numright - tp; + fn = numleft - numnegative[numleft]; + return f1(tp, fp, fn); + } + + + /** + * what is 
the precision at this recall if we look at the score as the probability of class 1 given x + * as if coming from logistic regression + * + */ + public int logPrecision(int recall) { + int totaltaken = 0; + int rightIndex = numSamples() - 1; //next right candidate + int leftIndex = 0; //next left candidate + int totalcorrect = 0; + + while (totaltaken < recall) { + double confr = Math.abs(scores[rightIndex] - .5); + double confl = Math.abs(scores[leftIndex] - .5); + int chosen = leftIndex; + if (confr > confl) { + chosen = rightIndex; + rightIndex--; + } else { + leftIndex++; + } + //System.err.println("chose "+chosen+" score "+scores[chosen]+" class "+classes[chosen]+" correct "+correct(scores[chosen],classes[chosen])); + if ((scores[chosen] >= .5) && (classes[chosen] == 1)) { + totalcorrect++; + } + if ((scores[chosen] < .5) && (classes[chosen] == 0)) { + totalcorrect++; + } + totaltaken++; + } + + return totalcorrect; + } + + /** + * what is the optimal f-measure we can achieve given recall guesses + * using the optimal monotonic function + * + */ + public double optFmeasure(int recall) { + double max = 0; + for (int i = 0; i < (recall + 1); i++) { + double f = fmeasure(i, recall - i); + if (f > max) { + max = f; + } + } + return max; + } + + public double opFmeasure() { + return optFmeasure(numSamples()); + } + + /** + * what is the f-measure at this recall if we look at the score as the probability of class 1 given x + * as if coming from logistic regression same as logPrecision but calculating f-measure + * + * @param recall make this many guesses for which we are most confident + */ + public double fmeasure(int recall) { + int totaltaken = 0; + int rightIndex = numSamples() - 1; //next right candidate + int leftIndex = 0; //next left candidate + int tp = 0, fp = 0, fn = 0; + while (totaltaken < recall) { + double confr = Math.abs(scores[rightIndex] - .5); + double confl = Math.abs(scores[leftIndex] - .5); + int chosen = leftIndex; + if (confr > confl) { + 
chosen = rightIndex; + rightIndex--; + } else { + leftIndex++; + } + //System.err.println("chose "+chosen+" score "+scores[chosen]+" class "+classes[chosen]+" correct "+correct(scores[chosen],classes[chosen])); + if ((scores[chosen] >= .5)) { + if (classes[chosen] == 1) { + tp++; + } else { + fp++; + } + } + if ((scores[chosen] < .5)) { + if (classes[chosen] == 1) { + fn++; + } + } + totaltaken++; + } + + return f1(tp, fp, fn); + + } + + + /** + * assuming the scores are probability of 1 given x + * + */ + public double logLikelihood() { + double loglik = 0; + for (int i = 0; i < scores.length; i++) { + loglik += Math.log(classes[i] == 0 ? 1 - scores[i] : scores[i]); + } + return loglik; + } + + /** + * confidence weighted accuracy assuming the scores are probabilities and using .5 as treshold + * + */ + public double cwa() { + double acc = 0; + for (int recall = 1; recall <= numSamples(); recall++) { + acc += logPrecision(recall) / (double) recall; + } + return acc / numSamples(); + } + + /** + * confidence weighted accuracy assuming the scores are probabilities and using .5 as treshold + * + */ + public int[] cwaArray() { + int[] arr = new int[numSamples()]; + for (int recall = 1; recall <= numSamples(); recall++) { + arr[recall - 1] = logPrecision(recall); + } + return arr; + } + + /** + * confidence weighted accuracy assuming the scores are probabilities and using .5 as treshold + * + */ + public int[] optimalCwaArray() { + int[] arr = new int[numSamples()]; + for (int recall = 1; recall <= numSamples(); recall++) { + arr[recall - 1] = precision(recall); + } + return arr; + } + + /** + * optimal confidence weighted accuracy assuming for each recall we can fit an optimal monotonic function + * + */ + public double optimalCwa() { + double acc = 0; + for (int recall = 1; recall <= numSamples(); recall++) { + acc += precision(recall) / (double) recall; + } + return acc / numSamples(); + } + + + public static boolean correct(double score, int cls) { + return ((score 
>= .5) && (cls == 1)) || ((score < .5) && (cls == 0)); + } + + public static void main(String[] args) { + + PriorityQueue q = new BinaryHeapPriorityQueue(); + q.add("bla", 2); + q.add("bla3", 2); + System.err.println("size of q " + q.size()); + + PRCurve pr = new PRCurve("c:/data0204/precsvm", true); + System.err.println("acc " + pr.accuracy() + " opt " + pr.optimalAccuracy() + " cwa " + pr.cwa() + " optcwa " + pr.optimalCwa()); + for (int r = 1; r <= pr.numSamples(); r++) { + System.err.println("optimal precision at recall " + r + " " + pr.precision(r)); + System.err.println("model precision at recall " + r + " " + pr.logPrecision(r)); + } + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ProbabilisticClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ProbabilisticClassifier.java new file mode 100644 index 0000000..7cae490 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ProbabilisticClassifier.java @@ -0,0 +1,10 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.stats.Counter; + +public interface ProbabilisticClassifier extends Classifier +{ + public Counter probabilityOf(Datum example); + public Counter logProbabilityOf(Datum example); +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ProbabilisticClassifierCreator.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ProbabilisticClassifierCreator.java new file mode 100644 index 0000000..ee06806 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/ProbabilisticClassifierCreator.java @@ -0,0 +1,10 @@ +package edu.stanford.nlp.classify; + +/** + * Creates a probablic classifier with given weights + * + * @author Angel Chang + */ +public interface ProbabilisticClassifierCreator { + public 
ProbabilisticClassifier createProbabilisticClassifier(double[] weights); +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/RVFClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/RVFClassifier.java new file mode 100644 index 0000000..1a7d438 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/RVFClassifier.java @@ -0,0 +1,20 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.stats.Counter; + +import java.io.Serializable; + +/** + * A simple interface for classifying and scoring data points with + * real-valued features. Implemented by the linear classifier. + * + * @author Jenny Finkel + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + */ + +public interface RVFClassifier extends Serializable { + public L classOf(RVFDatum example); + + public Counter scoresOf(RVFDatum example); +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/RVFDataset.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/RVFDataset.java new file mode 100644 index 0000000..69fc9ef --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/RVFDataset.java @@ -0,0 +1,981 @@ +package edu.stanford.nlp.classify; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Random; +import java.util.Set; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.io.RuntimeIOException; 
+import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.stats.Counters; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.HashIndex; + +/** + * An interfacing class for {@link ClassifierFactory} that incrementally builds + * a more memory-efficient representation of a {@link List} of {@link RVFDatum} + * objects for the purposes of training a {@link Classifier} with a + * {@link ClassifierFactory}. + * + * @author Jenny Finkel (jrfinkel@stanford.edu) + * @author Rajat Raina (added methods to record data sources and ids) + * @author Anna Rafferty (various refactoring with GeneralDataset/Dataset) + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * + * @param + * The type of the labels in the Dataset + * @param + * The type of the features in the Dataset + */ +public class RVFDataset extends GeneralDataset { // implements Iterable>, Serializable + + private static final long serialVersionUID = -3841757837680266182L; + + private double[][] values; // [datumIndex][i] values of features listed in int[][] data + private double[] minValues; // = null; //stores the minValues of all features + // for normalization. + private double[] maxValues; // = null; //stores the maxValues of all features + // for normalization. + double[] means; + double[] stdevs; // means and stdevs of features, used for + + /* + * Store source and id of each datum; optional, and not fully supported. 
+ */ + private ArrayList> sourcesAndIds; + + public RVFDataset() { + this(10); + } + + public RVFDataset(int numDatums, Index featureIndex, Index labelIndex) { + this(numDatums); + this.labelIndex = labelIndex; + this.featureIndex = featureIndex; + } + + public RVFDataset(Index featureIndex, Index labelIndex) { + this(10); + this.labelIndex = labelIndex; + this.featureIndex = featureIndex; + } + + public RVFDataset(int numDatums) { + initialize(numDatums); + } + + /** + * Constructor that fully specifies a Dataset. Needed this for + * MulticlassDataset. + */ + public RVFDataset(Index labelIndex, int[] labels, Index featureIndex, int[][] data, double[][] values) { + this.labelIndex = labelIndex; + this.labels = labels; + this.featureIndex = featureIndex; + this.data = data; + this.values = values; + this.size = labels.length; + } + + @Override + public Pair, GeneralDataset> split(double percentDev) { + int devSize = (int) (percentDev * size()); + int trainSize = size() - devSize; + + int[][] devData = new int[devSize][]; + double[][] devValues = new double[devSize][]; + int[] devLabels = new int[devSize]; + + int[][] trainData = new int[trainSize][]; + double[][] trainValues = new double[trainSize][]; + int[] trainLabels = new int[trainSize]; + + System.arraycopy(data, 0, devData, 0, devSize); + System.arraycopy(values, 0, devValues, 0, devSize); + System.arraycopy(labels, 0, devLabels, 0, devSize); + + System.arraycopy(data, devSize, trainData, 0, trainSize); + System.arraycopy(values, devSize, trainValues, 0, trainSize); + System.arraycopy(labels, devSize, trainLabels, 0, trainSize); + + RVFDataset dev = new RVFDataset(labelIndex, devLabels, featureIndex, devData, devValues); + RVFDataset train = new RVFDataset(labelIndex, trainLabels, featureIndex, trainData, trainValues); + + return new Pair, GeneralDataset>(train, dev); + + } + + public void scaleFeaturesGaussian() { + means = new double[this.numFeatures()]; + Arrays.fill(means, 0); + + for (int i = 0; i < 
this.size(); i++) { + for (int j = 0; j < data[i].length; j++) + means[data[i][j]] += values[i][j]; + } + ArrayMath.multiplyInPlace(means, 1.0 / this.size()); + + stdevs = new double[this.numFeatures()]; + Arrays.fill(stdevs, 0); + double[] deltaX = new double[this.numFeatures()]; + + for (int i = 0; i < this.size(); i++) { + for (int f = 0; f < this.numFeatures(); f++) + deltaX[f] = -means[f]; + for (int j = 0; j < data[i].length; j++) + deltaX[data[i][j]] += values[i][j]; + for (int f = 0; f < this.numFeatures(); f++) { + stdevs[f] += deltaX[f] * deltaX[f]; + } + } + for (int f = 0; f < this.numFeatures(); f++) { + stdevs[f] /= (this.size() - 1); + stdevs[f] = Math.sqrt(stdevs[f]); + } + for (int i = 0; i < this.size(); i++) { + for (int j = 0; j < data[i].length; j++) { + int fID = data[i][j]; + if (stdevs[fID] != 0) + values[i][j] = (values[i][j] - means[fID]) / stdevs[fID]; + } + } + + } + + /** + * Scales feature values linearly such that each feature value lies between 0 + * and 1. + * + */ + public void scaleFeatures() { + // TODO: should also implement a method that scales the features using the + // mean and std. + minValues = new double[featureIndex.size()]; + maxValues = new double[featureIndex.size()]; + Arrays.fill(minValues, Double.POSITIVE_INFINITY); + Arrays.fill(maxValues, Double.NEGATIVE_INFINITY); + + // first identify the max and min values for each feature. + // System.out.printf("number of datums: %d dataset size: %d\n",data.length,size()); + for (int i = 0; i < size(); i++) { + // System.out.printf("datum %d length %d\n", i,data[i].length); + for (int j = 0; j < data[i].length; j++) { + int f = data[i][j]; + if (values[i][j] < minValues[f]) + minValues[f] = values[i][j]; + if (values[i][j] > maxValues[f]) + maxValues[f] = values[i][j]; + } + } + + for (int f = 0; f < featureIndex.size(); f++) { + if (minValues[f] == Double.POSITIVE_INFINITY) + throw new RuntimeException("minValue for feature " + f + " not assigned. 
"); + if (maxValues[f] == Double.NEGATIVE_INFINITY) + throw new RuntimeException("maxValue for feature " + f + " not assigned."); + } + + // now scale each value such that it's between 0 and 1. + for (int i = 0; i < size(); i++) { + for (int j = 0; j < data[i].length; j++) { + int f = data[i][j]; + if (minValues[f] != maxValues[f])// the equality can happen for binary + // features which always take the value + // of 1.0 + values[i][j] = (values[i][j] - minValues[f]) / (maxValues[f] - minValues[f]); + } + } + + /* + for(int f = 0; f < featureIndex.size(); f++){ + if(minValues[f] == maxValues[f]) + throw new RuntimeException("minValue for feature "+f+" is equal to maxValue:"+minValues[f]); + } + */ + } + + /** + * Checks if the dataset has any unbounded values. Always good to use this + * before training a model on the dataset. This way, one can avoid seeing the + * infamous 4's that get printed by the QuasiNewton Method when NaNs exist in + * the data! -Ramesh + */ + public void ensureRealValues() { + double[][] values = getValuesArray(); + int[][] data = getDataArray(); + for (int i = 0; i < size(); i++) { + for (int j = 0; j < values[i].length; j++) { + if (Double.isNaN(values[i][j])) { + int fID = data[i][j]; + F feature = featureIndex.get(fID); + throw new RuntimeException("datum " + i + " has a NaN value for feature:" + feature); + } + if (Double.isInfinite(values[i][j])) { + int fID = data[i][j]; + F feature = featureIndex.get(fID); + throw new RuntimeException("datum " + i + " has infinite value for feature:" + feature); + } + } + } + } + + /** + * Scales the values of each feature in each linearly using the min and max + * values found in the training set. NOTE1: Not guaranteed to be between 0 and + * 1 for a test datum. NOTE2: Also filters out features from each datum that + * are not seen at training time. 
+ * + * @param dataset + * @return a new dataset + */ + public RVFDataset scaleDataset(RVFDataset dataset) { + RVFDataset newDataset = new RVFDataset(this.featureIndex, this.labelIndex); + for (int i = 0; i < dataset.size(); i++) { + RVFDatum datum = dataset.getDatum(i); + newDataset.add(scaleDatum(datum)); + } + return newDataset; + } + + /** + * Scales the values of each feature linearly using the min and max values + * found in the training set. NOTE1: Not guaranteed to be between 0 and 1 for + * a test datum. NOTE2: Also filters out features from the datum that are not + * seen at training time. + * + * @param datum + * @return a new datum + */ + public RVFDatum scaleDatum(RVFDatum datum) { + // scale this dataset before scaling the datum + if (minValues == null || maxValues == null) + scaleFeatures(); + Counter scaledFeatures = new ClassicCounter(); + for (F feature : datum.asFeatures()) { + int fID = this.featureIndex.indexOf(feature); + if (fID >= 0) { + double oldVal = datum.asFeaturesCounter().getCount(feature); + double newVal; + if (minValues[fID] != maxValues[fID]) + newVal = (oldVal - minValues[fID]) / (maxValues[fID] - minValues[fID]); + else + newVal = oldVal; + scaledFeatures.incrementCount(feature, newVal); + } + } + return new RVFDatum(scaledFeatures, datum.label()); + } + + public RVFDataset scaleDatasetGaussian(RVFDataset dataset) { + RVFDataset newDataset = new RVFDataset(this.featureIndex, this.labelIndex); + for (int i = 0; i < dataset.size(); i++) { + RVFDatum datum = dataset.getDatum(i); + newDataset.add(scaleDatumGaussian(datum)); + } + return newDataset; + } + + public RVFDatum scaleDatumGaussian(RVFDatum datum) { + // scale this dataset before scaling the datum + if (means == null || stdevs == null) + scaleFeaturesGaussian(); + Counter scaledFeatures = new ClassicCounter(); + for (F feature : datum.asFeatures()) { + int fID = this.featureIndex.indexOf(feature); + if (fID >= 0) { + double oldVal = 
datum.asFeaturesCounter().getCount(feature); + double newVal; + if (stdevs[fID] != 0) + newVal = (oldVal - means[fID]) / stdevs[fID]; + else + newVal = oldVal; + scaledFeatures.incrementCount(feature, newVal); + } + } + return new RVFDatum(scaledFeatures, datum.label()); + } + + @Override + public Pair, GeneralDataset> split(int start, int end) { + int devSize = end - start; + int trainSize = size() - devSize; + + int[][] devData = new int[devSize][]; + double[][] devValues = new double[devSize][]; + int[] devLabels = new int[devSize]; + + int[][] trainData = new int[trainSize][]; + double[][] trainValues = new double[trainSize][]; + int[] trainLabels = new int[trainSize]; + + System.arraycopy(data, start, devData, 0, devSize); + System.arraycopy(values, start, devValues, 0, devSize); + System.arraycopy(labels, start, devLabels, 0, devSize); + + System.arraycopy(data, 0, trainData, 0, start); + System.arraycopy(data, end, trainData, start, size() - end); + System.arraycopy(values, 0, trainValues, 0, start); + System.arraycopy(values, end, trainValues, start, size() - end); + System.arraycopy(labels, 0, trainLabels, 0, start); + System.arraycopy(labels, end, trainLabels, start, size() - end); + + GeneralDataset dev = new RVFDataset(labelIndex, devLabels, featureIndex, devData, devValues); + GeneralDataset train = new RVFDataset(labelIndex, trainLabels, featureIndex, trainData, trainValues); + + return new Pair, GeneralDataset>(train, dev); + + } + + // TODO: Check that this does what we want for Datum other than RVFDatum + @Override + public void add(Datum d) { + if (d instanceof RVFDatum) { + addLabel(d.label()); + addFeatures(((RVFDatum) d).asFeaturesCounter()); + size++; + } else { + addLabel(d.label()); + addFeatures(Counters.asCounter(d.asFeatures())); + size++; + } + } + + public void add(Datum d, String src, String id) { + if (d instanceof RVFDatum) { + addLabel(d.label()); + addFeatures(((RVFDatum) d).asFeaturesCounter()); + addSourceAndId(src, id); + 
size++; + } else { + addLabel(d.label()); + addFeatures(Counters.asCounter(d.asFeatures())); + addSourceAndId(src, id); + size++; + } + } + + // TODO shouldn't have both this and getRVFDatum + @Override + public RVFDatum getDatum(int index) { + return getRVFDatum(index); + } + + /** + * @return the index-ed datum + * + * Note, this returns a new RVFDatum object, not the original RVFDatum + * that was added to the dataset. + */ + @Override + public RVFDatum getRVFDatum(int index) { + ClassicCounter c = new ClassicCounter(); + for (int i = 0; i < data[index].length; i++) { + c.incrementCount(featureIndex.get(data[index][i]), values[index][i]); + } + return new RVFDatum(c, labelIndex.get(labels[index])); + } + + public String getRVFDatumSource(int index) { + return sourcesAndIds.get(index).first(); + } + + public String getRVFDatumId(int index) { + return sourcesAndIds.get(index).second(); + } + + private void addSourceAndId(String src, String id) { + sourcesAndIds.add(new Pair(src, id)); + } + + private void addLabel(L label) { + if (labels.length == size) { + int[] newLabels = new int[size * 2]; + System.arraycopy(labels, 0, newLabels, 0, size); + labels = newLabels; + } + labels[size] = labelIndex.indexOf(label, true); + } + + private void addFeatures(Counter features) { + if (data.length == size) { + int[][] newData = new int[size * 2][]; + double[][] newValues = new double[size * 2][]; + System.arraycopy(data, 0, newData, 0, size); + System.arraycopy(values, 0, newValues, 0, size); + data = newData; + values = newValues; + } + + final List featureNames = new ArrayList(features.keySet()); + final int nFeatures = featureNames.size(); + data[size] = new int[nFeatures]; + values[size] = new double[nFeatures]; + for (int i = 0; i < nFeatures; ++i) { + F feature = featureNames.get(i); + int fID = featureIndex.indexOf(feature, true); + if (fID >= 0) { + data[size][i] = fID; + values[size][i] = features.getCount(feature); + } else { + // Usually a feature present at test 
but not training time. + assert featureIndex.isLocked() : "Could not add feature to index: " + feature; + } + } + } + + /** + * Resets the Dataset so that it is empty and ready to collect data. + */ + @Override + public void clear() { + clear(10); + } + + /** + * Resets the Dataset so that it is empty and ready to collect data. + */ + @Override + public void clear(int numDatums) { + initialize(numDatums); + } + + @Override + protected void initialize(int numDatums) { + labelIndex = new HashIndex(); + featureIndex = new HashIndex(); + labels = new int[numDatums]; + data = new int[numDatums][]; + values = new double[numDatums][]; + sourcesAndIds = new ArrayList>(numDatums); + size = 0; + } + + /** + * Prints some summary statistics to stderr for the Dataset. + */ + @Override + public void summaryStatistics() { + System.err.println("numDatums: " + size); + System.err.print("numLabels: " + labelIndex.size() + " ["); + Iterator iter = labelIndex.iterator(); + while (iter.hasNext()) { + System.err.print(iter.next()); + if (iter.hasNext()) { + System.err.print(", "); + } + } + System.err.println("]"); + System.err.println("numFeatures (Phi(X) types): " + featureIndex.size()); + /*for(int i = 0; i < data.length; i++) { + for(int j = 0; j < data[i].length; j++) { + System.out.println(data[i][j]); + } + }*/ + } + + // private int[] trimToSize(int[] i, int size) { + // int[] newI = new int[size]; + // System.arraycopy(i, 0, newI, 0, size); + // return newI; + // } + // + // private int[][] trimToSize(int[][] i, int size) { + // int[][] newI = new int[size][]; + // System.arraycopy(i, 0, newI, 0, size); + // return newI; + // } + + private static double[][] trimToSize(double[][] i, int size) { + double[][] newI = new double[size][]; + System.arraycopy(i, 0, newI, 0, size); + return newI; + } + + /** + * prints the full feature matrix in tab-delimited form. These can be BIG + * matrices, so be careful! 
[Can also use printFullFeatureMatrixWithValues] + */ + public void printFullFeatureMatrix(PrintWriter pw) { + String sep = "\t"; + for (int i = 0; i < featureIndex.size(); i++) { + pw.print(sep + featureIndex.get(i)); + } + pw.println(); + for (int i = 0; i < labels.length; i++) { + pw.print(labelIndex.get(i)); + Set feats = Generics.newHashSet(); + for (int j = 0; j < data[i].length; j++) { + int feature = data[i][j]; + feats.add(Integer.valueOf(feature)); + } + for (int j = 0; j < featureIndex.size(); j++) { + if (feats.contains(Integer.valueOf(j))) { + pw.print(sep + "1"); + } else { + pw.print(sep + "0"); + } + } + pw.println(); + } + } + + /** + * Modification of printFullFeatureMatrix to correct bugs & print values + * (Rajat). Prints the full feature matrix in tab-delimited form. These can be + * BIG matrices, so be careful! + */ + public void printFullFeatureMatrixWithValues(PrintWriter pw) { + String sep = "\t"; + for (int i = 0; i < featureIndex.size(); i++) { + pw.print(sep + featureIndex.get(i)); + } + pw.println(); + for (int i = 0; i < size; i++) { // changed labels.length to size + pw.print(labelIndex.get(labels[i])); // changed i to labels[i] + Map feats = Generics.newHashMap(); + for (int j = 0; j < data[i].length; j++) { + int feature = data[i][j]; + double val = values[i][j]; + feats.put(Integer.valueOf(feature), new Double(val)); + } + for (int j = 0; j < featureIndex.size(); j++) { + if (feats.containsKey(Integer.valueOf(j))) { + pw.print(sep + feats.get(Integer.valueOf(j))); + } else { + pw.print(sep + " "); + } + } + pw.println(); + } + pw.flush(); + } + + /** + * Constructs a Dataset by reading in a file in SVM light format. + * + */ + public static RVFDataset readSVMLightFormat(String filename) { + return readSVMLightFormat(filename, new HashIndex(), new HashIndex()); + } + + /** + * Constructs a Dataset by reading in a file in SVM light format. 
The lines + * parameter is filled with the lines of the file for further processing (if + * lines is null, it is assumed no line information is desired) + */ + public static RVFDataset readSVMLightFormat(String filename, List lines) { + return readSVMLightFormat(filename, new HashIndex(), new HashIndex(), lines); + } + + /** + * Constructs a Dataset by reading in a file in SVM light format. the created + * dataset has the same feature and label index as given + */ + public static RVFDataset readSVMLightFormat(String filename, Index featureIndex, Index labelIndex) { + return readSVMLightFormat(filename, featureIndex, labelIndex, null); + } + + /** + * Removes all features from the dataset that are not in featureSet. + * + * @param featureSet + */ + public void selectFeaturesFromSet(Set featureSet) { + HashIndex newFeatureIndex = new HashIndex(); + int[] featMap = new int[featureIndex.size()]; + Arrays.fill(featMap, -1); + for (F feature : featureSet) { + int oldID = featureIndex.indexOf(feature); + if (oldID >= 0) { // it's a valid feature in the index + int newID = newFeatureIndex.indexOf(feature, true); + featMap[oldID] = newID; + } + } + featureIndex = newFeatureIndex; + for (int i = 0; i < size; i++) { + List featList = new ArrayList(data[i].length); + List valueList = new ArrayList(values[i].length); + for (int j = 0; j < data[i].length; j++) { + if (featMap[data[i][j]] >= 0) { + featList.add(featMap[data[i][j]]); + valueList.add(values[i][j]); + } + } + data[i] = new int[featList.size()]; + values[i] = new double[valueList.size()]; + for (int j = 0; j < data[i].length; j++) { + data[i][j] = featList.get(j); + values[i][j] = valueList.get(j); + } + } + } + + /** + * Applies a feature count threshold to the RVFDataset. All features that + * occur fewer than k times are expunged. 
+ */ + public void applyFeatureCountThreshold(int k) { + float[] counts = getFeatureCounts(); + HashIndex newFeatureIndex = new HashIndex(); + + int[] featMap = new int[featureIndex.size()]; + for (int i = 0; i < featMap.length; i++) { + F feat = featureIndex.get(i); + if (counts[i] >= k) { + int newIndex = newFeatureIndex.size(); + newFeatureIndex.add(feat); + featMap[i] = newIndex; + } else { + featMap[i] = -1; + } + // featureIndex.remove(feat); + } + + featureIndex = newFeatureIndex; + // counts = null; // This is unnecessary; JVM can clean it up + + for (int i = 0; i < size; i++) { + List featList = new ArrayList(data[i].length); + List valueList = new ArrayList(values[i].length); + for (int j = 0; j < data[i].length; j++) { + if (featMap[data[i][j]] >= 0) { + featList.add(featMap[data[i][j]]); + valueList.add(values[i][j]); + } + } + data[i] = new int[featList.size()]; + values[i] = new double[valueList.size()]; + for (int j = 0; j < data[i].length; j++) { + data[i][j] = featList.get(j); + values[i][j] = valueList.get(j); + } + } + } + + /** + * Applies a feature max count threshold to the RVFDataset. All features that + * occur greater than k times are expunged. 
+ */ + public void applyFeatureMaxCountThreshold(int k) { + float[] counts = getFeatureCounts(); + HashIndex newFeatureIndex = new HashIndex(); + + int[] featMap = new int[featureIndex.size()]; + for (int i = 0; i < featMap.length; i++) { + F feat = featureIndex.get(i); + if (counts[i] <= k) { + int newIndex = newFeatureIndex.size(); + newFeatureIndex.add(feat); + featMap[i] = newIndex; + } else { + featMap[i] = -1; + } + // featureIndex.remove(feat); + } + + featureIndex = newFeatureIndex; + // counts = null; // This is unnecessary; JVM can clean it up + + for (int i = 0; i < size; i++) { + List featList = new ArrayList(data[i].length); + List valueList = new ArrayList(values[i].length); + for (int j = 0; j < data[i].length; j++) { + if (featMap[data[i][j]] >= 0) { + featList.add(featMap[data[i][j]]); + valueList.add(values[i][j]); + } + } + data[i] = new int[featList.size()]; + values[i] = new double[valueList.size()]; + for (int j = 0; j < data[i].length; j++) { + data[i][j] = featList.get(j); + values[i][j] = valueList.get(j); + } + } + } + + private static RVFDataset readSVMLightFormat(String filename, Index featureIndex, Index labelIndex, List lines) { + BufferedReader in = null; + RVFDataset dataset; + try { + dataset = new RVFDataset(10, featureIndex, labelIndex); + in = new BufferedReader(new FileReader(filename)); + + while (in.ready()) { + String line = in.readLine(); + if (lines != null) + lines.add(line); + dataset.add(svmLightLineToRVFDatum(line)); + } + } catch (IOException e) { + throw new RuntimeIOException(e); + } finally { + IOUtils.closeIgnoringExceptions(in); + } + return dataset; + } + + public static RVFDatum svmLightLineToRVFDatum(String l) { + l = l.replaceFirst("#.*$", ""); // remove any trailing comments + String[] line = l.split("\\s+"); + ClassicCounter features = new ClassicCounter(); + for (int i = 1; i < line.length; i++) { + String[] f = line[i].split(":"); + if (f.length != 2) { + throw new IllegalArgumentException("Bad data 
format: " + l); + } + double val = Double.parseDouble(f[1]); + features.incrementCount(f[0], val); + } + return new RVFDatum(features, line[0]); + } + + // todo [cdm 2012]: This duplicates the functionality of the methods above. Should be refactored. + /** + * Read SVM-light formatted data into this dataset. + * + * A strict SVM-light format is expected, where labels and features are both + * encoded as integers. These integers are converted into the dataset label + * and feature types using the indexes stored in this dataset. + * + * @param file The file from which the data should be read. + */ + public void readSVMLightFormat(File file) { + for (String line : IOUtils.readLines(file)) { + line = line.replaceAll("#.*", ""); // remove any trailing comments + String[] items = line.split("\\s+"); + Integer label = Integer.parseInt(items[0]); + Counter features = new ClassicCounter(); + for (int i = 1; i < items.length; i++) { + String[] featureItems = items[i].split(":"); + int feature = Integer.parseInt(featureItems[0]); + double value = Double.parseDouble(featureItems[1]); + features.incrementCount(this.featureIndex.get(feature), value); + } + this.add(new RVFDatum(features, this.labelIndex.get(label))); + } + } + + /** + * Write the dataset in SVM-light format to the file. + * + * A strict SVM-light format will be written, where labels and features are + * both encoded as integers, using the label and feature indexes of this + * dataset. Datasets written by this method can be read by + * {@link #readSVMLightFormat(File)}. + * + * @param file The location where the dataset should be written. 
+ */ + public void writeSVMLightFormat(File file) throws FileNotFoundException { + PrintWriter writer = new PrintWriter(file); + writeSVMLightFormat(writer); + writer.close(); + } + + public void writeSVMLightFormat(PrintWriter writer) { + for (RVFDatum datum : this) { + writer.print(this.labelIndex.indexOf(datum.label())); + Counter features = datum.asFeaturesCounter(); + for (F feature : features.keySet()) { + double count = features.getCount(feature); + writer.format(" %s:%f", this.featureIndex.indexOf(feature), count); + } + writer.println(); + } + } + + /** + * Prints the sparse feature matrix using + * {@link #printSparseFeatureMatrix(PrintWriter)} to {@link System#out + * System.out}. + */ + public void printSparseFeatureMatrix() { + printSparseFeatureMatrix(new PrintWriter(System.out, true)); + } + + /** + * Prints a sparse feature matrix representation of the Dataset. Prints the + * actual {@link Object#toString()} representations of features. + */ + public void printSparseFeatureMatrix(PrintWriter pw) { + String sep = "\t"; + for (int i = 0; i < size; i++) { + pw.print(labelIndex.get(labels[i])); + int[] datum = data[i]; + for (int feat : datum) { + pw.print(sep); + pw.print(featureIndex.get(feat)); + } + pw.println(); + } + } + + /** + * Prints a sparse feature-value output of the Dataset. Prints the actual + * {@link Object#toString()} representations of features. This is probably + * what you want for RVFDataset since the above two methods seem useless and + * unused. + */ + public void printSparseFeatureValues(PrintWriter pw) { + for (int i = 0; i < size; i++) { + printSparseFeatureValues(i, pw); + } + } + + /** + * Prints a sparse feature-value output of the Dataset. Prints the actual + * {@link Object#toString()} representations of features. This is probably + * what you want for RVFDataset since the above two methods seem useless and + * unused. 
+ */ + public void printSparseFeatureValues(int datumNo, PrintWriter pw) { + pw.print(labelIndex.get(labels[datumNo])); + pw.print('\t'); + pw.println("LABEL"); + int[] datum = data[datumNo]; + double[] vals = values[datumNo]; + assert datum.length == vals.length; + for (int i = 0; i < datum.length; i++) { + pw.print(featureIndex.get(datum[i])); + pw.print('\t'); + pw.println(vals[i]); + } + pw.println(); + } + + public static void main(String[] args) { + RVFDataset data = new RVFDataset(); + ClassicCounter c1 = new ClassicCounter(); + c1.incrementCount("fever", 3.5); + c1.incrementCount("cough", 1.1); + c1.incrementCount("congestion", 4.2); + + ClassicCounter c2 = new ClassicCounter(); + c2.incrementCount("fever", 1.5); + c2.incrementCount("cough", 2.1); + c2.incrementCount("nausea", 3.2); + + ClassicCounter c3 = new ClassicCounter(); + c3.incrementCount("cough", 2.5); + c3.incrementCount("congestion", 3.2); + + data.add(new RVFDatum(c1, "cold")); + data.add(new RVFDatum(c2, "flu")); + data.add(new RVFDatum(c3, "cold")); + data.summaryStatistics(); + + LinearClassifierFactory factory = new LinearClassifierFactory(); + factory.useQuasiNewton(); + + LinearClassifier c = factory.trainClassifier(data); + + ClassicCounter c4 = new ClassicCounter(); + c4.incrementCount("cough", 2.3); + c4.incrementCount("fever", 1.3); + + RVFDatum datum = new RVFDatum(c4); + + c.justificationOf((Datum) datum); + } + + @Override + public double[][] getValuesArray() { + values = trimToSize(values, size); + return values; + } + + @Override + public String toString() { + return "Dataset of size " + size; + } + + public String toSummaryString() { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + pw.println("Number of data points: " + size()); + + pw.print("Number of labels: " + labelIndex.size() + " ["); + Iterator iter = labelIndex.iterator(); + while (iter.hasNext()) { + pw.print(iter.next()); + if (iter.hasNext()) { + pw.print(", "); + } + } + 
pw.println("]"); + pw.println("Number of features (Phi(X) types): " + featureIndex.size()); + pw.println("Number of active feature types: " + numFeatureTypes()); + pw.println("Number of active feature tokens: " + numFeatureTokens()); + + return sw.toString(); + } + + /** + * {@inheritDoc} + */ + @Override + public Iterator> iterator() { + return new Iterator>() { + private int index; // = 0; + + public boolean hasNext() { + return this.index < size; + } + + public RVFDatum next() { + if (index >= size) { + throw new NoSuchElementException(); + } + RVFDatum next = getRVFDatum(this.index); + ++this.index; + return next; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + /** + * Randomizes the data array in place. Needs to be redefined here because we + * need to randomize the values as well. + */ + @Override + public void randomize(int randomSeed) { + Random rand = new Random(randomSeed); + for (int j = size - 1; j > 0; j--) { + int randIndex = rand.nextInt(j); + int[] tmp = data[randIndex]; + data[randIndex] = data[j]; + data[j] = tmp; + + int tmpl = labels[randIndex]; + labels[randIndex] = labels[j]; + labels[j] = tmpl; + + double[] tmpv = values[randIndex]; + values[randIndex] = values[j]; + values[j] = tmpv; + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SVMLightClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SVMLightClassifier.java new file mode 100644 index 0000000..98ec52f --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SVMLightClassifier.java @@ -0,0 +1,74 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; + +/** + * This class represents a trained SVM Classifier. 
It is actually just a + * LinearClassifier, but it can have a Platt (sigmoid) model overlaying + * it for the purpose of producing meaningful probabilities. + * + * @author Jenny Finkel + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (templatization) + */ + +public class SVMLightClassifier extends LinearClassifier { + + /** + * + */ + private static final long serialVersionUID = 1L; + public LinearClassifier platt = null; + + public SVMLightClassifier(ClassicCounter> weightCounter, ClassicCounter thresholds) { + super(weightCounter, thresholds); + } + + public SVMLightClassifier(ClassicCounter> weightCounter, ClassicCounter thresholds, LinearClassifier platt) { + super(weightCounter, thresholds); + this.platt = platt; + } + + public void setPlatt(LinearClassifier platt) { + this.platt = platt; + } + + /** + * Returns a counter for the log probability of each of the classes + * looking at the the sum of e^v for each count v, should be 1 + * Note: Uses SloppyMath.logSum which isn't exact but isn't as + * offensively slow as doing a series of exponentials + */ + @Override + public Counter logProbabilityOf(Datum example) { + if (platt == null) { + throw new UnsupportedOperationException("If you want to ask for the probability, you must train a Platt model!"); + } + Counter scores = scoresOf(example); + scores.incrementCount(null); + Counter probs = platt.logProbabilityOf(new RVFDatum(scores)); + //System.out.println(scores+" "+probs); + return probs; + } + + /** + * Returns a counter for the log probability of each of the classes + * looking at the the sum of e^v for each count v, should be 1 + * Note: Uses SloppyMath.logSum which isn't exact but isn't as + * offensively slow as doing a series of exponentials + */ + @Override + public Counter logProbabilityOf(RVFDatum example) { + if (platt == null) { + throw new UnsupportedOperationException("If you want to ask for the probability, you must train a Platt model!"); + } + Counter scores = scoresOf(example); + 
scores.incrementCount(null); + Counter probs = platt.logProbabilityOf(new RVFDatum(scores)); + //System.out.println(scores+" "+probs); + return probs; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SVMLightClassifierFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SVMLightClassifierFactory.java new file mode 100644 index 0000000..2873fcb --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SVMLightClassifierFactory.java @@ -0,0 +1,503 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.optimization.GoldenSectionLineSearch; +import edu.stanford.nlp.stats.*; +import edu.stanford.nlp.util.*; +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.RVFDatum; +import edu.stanford.nlp.optimization.LineSearcher; + +import java.io.*; +import java.text.NumberFormat; +import java.util.*; +import java.util.regex.Pattern; + +/** + * This class is meant for training SVMs ({@link SVMLightClassifier}s). It actually calls SVM Light. or + * SVM Struct for multiclass SVMs, on the command line, reads in the produced + * model file and creates a Linear Classifier. A Platt model is also trained + * (unless otherwise specified) on top of the SVM so that probabilities can + * be produced. 
+ * + * @author Jenny Finkel + * @author Aria Haghighi + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (templatization) + */ + +public class SVMLightClassifierFactory implements ClassifierFactory>{ //extends AbstractLinearClassifierFactory { + + /** + * + */ + private static final long serialVersionUID = 1L; + + /** + * C can be tuned using held-out set or cross-validation + * For binary SVM, if C=0, svmlight uses default of 1/(avg x*x) + */ + protected double C = -1.0; + private boolean useSigmoid = false; + protected boolean verbose = true; + private String svmLightLearn = "/u/nlp/packages/svm_light/svm_learn"; + private String svmStructLearn = "/u/nlp/packages/svm_multiclass/svm_multiclass_learn"; + private String svmLightClassify = "/u/nlp/packages/svm_light/svm_classify"; + private String svmStructClassify = "/u/nlp/packages/svm_multiclass/svm_multiclass_classify"; + private boolean useAlphaFile = false; + protected File alphaFile; + private boolean deleteTempFilesOnExit = true; + private int svmLightVerbosity = 0; // not verbose + private boolean doEval = false; + + /** @param svmLightLearn is the fullPathname of the training program of svmLight with default value "/u/nlp/packages/svm_light/svm_learn" + * @param svmStructLearn is the fullPathname of the training program of svmMultiClass with default value "/u/nlp/packages/svm_multiclass/svm_multiclass_learn" + */ + public SVMLightClassifierFactory(String svmLightLearn, String svmStructLearn){ + this.svmLightLearn = svmLightLearn; + this.svmStructLearn = svmStructLearn; + } + + public SVMLightClassifierFactory(){ + } + /** + * Set the C parameter (for the slack variables) for training the SVM. + */ + public void setC(double C) { + this.C = C; + } + + /** + * Get the C parameter (for the slack variables) for training the SVM. + */ + + public double getC() { + return C; + } + + /** + * Specify whether or not to train an overlying platt (sigmoid) + * model for producing meaningful probabilities. 
+ */ + public void setUseSigmoid(boolean useSigmoid) { + this.useSigmoid = useSigmoid; + } + + /** + * Get whether or not to train an overlying platt (sigmoid) + * model for producing meaningful probabilities. + */ + public boolean getUseSigma() { + return useSigmoid; + } + + + public boolean getDeleteTempFilesOnExitFlag() { + return deleteTempFilesOnExit; + } + + public void setDeleteTempFilesOnExitFlag(boolean deleteTempFilesOnExit) { + this.deleteTempFilesOnExit = deleteTempFilesOnExit; + } + + /** + * Reads in a model file in svm light format. It needs to know if its multiclass or not + * because it affects the number of header lines. Maybe there is another way to tell and we + * can remove this flag? + */ + private static Pair> readModel(File modelFile, boolean multiclass) { + int modelLineCount = 0; + try { + + int numLinesToSkip = multiclass ? 13 : 10; + String stopToken = "#"; + + BufferedReader in = new BufferedReader(new FileReader(modelFile)); + + for (int i=0; i < numLinesToSkip; i++) { + in.readLine(); + modelLineCount ++; + } + + List>> supportVectors = new ArrayList>>(); + // Read Threshold + String thresholdLine = in.readLine(); + modelLineCount ++; + String[] pieces = thresholdLine.split("\\s+"); + double threshold = Double.parseDouble(pieces[0]); + // Read Support Vectors + while (in.ready()) { + String svLine = in.readLine(); + modelLineCount ++; + pieces = svLine.split("\\s+"); + // First Element is the alpha_i * y_i + double alpha = Double.parseDouble(pieces[0]); + ClassicCounter supportVector = new ClassicCounter(); + for (int i=1; i < pieces.length; ++i) { + String piece = pieces[i]; + if (piece.equals(stopToken)) break; + // Each in featureIndex:num class + String[] indexNum = piece.split(":"); + String featureIndex = indexNum[0]; + // mihai: we may see "qid" as indexNum[0]. just skip this piece. this is the block id useful only for reranking, which we don't do here. + if(! 
featureIndex.equals("qid")){ + double count = Double.parseDouble(indexNum[1]); + supportVector.incrementCount(Integer.valueOf(featureIndex), count); + } + } + supportVectors.add(new Pair>(alpha, supportVector)); + } + + in.close(); + + return new Pair>(threshold, getWeights(supportVectors)); + } + catch (Exception e) { + e.printStackTrace(); + throw new RuntimeException("Error reading SVM model (line " + modelLineCount + " in file " + modelFile.getAbsolutePath() + ")"); + } + } + + /** + * Takes all the support vectors, and their corresponding alphas, and computes a weight + * vector that can be used in a vanilla LinearClassifier. This only works because + * we are using a linear kernel. The Counter is over the feature indices (+1 cos for + * some reason svm_light is 1-indexed), not features. + */ + private static ClassicCounter getWeights(List>> supportVectors) { + ClassicCounter weights = new ClassicCounter(); + for (Pair> sv : supportVectors) { + ClassicCounter c = new ClassicCounter(sv.second()); + Counters.multiplyInPlace(c, sv.first()); + Counters.addInPlace(weights, c); + } + return weights; + } + + /** + * Converts the weight Counter to be from indexed, svm_light format, to a format + * we can use in our LinearClassifier. + */ + private ClassicCounter> convertWeights(ClassicCounter weights, Index featureIndex, Index labelIndex, boolean multiclass) { + return multiclass ? convertSVMStructWeights(weights, featureIndex, labelIndex) : convertSVMLightWeights(weights, featureIndex, labelIndex); + } + + /** + * Converts the svm_light weight Counter (which uses feature indices) into a weight Counter + * using the actual features and labels. Because this is svm_light, and not svm_struct, the + * weights for the +1 class (which correspond to labelIndex.get(0)) and the -1 class + * (which correspond to labelIndex.get(1)) are just the negation of one another. 
+ */ + private ClassicCounter> convertSVMLightWeights(ClassicCounter weights, Index featureIndex, Index labelIndex) { + ClassicCounter> newWeights = new ClassicCounter>(); + for (int i : weights.keySet()) { + F f = featureIndex.get(i-1); + double w = weights.getCount(i); + // the first guy in the labelIndex was the +1 class and the second guy + // was the -1 class + newWeights.incrementCount(new Pair(f, labelIndex.get(0)),w); + newWeights.incrementCount(new Pair(f, labelIndex.get(1)),-w); + } + return newWeights; + } + + /** + * Converts the svm_struct weight Counter (in which the weight for a feature/label pair + * correspondes to ((labelIndex * numFeatures)+(featureIndex+1))) into a weight Counter + * using the actual features and labels. + */ + private ClassicCounter> convertSVMStructWeights(ClassicCounter weights, Index featureIndex, Index labelIndex) { + // int numLabels = labelIndex.size(); + int numFeatures = featureIndex.size(); + ClassicCounter> newWeights = new ClassicCounter>(); + for (int i : weights.keySet()) { + L l = labelIndex.get((i-1) / numFeatures); // integer division on purpose + F f = featureIndex.get((i-1) % numFeatures); + double w = weights.getCount(i); + newWeights.incrementCount(new Pair(f, l),w); + } + + return newWeights; + } + + /** + * Builds a sigmoid model to turn the classifier outputs into probabilities. 
+ */ + private LinearClassifier fitSigmoid(SVMLightClassifier classifier, GeneralDataset dataset) { + RVFDataset plattDataset = new RVFDataset(); + for (int i = 0; i < dataset.size(); i++) { + RVFDatum d = dataset.getRVFDatum(i); + Counter scores = classifier.scoresOf((Datum)d); + scores.incrementCount(null); + plattDataset.add(new RVFDatum(scores, d.label())); + } + LinearClassifierFactory factory = new LinearClassifierFactory(); + factory.setPrior(new LogPrior(LogPrior.LogPriorType.NULL)); + return factory.trainClassifier(plattDataset); + } + + /** + * This method will cross validate on the given data and number of folds + * to find the optimal C. The scorer is how you determine what to + * optimize for (F-score, accuracy, etc). The C is then saved, so that + * if you train a classifier after calling this method, that C will be used. + */ + public void crossValidateSetC(GeneralDataset dataset, int numFolds, final Scorer scorer, LineSearcher minimizer) { + System.out.println("in Cross Validate"); + + useAlphaFile = true; + boolean oldUseSigmoid = useSigmoid; + useSigmoid = false; + + final CrossValidator crossValidator = new CrossValidator(dataset,numFolds); + final Function,GeneralDataset,CrossValidator.SavedState>,Double> score = + new Function,GeneralDataset,CrossValidator.SavedState>,Double> () + { + public Double apply (Triple,GeneralDataset,CrossValidator.SavedState> fold) { + GeneralDataset trainSet = fold.first(); + GeneralDataset devSet = fold.second(); + alphaFile = (File)fold.third().state; + //train(trainSet,true,true); + SVMLightClassifier classifier = trainClassifierBasic(trainSet); + fold.third().state = alphaFile; + return scorer.score(classifier,devSet); + } + }; + + Function negativeScorer = + new Function () + { + public Double apply(Double cToTry) { + C = cToTry; + if (verbose) { System.out.print("C = "+cToTry+" "); } + Double averageScore = crossValidator.computeAverage(score); + if (verbose) { System.out.println(" -> average Score: 
"+averageScore); } + return -averageScore; + } + }; + + C = minimizer.minimize(negativeScorer); + + useAlphaFile = false; + useSigmoid = oldUseSigmoid; + } + + public void heldOutSetC(GeneralDataset train, double percentHeldOut, final Scorer scorer, LineSearcher minimizer) { + Pair, GeneralDataset> data = train.split(percentHeldOut); + heldOutSetC(data.first(), data.second(), scorer, minimizer); + } + + /** + * This method will cross validate on the given data and number of folds + * to find the optimal C. The scorer is how you determine what to + * optimize for (F-score, accuracy, etc). The C is then saved, so that + * if you train a classifier after calling this method, that C will be used. + */ + public void heldOutSetC(final GeneralDataset trainSet, final GeneralDataset devSet, final Scorer scorer, LineSearcher minimizer) { + + useAlphaFile = true; + boolean oldUseSigmoid = useSigmoid; + useSigmoid = false; + + Function negativeScorer = + new Function () + { + public Double apply(Double cToTry) { + C = cToTry; + SVMLightClassifier classifier = trainClassifierBasic(trainSet); + double score = scorer.score(classifier,devSet); + return -score; + } + }; + + C = minimizer.minimize(negativeScorer); + + useAlphaFile = false; + useSigmoid = oldUseSigmoid; + } + + @Deprecated + public SVMLightClassifier trainClassifier(List> examples) { + // TODO Auto-generated method stub + return null; + } + + private boolean tuneHeldOut = false; + private boolean tuneCV = false; + private Scorer scorer = new MultiClassAccuracyStats(); + private LineSearcher tuneMinimizer = new GoldenSectionLineSearch(true); + private int folds; + private double heldOutPercent; + + public double getHeldOutPercent() { + return heldOutPercent; + } + + public void setHeldOutPercent(double heldOutPercent) { + this.heldOutPercent = heldOutPercent; + } + + public int getFolds() { + return folds; + } + + public void setFolds(int folds) { + this.folds = folds; + } + + public LineSearcher getTuneMinimizer() { 
+ return tuneMinimizer; + } + + public void setTuneMinimizer(LineSearcher minimizer) { + this.tuneMinimizer = minimizer; + } + + public Scorer getScorer() { + return scorer; + } + + public void setScorer(Scorer scorer) { + this.scorer = scorer; + } + + public boolean getTuneCV() { + return tuneCV; + } + + public void setTuneCV(boolean tuneCV) { + this.tuneCV = tuneCV; + } + + public boolean getTuneHeldOut() { + return tuneHeldOut; + } + + public void setTuneHeldOut(boolean tuneHeldOut) { + this.tuneHeldOut = tuneHeldOut; + } + + public int getSvmLightVerbosity() { + return svmLightVerbosity; + } + + public void setSvmLightVerbosity(int svmLightVerbosity) { + this.svmLightVerbosity = svmLightVerbosity; + } + + public SVMLightClassifier trainClassifier(GeneralDataset dataset) { + if (tuneHeldOut) { + heldOutSetC(dataset, heldOutPercent, scorer, tuneMinimizer); + } else if (tuneCV) { + crossValidateSetC(dataset, folds, scorer, tuneMinimizer); + } + return trainClassifierBasic(dataset); + } + + Pattern whitespacePattern = Pattern.compile("\\s+"); + + public SVMLightClassifier trainClassifierBasic(GeneralDataset dataset) { + Index labelIndex = dataset.labelIndex(); + Index featureIndex = dataset.featureIndex; + boolean multiclass = (dataset.numClasses() > 2); + try { + + // this is the file that the model will be saved to + File modelFile = File.createTempFile("svm-", ".model"); + if (deleteTempFilesOnExit) { + modelFile.deleteOnExit(); + } + + // this is the file that the svm light formated dataset + // will be printed to + File dataFile = File.createTempFile("svm-", ".data"); + if (deleteTempFilesOnExit) { + dataFile.deleteOnExit(); + } + + // print the dataset + PrintWriter pw = new PrintWriter(new FileWriter(dataFile)); + dataset.printSVMLightFormat(pw); + pw.close(); + + // -v 0 makes it not verbose + // -m 400 gives it a larger cache, for faster training + String cmd = (multiclass ? 
svmStructLearn : svmLightLearn) + " -v " + svmLightVerbosity + " -m 400 "; + + // set the value of C if we have one specified + if (C > 0.0) cmd = cmd + " -c " + C + " "; // C value + + // Alpha File + if (useAlphaFile) { + File newAlphaFile = File.createTempFile("svm-", ".alphas"); + if (deleteTempFilesOnExit) { + newAlphaFile.deleteOnExit(); + } + cmd = cmd + " -a " + newAlphaFile.getAbsolutePath(); + if (alphaFile != null) { + cmd = cmd + " -y " + alphaFile.getAbsolutePath(); + } + alphaFile = newAlphaFile; + } + + // File and Model Data + cmd = cmd + " " + dataFile.getAbsolutePath() + " " + modelFile.getAbsolutePath(); + + if (verbose) System.err.println("<< "+cmd+" >>"); + + /*Process p = Runtime.getRuntime().exec(cmd); + + p.waitFor(); + + if (p.exitValue() != 0) throw new RuntimeException("Error Training SVM Light exit value: " + p.exitValue()); + p.destroy(); */ + SystemUtils.run(new ProcessBuilder(whitespacePattern.split(cmd)), + new PrintWriter(System.err), new PrintWriter(System.err)); + + if (doEval) { + File predictFile = File.createTempFile("svm-", ".pred"); + if (deleteTempFilesOnExit) { + predictFile.deleteOnExit(); + } + String evalCmd = (multiclass ? 
svmStructClassify : svmLightClassify) + " " + + dataFile.getAbsolutePath() + " " + modelFile.getAbsolutePath() + " " + predictFile.getAbsolutePath(); + if (verbose) System.err.println("<< "+evalCmd+" >>"); + SystemUtils.run(new ProcessBuilder(whitespacePattern.split(evalCmd)), + new PrintWriter(System.err), new PrintWriter(System.err)); + } + // read in the model file + Pair> weightsAndThresh = readModel(modelFile, multiclass); + double threshold = weightsAndThresh.first(); + ClassicCounter> weights = convertWeights(weightsAndThresh.second(), featureIndex, labelIndex, multiclass); + ClassicCounter thresholds = new ClassicCounter(); + if (!multiclass) { + thresholds.setCount(labelIndex.get(0), -threshold); + thresholds.setCount(labelIndex.get(1), threshold); + } + SVMLightClassifier classifier = new SVMLightClassifier(weights, thresholds); + if (doEval) { + File predictFile = File.createTempFile("svm-", ".pred2"); + if (deleteTempFilesOnExit) { + predictFile.deleteOnExit(); + } + PrintWriter pw2 = new PrintWriter(predictFile); + NumberFormat nf = NumberFormat.getNumberInstance(); + nf.setMaximumFractionDigits(5); + for (Datum datum:dataset) { + Counter scores = classifier.scoresOf(datum); + pw2.println(Counters.toString(scores, nf)); + } + pw2.close(); + } + + if (useSigmoid) { + if (verbose) System.out.print("fitting sigmoid..."); + classifier.setPlatt(fitSigmoid(classifier, dataset)); + if (verbose) System.out.println("done"); + } + + return classifier; + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SemiSupervisedLogConditionalObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SemiSupervisedLogConditionalObjectiveFunction.java new file mode 100644 index 0000000..9112f42 --- /dev/null +++ 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/SemiSupervisedLogConditionalObjectiveFunction.java @@ -0,0 +1,67 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; + + +/** + * Maximizes the conditional likelihood with a given prior. + * + * @author Jenny Finkel + * @author Sarah Spikes (Templatization) + * @author Ramesh Nallapati (Made the function more general to support other AbstractCachingDiffFunctions involving the summation of two objective functions) + */ + +public class SemiSupervisedLogConditionalObjectiveFunction extends AbstractCachingDiffFunction { + + AbstractCachingDiffFunction objFunc; + //BiasedLogConditionalObjectiveFunction biasedObjFunc; + AbstractCachingDiffFunction biasedObjFunc; + double convexComboFrac = 0.5; + + LogPrior prior; + + public void setPrior(LogPrior prior) { + this.prior = prior; + } + + @Override + public int domainDimension() { + return objFunc.domainDimension(); + } + + @Override + protected void calculate(double[] x) { + if (derivative == null) { + derivative = new double[domainDimension()]; + } + + value = convexComboFrac*objFunc.valueAt(x) + (1.0-convexComboFrac)*biasedObjFunc.valueAt(x); + //value = objFunc.valueAt(x) + biasedObjFunc.valueAt(x); + double[] d1 = objFunc.derivativeAt(x); + double[] d2 = biasedObjFunc.derivativeAt(x); + + for (int i = 0; i < domainDimension(); i++) { + derivative[i] = convexComboFrac*d1[i] + (1.0-convexComboFrac)*d2[i]; + //derivative[i] = d1[i] + d2[i]; + } + if(prior != null) + value += prior.compute(x, derivative); + } + + public SemiSupervisedLogConditionalObjectiveFunction(AbstractCachingDiffFunction objFunc, AbstractCachingDiffFunction biasedObjFunc, LogPrior prior, double convexComboFrac) { + this.objFunc = objFunc; + this.biasedObjFunc = biasedObjFunc; + this.prior = prior; + this.convexComboFrac = convexComboFrac; + if(convexComboFrac < 0 || convexComboFrac > 1.0) + throw new RuntimeException 
("convexComboFrac has to lie between 0 and 1 (both inclusive)."); + } + + public SemiSupervisedLogConditionalObjectiveFunction(AbstractCachingDiffFunction objFunc, AbstractCachingDiffFunction biasedObjFunc, LogPrior prior) { + //this.objFunc = objFunc; + //this.biasedObjFunc = biasedObjFunc; + //this.prior = prior; + this(objFunc,biasedObjFunc,prior,0.5); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/WeightedDataset.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/WeightedDataset.java new file mode 100644 index 0000000..f5c4cd9 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/classify/WeightedDataset.java @@ -0,0 +1,110 @@ +package edu.stanford.nlp.classify; + +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.util.Index; + +import java.util.Collection; +import java.util.Random; + +/** + * @author Galen Andrew + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + */ +public class WeightedDataset extends Dataset { + /** + * + */ + private static final long serialVersionUID = -5435125789127705430L; + protected float[] weights; + + public WeightedDataset(Index labelIndex, int[] labels, Index featureIndex, int[][] data, int size, float[] weights) { + super(labelIndex, labels, featureIndex, data, data.length); + this.weights = weights; + } + + public WeightedDataset() { + this(10); + } + + public WeightedDataset(int initSize) { + super(initSize); + weights = new float[initSize]; + } + + private float[] trimToSize(float[] i) { + float[] newI = new float[size]; + System.arraycopy(i, 0, newI, 0, size); + return newI; + } + + public float[] getWeights() { + weights = trimToSize(weights); + return weights; + } + + @Override + public float[] getFeatureCounts() { + float[] counts = new float[featureIndex.size()]; + for (int i = 0, m = size; i < m; i++) { + for (int j = 0, n = data[i].length; j < n; j++) { + 
counts[data[i][j]] += weights[i]; + } + } + return counts; + } + + @Override + public void add(Datum d) { + add(d, 1.0f); + } + + @Override + public void add(Collection features, L label) { + add(features, label, 1.0f); + } + + public void add(Datum d, float weight) { + add(d.asFeatures(), d.label(), weight); + } + + @Override + protected void ensureSize() { + super.ensureSize(); + if (weights.length == size) { + float[] newWeights = new float[size * 2]; + System.arraycopy(weights, 0, newWeights, 0, size); + weights = newWeights; + } + } + + public void add(Collection features, L label, float weight) { + ensureSize(); + addLabel(label); + addFeatures(features); + weights[size++] = weight; + } + + /** + * Randomizes the data array in place + * Needs to be redefined here because we need to randomize the weights as well + */ + @Override + public void randomize(int randomSeed) { + Random rand = new Random(randomSeed); + for(int j = size - 1; j > 0; j --){ + int randIndex = rand.nextInt(j); + + int [] tmp = data[randIndex]; + data[randIndex] = data[j]; + data[j] = tmp; + + int tmpl = labels[randIndex]; + labels[randIndex] = labels[j]; + labels[j] = tmpl; + + float tmpw = weights[randIndex]; + weights[randIndex] = weights[j]; + weights[j] = tmpw; + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ACEMentionExtractor.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ACEMentionExtractor.java new file mode 100644 index 0000000..c62b0b3 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ACEMentionExtractor.java @@ -0,0 +1,266 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. 
+// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.TreeSet; +import java.util.logging.Level; +import java.util.logging.Logger; + +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader; +import edu.stanford.nlp.ie.machinereading.structure.EntityMention; +import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations; +import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import 
edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; + +/** + * Extracts {@code } mentions from a file annotated in ACE format (ACE2004, ACE2005). + * + * @author Heeyoung Lee + */ +public class ACEMentionExtractor extends MentionExtractor { + private AceReader aceReader; + + private String corpusPath; + protected int fileIndex = 0; + protected String[] files; + + private static final Logger logger = SieveCoreferenceSystem.logger; + + private static class EntityComparator implements Comparator { + @Override + public int compare(EntityMention m1, EntityMention m2){ + if(m1.getExtentTokenStart() > m2.getExtentTokenStart()) return 1; + else if(m1.getExtentTokenStart() < m2.getExtentTokenStart()) return -1; + else if(m1.getExtentTokenEnd() > m2.getExtentTokenEnd()) return -1; + else if(m1.getExtentTokenEnd() < m2.getExtentTokenEnd()) return 1; + else return 0; + } + } + + public ACEMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception { + super(dict, semantics); + stanfordProcessor = loadStanfordProcessor(props); + + if(props.containsKey(Constants.ACE2004_PROP)) { + corpusPath = props.getProperty(Constants.ACE2004_PROP); + aceReader = new AceReader(stanfordProcessor, false, "ACE2004"); + } + else if(props.containsKey(Constants.ACE2005_PROP)) { + corpusPath = props.getProperty(Constants.ACE2005_PROP); + aceReader = new AceReader(stanfordProcessor, false); + } + aceReader.setLoggerLevel(Level.INFO); + + if(corpusPath.charAt(corpusPath.length()-1)!= File.separatorChar) corpusPath+= File.separatorChar; + + files = new File(corpusPath).list(); + } + + public ACEMentionExtractor(Dictionaries dict, Properties props, Semantics semantics, + LogisticClassifier singletonModel) throws Exception { + this(dict, props, semantics); + singletonPredictor = singletonModel; + } + + public void resetDocs() { + super.resetDocs(); + fileIndex = 0; + } + + public Document nextDoc() throws Exception { + List> allWords = new ArrayList>(); 
+ List> allGoldMentions = new ArrayList>(); + List> allPredictedMentions; + List allTrees = new ArrayList(); + + Annotation anno; + + try { + String filename=""; + while(files.length > fileIndex){ + if(files[fileIndex].contains("apf.xml")) { + filename = files[fileIndex]; + fileIndex++; + break; + } + else { + fileIndex++; + filename=""; + } + } + if(files.length <= fileIndex && filename.equals("")) return null; + + anno = aceReader.parse(corpusPath+filename); + stanfordProcessor.annotate(anno); + + + List sentences = anno.get(CoreAnnotations.SentencesAnnotation.class); + + for (CoreMap s : sentences){ + int i = 1; + for(CoreLabel w : s.get(CoreAnnotations.TokensAnnotation.class)){ + w.set(CoreAnnotations.IndexAnnotation.class, i++); + if(!w.containsKey(CoreAnnotations.UtteranceAnnotation.class)) { + w.set(CoreAnnotations.UtteranceAnnotation.class, 0); + } + } + allTrees.add(s.get(TreeCoreAnnotations.TreeAnnotation.class)); + allWords.add(s.get(CoreAnnotations.TokensAnnotation.class)); + EntityComparator comparator = new EntityComparator(); + extractGoldMentions(s, allGoldMentions, comparator); + } + + if(Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions; + else allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries); + + printRawDoc(sentences, allGoldMentions, filename, true); + printRawDoc(sentences, allPredictedMentions, filename, false); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + + return arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true); + } + + private void extractGoldMentions(CoreMap s, List> allGoldMentions, EntityComparator comparator) { + List goldMentions = new ArrayList(); + allGoldMentions.add(goldMentions); + List goldMentionList = s.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + List words = s.get(CoreAnnotations.TokensAnnotation.class); + + TreeSet treeForSortGoldMentions = new TreeSet(comparator); + if(goldMentionList!=null) 
treeForSortGoldMentions.addAll(goldMentionList); + if(!treeForSortGoldMentions.isEmpty()){ + for(EntityMention e : treeForSortGoldMentions){ + Mention men = new Mention(); + men.dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + men.startIndex = e.getExtentTokenStart(); + men.endIndex = e.getExtentTokenEnd(); + + String[] parseID = e.getObjectId().split("-"); + men.mentionID = Integer.parseInt(parseID[parseID.length-1]); + String[] parseCorefID = e.getCorefID().split("-E"); + men.goldCorefClusterID = Integer.parseInt(parseCorefID[parseCorefID.length-1]); + men.originalRef = -1; + + for(int j=allGoldMentions.size()-1 ; j>=0 ; j--){ + List l = allGoldMentions.get(j); + for(int k=l.size()-1 ; k>=0 ; k--){ + Mention m = l.get(k); + if(men.goldCorefClusterID == m.goldCorefClusterID){ + men.originalRef = m.mentionID; + } + } + } + goldMentions.add(men); + if(men.mentionID > maxID) maxID = men.mentionID; + + // set ner type + for(int j = e.getExtentTokenStart() ; j < e.getExtentTokenEnd() ; j++){ + CoreLabel word = words.get(j); + String ner = e.getType() +"-"+ e.getSubType(); + if(Constants.USE_GOLD_NE){ + word.set(CoreAnnotations.EntityTypeAnnotation.class, e.getMentionType()); + if(e.getMentionType().equals("NAM")) word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); + } + } + } + } + } + + private static void printRawDoc(List sentences, List> allMentions, String filename, boolean gold) throws FileNotFoundException { + StringBuilder doc = new StringBuilder(); + int previousOffset = 0; + Counter mentionCount = new ClassicCounter(); + for(List l : allMentions) { + for(Mention m : l) { + mentionCount.incrementCount(m.goldCorefClusterID); + } + } + + for(int i = 0 ; i mentions = allMentions.get(i); + + String[] tokens = sentence.get(CoreAnnotations.TextAnnotation.class).split(" "); + String sent = ""; + List t = sentence.get(CoreAnnotations.TokensAnnotation.class); + if(previousOffset+2 < 
t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) sent += "\n"; + previousOffset = t.get(t.size()-1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + Counter startCounts = new ClassicCounter(); + Counter endCounts = new ClassicCounter(); + Map> endID = Generics.newHashMap(); + for (Mention m : mentions) { + startCounts.incrementCount(m.startIndex); + endCounts.incrementCount(m.endIndex); + if(!endID.containsKey(m.endIndex)) endID.put(m.endIndex, Generics.newHashSet()); + endID.get(m.endIndex).add(m.goldCorefClusterID); + } + for (int j = 0 ; j < tokens.length; j++){ + if(endID.containsKey(j)) { + for(Integer id : endID.get(j)){ + if(mentionCount.getCount(id)!=1 && gold) sent += "]_"+id; + else sent += "]"; + } + } + for (int k = 0 ; k < startCounts.getCount(j) ; k++) { + if(!sent.endsWith("[")) sent += " "; + sent += "["; + } + sent += " "; + sent = sent + tokens[j]; + } + for(int k = 0 ; k fileList; + private int curFileIndex; + private final Options options; + + public static final Logger logger = Logger.getLogger(CoNLL2011DocumentReader.class.getName()); + + public CoNLL2011DocumentReader(String filepath) + { + this(filepath, new Options()); + } + + public CoNLL2011DocumentReader(String filepath, Options options) + { +// this.filepath = filepath; + this.fileList = getFiles(filepath, options.filePattern); + this.options = options; + if (options.sortFiles) { + Collections.sort(this.fileList); + } + curFileIndex = 0; + logger.info("Reading " + fileList.size() + " CoNll2011 files from " + filepath); + } + + private static List getFiles(String filepath, Pattern filter) + { + Iterable iter = IOUtils.iterFilesRecursive(new File(filepath), filter); + List fileList = new ArrayList(); + for (File f:iter) { + fileList.add(f); + } + Collections.sort(fileList); + return fileList; + } + + public void reset() { + curFileIndex = 0; + if (docIterator != null) { + docIterator.close(); + docIterator = null; + } + } + + public Document getNextDocument() 
+ { + try { + if (curFileIndex >= fileList.size()) return null; // DONE! + File curFile = fileList.get(curFileIndex); + if (docIterator == null) { + docIterator = new DocumentIterator(curFile.getAbsolutePath(), options); + } + while ( ! docIterator.hasNext()) { + logger.info("Processed " + docIterator.docCnt + " documents in " + curFile.getAbsolutePath()); + docIterator.close(); + curFileIndex++; + if (curFileIndex >= fileList.size()) { + return null; // DONE! + } + curFile = fileList.get(curFileIndex); + docIterator = new DocumentIterator(curFile.getAbsolutePath(), options); + } + Document next = docIterator.next(); + SieveCoreferenceSystem.logger.fine("Reading document: " + next.getDocumentID()); + return next; + } catch (IOException ex) { + throw new RuntimeIOException(ex); + } + } + + public void close() + { + IOUtils.closeIgnoringExceptions(docIterator); + } + + public static class NamedEntityAnnotation implements CoreAnnotation { + public Class getType() { + return CoreMap.class; + } + } + + public static class CorefMentionAnnotation implements CoreAnnotation { + public Class getType() { + return CoreMap.class; + } + } + + /** Flags **/ + public static class Options { + public boolean useCorefBIOESEncoding = false; // Marks Coref mentions with prefix + // B- begin, I- inside, E- end, S- single + public boolean annotateTokenCoref = true; // Annotate token with CorefAnnotation + // If token belongs to multiple clusters + // coref clusterid are separted by '|' + public boolean annotateTokenSpeaker = true; // Annotate token with SpeakerAnnotation + public boolean annotateTokenPos = true; // Annotate token with PartOfSpeechAnnotation + public boolean annotateTokenNer = true; // Annotate token with NamedEntityTagAnnotation + + public boolean annotateTreeCoref = false; // Annotate tree with CorefMentionAnnotation + public boolean annotateTreeNer = false; // Annotate tree with NamedEntityAnnotation + + public String backgroundNerTag = "O"; // Background NER tag + + 
protected String fileFilter; + protected Pattern filePattern; + protected boolean sortFiles; + + public Options() { + this(".*_gold_conll$"); // _gold_conll or _auto_conll or .conll + } + + public Options(String filter) { + fileFilter = filter; + filePattern = Pattern.compile(fileFilter); + } + + public void setFilter(String filter) { + fileFilter = filter; + filePattern = Pattern.compile(fileFilter); + } + } + + public static class Document { + String documentIdPart; + String documentID; + String partNo; + List> sentenceWordLists = new ArrayList>(); + + Annotation annotation; + CollectionValuedMap corefChainMap; + List nerChunks; + + public String getDocumentID() { + return documentID; + } + + public void setDocumentID(String documentID) { + this.documentID = documentID; + } + + public String getPartNo() { + return partNo; + } + + public void setPartNo(String partNo) { + this.partNo = partNo; + } + + public List> getSentenceWordLists() { + return sentenceWordLists; + } + + public void addSentence(List sentence) { + this.sentenceWordLists.add(sentence); + } + + public Annotation getAnnotation() { + return annotation; + } + + public void setAnnotation(Annotation annotation) { + this.annotation = annotation; + } + + public CollectionValuedMap getCorefChainMap() + { + return corefChainMap; + } + } + + private static String getField(String[] fields, int pos) + { + if (pos == FIELD_LAST) { + return fields[fields.length - 1]; + } else { + return fields[pos]; + } + } + + private static String concatField(List sentWords, int pos) + { + StringBuilder sb = new StringBuilder(); + for (String[] fields:sentWords) { + if (sb.length() > 0) { + sb.append(' '); + } + sb.append(getField(fields, pos)); + } + return sb.toString(); + } + + /** Helper iterator **/ + private static class DocumentIterator extends AbstractIterator implements Closeable { + + private static final Pattern delimiterPattern = Pattern.compile("\\s+"); + private static final LabeledScoredTreeReaderFactory 
treeReaderFactory = + new LabeledScoredTreeReaderFactory((TreeNormalizer) null); + + private final Options options; + + // State + String filename; + BufferedReader br; + Document nextDoc; + int lineCnt = 0; + int docCnt = 0; + + public DocumentIterator(String filename, Options options) throws IOException { + this.options = options; + this.filename = filename; + this.br = IOUtils.getBufferedFileReader(filename); + nextDoc = readNextDocument(); + } + + @Override + public boolean hasNext() { + return nextDoc != null; + } + + @Override + public Document next() { + if (nextDoc == null) { + throw new NoSuchElementException("DocumentIterator exhausted."); + } + Document curDoc = nextDoc; + nextDoc = readNextDocument(); + return curDoc; + } + + private static final Pattern starPattern = Pattern.compile("\\*"); + + private static Tree wordsToParse(List sentWords) + { + StringBuilder sb = new StringBuilder(); + for (String[] fields:sentWords) { + if (sb.length() > 0) { + sb.append(' '); + } + + String str = fields[FIELD_PARSE_BIT]; + String tagword = "(" + fields[FIELD_POS_TAG] + " " + fields[FIELD_WORD] + ")"; + // Replace stars + int si = str.indexOf('*'); + sb.append(str.substring(0, si)); + sb.append(tagword); + sb.append(str.substring(si+1)); + si = str.indexOf('*', si+1); + if (si >= 0) { + logger.warning(" Parse bit with multiple *: " + str); + } + } + String parseStr = sb.toString(); + return Tree.valueOf(parseStr, treeReaderFactory); + } + + + private static List> getCorefSpans(List sentWords) + { + return getLabelledSpans(sentWords, FIELD_COREF, HYPHEN, true); + } + + private static List> getNerSpans(List sentWords) + { + return getLabelledSpans(sentWords, FIELD_NER_TAG, ASTERISK, false); + } + + + private static final String ASTERISK = "*"; + private static final String HYPHEN = "-"; + + private static List> getLabelledSpans(List sentWords, int fieldIndex, + String defaultMarker, boolean checkEndLabel) + { + List> spans = new ArrayList>(); + Stack> openSpans = 
new Stack>(); + boolean removeStar = (ASTERISK.equals(defaultMarker)); + for (int wordPos = 0; wordPos < sentWords.size(); wordPos++) { + String[] fields = sentWords.get(wordPos); + String val = getField(fields, fieldIndex); + if (!defaultMarker.equals(val)) { + int openParenIndex = -1; + int lastDelimiterIndex = -1; + for (int j = 0; j < val.length(); j++) { + char c = val.charAt(j); + boolean isDelimiter = false; + if (c == '(' || c == ')' || c == '|') { + if (openParenIndex >= 0) { + String s = val.substring(openParenIndex+1, j); + if (removeStar) { + s = starPattern.matcher(s).replaceAll(""); + } + openSpans.push(new Triple(wordPos,-1,s)); + openParenIndex = -1; + } + isDelimiter = true; + } + if (c == '(') { + openParenIndex = j; + } else if (c == ')') { + Triple t = openSpans.pop(); + if (checkEndLabel) { + // NOTE: end parens may cross (usually because mention either start or end on the same token + // and it is just an artifact of the ordering + String s = val.substring(lastDelimiterIndex+1, j); + if (!s.equals(t.third())) { + Stack> saved = new Stack>(); + while (!s.equals(t.third())) { + // find correct match + saved.push(t); + if (openSpans.isEmpty()) { + throw new RuntimeException("Cannot find matching labelled span for " + s); + } + t = openSpans.pop(); + } + while (!saved.isEmpty()) { + openSpans.push(saved.pop()); + } + assert(s.equals(t.third())); + } + } + t.setSecond(wordPos); + spans.add(t); + } + if (isDelimiter) { + lastDelimiterIndex = j; + } + } + if (openParenIndex >= 0) { + String s = val.substring(openParenIndex+1, val.length()); + if (removeStar) { + s = starPattern.matcher(s).replaceAll(""); + } + openSpans.push(new Triple(wordPos,-1,s)); + } + } + } + if (openSpans.size() != 0) { + throw new RuntimeException("Error extracting labelled spans for column " + fieldIndex + ": " + + concatField(sentWords, fieldIndex)); + } + return spans; + } + + private CoreMap wordsToSentence(List sentWords) + { + String sentText = concatField(sentWords, 
FIELD_WORD); + Annotation sentence = new Annotation(sentText); + Tree tree = wordsToParse(sentWords); + sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree); + List leaves = tree.getLeaves(); + // Check leaves == number of words + assert(leaves.size() == sentWords.size()); + List tokens = new ArrayList(leaves.size()); + sentence.set(CoreAnnotations.TokensAnnotation.class, tokens); + for (int i = 0; i < sentWords.size(); i++) { + String[] fields = sentWords.get(i); + int wordPos = Integer.parseInt(fields[FIELD_WORD_NO]); + assert(wordPos == i); + Tree leaf = leaves.get(i); + CoreLabel token = (CoreLabel) leaf.label(); + tokens.add(token); + if (options.annotateTokenSpeaker) { + String speaker = fields[FIELD_SPEAKER_AUTHOR].replace("_", " "); + if (!HYPHEN.equals(speaker)) { + token.set(CoreAnnotations.SpeakerAnnotation.class, speaker); + } + } + } + if (options.annotateTokenPos) { + for (Tree leaf:leaves) { + CoreLabel token = (CoreLabel) leaf.label(); + token.set(CoreAnnotations.PartOfSpeechAnnotation.class, leaf.parent(tree).value()); + } + } + if (options.annotateTokenNer) { + List> nerSpans = getNerSpans(sentWords); + for (Triple nerSpan:nerSpans) { + int startToken = nerSpan.first(); + int endToken = nerSpan.second(); /* inclusive */ + String label = nerSpan.third(); + for (int i = startToken; i <= endToken; i++) { + Tree leaf = leaves.get(i); + CoreLabel token = (CoreLabel) leaf.label(); + String oldLabel = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); + if (oldLabel != null) { + logger.warning("Replacing old named entity tag " + oldLabel + " with " + label); + } + token.set(CoreAnnotations.NamedEntityTagAnnotation.class, label); + } + } + for (CoreLabel token:tokens) { + if (!token.containsKey(CoreAnnotations.NamedEntityTagAnnotation.class)) { + token.set(CoreAnnotations.NamedEntityTagAnnotation.class, options.backgroundNerTag); + } + } + } + if (options.annotateTokenCoref) { + List> corefSpans = getCorefSpans(sentWords); + for (Triple 
corefSpan:corefSpans) { + int startToken = corefSpan.first(); + int endToken = corefSpan.second(); /* inclusive */ + String label = corefSpan.third(); + for (int i = startToken; i <= endToken; i++) { + Tree leaf = leaves.get(i); + CoreLabel token = (CoreLabel) leaf.label(); + String curLabel = label; + if (options.useCorefBIOESEncoding) { + String prefix; + if (startToken == endToken) { + prefix = "S-"; + } else if (i == startToken) { + prefix = "B-"; + } else if (i == endToken) { + prefix = "E-"; + } else { + prefix = "I-"; + } + curLabel = prefix + label; + } + String oldLabel = token.get(CorefCoreAnnotations.CorefAnnotation.class); + if (oldLabel != null) { + curLabel = oldLabel + "|" + curLabel; + } + token.set(CorefCoreAnnotations.CorefAnnotation.class, curLabel); + } + } + } + return sentence; + } + + public static Annotation sentencesToDocument(String documentID, List sentences) + { + String docText = null; + Annotation document = new Annotation(docText); + document.set(CoreAnnotations.DocIDAnnotation.class, documentID); + document.set(CoreAnnotations.SentencesAnnotation.class, sentences); + + + // Accumulate docTokens and label sentence with overall token begin/end, and sentence index annotations + List docTokens = new ArrayList(); + int sentenceIndex = 0; + int tokenBegin = 0; + for (CoreMap sentenceAnnotation:sentences) { + List sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class); + docTokens.addAll(sentenceTokens); + + int tokenEnd = tokenBegin + sentenceTokens.size(); + sentenceAnnotation.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin); + sentenceAnnotation.set(CoreAnnotations.TokenEndAnnotation.class, tokenEnd); + sentenceAnnotation.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex); + sentenceIndex++; + tokenBegin = tokenEnd; + } + document.set(CoreAnnotations.TokensAnnotation.class, docTokens); + + // Put in character offsets + int i = 0; + for (CoreLabel token:docTokens) { + String tokenText = 
token.get(CoreAnnotations.TextAnnotation.class); + token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i); + i+=tokenText.length(); + token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, i); + i++; // Skip space + } + for (CoreMap sentenceAnnotation:sentences) { + List sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class); + sentenceAnnotation.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, + sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); + sentenceAnnotation.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, + sentenceTokens.get(sentenceTokens.size()-1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); + } + + return document; + } + + private static Tree getLowestCommonAncestor(Tree root, int startToken, int endToken) + { + Tree leftLeaf = Trees.getLeaf(root, startToken); + Tree rightLeaf = Trees.getLeaf(root, endToken); + // todo [cdm 2013]: It might be good to climb certain unaries here, like VP or S under NP, but it's not good to climb all unaries (e.g., NP under FRAG) + return Trees.getLowestCommonAncestor(leftLeaf, rightLeaf, root); + } + + private static Tree getTreeNonTerminal(Tree root, int startToken, int endToken, boolean acceptPreTerminals) + { + Tree t = getLowestCommonAncestor(root, startToken, endToken); + if (t.isLeaf()) { + t = t.parent(root); + } + if (!acceptPreTerminals && t.isPreTerminal()) { + t = t.parent(root); + } + return t; + } + + public void annotateDocument(Document document) + { + List sentences = new ArrayList(document.sentenceWordLists.size()); + for (List sentWords:document.sentenceWordLists) { + sentences.add(wordsToSentence(sentWords)); + } + + Annotation docAnnotation = sentencesToDocument(document.documentIdPart /*document.documentID + "." 
+ document.partNo */, sentences); + document.setAnnotation(docAnnotation); + + // Do this here so we have updated character offsets and all + CollectionValuedMap corefChainMap = new CollectionValuedMap(CollectionFactory.arrayListFactory()); + List nerChunks = new ArrayList(); + for (int i = 0; i < sentences.size(); i++) { + CoreMap sentence = sentences.get(i); + Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + tree.setSpans(); + List sentWords = document.sentenceWordLists.get(i); + + // Get NER chunks + List> nerSpans = getNerSpans(sentWords); + for (Triple nerSpan:nerSpans) { + int startToken = nerSpan.first(); + int endToken = nerSpan.second(); /* inclusive */ + String label = nerSpan.third(); + CoreMap nerChunk = ChunkAnnotationUtils.getAnnotatedChunk(sentence, startToken, endToken+1); + nerChunk.set(CoreAnnotations.NamedEntityTagAnnotation.class, label); + nerChunk.set(CoreAnnotations.SentenceIndexAnnotation.class, sentence.get(CoreAnnotations.SentenceIndexAnnotation.class)); + nerChunks.add(nerChunk); + Tree t = getTreeNonTerminal(tree, startToken, endToken, true); + if (t.getSpan().getSource() == startToken && t.getSpan().getTarget() == endToken) { + nerChunk.set(TreeCoreAnnotations.TreeAnnotation.class, t); + if (options.annotateTreeNer) { + Label tlabel = t.label(); + if (tlabel instanceof CoreLabel) { + ((CoreLabel) tlabel).set(NamedEntityAnnotation.class, nerChunk); + } + } + } + } + + List> corefSpans = getCorefSpans(sentWords); + for (Triple corefSpan:corefSpans) { + int startToken = corefSpan.first(); + int endToken = corefSpan.second(); /* inclusive */ + String corefId = corefSpan.third(); + CoreMap mention = ChunkAnnotationUtils.getAnnotatedChunk(sentence, startToken, endToken+1); + mention.set(CorefCoreAnnotations.CorefAnnotation.class, corefId); + mention.set(CoreAnnotations.SentenceIndexAnnotation.class, sentence.get(CoreAnnotations.SentenceIndexAnnotation.class)); + corefChainMap.add(corefId, mention); + Tree t = 
getTreeNonTerminal(tree, startToken, endToken, true); + mention.set(TreeCoreAnnotations.TreeAnnotation.class, t); + if (options.annotateTreeCoref) { + Label tlabel = t.label(); + if (tlabel instanceof CoreLabel) { + ((CoreLabel) tlabel).set(CorefMentionAnnotation.class, mention); + } + } + } + + } + document.corefChainMap = corefChainMap; + document.nerChunks = nerChunks; + } + + private static final String docStart = "#begin document "; + private static final int docStartLength = docStart.length(); + + public Document readNextDocument() { + try { + List curSentWords = new ArrayList(); + Document document = null; + for (String line; (line = br.readLine()) != null; ) { + lineCnt++; + line = line.trim(); + if (line.length() != 0) { + if (line.startsWith(docStart)) { + // Start of new document + if (document != null) { + logger.warning("Unexpected begin document at line (\" + filename + \",\" + lineCnt + \")"); + } + document = new Document(); + document.documentIdPart = line.substring(docStartLength); + } else if (line.startsWith("#end document")) { + annotateDocument(document); + docCnt++; + return document; + // End of document + } else { + assert document != null; + String[] fields = delimiterPattern.split(line); + if (fields.length < FIELDS_MIN) { + throw new RuntimeException("Unexpected number of field " + fields.length + + ", expected >= " + FIELDS_MIN + " for line (" + filename + "," + lineCnt + "): " + line); + } + String curDocId = fields[FIELD_DOC_ID]; + String partNo = fields[FIELD_PART_NO]; + if (document.getDocumentID() == null) { + document.setDocumentID(curDocId); + document.setPartNo(partNo); + } else { + // Check documentID didn't suddenly change on us + assert(document.getDocumentID().equals(curDocId)); + assert(document.getPartNo().equals(partNo)); + } + curSentWords.add(fields); + } + } else { + // Current sentence has ended, new sentence is about to be started + if (curSentWords.size() > 0) { + assert document != null; + 
document.addSentence(curSentWords); + curSentWords = new ArrayList(); + } + } + } + } catch (IOException ex) { + throw new RuntimeIOException(ex); + } + return null; + } + + public void close() { + IOUtils.closeIgnoringExceptions(br); + } + + } // end static class DocumentIterator + + public static void usage() + { + System.err.println("java edu.stanford.nlp.dcoref.CoNLL2011DocumentReader [-ext ] -i -o "); + } + + public static Pair getMention(Integer index, String corefG, List sentenceAnno) { + + Integer i = -1; + Integer end = index; + for (CoreLabel newAnno : sentenceAnno) { + i += 1; + if (i > index) { + String corefS = newAnno.get(CorefCoreAnnotations.CorefAnnotation.class); + if (corefS != null) { + String[] allC = corefS.split("\\|"); + if (Arrays.asList(allC).contains(corefG)) { + end = i; + } else { + break; + } + } else { + break; + } + } + } + return Pair.makePair(index, end); + } + + public static boolean include(Map,String> sentenceInfo, + Pair mention, + String corefG) { + Set> keys = sentenceInfo.keySet(); + for (Pair key : keys) { + String corefS = sentenceInfo.get(key); + if (corefS != null && corefS.equals(corefG)) { + if (key.first < mention.first && key.second.equals(mention.second)) { + return true; + } + } + } + return false; + } + + public static void writeTabSep(PrintWriter pw, CoreMap sentence, CollectionValuedMap chainmap) + { + HeadFinder headFinder = new ModCollinsHeadFinder(); + + List sentenceAnno = sentence.get(CoreAnnotations.TokensAnnotation.class); + + Tree sentenceTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + Map,String> sentenceInfo = Generics.newHashMap(); + + Set sentenceSubTrees = sentenceTree.subTrees(); + sentenceTree.setSpans(); + Map,Tree> treeSpanMap = Generics.newHashMap(); + Map,List> wordSpanMap = Generics.newHashMap(); + + for (Tree ctree : sentenceSubTrees) { + IntPair span = ctree.getSpan(); + if (span != null) { + treeSpanMap.put(Pair.makePair(span.getSource(), span.getTarget()), ctree); + 
wordSpanMap.put(Pair.makePair(span.getSource(), span.getTarget()), ctree.getLeaves()); + } + } + + String[][] finalSentence; + finalSentence = new String [sentenceAnno.size()][]; + Map,String> allHeads = Generics.newHashMap(); + + int index = -1; + for (CoreLabel newAnno : sentenceAnno) { + index += 1; + String word = newAnno.word(); + String tag = newAnno.tag(); + String cat = newAnno.ner(); + String coref = newAnno.get(CorefCoreAnnotations.CorefAnnotation.class); + finalSentence[index] = new String[4]; + finalSentence[index][0] = word; + finalSentence[index][1] = tag; + finalSentence[index][2] = cat; + finalSentence[index][3] = coref; + + if (coref == null) { + sentenceInfo.put(Pair.makePair(index, index), coref); + finalSentence[index][3] = "O"; + + } else { + String[] allC = coref.split("\\|"); + for (String corefG : allC) { + Pair mention = getMention(index, corefG, sentenceAnno); + + if ( ! include(sentenceInfo, mention, corefG)) { + // find largest NP in mention + sentenceInfo.put(mention, corefG); + Tree mentionTree = treeSpanMap.get(mention); + String head = null; + if (mentionTree != null) { + head = mentionTree.headTerminal(headFinder).nodeString(); + } else if (mention.first.equals(mention.second)) { + head = word; + } + allHeads.put(mention, head); + } + } + + if (allHeads.values().contains(word)) { + finalSentence[index][3] = "MENTION"; + } else { + finalSentence[index][3] = "O"; + } + } + } + for (int i=0;i mentionTreeLabelCounter = new IntCounter(); + IntCounter mentionTreeNonPretermLabelCounter = new IntCounter(); + IntCounter mentionTreePretermNonPretermNoMatchLabelCounter = new IntCounter(); + IntCounter mentionTreeMixedLabelCounter = new IntCounter(); + IntCounter mentionTokenLengthCounter = new IntCounter(); + IntCounter nerMentionTokenLengthCounter = new IntCounter(); + int mentionExactTreeSpan = 0; + int nonPretermSpanMatches = 0; + int totalMentions = 0; + int nestedNerMentions = 0; + int nerMentions = 0; + + public void process(Document 
doc) + { + List sentences = doc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class); + for (String id:doc.corefChainMap.keySet()) { + Collection mentions = doc.corefChainMap.get(id); + for (CoreMap m:mentions) { + CoreMap sent = sentences.get(m.get(CoreAnnotations.SentenceIndexAnnotation.class)); + Tree root = sent.get(TreeCoreAnnotations.TreeAnnotation.class); + Tree t = m.get(TreeCoreAnnotations.TreeAnnotation.class); + Tree npt = t; + Tree npt2 = t; + if (npt.isPreTerminal()) { + npt = npt.parent(root); + } + int sentTokenStart = sent.get(CoreAnnotations.TokenBeginAnnotation.class); + int tokenStart = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenStart; + int tokenEnd = m.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart; + int length = tokenEnd - tokenStart; + mentionTokenLengthCounter.incrementCount(length); + // Check if exact span + IntPair span = t.getSpan(); + if (span != null) { + if (span.getSource() == tokenStart && span.getTarget() == tokenEnd - 1) { + mentionExactTreeSpan++; + } else { + logger.info("Tree span is " + span + ", tree node is " + t); + logger.info("Mention span is " + tokenStart + " " + (tokenEnd - 1) + ", mention is " + m); + } + } else { + logger.warning("No span for " + t); + } + IntPair nptSpan = npt.getSpan(); + if (nptSpan.getSource() == tokenStart && nptSpan.getTarget() == tokenEnd - 1) { + nonPretermSpanMatches++; + npt2 = npt; + } else { + mentionTreePretermNonPretermNoMatchLabelCounter.incrementCount(t.label().value()); + logger.info("NPT: Tree span is " + span + ", tree node is " + npt); + logger.info("NPT: Mention span is " + tokenStart + " " + (tokenEnd - 1) + ", mention is " + m); + Label tlabel = t.label(); + if (tlabel instanceof CoreLabel) { + CoreMap mention = ((CoreLabel) tlabel).get(CorefMentionAnnotation.class); + String corefClusterId = mention.get(CorefCoreAnnotations.CorefAnnotation.class); + Collection clusteredMentions = doc.corefChainMap.get(corefClusterId); + for (CoreMap 
m2:clusteredMentions) { + logger.info("NPT: Clustered mention " + m2.get(CoreAnnotations.TextAnnotation.class)); + } + } + + } + totalMentions++; + mentionTreeLabelCounter.incrementCount(t.label().value()); + mentionTreeNonPretermLabelCounter.incrementCount(npt.label().value()); + mentionTreeMixedLabelCounter.incrementCount(npt2.label().value()); + Label tlabel = t.label(); + if (tlabel instanceof CoreLabel) { + if (((CoreLabel) tlabel).containsKey(NamedEntityAnnotation.class)) { + // walk up tree + nerMentions++; + nerMentionTokenLengthCounter.incrementCount(length); + + Tree parent = t.parent(root); + while (parent != null) { + Label plabel = parent.label(); + if (plabel instanceof CoreLabel) { + if (((CoreLabel) plabel).containsKey(NamedEntityAnnotation.class)) { + logger.info("NER Mention: " + m); + CoreMap parentNerChunk = ((CoreLabel) plabel).get(NamedEntityAnnotation.class); + logger.info("Nested inside NER Mention: " + parentNerChunk); + logger.info("Nested inside NER Mention parent node: " + parent); + nestedNerMentions++; + break; + } + } + parent = parent.parent(root); + } + } + } + } + } + } + + private static void appendFrac(StringBuilder sb, String label, int num, int den) + { + double frac = ((double) num)/ den; + sb.append(label).append("\t").append(frac).append("\t(").append(num).append("/").append(den).append(")"); + } + + private static void appendIntCountStats(StringBuilder sb, String label, IntCounter counts) + { + sb.append(label).append("\n"); + List sortedKeys = Counters.toSortedList(counts); + int total = counts.totalIntCount(); + for (E key:sortedKeys) { + int count = counts.getIntCount(key); + appendFrac(sb, key.toString(), count, total); + sb.append("\n"); + } + } + + public String toString() + { + StringBuilder sb = new StringBuilder(); + appendIntCountStats(sb, "Mention Tree Labels (no preterminals)", mentionTreeNonPretermLabelCounter); + sb.append("\n"); + appendIntCountStats(sb, "Mention Tree Labels (with preterminals)", 
mentionTreeLabelCounter); + sb.append("\n"); + appendIntCountStats(sb, "Mention Tree Labels (preterminals with parent span not match)", mentionTreePretermNonPretermNoMatchLabelCounter); + sb.append("\n"); + appendIntCountStats(sb, "Mention Tree Labels (mixed)", mentionTreeMixedLabelCounter); + sb.append("\n"); + appendIntCountStats(sb, "Mention Lengths", mentionTokenLengthCounter); + sb.append("\n"); + appendFrac(sb, "Mention Exact Non Preterm Tree Span", nonPretermSpanMatches, totalMentions); + sb.append("\n"); + appendFrac(sb, "Mention Exact Tree Span", mentionExactTreeSpan, totalMentions); + sb.append("\n"); + appendFrac(sb, "NER", nerMentions, totalMentions); + sb.append("\n"); + appendFrac(sb, "Nested NER", nestedNerMentions, totalMentions); + sb.append("\n"); + appendIntCountStats(sb, "NER Mention Lengths", nerMentionTokenLengthCounter); + return sb.toString(); + } + + } + + /** Reads and dumps output, mainly for debugging. */ + public static void main(String[] args) throws IOException { + Properties props = StringUtils.argsToProperties(args); + boolean debug = Boolean.parseBoolean(props.getProperty("debug", "false")); + String filepath = props.getProperty("i"); + String outfile = props.getProperty("o"); + if (filepath == null || outfile == null) { + usage(); + System.exit(-1); + } + PrintWriter fout = new PrintWriter(outfile); + logger.info("Writing to " + outfile); + String ext = props.getProperty("ext"); + Options options; + if (ext != null) { + options = new Options(".*" + ext + "$"); + } else { + options = new Options(); + } + options.annotateTreeCoref = true; + options.annotateTreeNer = true; + CorpusStats corpusStats = new CorpusStats(); + CoNLL2011DocumentReader reader = new CoNLL2011DocumentReader(filepath, options); + int docCnt = 0; + int sentCnt = 0; + int tokenCnt = 0; + for (Document doc; (doc = reader.getNextDocument()) != null; ) { + corpusStats.process(doc); + docCnt++; + Annotation anno = doc.getAnnotation(); + if (debug) 
System.out.println("Document " + docCnt + ": " + anno.get(CoreAnnotations.DocIDAnnotation.class)); + for (CoreMap sentence:anno.get(CoreAnnotations.SentencesAnnotation.class)) { + if (debug) System.out.println("Parse: " + sentence.get(TreeCoreAnnotations.TreeAnnotation.class)); + if (debug) System.out.println("Sentence Tokens: " + StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), ",")); + writeTabSep(fout,sentence,doc.corefChainMap); + sentCnt++; + tokenCnt += sentence.get(CoreAnnotations.TokensAnnotation.class).size(); + } + if (debug) { + for (CoreMap ner:doc.nerChunks) { + System.out.println("NER Chunk: " + ner); + } + for (String id:doc.corefChainMap.keySet()) { + System.out.println("Coref: " + id + " = " + StringUtils.join(doc.corefChainMap.get(id), ";")); + } + } + } + fout.close(); + System.out.println("Total document count: " + docCnt); + System.out.println("Total sentence count: " + sentCnt); + System.out.println("Total token count: " + tokenCnt); + System.out.println(corpusStats); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CoNLLMentionExtractor.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CoNLLMentionExtractor.java new file mode 100644 index 0000000..03acdde --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CoNLLMentionExtractor.java @@ -0,0 +1,276 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.logging.Level; +import java.util.logging.Logger; + +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.dcoref.Semantics; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; +import edu.stanford.nlp.semgraph.SemanticGraphFactory; +import edu.stanford.nlp.util.CollectionValuedMap; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Pair; + +/** + * Extracts coref mentions from a CoNLL2011 data files + * @author Angel Chang + */ +public class CoNLLMentionExtractor extends MentionExtractor { + + private final CoNLL2011DocumentReader reader; + private final String corpusPath; + private final boolean replicateCoNLL; + + private static final Logger logger = SieveCoreferenceSystem.logger; + + public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception { + super(dict, semantics); + + // Initialize reader for reading from 
CONLL2011 corpus + corpusPath = props.getProperty(Constants.CONLL2011_PROP); + replicateCoNLL = Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false")); + + CoNLL2011DocumentReader.Options options = new CoNLL2011DocumentReader.Options(); + options.annotateTokenCoref = false; + options.annotateTokenSpeaker = Constants.USE_GOLD_SPEAKER_TAGS || replicateCoNLL; + options.annotateTokenNer = Constants.USE_GOLD_NE || replicateCoNLL; + options.annotateTokenPos = Constants.USE_GOLD_POS || replicateCoNLL; + if (Constants.USE_CONLL_AUTO) options.setFilter(".*_auto_conll$"); + reader = new CoNLL2011DocumentReader(corpusPath, options); + + stanfordProcessor = loadStanfordProcessor(props); + } + + public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics semantics, + LogisticClassifier singletonModel) throws Exception { + this(dict, props, semantics); + singletonPredictor = singletonModel; + } + + private final boolean collapse = true; + private final boolean ccProcess = false; + private final boolean includeExtras = false; + private final boolean lemmatize = true; + private final boolean threadSafe = true; + + + public void resetDocs() { + super.resetDocs(); + reader.reset(); + } + + @Override + public Document nextDoc() throws Exception { + List> allWords = new ArrayList>(); + List allTrees = new ArrayList(); + + CoNLL2011DocumentReader.Document conllDoc = reader.getNextDocument(); + if (conllDoc == null) { + return null; + } + + Annotation anno = conllDoc.getAnnotation(); + List sentences = anno.get(CoreAnnotations.SentencesAnnotation.class); + for (CoreMap sentence:sentences) { + if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) { + // Remove tree from annotation and replace with parse using stanford parser + sentence.remove(TreeCoreAnnotations.TreeAnnotation.class); + } else { + Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + // generate the dependency graph + try { + SemanticGraph deps = 
SemanticGraphFactory.makeFromTree(tree, + collapse, ccProcess, includeExtras, lemmatize, threadSafe); + SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(tree, + !collapse, ccProcess, includeExtras, lemmatize, threadSafe); + sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps); + sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps); + } catch(Exception e) { + logger.log(Level.WARNING, "Exception caught during extraction of Stanford dependencies. Will ignore and continue...", e); + } + } + } + + String preSpeaker = null; + String curSpeaker = null; + int utterance = -1; + for (CoreLabel token:anno.get(CoreAnnotations.TokensAnnotation.class)) { + if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) { + token.set(CoreAnnotations.SpeakerAnnotation.class, ""); + } + curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class); + if(!curSpeaker.equals(preSpeaker)) { + utterance++; + preSpeaker = curSpeaker; + } + token.set(CoreAnnotations.UtteranceAnnotation.class, utterance); + } + + // Run pipeline + stanfordProcessor.annotate(anno); + + for (CoreMap sentence:anno.get(CoreAnnotations.SentencesAnnotation.class)) { + allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class)); + allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class)); + } + + // Initialize gold mentions + List> allGoldMentions = extractGoldMentions(conllDoc); + + List> allPredictedMentions; + if (Constants.USE_GOLD_MENTIONS) { + //allPredictedMentions = allGoldMentions; + // Make copy of gold mentions since mentions may be later merged, mentionID's changed and stuff + allPredictedMentions = makeCopy(allGoldMentions); + } else if (Constants.USE_GOLD_MENTION_BOUNDARIES) { + allPredictedMentions = ((RuleBasedCorefMentionFinder) mentionFinder).filterPredictedMentions(allGoldMentions, anno, dictionaries); + } else { + allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, 
dictionaries); + } + + try { + recallErrors(allGoldMentions,allPredictedMentions,anno); + } catch (IOException e) { + throw new RuntimeException(e); + } + Document doc = arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true); + doc.conllDoc = conllDoc; + return doc; + } + + public List> makeCopy(List> mentions) { + List> copy = new ArrayList>(mentions.size()); + for (List sm:mentions) { + List sm2 = new ArrayList(sm.size()); + for (Mention m:sm) { + Mention m2 = new Mention(); + m2.goldCorefClusterID = m.goldCorefClusterID; + m2.mentionID = m.mentionID; + m2.startIndex = m.startIndex; + m2.endIndex = m.endIndex; + m2.originalSpan = m.originalSpan; + m2.dependency = m.dependency; + sm2.add(m2); + } + copy.add(sm2); + } + return copy; + } + + private static void recallErrors(List> goldMentions, List> predictedMentions, Annotation doc) throws IOException { + List coreMaps = doc.get(CoreAnnotations.SentencesAnnotation.class); + int numSentences = goldMentions.size(); + for (int i=0;i words = coreMap.get(CoreAnnotations.TokensAnnotation.class); + Tree tree = coreMap.get(TreeCoreAnnotations.TreeAnnotation.class); + List goldMentionsSent = goldMentions.get(i); + List> goldMentionsSpans = extractSpans(goldMentionsSent); + + for (Pair mentionSpan: goldMentionsSpans){ + logger.finer("RECALL ERROR\n"); + logger.finer(coreMap + "\n"); + for (int x=mentionSpan.first;x> extractSpans(List listOfMentions) { + List> mentionSpans = new ArrayList>(); + for (Mention mention: listOfMentions){ + Pair mentionSpan = new Pair(mention.startIndex,mention.endIndex); + mentionSpans.add(mentionSpan); + } + return mentionSpans; + } + + public List> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) { + List sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class); + List> allGoldMentions = new ArrayList>(); + CollectionValuedMap corefChainMap = conllDoc.getCorefChainMap(); + for (int i = 0; i < sentences.size(); i++) { + 
allGoldMentions.add(new ArrayList()); + } + int maxCorefClusterId = -1; + for (String corefIdStr:corefChainMap.keySet()) { + int id = Integer.parseInt(corefIdStr); + if (id > maxCorefClusterId) { + maxCorefClusterId = id; + } + } + int newMentionID = maxCorefClusterId + 1; + for (String corefIdStr:corefChainMap.keySet()) { + int id = Integer.parseInt(corefIdStr); + int clusterMentionCnt = 0; + for (CoreMap m:corefChainMap.get(corefIdStr)) { + clusterMentionCnt++; + Mention mention = new Mention(); + + mention.goldCorefClusterID = id; + if (clusterMentionCnt == 1) { + // First mention in cluster + mention.mentionID = id; + mention.originalRef = -1; + } else { + mention.mentionID = newMentionID; + mention.originalRef = id; + newMentionID++; + } + if(maxID < mention.mentionID) maxID = mention.mentionID; + int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class); + CoreMap sent = sentences.get(sentIndex); + mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); + mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); + + // will be set by arrange + mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class); + + // Mention dependency is collapsed dependency for sentence + mention.dependency = sentences.get(sentIndex).get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + + allGoldMentions.get(sentIndex).add(mention); + } + } + return allGoldMentions; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Constants.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Constants.java new file mode 100644 index 0000000..1325905 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Constants.java @@ -0,0 +1,151 @@ +package edu.stanford.nlp.dcoref; + +import 
java.util.logging.Logger; + +public class Constants { + + protected Constants() {} // static class but extended by jcoref + + /** if true, use truecase annotator */ + public static final boolean USE_TRUECASE = false; + + /** if true, use gold speaker tags */ + public static final boolean USE_GOLD_SPEAKER_TAGS = false; + + /** if false, use Stanford NER to predict NE labels */ + public static final boolean USE_GOLD_NE = false; + + /** if false, use Stanford parse to parse */ + public static final boolean USE_GOLD_PARSES = false; + + /** if false, use Stanford tagger to tag */ + public static final boolean USE_GOLD_POS = false; + + /** if false, use mention prediction */ + public static final boolean USE_GOLD_MENTIONS = false; + + /** if true, use given mention boundaries */ + public static final boolean USE_GOLD_MENTION_BOUNDARIES = false; + + /** Flag for using discourse salience */ + public static final boolean USE_DISCOURSE_SALIENCE = true; + + /** Use person attributes in pronoun matching */ + public static final boolean USE_DISCOURSE_CONSTRAINTS = true; + + /** if true, remove appositives, predicate nominatives in post processing */ + public static final boolean REMOVE_APPOSITION_PREDICATENOMINATIVES = true; + + /** if true, remove singletons in post processing */ + public static final boolean REMOVE_SINGLETONS = true; + + /** if true, read *auto_conll, if false, read *gold_conll */ + public static final boolean USE_CONLL_AUTO = true; + + /** if true, print in conll output format */ + public static final boolean PRINT_CONLL_OUTPUT = false; + + /** Default path for conll scorer script */ + public static final String conllMentionEvalScript = "/scr/nlp/data/conll-2011/scorer/v4/scorer.pl"; + + /** if true, skip coreference resolution. 
do mention detection only */ + public static final boolean SKIP_COREF = false; + + /** Default sieve passes */ + public static final String SIEVEPASSES = "MarkRole, DiscourseMatch, ExactStringMatch, RelaxedExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, RelaxedHeadMatch, PronounMatch"; + + /** Use gender list (Bergsma and Lin, 2006; Ji and Lin, 2009) */ + public static final boolean USE_GENDER_LIST = true; + + /** Use number list (Bergsma and Lin, 2006; Ji and Lin, 2009) */ + public static final boolean USE_NUMBER_LIST = true; + + /** Use animacy list (Bergsma and Lin, 2006; Ji and Lin, 2009) */ + public static final boolean USE_ANIMACY_LIST = true; + + /** Share attributes between coreferent mentions **/ + public static final boolean SHARE_ATTRIBUTES = true; + + public static final String STATES_PROP = "dcoref.states"; + public static final String DEMONYM_PROP = "dcoref.demonym"; + public static final String ANIMATE_PROP = "dcoref.animate"; + public static final String INANIMATE_PROP = "dcoref.inanimate"; + public static final String MALE_PROP = "dcoref.male"; + public static final String NEUTRAL_PROP = "dcoref.neutral"; + public static final String FEMALE_PROP = "dcoref.female"; + public static final String PLURAL_PROP = "dcoref.plural"; + public static final String SINGULAR_PROP = "dcoref.singular"; + public static final String SIEVES_PROP = "dcoref.sievePasses"; + public static final String MENTION_FINDER_PROP = "dcoref.mentionFinder"; + public static final String MENTION_FINDER_PROPFILE_PROP = "dcoref.mentionFinder.props"; + public static final String SCORE_PROP = "dcoref.score"; + public static final String LOG_PROP = "dcoref.logFile"; + public static final String ACE2004_PROP = "dcoref.ace2004"; + public static final String ACE2005_PROP = "dcoref.ace2005"; + public static final String MUC_PROP = "dcoref.muc"; + public static final String CONLL2011_PROP = "dcoref.conll2011"; + public static final 
String CONLL_OUTPUT_PROP = "dcoref.conll.output"; + public static final String CONLL_SCORER = "dcoref.conll.scorer"; + public static final String PARSER_MODEL_PROP = "parse.model"; + public static final String PARSER_MAXLEN_PROP = "parse.maxlen"; + public static final String POSTPROCESSING_PROP = "dcoref.postprocessing"; + public static final String MAXDIST_PROP = "dcoref.maxdist"; + public static final String REPLICATECONLL_PROP = "dcoref.replicate.conll"; + public static final String BIG_GENDER_NUMBER_PROP = "dcoref.use.big.gender.number"; + public static final String GENDER_NUMBER_PROP = "dcoref.big.gender.number"; + public static final String COUNTRIES_PROP = "dcoref.countries"; + public static final String STATES_PROVINCES_PROP = "dcoref.states.provinces"; + public static final String EXTRA_GENDER_PROP = "dcoref.extra.gender"; + public static final String OPTIMIZE_SIEVES_PROP = "dcoref.optimize.sieves"; + public static final String OPTIMIZE_SIEVES_KEEP_ORDER_PROP = "dcoref.optimize.sieves.keepOrder"; + public static final String OPTIMIZE_SIEVES_SCORE_PROP = "dcoref.optimize.sieves.score"; + public static final String RUN_DIST_CMD_PROP = "dcoref.dist.cmd"; + public static final String RUN_DIST_CMD_WORK_DIR = "dcoref.dist.workdir"; + public static final String SCORE_FILE_PROP = "dcoref.score.output"; + public static final String SINGLETON_PROP = "dcoref.singleton.predictor"; + + public static final int MONITOR_DIST_CMD_FINISHED_WAIT_MILLIS = 60000; + + // + // note that default paths for all dictionaries used are in + // pipeline.DefaultPaths + // + + /** print the values of variables in this class */ + public static void printConstants(Logger logger) { + if (Constants.USE_ANIMACY_LIST) logger.info("USE_ANIMACY_LIST on"); + else logger.info("USE_ANIMACY_LIST off"); + if (Constants.USE_GENDER_LIST) logger.info("USE_GENDER_LIST on"); + else logger.info("USE_GENDER_LIST off"); + if (Constants.USE_NUMBER_LIST) logger.info("USE_NUMBER_LIST on"); + else 
logger.info("USE_NUMBER_LIST off"); + if (Constants.USE_ANIMACY_LIST) logger.info("USE_ANIMACY_LIST on"); + else logger.info("USE_ANIMACY_LIST off"); + if (Constants.USE_DISCOURSE_SALIENCE) logger.info("use discourse salience"); + else logger.info("not use discourse salience"); + if (Constants.USE_TRUECASE) logger.info("use truecase annotator"); + else logger.info("not use truecase annotator"); + if (Constants.USE_DISCOURSE_CONSTRAINTS) logger.info("USE_DISCOURSE_CONSTRAINTS on"); + else logger.info("USE_DISCOURSE_CONSTRAINTS off"); + if (Constants.USE_GOLD_POS) logger.info("USE_GOLD_POS on"); + else logger.info("USE_GOLD_POS off"); + if (Constants.USE_GOLD_NE) logger.info("use gold NE type annotation"); + else logger.info("use Stanford NER"); + if (Constants.USE_GOLD_PARSES) logger.info("USE_GOLD_PARSES on"); + else logger.info("USE_GOLD_PARSES off"); + if (Constants.USE_GOLD_SPEAKER_TAGS) logger.info("USE_GOLD_SPEAKER_TAGS on"); + else logger.info("USE_GOLD_SPEAKER_TAGS off"); + if (Constants.USE_GOLD_MENTIONS) logger.info("USE_GOLD_MENTIONS on"); + else logger.info("USE_GOLD_MENTIONS off"); + if (Constants.USE_GOLD_MENTION_BOUNDARIES) logger.info("USE_GOLD_MENTION_BOUNDARIES on"); + else logger.info("USE_GOLD_MENTION_BOUNDARIES off"); + if (Constants.USE_CONLL_AUTO) logger.info("use conll auto set -> if GOLD_NE, GOLD_PARSE, GOLD_POS, etc turned on, use auto"); + else logger.info("use conll gold set -> if GOLD_NE, GOLD_PARSE, GOLD_POS, etc turned on, use gold"); + if (Constants.REMOVE_SINGLETONS) logger.info("REMOVE_SINGLETONS on"); + else logger.info("REMOVE_SINGLETONS off"); + if (Constants.REMOVE_APPOSITION_PREDICATENOMINATIVES) logger.info("REMOVE_APPOSITION_PREDICATENOMINATIVES on"); + else logger.info("REMOVE_APPOSITION_PREDICATENOMINATIVES off"); + logger.info("================================================================="); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefChain.java 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefChain.java new file mode 100644 index 0000000..da8e41a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefChain.java @@ -0,0 +1,318 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.Serializable; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.stanford.nlp.dcoref.CorefCoreAnnotations; +import edu.stanford.nlp.dcoref.Dictionaries.Animacy; +import edu.stanford.nlp.dcoref.Dictionaries.Gender; +import edu.stanford.nlp.dcoref.Dictionaries.MentionType; +import edu.stanford.nlp.dcoref.Dictionaries.Number; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.IntPair; +import edu.stanford.nlp.util.IntTuple; + +/** + * Output of coref system. Each CorefChain represents a set of + * entries in the text which should all correspond to the same actual + * entity. There is a representative mention, which stores the best + * mention of an entity, and then there is a sequence of other + * mentions which connect to that mention. 
+ * + * @author Heeyoung Lee + */ +public class CorefChain implements Serializable { + + private final int chainID; + private final List mentions; + private final Map> mentionMap; + + /** The most representative mention in this cluster */ + private CorefMention representative = null; + + @Override + public boolean equals(Object aThat) { + if (this == aThat) + return true; + if (!(aThat instanceof CorefChain)) + return false; + CorefChain that = (CorefChain) aThat; + if (chainID != that.chainID) + return false; + if (!mentions.equals(that.mentions)) + return false; + if ((representative == null && that.representative != null) || + (representative != null && that.representative == null) || + (!representative.equals(that.representative))) { + return false; + } + // mentionMap is another view of mentions, so no need to compare + // that once we've compared mentions + return true; + } + + @Override + public int hashCode() { + return mentions.hashCode(); + } + + /** get List of CorefMentions */ + public List getMentionsInTextualOrder() { return mentions; } + + /** get CorefMentions by position (sentence number, headIndex) Can be multiple mentions sharing headword */ + public Set getMentionsWithSameHead(IntPair position) { return mentionMap.get(position); } + + /** get CorefMention by position */ + public Set getMentionsWithSameHead(int sentenceNumber, int headIndex) { + return mentionMap.get(new IntPair(sentenceNumber, headIndex)); + } + + public Map> getMentionMap() { return mentionMap; } + + /** Return the most representative mention in the chain. + * Proper mention and a mention with more pre-modifiers are preferred. + */ + public CorefMention getRepresentativeMention() { return representative; } + public int getChainID() { return chainID; } + + /** Mention for coref output. This is one instance of the entity + * referred to by a given CorefChain. 
*/ + public static class CorefMention implements Serializable { + public final MentionType mentionType; + public final Number number; + public final Gender gender; + public final Animacy animacy; + + /** + * Starting word number, indexed from 1 + */ + public final int startIndex; + /** + * One past the end word number, indexed from 1 + */ + public final int endIndex; + /** + * Head word of the mention + */ + public final int headIndex; + public final int corefClusterID; + public final int mentionID; + /** + * Sentence number in the document containing this mention, + * indexed from 1. + */ + public final int sentNum; + /** + * Position is a binary tuple of (sentence number, mention number + * in that sentence). This is used for indexing by mention. + */ + public final IntTuple position; + public final String mentionSpan; + + public CorefMention(MentionType mentionType, + Number number, + Gender gender, + Animacy animacy, + int startIndex, + int endIndex, + int headIndex, + int corefClusterID, + int mentionID, + int sentNum, + IntTuple position, + String mentionSpan) { + this.mentionType = mentionType; + this.number = number; + this.gender = gender; + this.animacy = animacy; + this.startIndex = startIndex; + this.endIndex = endIndex; + this.headIndex = headIndex; + this.corefClusterID = corefClusterID; + this.mentionID = mentionID; + this.sentNum = sentNum; + this.position = position; + this.mentionSpan = mentionSpan; + } + + public CorefMention(Mention m, IntTuple pos){ + mentionType = m.mentionType; + number = m.number; + gender = m.gender; + animacy = m.animacy; + startIndex = m.startIndex + 1; + endIndex = m.endIndex + 1; + headIndex = m.headIndex + 1; + corefClusterID = m.corefClusterID; + sentNum = m.sentNum + 1; + mentionID = m.mentionID; + mentionSpan = m.spanToString(); + + // index starts from 1 + position = new IntTuple(2); + position.set(0, pos.get(0)+1); + position.set(1, pos.get(1)+1); + + 
m.headWord.set(CorefCoreAnnotations.CorefClusterIdAnnotation.class, corefClusterID); + } + + @Override + public boolean equals(Object aThat) { + if (this == aThat) + return true; + if (!(aThat instanceof CorefMention)) + return false; + CorefMention that = (CorefMention) aThat; + if (mentionType != that.mentionType) + return false; + if (number != that.number) + return false; + if (gender != that.gender) + return false; + if (animacy != that.animacy) + return false; + if (startIndex != that.startIndex) + return false; + if (endIndex != that.endIndex) + return false; + if (headIndex != that.headIndex) + return false; + if (corefClusterID != that.corefClusterID) + return false; + if (mentionID != that.mentionID) + return false; + if (sentNum != that.sentNum) + return false; + if (!position.equals(that.position)) + return false; + // we ignore MentionSpan as it is constructed from the tokens + // the mention is a span of, so if we know those spans are the + // same, we should be able to ignore the actual text + return true; + } + + @Override + public int hashCode() { + return position.hashCode(); + } + + @Override + public String toString(){ + StringBuilder s = new StringBuilder(); + s.append("\"").append(mentionSpan).append("\"").append(" in sentence ").append(sentNum); + return s.toString(); + // return "(sentence:" + sentNum + ", startIndex:" + startIndex + "-endIndex:" + endIndex + ")"; + } + private boolean moreRepresentativeThan(CorefMention m){ + if(m==null) return true; + if(mentionType!=m.mentionType) { + if((mentionType==MentionType.PROPER && m.mentionType!=MentionType.PROPER) + || (mentionType==MentionType.NOMINAL && m.mentionType==MentionType.PRONOMINAL)) return true; + else return false; + } else { + // First, check length + if (headIndex - startIndex > m.headIndex - m.startIndex) return true; + if (headIndex - startIndex < m.headIndex - m.startIndex) return false; + if (endIndex - startIndex > m.endIndex - m.startIndex) return true; + if (endIndex - 
startIndex < m.endIndex - m.startIndex) return false; + // Now check relative position + if (sentNum < m.sentNum) return true; + if (sentNum > m.sentNum) return false; + if (headIndex < m.headIndex) return true; + if (headIndex > m.headIndex) return false; + if (startIndex < m.startIndex) return true; + if (startIndex > m.startIndex) return false; + // At this point they're equal... + return false; + } + } + + private static final long serialVersionUID = 3657691243504173L; + } + + protected static class MentionComparator implements Comparator { + public int compare(CorefMention m1, CorefMention m2) { + if(m1.sentNum < m2.sentNum) return -1; + else if(m1.sentNum > m2.sentNum) return 1; + else{ + if(m1.startIndex < m2.startIndex) return -1; + else if(m1.startIndex > m2.startIndex) return 1; + else { + if(m1.endIndex > m2.endIndex) return -1; + else if(m1.endIndex < m2.endIndex) return 1; + else return 0; + } + } + } + } + public CorefChain(CorefCluster c, Map positions){ + chainID = c.clusterID; + mentions = new ArrayList(); + mentionMap = Generics.newHashMap(); + for (Mention m : c.getCorefMentions()) { + CorefMention men = new CorefMention(m, positions.get(m)); + mentions.add(men); + IntPair position = new IntPair(men.sentNum, men.headIndex); + if(!mentionMap.containsKey(position)) mentionMap.put(position, Generics.newHashSet()); + mentionMap.get(position).add(men); + if(men.moreRepresentativeThan(representative)) representative = men; + } + Collections.sort(mentions, new MentionComparator()); + } + + /** Constructor required by CustomAnnotationSerializer */ + public CorefChain(int cid, + Map> mentionMap, + CorefMention representative) { + this.chainID = cid; + this.representative = representative; + this.mentionMap = mentionMap; + this.mentions = new ArrayList(); + for(Set ms: mentionMap.values()){ + for(CorefMention m: ms) { + this.mentions.add(m); + } + } + Collections.sort(mentions, new MentionComparator()); + } + + public String toString(){ + return 
"CHAIN"+this.chainID+ "-" +mentions.toString(); + } + + private static final long serialVersionUID = 3657691243506528L; + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefCluster.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefCluster.java new file mode 100644 index 0000000..06df4a4 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefCluster.java @@ -0,0 +1,179 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.Serializable; +import java.util.EnumSet; +import java.util.Set; +import java.util.TreeMap; +import java.util.logging.Logger; + +import edu.stanford.nlp.dcoref.Dictionaries.Animacy; +import edu.stanford.nlp.dcoref.Dictionaries.Gender; +import edu.stanford.nlp.dcoref.Dictionaries.Number; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.util.Generics; + +/** + * One cluster for the SieveCoreferenceSystem. + * + * @author Heeyoung Lee + */ +public class CorefCluster implements Serializable{ + + private static final long serialVersionUID = 8655265337578515592L; + + protected final Set corefMentions; + protected final int clusterID; + + // Attributes for cluster - can include multiple attribute e.g., {singular, plural} + protected final Set numbers; + protected final Set genders; + protected final Set animacies; + protected final Set nerStrings; + protected final Set heads; + + /** All words in this cluster - for word inclusion feature */ + public final Set words; + + /** The first mention in this cluster */ + protected Mention firstMention; + + /** Return the most representative mention in the chain. + * A proper noun mention or a mention with more pre-modifiers is preferred. 
+ */ + protected Mention representative; + + public int getClusterID(){ return clusterID; } + public Set getCorefMentions() { return corefMentions; } + public Mention getFirstMention() { return firstMention; } + public Mention getRepresentativeMention() { return representative; } + + public CorefCluster(int ID) { + clusterID = ID; + corefMentions = Generics.newHashSet(); + numbers = EnumSet.noneOf(Number.class); + genders = EnumSet.noneOf(Gender.class); + animacies = EnumSet.noneOf(Animacy.class); + nerStrings = Generics.newHashSet(); + heads = Generics.newHashSet(); + words = Generics.newHashSet(); + firstMention = null; + representative = null; + } + + public CorefCluster(int ID, Set mentions){ + this(ID); + corefMentions.addAll(mentions); + for (Mention m : mentions) { + animacies.add(m.animacy); + genders.add(m.gender); + numbers.add(m.number); + nerStrings.add(m.nerString); + heads.add(m.headString); + if(!m.isPronominal()){ + for(CoreLabel w : m.originalSpan){ + words.add(w.get(CoreAnnotations.TextAnnotation.class).toLowerCase()); + } + } + if (firstMention == null) firstMention = m; + else { + if(m.appearEarlierThan(firstMention)) firstMention = m; + } + } + representative = firstMention; + for (Mention m : mentions) { + if(m.moreRepresentativeThan(representative)) representative = m; + } + } + + /** merge 2 clusters: to = to + from */ + public static void mergeClusters(CorefCluster to, CorefCluster from) { + int toID = to.clusterID; + for (Mention m : from.corefMentions){ + m.corefClusterID = toID; + } + if(Constants.SHARE_ATTRIBUTES){ + to.numbers.addAll(from.numbers); + if(to.numbers.size() > 1 && to.numbers.contains(Number.UNKNOWN)) { + to.numbers.remove(Number.UNKNOWN); + } + + to.genders.addAll(from.genders); + if(to.genders.size() > 1 && to.genders.contains(Gender.UNKNOWN)) { + to.genders.remove(Gender.UNKNOWN); + } + + to.animacies.addAll(from.animacies); + if(to.animacies.size() > 1 && to.animacies.contains(Animacy.UNKNOWN)) { + 
to.animacies.remove(Animacy.UNKNOWN); + } + + to.nerStrings.addAll(from.nerStrings); + if(to.nerStrings.size() > 1 && to.nerStrings.contains("O")) { + to.nerStrings.remove("O"); + } + if(to.nerStrings.size() > 1 && to.nerStrings.contains("MISC")) { + to.nerStrings.remove("MISC"); + } + } + + to.heads.addAll(from.heads); + to.corefMentions.addAll(from.corefMentions); + to.words.addAll(from.words); + if(from.firstMention.appearEarlierThan(to.firstMention) && !from.firstMention.isPronominal()) to.firstMention = from.firstMention; + if(from.representative.moreRepresentativeThan(to.representative)) to.representative = from.representative; + SieveCoreferenceSystem.logger.finer("merge clusters: "+toID+" += "+from.clusterID); + } + + /** Print cluster information */ + public void printCorefCluster(Logger logger){ + logger.finer("Cluster ID: "+clusterID+"\tNumbers: "+numbers+"\tGenders: "+genders+"\tanimacies: "+animacies); + logger.finer("NE: "+nerStrings+"\tfirst Mention's ID: "+firstMention.mentionID+"\tHeads: "+heads+"\twords: "+words); + TreeMap forSortedPrint = new TreeMap(); + for(Mention m : this.corefMentions){ + forSortedPrint.put(m.mentionID, m); + } + for(Mention m : forSortedPrint.values()){ + if(m.goldCorefClusterID==-1){ + logger.finer("mention-> id:"+m.mentionID+"\toriginalRef: "+m.originalRef+"\t"+m.spanToString() +"\tsentNum: "+m.sentNum+"\tstartIndex: "+m.startIndex); + } else{ + logger.finer("mention-> id:"+m.mentionID+"\toriginalClusterID: "+m.goldCorefClusterID+"\t"+m.spanToString() +"\tsentNum: "+m.sentNum+"\tstartIndex: "+m.startIndex +"\toriginalRef: "+m.originalRef+"\tType: "+m.mentionType); + } + } + } + + public boolean isSinglePronounCluster(Dictionaries dict){ + if(this.corefMentions.size() > 1) return false; + for(Mention m : this.corefMentions) { + if(m.isPronominal() || dict.allPronouns.contains(m.spanToString().toLowerCase())) return true; + } + return false; + } + +} diff --git 
a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefCoreAnnotations.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefCoreAnnotations.java new file mode 100644 index 0000000..89ee205 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefCoreAnnotations.java @@ -0,0 +1,89 @@ +package edu.stanford.nlp.dcoref; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.stanford.nlp.ling.CoreAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.util.ErasureUtils; +import edu.stanford.nlp.util.IntTuple; +import edu.stanford.nlp.util.Pair; + +/** + * Similar to {@link edu.stanford.nlp.ling.CoreAnnotations}, + * but this class contains + * annotations made specifically for storing Coref data. This is kept + * separate from CoreAnnotations so that systems which only need + * CoreAnnotations do not depend on Coref classes. + */ +public class CorefCoreAnnotations { + + /** + * the standard key for the coref label. + * not used by the new dcoref system. + */ + public static class CorefAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * Destination of the coreference link for this word (if any): it + * contains the index of the sentence and the index of the word that + * are the end of this coref link Both indices start at 1 The + * sentence index is IntTuple.get(0); the token index in the + * sentence is IntTuple.get(1) + */ + public static class CorefDestAnnotation implements CoreAnnotation { + public Class getType() { + return IntTuple.class; + } + } + + /** + * This stores the entire set of coreference links for one + * document. Each link is stored as a pair of pointers (source and + * destination), where each pointer stores a sentence offset and a + * token offset. All offsets start at 0. 
+ */ + public static class CorefGraphAnnotation implements CoreAnnotation>> { + public Class>> getType() { + return ErasureUtils.uncheckedCast(List.class); + } + } + + /** + * An integer representing a document-level unique cluster of + * coreferent entities. In other words, if two entities have the + * same CorefClusterIdAnnotation, they are coreferent. This + * annotation is typically attached to tokens (CoreLabel). + */ + public static class CorefClusterIdAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * Set of all the CoreLabel objects which are coreferent with a + * CoreLabel. Note that the list includes the CoreLabel that was + * annotated which creates a cycle. + */ + public static class CorefClusterAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.uncheckedCast(Set.class); + } + } + + /** + * CorefChainID - CorefChain map + */ + public static class CorefChainAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.uncheckedCast(Map.class); + } + } + +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefMentionFinder.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefMentionFinder.java new file mode 100644 index 0000000..586b5f4 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefMentionFinder.java @@ -0,0 +1,23 @@ +package edu.stanford.nlp.dcoref; + +import edu.stanford.nlp.pipeline.Annotation; + +import java.util.List; + +/** + * Interface for finding coref mentions in a document. + * + * @author Angel Chang + */ +public interface CorefMentionFinder { + + /** Get all the predicted mentions for a document. + * + * @param doc The syntactically annotated document + * @param maxGoldID The last mention ID assigned. New ones are assigned starting one above this number. 
+ * @param dict Dictionaries for coref. + * @return For each of the List of sentences in the document, a List of Mention objects + */ + public List> extractPredictedMentions(Annotation doc, int maxGoldID, Dictionaries dict); + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefScorer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefScorer.java new file mode 100644 index 0000000..2906e76 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/CorefScorer.java @@ -0,0 +1,99 @@ +package edu.stanford.nlp.dcoref; + +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.logging.Logger; + +/** + * Wrapper for a coreference resolution score: MUC, B cubed, Pairwise. + */ +public abstract class CorefScorer { + + enum SubScoreType {Recall, Precision, F1} + enum ScoreType { MUC, BCubed, Pairwise } + + double precisionNumSum; + double precisionDenSum; + double recallNumSum; + double recallDenSum; + private final ScoreType scoreType; + + CorefScorer(ScoreType st) { + scoreType = st; + precisionNumSum = 0.0; + precisionDenSum = 0.0; + recallNumSum = 0.0; + recallDenSum = 0.0; + } + + public double getScore(SubScoreType subScoreType) { + switch (subScoreType) { + case Precision: + return getPrecision(); + case Recall: + return getRecall(); + case F1: + return getF1(); + default: + throw new IllegalArgumentException("Unsupported subScoreType: " + subScoreType); + } + } + + public double getPrecision() { + return precisionDenSum == 0.0 ? 0.0: precisionNumSum/precisionDenSum; + } + + public double getRecall() { + return recallDenSum == 0.0 ? 0.0: recallNumSum/recallDenSum; + } + + public double getF1() { + double p = getPrecision(); + double r = getRecall(); + return (p + r == 0.0) ? 
0.0: 2.0 * p * r / (p + r); + } + + public void calculateScore(Document doc) { + calculatePrecision(doc); + calculateRecall(doc); + } + + protected abstract void calculatePrecision(Document doc); + protected abstract void calculateRecall(Document doc); + + public void printF1(Logger logger, boolean printF1First) { + NumberFormat nf = new DecimalFormat("0.0000"); + + double r = getRecall(); + double p = getPrecision(); + double f1 = getF1(); + + String R = nf.format(r); + String P = nf.format(p); + String F1 = nf.format(f1); + + NumberFormat nf2 = new DecimalFormat("00.0"); + + String RR = nf2.format(r*100); + String PP = nf2.format(p*100); + String F1F1 = nf2.format(f1*100); + + if (printF1First) { + String str = "F1 = "+F1+", P = "+P+" ("+(int) precisionNumSum+"/"+(int) precisionDenSum+"), R = "+R+" ("+(int) recallNumSum+"/"+(int) recallDenSum+")"; + if(scoreType == ScoreType.Pairwise){ + logger.fine("Pairwise "+str); + } else if(scoreType == ScoreType.BCubed){ + logger.fine("B cubed "+str); + } else { + logger.fine("MUC "+str); + } + } else { + logger.fine("& "+PP+" & "+RR + " & "+F1F1); + } + } + + public void printF1(Logger logger) { + printF1(logger, true); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Dictionaries.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Dictionaries.java new file mode 100644 index 0000000..184e42c --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Dictionaries.java @@ -0,0 +1,523 @@ +package edu.stanford.nlp.dcoref; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.pipeline.DefaultPaths; +import 
edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; + +public class Dictionaries { + + public enum MentionType { PRONOMINAL, NOMINAL, PROPER } + + public enum Gender { MALE, FEMALE, NEUTRAL, UNKNOWN } + + public enum Number { SINGULAR, PLURAL, UNKNOWN } + public enum Animacy { ANIMATE, INANIMATE, UNKNOWN } + public enum Person { I, YOU, HE, SHE, WE, THEY, IT, UNKNOWN} + + public final Set reportVerb = Generics.newHashSet(Arrays.asList( + "accuse", "acknowledge", "add", "admit", "advise", "agree", "alert", + "allege", "announce", "answer", "apologize", "argue", + "ask", "assert", "assure", "beg", "blame", "boast", + "caution", "charge", "cite", "claim", "clarify", "command", "comment", + "compare", "complain", "concede", "conclude", "confirm", "confront", "congratulate", + "contend", "contradict", "convey", "counter", "criticize", + "debate", "decide", "declare", "defend", "demand", "demonstrate", "deny", + "describe", "determine", "disagree", "disclose", "discount", "discover", "discuss", + "dismiss", "dispute", "disregard", "doubt", "emphasize", "encourage", "endorse", + "equate", "estimate", "expect", "explain", "express", "extoll", "fear", "feel", + "find", "forbid", "forecast", "foretell", "forget", "gather", "guarantee", "guess", + "hear", "hint", "hope", "illustrate", "imagine", "imply", "indicate", "inform", + "insert", "insist", "instruct", "interpret", "interview", "invite", "issue", + "justify", "learn", "maintain", "mean", "mention", "negotiate", "note", + "observe", "offer", "oppose", "order", "persuade", "pledge", "point", "point out", + "praise", "pray", "predict", "prefer", "present", "promise", "prompt", "propose", + "protest", "prove", "provoke", "question", "quote", "raise", "rally", "read", + "reaffirm", "realise", "realize", "rebut", "recall", "reckon", "recommend", "refer", + "reflect", "refuse", "refute", "reiterate", "reject", "relate", 
"remark", + "remember", "remind", "repeat", "reply", "report", "request", "respond", + "restate", "reveal", "rule", "say", "see", "show", "signal", "sing", + "slam", "speculate", "spoke", "spread", "state", "stipulate", "stress", + "suggest", "support", "suppose", "surmise", "suspect", "swear", "teach", + "tell", "testify", "think", "threaten", "told", "uncover", "underline", + "underscore", "urge", "voice", "vow", "warn", "welcome", + "wish", "wonder", "worry", "write")); + + public final Set reportNoun = Generics.newHashSet(Arrays.asList( + "acclamation", "account", "accusation", "acknowledgment", "address", "addressing", + "admission", "advertisement", "advice", "advisory", "affidavit", "affirmation", "alert", + "allegation", "analysis", "anecdote", "annotation", "announcement", "answer", "antiphon", + "apology", "applause", "appreciation", "argument", "arraignment", "article", "articulation", + "aside", "assertion", "asseveration", "assurance", "attestation", "attitude", + "averment", "avouchment", "avowal", "axiom", "backcap", "band-aid", "basic", "belief", "bestowal", + "bill", "blame", "blow-by-blow", "bomb", "book", "bow", "break", "breakdown", "brief", "briefing", + "broadcast", "broadcasting", "bulletin", "buzz", "cable", "calendar", "call", "canard", "canon", + "card", "cause", "censure", "certification", "characterization", "charge", "chat", "chatter", + "chitchat", "chronicle", "chronology", "citation", "claim", "clarification", "close", "cognizance", + "comeback", "comment", "commentary", "communication", "communique", "composition", "concept", + "concession", "conference", "confession", "confirmation", "conjecture", "connotation", "construal", + "construction", "consultation", "contention", "contract", "convention", "conversation", "converse", + "conviction", "counterclaim", "credenda", "creed", "critique", + "cry", "declaration", "defense", "definition", "delineation", "delivery", "demonstration", + "denial", "denotation", "depiction", "deposition", 
"description", "detail", "details", "detention", + "dialogue", "diction", "dictum", "digest", "directive", "disclosure", "discourse", "discovery", + "discussion", "dispatch", "display", "disquisition", "dissemination", "dissertation", "divulgence", + "dogma", "editorial", "ejaculation", "emphasis", "enlightenment", + "enunciation", "essay", "evidence", "examination", "example", "excerpt", "exclamation", + "excuse", "execution", "exegesis", "explanation", "explication", "exposing", "exposition", "expounding", + "expression", "eye-opener", "feedback", "fiction", "findings", "fingerprint", "flash", "formulation", + "fundamental", "gift", "gloss", "goods", "gospel", "gossip", "gratitude", "greeting", + "guarantee", "hail", "hailing", "handout", "hash", "headlines", "hearing", "hearsay", + "ideas", "idiom", "illustration", "impeachment", "implantation", "implication", "imputation", + "incrimination", "indication", "indoctrination", "inference", "info", "information", + "innuendo", "insinuation", "insistence", "instruction", "intelligence", "interpretation", "interview", + "intimation", "intonation", "issue", "item", "itemization", "justification", "key", "knowledge", + "leak", "letter", "locution", "manifesto", + "meaning", "meeting", "mention", "message", "missive", "mitigation", "monograph", "motive", "murmur", + "narration", "narrative", "news", "nod", "note", "notice", "notification", "oath", "observation", + "okay", "opinion", "oral", "outline", "paper", "parley", "particularization", "phrase", "phraseology", + "phrasing", "picture", "piece", "pipeline", "pitch", "plea", "plot", "portraiture", "portrayal", + "position", "potboiler", "prating", "precept", "prediction", "presentation", "presentment", "principle", + "proclamation", "profession", "program", "promulgation", "pronouncement", "pronunciation", "propaganda", + "prophecy", "proposal", "proposition", "prosecution", "protestation", "publication", "publicity", + "publishing", "quotation", "ratification", 
"reaction", "reason", "rebuttal", "receipt", "recital", + "recitation", "recognition", "record", "recount", "recountal", "refutation", "regulation", "rehearsal", + "rejoinder", "relation", "release", "remark", "rendition", "repartee", "reply", "report", "reporting", + "representation", "resolution", "response", "result", "retort", "return", "revelation", "review", + "rule", "rumble", "rumor", "rundown", "saying", "scandal", "scoop", + "scuttlebutt", "sense", "showing", "sign", "signature", "significance", "sketch", "skinny", "solution", + "speaking", "specification", "speech", "statement", "story", "study", "style", "suggestion", + "summarization", "summary", "summons", "tale", "talk", "talking", "tattle", "telecast", + "telegram", "telling", "tenet", "term", "testimonial", "testimony", "text", "theme", "thesis", + "tract", "tractate", "tradition", "translation", "treatise", "utterance", "vent", "ventilation", + "verbalization", "version", "vignette", "vindication", "warning", + "warrant", "whispering", "wire", "word", "work", "writ", "write-up", "writeup", "writing", + "acceptance", "complaint", "concern", "disappointment", "disclose", "estimate", "laugh", "pleasure", "regret", + "resentment", "view")); + + public final Set nonWords = Generics.newHashSet(Arrays.asList("mm", "hmm", "ahem", "um")); + public final Set copulas = Generics.newHashSet(Arrays.asList("is","are","were", "was","be", "been","become","became","becomes","seem","seemed","seems","remain","remains","remained")); + public final Set quantifiers = Generics.newHashSet(Arrays.asList("not","every","any","none","everything","anything","nothing","all","enough")); + public final Set parts = 
Generics.newHashSet(Arrays.asList("half","one","two","three","four","five","six","seven","eight","nine","ten","hundred","thousand","million","billion","tens","dozens","hundreds","thousands","millions","billions","group","groups","bunch","number","numbers","pinch","amount","amount","total","all","mile","miles","pounds")); + public final Set temporals = Generics.newHashSet(Arrays.asList( + "second", "minute", "hour", "day", "week", "month", "year", "decade", "century", "millennium", + "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "now", + "yesterday", "tomorrow", "age", "time", "era", "epoch", "morning", "evening", "day", "night", "noon", "afternoon", + "semester", "trimester", "quarter", "term", "winter", "spring", "summer", "fall", "autumn", "season", + "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december")); + + + public final Set femalePronouns = Generics.newHashSet(Arrays.asList(new String[]{ "her", "hers", "herself", "she" })); + public final Set malePronouns = Generics.newHashSet(Arrays.asList(new String[]{ "he", "him", "himself", "his" })); + public final Set neutralPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "it", "its", "itself", "where", "here", "there", "which" })); + public final Set possessivePronouns = Generics.newHashSet(Arrays.asList(new String[]{ "my", "your", "his", "her", "its","our","their","whose" })); + public final Set otherPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "who", "whom", "whose", "where", "when","which" })); + public final Set thirdPersonPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "he", "him", "himself", "his", "she", "her", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's", "they", "them", "themself", "themselves", "theirs", "their", "they", "them", "'em", "themselves" })); + public final Set secondPersonPronouns = Generics.newHashSet(Arrays.asList(new String[]{ 
"you", "yourself", "yours", "your", "yourselves" })); + public final Set firstPersonPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "i", "me", "myself", "mine", "my", "we", "us", "ourself", "ourselves", "ours", "our" })); + public final Set moneyPercentNumberPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "it", "its" })); + public final Set dateTimePronouns = Generics.newHashSet(Arrays.asList(new String[]{ "when" })); + public final Set organizationPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "it", "its", "they", "their", "them", "which"})); + public final Set locationPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "it", "its", "where", "here", "there" })); + public final Set inanimatePronouns = Generics.newHashSet(Arrays.asList(new String[]{ "it", "itself", "its", "where", "when" })); + public final Set animatePronouns = Generics.newHashSet(Arrays.asList(new String[]{ "i", "me", "myself", "mine", "my", "we", "us", "ourself", "ourselves", "ours", "our", "you", "yourself", "yours", "your", "yourselves", "he", "him", "himself", "his", "she", "her", "herself", "hers", "her", "one", "oneself", "one's", "they", "them", "themself", "themselves", "theirs", "their", "they", "them", "'em", "themselves", "who", "whom", "whose" })); + public final Set indefinitePronouns = Generics.newHashSet(Arrays.asList(new String[]{"another", "anybody", "anyone", "anything", "each", "either", "enough", "everybody", "everyone", "everything", "less", "little", "much", "neither", "no one", "nobody", "nothing", "one", "other", "plenty", "somebody", "someone", "something", "both", "few", "fewer", "many", "others", "several", "all", "any", "more", "most", "none", "some", "such"})); + public final Set relativePronouns = Generics.newHashSet(Arrays.asList(new String[]{"that","who","which","whom","where","whose"})); + public final Set GPEPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "it", "itself", "its", "they","where" })); + public final 
Set pluralPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "we", "us", "ourself", "ourselves", "ours", "our", "yourself", "yourselves", "they", "them", "themself", "themselves", "theirs", "their" })); + public final Set singularPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "i", "me", "myself", "mine", "my", "yourself", "he", "him", "himself", "his", "she", "her", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's" })); + public final Set facilityVehicleWeaponPronouns = Generics.newHashSet(Arrays.asList(new String[]{ "it", "itself", "its", "they", "where" })); + public final Set miscPronouns = Generics.newHashSet(Arrays.asList(new String[]{"it", "itself", "its", "they", "where" })); + public final Set reflexivePronouns = Generics.newHashSet(Arrays.asList(new String[]{"myself", "yourself", "yourselves", "himself", "herself", "itself", "ourselves", "themselves", "oneself"})); + public final Set transparentNouns = Generics.newHashSet(Arrays.asList(new String[]{"bunch", "group", + "breed", "class", "ilk", "kind", "half", "segment", "top", "bottom", "glass", "bottle", + "box", "cup", "gem", "idiot", "unit", "part", "stage", "name", "division", "label", "group", "figure", + "series", "member", "members", "first", "version", "site", "side", "role", "largest", "title", "fourth", + "third", "second", "number", "place", "trio", "two", "one", "longest", "highest", "shortest", + "head", "resident", "collection", "result", "last" + })); + public final Set stopWords = Generics.newHashSet(Arrays.asList(new String[]{"a", "an", "the", "of", "at", + "on", "upon", "in", "to", "from", "out", "as", "so", "such", "or", "and", "those", "this", "these", "that", + "for", ",", "is", "was", "am", "are", "'s", "been", "were"})); + + public final Set notOrganizationPRP = Generics.newHashSet(Arrays.asList(new String[]{"i", "me", "myself", + "mine", "my", "yourself", "he", "him", "himself", "his", "she", "her", "herself", "hers", "here"})); + + public 
final Set quantifiers2 = Generics.newHashSet(Arrays.asList("all", "both", "neither", "either")); + public final Set determiners = Generics.newHashSet(Arrays.asList("the", "this", "that", "these", "those", "his", "her", "my", "your", "their", "our")); + public final Set negations = Generics.newHashSet(Arrays.asList("n't","not", "nor", "neither", "never", "no", "non", "any", "none", "nobody", "nothing", "nowhere", "nearly","almost", + "if", "false", "fallacy", "unsuccessfully", "unlikely", "impossible", "improbable", "uncertain", "unsure", "impossibility", "improbability", "cancellation", "breakup", "lack", + "long-stalled", "end", "rejection", "failure", "avoid", "bar", "block", "break", "cancel", "cease", "cut", "decline", "deny", "deprive", "destroy", "excuse", + "fail", "forbid", "forestall", "forget", "halt", "lose", "nullify", "prevent", "refrain", "reject", "rebut", "remain", "refuse", "stop", "suspend", "ward")); + public final Set neg_relations = Generics.newHashSet(Arrays.asList("prep_without", "prepc_without", "prep_except", "prepc_except", "prep_excluding", "prepx_excluding", + "prep_if", "prepc_if", "prep_whether", "prepc_whether", "prep_away_from", "prepc_away_from", "prep_instead_of", "prepc_instead_of")); + public final Set modals = Generics.newHashSet(Arrays.asList("can", "could", "may", "might", "must", "should", "would", "seem", + "able", "apparently", "necessarily", "presumably", "probably", "possibly", "reportedly", "supposedly", + "inconceivable", "chance", "impossibility", "improbability", "encouragement", "improbable", "impossible", + "likely", "necessary", "probable", "possible", "uncertain", "unlikely", "unsure", "likelihood", "probability", + "possibility", "eventual", "hypothetical" , "presumed", "supposed", "reported", "apparent")); + + public final Set personPronouns = Generics.newHashSet(); + public final Set allPronouns = Generics.newHashSet(); + + public final Map statesAbbreviation = Generics.newHashMap(); + public final Map> 
demonyms = Generics.newHashMap(); + public final Set demonymSet = Generics.newHashSet(); + public final Set adjectiveNation = Generics.newHashSet(); + + public final Set countries = Generics.newHashSet(); + public final Set statesAndProvinces = Generics.newHashSet(); + + public final Set neutralWords = Generics.newHashSet(); + public final Set femaleWords = Generics.newHashSet(); + public final Set maleWords = Generics.newHashSet(); + + public final Set pluralWords = Generics.newHashSet(); + public final Set singularWords = Generics.newHashSet(); + + public final Set inanimateWords = Generics.newHashSet(); + public final Set animateWords = Generics.newHashSet(); + + public final Map, int[]> genderNumber = Generics.newHashMap(); + + public final ArrayList>> corefDict = new ArrayList>>(4); + public final Counter> corefDictPMI = new ClassicCounter>(); + public final Map> NE_signatures = Generics.newHashMap(); + + private void setPronouns() { + for(String s: animatePronouns){ + personPronouns.add(s); + } + + allPronouns.addAll(firstPersonPronouns); + allPronouns.addAll(secondPersonPronouns); + allPronouns.addAll(thirdPersonPronouns); + allPronouns.addAll(otherPronouns); + + stopWords.addAll(allPronouns); + } + + public void loadStateAbbreviation(String statesFile) { + BufferedReader reader = null; + try { + reader = IOUtils.readerFromString(statesFile); + while(reader.ready()){ + String[] tokens = reader.readLine().split("\t"); + statesAbbreviation.put(tokens[1], tokens[0]); + statesAbbreviation.put(tokens[2], tokens[0]); + } + } catch (IOException e){ + throw new RuntimeIOException(e); + } finally { + IOUtils.closeIgnoringExceptions(reader); + } + } + + private void loadDemonymLists(String demonymFile) { + BufferedReader reader = null; + try { + reader = IOUtils.readerFromString(demonymFile); + while(reader.ready()){ + String[] line = reader.readLine().split("\t"); + if(line[0].startsWith("#")) continue; + Set set = Generics.newHashSet(); + for(String s : line){ + 
set.add(s.toLowerCase()); + demonymSet.add(s.toLowerCase()); + } + demonyms.put(line[0].toLowerCase(), set); + } + adjectiveNation.addAll(demonymSet); + adjectiveNation.removeAll(demonyms.keySet()); + } catch (IOException e){ + throw new RuntimeIOException(e); + } finally { + IOUtils.closeIgnoringExceptions(reader); + } + } + + private static void getWordsFromFile(String filename, Set resultSet, boolean lowercase) throws IOException { + BufferedReader reader = IOUtils.readerFromString(filename); + while(reader.ready()) { + if(lowercase) resultSet.add(reader.readLine().toLowerCase()); + else resultSet.add(reader.readLine()); + } + IOUtils.closeIgnoringExceptions(reader); + } + + private void loadAnimacyLists(String animateWordsFile, String inanimateWordsFile) { + try { + getWordsFromFile(animateWordsFile, animateWords, false); + getWordsFromFile(inanimateWordsFile, inanimateWords, false); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private void loadGenderLists(String maleWordsFile, String neutralWordsFile, String femaleWordsFile) { + try { + getWordsFromFile(maleWordsFile, maleWords, false); + getWordsFromFile(neutralWordsFile, neutralWords, false); + getWordsFromFile(femaleWordsFile, femaleWords, false); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private void loadNumberLists(String pluralWordsFile, String singularWordsFile) { + try { + getWordsFromFile(pluralWordsFile, pluralWords, false); + getWordsFromFile(singularWordsFile, singularWords, false); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + private void loadStatesLists(String file) { + try { + getWordsFromFile(file, statesAndProvinces, true); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private void loadCountriesLists(String file) { + try{ + BufferedReader reader = IOUtils.readerFromString(file); + while(reader.ready()) { + String line = reader.readLine(); + 
countries.add(line.split("\t")[1].toLowerCase()); + } + reader.close(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private void loadGenderNumber(String file){ + try { + BufferedReader reader = IOUtils.readerFromString(file); + String line; + while ((line = reader.readLine())!=null){ + String[] split = line.split("\t"); + List tokens = new ArrayList(Arrays.asList(split[0].split(" "))); + String[] countStr = split[1].split(" "); + int[] counts = new int[4]; + counts[0] = Integer.parseInt(countStr[0]); + counts[1] = Integer.parseInt(countStr[1]); + counts[2] = Integer.parseInt(countStr[2]); + counts[3] = Integer.parseInt(countStr[3]); + + genderNumber.put(tokens, counts); + } + reader.close(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + private void loadExtraGender(String file){ + BufferedReader reader = null; + try { + reader = IOUtils.readerFromString(file); + while(reader.ready()) { + String[] split = reader.readLine().split("\t"); + if(split[1].equals("MALE")) maleWords.add(split[0]); + else if(split[1].equals("FEMALE")) femaleWords.add(split[0]); + } + } catch (IOException e){ + throw new RuntimeIOException(e); + } finally { + IOUtils.closeIgnoringExceptions(reader); + } + } + + private static void loadCorefDict(String[] file, + ArrayList>> dict) { + + for(int i = 0; i < 4; i++){ + dict.add(new ClassicCounter>()); + + BufferedReader reader = null; + try { + reader = IOUtils.readerFromString(file[i]); + // Skip the first line (header) + reader.readLine(); + + while(reader.ready()) { + String[] split = reader.readLine().split("\t"); + dict.get(i).setCount(new Pair(split[0], split[1]), Double.parseDouble(split[2])); + } + + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + IOUtils.closeIgnoringExceptions(reader); + } + } + } + + private static void loadCorefDictPMI(String file, Counter> dict) { + + BufferedReader reader = null; + try { + reader = IOUtils.readerFromString(file); + 
// Skip the first line (header) + reader.readLine(); + + while(reader.ready()) { + String[] split = reader.readLine().split("\t"); + dict.setCount(new Pair(split[0], split[1]), Double.parseDouble(split[3])); + } + + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + IOUtils.closeIgnoringExceptions(reader); + } + } + + private static void loadSignatures(String file, Map> sigs) { + BufferedReader reader = null; + try { + reader = IOUtils.readerFromString(file); + + while(reader.ready()) { + String[] split = reader.readLine().split("\t"); + Counter cntr = new ClassicCounter(); + sigs.put(split[0], cntr); + for (int i = 1; i < split.length; i=i+2) { + cntr.setCount(split[i], Double.parseDouble(split[i+1])); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + IOUtils.closeIgnoringExceptions(reader); + } + } + + public Dictionaries(Properties props) { + this(props.getProperty(Constants.DEMONYM_PROP, DefaultPaths.DEFAULT_DCOREF_DEMONYM), + props.getProperty(Constants.ANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_ANIMATE), + props.getProperty(Constants.INANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_INANIMATE), + props.getProperty(Constants.MALE_PROP, DefaultPaths.DEFAULT_DCOREF_MALE), + props.getProperty(Constants.NEUTRAL_PROP, DefaultPaths.DEFAULT_DCOREF_NEUTRAL), + props.getProperty(Constants.FEMALE_PROP, DefaultPaths.DEFAULT_DCOREF_FEMALE), + props.getProperty(Constants.PLURAL_PROP, DefaultPaths.DEFAULT_DCOREF_PLURAL), + props.getProperty(Constants.SINGULAR_PROP, DefaultPaths.DEFAULT_DCOREF_SINGULAR), + props.getProperty(Constants.STATES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES), + props.getProperty(Constants.GENDER_NUMBER_PROP, DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER), + props.getProperty(Constants.COUNTRIES_PROP, DefaultPaths.DEFAULT_DCOREF_COUNTRIES), + props.getProperty(Constants.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES), + props.getProperty(Constants.EXTRA_GENDER_PROP, 
DefaultPaths.DEFAULT_DCOREF_EXTRA_GENDER), + Boolean.parseBoolean(props.getProperty(Constants.BIG_GENDER_NUMBER_PROP, "false")) || + Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false")), + props.getProperty(Constants.SIEVES_PROP, Constants.SIEVEPASSES).contains("CorefDictionaryMatch"), + new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2, + DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}, + DefaultPaths.DEFAULT_DCOREF_DICT1, + DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES); + } + + public static String signature(Properties props) { + StringBuilder os = new StringBuilder(); + os.append(Constants.DEMONYM_PROP + ":" + + props.getProperty(Constants.DEMONYM_PROP, + DefaultPaths.DEFAULT_DCOREF_DEMONYM)); + os.append(Constants.ANIMATE_PROP + ":" + + props.getProperty(Constants.ANIMATE_PROP, + DefaultPaths.DEFAULT_DCOREF_ANIMATE)); + os.append(Constants.INANIMATE_PROP + ":" + + props.getProperty(Constants.INANIMATE_PROP, + DefaultPaths.DEFAULT_DCOREF_INANIMATE)); + os.append(Constants.MALE_PROP + ":" + + props.getProperty(Constants.MALE_PROP, + DefaultPaths.DEFAULT_DCOREF_MALE)); + os.append(Constants.NEUTRAL_PROP + ":" + + props.getProperty(Constants.NEUTRAL_PROP, + DefaultPaths.DEFAULT_DCOREF_NEUTRAL)); + os.append(Constants.FEMALE_PROP + ":" + + props.getProperty(Constants.FEMALE_PROP, + DefaultPaths.DEFAULT_DCOREF_FEMALE)); + os.append(Constants.PLURAL_PROP + ":" + + props.getProperty(Constants.PLURAL_PROP, + DefaultPaths.DEFAULT_DCOREF_PLURAL)); + os.append(Constants.SINGULAR_PROP + ":" + + props.getProperty(Constants.SINGULAR_PROP, + DefaultPaths.DEFAULT_DCOREF_SINGULAR)); + os.append(Constants.STATES_PROP + ":" + + props.getProperty(Constants.STATES_PROP, + DefaultPaths.DEFAULT_DCOREF_STATES)); + os.append(Constants.GENDER_NUMBER_PROP + ":" + + props.getProperty(Constants.GENDER_NUMBER_PROP, + DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER)); + os.append(Constants.COUNTRIES_PROP + ":" + + 
props.getProperty(Constants.COUNTRIES_PROP, + DefaultPaths.DEFAULT_DCOREF_COUNTRIES)); + os.append(Constants.STATES_PROVINCES_PROP + ":" + + props.getProperty(Constants.STATES_PROVINCES_PROP, + DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES)); + os.append(Constants.EXTRA_GENDER_PROP + ":" + + props.getProperty(Constants.EXTRA_GENDER_PROP, + DefaultPaths.DEFAULT_DCOREF_EXTRA_GENDER)); + os.append(Constants.BIG_GENDER_NUMBER_PROP + ":" + + props.getProperty(Constants.BIG_GENDER_NUMBER_PROP, + "false")); + os.append(Constants.REPLICATECONLL_PROP + ":" + + props.getProperty(Constants.REPLICATECONLL_PROP, + "false")); + return os.toString(); + } + + public Dictionaries( + String demonymWords, + String animateWords, + String inanimateWords, + String maleWords, + String neutralWords, + String femaleWords, + String pluralWords, + String singularWords, + String statesWords, + String genderNumber, + String countries, + String states, + String extraGender, + boolean loadBigGenderNumber, + boolean loadCorefDict, + String[] corefDictFiles, + String corefDictPMIFile, + String signaturesFile) { + loadDemonymLists(demonymWords); + loadStateAbbreviation(statesWords); + if(Constants.USE_ANIMACY_LIST) loadAnimacyLists(animateWords, inanimateWords); + if(Constants.USE_GENDER_LIST) loadGenderLists(maleWords, neutralWords, femaleWords); + if(Constants.USE_NUMBER_LIST) loadNumberLists(pluralWords, singularWords); + if(loadBigGenderNumber) loadGenderNumber(genderNumber); + loadCountriesLists(countries); + loadStatesLists(states); + loadExtraGender(extraGender); + setPronouns(); + if(loadCorefDict){ + loadCorefDict(corefDictFiles, corefDict); + loadCorefDictPMI(corefDictPMIFile, corefDictPMI); + loadSignatures(signaturesFile, NE_signatures); + } + } + + public Dictionaries() { + this(new Properties()); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Document.java 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Document.java new file mode 100644 index 0000000..20989b5 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Document.java @@ -0,0 +1,748 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.stanford.nlp.dcoref.Dictionaries.Number; +import edu.stanford.nlp.dcoref.Dictionaries.Person; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.trees.GrammaticalRelation; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; +import edu.stanford.nlp.util.CollectionValuedMap; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.IntPair; +import edu.stanford.nlp.util.IntTuple; +import edu.stanford.nlp.util.Pair; + +public class Document implements Serializable { + + private static final long serialVersionUID = -4139866807494603953L; + + public enum DocType { CONVERSATION, ARTICLE } + + /** The type of document: conversational or article */ + public DocType docType; + + /** Document annotation */ + public Annotation annotation; + + /** for conll shared task 2011 */ + public CoNLL2011DocumentReader.Document conllDoc; + + /** The list of gold mentions */ + public List> goldOrderedMentionsBySentence; + /** The list of predicted mentions */ + public List> predictedOrderedMentionsBySentence; + + /** return the list of predicted mentions */ + public List> getOrderedMentions() { + return predictedOrderedMentionsBySentence; + } + + /** Clusters for coreferent mentions */ + public Map corefClusters; + + /** Gold Clusters for coreferent mentions */ + public Map goldCorefClusters; + + /** All 
mentions in a document mentionID -> mention*/ + public Map allPredictedMentions; + public Map allGoldMentions; + + /** Set of roles (in role apposition) in a document */ + public Set roleSet; + + /** + * Position of each mention in the input matrix + * Each mention occurrence with sentence # and position within sentence + * (Nth mention, not Nth token) + */ + public Map positions; + + public final Map mentionheadPositions; + + /** List of gold links in a document by positions */ + private List> goldLinks; + + /** UtteranceAnnotation -> String (speaker): mention ID or speaker string */ + public Map speakers; + + /** mention ID pair */ + public Set> speakerPairs; + + public int maxUtter; + public int numParagraph; + public int numSentences; + + /** Set of incompatible mention pairs */ + public Set> incompatibles; + + public Document() { + positions = Generics.newHashMap(); + mentionheadPositions = Generics.newHashMap(); + roleSet = Generics.newHashSet(); + corefClusters = Generics.newHashMap(); + goldCorefClusters = null; + allPredictedMentions = Generics.newHashMap(); + allGoldMentions = Generics.newHashMap(); + speakers = Generics.newHashMap(); + speakerPairs = Generics.newHashSet(); + incompatibles = Generics.newHashSet(); + } + + public Document(Annotation anno, List> predictedMentions, + List> goldMentions, Dictionaries dict) { + this(); + annotation = anno; + numSentences = anno.get(CoreAnnotations.SentencesAnnotation.class).size(); + predictedOrderedMentionsBySentence = predictedMentions; + goldOrderedMentionsBySentence = goldMentions; + if(goldMentions!=null) { + findTwinMentions(true); + // fill allGoldMentions + for(List l : goldOrderedMentionsBySentence) { + for(Mention g : l) { + allGoldMentions.put(g.mentionID, g); + } + } + } + // set original ID, initial coref clusters, paragraph annotation, mention positions + initialize(); + processDiscourse(dict); + printMentionDetection(); + } + + /** Process discourse information */ + protected void 
processDiscourse(Dictionaries dict) { + docType = findDocType(dict); + markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false); + findSpeakers(dict); + + // find 'speaker mention' for each mention + for(Mention m : allPredictedMentions.values()) { + int utter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class); + try{ + int speakerMentionID = Integer.parseInt(m.headWord.get(CoreAnnotations.SpeakerAnnotation.class)); + if (utter != 0) { + speakerPairs.add(new Pair(m.mentionID, speakerMentionID)); + speakerPairs.add(new Pair(speakerMentionID, m.mentionID)); + } + } catch (Exception e){ + // no mention found for the speaker + // nothing to do + } + // set generic 'you' : e.g., you know in conversation + if(docType!=DocType.ARTICLE && m.person==Person.YOU && m.endIndex < m.sentenceWords.size()-1 + && m.sentenceWords.get(m.endIndex).get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("know")) { + m.generic = true; + } + } + } + + /** Document initialize */ + protected void initialize() { + if(goldOrderedMentionsBySentence==null) assignOriginalID(); + setParagraphAnnotation(); + initializeCorefCluster(); + } + + /** initialize positions and corefClusters (put each mention in each CorefCluster) */ + private void initializeCorefCluster() { + for(int i = 0; i < predictedOrderedMentionsBySentence.size(); i ++){ + for(int j = 0; j < predictedOrderedMentionsBySentence.get(i).size(); j ++){ + Mention m = predictedOrderedMentionsBySentence.get(i).get(j); + if (allPredictedMentions.containsKey(m.mentionID)) { + SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID); + Mention m1 = allPredictedMentions.get(m.mentionID); + SieveCoreferenceSystem.logger.warning("OLD mention: " + m1.spanToString() + "[" + m1.startIndex + "," + m1.endIndex + "]"); + SieveCoreferenceSystem.logger.warning("NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]"); + // 
SieveCoreferenceSystem.debugPrintMentions(System.err, "PREDICTED ORDERED", predictedOrderedMentionsBySentence); +// SieveCoreferenceSystem.debugPrintMentions(System.err, "GOLD ORDERED", goldOrderedMentionsBySentence); + } + assert(!allPredictedMentions.containsKey(m.mentionID)); + allPredictedMentions.put(m.mentionID, m); + + IntTuple pos = new IntTuple(2); + pos.set(0, i); + pos.set(1, j); + positions.put(m, pos); + m.sentNum = i; + + assert(!corefClusters.containsKey(m.mentionID)); + corefClusters.put(m.mentionID, new CorefCluster(m.mentionID, Generics.newHashSet(Arrays.asList(m)))); + m.corefClusterID = m.mentionID; + + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, i); + headPosition.set(1, m.headIndex); + mentionheadPositions.put(headPosition, m); + } + } + } + + /** Mark twin mentions in gold and predicted mentions */ + protected void findTwinMentions(boolean strict){ + if(strict) findTwinMentionsStrict(); + else findTwinMentionsRelaxed(); + } + + /** Mark twin mentions: All mention boundaries should be matched */ + private void findTwinMentionsStrict(){ + for(int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) { + List golds = goldOrderedMentionsBySentence.get(sentNum); + List predicts = predictedOrderedMentionsBySentence.get(sentNum); + + // For CoNLL training there are some documents with gold mentions with the same position offsets + // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll + // (Packwood - Roth) + CollectionValuedMap goldMentionPositions = new CollectionValuedMap(); + for(Mention g : golds) { + IntPair ip = new IntPair(g.startIndex, g.endIndex); + if (goldMentionPositions.containsKey(ip)) { + StringBuilder existingMentions = new StringBuilder(); + for (Mention eg: goldMentionPositions.get(ip)) { + if (existingMentions.length() > 0) { + existingMentions.append(","); + } + existingMentions.append(eg.mentionID); + } + 
SieveCoreferenceSystem.logger.warning("WARNING: gold mentions with the same offsets: " + ip + + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString()); + } + //assert(!goldMentionPositions.containsKey(ip)); + goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g); + } + for(Mention p : predicts) { + IntPair pos = new IntPair(p.startIndex, p.endIndex); + if(goldMentionPositions.containsKey(pos)) { + Collection cm = goldMentionPositions.get(pos); + Mention g = cm.iterator().next(); + cm.remove(g); + p.mentionID = g.mentionID; + p.twinless = false; + g.twinless = false; + } + } + // temp: for making easy to recognize twinless mention + for(Mention p : predicts){ + if(p.twinless) p.mentionID += 10000; + } + } + } + + /** Mark twin mentions: heads of the mentions are matched */ + private void findTwinMentionsRelaxed() { + for(int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) { + List golds = goldOrderedMentionsBySentence.get(sentNum); + List predicts = predictedOrderedMentionsBySentence.get(sentNum); + + Map goldMentionPositions = Generics.newHashMap(); + Map> goldMentionHeadPositions = Generics.newHashMap(); + for(Mention g : golds) { + goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g); + if(!goldMentionHeadPositions.containsKey(g.headIndex)) { + goldMentionHeadPositions.put(g.headIndex, new LinkedList()); + } + goldMentionHeadPositions.get(g.headIndex).add(g); + } + + List remains = new ArrayList(); + for (Mention p : predicts) { + IntPair pos = new IntPair(p.startIndex, p.endIndex); + if(goldMentionPositions.containsKey(pos)) { + Mention g = goldMentionPositions.get(pos); + p.mentionID = g.mentionID; + p.twinless = false; + g.twinless = false; + goldMentionHeadPositions.get(g.headIndex).remove(g); + if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) { + goldMentionHeadPositions.remove(g.headIndex); + } + } + else remains.add(p); + } + for (Mention r : remains){ + 
if(goldMentionHeadPositions.containsKey(r.headIndex)) { + Mention g = goldMentionHeadPositions.get(r.headIndex).poll(); + r.mentionID = g.mentionID; + r.twinless = false; + g.twinless = false; + if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) { + goldMentionHeadPositions.remove(g.headIndex); + } + } + } + } + } + + /** Set paragraph index */ + private void setParagraphAnnotation() { + int paragraphIndex = 0; + int previousOffset = -10; + for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) { + if(w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) { + if(w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset+2) paragraphIndex++; + w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex); + previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + } else { + w.set(CoreAnnotations.ParagraphAnnotation.class, -1); + } + } + } + for(List l : predictedOrderedMentionsBySentence) { + for(Mention m : l){ + m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class); + } + } + numParagraph = paragraphIndex; + } + + /** Find document type: Conversation or article */ + private DocType findDocType(Dictionaries dict) { + boolean speakerChange = false; + Set discourseWithIorYou = Generics.newHashSet(); + + for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) { + int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class); + if(utterIndex!=0) speakerChange = true; + if(speakerChange && utterIndex==0) return DocType.ARTICLE; + if(dict.firstPersonPronouns.contains(w.get(CoreAnnotations.TextAnnotation.class).toLowerCase()) + || dict.secondPersonPronouns.contains(w.get(CoreAnnotations.TextAnnotation.class).toLowerCase())) { + discourseWithIorYou.add(utterIndex); + } + if(maxUtter < utterIndex) maxUtter 
= utterIndex; + } + } + if(!speakerChange) return DocType.ARTICLE; + return DocType.CONVERSATION; // in conversation, utter index keep increasing. + } + + /** When there is no mentionID information (without gold annotation), assign mention IDs */ + protected void assignOriginalID(){ + List> orderedMentionsBySentence = this.getOrderedMentions(); + boolean hasOriginalID = true; + for(List l : orderedMentionsBySentence){ + if (l.size()==0) continue; + for(Mention m : l){ + if(m.mentionID == -1){ + hasOriginalID = false; + } + } + } + if(!hasOriginalID){ + int id = 0; + for(List l : orderedMentionsBySentence){ + for(Mention m : l){ + m.mentionID = id++; + } + } + } + } + + /** Extract gold coref cluster information. */ + public void extractGoldCorefClusters(){ + goldCorefClusters = Generics.newHashMap(); + for (List mentions : goldOrderedMentionsBySentence) { + for (Mention m : mentions) { + int id = m.goldCorefClusterID; + if (id == -1) { + throw new RuntimeException("No gold info"); + } + CorefCluster c = goldCorefClusters.get(id); + if (c == null) { + c = new CorefCluster(id); + goldCorefClusters.put(id, c); + } + c.corefMentions.add(m); + } + } + } + + protected List> getGoldLinks() { + if(goldLinks==null) this.extractGoldLinks(); + return goldLinks; + } + + /** Extract gold coref link information */ + protected void extractGoldLinks() { + // List> orderedMentionsBySentence = this.getOrderedMentions(); + List> links = new ArrayList>(); + + // position of each mention in the input matrix, by id + Map positions = Generics.newHashMap(); + // positions of antecedents + Map> antecedents = Generics.newHashMap(); + for(int i = 0; i < goldOrderedMentionsBySentence.size(); i ++){ + for(int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j ++){ + Mention m = goldOrderedMentionsBySentence.get(i).get(j); + int id = m.mentionID; + IntTuple pos = new IntTuple(2); + pos.set(0, i); + pos.set(1, j); + positions.put(id, pos); + antecedents.put(id, new ArrayList()); + } + } + 
+// SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence); + for (List mentions : goldOrderedMentionsBySentence) { + for (Mention m : mentions) { + int id = m.mentionID; + IntTuple src = positions.get(id); + + assert (src != null); + if (m.originalRef >= 0) { + IntTuple dst = positions.get(m.originalRef); + if (dst == null) { + throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef); + } + + // to deal with cataphoric annotation + while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) { + Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1)); + m.originalRef = dstMention.originalRef; + dstMention.originalRef = id; + + if (m.originalRef < 0) break; + dst = positions.get(m.originalRef); + } + if (m.originalRef < 0) continue; + + // A B C: if A<-B, A<-C => make a link B<-C + for (int k = dst.get(0); k <= src.get(0); k++) { + for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) { + if (k == dst.get(0) && l < dst.get(1)) continue; + if (k == src.get(0) && l > src.get(1)) break; + IntTuple missed = new IntTuple(2); + missed.set(0, k); + missed.set(1, l); + if (links.contains(new Pair(missed, dst))) { + antecedents.get(id).add(missed); + links.add(new Pair(src, missed)); + } + } + } + + links.add(new Pair(src, dst)); + + assert (antecedents.get(id) != null); + antecedents.get(id).add(dst); + + List ants = antecedents.get(m.originalRef); + assert (ants != null); + for (IntTuple ant : ants) { + antecedents.get(id).add(ant); + links.add(new Pair(src, ant)); + } + } + } + } + goldLinks = links; + } + + /** set UtteranceAnnotation for quotations: default UtteranceAnnotation = 0 is given */ + private void markQuotations(List results, boolean normalQuotationType) { + boolean insideQuotation = false; + for(CoreMap m : results) { + for(CoreLabel l : m.get(CoreAnnotations.TokensAnnotation.class)) { + String w = 
l.get(CoreAnnotations.TextAnnotation.class); + + boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class) + || l.get(CoreAnnotations.SpeakerAnnotation.class).equals("") + || l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER"); + + if(w.equals("``") + || (!insideQuotation && normalQuotationType && w.equals("\""))) { + insideQuotation = true; + maxUtter++; + continue; + } else if(w.equals("''") + || (insideQuotation && normalQuotationType && w.equals("\""))) { + insideQuotation = false; + } + if(insideQuotation) { + l.set(CoreAnnotations.UtteranceAnnotation.class, maxUtter); + } + if(noSpeakerInfo){ + l.set(CoreAnnotations.SpeakerAnnotation.class, "PER"+l.get(CoreAnnotations.UtteranceAnnotation.class)); + } + } + } + if(maxUtter==0 && !normalQuotationType) markQuotations(results, true); + } + + /** Speaker extraction */ + private void findSpeakers(Dictionaries dict) { + if(Constants.USE_GOLD_SPEAKER_TAGS) { + for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) { + int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class); + speakers.put(utterIndex, w.get(CoreAnnotations.SpeakerAnnotation.class)); + } + } + } else { + if(docType==DocType.CONVERSATION) findSpeakersInConversation(dict); + else if (docType==DocType.ARTICLE) findSpeakersInArticle(dict); + + // set speaker info to annotation + for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) { + int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class); + if(speakers.containsKey(utterIndex)) { + w.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterIndex)); + } + } + } + } + } + private void findSpeakersInArticle(Dictionaries dict) { + List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); + Pair beginQuotation = new Pair(); + Pair endQuotation 
= new Pair(); + boolean insideQuotation = false; + int utterNum = -1; + + for (int i = 0 ; i < sentences.size(); i++) { + List sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class); + for(int j = 0 ; j < sent.size() ; j++) { + int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class); + + if(utterIndex != 0 && !insideQuotation) { + utterNum = utterIndex; + insideQuotation = true; + beginQuotation.setFirst(i); + beginQuotation.setSecond(j); + } else if (utterIndex == 0 && insideQuotation) { + insideQuotation = false; + endQuotation.setFirst(i); + endQuotation.setSecond(j); + findQuotationSpeaker(utterNum, sentences, beginQuotation, endQuotation, dict); + } + } + } + } + + private void findQuotationSpeaker(int utterNum, List sentences, + Pair beginQuotation, Pair endQuotation, Dictionaries dict) { + + if(findSpeaker(utterNum, beginQuotation.first(), sentences, 0, beginQuotation.second(), dict)) + return ; + + if(findSpeaker(utterNum, endQuotation.first(), sentences, endQuotation.second(), + sentences.get(endQuotation.first()).get(CoreAnnotations.TokensAnnotation.class).size(), dict)) + return; + + if(beginQuotation.second() <= 1 && beginQuotation.first() > 0) { + if(findSpeaker(utterNum, beginQuotation.first()-1, sentences, 0, + sentences.get(beginQuotation.first()-1).get(CoreAnnotations.TokensAnnotation.class).size(), dict)) + return; + } + + if(endQuotation.second() == sentences.get(endQuotation.first()).size()-1 + && sentences.size() > endQuotation.first()+1) { + if(findSpeaker(utterNum, endQuotation.first()+1, sentences, 0, + sentences.get(endQuotation.first()+1).get(CoreAnnotations.TokensAnnotation.class).size(), dict)) + return; + } + } + + private boolean findSpeaker(int utterNum, int sentNum, List sentences, + int startIndex, int endIndex, Dictionaries dict) { + List sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class); + for(int i = startIndex ; i < endIndex ; i++) { + 
if(sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class)!=0) continue; + String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class); + String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class); + if(dict.reportVerb.contains(lemma)) { + // find subject + SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + IndexedWord w = dependency.getNodeByWordPattern(word); + + if (w != null) { + for(Pair child : dependency.childPairs(w)){ + if(child.first().getShortName().equals("nsubj")) { + String subjectString = child.second().word(); + int subjectIndex = child.second().index(); // start from 1 + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, sentNum); + headPosition.set(1, subjectIndex-1); + String speaker; + if(mentionheadPositions.containsKey(headPosition)) { + speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); + } else { + speaker = subjectString; + } + speakers.put(utterNum, speaker); + return true; + } + } + } else { + SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word); + } + } + } + return false; + } + + private void findSpeakersInConversation(Dictionaries dict) { + for(List l : predictedOrderedMentionsBySentence) { + for(Mention m : l){ + if(m.predicateNominatives == null) continue; + for (Mention a : m.predicateNominatives){ + if(a.spanToString().toLowerCase().equals("i")) { + speakers.put(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), Integer.toString(m.mentionID)); + } + } + } + } + List paragraph = new ArrayList(); + int paragraphUtterIndex = 0; + String nextParagraphSpeaker = ""; + int paragraphOffset = 0; + for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + int currentUtter = sent.get(CoreAnnotations.TokensAnnotation.class).get(0).get(CoreAnnotations.UtteranceAnnotation.class); + if(paragraphUtterIndex!=currentUtter) { + nextParagraphSpeaker = 
findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict); + paragraphUtterIndex = currentUtter; + paragraphOffset += paragraph.size(); + paragraph = new ArrayList(); + } + paragraph.add(sent); + } + findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict); + } + + private String findParagraphSpeaker(List paragraph, + int paragraphUtterIndex, String nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) { + if(!speakers.containsKey(paragraphUtterIndex)) { + if(!nextParagraphSpeaker.equals("")) { + speakers.put(paragraphUtterIndex, nextParagraphSpeaker); + } else { // find the speaker of this paragraph (John, nbc news) + CoreMap lastSent = paragraph.get(paragraph.size()-1); + String speaker = ""; + boolean hasVerb = false; + for(int i = 0 ; i < lastSent.get(CoreAnnotations.TokensAnnotation.class).size() ; i++){ + CoreLabel w = lastSent.get(CoreAnnotations.TokensAnnotation.class).get(i); + String pos = w.get(CoreAnnotations.PartOfSpeechAnnotation.class); + String ner = w.get(CoreAnnotations.NamedEntityTagAnnotation.class); + if(pos.startsWith("V")) { + hasVerb = true; + break; + } + if(ner.startsWith("PER")) { + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, paragraph.size()-1 + paragraphOffset); + headPosition.set(1, i); + if(mentionheadPositions.containsKey(headPosition)) { + speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); + } + } + } + if(!hasVerb && !speaker.equals("")) { + speakers.put(paragraphUtterIndex, speaker); + } + } + } + return findNextParagraphSpeaker(paragraph, paragraphOffset, dict); + } + + private String findNextParagraphSpeaker(List paragraph, int paragraphOffset, Dictionaries dict) { + CoreMap lastSent = paragraph.get(paragraph.size()-1); + String speaker = ""; + for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) { + if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || 
w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) { + String word = w.get(CoreAnnotations.TextAnnotation.class); + SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + IndexedWord t = dependency.getNodeByWordPattern(word); + + for(Pair child : dependency.childPairs(t)){ + if(child.first().getShortName().equals("nsubj")) { + int subjectIndex = child.second().index(); // start from 1 + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, paragraph.size()-1 + paragraphOffset); + headPosition.set(1, subjectIndex-1); + if(mentionheadPositions.containsKey(headPosition) + && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) { + speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); + } + } + } + } + } + return speaker; + } + + /** Check one mention is the speaker of the other mention */ + public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) { + + if(!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase()) + || ant.number==Number.PLURAL || ant.sentNum!=m.sentNum) return false; + + int countQuotationMark = 0; + for(int i = Math.min(m.headIndex, ant.headIndex)+1 ; i < Math.max(m.headIndex, ant.headIndex) ; i++) { + String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class); + if(word.equals("``") || word.equals("''")) countQuotationMark++; + } + if(countQuotationMark!=1) return false; + + IndexedWord w = m.dependency.getNodeByWordPattern(m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class)); + if(w== null) return false; + + for(Pair parent : m.dependency.parentPairs(w)){ + if(parent.first().getShortName().equals("nsubj") + && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) { + return true; + } + } + return false; + } + + protected void printMentionDetection() { + int foundGoldCount = 0; + for(Mention g : allGoldMentions.values()) { + 
if(!g.twinless) foundGoldCount++; + } + SieveCoreferenceSystem.logger.fine("# of found gold mentions: "+foundGoldCount + " / # of gold mentions: "+allGoldMentions.size()); + SieveCoreferenceSystem.logger.fine("gold mentions == "); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/MUCMentionExtractor.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/MUCMentionExtractor.java new file mode 100644 index 0000000..1a251cc --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/MUCMentionExtractor.java @@ -0,0 +1,294 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Stack; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.process.TokenizerFactory; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; + +/** + * Extracts {@literal } mentions from a file annotated in MUC format. 
+ * + * @author Jenny Finkel, Mihai Surdeanu, Karthik Raghunathan + */ +public class MUCMentionExtractor extends MentionExtractor { + + private TokenizerFactory tokenizerFactory; + + private String fileContents; + private int currentOffset; + + public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception { + super(dict, semantics); + String fileName = props.getProperty(Constants.MUC_PROP); + fileContents = IOUtils.slurpFile(fileName); + currentOffset = 0; + tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), ""); + stanfordProcessor = loadStanfordProcessor(props); + } + + public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics, + LogisticClassifier singletonModel) throws Exception { + this(dict, props, semantics); + singletonPredictor = singletonModel; + } + + @Override + public void resetDocs() { + super.resetDocs(); + currentOffset = 0; + } + + @Override + public Document nextDoc() throws Exception { + List> allWords = new ArrayList>(); + List allTrees = new ArrayList(); + List> allGoldMentions = new ArrayList>(); + List> allPredictedMentions; + List allSentences = new ArrayList(); + Annotation docAnno = new Annotation(""); + + Pattern docPattern = Pattern.compile("(.*?)", Pattern.DOTALL+Pattern.CASE_INSENSITIVE); + Pattern sentencePattern = Pattern.compile("(||

    |)(.*?)(||
    |)", Pattern.DOTALL+Pattern.CASE_INSENSITIVE); + Matcher docMatcher = docPattern.matcher(fileContents); + if (! docMatcher.find(currentOffset)) return null; + + currentOffset = docMatcher.end(); + String doc = docMatcher.group(1); + Matcher sentenceMatcher = sentencePattern.matcher(doc); + String ner = null; + + //Maintain current document ID. + Pattern docIDPattern = Pattern.compile("(.*?)", Pattern.DOTALL+Pattern.CASE_INSENSITIVE); + Matcher docIDMatcher = docIDPattern.matcher(doc); + if(docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1); + else currentDocumentID = "documentAfter " + currentDocumentID; + + while (sentenceMatcher.find()) { + String sentenceString = sentenceMatcher.group(2); + List words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize(); + + // FIXING TOKENIZATION PROBLEMS + for (int i = 0; i < words.size(); i++) { + CoreLabel w = words.get(i); + if (i > 0 && w.word().equals("$")) { + if(!words.get(i-1).word().endsWith("PRP") && !words.get(i-1).word().endsWith("WP")) + continue; + words.get(i-1).set(CoreAnnotations.TextAnnotation.class, words.get(i-1).word()+"$"); + words.remove(i); + i--; + } else if (w.word().equals("\\/")) { + if(words.get(i-1).word().equals("")) + continue; + w.set(CoreAnnotations.TextAnnotation.class, words.get(i-1).word()+"\\/"+words.get(i+1).word()); + words.remove(i+1); + words.remove(i-1); + } + } + // END FIXING TOKENIZATION PROBLEMS + + List sentence = new ArrayList(); + // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open + Stack stack = new Stack(); + List mentions = new ArrayList(); + + allWords.add(sentence); + allGoldMentions.add(mentions); + + for (CoreLabel word : words) { + String w = word.get(CoreAnnotations.TextAnnotation.class); + // found regular token: WORD/POS + if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length()-2) { + int i = w.lastIndexOf("\\/"); + String w1 = w.substring(0, i); + // we 
do NOT set POS info here. We take the POS tags from the parser! + word.set(CoreAnnotations.TextAnnotation.class, w1); + word.remove(CoreAnnotations.OriginalTextAnnotation.class); + if(Constants.USE_GOLD_NE) { + if (ner != null) { + word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); + } else { + word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); + } + } + sentence.add(word); + } + // found the start SGML tag for a NE, e.g., "" + else if (w.startsWith("<") && !w.startsWith(""); + Matcher m = nerPattern.matcher(w); + m.find(); + ner = m.group(1); + } + // found the end SGML tag for a NE, e.g., "" + else if (w.startsWith(""); + Matcher m = nerPattern.matcher(w); + m.find(); + String ner1 = m.group(1); + if (ner != null && !ner.equals(ner1)) throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1); + ner = null; + } + // found the start SGML tag for a coref mention + else if (w.startsWith("")) { + Mention mention = stack.pop(); + mention.endIndex = sentence.size(); + + // this is a closed mention. 
add it to the final list of mentions + // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef); + mentions.add(mention); + } else { + word.remove(CoreAnnotations.OriginalTextAnnotation.class); + if(Constants.USE_GOLD_NE){ + if (ner != null) { + word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); + } else { + word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); + } + } + sentence.add(word); + } + } + StringBuilder textContent = new StringBuilder(); + for (int i=0 ; i0) textContent.append(" "); + textContent.append(w.getString(CoreAnnotations.TextAnnotation.class)); + } + CoreMap sentCoreMap = new Annotation(textContent.toString()); + allSentences.add(sentCoreMap); + sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence); + } + + // assign goldCorefClusterID + Map idMention = Generics.newHashMap(); // temporary use + for (List goldMentions : allGoldMentions) { + for (Mention m : goldMentions) { + idMention.put(m.mentionID, m); + } + } + for (List goldMentions : allGoldMentions) { + for (Mention m : goldMentions) { + if (m.goldCorefClusterID == -1) { + if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID; + else { + int ref = m.originalRef; + while (true) { + Mention m2 = idMention.get(ref); + if (m2.goldCorefClusterID != -1) { + m.goldCorefClusterID = m2.goldCorefClusterID; + break; + } else if (m2.originalRef == -1) { + m2.goldCorefClusterID = m2.mentionID; + m.goldCorefClusterID = m2.goldCorefClusterID; + break; + } else { + ref = m2.originalRef; + } + } + } + } + } + } + + docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences); + stanfordProcessor.annotate(docAnno); + + if(allSentences.size()!=allWords.size()) throw new RuntimeException(); + for(int i = 0 ; i< allSentences.size(); i++){ + List annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class); + List unannotatedSent = allWords.get(i); + List mentionInSent = allGoldMentions.get(i); + for 
(Mention m : mentionInSent){ + m.dependency = allSentences.get(i).get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + } + if(annotatedSent.size() != unannotatedSent.size()){ + throw new RuntimeException(); + } + int k = 0; + for(int j = 0 ; j < annotatedSent.size(); j++, k++){ + CoreLabel annotatedWord = annotatedSent.get(j); + CoreLabel unannotatedWord = unannotatedSent.get(k); + if(!annotatedWord.get(CoreAnnotations.TextAnnotation.class).equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) { + throw new RuntimeException(); + } + } + allWords.set(i, annotatedSent); + allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class)); + } + + // extract predicted mentions + if(Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions; + else allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries); + + // add the relevant fields to mentions and order them for coref + return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Mention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Mention.java new file mode 100644 index 0000000..8c46e06 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Mention.java @@ -0,0 +1,1376 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.Serializable; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.dcoref.Dictionaries.Animacy; +import edu.stanford.nlp.dcoref.Dictionaries.Gender; +import edu.stanford.nlp.dcoref.Dictionaries.MentionType; +import edu.stanford.nlp.dcoref.Dictionaries.Number; +import edu.stanford.nlp.dcoref.Dictionaries.Person; +import edu.stanford.nlp.ling.BasicDatum; +import edu.stanford.nlp.ling.CoreAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.trees.EnglishGrammaticalRelations; +import edu.stanford.nlp.trees.GrammaticalRelation; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; + +/** + * One mention for the SieveCoreferenceSystem. 
+ * + * @author Jenny Finkel, Karthik Raghunathan, Heeyoung Lee, Marta Recasens + */ +public class Mention implements CoreAnnotation, Serializable { + + private static final long serialVersionUID = -7524485803945717057L; + + public Mention() { + } + + public Mention(int mentionID, int startIndex, int endIndex, SemanticGraph dependency){ + this.mentionID = mentionID; + this.startIndex = startIndex; + this.endIndex = endIndex; + this.dependency = dependency; + } + + public Mention(int mentionID, int startIndex, int endIndex, SemanticGraph dependency, List mentionSpan){ + this.mentionID = mentionID; + this.startIndex = startIndex; + this.endIndex = endIndex; + this.dependency = dependency; + this.originalSpan = mentionSpan; + } + + public Mention(int mentionID, int startIndex, int endIndex, SemanticGraph dependency, List mentionSpan, Tree mentionTree){ + this.mentionID = mentionID; + this.startIndex = startIndex; + this.endIndex = endIndex; + this.dependency = dependency; + this.originalSpan = mentionSpan; + this.mentionSubTree = mentionTree; + } + + public MentionType mentionType; + public Number number; + public edu.stanford.nlp.dcoref.Dictionaries.Gender gender; + public Animacy animacy; + public Person person; + public String headString; + public String nerString; + + public int startIndex; + public int endIndex; + public int headIndex; + public int mentionID = -1; + public int originalRef = -1; + public IndexedWord headIndexedWord; + + public int goldCorefClusterID = -1; + public int corefClusterID = -1; + public int sentNum = -1; + public int utter = -1; + public int paragraph = -1; + public boolean isSubject; + public boolean isDirectObject; + public boolean isIndirectObject; + public boolean isPrepositionObject; + public IndexedWord dependingVerb; + public boolean twinless = true; + public boolean generic = false; // generic pronoun or generic noun (bare plurals) + public boolean isSingleton; + + public List sentenceWords; + public List originalSpan; + + 
public Tree mentionSubTree; + public Tree contextParseTree; + public CoreLabel headWord; + public SemanticGraph dependency; + public Set dependents = Generics.newHashSet(); + public List preprocessedTerms; + public Object synsets; + + /** Set of other mentions in the same sentence that are syntactic appositions to this */ + public Set appositions = null; + public Set predicateNominatives = null; + public Set relativePronouns = null; + + + @Override + public Class getType() { + return Mention.class; + } + + public boolean isPronominal() { + return mentionType == MentionType.PRONOMINAL; + } + + @Override + public String toString() { + // return headWord.toString(); + return spanToString(); + } + + public String spanToString() { + StringBuilder os = new StringBuilder(); + for(int i = 0; i < originalSpan.size(); i ++){ + if(i > 0) os.append(" "); + os.append(originalSpan.get(i).get(CoreAnnotations.TextAnnotation.class)); + } + return os.toString(); + } + + /** Set attributes of a mention: + * head string, mention type, NER label, Number, Gender, Animacy + * @throws Exception + */ + public void process(Dictionaries dict, Semantics semantics, MentionExtractor mentionExtractor) throws Exception { + setHeadString(); + setType(dict); + setNERString(); + List mStr = getMentionString(); + setNumber(dict, getNumberCount(dict, mStr)); + setGender(dict, getGenderCount(dict, mStr)); + setAnimacy(dict); + setPerson(dict); + setDiscourse(); + headIndexedWord = dependency.getNodeByIndexSafe(headWord.index()); + if(semantics!=null) setSemantics(dict, semantics, mentionExtractor); + } + + public void process(Dictionaries dict, Semantics semantics, MentionExtractor mentionExtractor, + LogisticClassifier singletonPredictor) throws Exception { + process(dict, semantics, mentionExtractor); + if(singletonPredictor != null) setSingleton(singletonPredictor, dict); + } + + private void setSingleton(LogisticClassifier predictor, Dictionaries dict){ + double coreference_score = 
predictor.probabilityOf( + new BasicDatum(getSingletonFeatures(dict), "1")); + if(coreference_score < 0.2) this.isSingleton = true; + } + + /** + * Returns the features used by the singleton predictor (logistic + * classifier) to decide whether the mention belongs to a singleton entity + */ + private ArrayList getSingletonFeatures(Dictionaries dict){ + ArrayList features = new ArrayList(); + features.add(mentionType.toString()); + features.add(nerString); + features.add(animacy.toString()); + + int personNum = 3; + if(person.equals(Person.I) || person.equals(Person.WE)) personNum = 1; + if(person.equals(Person.YOU)) personNum = 2; + if(person.equals(Person.UNKNOWN)) personNum = 0; + features.add(String.valueOf(personNum)); + features.add(number.toString()); + features.add(getPosition()); + features.add(getRelation()); + features.add(getQuantification(dict)); + features.add(String.valueOf(getModifiers(dict))); + features.add(String.valueOf(getNegation(dict))); + features.add(String.valueOf(getModal(dict))); + features.add(String.valueOf(getReportEmbedding(dict))); + features.add(String.valueOf(getCoordination())); + return features; + } + + private List getMentionString() { + List mStr = new ArrayList(); + for(CoreLabel l : this.originalSpan) { + mStr.add(l.get(CoreAnnotations.TextAnnotation.class).toLowerCase()); + if(l==this.headWord) break; // remove words after headword + } + return mStr; + } + + private static int[] getNumberCount(Dictionaries dict, List mStr) { + int len = mStr.size(); + if(len > 1) { + for(int i = 0 ; i < len-1 ; i++) { + if(dict.genderNumber.containsKey(mStr.subList(i, len))) return dict.genderNumber.get(mStr.subList(i, len)); + } + + // find converted string with ! (e.g., "dr. martin luther king jr. boulevard" -> "! 
boulevard") + List convertedStr = new ArrayList(); + convertedStr.add("!"); + convertedStr.add(mStr.get(len-1)); + if(dict.genderNumber.containsKey(convertedStr)) return dict.genderNumber.get(convertedStr); + } + if(dict.genderNumber.containsKey(mStr.subList(len-1, len))) return dict.genderNumber.get(mStr.subList(len-1, len)); + + return null; + } + + private int[] getGenderCount(Dictionaries dict, List mStr) { + int len = mStr.size(); + char firstLetter = headWord.get(CoreAnnotations.TextAnnotation.class).charAt(0); + if(len > 1 && Character.isUpperCase(firstLetter) && nerString.startsWith("PER")) { + int firstNameIdx = len-2; + String secondToLast = mStr.get(firstNameIdx); + if(firstNameIdx > 1 && (secondToLast.length()==1 || (secondToLast.length()==2 && secondToLast.endsWith(".")))) { + firstNameIdx--; + } + + for(int i = 0 ; i <= firstNameIdx ; i++){ + if(dict.genderNumber.containsKey(mStr.subList(i, len))) return dict.genderNumber.get(mStr.subList(i, len)); + } + + // find converted string with ! (e.g., "dr. martin luther king jr. boulevard" -> "dr. 
!") + List convertedStr = new ArrayList(); + convertedStr.add(mStr.get(firstNameIdx)); + convertedStr.add("!"); + if(dict.genderNumber.containsKey(convertedStr)) return dict.genderNumber.get(convertedStr); + + if(dict.genderNumber.containsKey(mStr.subList(firstNameIdx, firstNameIdx+1))) return dict.genderNumber.get(mStr.subList(firstNameIdx, firstNameIdx+1)); + } + + if(dict.genderNumber.containsKey(mStr.subList(len-1, len))) return dict.genderNumber.get(mStr.subList(len-1, len)); + return null; + } + private void setDiscourse() { + utter = headWord.get(CoreAnnotations.UtteranceAnnotation.class); + + Pair verbDependency = findDependentVerb(this); + String dep = verbDependency.second(); + dependingVerb = verbDependency.first(); + + isSubject = false; + isDirectObject = false; + isIndirectObject = false; + isPrepositionObject = false; + + if(dep==null) { + return; + } else if(dep.equals("nsubj") || dep.equals("csubj")) { + isSubject = true; + } else if(dep.equals("dobj")){ + isDirectObject = true; + } else if(dep.equals("iobj")){ + isIndirectObject = true; + } else if(dep.equals("pobj")){ + isPrepositionObject = true; + } + } + + private void setPerson(Dictionaries dict) { + // only do for pronoun + if(!this.isPronominal()) person = Person.UNKNOWN; + String spanToString = this.spanToString().toLowerCase(); + + if(dict.firstPersonPronouns.contains(spanToString)) { + if (number == Number.SINGULAR) { + person = Person.I; + } else if (number == Number.PLURAL) { + person = Person.WE; + } else { + person = Person.UNKNOWN; + } + } else if(dict.secondPersonPronouns.contains(spanToString)) { + person = Person.YOU; + } else if(dict.thirdPersonPronouns.contains(spanToString)) { + if (gender == Gender.MALE && number == Number.SINGULAR) { + person = Person.HE; + } else if (gender == Gender.FEMALE && number == Number.SINGULAR) { + person = Person.SHE; + } else if ((gender == Gender.NEUTRAL || animacy == Animacy.INANIMATE) && number == Number.SINGULAR) { + person = Person.IT; + } 
else if (number == Number.PLURAL) { + person = Person.THEY; + } else { + person = Person.UNKNOWN; + } + } else { + person = Person.UNKNOWN; + } + } + + private void setSemantics(Dictionaries dict, Semantics semantics, MentionExtractor mentionExtractor) throws Exception { + + preprocessedTerms = this.preprocessSearchTerm(); + + if(dict.statesAbbreviation.containsKey(this.spanToString())) { // states abbreviations + preprocessedTerms = new ArrayList(); + preprocessedTerms.add(dict.statesAbbreviation.get(this.spanToString())); + } + + Method meth = semantics.wordnet.getClass().getDeclaredMethod("findSynset", List.class); + synsets = meth.invoke(semantics.wordnet, new Object[]{preprocessedTerms}); + + if(this.isPronominal()) return; + } + + private void setType(Dictionaries dict) { + if (headWord.has(CoreAnnotations.EntityTypeAnnotation.class)){ // ACE gold mention type + if (headWord.get(CoreAnnotations.EntityTypeAnnotation.class).equals("PRO")) { + mentionType = MentionType.PRONOMINAL; + } else if (headWord.get(CoreAnnotations.EntityTypeAnnotation.class).equals("NAM")) { + mentionType = MentionType.PROPER; + } else { + mentionType = MentionType.NOMINAL; + } + } else { // MUC + if(!headWord.has(CoreAnnotations.NamedEntityTagAnnotation.class)) { // temporary fix + mentionType = MentionType.NOMINAL; + SieveCoreferenceSystem.logger.finest("no NamedEntityTagAnnotation: "+headWord); + } else if (headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("PRP") + || (originalSpan.size() == 1 && headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O") + && (dict.allPronouns.contains(headString) || dict.relativePronouns.contains(headString) ))) { + mentionType = MentionType.PRONOMINAL; + } else if (!headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O") || headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) { + mentionType = MentionType.PROPER; + } else { + mentionType = MentionType.NOMINAL; + } + } + } + 
+ private void setGender(Dictionaries dict, int[] genderNumberCount) { + gender = Gender.UNKNOWN; + if (mentionType == MentionType.PRONOMINAL) { + if (dict.malePronouns.contains(headString)) { + gender = Gender.MALE; + } else if (dict.femalePronouns.contains(headString)) { + gender = Gender.FEMALE; + } + } else { + if(Constants.USE_GENDER_LIST){ + // Bergsma list + if(gender == Gender.UNKNOWN) { + if(dict.maleWords.contains(headString)) { + gender = Gender.MALE; + SieveCoreferenceSystem.logger.finest("[Bergsma List] New gender assigned:\tMale:\t" + headString); + } + else if(dict.femaleWords.contains(headString)) { + gender = Gender.FEMALE; + SieveCoreferenceSystem.logger.finest("[Bergsma List] New gender assigned:\tFemale:\t" + headString); + } + else if(dict.neutralWords.contains(headString)) { + gender = Gender.NEUTRAL; + SieveCoreferenceSystem.logger.finest("[Bergsma List] New gender assigned:\tNeutral:\t" + headString); + } + } + } + if(genderNumberCount!=null && this.number!=Number.PLURAL){ + double male = genderNumberCount[0]; + double female = genderNumberCount[1]; + double neutral = genderNumberCount[2]; + + if (male * 0.5 > female + neutral && male > 2) { + this.gender = Gender.MALE; + } else if (female * 0.5 > male + neutral && female > 2) { + this.gender = Gender.FEMALE; + } else if (neutral * 0.5 > male + female && neutral > 2) + this.gender = Gender.NEUTRAL; + } + } + } + + protected void setNumber(Dictionaries dict, int[] genderNumberCount) { + if (mentionType == MentionType.PRONOMINAL) { + if (dict.pluralPronouns.contains(headString)) { + number = Number.PLURAL; + } else if (dict.singularPronouns.contains(headString)) { + number = Number.SINGULAR; + } else { + number = Number.UNKNOWN; + } + } else if(! nerString.equals("O") && mentionType!=MentionType.NOMINAL){ + if(! 
(nerString.equals("ORGANIZATION") || nerString.startsWith("ORG"))){ + number = Number.SINGULAR; + } else { + // ORGs can be both plural and singular + number = Number.UNKNOWN; + } + } else { + String tag = headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class); + if (tag.startsWith("N") && tag.endsWith("S")) { + number = Number.PLURAL; + } else if (tag.startsWith("N")) { + number = Number.SINGULAR; + } else { + number = Number.UNKNOWN; + } + } + + if(mentionType != MentionType.PRONOMINAL) { + if(Constants.USE_NUMBER_LIST){ + if(number == Number.UNKNOWN){ + if(dict.singularWords.contains(headString)) { + number = Number.SINGULAR; + SieveCoreferenceSystem.logger.finest("[Bergsma] Number set to:\tSINGULAR:\t" + headString); + } + else if(dict.pluralWords.contains(headString)) { + number = Number.PLURAL; + SieveCoreferenceSystem.logger.finest("[Bergsma] Number set to:\tPLURAL:\t" + headString); + } + } + } + + final String enumerationPattern = "NP < (NP=tmp $.. (/,|CC/ $.. NP))"; + + TregexPattern tgrepPattern = TregexPattern.compile(enumerationPattern); + TregexMatcher m = tgrepPattern.matcher(this.mentionSubTree); + while (m.find()) { + // Tree t = m.getMatch(); + if(this.mentionSubTree==m.getNode("tmp") + && this.spanToString().toLowerCase().contains(" and ")) { + number = Number.PLURAL; + } + } + } + } + + private void setAnimacy(Dictionaries dict) { + if (mentionType == MentionType.PRONOMINAL) { + if (dict.animatePronouns.contains(headString)) { + animacy = Animacy.ANIMATE; + } else if (dict.inanimatePronouns.contains(headString)) { + animacy = Animacy.INANIMATE; + } else { + animacy = Animacy.UNKNOWN; + } + } else if (nerString.equals("PERSON") || nerString.startsWith("PER")) { + animacy = Animacy.ANIMATE; + } else if (nerString.equals("LOCATION")|| nerString.startsWith("LOC")) { + animacy = Animacy.INANIMATE; + } else if (nerString.equals("MONEY")) { + animacy = Animacy.INANIMATE; + } else if (nerString.equals("NUMBER")) { + animacy = Animacy.INANIMATE; + } 
else if (nerString.equals("PERCENT")) { + animacy = Animacy.INANIMATE; + } else if (nerString.equals("DATE")) { + animacy = Animacy.INANIMATE; + } else if (nerString.equals("TIME")) { + animacy = Animacy.INANIMATE; + } else if (nerString.equals("MISC")) { + animacy = Animacy.UNKNOWN; + } else if (nerString.startsWith("VEH")) { + animacy = Animacy.UNKNOWN; + } else if (nerString.startsWith("FAC")) { + animacy = Animacy.INANIMATE; + } else if (nerString.startsWith("GPE")) { + animacy = Animacy.INANIMATE; + } else if (nerString.startsWith("WEA")) { + animacy = Animacy.INANIMATE; + } else if (nerString.startsWith("ORG")) { + animacy = Animacy.INANIMATE; + } else { + animacy = Animacy.UNKNOWN; + } + if(mentionType != MentionType.PRONOMINAL) { + if(Constants.USE_ANIMACY_LIST){ + // Better heuristics using DekangLin: + if(animacy == Animacy.UNKNOWN) { + if(dict.animateWords.contains(headString)) { + animacy = Animacy.ANIMATE; + SieveCoreferenceSystem.logger.finest("Assigned Dekang Lin animacy:\tANIMATE:\t" + headString); + } + else if(dict.inanimateWords.contains(headString)) { + animacy = Animacy.INANIMATE; + SieveCoreferenceSystem.logger.finest("Assigned Dekang Lin animacy:\tINANIMATE:\t" + headString); + } + } + } + } + } + + private static final String [] commonNESuffixes = { + "Corp", "Co", "Inc", "Ltd" + }; + private static boolean knownSuffix(String s) { + if(s.endsWith(".")) s = s.substring(0, s.length() - 1); + for(String suff: commonNESuffixes){ + if(suff.equalsIgnoreCase(s)){ + return true; + } + } + return false; + } + + private void setHeadString() { + this.headString = headWord.get(CoreAnnotations.TextAnnotation.class).toLowerCase(); + if(headWord.has(CoreAnnotations.NamedEntityTagAnnotation.class)) { + // make sure that the head of a NE is not a known suffix, e.g., Corp. 
+ int start = headIndex - startIndex; + if (start >= originalSpan.size()) { + throw new RuntimeException("Invalid start index " + start + "=" + headIndex + "-" + startIndex + + ": originalSpan=[" + StringUtils.joinWords(originalSpan, " ") + "], head=" + headWord); + } + while(start >= 0){ + String head = originalSpan.get(start).get(CoreAnnotations.TextAnnotation.class).toLowerCase(); + if(knownSuffix(head) == false){ + this.headString = head; + break; + } else { + start --; + } + } + } + } + + private void setNERString() { + if(headWord.has(CoreAnnotations.EntityTypeAnnotation.class)){ // ACE + if(headWord.has(CoreAnnotations.NamedEntityTagAnnotation.class) && headWord.get(CoreAnnotations.EntityTypeAnnotation.class).equals("NAM")){ + this.nerString = headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class); + } else { + this.nerString = "O"; + } + } + else{ // MUC + if (headWord.has(CoreAnnotations.NamedEntityTagAnnotation.class)) { + this.nerString = headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class); + } else { + this.nerString = "O"; + } + } + } + + public boolean sameSentence(Mention m) { + return m.sentenceWords == sentenceWords; + } + + private static boolean included(CoreLabel small, List big) { + if(small.tag().equals("NNP")){ + for(CoreLabel w: big){ + if(small.word().equals(w.word()) || + small.word().length() > 2 && w.word().startsWith(small.word())){ + return true; + } + } + } + return false; + } + + protected boolean headsAgree(Mention m) { + // we allow same-type NEs to not match perfectly, but rather one could be included in the other, e.g., "George" -> "George Bush" + if (!nerString.equals("O") && !m.nerString.equals("O") && nerString.equals(m.nerString) && + (included(headWord, m.originalSpan) || included(m.headWord, originalSpan))) { + return true; + } + return headString.equals(m.headString); + } + + public boolean numbersAgree(Mention m){ + return numbersAgree(m, false); + } + private boolean numbersAgree(Mention m, boolean strict) 
{ + if (strict) { + return number == m.number; + } else { + return number == Number.UNKNOWN || + m.number == Number.UNKNOWN || + number == m.number; + } + } + + public boolean gendersAgree(Mention m){ + return gendersAgree(m, false); + } + public boolean gendersAgree(Mention m, boolean strict) { + if (strict) { + return gender == m.gender; + } else { + return gender == Gender.UNKNOWN || + m.gender == Gender.UNKNOWN || + gender == m.gender; + } + } + + public boolean animaciesAgree(Mention m){ + return animaciesAgree(m, false); + } + public boolean animaciesAgree(Mention m, boolean strict) { + if (strict) { + return animacy == m.animacy; + } else { + return animacy == Animacy.UNKNOWN || + m.animacy == Animacy.UNKNOWN || + animacy == m.animacy; + } + } + + public boolean entityTypesAgree(Mention m, Dictionaries dict){ + return entityTypesAgree(m, dict, false); + } + + public boolean entityTypesAgree(Mention m, Dictionaries dict, boolean strict) { + if (strict) { + return nerString.equals(m.nerString); + } else { + if (isPronominal()) { + if (nerString.contains("-") || m.nerString.contains("-")) { //for ACE with gold NE + if (m.nerString.equals("O")) { + return true; + } else if (m.nerString.startsWith("ORG")) { + return dict.organizationPronouns.contains(headString); + } else if (m.nerString.startsWith("PER")) { + return dict.personPronouns.contains(headString); + } else if (m.nerString.startsWith("LOC")) { + return dict.locationPronouns.contains(headString); + } else if (m.nerString.startsWith("GPE")) { + return dict.GPEPronouns.contains(headString); + } else if (m.nerString.startsWith("VEH") || m.nerString.startsWith("FAC") || m.nerString.startsWith("WEA")) { + return dict.facilityVehicleWeaponPronouns.contains(headString); + } else { + return false; + } + } else { // ACE w/o gold NE or MUC + if (m.nerString.equals("O")) { + return true; + } else if (m.nerString.equals("MISC")) { + return true; + } else if (m.nerString.equals("ORGANIZATION")) { + return 
dict.organizationPronouns.contains(headString); + } else if (m.nerString.equals("PERSON")) { + return dict.personPronouns.contains(headString); + } else if (m.nerString.equals("LOCATION")) { + return dict.locationPronouns.contains(headString); + } else if (m.nerString.equals("DATE") || m.nerString.equals("TIME")) { + return dict.dateTimePronouns.contains(headString); + } else if (m.nerString.equals("MONEY") || m.nerString.equals("PERCENT") || m.nerString.equals("NUMBER")) { + return dict.moneyPercentNumberPronouns.contains(headString); + } else { + return false; + } + } + } + return nerString.equals("O") || + m.nerString.equals("O") || + nerString.equals(m.nerString); + } + } + + + + /** + * Verifies if this mention's tree is dominated by the tree of the given mention + */ + public boolean includedIn(Mention m) { + if (!m.sameSentence(this)) { + return false; + } + if(this.startIndex < m.startIndex || this.endIndex > m.endIndex) return false; + for (Tree t : m.mentionSubTree.subTrees()) { + if (t == mentionSubTree) { + return true; + } + } + return false; + } + + /** + * Detects if the mention and candidate antecedent agree on all attributes respectively. + * @param potentialAntecedent + * @return true if all attributes agree between both mention and candidate, else false. 
+ */ + public boolean attributesAgree(Mention potentialAntecedent, Dictionaries dict){ + return (this.animaciesAgree(potentialAntecedent) && + this.entityTypesAgree(potentialAntecedent, dict) && + this.gendersAgree(potentialAntecedent) && + this.numbersAgree(potentialAntecedent)); + } + + /** Find apposition */ + public void addApposition(Mention m) { + if(appositions == null) appositions = Generics.newHashSet(); + appositions.add(m); + } + + /** Check apposition */ + public boolean isApposition(Mention m) { + if(appositions != null && appositions.contains(m)) return true; + return false; + } + /** Find predicate nominatives */ + public void addPredicateNominatives(Mention m) { + if(predicateNominatives == null) predicateNominatives = Generics.newHashSet(); + predicateNominatives.add(m); + } + + /** Check predicate nominatives */ + public boolean isPredicateNominatives(Mention m) { + if(predicateNominatives != null && predicateNominatives.contains(m)) return true; + return false; + } + + /** Find relative pronouns */ + public void addRelativePronoun(Mention m) { + if(relativePronouns == null) relativePronouns = Generics.newHashSet(); + relativePronouns.add(m); + } + + /** Find which mention appears first in a document */ + public boolean appearEarlierThan(Mention m){ + if (this.sentNum < m.sentNum) { + return true; + } else if (this.sentNum > m.sentNum) { + return false; + } else { + if (this.startIndex < m.startIndex) { + return true; + } else if (this.startIndex > m.startIndex) { + return false; + } else { + if (this.endIndex > m.endIndex) { + return true; + } else { + return false; + } + } + } + } + + public String longestNNPEndsWithHead (){ + String ret = ""; + for (int i = headIndex; i >=startIndex ; i--){ + String pos = sentenceWords.get(i).get(CoreAnnotations.PartOfSpeechAnnotation.class); + if(!pos.startsWith("NNP")) break; + if(!ret.equals("")) ret = " "+ret; + ret = sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class)+ret; + } + return ret; + } 
+ + public String lowestNPIncludesHead (){ + String ret = ""; + Tree head = this.contextParseTree.getLeaves().get(this.headIndex); + Tree lowestNP = head; + String s; + while(true) { + if(lowestNP==null) return ret; + s = ((CoreLabel) lowestNP.label()).get(CoreAnnotations.ValueAnnotation.class); + if(s.equals("NP") || s.equals("ROOT")) break; + lowestNP = lowestNP.ancestor(1, this.contextParseTree); + } + if (s.equals("ROOT")) lowestNP = head; + for (Tree t : lowestNP.getLeaves()){ + if (!ret.equals("")) ret = ret + " "; + ret = ret + ((CoreLabel) t.label()).get(CoreAnnotations.TextAnnotation.class); + } + if(!this.spanToString().contains(ret)) return this.sentenceWords.get(this.headIndex).get(CoreAnnotations.TextAnnotation.class); + return ret; + } + + public String stringWithoutArticle(String str) { + String ret = (str==null)? this.spanToString() : str; + if (ret.startsWith("a ") || ret.startsWith("A ")) { + return ret.substring(2); + } else if (ret.startsWith("an ") || ret.startsWith("An ")) { + return ret.substring(3); + } else if (ret.startsWith("the ") || ret.startsWith("The ")) + return ret.substring(4); + return ret; + } + + public List preprocessSearchTerm (){ + List searchTerms = new ArrayList(); + String[] terms = new String[4]; + + terms[0] = this.stringWithoutArticle(this.removePhraseAfterHead()); + terms[1] = this.stringWithoutArticle(this.lowestNPIncludesHead()); + terms[2] = this.stringWithoutArticle(this.longestNNPEndsWithHead()); + terms[3] = this.headString; + + for (String term : terms){ + + if(term.contains("\"")) term = term.replace("\"", "\\\""); + if(term.contains("(")) term = term.replace("(","\\("); + if(term.contains(")")) term = term.replace(")", "\\)"); + if(term.contains("!")) term = term.replace("!", "\\!"); + if(term.contains(":")) term = term.replace(":", "\\:"); + if(term.contains("+")) term = term.replace("+", "\\+"); + if(term.contains("-")) term = term.replace("-", "\\-"); + if(term.contains("~")) term = term.replace("~", 
"\\~"); + if(term.contains("*")) term = term.replace("*", "\\*"); + if(term.contains("[")) term = term.replace("[", "\\["); + if(term.contains("]")) term = term.replace("]", "\\]"); + if(term.contains("^")) term = term.replace("^", "\\^"); + if(term.equals("")) continue; + + if(term.equals("") || searchTerms.contains(term)) continue; + if(term.equals(terms[3]) && !terms[2].equals("")) continue; + searchTerms.add(term); + } + return searchTerms; + } + public static String buildQueryText(List terms) { + String query = ""; + for (String t : terms){ + query += t + " "; + } + return query.trim(); + } + + /** Remove any clause after headword */ + public String removePhraseAfterHead(){ + String removed =""; + int posComma = -1; + int posWH = -1; + for(int i = 0 ; i < this.originalSpan.size() ; i++){ + CoreLabel w = this.originalSpan.get(i); + if(posComma == -1 && w.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",")) posComma = this.startIndex + i; + if(posWH == -1 && w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("W")) posWH = this.startIndex + i; + } + if(posComma!=-1 && this.headIndex < posComma){ + StringBuilder os = new StringBuilder(); + for(int i = 0; i < posComma-this.startIndex; i++){ + if(i > 0) os.append(" "); + os.append(this.originalSpan.get(i).get(CoreAnnotations.TextAnnotation.class)); + } + removed = os.toString(); + } + if(posComma==-1 && posWH != -1 && this.headIndex < posWH){ + StringBuilder os = new StringBuilder(); + for(int i = 0; i < posWH-this.startIndex; i++){ + if(i > 0) os.append(" "); + os.append(this.originalSpan.get(i).get(CoreAnnotations.TextAnnotation.class)); + } + removed = os.toString(); + } + if(posComma==-1 && posWH == -1){ + removed = this.spanToString(); + } + return removed; + } + + public static String removeParenthesis(String text) { + if (text.split("\\(").length > 0) { + return text.split("\\(")[0].trim(); + } else { + return ""; + } + } + + // the mention is 'the + commonNoun' form + protected boolean 
isTheCommonNoun() { + if (this.mentionType == MentionType.NOMINAL + && this.spanToString().toLowerCase().startsWith("the ") + && this.spanToString().split(" ").length == 2) { + return true; + } else { + return false; + } + } + + private static Pair findDependentVerb(Mention m) { + Pair ret = new Pair(); + int headIndex = m.headIndex+1; + try { + IndexedWord w = m.dependency.getNodeByIndex(headIndex); + if(w==null) return ret; + while (true) { + IndexedWord p = null; + for(Pair parent : m.dependency.parentPairs(w)){ + if(ret.second()==null) { + String relation = parent.first().getShortName(); + ret.setSecond(relation); + } + p = parent.second(); + } + if(p==null || p.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("V")) { + ret.setFirst(p); + break; + } + if(w==p) return ret; + w = p; + } + } catch (Exception e) { + return ret; + } + return ret; + } + + public boolean insideIn(Mention m){ + return this.sentNum == m.sentNum + && m.startIndex <= this.startIndex + && this.endIndex <= m.endIndex; + } + + public boolean moreRepresentativeThan(Mention m){ + if(m==null) return true; + if(mentionType!=m.mentionType) { + if ((mentionType == MentionType.PROPER && m.mentionType != MentionType.PROPER) + || (mentionType == MentionType.NOMINAL && m.mentionType == MentionType.PRONOMINAL)) { + return true; + } else { + return false; + } + } else { + if (headIndex - startIndex > m.headIndex - m.startIndex) { + return true; + } else if (sentNum < m.sentNum || (sentNum == m.sentNum && headIndex < m.headIndex)) { + return true; + } else { + return false; + } + } + } + + //Returns filtered premodifiers (no determiners or numerals) + public ArrayList> getPremodifiers(){ + + ArrayList> premod = new ArrayList>(); + + if(headIndexedWord == null) return premod; + for(Pair child : dependency.childPairs(headIndexedWord)){ + String function = child.first().getShortName(); + if(child.second().index() < headWord.index() + && !child.second.tag().equals("DT") && 
!child.second.tag().equals("WRB") + && !function.endsWith("det") && !function.equals("num") + && !function.equals("rcmod") && !function.equals("infmod") + && !function.equals("partmod") && !function.equals("punct")){ + ArrayList phrase = new ArrayList(dependency.descendants(child.second())); + Collections.sort(phrase); + premod.add(phrase); + } + } + return premod; + } + + // Returns filtered postmodifiers (no relative, -ed or -ing clauses) + public ArrayList> getPostmodifiers(){ + + ArrayList> postmod = new ArrayList>(); + + if(headIndexedWord == null) return postmod; + for(Pair child : dependency.childPairs(headIndexedWord)){ + String function = child.first().getShortName(); + if(child.second().index() > headWord.index() && + !function.endsWith("det") && !function.equals("num") + && !function.equals("rcmod") && !function.equals("infmod") + && !function.equals("partmod") && !function.equals("punct") + && !(function.equals("possessive") && dependency.descendants(child.second()).size() == 1)){ + ArrayList phrase = new ArrayList(dependency.descendants(child.second())); + Collections.sort(phrase); + postmod.add(phrase); + } + } + return postmod; + } + + + public String[] getSplitPattern(){ + + ArrayList> premodifiers = getPremodifiers(); + + String[] components = new String[4]; + + components[0] = headWord.lemma(); + + if(premodifiers.size() == 0){ + components[1] = headWord.lemma(); + components[2] = headWord.lemma(); + } else if(premodifiers.size() == 1){ + ArrayList premod = new ArrayList(); + premod.addAll(premodifiers.get(premodifiers.size()-1)); + premod.add(headWord); + components[1] = getPattern(premod); + components[2] = getPattern(premod); + } else { + ArrayList premod1 = new ArrayList(); + premod1.addAll(premodifiers.get(premodifiers.size()-1)); + premod1.add(headWord); + components[1] = getPattern(premod1); + + ArrayList premod2 = new ArrayList(); + for(ArrayList premodifier : premodifiers){ + premod2.addAll(premodifier); + } + premod2.add(headWord); + 
components[2] = getPattern(premod2); + } + + components[3] = getPattern(); + return components; + } + + public String getPattern(){ + + ArrayList pattern = new ArrayList(); + for(ArrayList premodifier : getPremodifiers()){ + pattern.addAll(premodifier); + } + pattern.add(headWord); + for(ArrayList postmodifier : getPostmodifiers()){ + pattern.addAll(postmodifier); + } + return getPattern(pattern); + } + + public String getPattern(List pTokens){ + + ArrayList phrase_string = new ArrayList(); + String ne = ""; + for(CoreLabel token : pTokens){ + if(token.index() == headWord.index()){ + phrase_string.add(token.lemma()); + ne = ""; + + } else if( (token.lemma().equals("and") || StringUtils.isPunct(token.lemma())) + && pTokens.size() > pTokens.indexOf(token)+1 + && pTokens.indexOf(token) > 0 + && pTokens.get(pTokens.indexOf(token)+1).ner().equals(pTokens.get(pTokens.indexOf(token)-1).ner())){ + + } else if(token.index() == headWord.index()-1 + && token.ner().equals(nerString)){ + phrase_string.add(token.lemma()); + ne = ""; + + } else if(!token.ner().equals("O")){ + if(!token.ner().equals(ne)){ + ne = token.ner(); + phrase_string.add("<"+ne+">"); + } + + } else { + phrase_string.add(token.lemma()); + ne = ""; + } + } + return StringUtils.join(phrase_string); + } + + public boolean isCoordinated(){ + if(headIndexedWord == null) return false; + for(Pair child : dependency.childPairs(headIndexedWord)){ + if(child.first().getShortName().equals("cc")) return true; + } + return false; + } + + private static List getContextHelper(List words) { + List> namedEntities = new ArrayList>(); + List ne = new ArrayList(); + String previousNEType = ""; + int previousNEIndex = -1; + for (int i = 0; i < words.size(); i++) { + CoreLabel word = words.get(i); + if(!word.ner().equals("O")) { + if (!word.ner().equals(previousNEType) || previousNEIndex != i-1) { + ne = new ArrayList(); + namedEntities.add(ne); + } + ne.add(word); + previousNEType = word.ner(); + previousNEIndex = i; + } + } + + 
List neStrings = new ArrayList(); + Set hs = Generics.newHashSet(); + for (List namedEntity : namedEntities) { + String ne_str = StringUtils.joinWords(namedEntity, " "); + hs.add(ne_str); + } + neStrings.addAll(hs); + return neStrings; + } + + public List getContext() { + return getContextHelper(sentenceWords); + } + + public List getPremodifierContext() { + List neStrings = new ArrayList(); + for (List words : getPremodifiers()) { + neStrings.addAll(getContextHelper(words)); + } + return neStrings; + } + + /** Check relative pronouns */ + public boolean isRelativePronoun(Mention m) { + return relativePronouns != null && relativePronouns.contains(m); + } + + public boolean isRoleAppositive(Mention m, Dictionaries dict) { + String thisString = this.spanToString(); + if(this.isPronominal() || dict.allPronouns.contains(thisString.toLowerCase())) return false; + if(!m.nerString.startsWith("PER") && !m.nerString.equals("O")) return false; + if(!this.nerString.startsWith("PER") && !this.nerString.equals("O")) return false; + if(!sameSentence(m) || !m.spanToString().startsWith(thisString)) return false; + if(m.spanToString().contains("'") || m.spanToString().contains(" and ")) return false; + if (!animaciesAgree(m) || this.animacy == Animacy.INANIMATE + || this.gender == Gender.NEUTRAL || m.gender == Gender.NEUTRAL + || !this.numbersAgree(m)) { + return false; + } + if (dict.demonymSet.contains(thisString.toLowerCase()) + || dict.demonymSet.contains(m.spanToString().toLowerCase())) { + return false; + } + return true; + } + + public boolean isDemonym(Mention m, Dictionaries dict){ + String thisString = this.spanToString().toLowerCase(); + String antString = m.spanToString().toLowerCase(); + if(thisString.startsWith("the ") || thisString.startsWith("The ")) { + thisString = thisString.substring(4); + } + if(antString.startsWith("the ") || antString.startsWith("The ")) antString = antString.substring(4); + + if (dict.statesAbbreviation.containsKey(m.spanToString()) && 
dict.statesAbbreviation.get(m.spanToString()).equals(this.spanToString()) + || dict.statesAbbreviation.containsKey(this.spanToString()) && dict.statesAbbreviation.get(this.spanToString()).equals(m.spanToString())) { + return true; + } + + if(dict.demonyms.get(thisString)!=null){ + if(dict.demonyms.get(thisString).contains(antString)) return true; + } else if(dict.demonyms.get(antString)!=null){ + if(dict.demonyms.get(antString).contains(thisString)) return true; + } + return false; + } + + public String getPosition() { + int size = sentenceWords.size(); + if(headIndex == 0) { + return "first"; + } else if (headIndex == size -1) { + return "last"; + } else { + if(headIndex > 0 && headIndex < size/3) { + return "begin"; + } else if (headIndex >= size/3 && headIndex < 2 * size/3) { + return "middle"; + } else if (headIndex >= 2 * size/3 && headIndex < size -1) { + return "end"; + } + } + return null; + } + + public String getRelation(){ + + if(headIndexedWord == null) return null; + + if(dependency.getRoots().isEmpty()) return null; + // root relation + if(dependency.getFirstRoot().equals(headIndexedWord)) return "root"; + if(!dependency.vertexSet().contains(dependency.getParent(headIndexedWord))) return null; + GrammaticalRelation relation = dependency.reln(dependency.getParent(headIndexedWord), headIndexedWord); + + // adjunct relations + if(relation.toString().startsWith("prep") || relation == EnglishGrammaticalRelations.PREPOSITIONAL_OBJECT || relation == EnglishGrammaticalRelations.TEMPORAL_MODIFIER || relation == EnglishGrammaticalRelations.ADV_CLAUSE_MODIFIER || relation == EnglishGrammaticalRelations.ADVERBIAL_MODIFIER || relation == EnglishGrammaticalRelations.PREPOSITIONAL_COMPLEMENT) return "adjunct"; + + // subject relations + if(relation == EnglishGrammaticalRelations.NOMINAL_SUBJECT || relation == EnglishGrammaticalRelations.CLAUSAL_SUBJECT || relation == EnglishGrammaticalRelations.CONTROLLING_SUBJECT) return "subject"; + if(relation == 
EnglishGrammaticalRelations.NOMINAL_PASSIVE_SUBJECT || relation == EnglishGrammaticalRelations.CLAUSAL_PASSIVE_SUBJECT) return "subject"; + + // verbal argument relations + if(relation == EnglishGrammaticalRelations.ADJECTIVAL_COMPLEMENT || relation == EnglishGrammaticalRelations.ATTRIBUTIVE || relation == EnglishGrammaticalRelations.CLAUSAL_COMPLEMENT || relation == EnglishGrammaticalRelations.XCLAUSAL_COMPLEMENT || relation == EnglishGrammaticalRelations.AGENT || relation == EnglishGrammaticalRelations.DIRECT_OBJECT || relation == EnglishGrammaticalRelations.INDIRECT_OBJECT) return "verbArg"; + + // noun argument relations + if(relation == EnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER || relation == EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER || relation == EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER || relation == EnglishGrammaticalRelations.APPOSITIONAL_MODIFIER || relation == EnglishGrammaticalRelations.POSSESSION_MODIFIER) return "nounArg"; + + return null; + } + + public int getModifiers(Dictionaries dict){ + + if(headIndexedWord == null) return 0; + + int count = 0; + List> childPairs = dependency.childPairs(headIndexedWord); + for(Pair childPair : childPairs) { + GrammaticalRelation gr = childPair.first; + IndexedWord word = childPair.second; + if(gr == EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER || gr == EnglishGrammaticalRelations.PARTICIPIAL_MODIFIER + || gr == EnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER || gr == EnglishGrammaticalRelations.INFINITIVAL_MODIFIER + || gr.toString().startsWith("prep_")) { + count++; + } + // add noun modifier when the mention isn't a NER + if(nerString.equals("O") && gr == EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER) { + count++; + } + + // add possessive if not a personal determiner + if(gr == EnglishGrammaticalRelations.POSSESSION_MODIFIER && !dict.determiners.contains(word.lemma())) { + count++; + } + } + return count; + } + + public String getQuantification(Dictionaries dict){ + + 
if(headIndexedWord == null) return null; + + if(!nerString.equals("O")) return "definite"; + + List quant = dependency.getChildrenWithReln(headIndexedWord, EnglishGrammaticalRelations.DETERMINER); + List poss = dependency.getChildrenWithReln(headIndexedWord, EnglishGrammaticalRelations.POSSESSION_MODIFIER); + String det = ""; + if(!quant.isEmpty()) { + det = quant.get(0).lemma(); + if(dict.determiners.contains(det)) { + return "definite"; + } + } + else if(!poss.isEmpty()) { + return "definite"; + } + else { + quant = dependency.getChildrenWithReln(headIndexedWord, EnglishGrammaticalRelations.NUMERIC_MODIFIER); + if(dict.quantifiers2.contains(det) || !quant.isEmpty()) { + return "quantified"; + } + } + return "indefinite"; + } + + public int getNegation(Dictionaries dict) { + + if(headIndexedWord == null) return 0; + + // direct negation in a child + Collection children = dependency.getChildren(headIndexedWord); + for(IndexedWord child : children) { + if(dict.negations.contains(child.lemma())) return 1; + } + + // or has a sibling + Collection siblings = dependency.getSiblings(headIndexedWord); + for(IndexedWord sibling : siblings) { + if(dict.negations.contains(sibling.lemma()) && !dependency.hasParentWithReln(headIndexedWord, EnglishGrammaticalRelations.NOMINAL_SUBJECT)) return 1; + } + // check the parent + List> parentPairs = dependency.parentPairs(headIndexedWord); + if (!parentPairs.isEmpty()) { + Pair parentPair = parentPairs.get(0); + GrammaticalRelation gr = parentPair.first; + // check negative prepositions + if(dict.neg_relations.contains(gr.toString())) return 1; + } + return 0; + } + + public int getModal(Dictionaries dict) { + + if(headIndexedWord == null) return 0; + + // direct modal in a child + Collection children = dependency.getChildren(headIndexedWord); + for(IndexedWord child : children) { + if(dict.modals.contains(child.lemma())) return 1; + } + + // check the parent + IndexedWord parent = dependency.getParent(headIndexedWord); + if (parent 
!= null) { + if(dict.modals.contains(parent.lemma())) return 1; + // check the children of the parent (that is needed for modal auxiliaries) + IndexedWord child = dependency.getChildWithReln(parent,EnglishGrammaticalRelations.AUX_MODIFIER); + if(!dependency.hasParentWithReln(headIndexedWord, EnglishGrammaticalRelations.NOMINAL_SUBJECT) && child != null && dict.modals.contains(child.lemma())) return 1; + } + + // look at the path to root + List path = dependency.getPathToRoot(headIndexedWord); + if(path == null) return 0; + for(IndexedWord word : path) { + if(dict.modals.contains(word.lemma())) return 1; + } + return 0; + } + + public int getReportEmbedding(Dictionaries dict) { + + if(headIndexedWord == null) return 0; + + // check adverbial clause with marker "as" + Collection siblings = dependency.getSiblings(headIndexedWord); + for(IndexedWord sibling : siblings) { + if(dict.reportVerb.contains(sibling.lemma()) && dependency.hasParentWithReln(sibling,EnglishGrammaticalRelations.ADV_CLAUSE_MODIFIER)) { + IndexedWord marker = dependency.getChildWithReln(sibling,EnglishGrammaticalRelations.MARKER); + if (marker != null && marker.lemma().equals("as")) { + return 1; + } + } + } + + // look at the path to root + List path = dependency.getPathToRoot(headIndexedWord); + if(path == null) return 0; + boolean isSubject = false; + + // if the node itself is a subject, we will not take into account its parent in the path + if(dependency.hasParentWithReln(headIndexedWord, EnglishGrammaticalRelations.NOMINAL_SUBJECT)) isSubject = true; + + for (IndexedWord word : path) { + if(!isSubject && (dict.reportVerb.contains(word.lemma()) || dict.reportNoun.contains(word.lemma()))) { + return 1; + } + // check how to put isSubject + isSubject = dependency.hasParentWithReln(word, EnglishGrammaticalRelations.NOMINAL_SUBJECT); + } + return 0; + } + + public int getCoordination() { + + if(headIndexedWord == null) return 0; + + Set relations = dependency.childRelns(headIndexedWord); + for 
(GrammaticalRelation rel : relations) { + if(rel.toString().startsWith("conj_")) { + return 1; + } + } + + Set parent_relations = dependency.relns(headIndexedWord); + for (GrammaticalRelation rel : parent_relations) { + if(rel.toString().startsWith("conj_")) { + return 1; + } + } + return 0; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/MentionExtractor.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/MentionExtractor.java new file mode 100644 index 0000000..14831b0 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/MentionExtractor.java @@ -0,0 +1,395 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2011 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.HeadFinder; +import edu.stanford.nlp.trees.SemanticHeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; + +/** + * Generic mention extractor from a corpus. + * + * @author Jenny Finkel + * @author Mihai Surdeanu + * @author Karthik Raghunathan + * @author Heeyoung Lee + * @author Sudarshan Rangarajan + */ +public class MentionExtractor { + + protected final HeadFinder headFinder; + + protected String currentDocumentID; + + protected final Dictionaries dictionaries; + protected final Semantics semantics; + + public CorefMentionFinder mentionFinder; + protected StanfordCoreNLP stanfordProcessor; + protected LogisticClassifier singletonPredictor; + + /** The maximum mention ID: for preventing duplicated mention ID assignment */ + protected int maxID = -1; + + public static final boolean VERBOSE = false; + + public MentionExtractor(Dictionaries dict, Semantics semantics) { + this.headFinder = new SemanticHeadFinder(); + this.dictionaries = dict; + this.semantics = semantics; + this.mentionFinder = new RuleBasedCorefMentionFinder(); // Default + } + + public void setMentionFinder(CorefMentionFinder mentionFinder) + { + this.mentionFinder = mentionFinder; + } + + /** + 
* Extracts the info relevant for coref from the next document in the corpus + * @return List of mentions found in each sentence ordered according to the tree traversal. + * @throws Exception + */ + public Document nextDoc() throws Exception { return null; } + + /** + * Reset so that we start at the beginning of the document collection + */ + public void resetDocs() { + maxID = -1; + currentDocumentID = null; + } + + public Document arrange( + Annotation anno, + List> words, + List trees, + List> unorderedMentions) throws Exception { + return arrange(anno, words, trees, unorderedMentions, null, false); + } + + protected int getHeadIndex(Tree t) { + Tree ht = t.headTerminal(headFinder); + if(ht==null) return -1; // temporary: a key which is matched to nothing + CoreLabel l = (CoreLabel) ht.label(); + return (int) l.get(CoreAnnotations.IndexAnnotation.class); + } + private String treeToKey(Tree t){ + int idx = getHeadIndex(t); + String key = Integer.toString(idx) + ":" + t.toString(); + return key; + } + + public Document arrange( + Annotation anno, + List> words, + List trees, + List> unorderedMentions, + List> unorderedGoldMentions, + boolean doMergeLabels) throws Exception { + List> predictedOrderedMentionsBySentence = arrange(anno, words, trees, unorderedMentions, doMergeLabels); + List> goldOrderedMentionsBySentence = null; +// SieveCoreferenceSystem.debugPrintMentions(System.err, "UNORDERED GOLD MENTIONS:", unorderedGoldMentions); + + if(unorderedGoldMentions != null) { + goldOrderedMentionsBySentence = arrange(anno, words, trees, unorderedGoldMentions, doMergeLabels); + } +// SieveCoreferenceSystem.debugPrintMentions(System.err, "ORDERED GOLD MENTIONS:", goldOrderedMentionsBySentence); + return new Document(anno, predictedOrderedMentionsBySentence, goldOrderedMentionsBySentence, dictionaries); + } + + /** + * Post-processes the extracted mentions. Here we set the Mention fields required for coref and order mentions by tree-traversal order. 
+ * @param words List of words in each sentence, in textual order + * @param trees List of trees, one per sentence + * @param unorderedMentions List of unordered, unprocessed mentions + * Each mention MUST have startIndex and endIndex set! + * Optionally, if scoring is desired, mentions must have mentionID and originalRef set. + * All the other Mention fields are set here. + * @return List of mentions ordered according to the tree traversal + * @throws Exception + */ + public List> arrange( + Annotation anno, + List> words, + List trees, + List> unorderedMentions, + boolean doMergeLabels) throws Exception { + + List> orderedMentionsBySentence = new ArrayList>(); + + // + // traverse all sentences and process each individual one + // + for(int sent = 0; sent < words.size(); sent ++){ + List sentence = words.get(sent); + Tree tree = trees.get(sent); + List mentions = unorderedMentions.get(sent); + Map> mentionsToTrees = Generics.newHashMap(); + + // merge the parse tree of the entire sentence with the sentence words + if(doMergeLabels) mergeLabels(tree, sentence); + + // + // set the surface information and the syntactic info in each mention + // startIndex and endIndex MUST be set before! 
+ // + for(Mention mention: mentions){ + mention.contextParseTree = tree; + mention.sentenceWords = sentence; + mention.originalSpan = new ArrayList(mention.sentenceWords.subList(mention.startIndex, mention.endIndex)); + if(!((CoreLabel)tree.label()).has(CoreAnnotations.BeginIndexAnnotation.class)) tree.indexSpans(0); + if(mention.headWord==null) { + Tree headTree = ((RuleBasedCorefMentionFinder) mentionFinder).findSyntacticHead(mention, tree, sentence); + mention.headWord = (CoreLabel)headTree.label(); + mention.headIndex = mention.headWord.get(CoreAnnotations.IndexAnnotation.class) - 1; + } + if(mention.mentionSubTree==null) { + // mentionSubTree = highest NP that has the same head + Tree headTree = tree.getLeaves().get(mention.headIndex); + if (headTree == null) { throw new RuntimeException("Missing head tree for a mention!"); } + Tree t = headTree; + while ((t = t.parent(tree)) != null) { + if (t.headTerminal(headFinder) == headTree && t.value().equals("NP")) { + mention.mentionSubTree = t; + } else if(mention.mentionSubTree != null){ + break; + } + } + if (mention.mentionSubTree == null) { + mention.mentionSubTree = headTree; + } + } + + List mentionsForTree = mentionsToTrees.get(treeToKey(mention.mentionSubTree)); + if(mentionsForTree == null){ + mentionsForTree = new ArrayList(); + mentionsToTrees.put(treeToKey(mention.mentionSubTree), mentionsForTree); + } + mentionsForTree.add(mention); + + // generates all fields required for coref, such as gender, number, etc. 
+ mention.process(dictionaries, semantics, this, singletonPredictor); + } + + // + // Order all mentions in tree-traversal order + // + List orderedMentions = new ArrayList(); + orderedMentionsBySentence.add(orderedMentions); + + // extract all mentions in tree traversal order (alternative: tree.postOrderNodeList()) + for (Tree t : tree.preOrderNodeList()) { + List lm = mentionsToTrees.get(treeToKey(t)); + if(lm != null){ + for(Mention m: lm){ + orderedMentions.add(m); + } + } + } + + // + // find appositions, predicate nominatives, relative pronouns in this sentence + // + findSyntacticRelations(tree, orderedMentions); + assert(mentions.size() == orderedMentions.size()); + } + return orderedMentionsBySentence; + } + + /** + * Sets the label of the leaf nodes to be the CoreLabels in the given sentence + * The original value() of the Tree nodes is preserved + */ + public static void mergeLabels(Tree tree, List sentence) { + int idx = 0; + for (Tree t : tree.getLeaves()) { + CoreLabel cl = sentence.get(idx ++); + String value = t.value(); + cl.set(CoreAnnotations.ValueAnnotation.class, value); + t.setLabel(cl); + } + tree.indexLeaves(); + } + + static boolean inside(int i, Mention m) { + return i >= m.startIndex && i < m.endIndex; + } + + /** Find syntactic relations (e.g., appositives) in a sentence */ + private void findSyntacticRelations(Tree tree, List orderedMentions) { + Set> appos = Generics.newHashSet(); + findAppositions(tree, appos); + markMentionRelation(orderedMentions, appos, "APPOSITION"); + + Set> preNomi = Generics.newHashSet(); + findPredicateNominatives(tree, preNomi); + markMentionRelation(orderedMentions, preNomi, "PREDICATE_NOMINATIVE"); + + Set> relativePronounPairs = Generics.newHashSet(); + findRelativePronouns(tree, relativePronounPairs); + markMentionRelation(orderedMentions, relativePronounPairs, "RELATIVE_PRONOUN"); + } + + /** Find syntactic pattern in a sentence by tregex */ + private void findTreePattern(Tree tree, String pattern, Set> 
foundPairs) { + try { + TregexPattern tgrepPattern = TregexPattern.compile(pattern); + TregexMatcher m = tgrepPattern.matcher(tree); + while (m.find()) { + Tree t = m.getMatch(); + Tree np1 = m.getNode("m1"); + Tree np2 = m.getNode("m2"); + Tree np3 = null; + if(pattern.contains("m3")) np3 = m.getNode("m3"); + addFoundPair(np1, np2, t, foundPairs); + if(np3!=null) addFoundPair(np2, np3, t, foundPairs); + } + } catch (Exception e) { + // shouldn't happen.... + throw new RuntimeException(e); + } + } + + private void addFoundPair(Tree np1, Tree np2, Tree t, + Set> foundPairs) { + Tree head1 = np1.headTerminal(headFinder); + Tree head2 = np2.headTerminal(headFinder); + int h1 = ((CoreMap) head1.label()).get(CoreAnnotations.IndexAnnotation.class) - 1; + int h2 = ((CoreMap) head2.label()).get(CoreAnnotations.IndexAnnotation.class) - 1; + Pair p = new Pair(h1, h2); + foundPairs.add(p); + } + + private void findAppositions(Tree tree, Set> appos) { + String appositionPattern = "NP=m1 < (NP=m2 $.. (/,/ $.. NP=m3))"; + String appositionPattern2 = "NP=m1 < (NP=m2 $.. (/,/ $.. (SBAR < (WHNP < WP|WDT=m3))))"; + String appositionPattern3 = "/^NP(?:-TMP|-ADV)?$/=m1 < (NP=m2 $- /^,$/ $-- NP=m3 !$ CC|CONJP)"; + String appositionPattern4 = "/^NP(?:-TMP|-ADV)?$/=m1 < (PRN=m2 < (NP < /^NNS?|CD$/ $-- /^-LRB-$/ $+ /^-RRB-$/))"; + findTreePattern(tree, appositionPattern, appos); + findTreePattern(tree, appositionPattern2, appos); + findTreePattern(tree, appositionPattern3, appos); + findTreePattern(tree, appositionPattern4, appos); + } + + private void findPredicateNominatives(Tree tree, Set> preNomi) { + String predicateNominativePattern = "S < (NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2)))"; + String predicateNominativePattern2 = "S < (NP=m1 $.. (VP < (VP < ((/VB/ < /^(be|been|being)$/) $.. NP=m2))))"; + // String predicateNominativePattern2 = "NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. 
NP=m2))"; + findTreePattern(tree, predicateNominativePattern, preNomi); + findTreePattern(tree, predicateNominativePattern2, preNomi); + } + + private void findRelativePronouns(Tree tree, Set> relativePronounPairs) { + String relativePronounPattern = "NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))"; + findTreePattern(tree, relativePronounPattern, relativePronounPairs); + } + + private static void markMentionRelation(List orderedMentions, Set> foundPairs, String flag) { + for(Mention m1 : orderedMentions){ + for(Mention m2 : orderedMentions){ + for(Pair foundPair: foundPairs){ + if((foundPair.first == m1.headIndex && foundPair.second == m2.headIndex)){ + if(flag.equals("APPOSITION")) m2.addApposition(m1); + else if(flag.equals("PREDICATE_NOMINATIVE")) m2.addPredicateNominatives(m1); + else if(flag.equals("RELATIVE_PRONOUN")) m2.addRelativePronoun(m1); + else throw new RuntimeException("check flag in markMentionRelation (dcoref/MentionExtractor.java)"); + } + } + } + } + } + /** + * Finds the tree the matches this span exactly + * @param tree Leaves must be indexed! 
+ * @param first First element in the span (first position has offset 1) + * @param last Last element included in the span (first position has offset 1) + */ + public static Tree findExactMatch(Tree tree, int first, int last) { + List leaves = tree.getLeaves(); + int thisFirst = ((CoreMap) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class); + int thisLast = ((CoreMap) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class); + if(thisFirst == first && thisLast == last) { + return tree; + } else { + Tree [] kids = tree.children(); + for(Tree k: kids){ + Tree t = findExactMatch(k, first, last); + if(t != null) return t; + } + } + return null; + } + + /** Load Stanford Processor: skip unnecessary annotator */ + protected static StanfordCoreNLP loadStanfordProcessor(Properties props) { + boolean replicateCoNLL = Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false")); + + Properties pipelineProps = new Properties(props); + StringBuilder annoSb = new StringBuilder(""); + if (!Constants.USE_GOLD_POS && !replicateCoNLL) { + annoSb.append("pos, lemma"); + } else { + annoSb.append("lemma"); + } + if(Constants.USE_TRUECASE) { + annoSb.append(", truecase"); + } + if (!Constants.USE_GOLD_NE && !replicateCoNLL) { + annoSb.append(", ner"); + } + if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) { + annoSb.append(", parse"); + } + String annoStr = annoSb.toString(); + SieveCoreferenceSystem.logger.info("Ignoring specified annotators, using annotators=" + annoStr); + pipelineProps.put("annotators", annoStr); + return new StanfordCoreNLP(pipelineProps, false); + } + + public static void initializeUtterance(List tokens) { + for(CoreLabel l : tokens){ + l.set(CoreAnnotations.UtteranceAnnotation.class, 0); + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/RuleBasedCorefMentionFinder.java 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/RuleBasedCorefMentionFinder.java new file mode 100644 index 0000000..5c9b2ac --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/RuleBasedCorefMentionFinder.java @@ -0,0 +1,533 @@ +package edu.stanford.nlp.dcoref; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.Label; +import edu.stanford.nlp.parser.lexparser.ParserConstraint; +import edu.stanford.nlp.parser.lexparser.ParserAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.Annotator; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.HeadFinder; +import edu.stanford.nlp.trees.SemanticHeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.IntPair; +import edu.stanford.nlp.util.StringUtils; + +public class RuleBasedCorefMentionFinder implements CorefMentionFinder { + + protected boolean assignIds = true; + protected int maxID = -1; + private final HeadFinder headFinder; + protected Annotator parserProcessor; + + public RuleBasedCorefMentionFinder() { + SieveCoreferenceSystem.logger.fine("Using SEMANTIC HEAD FINDER!!!!!!!!!!!!!!!!!!!"); + headFinder = new SemanticHeadFinder(); + } + + /** When mention boundaries are given */ + public List> filterPredictedMentions(List> allGoldMentions, Annotation 
doc, Dictionaries dict){ + List> predictedMentions = new ArrayList>(); + + for(int i = 0 ; i < allGoldMentions.size(); i++){ + CoreMap s = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i); + List goldMentions = allGoldMentions.get(i); + List mentions = new ArrayList(); + predictedMentions.add(mentions); + mentions.addAll(goldMentions); + findHead(s, mentions); + + // todo [cdm 2013]: This block seems to do nothing - the two sets are never used + Set mentionSpanSet = Generics.newHashSet(); + Set namedEntitySpanSet = Generics.newHashSet(); + for(Mention m : mentions) { + mentionSpanSet.add(new IntPair(m.startIndex, m.endIndex)); + if(!m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O")) { + namedEntitySpanSet.add(new IntPair(m.startIndex, m.endIndex)); + } + } + + setBarePlural(mentions); + removeSpuriousMentions(s, mentions, dict); + } + return predictedMentions; + } + + /** Main method of mention detection. + * Extract all NP, PRP or NE, and filter out by manually written patterns. 
+ */ + @Override + public List> extractPredictedMentions(Annotation doc, int _maxID, Dictionaries dict){ + this.maxID = _maxID; + List> predictedMentions = new ArrayList>(); + for(CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) { + + List mentions = new ArrayList(); + predictedMentions.add(mentions); + Set mentionSpanSet = Generics.newHashSet(); + Set namedEntitySpanSet = Generics.newHashSet(); + + extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet); + extractNPorPRP(s, mentions, mentionSpanSet, namedEntitySpanSet); + extractEnumerations(s, mentions, mentionSpanSet, namedEntitySpanSet); + findHead(s, mentions); + setBarePlural(mentions); + removeSpuriousMentions(s, mentions, dict); + } + return predictedMentions; + } + + protected static void setBarePlural(List mentions) { + for (Mention m : mentions) { + String pos = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class); + if(m.originalSpan.size()==1 && pos.equals("NNS")) m.generic = true; + } + } + + protected void extractNamedEntityMentions(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) { + List sent = s.get(CoreAnnotations.TokensAnnotation.class); + SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + String preNE = "O"; + int beginIndex = -1; + for(CoreLabel w : sent) { + String nerString = w.get(CoreAnnotations.NamedEntityTagAnnotation.class); + if(!nerString.equals(preNE)) { + int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1; + if(!preNE.equals("O") && !preNE.equals("QUANTITY") && !preNE.equals("CARDINAL") && !preNE.equals("PERCENT")) { + if(w.get(CoreAnnotations.TextAnnotation.class).equals("'s")) endIndex++; + IntPair mSpan = new IntPair(beginIndex, endIndex); + // Need to check if beginIndex < endIndex because, for + // example, there could be a 's mislabeled by the NER and + // attached to the previous NER by the earlier heuristic + if(beginIndex < endIndex && 
!mentionSpanSet.contains(mSpan)) { + int mentionId = assignIds? ++maxID:-1; + Mention m = new Mention(mentionId, beginIndex, endIndex, dependency, new ArrayList(sent.subList(beginIndex, endIndex))); + mentions.add(m); + mentionSpanSet.add(mSpan); + namedEntitySpanSet.add(mSpan); + } + } + beginIndex = endIndex; + preNE = nerString; + } + } + // NE at the end of sentence + if(!preNE.equals("O") && !preNE.equals("QUANTITY") && !preNE.equals("CARDINAL") && !preNE.equals("PERCENT")) { + IntPair mSpan = new IntPair(beginIndex, sent.size()); + if(!mentionSpanSet.contains(mSpan)) { + int mentionId = assignIds? ++maxID:-1; + Mention m = new Mention(mentionId, beginIndex, sent.size(), dependency, new ArrayList(sent.subList(beginIndex, sent.size()))); + mentions.add(m); + mentionSpanSet.add(mSpan); + namedEntitySpanSet.add(mSpan); + } + } + } + + protected void extractNPorPRP(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) { + List sent = s.get(CoreAnnotations.TokensAnnotation.class); + Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); + tree.indexLeaves(); + SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + + final String mentionPattern = "/^(?:NP|PRP)/"; + TregexPattern tgrepPattern = TregexPattern.compile(mentionPattern); + TregexMatcher matcher = tgrepPattern.matcher(tree); + while (matcher.find()) { + Tree t = matcher.getMatch(); + List mLeaves = t.getLeaves(); + int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; + int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); + IntPair mSpan = new IntPair(beginIdx, endIdx); + if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) { + int mentionID = assignIds? 
++maxID:-1; + Mention m = new Mention(mentionID, beginIdx, endIdx, dependency, new ArrayList(sent.subList(beginIdx, endIdx)), t); + mentions.add(m); + mentionSpanSet.add(mSpan); + } + } + } + /** Extract enumerations (A, B, and C) */ + protected void extractEnumerations(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet){ + List sent = s.get(CoreAnnotations.TokensAnnotation.class); + Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); + SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); + + final String mentionPattern = "NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))"; + TregexPattern tgrepPattern = TregexPattern.compile(mentionPattern); + TregexMatcher matcher = tgrepPattern.matcher(tree); + Map spanToMentionSubTree = Generics.newHashMap(); + while (matcher.find()) { + matcher.getMatch(); + Tree m1 = matcher.getNode("m1"); + Tree m2 = matcher.getNode("m2"); + + List mLeaves = m1.getLeaves(); + int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; + int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); + spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m1); + + mLeaves = m2.getLeaves(); + beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; + endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); + spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m2); + } + + for(IntPair mSpan : spanToMentionSubTree.keySet()){ + if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) { + int mentionID = assignIds? 
++maxID:-1; + Mention m = new Mention(mentionID, mSpan.get(0), mSpan.get(1), dependency, + new ArrayList(sent.subList(mSpan.get(0), mSpan.get(1))), spanToMentionSubTree.get(mSpan)); + mentions.add(m); + mentionSpanSet.add(mSpan); + } + } + } + + /** Check whether a mention is inside of a named entity */ + private static boolean insideNE(IntPair mSpan, Set namedEntitySpanSet) { + for (IntPair span : namedEntitySpanSet){ + if(span.get(0) <= mSpan.get(0) && mSpan.get(1) <= span.get(1)) return true; + } + return false; + } + + protected void findHead(CoreMap s, List mentions) { + Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); + List sent = s.get(CoreAnnotations.TokensAnnotation.class); + tree.indexSpans(0); + for (Mention m : mentions){ + Tree head = findSyntacticHead(m, tree, sent); + m.headIndex = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1; + m.headWord = sent.get(m.headIndex); + m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class).toLowerCase(); + int start = m.headIndex - m.startIndex; + if (start < 0 || start >= m.originalSpan.size()) { + SieveCoreferenceSystem.logger.warning("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex + + ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord); + SieveCoreferenceSystem.logger.warning("Setting head string to entire mention"); + m.headIndex = m.startIndex; + m.headWord = m.originalSpan.get(0); + m.headString = m.originalSpan.toString(); + } + } + } + + protected Tree findSyntacticHead(Mention m, Tree root, List tokens) { + // mention ends with 's + int endIdx = m.endIndex; + String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class); + if((lastWord.equals("'s") || lastWord.equals("'")) + && m.originalSpan.size() != 1 ) endIdx--; + + Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx); + // + // found an exact match + // + if (exactMatch != null) { + 
return safeHead(exactMatch); + } + + // no exact match found + // in this case, we parse the actual extent of the mention, embedded in a sentence + // context, so as to make the parser work better :-) + + int approximateness = 0; + List extentTokens = new ArrayList(); + extentTokens.add(initCoreLabel("It")); + extentTokens.add(initCoreLabel("was")); + final int ADDED_WORDS = 2; + for (int i = m.startIndex; i < endIdx; i++) { + // Add everything except separated dashes! The separated dashes mess with the parser too badly. + CoreLabel label = tokens.get(i); + if ( ! "-".equals(label.word())) { + extentTokens.add(tokens.get(i)); + } else { + approximateness++; + } + } + extentTokens.add(initCoreLabel(".")); + + // constrain the parse to the part we're interested in. + // Starting from ADDED_WORDS comes from skipping "It was". + // -1 to exclude the period. + // We now let it be any kind of nominal constituent, since there + // are VP and S ones + ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*")); + List constraints = Collections.singletonList(constraint); + Tree tree = parse(extentTokens, constraints); + convertToCoreLabels(tree); + tree.indexSpans(m.startIndex - ADDED_WORDS); // remember it has ADDED_WORDS extra words at the beginning + Tree subtree = findPartialSpan(tree, m.startIndex); + Tree extentHead = safeHead(subtree); + assert(extentHead != null); + // extentHead is a child in the local extent parse tree. 
we need to find the corresponding node in the main tree + // Because we deleted dashes, it's index will be >= the index in the extent parse tree + CoreLabel l = (CoreLabel) extentHead.label(); + Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness); + assert(realHead != null); + return realHead; + } + private static Tree findPartialSpan(final Tree root, final int start) { + CoreLabel label = (CoreLabel) root.label(); + int startIndex = label.get(CoreAnnotations.BeginIndexAnnotation.class); + if (startIndex == start) { + return root; + } + for (Tree kid : root.children()) { + CoreLabel kidLabel = (CoreLabel) kid.label(); + int kidStart = kidLabel.get(CoreAnnotations.BeginIndexAnnotation.class); + int kidEnd = kidLabel.get(CoreAnnotations.EndIndexAnnotation.class); + if (kidStart <= start && kidEnd > start) { + return findPartialSpan(kid, start); + } + } + throw new RuntimeException("Shouldn't happen: " + start + " " + root); + } + + private static Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) { + List leaves = root.getLeaves(); + for (Tree leaf : leaves) { + CoreLabel label = CoreLabel.class.cast(leaf.label()); + int ind = label.get(CoreAnnotations.IndexAnnotation.class) - 1; + if (token.equals(leaf.value()) && ind >= index && ind <= index + approximateness) { + return leaf; + } + } + // this shouldn't happen + // throw new RuntimeException("RuleBasedCorefMentionFinder: ERROR: Failed to find head token"); + System.err.println("RuleBasedCorefMentionFinder: ERROR: Failed to find head token"); + return leaves.get(leaves.size() - 1); + } + + private static CoreLabel initCoreLabel(String token) { + CoreLabel label = new CoreLabel(); + label.set(CoreAnnotations.TextAnnotation.class, token); + label.set(CoreAnnotations.ValueAnnotation.class, token); + return label; + } + + private Tree parse(List tokens) { + return parse(tokens, null); + } + + 
private Tree parse(List tokens, + List constraints) { + CoreMap sent = new Annotation(""); + sent.set(CoreAnnotations.TokensAnnotation.class, tokens); + sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints); + Annotation doc = new Annotation(""); + List sents = new ArrayList(); + sents.add(sent); + doc.set(CoreAnnotations.SentencesAnnotation.class, sents); + getParser().annotate(doc); + sents = doc.get(CoreAnnotations.SentencesAnnotation.class); + return sents.get(0).get(TreeCoreAnnotations.TreeAnnotation.class); + } + private Annotator getParser() { + if(parserProcessor == null){ + parserProcessor = StanfordCoreNLP.getExistingAnnotator("parse"); + assert(parserProcessor != null); + } + return parserProcessor; + } + private static void convertToCoreLabels(Tree tree) { + Label l = tree.label(); + if(! (l instanceof CoreLabel)){ + CoreLabel cl = new CoreLabel(); + cl.setValue(l.value()); + tree.setLabel(cl); + } + + for (Tree kid : tree.children()) { + convertToCoreLabels(kid); + } + } + private Tree safeHead(Tree top) { + Tree head = top.headTerminal(headFinder); + if (head != null) return head; + // if no head found return the right-most leaf + List leaves = top.getLeaves(); + if(leaves.size() > 0) return leaves.get(leaves.size() - 1); + // fallback: return top + return top; + } + private static Tree findTreeWithSpan(Tree tree, int start, int end) { + CoreLabel l = (CoreLabel) tree.label(); + if (l != null && l.has(CoreAnnotations.BeginIndexAnnotation.class) && l.has(CoreAnnotations.EndIndexAnnotation.class)) { + int myStart = l.get(CoreAnnotations.BeginIndexAnnotation.class); + int myEnd = l.get(CoreAnnotations.EndIndexAnnotation.class); + if (start == myStart && end == myEnd){ + // found perfect match + return tree; + } else if (end < myStart) { + return null; + } else if (start >= myEnd) { + return null; + } + } + + // otherwise, check inside children - a match is possible + for (Tree kid : tree.children()) { + if (kid == null) continue; + Tree ret 
= findTreeWithSpan(kid, start, end); + // found matching child + if (ret != null) return ret; + } + + // no match + return null; + } + + /** Filter out all spurious mentions */ + protected static void removeSpuriousMentions(CoreMap s, List mentions, Dictionaries dict) { + Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); + List sent = s.get(CoreAnnotations.TokensAnnotation.class); + Set remove = Generics.newHashSet(); + + + for(Mention m : mentions){ + String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class); + String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class); + // pleonastic it + if(isPleonastic(m, tree)) remove.add(m); + + // non word such as 'hmm' + if(dict.nonWords.contains(m.headString)) remove.add(m); + + // quantRule : not starts with 'any', 'all' etc + if(dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase())) remove.add(m); + + // partitiveRule + if(partitiveRule(m, sent, dict)) remove.add(m); + + // bareNPRule + if(headPOS.equals("NN") && !dict.temporals.contains(m.headString) + && (m.originalSpan.size()==1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) { + remove.add(m); + } + + // remove generic rule + // if(m.generic==true) remove.add(m); + + if(m.headString.equals("%")) remove.add(m); + if(headNE.equals("PERCENT") || headNE.equals("MONEY")) remove.add(m); + + // adjective form of nations + if(dict.adjectiveNation.contains(m.spanToString().toLowerCase())) remove.add(m); + + // stop list (e.g., U.S., there) + if(inStopList(m)) remove.add(m); + } + + // nested mention with shared headword (except apposition, enumeration): pick larger one + for(Mention m1 : mentions){ + for(Mention m2 : mentions){ + if(m1==m2 || remove.contains(m1) || remove.contains(m2)) continue; + if(m1.sentNum==m2.sentNum && m1.headWord==m2.headWord && m2.insideIn(m1)) { + if(m2.endIndex < sent.size() && 
(sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",") + || sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC"))) { + continue; + } + remove.add(m2); + } + } + } + mentions.removeAll(remove); + } + + private static boolean inStopList(Mention m) { + String mentionSpan = m.spanToString().toLowerCase(); + if(mentionSpan.equals("u.s.") || mentionSpan.equals("u.k.") + || mentionSpan.equals("u.s.s.r")) return true; + if(mentionSpan.equals("there") || mentionSpan.startsWith("etc.") + || mentionSpan.equals("ltd.")) return true; + if(mentionSpan.startsWith("'s ")) return true; + if(mentionSpan.endsWith("etc.")) return true; + + return false; + } + + private static boolean partitiveRule(Mention m, List sent, Dictionaries dict) { + return m.startIndex >= 2 + && sent.get(m.startIndex - 1).get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("of") + && dict.parts.contains(sent.get(m.startIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase()); + } + + /** Check whether pleonastic 'it'. E.g., It is possible that ... */ + private static boolean isPleonastic(Mention m, Tree tree) { + if ( ! m.spanToString().equalsIgnoreCase("it")) return false; + final String[] patterns = { + // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev + // I tried some more precise paterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns. + + //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches + // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))", // this one seems more accurate, but ... + "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. 
@S|SBAR))))", // in practice, go with this one (best results) + + "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))", + "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))", + // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev + + "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))", + "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))", + // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers.... + + // these next 5 had buggy space in "$ ..", which I fixed + "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))", + + "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case + "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ + // certain can be either but relatively likely pleonastic with it ... be + // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. 
(@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones + + "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))", + "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))", + + "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))", + + "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))" + }; + + for (String p : patterns) { + if (checkPleonastic(m, tree, p)) { + System.err.printf("XXXX %s%n", tree); + return true; + } + } + return false; + } + + private static boolean checkPleonastic(Mention m, Tree tree, String pattern) { + try { + TregexPattern tgrepPattern = TregexPattern.compile(pattern); + TregexMatcher matcher = tgrepPattern.matcher(tree); + while (matcher.find()) { + Tree np1 = matcher.getNode("m1"); + if (((CoreLabel)np1.label()).get(CoreAnnotations.BeginIndexAnnotation.class)+1 == m.headWord.get(CoreAnnotations.IndexAnnotation.class)) { + return true; + } + } + } catch (Exception e) { + e.printStackTrace(); + } + return false; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Rules.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Rules.java new file mode 100644 index 0000000..c959c11 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Rules.java @@ -0,0 +1,782 @@ +package edu.stanford.nlp.dcoref; + +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +import edu.stanford.nlp.dcoref.Dictionaries.Animacy; +import edu.stanford.nlp.dcoref.Dictionaries.Gender; +import edu.stanford.nlp.dcoref.Dictionaries.MentionType; +import edu.stanford.nlp.dcoref.Dictionaries.Number; +import edu.stanford.nlp.dcoref.Dictionaries.Person; +import 
edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.math.NumberMatchingRegex; +import edu.stanford.nlp.stats.Counters; +import edu.stanford.nlp.stats.IntCounter; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Sets; + + +/** + * Rules for coref system (mention detection, entity coref, event coref) + * The name of the method for mention detection starts with detection, + * for entity coref starts with entity, and for event coref starts with event. + * + * @author heeyoung, recasens + */ +public class Rules { + + private static final boolean DEBUG = true; + + public static boolean entityBothHaveProper(CorefCluster mentionCluster, + CorefCluster potentialAntecedent) { + boolean mentionClusterHaveProper = false; + boolean potentialAntecedentHaveProper = false; + + for (Mention m : mentionCluster.corefMentions) { + if (m.mentionType==MentionType.PROPER) { + mentionClusterHaveProper = true; + } + } + for (Mention a : potentialAntecedent.corefMentions) { + if (a.mentionType==MentionType.PROPER) { + potentialAntecedentHaveProper = true; + } + } + return (mentionClusterHaveProper && potentialAntecedentHaveProper); + } + public static boolean entitySameProperHeadLastWord(CorefCluster mentionCluster, + CorefCluster potentialAntecedent, Mention mention, Mention ant) { + for (Mention m : mentionCluster.getCorefMentions()){ + for (Mention a : potentialAntecedent.getCorefMentions()) { + if (entitySameProperHeadLastWord(m, a)) return true; + } + } + return false; + } + + public static boolean entityAlias(CorefCluster mentionCluster, CorefCluster potentialAntecedent, + Semantics semantics, Dictionaries dict) throws Exception { + + Mention mention = mentionCluster.getRepresentativeMention(); + Mention antecedent = potentialAntecedent.getRepresentativeMention(); + if(mention.mentionType!=MentionType.PROPER + || antecedent.mentionType!=MentionType.PROPER) return false; + + 
Method meth = semantics.wordnet.getClass().getMethod("alias", new Class[]{Mention.class, Mention.class}); + if((Boolean) meth.invoke(semantics.wordnet, new Object[]{mention, antecedent})) { + return true; + } + return false; + } + public static boolean entityIWithinI(CorefCluster mentionCluster, + CorefCluster potentialAntecedent, Dictionaries dict) { + for(Mention m : mentionCluster.getCorefMentions()) { + for(Mention a : potentialAntecedent.getCorefMentions()) { + if(entityIWithinI(m, a, dict)) return true; + } + } + return false; + } + public static boolean entityPersonDisagree(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Dictionaries dict){ + boolean disagree = false; + for(Mention m : mentionCluster.getCorefMentions()) { + for(Mention ant : potentialAntecedent.getCorefMentions()) { + if(entityPersonDisagree(document, m, ant, dict)) { + disagree = true; + } + } + } + if(disagree) return true; + else return false; + } + /** Word inclusion except stop words */ + public static boolean entityWordsIncluded(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention mention, Mention ant) { + Set wordsExceptStopWords = Generics.newHashSet(mentionCluster.words); + wordsExceptStopWords.removeAll(Arrays.asList(new String[]{ "the","this", "mr.", "miss", "mrs.", "dr.", "ms.", "inc.", "ltd.", "corp.", "'s"})); + wordsExceptStopWords.remove(mention.headString.toLowerCase()); + if(potentialAntecedent.words.containsAll(wordsExceptStopWords)) return true; + else return false; + } + + /** Compatible modifier only */ + public static boolean entityHaveIncompatibleModifier(CorefCluster mentionCluster, CorefCluster potentialAntecedent) { + for(Mention m : mentionCluster.corefMentions){ + for(Mention ant : potentialAntecedent.corefMentions){ + if(entityHaveIncompatibleModifier(m, ant)) return true; + } + } + return false; + } + public static boolean entityIsRoleAppositive(CorefCluster mentionCluster, CorefCluster potentialAntecedent, 
Mention m1, Mention m2, Dictionaries dict) { + if(!entityAttributesAgree(mentionCluster, potentialAntecedent)) return false; + return m1.isRoleAppositive(m2, dict) || m2.isRoleAppositive(m1, dict); + } + public static boolean entityIsRelativePronoun(Mention m1, Mention m2) { + return m1.isRelativePronoun(m2) || m2.isRelativePronoun(m1); + } + + public static boolean entityIsAcronym(CorefCluster mentionCluster, CorefCluster potentialAntecedent) { + for(Mention m : mentionCluster.corefMentions){ + if(m.isPronominal()) continue; + for(Mention ant : potentialAntecedent.corefMentions){ + if (isAcronym(m.originalSpan, ant.originalSpan)) { + return true; + } + } + } + return false; + } + + public static boolean isAcronym(List first, List second) { + if (first.size() > 1 && second.size() > 1) { + return false; + } + List longer; + List shorter; + + if (first.size() == second.size()) { + String firstWord = first.get(0).get(CoreAnnotations.TextAnnotation.class); + String secondWord = second.get(0).get(CoreAnnotations.TextAnnotation.class); + longer = (firstWord.length() > secondWord.length()) ? first : second; + shorter = (firstWord.length() > secondWord.length()) ? second : first;; + } else { + longer = (first.size() > second.size()) ? first : second; + shorter = (first.size() > second.size()) ? 
second : first; + } + + String acronym = shorter.get(0).get(CoreAnnotations.TextAnnotation.class); + // This check is not strictly necessary, but it saves a chunk of + // time iterating through the text of the longer mention + for (int acronymPos = 0; acronymPos < acronym.length(); ++acronymPos) { + if (acronym.charAt(acronymPos) < 'A' || acronym.charAt(acronymPos) > 'Z') { + return false; + } + } + int acronymPos = 0; + for (int wordNum = 0; wordNum < longer.size(); ++wordNum) { + String word = longer.get(wordNum).get(CoreAnnotations.TextAnnotation.class); + for (int charNum = 0; charNum < word.length(); ++charNum) { + if (word.charAt(charNum) >= 'A' && word.charAt(charNum) <= 'Z') { + // This triggers if there were more "acronym" characters in + // the longer mention than in the shorter mention + if (acronymPos >= acronym.length()) { + return false; + } + if (acronym.charAt(acronymPos) != word.charAt(charNum)) { + return false; + } + ++acronymPos; + } + } + } + if (acronymPos != acronym.length()) { + return false; + } + for (int i = 0; i < longer.size(); ++i) { + if (longer.get(i).get(CoreAnnotations.TextAnnotation.class).contains(acronym)) { + return false; + } + } + + return true; + } + + public static boolean entityIsPredicateNominatives(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m1, Mention m2) { + if(!entityAttributesAgree(mentionCluster, potentialAntecedent)) return false; + if ((m1.startIndex <= m2.startIndex && m1.endIndex >= m2.endIndex) + || (m1.startIndex >= m2.startIndex && m1.endIndex <= m2.endIndex)) { + return false; + } + return m1.isPredicateNominatives(m2) || m2.isPredicateNominatives(m1); + } + + public static boolean entityIsApposition(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m1, Mention m2) { + if(!entityAttributesAgree(mentionCluster, potentialAntecedent)) return false; + if(m1.mentionType==MentionType.PROPER && m2.mentionType==MentionType.PROPER) return false; + 
if(m1.nerString.equals("LOCATION")) return false; + return m1.isApposition(m2) || m2.isApposition(m1); + } + + public static boolean entityAttributesAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent){ + + boolean hasExtraAnt = false; + boolean hasExtraThis = false; + + // number + if(!mentionCluster.numbers.contains(Number.UNKNOWN)){ + for(Number n : potentialAntecedent.numbers){ + if(n!=Number.UNKNOWN && !mentionCluster.numbers.contains(n)) hasExtraAnt = true; + } + } + if(!potentialAntecedent.numbers.contains(Number.UNKNOWN)){ + for(Number n : mentionCluster.numbers){ + if(n!=Number.UNKNOWN && !potentialAntecedent.numbers.contains(n)) hasExtraThis = true; + } + } + + if(hasExtraAnt && hasExtraThis) return false; + + // gender + hasExtraAnt = false; + hasExtraThis = false; + + if(!mentionCluster.genders.contains(Gender.UNKNOWN)){ + for(Gender g : potentialAntecedent.genders){ + if(g!=Gender.UNKNOWN && !mentionCluster.genders.contains(g)) hasExtraAnt = true; + } + } + if(!potentialAntecedent.genders.contains(Gender.UNKNOWN)){ + for(Gender g : mentionCluster.genders){ + if(g!=Gender.UNKNOWN && !potentialAntecedent.genders.contains(g)) hasExtraThis = true; + } + } + if(hasExtraAnt && hasExtraThis) return false; + + // animacy + hasExtraAnt = false; + hasExtraThis = false; + + if(!mentionCluster.animacies.contains(Animacy.UNKNOWN)){ + for(Animacy a : potentialAntecedent.animacies){ + if(a!=Animacy.UNKNOWN && !mentionCluster.animacies.contains(a)) hasExtraAnt = true; + } + } + if(!potentialAntecedent.animacies.contains(Animacy.UNKNOWN)){ + for(Animacy a : mentionCluster.animacies){ + if(a!=Animacy.UNKNOWN && !potentialAntecedent.animacies.contains(a)) hasExtraThis = true; + } + } + if(hasExtraAnt && hasExtraThis) return false; + + // NE type + hasExtraAnt = false; + hasExtraThis = false; + + if(!mentionCluster.nerStrings.contains("O") && !mentionCluster.nerStrings.contains("MISC")){ + for(String ne : potentialAntecedent.nerStrings){ + 
if(!ne.equals("O") && !ne.equals("MISC") && !mentionCluster.nerStrings.contains(ne)) hasExtraAnt = true; + } + } + if(!potentialAntecedent.nerStrings.contains("O") && !potentialAntecedent.nerStrings.contains("MISC")){ + for(String ne : mentionCluster.nerStrings){ + if(!ne.equals("O") && !ne.equals("MISC") && !potentialAntecedent.nerStrings.contains(ne)) hasExtraThis = true; + } + } + return ! (hasExtraAnt && hasExtraThis); + } + + public static boolean entityRelaxedHeadsAgreeBetweenMentions(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m, Mention ant) { + if(m.isPronominal() || ant.isPronominal()) return false; + if(m.headsAgree(ant)) return true; + return false; + } + + public static boolean entityHeadsAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m, Mention ant, Dictionaries dict) { + boolean headAgree = false; + if(m.isPronominal() || ant.isPronominal() + || dict.allPronouns.contains(m.spanToString().toLowerCase()) + || dict.allPronouns.contains(ant.spanToString().toLowerCase())) return false; + for(Mention a : potentialAntecedent.corefMentions){ + if(a.headString.equals(m.headString)) headAgree= true; + } + return headAgree; + } + + public static boolean entityExactStringMatch(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Dictionaries dict, Set roleSet){ + boolean matched = false; + for(Mention m : mentionCluster.corefMentions){ + if(roleSet.contains(m)) return false; + if(m.isPronominal()) { + continue; + } + String mSpan = m.spanToString().toLowerCase(); + if(dict.allPronouns.contains(mSpan)) { + continue; + } + for(Mention ant : potentialAntecedent.corefMentions){ + if(ant.isPronominal()) { + continue; + } + String antSpan = ant.spanToString().toLowerCase(); + if(dict.allPronouns.contains(antSpan)) continue; + if(mSpan.equals(antSpan)) matched = true; + if(mSpan.equals(antSpan+" 's") || antSpan.equals(mSpan+" 's")) matched = true; + } + } + return matched; + } + + /** + * Exact string 
match except phrase after head (only for proper noun): + * For dealing with a error like "[Mr. Bickford] <- [Mr. Bickford , an 18-year mediation veteran]" + */ + public static boolean entityRelaxedExactStringMatch( + CorefCluster mentionCluster, + CorefCluster potentialAntecedent, + Mention mention, + Mention ant, + Dictionaries dict, + Set roleSet){ + if(roleSet.contains(mention)) return false; + if(mention.isPronominal() || ant.isPronominal() + || dict.allPronouns.contains(mention.spanToString().toLowerCase()) + || dict.allPronouns.contains(ant.spanToString().toLowerCase())) return false; + String mentionSpan = mention.removePhraseAfterHead(); + String antSpan = ant.removePhraseAfterHead(); + if(mentionSpan.equals("") || antSpan.equals("")) return false; + + if(mentionSpan.equals(antSpan) || mentionSpan.equals(antSpan+" 's") || antSpan.equals(mentionSpan+" 's")){ + return true; + } + return false; + } + + /** Check whether two mentions are in i-within-i relation (Chomsky, 1981) */ + public static boolean entityIWithinI(Mention m1, Mention m2, Dictionaries dict){ + // check for nesting: i-within-i + if(!m1.isApposition(m2) && !m2.isApposition(m1) + && !m1.isRelativePronoun(m2) && !m2.isRelativePronoun(m1) + && !m1.isRoleAppositive(m2, dict) && !m2.isRoleAppositive(m1, dict) + ){ + if(m1.includedIn(m2) || m2.includedIn(m1)){ + return true; + } + } + return false; + } + + + /** Check whether later mention has incompatible modifier */ + public static boolean entityHaveIncompatibleModifier(Mention m, Mention ant) { + if(!ant.headString.equalsIgnoreCase(m.headString)) return false; // only apply to same head mentions + boolean thisHasExtra = false; + int lengthThis = m.originalSpan.size(); + int lengthM = ant.originalSpan.size(); + Set thisWordSet = Generics.newHashSet(); + Set antWordSet = Generics.newHashSet(); + Set locationModifier = Generics.newHashSet(Arrays.asList("east", "west", "north", "south", + "eastern", "western", "northern", "southern", "upper", 
"lower")); + + for (int i=0; i< lengthThis ; i++){ + String w1 = m.originalSpan.get(i).get(CoreAnnotations.TextAnnotation.class).toLowerCase(); + String pos1 = m.originalSpan.get(i).get(CoreAnnotations.PartOfSpeechAnnotation.class); + if (!(pos1.startsWith("N") || pos1.startsWith("JJ") || pos1.equals("CD") + || pos1.startsWith("V")) || w1.equalsIgnoreCase(m.headString)) { + continue; + } + thisWordSet.add(w1); + } + for (int j=0 ; j < lengthM ; j++){ + String w2 = ant.originalSpan.get(j).get(CoreAnnotations.TextAnnotation.class).toLowerCase(); + antWordSet.add(w2); + } + for (String w : thisWordSet){ + if(!antWordSet.contains(w)) thisHasExtra = true; + } + boolean hasLocationModifier = false; + for(String l : locationModifier){ + if(antWordSet.contains(l) && !thisWordSet.contains(l)) { + hasLocationModifier = true; + } + } + return (thisHasExtra || hasLocationModifier); + } + /** Check whether two mentions have different locations */ + public static boolean entityHaveDifferentLocation(Mention m, Mention a, Dictionaries dict) { + + // state and country cannot be coref + if ((dict.statesAbbreviation.containsKey(a.spanToString()) || dict.statesAbbreviation.containsValue(a.spanToString())) + && (m.headString.equalsIgnoreCase("country") || m.headString.equalsIgnoreCase("nation"))) { + return true; + } + + Set locationM = Generics.newHashSet(); + Set locationA = Generics.newHashSet(); + String mString = m.spanToString().toLowerCase(); + String aString = a.spanToString().toLowerCase(); + Set locationModifier = Generics.newHashSet(Arrays.asList("east", "west", "north", "south", + "eastern", "western", "northern", "southern", "northwestern", "southwestern", "northeastern", + "southeastern", "upper", "lower")); + + for (CoreLabel w : m.originalSpan){ + if (locationModifier.contains(w.get(CoreAnnotations.TextAnnotation.class).toLowerCase())) return true; + if (w.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("LOCATION")) { + String loc = 
w.get(CoreAnnotations.TextAnnotation.class); + if(dict.statesAbbreviation.containsKey(loc)) loc = dict.statesAbbreviation.get(loc); + locationM.add(loc); + } + } + for (CoreLabel w : a.originalSpan){ + if (locationModifier.contains(w.get(CoreAnnotations.TextAnnotation.class).toLowerCase())) return true; + if (w.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("LOCATION")) { + String loc = w.get(CoreAnnotations.TextAnnotation.class); + if(dict.statesAbbreviation.containsKey(loc)) loc = dict.statesAbbreviation.get(loc); + locationA.add(loc); + } + } + boolean mHasExtra = false; + boolean aHasExtra = false; + for (String s : locationM) { + if (!aString.contains(s.toLowerCase())) mHasExtra = true; + } + for (String s : locationA) { + if (!mString.contains(s.toLowerCase())) aHasExtra = true; + } + if(mHasExtra && aHasExtra) { + return true; + } + return false; + } + + /** Check whether two mentions have the same proper head words */ + public static boolean entitySameProperHeadLastWord(Mention m, Mention a) { + if(!m.headString.equalsIgnoreCase(a.headString) + || !m.sentenceWords.get(m.headIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP") + || !a.sentenceWords.get(a.headIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) { + return false; + } + if(!m.removePhraseAfterHead().toLowerCase().endsWith(m.headString) + || !a.removePhraseAfterHead().toLowerCase().endsWith(a.headString)) { + return false; + } + Set mProperNouns = Generics.newHashSet(); + Set aProperNouns = Generics.newHashSet(); + for (CoreLabel w : m.sentenceWords.subList(m.startIndex, m.headIndex)){ + if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) { + mProperNouns.add(w.get(CoreAnnotations.TextAnnotation.class)); + } + } + for (CoreLabel w : a.sentenceWords.subList(a.startIndex, a.headIndex)){ + if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) { + 
aProperNouns.add(w.get(CoreAnnotations.TextAnnotation.class)); + } + } + boolean mHasExtra = false; + boolean aHasExtra = false; + for (String s : mProperNouns) { + if (!aProperNouns.contains(s)) mHasExtra = true; + } + for (String s : aProperNouns) { + if (!mProperNouns.contains(s)) aHasExtra = true; + } + if(mHasExtra && aHasExtra) return false; + return true; + } + + static final Set NUMBERS = Generics.newHashSet(Arrays.asList(new String[]{"one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred", "thousand", "million", "billion"})); + + /** Check whether there is a new number in later mention */ + public static boolean entityNumberInLaterMention(Mention mention, Mention ant) { + Set antecedentWords = Generics.newHashSet(); + for (CoreLabel w : ant.originalSpan){ + antecedentWords.add(w.get(CoreAnnotations.TextAnnotation.class)); + } + for (CoreLabel w : mention.originalSpan) { + String word = w.get(CoreAnnotations.TextAnnotation.class); + // Note: this is locale specific for English and ascii numerals + if (NumberMatchingRegex.isDouble(word)) { + if (!antecedentWords.contains(word)) return true; + } else { + if (NUMBERS.contains(word.toLowerCase()) && !antecedentWords.contains(word)) return true; + } + } + return false; + } + + /** Have extra proper noun except strings involved in semantic match */ + public static boolean entityHaveExtraProperNoun(Mention m, Mention a, Set exceptWords) { + Set mProper = Generics.newHashSet(); + Set aProper = Generics.newHashSet(); + String mString = m.spanToString(); + String aString = a.spanToString(); + + for (CoreLabel w : m.originalSpan){ + if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) { + mProper.add(w.get(CoreAnnotations.TextAnnotation.class)); + } + } + for (CoreLabel w : a.originalSpan){ + if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) { + aProper.add(w.get(CoreAnnotations.TextAnnotation.class)); + } + } + boolean mHasExtra = false; 
+ boolean aHasExtra = false; + + + for (String s : mProper) { + if (!aString.contains(s) && !exceptWords.contains(s.toLowerCase())) mHasExtra = true; + } + for (String s : aProper) { + if (!mString.contains(s) && !exceptWords.contains(s.toLowerCase())) aHasExtra = true; + } + + if(mHasExtra && aHasExtra) { + return true; + } + return false; + } + + public static final Pattern WHITESPACE_PATTERN = Pattern.compile(" +"); + + public static boolean entityIsSpeaker(Document document, + Mention mention, Mention ant, Dictionaries dict) { + if(document.speakerPairs.contains(new Pair(mention.mentionID, ant.mentionID))) { + return true; + } + + if(mentionMatchesSpeakerAnnotation(mention, ant)) { + return true; + } + if(mentionMatchesSpeakerAnnotation(ant, mention)) { + return true; + } + return false; + } + + public static boolean mentionMatchesSpeakerAnnotation(Mention mention, Mention ant) { + if (mention.headWord == null) { + return false; + } + String speaker = mention.headWord.get(CoreAnnotations.SpeakerAnnotation.class); + + if (speaker == null) { + return false; + } + // We optimize a little here: if the name has no spaces, which is + // the common case, then it is unnecessarily expensive to call + // regex split + if (speaker.indexOf(" ") >= 0) { + // Perhaps we could optimize this, too, but that would be trickier + for (String s : WHITESPACE_PATTERN.split(speaker)) { + if (ant.headString.equalsIgnoreCase(s)) return true; + } + } else { + if (ant.headString.equalsIgnoreCase(speaker)) return true; + } + return false; + } + + public static boolean entityPersonDisagree(Document document, Mention m, Mention ant, Dictionaries dict) { + boolean sameSpeaker = entitySameSpeaker(document, m, ant); + + if(sameSpeaker && m.person!=ant.person) { + if ((m.person == Person.IT && ant.person == Person.THEY) + || (m.person == Person.THEY && ant.person == Person.IT) || (m.person == Person.THEY && ant.person == Person.THEY)) { + return false; + } else if (m.person != Person.UNKNOWN && 
ant.person != Person.UNKNOWN) + return true; + } + if(sameSpeaker) { + if(!ant.isPronominal()) { + if(m.person==Person.I || m.person==Person.WE || m.person==Person.YOU) return true; + } else if(!m.isPronominal()) { + if(ant.person==Person.I || ant.person==Person.WE || ant.person==Person.YOU) return true; + } + } + if(m.person==Person.YOU && ant.appearEarlierThan(m)) { + int mUtter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class); + if (document.speakers.containsKey(mUtter - 1)) { + String previousSpeaker = document.speakers.get(mUtter - 1); + int previousSpeakerID; + try { + previousSpeakerID = Integer.parseInt(previousSpeaker); + } catch (Exception e) { + return true; + } + if (ant.corefClusterID != document.allPredictedMentions.get(previousSpeakerID).corefClusterID && ant.person != Person.I) { + return true; + } + } else { + return true; + } + } else if (ant.person==Person.YOU && m.appearEarlierThan(ant)) { + int aUtter = ant.headWord.get(CoreAnnotations.UtteranceAnnotation.class); + if (document.speakers.containsKey(aUtter - 1)) { + String previousSpeaker = document.speakers.get(aUtter - 1); + int previousSpeakerID; + try { + previousSpeakerID = Integer.parseInt(previousSpeaker); + } catch (Exception e) { + return true; + } + if (m.corefClusterID != document.allPredictedMentions.get(previousSpeakerID).corefClusterID && m.person != Person.I) { + return true; + } + } else { + return true; + } + } + return false; + } + + public static boolean entitySameSpeaker(Document document, Mention m, Mention ant) { + String mSpeakerStr = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class); + if (mSpeakerStr == null) { + return false; + } + String antSpeakerStr = ant.headWord.get(CoreAnnotations.SpeakerAnnotation.class); + if (antSpeakerStr == null) { + return false; + } + + int mSpeakerID; + int antSpeakerID; + if (NumberMatchingRegex.isDecimalInteger(mSpeakerStr) && NumberMatchingRegex.isDecimalInteger(antSpeakerStr)) { + try { + mSpeakerID = 
Integer.parseInt(mSpeakerStr); + antSpeakerID = Integer.parseInt(ant.headWord.get(CoreAnnotations.SpeakerAnnotation.class)); + } catch (Exception e) { + return (m.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals(ant.headWord.get(CoreAnnotations.SpeakerAnnotation.class))); + } + } else { + return (m.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals(ant.headWord.get(CoreAnnotations.SpeakerAnnotation.class))); + } + int mSpeakerClusterID = document.allPredictedMentions.get(mSpeakerID).corefClusterID; + int antSpeakerClusterID = document.allPredictedMentions.get(antSpeakerID).corefClusterID; + return (mSpeakerClusterID == antSpeakerClusterID); + } + + public static boolean entitySubjectObject(Mention m1, Mention m2) { + if(m1.sentNum != m2.sentNum) return false; + if(m1.dependingVerb==null || m2.dependingVerb ==null) return false; + if (m1.dependingVerb == m2.dependingVerb + && ((m1.isSubject && (m2.isDirectObject || m2.isIndirectObject || m2.isPrepositionObject)) + || (m2.isSubject && (m1.isDirectObject || m1.isIndirectObject || m1.isPrepositionObject)))) { + return true; + } + return false; + } + + // Return true if the two mentions are less than n mentions apart in the same sent + public static boolean entityTokenDistance(Mention m1, Mention m2) { + if( (m2.sentNum == m1.sentNum) && (m1.startIndex - m2.startIndex < 6) ) return true; + return false; + } + + // COREF_DICT strict: all the mention pairs between the two clusters must match in the dict + public static boolean entityClusterAllCorefDictionary(CorefCluster menCluster, CorefCluster antCluster, + Dictionaries dict, int dictColumn, int freq){ + boolean ret = false; + for(Mention men : menCluster.getCorefMentions()){ + if(men.isPronominal()) continue; + for(Mention ant : antCluster.getCorefMentions()){ + if(ant.isPronominal() || men.headWord.lemma().equals(ant.headWord.lemma())) continue; + if(entityCorefDictionary(men, ant, dict, dictColumn, freq)){ + ret = true; + } else { + return 
false; + } + } + } + return ret; + } + + // COREF_DICT pairwise: the two mentions match in the dict + public static boolean entityCorefDictionary(Mention men, Mention ant, Dictionaries dict, int dictVersion, int freq){ + + Pair mention_pair = new Pair( + men.getSplitPattern()[dictVersion-1].toLowerCase(), + ant.getSplitPattern()[dictVersion-1].toLowerCase()); + + int high_freq = -1; + if(dictVersion == 1){ + high_freq = 75; + } else if(dictVersion == 2){ + high_freq = 16; + } else if(dictVersion == 3){ + high_freq = 16; + } else if(dictVersion == 4){ + high_freq = 16; + } + + if(dict.corefDict.get(dictVersion-1).getCount(mention_pair) > high_freq) return true; + + if(dict.corefDict.get(dictVersion-1).getCount(mention_pair) > freq){ + if(dict.corefDictPMI.getCount(mention_pair) > 0.18) return true; + if(!dict.corefDictPMI.containsKey(mention_pair)) return true; + } + return false; + } + + public static boolean contextIncompatible(Mention men, Mention ant, Dictionaries dict) { + String antHead = ant.headWord.word(); + if ( (ant.mentionType == MentionType.PROPER) + && ant.sentNum != men.sentNum + && !isContextOverlapping(ant,men) + && dict.NE_signatures.containsKey(antHead)) { + IntCounter ranks = Counters.toRankCounter(dict.NE_signatures.get(antHead)); + List context; + if (!men.getPremodifierContext().isEmpty()) { + context = men.getPremodifierContext(); + } else { + context = men.getContext(); + } + if (!context.isEmpty()) { + int highestRank = 100000; + for (String w: context) { + if (ranks.containsKey(w) && ranks.getIntCount(w) < highestRank) { + highestRank = ranks.getIntCount(w); + } + // check in the other direction + if (dict.NE_signatures.containsKey(w)) { + IntCounter reverseRanks = Counters.toRankCounter(dict.NE_signatures.get(w)); + if (reverseRanks.containsKey(antHead) && reverseRanks.getIntCount(antHead) < highestRank) { + highestRank = reverseRanks.getIntCount(antHead); + } + } + } + if (highestRank > 10) return true; + } + } + return false; + } + + 
public static boolean sentenceContextIncompatible(Mention men, Mention ant, Dictionaries dict) { + if ( (ant.mentionType != MentionType.PROPER) + && (ant.sentNum != men.sentNum) + && (men.mentionType != MentionType.PROPER) + && !isContextOverlapping(ant,men)) { + List context1 = !ant.getPremodifierContext().isEmpty() ? ant.getPremodifierContext() : ant.getContext(); + List context2 = !men.getPremodifierContext().isEmpty() ? men.getPremodifierContext() : men.getContext(); + if (!context1.isEmpty() && !context2.isEmpty()) { + int highestRank = 100000; + for (String w1: context1) { + for (String w2: context2) { + // check the forward direction + if (dict.NE_signatures.containsKey(w1)) { + IntCounter ranks = Counters.toRankCounter(dict.NE_signatures.get(w1)); + if (ranks.containsKey(w2) && ranks.getIntCount(w2) < highestRank) { + highestRank = ranks.getIntCount(w2); + } + } + // check in the other direction + if (dict.NE_signatures.containsKey(w2)) { + IntCounter reverseRanks = Counters.toRankCounter(dict.NE_signatures.get(w2)); + if (reverseRanks.containsKey(w1) && reverseRanks.getIntCount(w1) < highestRank) { + highestRank = reverseRanks.getIntCount(w1); + } + } + } + } + if (highestRank > 10) return true; + } + } + return false; + } + + private static boolean isContextOverlapping(Mention m1, Mention m2) { + Set context1 = Generics.newHashSet(); + Set context2 = Generics.newHashSet(); + context1.addAll(m1.getContext()); + context2.addAll(m2.getContext()); + return Sets.intersects(context1, context2); + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerBCubed.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerBCubed.java new file mode 100644 index 0000000..f5704e2 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerBCubed.java @@ -0,0 +1,188 @@ +package edu.stanford.nlp.dcoref; + +import java.util.*; + +/** + * B^3 scorer + * 
@author heeyoung + * + */ +public class ScorerBCubed extends CorefScorer { + + protected enum BCubedType {B0, Ball, Brahman, Bcai, Bconll} + + private final BCubedType type; + + public ScorerBCubed(BCubedType _type) { + super(ScoreType.BCubed); + type = _type; + } + + @Override + protected void calculatePrecision(Document doc){ + switch(type){ + case Bcai: calculatePrecisionBcai(doc); break; + case Ball: calculatePrecisionBall(doc); break; + case Bconll: calculatePrecisionBconll(doc); break; // same as Bcai + } + } + + + @Override + protected void calculateRecall(Document doc){ + switch(type){ + case Bcai: calculateRecallBcai(doc); break; + case Ball: calculateRecallBall(doc); break; + case Bconll: calculateRecallBconll(doc); break; + } + } + + private void calculatePrecisionBall(Document doc){ + int pDen = 0; + double pNum = 0.0; + + Map goldMentions = doc.allGoldMentions; + Map predictedMentions = doc.allPredictedMentions; + + for(Mention m : predictedMentions.values()){ + double correct = 0.0; + double total = 0.0; + + for(Mention m2 : doc.corefClusters.get(m.corefClusterID).getCorefMentions()){ + if(m==m2 || + (goldMentions.containsKey(m.mentionID) + && goldMentions.containsKey(m2.mentionID) + && goldMentions.get(m.mentionID).goldCorefClusterID == goldMentions.get(m2.mentionID).goldCorefClusterID)) { + correct++; + } + total++; + } + pNum += correct/total; + pDen++; + } + + precisionDenSum += pDen; + precisionNumSum += pNum; + } + private void calculateRecallBall(Document doc){ + int rDen = 0; + double rNum = 0.0; + Map goldMentions = doc.allGoldMentions; + Map predictedMentions = doc.allPredictedMentions; + + for(Mention m : goldMentions.values()){ + double correct = 0.0; + double total = 0.0; + for(Mention m2 : doc.goldCorefClusters.get(m.goldCorefClusterID).getCorefMentions()){ + if(m==m2 || + (predictedMentions.containsKey(m.mentionID) + && predictedMentions.containsKey(m2.mentionID) + && predictedMentions.get(m.mentionID).corefClusterID == 
predictedMentions.get(m2.mentionID).corefClusterID)) { + correct++; + } + total++; + } + rNum += correct/total; + rDen++; + } + + recallDenSum += rDen; + recallNumSum += rNum; + + } + private void calculatePrecisionBcai(Document doc) { + int pDen = 0; + double pNum = 0.0; + Map goldMentions = doc.allGoldMentions; + Map predictedMentions = doc.allPredictedMentions; + + for(Mention m : predictedMentions.values()){ + if(!goldMentions.containsKey(m.mentionID) && doc.corefClusters.get(m.corefClusterID).getCorefMentions().size()==1){ + continue; + } + double correct = 0.0; + double total = 0.0; + for(Mention m2 : doc.corefClusters.get(m.corefClusterID).getCorefMentions()){ + if(m==m2 || + (goldMentions.containsKey(m.mentionID) + && goldMentions.containsKey(m2.mentionID) + && goldMentions.get(m.mentionID).goldCorefClusterID == goldMentions.get(m2.mentionID).goldCorefClusterID)) { + correct++; + } + total++; + } + pNum += correct/total; + pDen++; + } + for(int id : goldMentions.keySet()) { + if(!predictedMentions.containsKey(id)) { + pNum++; + pDen++; + } + } + precisionDenSum += pDen; + precisionNumSum += pNum; + } + + private void calculateRecallBcai(Document doc) { + int rDen = 0; + double rNum = 0.0; + Map goldMentions = doc.allGoldMentions; + Map predictedMentions = doc.allPredictedMentions; + + for(Mention m : goldMentions.values()){ + double correct = 0.0; + double total = 0.0; + for(Mention m2 : doc.goldCorefClusters.get(m.goldCorefClusterID).getCorefMentions()){ + if(m==m2 || + (predictedMentions.containsKey(m.mentionID) + && predictedMentions.containsKey(m2.mentionID) + && predictedMentions.get(m.mentionID).corefClusterID == predictedMentions.get(m2.mentionID).corefClusterID)) { + correct++; + } + total++; + } + rNum += correct/total; + rDen++; + } + + recallDenSum += rDen; + recallNumSum += rNum; + } + private void calculatePrecisionBconll(Document doc) { + // same as Bcai + calculatePrecisionBcai(doc); + } + private void calculateRecallBconll(Document doc) { + 
int rDen = 0; + double rNum = 0.0; + Map goldMentions = doc.allGoldMentions; + Map predictedMentions = doc.allPredictedMentions; + + for(Mention m : goldMentions.values()){ + double correct = 0.0; + double total = 0.0; + for(Mention m2 : doc.goldCorefClusters.get(m.goldCorefClusterID).getCorefMentions()){ + if(m==m2 || + (predictedMentions.containsKey(m.mentionID) + && predictedMentions.containsKey(m2.mentionID) + && predictedMentions.get(m.mentionID).corefClusterID == predictedMentions.get(m2.mentionID).corefClusterID)) { + correct++; + } + total++; + } + rNum += correct/total; + rDen++; + } + // this part is different from Bcai + for(Mention m : predictedMentions.values()) { + if(!goldMentions.containsKey(m.mentionID) && doc.corefClusters.get(m.corefClusterID).getCorefMentions().size()!=1) { + rNum++; + rDen++; + } + } + + recallDenSum += rDen; + recallNumSum += rNum; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerMUC.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerMUC.java new file mode 100644 index 0000000..82d6d6a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerMUC.java @@ -0,0 +1,73 @@ +package edu.stanford.nlp.dcoref; + +import java.util.*; + +import edu.stanford.nlp.util.Generics; + +public class ScorerMUC extends CorefScorer { + + public ScorerMUC() { + super(ScoreType.MUC); + } + + @Override + protected void calculateRecall(Document doc) { + int rDen = 0; + int rNum = 0; + + Map predictedMentions = doc.allPredictedMentions; + for(CorefCluster g : doc.goldCorefClusters.values()){ + if(g.corefMentions.size()==0) { + SieveCoreferenceSystem.logger.warning("NO MENTIONS for cluster " + g.getClusterID()); + continue; + } + rDen += g.corefMentions.size()-1; + rNum += g.corefMentions.size(); + + Set partitions = Generics.newHashSet(); + for (Mention goldMention : g.corefMentions){ + 
if(!predictedMentions.containsKey(goldMention.mentionID)) { // twinless goldmention + rNum--; + } else { + partitions.add(doc.corefClusters.get(predictedMentions.get(goldMention.mentionID).corefClusterID)); + } + } + rNum -= partitions.size(); + } + if (rDen != doc.allGoldMentions.size()-doc.goldCorefClusters.values().size()) { + System.err.println("rDen is " + rDen); + System.err.println("doc.allGoldMentions.size() is " + doc.allGoldMentions.size()); + System.err.println("doc.goldCorefClusters.values().size() is " + doc.goldCorefClusters.values().size()); + } + assert(rDen == (doc.allGoldMentions.size()-doc.goldCorefClusters.values().size())); + + recallNumSum += rNum; + recallDenSum += rDen; + } + + @Override + protected void calculatePrecision(Document doc) { + int pDen = 0; + int pNum = 0; + Map goldMentions = doc.allGoldMentions; + + for(CorefCluster c : doc.corefClusters.values()){ + if(c.corefMentions.size()==0) continue; + pDen += c.corefMentions.size()-1; + pNum += c.corefMentions.size(); + Set partitions = Generics.newHashSet(); + for (Mention predictedMention : c.corefMentions){ + if(!goldMentions.containsKey(predictedMention.mentionID)) { // twinless predicted mention (no gold counterpart) + pNum--; + } else { + partitions.add(doc.goldCorefClusters.get(goldMentions.get(predictedMention.mentionID).goldCorefClusterID)); + } + } + pNum -= partitions.size(); + } + assert(pDen == (doc.allPredictedMentions.size()-doc.corefClusters.values().size())); + + precisionDenSum += pDen; + precisionNumSum += pNum; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerPairwise.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerPairwise.java new file mode 100644 index 0000000..98c50c6 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/ScorerPairwise.java @@ -0,0 +1,71 @@ +package edu.stanford.nlp.dcoref; + +import java.util.*; + +public class ScorerPairwise extends
CorefScorer { + + public ScorerPairwise(){ + super(ScoreType.Pairwise); + } + + @Override + protected void calculateRecall(Document doc) { + int rDen = 0; + int rNum = 0; + Map predictedMentions = doc.allPredictedMentions; + + for(CorefCluster g : doc.goldCorefClusters.values()) { + int clusterSize = g.getCorefMentions().size(); + rDen += clusterSize*(clusterSize-1)/2; + for(Mention m1 : g.getCorefMentions()){ + Mention predictedM1 = predictedMentions.get(m1.mentionID); + if(predictedM1 == null) { + continue; + } + for(Mention m2 : g.getCorefMentions()) { + if(m1.mentionID >= m2.mentionID) continue; + Mention predictedM2 = predictedMentions.get(m2.mentionID); + if(predictedM2 == null) { + continue; + } + if(predictedM1.corefClusterID == predictedM2.corefClusterID){ + rNum++; + } + } + } + } + recallDenSum += rDen; + recallNumSum += rNum; + } + + @Override + protected void calculatePrecision(Document doc) { + int pDen = 0; + int pNum = 0; + + Map goldMentions = doc.allGoldMentions; + + for(CorefCluster c : doc.corefClusters.values()){ + int clusterSize = c.getCorefMentions().size(); + pDen += clusterSize*(clusterSize-1)/2; + for(Mention m1 : c.getCorefMentions()){ + Mention goldM1 = goldMentions.get(m1.mentionID); + if(goldM1 == null) { + continue; + } + for(Mention m2 : c.getCorefMentions()) { + if(m1.mentionID >= m2.mentionID) continue; + Mention goldM2 = goldMentions.get(m2.mentionID); + if(goldM2 == null) { + continue; + } + if(goldM1.goldCorefClusterID == goldM2.goldCorefClusterID){ + pNum++; + } + } + } + } + precisionDenSum += pDen; + precisionNumSum += pNum; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Semantics.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Semantics.java new file mode 100644 index 0000000..607f19a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/Semantics.java @@ -0,0 +1,15 @@ +package 
edu.stanford.nlp.dcoref; + +import java.lang.reflect.Constructor; + +/** Semantic knowledge: currently WordNet is available */ +public class Semantics { + public Object wordnet; + + public Semantics() {} + + public Semantics(Dictionaries dict) throws Exception{ + Constructor wordnetConstructor = (Class.forName("edu.stanford.nlp.dcoref.WordNet")).getConstructor(); + wordnet = wordnetConstructor.newInstance(); + } +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/SieveCoreferenceSystem.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/SieveCoreferenceSystem.java new file mode 100644 index 0000000..393ac33 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/SieveCoreferenceSystem.java @@ -0,0 +1,1709 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2011 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Calendar; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.TreeSet; +import java.util.logging.FileHandler; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.pipeline.DefaultPaths; +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.dcoref.CorefChain.CorefMention; +import edu.stanford.nlp.dcoref.CorefChain.MentionComparator; +import edu.stanford.nlp.dcoref.ScorerBCubed.BCubedType; +import edu.stanford.nlp.dcoref.sievepasses.DeterministicCorefSieve; +import edu.stanford.nlp.dcoref.sievepasses.ExactStringMatch; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.io.StringOutputStream; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.IntTuple; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; +import 
edu.stanford.nlp.util.SystemUtils; +import edu.stanford.nlp.util.logging.NewlineLogFormatter; + +/** + * Multi-pass Sieve coreference resolution system (see EMNLP 2010 paper). + *

    + * The main entry point for API is coref(Document document). + * The output is a map from CorefChain ID to corresponding CorefChain. + * + * @author Jenny Finkel + * @author Mihai Surdeanu + * @author Karthik Raghunathan + * @author Heeyoung Lee + * @author Sudarshan Rangarajan + */ +public class SieveCoreferenceSystem { + + public static final Logger logger = Logger.getLogger(SieveCoreferenceSystem.class.getName()); + + /** + * If true, we score the output of the given test document + * Assumes gold annotations are available + */ + private final boolean doScore; + + /** + * If true, we do post processing. + */ + private final boolean doPostProcessing; + + /** + * maximum sentence distance between two mentions for resolution (-1: no constraint on distance) + */ + private final int maxSentDist; + + /** + * automatically set by looking at sieves + */ + private final boolean useSemantics; + + /** + * Singleton predictor from Recasens, de Marneffe, and Potts (NAACL 2013) + */ + private final boolean useSingletonPredictor; + + /** flag for replicating CoNLL result */ + private final boolean replicateCoNLL; + + /** Path for the official CoNLL scorer */ + public final String conllMentionEvalScript; + + /** flag for optimizing ordering of sieves */ + private final boolean optimizeSieves; + /** Constraints on sieve order */ + private List> sievesKeepOrder; + + /** Final score to use for sieve optimization (default is pairwise.Precision) */ + private final String optimizeScoreType; + /** More useful break down of optimizeScoreType */ + private final boolean optimizeConllScore; + private final String optimizeMetricType; + private final CorefScorer.SubScoreType optimizeSubScoreType; + + /** + * Array of sieve passes to be used in the system + * Ordered from highest precision to lowest! + */ + private /*final */DeterministicCorefSieve [] sieves; + public /*final*/ String [] sieveClassNames; + + /** + * Dictionaries of all the useful goodies (gender, animacy, number etc. 
lists) + */ + private final Dictionaries dictionaries; + + /** + * Semantic knowledge: WordNet + */ + public final Semantics semantics; + + public LogisticClassifier singletonPredictor; + + /** Current sieve index */ + public int currentSieve; + + /** counter for links in passes (Pair) */ + public List> linksCountInPass; + + + /** Scores for each pass */ + public List scorePairwise; + public List scoreBcubed; + public List scoreMUC; + + private List scoreSingleDoc; + + /** Additional scoring stats */ + int additionalCorrectLinksCount; + int additionalLinksCount; + + public SieveCoreferenceSystem(Properties props) throws Exception { + // initialize required fields + currentSieve = -1; + + // + // construct the sieve passes + // + String sievePasses = props.getProperty(Constants.SIEVES_PROP, Constants.SIEVEPASSES); + sieveClassNames = sievePasses.trim().split(",\\s*"); + sieves = new DeterministicCorefSieve[sieveClassNames.length]; + for(int i = 0; i < sieveClassNames.length; i ++){ + sieves[i] = (DeterministicCorefSieve) Class.forName("edu.stanford.nlp.dcoref.sievepasses."+sieveClassNames[i]).getConstructor().newInstance(); + sieves[i].init(props); + } + + // + // create scoring framework + // + doScore = Boolean.parseBoolean(props.getProperty(Constants.SCORE_PROP, "false")); + + // + // setting post processing + // + doPostProcessing = Boolean.parseBoolean(props.getProperty(Constants.POSTPROCESSING_PROP, "false")); + + // + // setting singleton predictor + // + useSingletonPredictor = Boolean.parseBoolean(props.getProperty(Constants.SINGLETON_PROP, "true")); + + // + // setting maximum sentence distance between two mentions for resolution (-1: no constraint on distance) + // + maxSentDist = Integer.parseInt(props.getProperty(Constants.MAXDIST_PROP, "-1")); + + // + // set useWordNet + // + useSemantics = sievePasses.contains("AliasMatch") || sievePasses.contains("LexicalChainMatch"); + + // flag for replicating CoNLL result + replicateCoNLL = 
Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false")); + conllMentionEvalScript = props.getProperty(Constants.CONLL_SCORER, Constants.conllMentionEvalScript); + + // flag for optimizing sieve ordering + optimizeSieves = Boolean.parseBoolean(props.getProperty(Constants.OPTIMIZE_SIEVES_PROP, "false")); + optimizeScoreType = props.getProperty(Constants.OPTIMIZE_SIEVES_SCORE_PROP, "pairwise.Precision"); + + // Break down of the optimize score type + String[] validMetricTypes = { "muc", "pairwise", "bcub", "ceafe", "ceafm", "combined" }; + String[] parts = optimizeScoreType.split("\\."); + optimizeConllScore = parts.length > 2 && "conll".equalsIgnoreCase(parts[2]); + optimizeMetricType = parts[0]; + boolean optimizeMetricTypeOk = false; + for (String validMetricType : validMetricTypes) { + if (validMetricType.equalsIgnoreCase(optimizeMetricType)) { + optimizeMetricTypeOk = true; + break; + } + } + if (!optimizeMetricTypeOk) { + throw new IllegalArgumentException("Invalid metric type for " + + Constants.OPTIMIZE_SIEVES_SCORE_PROP + " property: " + optimizeScoreType); + } + optimizeSubScoreType = CorefScorer.SubScoreType.valueOf(parts[1]); + + if (optimizeSieves) { + String keepSieveOrder = props.getProperty(Constants.OPTIMIZE_SIEVES_KEEP_ORDER_PROP); + if (keepSieveOrder != null) { + String[] orderings = keepSieveOrder.split("\\s*,\\s*"); + sievesKeepOrder = new ArrayList>(); + String firstSieveConstraint = null; + String lastSieveConstraint = null; + for (String ordering:orderings) { + // Convert ordering constraints from string + Pair p = fromSieveOrderConstraintString(ordering, sieveClassNames); + // Do initial check of sieves order, can only have one where the first is ANY (< 0), and one where second is ANY (< 0) + if (p.first() < 0 && p.second() < 0) { + throw new IllegalArgumentException("Invalid ordering constraint: " + ordering); + } else if (p.first() < 0) { + if (lastSieveConstraint != null) { + throw new 
IllegalArgumentException("Cannot have these two ordering constraints: " + lastSieveConstraint + "," + ordering); + } + lastSieveConstraint = ordering; + } else if (p.second() < 0) { + if (firstSieveConstraint != null) { + throw new IllegalArgumentException("Cannot have these two ordering constraints: " + firstSieveConstraint + "," + ordering); + } + firstSieveConstraint = ordering; + } + sievesKeepOrder.add(p); + } + } + } + + if(doScore){ + initScorers(); + } + + // + // load all dictionaries + // + dictionaries = new Dictionaries(props); + semantics = (useSemantics)? new Semantics(dictionaries) : null; + + if(useSingletonPredictor){ + singletonPredictor = getSingletonPredictorFromSerializedFile(DefaultPaths.DEFAULT_DCOREF_SINGLETON_MODEL); + } + } + + public static String signature(Properties props) { + StringBuilder os = new StringBuilder(); + os.append(Constants.SIEVES_PROP + ":" + + props.getProperty(Constants.SIEVES_PROP, + Constants.SIEVEPASSES)); + os.append(Constants.SINGLETON_PROP + ":" + + props.getProperty(Constants.SINGLETON_PROP, + "false")); + os.append(Constants.SCORE_PROP + ":" + + props.getProperty(Constants.SCORE_PROP, + "false")); + os.append(Constants.POSTPROCESSING_PROP + ":" + + props.getProperty(Constants.POSTPROCESSING_PROP, + "false")); + os.append(Constants.MAXDIST_PROP + ":" + + props.getProperty(Constants.MAXDIST_PROP, + "-1")); + os.append(Constants.REPLICATECONLL_PROP + ":" + + props.getProperty(Constants.REPLICATECONLL_PROP, + "false")); + os.append(Constants.CONLL_SCORER + ":" + + props.getProperty(Constants.CONLL_SCORER, + Constants.conllMentionEvalScript)); + os.append(Dictionaries.signature(props)); + return os.toString(); + } + + public void initScorers() { + linksCountInPass = new ArrayList>(); + scorePairwise = new ArrayList(); + scoreBcubed = new ArrayList(); + scoreMUC = new ArrayList(); + for(int i = 0 ; i < sieveClassNames.length ; i++){ + scorePairwise.add(new ScorerPairwise()); + scoreBcubed.add(new 
ScorerBCubed(BCubedType.Bconll)); + scoreMUC.add(new ScorerMUC()); + linksCountInPass.add(new Pair(0, 0)); + } + } + + public boolean doScore() { return doScore; } + public Dictionaries dictionaries() { return dictionaries; } + public Semantics semantics() { return semantics; } + + /** + * Needs the following properties: + * -props 'Location of coref.properties' + * @throws Exception + */ + public static void main(String[] args) throws Exception { + Properties props = StringUtils.argsToProperties(args); + String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-"); + + // + // initialize logger + // + try { + String logFileName = props.getProperty(Constants.LOG_PROP, "log.txt"); + if(logFileName.endsWith(".txt")) { + logFileName = logFileName.substring(0, logFileName.length()-4) +"_"+ timeStamp+".txt"; + } else { + logFileName = logFileName + "_"+ timeStamp+".txt"; + } + FileHandler fh = new FileHandler(logFileName, false); + logger.addHandler(fh); + logger.setLevel(Level.FINE); + fh.setFormatter(new NewlineLogFormatter()); + } catch (SecurityException e) { + System.err.println("ERROR: cannot initialize logger!"); + throw e; + } catch (IOException e) { + System.err.println("ERROR: cannot initialize logger!"); + throw e; + } + + logger.fine(timeStamp); + logger.fine(props.toString()); + Constants.printConstants(logger); + + // initialize coref system + SieveCoreferenceSystem corefSystem = new SieveCoreferenceSystem(props); + + // MentionExtractor extracts MUC, ACE, or CoNLL documents + MentionExtractor mentionExtractor = null; + if(props.containsKey(Constants.MUC_PROP)){ + mentionExtractor = new MUCMentionExtractor(corefSystem.dictionaries, props, + corefSystem.semantics, corefSystem.singletonPredictor); + } else if(props.containsKey(Constants.ACE2004_PROP) || props.containsKey(Constants.ACE2005_PROP)) { + mentionExtractor = new ACEMentionExtractor(corefSystem.dictionaries, props, + corefSystem.semantics, 
corefSystem.singletonPredictor); + } else if (props.containsKey(Constants.CONLL2011_PROP)) { + mentionExtractor = new CoNLLMentionExtractor(corefSystem.dictionaries, props, + corefSystem.semantics, corefSystem.singletonPredictor); + } + if(mentionExtractor == null){ + throw new RuntimeException("No input file specified!"); + } + if (!Constants.USE_GOLD_MENTIONS) { + // Set mention finder + String mentionFinderClass = props.getProperty(Constants.MENTION_FINDER_PROP); + if (mentionFinderClass != null) { + String mentionFinderPropFilename = props.getProperty(Constants.MENTION_FINDER_PROPFILE_PROP); + CorefMentionFinder mentionFinder; + if (mentionFinderPropFilename != null) { + Properties mentionFinderProps = new Properties(); + mentionFinderProps.load(new FileInputStream(mentionFinderPropFilename)); + mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).getConstructor(Properties.class).newInstance(mentionFinderProps); + } else { + mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).newInstance(); + } + mentionExtractor.setMentionFinder(mentionFinder); + } + if (mentionExtractor.mentionFinder == null) { + logger.warning("No mention finder specified, but not using gold mentions"); + } + } + + if (corefSystem.optimizeSieves && corefSystem.sieves.length > 1) { + corefSystem.optimizeSieveOrdering(mentionExtractor, props, timeStamp); + } + + try { + runAndScoreCoref(corefSystem, mentionExtractor, props, timeStamp); + } catch (Exception ex) { + logger.log(Level.SEVERE, "ERROR in running coreference", ex); + } + logger.info("done"); + String endTimeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-"); + logger.fine(endTimeStamp); + } + + public static double runAndScoreCoref(SieveCoreferenceSystem corefSystem, + MentionExtractor mentionExtractor, + Properties props, + String timeStamp) throws Exception + { + // prepare conll output + PrintWriter writerGold = null; + PrintWriter writerPredicted = null; + PrintWriter 
writerPredictedCoref = null; + + String conllOutputMentionGoldFile = null; + String conllOutputMentionPredictedFile = null; + String conllOutputMentionCorefPredictedFile = null; + String conllMentionEvalFile = null; + String conllMentionEvalErrFile = null; + String conllMentionCorefEvalFile = null; + String conllMentionCorefEvalErrFile = null; + + if(Constants.PRINT_CONLL_OUTPUT || corefSystem.replicateCoNLL) { + String conllOutput = props.getProperty(Constants.CONLL_OUTPUT_PROP, "conlloutput"); + conllOutputMentionGoldFile = conllOutput + "-"+timeStamp+".gold.txt"; + conllOutputMentionPredictedFile = conllOutput +"-"+timeStamp+ ".predicted.txt"; + conllOutputMentionCorefPredictedFile = conllOutput +"-"+timeStamp+ ".coref.predicted.txt"; + conllMentionEvalFile = conllOutput +"-"+timeStamp+ ".eval.txt"; + conllMentionEvalErrFile = conllOutput +"-"+timeStamp+ ".eval.err.txt"; + conllMentionCorefEvalFile = conllOutput +"-"+timeStamp+ ".coref.eval.txt"; + conllMentionCorefEvalErrFile = conllOutput +"-"+timeStamp+ ".coref.eval.err.txt"; + logger.info("CONLL MENTION GOLD FILE: " + conllOutputMentionGoldFile); + logger.info("CONLL MENTION PREDICTED FILE: " + conllOutputMentionPredictedFile); + logger.info("CONLL MENTION EVAL FILE: " + conllMentionEvalFile); + if (!Constants.SKIP_COREF) { + logger.info("CONLL MENTION PREDICTED WITH COREF FILE: " + conllOutputMentionCorefPredictedFile); + logger.info("CONLL MENTION WITH COREF EVAL FILE: " + conllMentionCorefEvalFile); + } + writerGold = new PrintWriter(new FileOutputStream(conllOutputMentionGoldFile)); + writerPredicted = new PrintWriter(new FileOutputStream(conllOutputMentionPredictedFile)); + writerPredictedCoref = new PrintWriter(new FileOutputStream(conllOutputMentionCorefPredictedFile)); + } + + mentionExtractor.resetDocs(); + if (corefSystem.doScore()) { + corefSystem.initScorers(); + } + + // + // Parse one document at a time, and do single-doc coreference resolution in each. 
+ // + // In one iteration, orderedMentionsBySentence contains a list of all + // mentions in one document. Each mention has properties (annotations): + // its surface form (Word), NER Tag, POS Tag, Index, etc. + // + + while(true) { + + Document document = mentionExtractor.nextDoc(); + if(document==null) break; + + if(!props.containsKey(Constants.MUC_PROP)) { + printRawDoc(document, true); + printRawDoc(document, false); + } + printDiscourseStructure(document); + + if(corefSystem.doScore()){ + document.extractGoldCorefClusters(); + } + + if(Constants.PRINT_CONLL_OUTPUT || corefSystem.replicateCoNLL) { + // Not doing coref - print conll output here + printConllOutput(document, writerGold, true); + printConllOutput(document, writerPredicted, false); + } + + // run mention detection only + if(Constants.SKIP_COREF) { + continue; + } + + corefSystem.coref(document); // Do Coreference Resolution + + if(corefSystem.doScore()){ + //Identifying possible coreferring mentions in the corpus along with any recall/precision errors with gold corpus + corefSystem.printTopK(logger, document, corefSystem.semantics); + + logger.fine("pairwise score for this doc: "); + corefSystem.scoreSingleDoc.get(corefSystem.sieves.length-1).printF1(logger); + logger.fine("accumulated score: "); + corefSystem.printF1(true); + logger.fine("\n"); + } + if(Constants.PRINT_CONLL_OUTPUT || corefSystem.replicateCoNLL){ + printConllOutput(document, writerPredictedCoref, false, true); + } + } + + double finalScore = 0; + if(Constants.PRINT_CONLL_OUTPUT || corefSystem.replicateCoNLL) { + writerGold.close(); + writerPredicted.close(); + writerPredictedCoref.close(); + + //if(props.containsKey(Constants.CONLL_SCORER)) { + if (corefSystem.conllMentionEvalScript != null) { + // runConllEval(corefSystem.conllMentionEvalScript, conllOutputMentionGoldFile, conllOutputMentionPredictedFile, conllMentionEvalFile, conllMentionEvalErrFile); + + String summary = getConllEvalSummary(corefSystem.conllMentionEvalScript, 
conllOutputMentionGoldFile, conllOutputMentionPredictedFile); + logger.info("\nCONLL EVAL SUMMARY (Before COREF)"); + printScoreSummary(summary, logger, false); + + if (!Constants.SKIP_COREF) { + // runConllEval(corefSystem.conllMentionEvalScript, conllOutputMentionGoldFile, conllOutputMentionCorefPredictedFile, conllMentionCorefEvalFile, conllMentionCorefEvalErrFile); + summary = getConllEvalSummary(corefSystem.conllMentionEvalScript, conllOutputMentionGoldFile, conllOutputMentionCorefPredictedFile); + logger.info("\nCONLL EVAL SUMMARY (After COREF)"); + printScoreSummary(summary, logger, true); + printFinalConllScore(summary); + if (corefSystem.optimizeConllScore) { + finalScore = getFinalConllScore(summary, corefSystem.optimizeMetricType, corefSystem.optimizeSubScoreType.toString()); + } + } + } + } + + if (!corefSystem.optimizeConllScore && corefSystem.doScore()) { + finalScore = corefSystem.getFinalScore(corefSystem.optimizeMetricType, corefSystem.optimizeSubScoreType); + } + String scoresFile = props.getProperty(Constants.SCORE_FILE_PROP); + if (scoresFile != null) { + PrintWriter pw = IOUtils.getPrintWriter(scoresFile); + pw.println(finalScore); + pw.close(); + } + + if (corefSystem.optimizeSieves) { + logger.info("Final reported score for sieve optimization " + corefSystem.optimizeScoreType + " : " + finalScore); + } + return finalScore; + } + + /** Run and score coref distributed */ + public static void runAndScoreCorefDist(String runDistCmd, Properties props, String propsFile) throws Exception { + PrintWriter pw = IOUtils.getPrintWriter(propsFile); + props.store(pw, null); + pw.close(); + /* Run coref job in a distributed manner, score is written to file */ + List cmd = new ArrayList(); + cmd.addAll(Arrays.asList(runDistCmd.split("\\s+"))); + cmd.add("-props"); + cmd.add(propsFile); + ProcessBuilder pb = new ProcessBuilder(cmd); + // Copy environment variables over + Map curEnv = System.getenv(); + Map pbEnv = pb.environment(); + pbEnv.putAll(curEnv); + + 
logger.info("Running distributed coref:" + StringUtils.join(pb.command(), " ")); + StringWriter outSos = new StringWriter(); + StringWriter errSos = new StringWriter(); + PrintWriter out = new PrintWriter(new BufferedWriter(outSos)); + PrintWriter err = new PrintWriter(new BufferedWriter(errSos)); + SystemUtils.run(pb, out, err); + out.close(); + err.close(); + String outStr = outSos.toString(); + String errStr = errSos.toString(); + logger.info("Finished distributed coref: " + runDistCmd + ", props=" + propsFile); + logger.info("Output: " + outStr); + if (errStr.length() > 0) { + logger.info("Error: " + errStr); + } + } + + static boolean waitForFiles(File workDir, FileFilter fileFilter, int howMany) throws InterruptedException { + logger.info("Waiting until we see " + howMany + " " + fileFilter + " files in directory " + workDir + "..."); + int seconds = 0; + while (true) { + File[] checkFiles = workDir.listFiles(fileFilter); + + // we found the required number of .check files + if (checkFiles != null && checkFiles.length >= howMany) { + logger.info("Found " + checkFiles.length + " " + fileFilter + " files. Continuing execution."); + break; + } + + // sleep for while before the next check + Thread.sleep(Constants.MONITOR_DIST_CMD_FINISHED_WAIT_MILLIS); + seconds += Constants.MONITOR_DIST_CMD_FINISHED_WAIT_MILLIS / 1000; + if (seconds % 600 == 0) { + double minutes = seconds / 60; + logger.info("Still waiting... 
" + minutes + " minutes have passed."); + } + } + return true; + } + + private static int fromSieveNameToIndex(String sieveName, String[] sieveNames) + { + if ("*".equals(sieveName)) return -1; + for (int i = 0; i < sieveNames.length; i++) { + if (sieveNames[i].equals(sieveName)) { + return i; + } + } + throw new IllegalArgumentException("Invalid sieve name: " + sieveName); + } + + private static Pair fromSieveOrderConstraintString(String s, String[] sieveNames) + { + String[] parts = s.split("<"); + if (parts.length == 2) { + String first = parts[0].trim(); + String second = parts[1].trim(); + int a = fromSieveNameToIndex(first, sieveNames); + int b = fromSieveNameToIndex(second, sieveNames); + return new Pair(a,b); + } else { + throw new IllegalArgumentException("Invalid sieve ordering constraint: " + s); + } + } + + private static String toSieveOrderConstraintString(Pair orderedSieveIndices, String[] sieveNames) + { + String first = (orderedSieveIndices.first() < 0)? "*":sieveNames[orderedSieveIndices.first()]; + String second = (orderedSieveIndices.second() < 0)? 
"*":sieveNames[orderedSieveIndices.second()]; + return first + " < " + second; + } + + /** + * Given a set of sieves, select an optimal ordering for the sieves + * by iterating over sieves, and selecting the one that gives the best score and + * adding sieves one at a time until no more sieves left + */ + public void optimizeSieveOrdering(MentionExtractor mentionExtractor, Properties props, String timestamp) throws Exception + { + logger.info("=============SIEVE OPTIMIZATION START ===================="); + logger.info("Optimize sieves using score: " + optimizeScoreType); + FileFilter scoreFilesFilter = new FileFilter() { + @Override + public boolean accept(File file) { + return file.getAbsolutePath().endsWith(".score"); + } + public String toString() { + return ".score"; + } + }; + Pattern scoreFilePattern = Pattern.compile(".*sieves\\.(\\d+)\\.(\\d+).score"); + String runDistributedCmd = props.getProperty(Constants.RUN_DIST_CMD_PROP); + String mainWorkDirPath = props.getProperty(Constants.RUN_DIST_CMD_WORK_DIR, "workdir") + "-" + timestamp + File.separator; + DeterministicCorefSieve[] origSieves = sieves; + String[] origSieveNames = sieveClassNames; + Set remainingSieveIndices = Generics.newHashSet(); + for (int i = 0; i < origSieves.length; i++) { + remainingSieveIndices.add(i); + } + List optimizedOrdering = new ArrayList(); + while (!remainingSieveIndices.isEmpty()) { + // initialize array of current sieves + int curSievesNumber = optimizedOrdering.size(); + sieves = new DeterministicCorefSieve[curSievesNumber+1]; + sieveClassNames = new String[curSievesNumber+1]; + for (int i = 0; i < curSievesNumber; i++) { + sieves[i] = origSieves[optimizedOrdering.get(i)]; + sieveClassNames[i] = origSieveNames[optimizedOrdering.get(i)]; + } + logger.info("*** Optimizing Sieve ordering for pass " + curSievesNumber + " ***"); + // Get list of sieves that we can pick from for the next sieve + Set selectableSieveIndices = new TreeSet(remainingSieveIndices); + // Based on 
ordering constraints remove sieves from options + if (sievesKeepOrder != null) { + for (Pair ko:sievesKeepOrder) { + if (ko.second() < 0) { + if (remainingSieveIndices.contains(ko.first())) { + logger.info("Restrict selection to " + origSieveNames[ko.first()] + " because of constraint " + + toSieveOrderConstraintString(ko, origSieveNames)); + selectableSieveIndices = Generics.newHashSet(1); + selectableSieveIndices.add(ko.first()); + break; + } + } else if (ko.first() < 0 && remainingSieveIndices.size() > 1) { + if (remainingSieveIndices.contains(ko.second())) { + logger.info("Remove selection " + origSieveNames[ko.second()] + " because of constraint " + + toSieveOrderConstraintString(ko, origSieveNames)); + selectableSieveIndices.remove(ko.second()); + } + } else if (remainingSieveIndices.contains(ko.first())) { + if (remainingSieveIndices.contains(ko.second())) { + logger.info("Remove selection " + origSieveNames[ko.second()] + " because of constraint " + + toSieveOrderConstraintString(ko, origSieveNames)); + selectableSieveIndices.remove(ko.second()); + } + } + } + } + if (selectableSieveIndices.isEmpty()) { + throw new RuntimeException("Unable to find sieve ordering to satisfy all ordering constraints!!!!"); + } + + int selected = -1; + if (selectableSieveIndices.size() > 1) { + // Go through remaining sieves and see how well they do + List> scores = new ArrayList>(); + if (runDistributedCmd != null) { + String workDirPath = mainWorkDirPath + curSievesNumber + File.separator; + File workDir = new File(workDirPath); + workDir.mkdirs(); + workDirPath = workDir.getAbsolutePath() + File.separator; + // Start jobs + for (int potentialSieveIndex:selectableSieveIndices) { + String sieveSelectionId = curSievesNumber + "." 
+ potentialSieveIndex; + String jobDirPath = workDirPath + sieveSelectionId + File.separator; + File jobDir = new File(jobDirPath); + jobDir.mkdirs(); + Properties newProps = new Properties(); + for (String key:props.stringPropertyNames()) { + String value = props.getProperty(key); + value = value.replaceAll("\\$\\{JOBDIR\\}",jobDirPath); + newProps.setProperty(key, value); + } + // try this sieve and see how well it works + sieves[curSievesNumber] = origSieves[potentialSieveIndex]; + sieveClassNames[curSievesNumber] = origSieveNames[potentialSieveIndex]; + newProps.setProperty(Constants.OPTIMIZE_SIEVES_PROP, "false"); + newProps.setProperty(Constants.SCORE_PROP, "true"); + newProps.setProperty(Constants.SIEVES_PROP, StringUtils.join(sieveClassNames,",")); + newProps.setProperty(Constants.LOG_PROP, jobDirPath + "sieves." + sieveSelectionId + ".log"); + newProps.setProperty(Constants.SCORE_FILE_PROP, workDirPath + "sieves." + sieveSelectionId + ".score"); + if (Constants.PRINT_CONLL_OUTPUT || replicateCoNLL) { + newProps.setProperty(Constants.CONLL_OUTPUT_PROP, jobDirPath + "sieves." + sieveSelectionId + ".conlloutput"); + } + String distCmd = newProps.getProperty(Constants.RUN_DIST_CMD_PROP, runDistributedCmd); + runAndScoreCorefDist(distCmd, newProps, workDirPath + "sieves." 
+ sieveSelectionId + ".props"); + } + // Wait for jobs to finish and collect scores + waitForFiles(workDir, scoreFilesFilter,selectableSieveIndices.size()); + // Get scores + File[] scoreFiles = workDir.listFiles(scoreFilesFilter); + for (File file:scoreFiles) { + Matcher m = scoreFilePattern.matcher(file.getName()); + if (m.matches()) { + int potentialSieveIndex = Integer.parseInt(m.group(2)); + String text = IOUtils.slurpFile(file); + double score = Double.parseDouble(text); + // keeps scores so we can select best score and log them + scores.add(new Pair(score,potentialSieveIndex)); + } else { + throw new RuntimeException("Bad score file name: " + file); + } + } + } else { + for (int potentialSieveIndex:selectableSieveIndices) { + // try this sieve and see how well it works + sieves[curSievesNumber] = origSieves[potentialSieveIndex]; + sieveClassNames[curSievesNumber] = origSieveNames[potentialSieveIndex]; + logger.info("Trying sieve " + curSievesNumber + "="+ sieveClassNames[curSievesNumber] + ": "); + logger.info(" Trying sieves: " + StringUtils.join(sieveClassNames,",")); + + double score = runAndScoreCoref(this, mentionExtractor, props, timestamp); + // keeps scores so we can select best score and log them + scores.add(new Pair(score,potentialSieveIndex)); + logger.info(" Trying sieves: " + StringUtils.join(sieveClassNames,",")); + logger.info(" Trying sieves score: " + score); + } + } + // Select bestScore + double bestScore = -1; + for (Pair p:scores) { + if (selected < 0 || p.first() > bestScore) { + bestScore = p.first(); + selected = p.second(); + } + } + // log ordered scores + Collections.sort(scores); + Collections.reverse(scores); + logger.info("Ordered sieves"); + for (Pair p:scores) { + logger.info("Sieve optimization pass " + curSievesNumber + + " scores: Sieve=" + origSieveNames[p.second()] + ", score=" + p.first()); + } + } else { + // Only one sieve + logger.info("Only one choice for next sieve"); + selected = 
selectableSieveIndices.iterator().next(); + } + // log sieve we are adding + sieves[curSievesNumber] = origSieves[selected]; + sieveClassNames[curSievesNumber] = origSieveNames[selected]; + logger.info("Adding sieve " + curSievesNumber + "="+ sieveClassNames[curSievesNumber] + " to existing sieves: "); + logger.info(" Current Sieves: " + StringUtils.join(sieveClassNames,",")); + // select optimal sieve and add it to our optimized ordering + optimizedOrdering.add(selected); + remainingSieveIndices.remove(selected); + } + logger.info("Final Sieve Ordering: " + StringUtils.join(sieveClassNames, ",")); + logger.info("=============SIEVE OPTIMIZATION DONE ===================="); + } + + /** + * Extracts coreference clusters. + * This is the main API entry point for coreference resolution. + * Return a map from CorefChain ID to corresponding CorefChain. + * @throws Exception + */ + public Map coref(Document document) throws Exception { + + // Multi-pass sieve coreference resolution + for (int i = 0; i < sieves.length ; i++){ + currentSieve = i; + DeterministicCorefSieve sieve = sieves[i]; + // Do coreference resolution using this pass + coreference(document, sieve); + } + + // post processing (e.g., removing singletons, appositions for conll) + if((!Constants.USE_GOLD_MENTIONS && doPostProcessing) || replicateCoNLL) postProcessing(document); + + // coref system output: CorefChain + Map result = Generics.newHashMap(); + for(CorefCluster c : document.corefClusters.values()) { + result.put(c.clusterID, new CorefChain(c, document.positions)); + } + + return result; + } + + /** + * Do coreference resolution using one sieve pass + * @param document - an extracted document + * @throws Exception + */ + private void coreference( + Document document, + DeterministicCorefSieve sieve) throws Exception { + + List> orderedMentionsBySentence = document.getOrderedMentions(); + Map corefClusters = document.corefClusters; + Set roleSet = document.roleSet; + + logger.finest("ROLE SET (Skip 
exact string match): ------------------"); + for(Mention m : roleSet){ + logger.finest("\t"+m.spanToString()); + } + logger.finest("-------------------------------------------------------"); + + additionalCorrectLinksCount = 0; + additionalLinksCount = 0; + + for (int sentI = 0; sentI < orderedMentionsBySentence.size(); sentI++) { + List orderedMentions = orderedMentionsBySentence.get(sentI); + + for (int mentionI = 0; mentionI < orderedMentions.size(); mentionI++) { + + Mention m1 = orderedMentions.get(mentionI); + + // check for skip: first mention only, discourse salience + if(sieve.skipThisMention(document, m1, corefClusters.get(m1.corefClusterID), dictionaries)) { + continue; + } + + LOOP: + for (int sentJ = sentI; sentJ >= 0; sentJ--) { + List l = sieve.getOrderedAntecedents(sentJ, sentI, orderedMentions, orderedMentionsBySentence, m1, mentionI, corefClusters, dictionaries); + if(maxSentDist != -1 && sentI - sentJ > maxSentDist) continue; + + // Sort mentions by length whenever we have two mentions beginning at the same position and having the same head + for(int i = 0; i < l.size(); i++) { + for(int j = 0; j < l.size(); j++) { + if(l.get(i).headString.equals(l.get(j).headString) && + l.get(i).startIndex == l.get(j).startIndex && + l.get(i).sameSentence(l.get(j)) && j > i && + l.get(i).spanToString().length() > l.get(j).spanToString().length()) { + logger.finest("FLIPPED: "+l.get(i).spanToString()+"("+i+"), "+l.get(j).spanToString()+"("+j+")"); + l.set(j, l.set(i, l.get(j))); + } + } + } + + for (Mention m2 : l) { + // m2 - antecedent of m1 l + + // Skip singletons according to the singleton predictor + // (only for non-NE mentions) + // Recasens, de Marneffe, and Potts (NAACL 2013) + if (m1.isSingleton && m2.isSingleton) continue; + + if (m1.corefClusterID == m2.corefClusterID) continue; + CorefCluster c1 = corefClusters.get(m1.corefClusterID); + CorefCluster c2 = corefClusters.get(m2.corefClusterID); + if (c2 == null) { + logger.warning("NO corefcluster id 
" + m2.corefClusterID); + } + assert(c1 != null); + assert(c2 != null); + + if (sieve.useRoleSkip()) { + if (m1.isRoleAppositive(m2, dictionaries)) { + roleSet.add(m1); + } else if (m2.isRoleAppositive(m1, dictionaries)) { + roleSet.add(m2); + } + continue; + } + + if (sieve.coreferent(document, c1, c2, m1, m2, dictionaries, roleSet, semantics)) { + + // print logs for analysis + if (doScore()) { + printLogs(c1, c2, m1, m2, document, currentSieve); + } + + int removeID = c1.clusterID; + CorefCluster.mergeClusters(c2, c1); +// logger.warning("Removing cluster " + removeID + ", merged with " + c2.getClusterID()); + corefClusters.remove(removeID); + break LOOP; + } + } + } // End of "LOOP" + } + } + + // scoring + if(doScore()){ + scoreMUC.get(currentSieve).calculateScore(document); + scoreBcubed.get(currentSieve).calculateScore(document); + scorePairwise.get(currentSieve).calculateScore(document); + if(currentSieve==0) { + scoreSingleDoc = new ArrayList(); + scoreSingleDoc.add(new ScorerPairwise()); + scoreSingleDoc.get(currentSieve).calculateScore(document); + additionalCorrectLinksCount = (int) scoreSingleDoc.get(currentSieve).precisionNumSum; + additionalLinksCount = (int) scoreSingleDoc.get(currentSieve).precisionDenSum; + } else { + scoreSingleDoc.add(new ScorerPairwise()); + scoreSingleDoc.get(currentSieve).calculateScore(document); + additionalCorrectLinksCount = (int) (scoreSingleDoc.get(currentSieve).precisionNumSum - scoreSingleDoc.get(currentSieve-1).precisionNumSum); + additionalLinksCount = (int) (scoreSingleDoc.get(currentSieve).precisionDenSum - scoreSingleDoc.get(currentSieve-1).precisionDenSum); + } + linksCountInPass.get(currentSieve).setFirst(linksCountInPass.get(currentSieve).first() + additionalCorrectLinksCount); + linksCountInPass.get(currentSieve).setSecond(linksCountInPass.get(currentSieve).second() + additionalLinksCount); + + printSieveScore(document, sieve); + } + } + + /** Remove singletons, appositive, predicate nominatives, relative 
pronouns */ + private static void postProcessing(Document document) { + Set removeSet = Generics.newHashSet(); + Set removeClusterSet = Generics.newHashSet(); + + for(CorefCluster c : document.corefClusters.values()){ + Set removeMentions = Generics.newHashSet(); + for(Mention m : c.getCorefMentions()) { + if(Constants.REMOVE_APPOSITION_PREDICATENOMINATIVES + && ((m.appositions!=null && m.appositions.size() > 0) + || (m.predicateNominatives!=null && m.predicateNominatives.size() > 0) + || (m.relativePronouns!=null && m.relativePronouns.size() > 0))){ + removeMentions.add(m); + removeSet.add(document.positions.get(m)); + m.corefClusterID = m.mentionID; + } + } + c.corefMentions.removeAll(removeMentions); + if(Constants.REMOVE_SINGLETONS && c.getCorefMentions().size()==1) { + removeClusterSet.add(c.clusterID); + } + } + for (int removeId : removeClusterSet){ + document.corefClusters.remove(removeId); + } + // todo [cdm 2013]: This is buggy: positions is Map, so can't remove IntTuple + for(IntTuple pos : removeSet){ + document.positions.remove(pos); + } + } + + public static LogisticClassifier getSingletonPredictorFromSerializedFile(String serializedFile) { + try { + ObjectInputStream ois = IOUtils.readStreamFromString(serializedFile); + Object o = ois.readObject(); + if (o instanceof LogisticClassifier) { + return (LogisticClassifier) o; + } + throw new ClassCastException("Wanted SingletonPredictor, got " + o.getClass()); + } catch (IOException e) { + throw new RuntimeIOException(e); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** Remove singleton clusters */ + public static List> filterMentionsWithSingletonClusters(Document document, List> mentions) + { + + List> res = new ArrayList>(mentions.size()); + for (List ml:mentions) { + List filtered = new ArrayList(); + for (Mention m:ml) { + CorefCluster cluster = document.corefClusters.get(m.corefClusterID); + if (cluster != null && cluster.getCorefMentions().size() > 1) { + 
filtered.add(m); + } + } + res.add(filtered); + } + return res; + } + public static void runConllEval(String conllMentionEvalScript, + String goldFile, String predictFile, String evalFile, String errFile) throws IOException + { + ProcessBuilder process = new ProcessBuilder(conllMentionEvalScript, "all", goldFile, predictFile); + PrintWriter out = new PrintWriter(new FileOutputStream(evalFile)); + PrintWriter err = new PrintWriter(new FileOutputStream(errFile)); + SystemUtils.run(process, out, err); + out.close(); + err.close(); + } + + public static String getConllEvalSummary(String conllMentionEvalScript, + String goldFile, String predictFile) throws IOException + { + ProcessBuilder process = new ProcessBuilder(conllMentionEvalScript, "all", goldFile, predictFile, "none"); + StringOutputStream errSos = new StringOutputStream(); + StringOutputStream outSos = new StringOutputStream(); + PrintWriter out = new PrintWriter(outSos); + PrintWriter err = new PrintWriter(errSos); + SystemUtils.run(process, out, err); + out.close(); + err.close(); + String summary = outSos.toString(); + String errStr = errSos.toString(); + if (errStr.length() > 0) { + summary += "\nERROR: " + errStr; + } + return summary; + } + + /** Print logs for error analysis */ + public void printTopK(Logger logger, Document document, Semantics semantics) { + + List> orderedMentionsBySentence = document.getOrderedMentions(); + Map corefClusters = document.corefClusters; + Map positions = document.positions; + Map golds = document.allGoldMentions; + + logger.fine("=======ERROR ANALYSIS========================================================="); + + for (int i = 0 ; i < orderedMentionsBySentence.size(); i++) { + List orderedMentions = orderedMentionsBySentence.get(i); + for (int j = 0 ; j < orderedMentions.size(); j++) { + Mention m = orderedMentions.get(j); + logger.fine("=========Line: "+i+"\tmention: "+j+"======================================================="); + 
logger.fine(m.spanToString()+"\tmentionID: "+m.mentionID+"\tcorefClusterID: "+m.corefClusterID+"\tgoldCorefClusterID: "+m.goldCorefClusterID); + CorefCluster corefCluster = corefClusters.get(m.corefClusterID); + if (corefCluster != null) { + corefCluster.printCorefCluster(logger); + } else { + logger.finer("CANNOT find coref cluster for cluster " + m.corefClusterID); + } + logger.fine("-------------------------------------------------------"); + + boolean oneRecallErrorPrinted = false; + boolean onePrecisionErrorPrinted = false; + boolean alreadyChoose = false; + + for (int sentJ = i; sentJ >= 0; sentJ--) { + List l = (new ExactStringMatch()).getOrderedAntecedents(sentJ, i, orderedMentions, orderedMentionsBySentence, m, j, corefClusters, dictionaries); + + // Sort mentions by length whenever we have two mentions beginning at the same position and having the same head + for(int ii = 0; ii < l.size(); ii++) { + for(int jj = 0; jj < l.size(); jj++) { + if(l.get(ii).headString.equals(l.get(jj).headString) && + l.get(ii).startIndex == l.get(jj).startIndex && + l.get(ii).sameSentence(l.get(jj)) && jj > ii && + l.get(ii).spanToString().length() > l.get(jj).spanToString().length()) { + logger.finest("FLIPPED: "+l.get(ii).spanToString()+"("+ii+"), "+l.get(jj).spanToString()+"("+jj+")"); + l.set(jj, l.set(ii, l.get(jj))); + } + } + } + + logger.finest("Candidates in sentence #"+sentJ+" for mention: "+m.spanToString()); + for(int ii = 0; ii < l.size(); ii ++){ + logger.finest("\tCandidate #"+ii+": "+l.get(ii).spanToString()); + } + + for (Mention antecedent : l) { + boolean chosen = (m.corefClusterID == antecedent.corefClusterID); + IntTuple src = new IntTuple(2); + src.set(0,i); + src.set(1,j); + + IntTuple ant = positions.get(antecedent); + //correct=(chosen==goldLinks.contains(new Pair(src,ant))); + boolean coreferent = golds.containsKey(m.mentionID) + && golds.containsKey(antecedent.mentionID) + && (golds.get(m.mentionID).goldCorefClusterID == 
golds.get(antecedent.mentionID).goldCorefClusterID); + boolean correct = (chosen == coreferent); + + String chosenness = chosen ? "Chosen" : "Not Chosen"; + String correctness = correct ? "Correct" : "Incorrect"; + logger.fine("\t" + correctness +"\t\t" + chosenness + "\t"+antecedent.spanToString()); + CorefCluster mC = corefClusters.get(m.corefClusterID); + CorefCluster aC = corefClusters.get(antecedent.corefClusterID); + + if(chosen && !correct && !onePrecisionErrorPrinted && !alreadyChoose) { + onePrecisionErrorPrinted = true; + printLinkWithContext(logger, "\nPRECISION ERROR ", src, ant, document, semantics); + logger.fine("END of PRECISION ERROR LOG"); + } + + if(!chosen && !correct && !oneRecallErrorPrinted && (!alreadyChoose || (alreadyChoose && onePrecisionErrorPrinted))) { + oneRecallErrorPrinted = true; + printLinkWithContext(logger, "\nRECALL ERROR ", src, ant, document, semantics); + + logger.finer("cluster info: "); + if (mC != null) { + mC.printCorefCluster(logger); + } else { + logger.finer("CANNOT find coref cluster for cluster " + m.corefClusterID); + } + logger.finer("----------------------------------------------------------"); + if (aC != null) { + aC.printCorefCluster(logger); + } else { + logger.finer("CANNOT find coref cluster for cluster " + m.corefClusterID); + } + logger.finer(""); + logger.fine("END of RECALL ERROR LOG"); + } + if(chosen) alreadyChoose = true; + } + } + logger.fine("\n"); + } + } + logger.fine("==============================================================================="); + } + + public void printF1(boolean printF1First) { + scoreMUC.get(sieveClassNames.length - 1).printF1(logger, printF1First); + scoreBcubed.get(sieveClassNames.length - 1).printF1(logger, printF1First); + scorePairwise.get(sieveClassNames.length - 1).printF1(logger, printF1First); + } + + private void printSieveScore(Document document, DeterministicCorefSieve sieve) { + logger.fine("==========================================="); + 
logger.fine("pass"+currentSieve+": "+ sieve.flagsToString()); + scoreMUC.get(currentSieve).printF1(logger); + scoreBcubed.get(currentSieve).printF1(logger); + scorePairwise.get(currentSieve).printF1(logger); + logger.fine("# of Clusters: "+document.corefClusters.size() + ",\t# of additional links: "+additionalLinksCount + +",\t# of additional correct links: "+additionalCorrectLinksCount + +",\tprecision of new links: "+1.0*additionalCorrectLinksCount/additionalLinksCount); + logger.fine("# of total additional links: "+linksCountInPass.get(currentSieve).second() + +",\t# of total additional correct links: "+linksCountInPass.get(currentSieve).first() + +",\taccumulated precision of this pass: "+1.0*linksCountInPass.get(currentSieve).first()/linksCountInPass.get(currentSieve).second()); + logger.fine("--------------------------------------"); + } + /** Print coref link info */ + private static void printLink(Logger logger, String header, IntTuple src, IntTuple dst, List> orderedMentionsBySentence) { + Mention srcMention = orderedMentionsBySentence.get(src.get(0)).get(src.get(1)); + Mention dstMention = orderedMentionsBySentence.get(dst.get(0)).get(dst.get(1)); + if(src.get(0)==dst.get(0)) { + logger.fine(header + ": ["+srcMention.spanToString()+"](id="+srcMention.mentionID + +") in sent #"+src.get(0)+" => ["+dstMention.spanToString()+"](id="+dstMention.mentionID+") in sent #"+dst.get(0) + " Same Sentence"); + } else { + logger.fine(header + ": ["+srcMention.spanToString()+"](id="+srcMention.mentionID + +") in sent #"+src.get(0)+" => ["+dstMention.spanToString()+"](id="+dstMention.mentionID+") in sent #"+dst.get(0)); + } + } + + protected static void printList(Logger logger, String... 
args) { + StringBuilder sb = new StringBuilder(); + for (String arg : args) { + sb.append(arg); + sb.append('\t'); + } + logger.fine(sb.toString()); + } + + /** print a coref link information including context and parse tree */ + private static void printLinkWithContext(Logger logger, + String header, + IntTuple src, + IntTuple dst, + Document document, Semantics semantics + ) { + List> orderedMentionsBySentence = document.getOrderedMentions(); + List> goldOrderedMentionsBySentence = document.goldOrderedMentionsBySentence; + + Mention srcMention = orderedMentionsBySentence.get(src.get(0)).get(src.get(1)); + Mention dstMention = orderedMentionsBySentence.get(dst.get(0)).get(dst.get(1)); + List srcSentence = srcMention.sentenceWords; + List dstSentence = dstMention.sentenceWords; + + printLink(logger, header, src, dst, orderedMentionsBySentence); + + printList(logger, "Mention:" + srcMention.spanToString(), + "Gender:" + srcMention.gender.toString(), + "Number:" + srcMention.number.toString(), + "Animacy:" + srcMention.animacy.toString(), + "Person:" + srcMention.person.toString(), + "NER:" + srcMention.nerString, + "Head:" + srcMention.headString, + "Type:" + srcMention.mentionType.toString(), + "utter: "+srcMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class), + "speakerID: "+srcMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class), + "twinless:" + srcMention.twinless); + logger.fine("Context:"); + + String p = ""; + for(int i = 0; i < srcSentence.size(); i++) { + if (i == srcMention.startIndex) { + p += "["; + } + if (i == srcMention.endIndex) { + p += "]"; + } + p += srcSentence.get(i).word() + " "; + } + logger.fine(p); + + StringBuilder golds = new StringBuilder(); + golds.append("Gold mentions in the sentence:\n"); + Counter mBegin = new ClassicCounter(); + Counter mEnd = new ClassicCounter(); + + for(Mention m : goldOrderedMentionsBySentence.get(src.get(0))){ + mBegin.incrementCount(m.startIndex); + mEnd.incrementCount(m.endIndex); + } + 
List l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(src.get(0)).get(CoreAnnotations.TokensAnnotation.class); + for(int i = 0 ; i < l.size() ; i++){ + for(int j = 0; j < mEnd.getCount(i); j++){ + golds.append("]"); + } + for(int j = 0; j < mBegin.getCount(i); j++){ + golds.append("["); + } + golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class)); + golds.append(" "); + } + logger.fine(golds.toString()); + + printList(logger, "\nAntecedent:" + dstMention.spanToString(), + "Gender:" + dstMention.gender.toString(), + "Number:" + dstMention.number.toString(), + "Animacy:" + dstMention.animacy.toString(), + "Person:" + dstMention.person.toString(), + "NER:" + dstMention.nerString, + "Head:" + dstMention.headString, + "Type:" + dstMention.mentionType.toString(), + "utter: "+dstMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class), + "speakerID: "+dstMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class), + "twinless:" + dstMention.twinless); + logger.fine("Context:"); + + p = ""; + for(int i = 0; i < dstSentence.size(); i++) { + if (i == dstMention.startIndex) { + p += "["; + } + if (i == dstMention.endIndex) { + p += "]"; + } + p += dstSentence.get(i).word() + " "; + } + logger.fine(p); + + golds = new StringBuilder(); + golds.append("Gold mentions in the sentence:\n"); + mBegin = new ClassicCounter(); + mEnd = new ClassicCounter(); + + for(Mention m : goldOrderedMentionsBySentence.get(dst.get(0))){ + mBegin.incrementCount(m.startIndex); + mEnd.incrementCount(m.endIndex); + } + l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(dst.get(0)).get(CoreAnnotations.TokensAnnotation.class); + for(int i = 0 ; i < l.size() ; i++){ + for(int j = 0; j < mEnd.getCount(i); j++){ + golds.append("]"); + } + for(int j = 0; j < mBegin.getCount(i); j++){ + golds.append("["); + } + golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class)); + golds.append(" "); + } + logger.fine(golds.toString()); + + 
logger.finer("\nMention:: --------------------------------------------------------"); + try { + logger.finer(srcMention.dependency.toString()); + } catch (Exception e){} //throw new RuntimeException(e);} + logger.finer("Parse:"); + logger.finer(formatPennTree(srcMention.contextParseTree)); + logger.finer("\nAntecedent:: -----------------------------------------------------"); + try { + logger.finer(dstMention.dependency.toString()); + } catch (Exception e){} //throw new RuntimeException(e);} + logger.finer("Parse:"); + logger.finer(formatPennTree(dstMention.contextParseTree)); + } + /** For printing tree in a better format */ + public static String formatPennTree(Tree parseTree) { + String treeString = parseTree.pennString(); + treeString = treeString.replaceAll("\\[TextAnnotation=", ""); + treeString = treeString.replaceAll("(NamedEntityTag|Value|Index|PartOfSpeech)Annotation.+?\\)", ")"); + treeString = treeString.replaceAll("\\[.+?\\]", ""); + return treeString; + } + + /** Print pass results */ + private static void printLogs(CorefCluster c1, CorefCluster c2, Mention m1, + Mention m2, Document document, int sieveIndex) { + Map positions = document.positions; + List> orderedMentionsBySentence = document.getOrderedMentions(); + List> goldLinks = document.getGoldLinks(); + + IntTuple p1 = positions.get(m1); + assert(p1 != null); + IntTuple p2 = positions.get(m2); + assert(p2 != null); + + int menDist = 0; + for (int i = p2.get(0) ; i<= p1.get(0) ; i++){ + if(p1.get(0)==p2.get(0)) { + menDist = p1.get(1)-p2.get(1); + break; + } + if(i==p2.get(0)) { + menDist += orderedMentionsBySentence.get(p2.get(0)).size()-p2.get(1); + continue; + } + if(i==p1.get(0)) { + menDist += p1.get(1); + continue; + } + if(p2.get(0)(p1,p2)))? 
"\tCorrect" : "\tIncorrect"; + logger.finest("\nsentence distance: "+(p1.get(0)-p2.get(0))+"\tmention distance: "+menDist + correct); + + if(!goldLinks.contains(new Pair(p1,p2))){ + logger.finer("-------Incorrect merge in pass"+sieveIndex+"::--------------------"); + c1.printCorefCluster(logger); + logger.finer("--------------------------------------------"); + c2.printCorefCluster(logger); + logger.finer("--------------------------------------------"); + } + logger.finer("antecedent: "+m2.spanToString()+"("+m2.mentionID+")\tmention: "+m1.spanToString()+"("+m1.mentionID+")\tsentDistance: "+Math.abs(m1.sentNum-m2.sentNum)+"\t"+correct+" Pass"+sieveIndex+":"); + } + + private static void printDiscourseStructure(Document document) { + logger.finer("DISCOURSE STRUCTURE=============================="); + logger.finer("doc type: "+document.docType); + int previousUtterIndex = -1; + String previousSpeaker = ""; + StringBuilder sb = new StringBuilder(); + for(CoreMap s : document.annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel l : s.get(CoreAnnotations.TokensAnnotation.class)) { + int utterIndex = l.get(CoreAnnotations.UtteranceAnnotation.class); + String speaker = l.get(CoreAnnotations.SpeakerAnnotation.class); + String word = l.get(CoreAnnotations.TextAnnotation.class); + if(previousUtterIndex!=utterIndex) { + try { + int previousSpeakerID = Integer.parseInt(previousSpeaker); + logger.finer("\n: "+previousUtterIndex + " : "+document.allPredictedMentions.get(previousSpeakerID).spanToString()); + } catch (Exception e) { + logger.finer("\n: "+previousUtterIndex + " : "+previousSpeaker); + } + + logger.finer(sb.toString()); + sb.setLength(0); + previousUtterIndex = utterIndex; + previousSpeaker = speaker; + } + sb.append(" ").append(word); + } + sb.append("\n"); + } + try { + int previousSpeakerID = Integer.parseInt(previousSpeaker); + logger.finer("\n: "+previousUtterIndex + " : 
"+document.allPredictedMentions.get(previousSpeakerID).spanToString()); + } catch (Exception e) { + logger.finer("\n: "+previousUtterIndex + " : "+previousSpeaker); + } + logger.finer(sb.toString()); + logger.finer("END OF DISCOURSE STRUCTURE=============================="); + } + + private static void printScoreSummary(String summary, Logger logger, boolean afterPostProcessing) { + String[] lines = summary.split("\n"); + if(!afterPostProcessing) { + for(String line : lines) { + if(line.startsWith("Identification of Mentions")) { + logger.info(line); + return; + } + } + } else { + StringBuilder sb = new StringBuilder(); + for(String line : lines) { + if(line.startsWith("METRIC")) sb.append(line); + if(!line.startsWith("Identification of Mentions") && line.contains("Recall")) { + sb.append(line).append("\n"); + } + } + logger.info(sb.toString()); + } + } + /** Print average F1 of MUC, B^3, CEAF_E */ + private static void printFinalConllScore(String summary) { + Pattern f1 = Pattern.compile("Coreference:.*F1: (.*)%"); + Matcher f1Matcher = f1.matcher(summary); + double[] F1s = new double[5]; + int i = 0; + while (f1Matcher.find()) { + F1s[i++] = Double.parseDouble(f1Matcher.group(1)); + } + double finalScore = (F1s[0]+F1s[1]+F1s[3])/3; + logger.info("Final conll score ((muc+bcub+ceafe)/3) = " + finalScore); + } + + private static double getFinalConllScore(String summary, String metricType, String scoreType) { + // metricType can be muc, bcub, ceafm, ceafe or combined + // Expects to match metricType muc, bcub, ceafm, ceafe + // Will not match the BLANC metrics (coref links, noncoref links, overall) + Pattern pattern = Pattern.compile("METRIC\\s+(.*):Coreference:.*" + scoreType + ":\\s*(\\([ 0-9./]*\\))?\\s*(\\d+(\\.\\d+)?)%"); + Matcher matcher = pattern.matcher(summary); + double[] scores = new double[5]; + String[] names = new String[5]; + int i = 0; + while (matcher.find()) { + names[i] = matcher.group(1); + scores[i] = Double.parseDouble(matcher.group(3)); + i++; 
+ } + metricType = metricType.toLowerCase(); + if ("combined".equals(metricType)) { + double finalScore = (scores[0]+scores[1]+scores[3])/3; + logger.info("Final conll score ((muc+bcub+ceafe)/3) " + scoreType + " = " + finalScore); + return finalScore; + } else { + if ("bcubed".equals(metricType)) { + metricType = "bcub"; + } + for (i = 0; i < names.length; i++) { + if (names[i] != null && names[i].equals(metricType)) { + double finalScore = scores[i]; + logger.info("Final conll score (" + metricType + ") " + scoreType + " = " + finalScore); + return finalScore; + } + } + throw new IllegalArgumentException("Invalid metricType:" + metricType); + } + } + + /** Returns final selected score */ + private double getFinalScore(String metricType, CorefScorer.SubScoreType subScoreType) { + metricType = metricType.toLowerCase(); + int passIndex = sieveClassNames.length - 1; + String scoreDesc = metricType; + double finalScore; + if ("combined".equals(metricType)) { + finalScore = (scoreMUC.get(passIndex).getScore(subScoreType) + + scoreBcubed.get(passIndex).getScore(subScoreType) + + scorePairwise.get(passIndex).getScore(subScoreType))/3; + scoreDesc = "(muc + bcub + pairwise)/3"; + } else if ("muc".equals(metricType)) { + finalScore = scoreMUC.get(passIndex).getScore(subScoreType); + } else if ("bcub".equals(metricType) || "bcubed".equals(metricType)) { + finalScore = scoreBcubed.get(passIndex).getScore(subScoreType); + } else if ("pairwise".equals(metricType)) { + finalScore = scorePairwise.get(passIndex).getScore(subScoreType); + } else { + throw new IllegalArgumentException("Invalid sub score type:" + subScoreType); + } + logger.info("Final score (" + scoreDesc + ") " + subScoreType + " = " + finalScore); + return finalScore; + } + + public static void printConllOutput(Document document, PrintWriter writer, boolean gold) { + printConllOutput(document, writer, gold, false); + } + + public static void printConllOutput(Document document, PrintWriter writer, boolean gold, 
boolean filterSingletons) { + List> orderedMentions; + if (gold) { + orderedMentions = document.goldOrderedMentionsBySentence; + } else { + orderedMentions = document.predictedOrderedMentionsBySentence; + } + if (filterSingletons) { + orderedMentions = filterMentionsWithSingletonClusters(document, orderedMentions); + } + printConllOutput(document, writer, orderedMentions, gold); + } + + public static void printConllOutput(Document document, PrintWriter writer, List> orderedMentions, boolean gold) + { + Annotation anno = document.annotation; + List> conllDocSentences = document.conllDoc.sentenceWordLists; + String docID = anno.get(CoreAnnotations.DocIDAnnotation.class); + StringBuilder sb = new StringBuilder(); + sb.append("#begin document ").append(docID).append("\n"); + List sentences = anno.get(CoreAnnotations.SentencesAnnotation.class); + for(int sentNum = 0 ; sentNum < sentences.size() ; sentNum++){ + List sentence = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class); + List conllSentence = conllDocSentences.get(sentNum); + Map> mentionBeginOnly = Generics.newHashMap(); + Map> mentionEndOnly = Generics.newHashMap(); + Map> mentionBeginEnd = Generics.newHashMap(); + + for(int i=0 ; i()); + mentionEndOnly.put(i, new LinkedHashSet()); + mentionBeginEnd.put(i, new LinkedHashSet()); + } + + for(Mention m : orderedMentions.get(sentNum)) { + if(m.startIndex==m.endIndex-1) { + mentionBeginEnd.get(m.startIndex).add(m); + } else { + mentionBeginOnly.get(m.startIndex).add(m); + mentionEndOnly.get(m.endIndex-1).add(m); + } + } + + for(int i=0 ; i 0) { + sb2.append("|"); + } + int corefClusterId = (gold)? m.goldCorefClusterID:m.corefClusterID; + sb2.append("(").append(corefClusterId); + } + for(Mention m : mentionBeginEnd.get(i)){ + if (sb2.length() > 0) { + sb2.append("|"); + } + int corefClusterId = (gold)? 
m.goldCorefClusterID:m.corefClusterID; + sb2.append("(").append(corefClusterId).append(")"); + } + for(Mention m : mentionEndOnly.get(i)){ + if (sb2.length() > 0) { + sb2.append("|"); + } + int corefClusterId = (gold)? m.goldCorefClusterID:m.corefClusterID; + sb2.append(corefClusterId).append(")"); + } + if(sb2.length() == 0) sb2.append("-"); + + String[] columns = conllSentence.get(i); + for(int j = 0 ; j < columns.length-1 ; j++){ + String column = columns[j]; + sb.append(column).append("\t"); + } + sb.append(sb2).append("\n"); + } + sb.append("\n"); + } + + sb.append("#end document").append("\n"); + // sb.append("#end document ").append(docID).append("\n"); + + writer.print(sb.toString()); + writer.flush(); + } + + /** Print raw document for analysis */ + public static void printRawDoc(Document document, boolean gold) throws FileNotFoundException { + List sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class); + List> allMentions; + if (gold) { + allMentions = document.goldOrderedMentionsBySentence; + } else { + allMentions = document.predictedOrderedMentionsBySentence; + } + // String filename = document.annotation.get() + + StringBuilder doc = new StringBuilder(); + int previousOffset = 0; + + for(int i = 0 ; i mentions = allMentions.get(i); + + List t = sentence.get(CoreAnnotations.TokensAnnotation.class); + String[] tokens = new String[t.size()]; + for(CoreLabel c : t) { + tokens[c.index()-1] = c.word(); + } + if(previousOffset+2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) { + doc.append("\n"); + } + previousOffset = t.get(t.size()-1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + Counter startCounts = new ClassicCounter(); + Counter endCounts = new ClassicCounter(); + Map> endMentions = Generics.newHashMap(); + for (Mention m : mentions) { + startCounts.incrementCount(m.startIndex); + endCounts.incrementCount(m.endIndex); + if(!endMentions.containsKey(m.endIndex)) endMentions.put(m.endIndex, 
Generics.newHashSet()); + endMentions.get(m.endIndex).add(m); + } + for (int j = 0 ; j < tokens.length; j++){ + if(endMentions.containsKey(j)) { + for(Mention m : endMentions.get(j)){ + int corefChainId = (gold)? m.goldCorefClusterID: m.corefClusterID; + doc.append("]_").append(corefChainId); + } + } + for (int k = 0 ; k < startCounts.getCount(j) ; k++) { + if (doc.length() > 0 && doc.charAt(doc.length()-1) != '[') doc.append(" "); + doc.append("["); + } + if (doc.length() > 0 && doc.charAt(doc.length()-1)!='[') doc.append(" "); + doc.append(tokens[j]); + } + if(endMentions.containsKey(tokens.length)) { + for(Mention m : endMentions.get(tokens.length)){ + int corefChainId = (gold)? m.goldCorefClusterID: m.corefClusterID; + doc.append("]_").append(corefChainId); //append("_").append(m.mentionID); + } + } + + doc.append("\n"); + } + logger.fine(document.annotation.get(CoreAnnotations.DocIDAnnotation.class)); + if (gold) { + logger.fine("New DOC: (GOLD MENTIONS) =================================================="); + } else { + logger.fine("New DOC: (Predicted Mentions) =================================================="); + } + logger.fine(doc.toString()); + } + public static List> getLinks( + Map result) { + List> links = new ArrayList>(); + MentionComparator comparator = new MentionComparator(); + + for(CorefChain c : result.values()) { + List s = c.getMentionsInTextualOrder(); + for(CorefMention m1 : s){ + for(CorefMention m2 : s){ + if(comparator.compare(m1, m2)==1) links.add(new Pair(m1.position, m2.position)); + } + } + } + return links; + } + + public static void debugPrintMentions(PrintStream out, String tag, List> mentions) + { + for(int i = 0; i < mentions.size(); i ++){ + out.println(tag + " SENTENCE " + i); + for(int j = 0; j < mentions.get(i).size(); j ++){ + Mention m = mentions.get(i).get(j); + String ms = "(" + m.mentionID + "," + m.originalRef + "," + m.corefClusterID + + ",[" + m.startIndex + "," + m.endIndex +"]" + ") "; + out.print(ms); + } + 
out.println(); + } + } + + public static boolean checkClusters(Logger logger, String tag, Document document) + { + List> mentions = document.getOrderedMentions(); + boolean clustersOk = true; + for (List mentionCluster : mentions) { + for (Mention m : mentionCluster) { + String ms = "(" + m.mentionID + "," + m.originalRef + "," + m.corefClusterID + + ",[" + m.startIndex + "," + m.endIndex + "]" + ") "; + CorefCluster cluster = document.corefClusters.get(m.corefClusterID); + if (cluster == null) { + logger.warning(tag + ": Cluster not found for mention: " + ms); + clustersOk = false; + } else if (!cluster.getCorefMentions().contains(m)) { + logger.warning(tag + ": Cluster does not contain mention: " + ms); + clustersOk = false; + } + } + } + return clustersOk; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/SieveOptions.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/SieveOptions.java new file mode 100644 index 0000000..7dc69f6 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/SieveOptions.java @@ -0,0 +1,95 @@ +package edu.stanford.nlp.dcoref; + +public class SieveOptions { + public boolean DO_PRONOUN; + public boolean USE_iwithini; + public boolean USE_APPOSITION; + public boolean USE_PREDICATENOMINATIVES; + public boolean USE_ACRONYM; + public boolean USE_RELATIVEPRONOUN; + public boolean USE_ROLEAPPOSITION; + public boolean USE_EXACTSTRINGMATCH; + public boolean USE_INCLUSION_HEADMATCH; + public boolean USE_RELAXED_HEADMATCH; + public boolean USE_INCOMPATIBLE_MODIFIER; + public boolean USE_DEMONYM; + public boolean USE_WORDS_INCLUSION; + public boolean USE_ROLE_SKIP; + public boolean USE_RELAXED_EXACTSTRINGMATCH; + public boolean USE_ATTRIBUTES_AGREE; + public boolean USE_WN_HYPERNYM; + public boolean USE_WN_SYNONYM; + public boolean USE_DIFFERENT_LOCATION; + public boolean USE_NUMBER_IN_MENTION; + public boolean 
USE_PROPERHEAD_AT_LAST; + public boolean USE_ALIAS; + public boolean USE_SLOT_MATCH; + public boolean USE_DISCOURSEMATCH; + public boolean USE_DISTANCE; + public boolean USE_NUMBER_ANIMACY_NE_AGREE; + public boolean USE_COREF_DICT; + + public String toString() { + StringBuilder os = new StringBuilder(); + os.append("{"); + if(DO_PRONOUN) os.append("DO_PRONOUN"); + if(USE_iwithini) os.append(", USE_iwithini"); + if(USE_APPOSITION) os.append(", USE_APPOSITION"); + if(USE_PREDICATENOMINATIVES) os.append(", USE_PREDICATENOMINATIVES"); + if(USE_ACRONYM) os.append(", USE_ACRONYM"); + if(USE_RELATIVEPRONOUN) os.append(", USE_RELATIVEPRONOUN"); + if(USE_ROLEAPPOSITION) os.append(", USE_ROLEAPPOSITION"); + if(USE_EXACTSTRINGMATCH) os.append(", USE_EXACTSTRINGMATCH"); + if(USE_INCLUSION_HEADMATCH) os.append(", USE_INCLUSION_HEADMATCH"); + if(USE_RELAXED_HEADMATCH) os.append(", USE_RELAXED_HEADMATCH"); + if(USE_INCOMPATIBLE_MODIFIER) os.append(", USE_INCOMPATIBLE_MODIFIER"); + if(USE_DEMONYM) os.append(", USE_DEMONYM"); + if(USE_WORDS_INCLUSION) os.append(", USE_WORDS_INCLUSION"); + if(USE_ROLE_SKIP) os.append(", USE_ROLE_SKIP"); + if(USE_RELAXED_EXACTSTRINGMATCH) os.append(", USE_RELAXED_EXACTSTRINGMATCH"); + if(USE_ATTRIBUTES_AGREE) os.append(", USE_ATTRIBUTES_AGREE"); + if(USE_WN_HYPERNYM) os.append(", USE_WN_HYPERNYM"); + if(USE_WN_SYNONYM) os.append(", USE_WN_SYNONYM"); + if(USE_DIFFERENT_LOCATION) os.append(", USE_DIFFERENT_LOCATION"); + if(USE_NUMBER_IN_MENTION) os.append(", USE_NUMBER_IN_MENTION"); + if(USE_PROPERHEAD_AT_LAST) os.append(", USE_PROPERHEAD_AT_LAST"); + if(USE_ALIAS) os.append(", USE_ALIAS"); + if(USE_SLOT_MATCH) os.append(", USE_SLOT_MATCH"); + if(USE_DISCOURSEMATCH) os.append(", USE_DISCOURSEMATCH"); + if(USE_DISTANCE) os.append(", USE_DISTANCE"); + if(USE_NUMBER_ANIMACY_NE_AGREE) os.append(", USE_NUMBER_ANIMACY_NE_AGREE"); + if(USE_COREF_DICT) os.append(", USE_COREF_DICT"); + os.append("}"); + return os.toString(); + } + + public SieveOptions() { + 
DO_PRONOUN= false; + USE_iwithini = false; + USE_APPOSITION = false; + USE_PREDICATENOMINATIVES = false; + USE_ACRONYM = false; + USE_RELATIVEPRONOUN = false; + USE_ROLEAPPOSITION = false; + USE_EXACTSTRINGMATCH = false; + USE_INCLUSION_HEADMATCH = false; + USE_RELAXED_HEADMATCH = false; + USE_INCOMPATIBLE_MODIFIER = false; + USE_DEMONYM = false; + USE_WORDS_INCLUSION = false; + USE_ROLE_SKIP = false; + USE_RELAXED_EXACTSTRINGMATCH = false; + USE_ATTRIBUTES_AGREE = false; + USE_WN_HYPERNYM = false; + USE_WN_SYNONYM = false; + USE_DIFFERENT_LOCATION = false; + USE_NUMBER_IN_MENTION = false; + USE_PROPERHEAD_AT_LAST = false; + USE_ALIAS = false; + USE_SLOT_MATCH = false; + USE_DISCOURSEMATCH = false; + USE_DISTANCE = false; + USE_NUMBER_ANIMACY_NE_AGREE = false; + USE_COREF_DICT = false; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/AliasMatch.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/AliasMatch.java new file mode 100644 index 0000000..664418a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/AliasMatch.java @@ -0,0 +1,12 @@ +package edu.stanford.nlp.dcoref.sievepasses; + +public class AliasMatch extends DeterministicCorefSieve { + public AliasMatch() { + super(); + flags.USE_iwithini = true; + flags.USE_ATTRIBUTES_AGREE = true; + flags.USE_ALIAS = true; + flags.USE_DIFFERENT_LOCATION = true; + flags.USE_NUMBER_IN_MENTION = true; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/CorefDictionaryMatch.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/CorefDictionaryMatch.java new file mode 100644 index 0000000..1ed127b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/CorefDictionaryMatch.java @@ -0,0 +1,21 @@ +package 
edu.stanford.nlp.dcoref.sievepasses; + +/** + * Sieve that uses the coreference dictionary for the technical domain + * developed by Recasens, Can and Jurafsky (NAACL 2013). + * + * @author recasens + */ +public class CorefDictionaryMatch extends DeterministicCorefSieve { + + public CorefDictionaryMatch(){ + super(); + flags.USE_iwithini = true; + flags.USE_DIFFERENT_LOCATION = true; + flags.USE_NUMBER_IN_MENTION = true; + flags.USE_DISTANCE = true; + flags.USE_ATTRIBUTES_AGREE = true; + flags.USE_COREF_DICT = true; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/DeterministicCorefSieve.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/DeterministicCorefSieve.java new file mode 100644 index 0000000..e8c675a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/dcoref/sievepasses/DeterministicCorefSieve.java @@ -0,0 +1,459 @@ +// +// StanfordCoreNLP -- a suite of NLP tools +// Copyright (c) 2009-2010 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// + +package edu.stanford.nlp.dcoref.sievepasses; + +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import edu.stanford.nlp.dcoref.Constants; +import edu.stanford.nlp.dcoref.CorefCluster; +import edu.stanford.nlp.dcoref.Dictionaries; +import edu.stanford.nlp.dcoref.Dictionaries.MentionType; +import edu.stanford.nlp.dcoref.Dictionaries.Number; +import edu.stanford.nlp.dcoref.Dictionaries.Person; +import edu.stanford.nlp.dcoref.Document; +import edu.stanford.nlp.dcoref.Document.DocType; +import edu.stanford.nlp.dcoref.Mention; +import edu.stanford.nlp.dcoref.Rules; +import edu.stanford.nlp.dcoref.Semantics; +import edu.stanford.nlp.dcoref.SieveCoreferenceSystem; +import edu.stanford.nlp.dcoref.SieveOptions; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.Pair; + +/** + * Base class for a Coref Sieve. + * Each sieve extends this class, and set flags for its own options in the constructor. + * + * @author heeyoung + * @author mihais + */ +public abstract class DeterministicCorefSieve { + + public final SieveOptions flags; + + /** Initialize flagSet */ + public DeterministicCorefSieve() { + flags = new SieveOptions(); + } + + public void init(Properties props) { + } + + public String flagsToString() { return flags.toString(); } + + public boolean useRoleSkip() { return flags.USE_ROLE_SKIP; } + + /** Skip this mention? 
(search pruning) */ + public boolean skipThisMention(Document document, Mention m1, CorefCluster c, Dictionaries dict) { + boolean skip = false; + + // only do for the first mention in its cluster + if(!flags.USE_EXACTSTRINGMATCH && !flags.USE_ROLEAPPOSITION && !flags.USE_PREDICATENOMINATIVES + && !flags.USE_ACRONYM && !flags.USE_APPOSITION && !flags.USE_RELATIVEPRONOUN + && !c.getFirstMention().equals(m1)) { + return true; + } + + if(Constants.USE_DISCOURSE_SALIENCE) { + SieveCoreferenceSystem.logger.finest("DOING COREF FOR:\t" + m1.spanToString()); + if(m1.appositions == null && m1.predicateNominatives == null + && (m1.spanToString().toLowerCase().startsWith("a ") || m1.spanToString().toLowerCase().startsWith("an ")) + && !flags.USE_EXACTSTRINGMATCH) { + skip = true; // A noun phrase starting with an indefinite article - unlikely to have an antecedent (e.g. "A commission" was set up to .... ) + } + if(dict.indefinitePronouns.contains(m1.spanToString().toLowerCase())) { + skip = true; // An indefinite pronoun - unlikely to have an antecedent (e.g. "Some" say that... ) + } + for(String indef : dict.indefinitePronouns){ + if(m1.spanToString().toLowerCase().startsWith(indef + " ")) { + skip = true; // A noun phrase starting with an indefinite adjective - unlikely to have an antecedent (e.g. "Another opinion" on the topic is...) 
+ break; + } + } + + if(skip) { + SieveCoreferenceSystem.logger.finest("MENTION SKIPPED:\t" + m1.spanToString() + "(" + m1.sentNum + ")"+"\toriginalRef: "+m1.originalRef + " in discourse "+m1.headWord.get(CoreAnnotations.UtteranceAnnotation.class)); + } + } + + return skip; + } + + public boolean checkEntityMatch( + CorefCluster mentionCluster, + CorefCluster potentialAntecedent, + Mention mention, + Mention ant, + Dictionaries dict, + Set roleSet) + { + return false; + } + + + /** + * Checks if two clusters are coreferent according to our sieve pass constraints + * @param document + * @throws Exception + */ + public boolean coreferent(Document document, CorefCluster mentionCluster, + CorefCluster potentialAntecedent, + Mention mention2, + Mention ant, + Dictionaries dict, + Set roleSet, + Semantics semantics) throws Exception { + + boolean ret = false; + Mention mention = mentionCluster.getRepresentativeMention(); + if(flags.DO_PRONOUN && Math.abs(mention2.sentNum-ant.sentNum) > 3 + && mention2.person!=Person.I && mention2.person!=Person.YOU) return false; + if(mention2.spanToString().toLowerCase().equals("this") && Math.abs(mention2.sentNum-ant.sentNum) > 3) return false; + if(mention2.person==Person.YOU && document.docType==DocType.ARTICLE + && mention2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0")) return false; + if(document.conllDoc != null) { + if(ant.generic && ant.person==Person.YOU) return false; + if(mention2.generic) return false; + if(mention2.insideIn(ant) || ant.insideIn(mention2)) return false; + } + + if(flags.USE_DISCOURSEMATCH) { + String mString = mention.spanToString().toLowerCase(); + String antString = ant.spanToString().toLowerCase(); + // (I - I) in the same speaker's quotation. 
+ if(dict.firstPersonPronouns.contains(mString) && mention.number==Number.SINGULAR + && dict.firstPersonPronouns.contains(antString) && ant.number==Number.SINGULAR + && Rules.entitySameSpeaker(document, mention, ant)){ + return true; + } + // (speaker - I) + if(Rules.entityIsSpeaker(document, mention, ant, dict) && + ((dict.firstPersonPronouns.contains(mString) && mention.number==Number.SINGULAR) + || (dict.firstPersonPronouns.contains(antString) && ant.number==Number.SINGULAR))) { + return true; + } + if(Rules.entitySameSpeaker(document, mention, ant) + && dict.secondPersonPronouns.contains(mString) + && dict.secondPersonPronouns.contains(antString)) { + return true; + } + // previous I - you or previous you - I in two person conversation + if(((mention.person==Person.I && ant.person==Person.YOU + || (mention.person==Person.YOU && ant.person==Person.I)) + && (mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class)-ant.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1) + && document.docType==DocType.CONVERSATION)) { + SieveCoreferenceSystem.logger.finest("discourse match: between two person"); + return true; + } + if(dict.reflexivePronouns.contains(mention.headString) && Rules.entitySubjectObject(mention, ant)){ + SieveCoreferenceSystem.logger.finest("reflexive pronoun: "+ant.spanToString()+"("+ant.mentionID + ") :: "+ mention.spanToString()+"("+mention.mentionID + ") -> "+(mention.goldCorefClusterID==ant.goldCorefClusterID)); + return true; + } + } + if(Constants.USE_DISCOURSE_CONSTRAINTS && !flags.USE_EXACTSTRINGMATCH && !flags.USE_RELAXED_EXACTSTRINGMATCH + && !flags.USE_APPOSITION && !flags.USE_WORDS_INCLUSION) { + for(Mention m : mentionCluster.getCorefMentions()) { + for(Mention a : potentialAntecedent.getCorefMentions()){ + if(Rules.entityIsSpeaker(document, m, a, dict) && m.person!=Person.I && a.person!=Person.I) { + SieveCoreferenceSystem.logger.finest("Incompatibles: not match(speaker): " +ant.spanToString()+"("+ant.mentionID + ") :: "+ 
mention.spanToString()+"("+mention.mentionID + ") -> "+(mention.goldCorefClusterID!=ant.goldCorefClusterID)); + document.incompatibles.add(new Pair(Math.min(m.mentionID, a.mentionID), Math.max(m.mentionID, a.mentionID))); + return false; + } + int dist = Math.abs(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - a.headWord.get(CoreAnnotations.UtteranceAnnotation.class)); + if(document.docType!=DocType.ARTICLE && dist==1 && !Rules.entitySameSpeaker(document, m, a)) { + if(m.person==Person.I && a.person==Person.I) { + SieveCoreferenceSystem.logger.finest("Incompatibles: neighbor I: " +ant.spanToString()+"("+ant.mentionID + ") :: "+ mention.spanToString()+"("+mention.mentionID + ") -> "+(mention.goldCorefClusterID!=ant.goldCorefClusterID)); + document.incompatibles.add(new Pair(Math.min(m.mentionID, a.mentionID), Math.max(m.mentionID, a.mentionID))); + return false; + } + if(m.person==Person.YOU && a.person==Person.YOU) { + SieveCoreferenceSystem.logger.finest("Incompatibles: neighbor YOU: " +ant.spanToString()+"("+ant.mentionID + ") :: "+ mention.spanToString()+"("+mention.mentionID + ") -> "+(mention.goldCorefClusterID!=ant.goldCorefClusterID)); + document.incompatibles.add(new Pair(Math.min(m.mentionID, a.mentionID), Math.max(m.mentionID, a.mentionID))); + return false; + } + if(m.person==Person.WE && a.person==Person.WE) { + SieveCoreferenceSystem.logger.finest("Incompatibles: neighbor WE: " +ant.spanToString()+"("+ant.mentionID + ") :: "+ mention.spanToString()+"("+mention.mentionID + ") -> "+(mention.goldCorefClusterID!=ant.goldCorefClusterID)); + document.incompatibles.add(new Pair(Math.min(m.mentionID, a.mentionID), Math.max(m.mentionID, a.mentionID))); + return false; + } + } + } + } + if(document.docType==DocType.ARTICLE) { + for(Mention m : mentionCluster.getCorefMentions()) { + for(Mention a : potentialAntecedent.getCorefMentions()){ + if(Rules.entitySubjectObject(m, a)) { + SieveCoreferenceSystem.logger.finest("Incompatibles: subject-object: 
"+ant.spanToString()+"("+ant.mentionID + ") :: "+ mention.spanToString()+"("+mention.mentionID + ") -> "+(mention.goldCorefClusterID!=ant.goldCorefClusterID)); + document.incompatibles.add(new Pair(Math.min(m.mentionID, a.mentionID), Math.max(m.mentionID, a.mentionID))); + return false; + } + } + } + } + } + + if(flags.USE_iwithini && Rules.entityIWithinI(mention, ant, dict)) { + document.incompatibles.add(new Pair(Math.min(mention.mentionID, ant.mentionID), Math.max(mention.mentionID, ant.mentionID))); + return false; + } + if(flags.USE_EXACTSTRINGMATCH && Rules.entityExactStringMatch(mentionCluster, potentialAntecedent, dict, roleSet)){ + return true; + } + if(flags.USE_RELAXED_EXACTSTRINGMATCH && Rules.entityRelaxedExactStringMatch(mentionCluster, potentialAntecedent, mention, ant, dict, roleSet)){ + return true; + } + if(flags.USE_APPOSITION && Rules.entityIsApposition(mentionCluster, potentialAntecedent, mention, ant)) { + SieveCoreferenceSystem.logger.finest("Apposition: "+mention.spanToString()+"\tvs\t"+ant.spanToString()); + return true; + } + if(flags.USE_PREDICATENOMINATIVES && Rules.entityIsPredicateNominatives(mentionCluster, potentialAntecedent, mention, ant)) { + SieveCoreferenceSystem.logger.finest("Predicate nominatives: "+mention.spanToString()+"\tvs\t"+ant.spanToString()); + return true; + } + + if(flags.USE_ACRONYM && Rules.entityIsAcronym(mentionCluster, potentialAntecedent)) { + SieveCoreferenceSystem.logger.finest("Acronym: "+mention.spanToString()+"\tvs\t"+ant.spanToString()); + return true; + } + if(flags.USE_RELATIVEPRONOUN && Rules.entityIsRelativePronoun(mention, ant)){ + SieveCoreferenceSystem.logger.finest("Relative pronoun: "+mention.spanToString()+"\tvs\t"+ant.spanToString()); + return true; + } + if(flags.USE_DEMONYM && mention.isDemonym(ant, dict)){ + SieveCoreferenceSystem.logger.finest("Demonym: "+mention.spanToString()+"\tvs\t"+ant.spanToString()); + return true; + } + + if(flags.USE_ROLEAPPOSITION && 
Rules.entityIsRoleAppositive(mentionCluster, potentialAntecedent, mention, ant, dict)){ + return true; + } + if(flags.USE_INCLUSION_HEADMATCH && Rules.entityHeadsAgree(mentionCluster, potentialAntecedent, mention, ant, dict)){ + ret = true; + } + if(flags.USE_RELAXED_HEADMATCH && Rules.entityRelaxedHeadsAgreeBetweenMentions(mentionCluster, potentialAntecedent, mention, ant) ){ + ret = true; + } + if(flags.USE_WORDS_INCLUSION && ret && ! Rules.entityWordsIncluded(mentionCluster, potentialAntecedent, mention, ant)) { + return false; + } + + if(flags.USE_INCOMPATIBLE_MODIFIER && ret && Rules.entityHaveIncompatibleModifier(mentionCluster, potentialAntecedent)) { + return false; + } + if(flags.USE_PROPERHEAD_AT_LAST && ret && !Rules.entitySameProperHeadLastWord(mentionCluster, potentialAntecedent, mention, ant)) { + return false; + } + if(flags.USE_ATTRIBUTES_AGREE && !Rules.entityAttributesAgree(mentionCluster, potentialAntecedent)) { + return false; + } + if(flags.USE_DIFFERENT_LOCATION + && Rules.entityHaveDifferentLocation(mention, ant, dict)) { + if(flags.USE_PROPERHEAD_AT_LAST && ret && mention.goldCorefClusterID!=ant.goldCorefClusterID) { + SieveCoreferenceSystem.logger.finest("DIFFERENT LOCATION: "+ant.spanToString()+" :: "+mention.spanToString()); + } + return false; + } + if(flags.USE_NUMBER_IN_MENTION + && Rules.entityNumberInLaterMention(mention, ant)) { + if(flags.USE_PROPERHEAD_AT_LAST && ret && mention.goldCorefClusterID!=ant.goldCorefClusterID) { + SieveCoreferenceSystem.logger.finest("NEW NUMBER : "+ant.spanToString()+" :: "+mention.spanToString()); + } + return false; + } + if(flags.USE_WN_HYPERNYM) { + Method meth = semantics.wordnet.getClass().getMethod("checkHypernym", CorefCluster.class, CorefCluster.class, Mention.class, Mention.class); + if((Boolean) meth.invoke(semantics.wordnet, mentionCluster, potentialAntecedent, mention, ant)) { + ret = true; + } else if (mention.goldCorefClusterID == ant.goldCorefClusterID + && !mention.isPronominal() && 
!ant.isPronominal()){ + SieveCoreferenceSystem.logger.finest("not hypernym in WN"); + SieveCoreferenceSystem.logger.finest("False Negatives:: " + ant.spanToString() +" <= "+mention.spanToString()); + } + } + if(flags.USE_WN_SYNONYM) { + Method meth = semantics.wordnet.getClass().getMethod("checkSynonym", new Class[]{Mention.class, Mention.class}); + if((Boolean) meth.invoke(semantics.wordnet, mention, ant)) { + ret = true; + } else if (mention.goldCorefClusterID == ant.goldCorefClusterID + && !mention.isPronominal() && !ant.isPronominal()){ + SieveCoreferenceSystem.logger.finest("not synonym in WN"); + SieveCoreferenceSystem.logger.finest("False Negatives:: " + ant.spanToString() +" <= "+mention.spanToString()); + + } + } + + try { + if(flags.USE_ALIAS && Rules.entityAlias(mentionCluster, potentialAntecedent, semantics, dict)){ + return true; + } + } catch (Exception e) { + throw new RuntimeException(e); + } + + if(flags.USE_DISTANCE && Rules.entityTokenDistance(mention2, ant)){ + return false; + } + + if(flags.USE_COREF_DICT){ + + // Head match + if(ant.headWord.lemma().equals(mention2.headWord.lemma())) return false; + + // Constraint: ignore pairs commonNoun - properNoun + if(ant.mentionType != MentionType.PROPER && + ( mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP") + || !mention2.headWord.word().substring(1).equals(mention2.headWord.word().substring(1).toLowerCase()) ) ) return false; + + // Constraint: ignore plurals + if(ant.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS") + && mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS")) return false; + + // Constraint: ignore mentions with indefinite determiners + if(dict.indefinitePronouns.contains(ant.originalSpan.get(0).lemma()) + || dict.indefinitePronouns.contains(mention2.originalSpan.get(0).lemma())) return false; + + // Constraint: ignore coordinated mentions + if(ant.isCoordinated() || mention2.isCoordinated()) 
return false; + + // Constraint: context incompatibility + if(Rules.contextIncompatible(mention2, ant, dict)) return false; + + // Constraint: sentence context incompatibility when the mentions are common nouns + if(Rules.sentenceContextIncompatible(mention2, ant, dict)) return false; + + if(Rules.entityClusterAllCorefDictionary(mentionCluster, potentialAntecedent, dict, 1, 8)) return true; + if(Rules.entityCorefDictionary(mention, ant, dict, 2, 2)) return true; + if(Rules.entityCorefDictionary(mention, ant, dict, 3, 2)) return true; + if(Rules.entityCorefDictionary(mention, ant, dict, 4, 2)) return true; + } + + if(flags.DO_PRONOUN){ + Mention m; + if (mention.predicateNominatives!=null && mention.predicateNominatives.contains(mention2)) { + m = mention2; + } else { + m = mention; + } + + if((m.isPronominal() || dict.allPronouns.contains(m.toString())) && Rules.entityAttributesAgree(mentionCluster, potentialAntecedent)){ + + if(dict.demonymSet.contains(ant.spanToString().toLowerCase()) && dict.notOrganizationPRP.contains(m.headString)){ + document.incompatibles.add(new Pair(Math.min(m.mentionID, ant.mentionID), Math.max(m.mentionID, ant.mentionID))); + return false; + } + if(Constants.USE_DISCOURSE_CONSTRAINTS && Rules.entityPersonDisagree(document, mentionCluster, potentialAntecedent, dict)){ + SieveCoreferenceSystem.logger.finest("Incompatibles: Person Disagree: "+ant.spanToString()+"("+ant.mentionID+") :: "+mention.spanToString()+"("+mention.mentionID+") -> "+(mention.goldCorefClusterID!=ant.goldCorefClusterID)); + document.incompatibles.add(new Pair(Math.min(m.mentionID, ant.mentionID), Math.max(m.mentionID, ant.mentionID))); + return false; + } + return true; + } + } + + return ret; + } + + /** + * Orders the antecedents for the given mention (m1) + * @param antecedentSentence + * @param mySentence + * @param orderedMentions + * @param orderedMentionsBySentence + * @param m1 + * @param m1Position + * @param corefClusters + * @param dict + * @return An 
ordering of potential antecedents depending on same/different sentence, etc. + */ + public List getOrderedAntecedents( + int antecedentSentence, + int mySentence, + List orderedMentions, + List> orderedMentionsBySentence, + Mention m1, + int m1Position, + Map corefClusters, + Dictionaries dict) { + List orderedAntecedents = new ArrayList(); + + // ordering antecedents + if (antecedentSentence == mySentence) { // same sentence + orderedAntecedents.addAll(orderedMentions.subList(0, m1Position)); + if(flags.DO_PRONOUN && corefClusters.get(m1.corefClusterID).isSinglePronounCluster(dict)) { + orderedAntecedents = sortMentionsForPronoun(orderedAntecedents, m1, true); + } + if(dict.relativePronouns.contains(m1.spanToString())) Collections.reverse(orderedAntecedents); + } else { // previous sentence + orderedAntecedents.addAll(orderedMentionsBySentence.get(antecedentSentence)); + } + + return orderedAntecedents; + } + + /** Divides a sentence into clauses and sort the antecedents for pronoun matching */ + private static List sortMentionsForPronoun(List l, Mention m1, boolean sameSentence) { + List sorted = new ArrayList(); + Tree tree = m1.contextParseTree; + Tree current = m1.mentionSubTree; + if(sameSentence){ + while(true){ + current = current.ancestor(1, tree); + if(current.label().value().startsWith("S")){ + for(Mention m : l){ + if(!sorted.contains(m) && current.dominates(m.mentionSubTree)) sorted.add(m); + } + } + if(current.label().value().equals("ROOT") || current.ancestor(1, tree)==null) break; + } + if(l.size()!=sorted.size()) { + SieveCoreferenceSystem.logger.finest("sorting failed!!! -> parser error?? \tmentionID: "+m1.mentionID+" " + m1.spanToString()); + sorted=l; + } else if(!l.equals(sorted)){ + SieveCoreferenceSystem.logger.finest("sorting succeeded & changed !! 
\tmentionID: "+m1.mentionID+" " + m1.spanToString()); + for(int i=0; i { + + public Set getMembers(); + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSA.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSA.java new file mode 100644 index 0000000..f8b1028 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSA.java @@ -0,0 +1,152 @@ +package edu.stanford.nlp.fsm; + +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Scored; + +import java.io.IOException; +import java.io.Writer; +import java.util.*; + +/** + * DFSA: A class for representing a deterministic finite state automaton + * without epsilon transitions. + * + * @author Dan Klein + * @author Michel Galley (AT&T FSM library format printing) + * @author Sarah Spikes (sdspikes@cs.stanford.edu) - cleanup and filling in types + */ +public final class DFSA implements Scored { + + Object dfsaID; + DFSAState initialState; + + public DFSA(DFSAState initialState, double score) { + this.initialState = initialState; + this.score = score; + } + + public DFSA(DFSAState initialState) { + this.initialState = initialState; + this.score = Double.NaN; + } + + public double score; + + public double score() { + return score; + } + + public DFSAState initialState() { + return initialState; + } + + public void setInitialState(DFSAState initialState) { + this.initialState = initialState; + } + + public Set> states() { + Set> visited = Generics.newHashSet(); + List> toVisit = new ArrayList>(); + toVisit.add(initialState()); + exploreStates(toVisit, visited); + return visited; + } + + private static void exploreStates(List> toVisit, Set> visited) { + while (!toVisit.isEmpty()) { + DFSAState state = toVisit.get(toVisit.size() - 1); + toVisit.remove(toVisit.size() - 1); + if (!visited.contains(state)) { + toVisit.addAll(state.successorStates()); + visited.add(state); + } + } + } + + public 
DFSA(Object dfsaID) { + this.dfsaID = dfsaID; + this.score = 0; + } + + + private static void printTrieDFSAHelper(DFSAState state, int level) { + if (state.isAccepting()) { + return; + } + Set inputs = state.continuingInputs(); + for (T input : inputs) { + DFSATransition transition = state.transition(input); + System.out.print(level); + System.out.print(input); + for (int i = 0; i < level; i++) { + System.out.print(" "); + } + System.out.print(transition.score()); + System.out.print(" "); + System.out.println(input); + printTrieDFSAHelper(transition.target(), level + 1); + } + } + + public static void printTrieDFSA(DFSA dfsa) { + System.err.println("DFSA: " + dfsa.dfsaID); + printTrieDFSAHelper(dfsa.initialState(), 2); + } + + public void printAttFsmFormat(Writer w) throws IOException { + Queue> q = new LinkedList>(); + Set> visited = Generics.newHashSet(); + q.offer(initialState); + while(q.peek() != null) { + DFSAState state = q.poll(); + if(state == null || visited.contains(state)) + continue; + visited.add(state); + if (state.isAccepting()) { + w.write(state.toString()+"\t"+state.score()+"\n"); + continue; + } + TreeSet inputs = new TreeSet(state.continuingInputs()); + for (T input : inputs) { + DFSATransition transition = state.transition(input); + DFSAState target = transition.target(); + if(!visited.contains(target)) + q.add(target); + w.write(state.toString()+"\t"+target.toString()+"\t"+transition.getInput()+"\t"+transition.score()+"\n"); + } + } + } + + private static void printTrieAsRulesHelper(DFSAState state, String prefix, Writer w) throws IOException { + if (state.isAccepting()) { + return; + } + Set inputs = state.continuingInputs(); + for (T input : inputs) { + DFSATransition transition = state.transition(input); + DFSAState target = transition.target(); + Set inputs2 = target.continuingInputs(); + boolean allTerminate = true; + for (T input2 : inputs2) { + DFSATransition transition2 = target.transition(input2); + DFSAState target2 = 
transition2.target(); + if (target2.isAccepting()) { + // it's a binary end rule. Print it. + w.write(prefix + " --> " + input + " " + input2 + "\n"); + } else { + allTerminate = false; + } + } + if (!allTerminate) { + // there are some longer continuations. Print continuation rule + String newPrefix = prefix + "_" + input; + w.write(prefix + " --> " + input + " " + newPrefix + "\n"); + printTrieAsRulesHelper(transition.target(), newPrefix, w); + } + } + } + + public static void printTrieAsRules(DFSA dfsa, Writer w) throws IOException { + printTrieAsRulesHelper(dfsa.initialState(), dfsa.dfsaID.toString(), w); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSAState.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSAState.java new file mode 100644 index 0000000..46823fd --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSAState.java @@ -0,0 +1,152 @@ +package edu.stanford.nlp.fsm; + +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Scored; + +import java.util.*; + +/** + * DFSAState + *

    + * Class for representing the state of a deterministic finite state + * automaton without epsilon transitions. + * + * @author Dan Klein + * @version 12/14/2000 + * @author Sarah Spikes (sdspikes@cs.stanford.edu) - cleanup and filling in types + * @param stateID type + * @param transition type + */ +public final class DFSAState implements Scored { + + private S stateID; + private Map> inputToTransition; + public boolean accepting; + private DFSA dfsa; + + public double score; + + public double score() { + return score; + } + + public void setScore(double score) { + this.score = score; + } + + + public DFSA dfsa() { + return dfsa; + } + + public void setStateID(S stateID) { + this.stateID = stateID; + } + + public S stateID() { + return stateID; + } + + public void addTransition(DFSATransition transition) { + inputToTransition.put(transition.input(), transition); + } + + public DFSATransition transition(T input) { + return inputToTransition.get(input); + } + + public Collection> transitions() { + return inputToTransition.values(); + } + + public Set continuingInputs() { + return inputToTransition.keySet(); + } + + public Set> successorStates() { + Set> successors = Generics.newHashSet(); + Collection> transitions = inputToTransition.values(); + for (DFSATransition transition : transitions) { + successors.add(transition.getTarget()); + } + return successors; + } + + public void setAccepting(boolean accepting) { + this.accepting = accepting; + } + + public boolean isAccepting() { + return accepting; + } + + public boolean isContinuable() { + return !inputToTransition.isEmpty(); + } + + @Override + public String toString() { + return stateID.toString(); + } + + private int hashCodeCache; // = 0; + + @Override + public int hashCode() { + if (hashCodeCache == 0) { + hashCodeCache = stateID.hashCode() ^ dfsa.hashCode(); + } + return hashCodeCache; + } + + // equals + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + if (this == o) { + 
return true; + } + if (!(o instanceof DFSAState)) { + return false; + } + DFSAState s = (DFSAState) o; + // historically also checked: accepting == s.accepting && + //inputToTransition.equals(s.inputToTransition)) + return dfsa.equals(s.dfsa) && stateID.equals(s.stateID); + } + + public Set> statesReachable() { + Set> visited = Generics.newHashSet(); + List> toVisit = new ArrayList>(); + toVisit.add(this); + exploreStates(toVisit, visited); + return visited; + } + + private void exploreStates(List> toVisit, Set> visited) { + while (!toVisit.isEmpty()) { + DFSAState state = toVisit.get(toVisit.size() - 1); + toVisit.remove(toVisit.size() - 1); + if (!visited.contains(state)) { + toVisit.addAll(state.successorStates()); + visited.add(state); + } + } + } + + public DFSAState(S id, DFSA dfsa) { + this.dfsa = dfsa; + this.stateID = id; + this.accepting = false; + this.inputToTransition = Generics.newHashMap(); + this.score = Double.NEGATIVE_INFINITY; + } + + public DFSAState(S id, DFSA dfsa, double score) { + this(id,dfsa); + setScore(score); + } + + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSATransition.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSATransition.java new file mode 100644 index 0000000..9f72092 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/DFSATransition.java @@ -0,0 +1,80 @@ +package edu.stanford.nlp.fsm; + +import edu.stanford.nlp.util.Scored; + +/** + * (D)FSA Transition + *

    + * Class for representing a transition in a weighted finite state + * transducer. For now, just null out fields that may not apply. + * This should really be FSATransition as there's nothing + * deterministic-specific. If FSA is ever made, this should be + * abstracted. The ID is a descriptor, not a unique ID. + * + * @author Dan Klein + * @version 12/14/00 + */ +public final class DFSATransition implements Scored { + + private Object transitionID; + private DFSAState source; + protected DFSAState target; // used directly in DFSAMinimizer (only) + private double score; + private T input; + private Object output; + + public DFSATransition(Object transitionID, DFSAState source, DFSAState target, T input, Object output, double score) { + this.transitionID = transitionID; + this.source = source; + this.target = target; + this.input = input; + this.output = output; + this.score = score; + } + + public DFSAState getSource() { + return source; + } + + public DFSAState source() { + return source; + } + + public DFSAState getTarget() { + return target; + } + + public DFSAState target() { + return target; + } + + public Object getID() { + return transitionID; + } + + public double score() { + return score; + } + + public T getInput() { + return input; + } + + public T input() { + return input; + } + + public Object getOutput() { + return output; + } + + public Object output() { + return output; + } + + @Override + public String toString() { + return "[" + transitionID + "]" + source + " -" + input + ":" + output + "-> " + target; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/FastExactAutomatonMinimizer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/FastExactAutomatonMinimizer.java new file mode 100644 index 0000000..08bdf3a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/FastExactAutomatonMinimizer.java @@ -0,0 +1,319 @@ +package 
edu.stanford.nlp.fsm; + +import edu.stanford.nlp.trees.PennTreebankLanguagePack; +import edu.stanford.nlp.util.Maps; + +import java.util.*; + +/** + * Minimization in n log n a la Hopcroft. + * + * @author Dan Klein (klein@cs.stanford.edu) + */ +public class FastExactAutomatonMinimizer implements AutomatonMinimizer { + TransducerGraph unminimizedFA = null; + Map memberToBlock = null; + LinkedList splits = null; + + boolean sparseMode = true; + + static final Object SINK_NODE = "SINK_NODE"; + + static class Split { + Collection members; + Object symbol; + Block block; + + public Collection getMembers() { + return members; + } + + public Object getSymbol() { + return symbol; + } + + public Block getBlock() { + return block; + } + + public Split(Collection members, Object symbol, Block block) { + this.members = members; + this.symbol = symbol; + this.block = block; + } + } + + static class Block { + Set members; + + public Set getMembers() { + return members; + } + + public Block(Set members) { + this.members = members; + } + } + + protected TransducerGraph getUnminimizedFA() { + return unminimizedFA; + } + + protected Collection getSymbols() { + return getUnminimizedFA().getInputs(); + } + + public TransducerGraph minimizeFA(TransducerGraph unminimizedFA) { + // System.out.println(unminimizedFA); + this.unminimizedFA = unminimizedFA; + this.splits = new LinkedList(); + this.memberToBlock = new HashMap(); //new IdentityHashMap(); // TEG: I had to change this b/c some weren't matching + minimize(); + return buildMinimizedFA(); + } + + protected TransducerGraph buildMinimizedFA() { + TransducerGraph minimizedFA = new TransducerGraph(); + TransducerGraph unminimizedFA = getUnminimizedFA(); + for (Iterator arcI = unminimizedFA.getArcs().iterator(); arcI.hasNext();) { + TransducerGraph.Arc arc = (TransducerGraph.Arc) arcI.next(); + Object source = projectNode(arc.getSourceNode()); + Object target = projectNode(arc.getTargetNode()); + try { + if 
(minimizedFA.canAddArc(source, target, arc.getInput(), arc.getOutput())) { + minimizedFA.addArc(source, target, arc.getInput(), arc.getOutput()); + } + } catch (Exception e) { + //throw new IllegalArgumentException(); + } + } + minimizedFA.setStartNode(projectNode(unminimizedFA.getStartNode())); + for (Iterator endIter = unminimizedFA.getEndNodes().iterator(); endIter.hasNext();) { + Object o = endIter.next(); + minimizedFA.setEndNode(projectNode(o)); + } + + return minimizedFA; + } + + protected Object projectNode(Object node) { + Set members = getBlock(node).getMembers(); + return members; + } + + + protected boolean hasSplit() { + return splits.size() > 0; + } + + protected Split getSplit() { + return (Split) splits.removeFirst(); + } + + protected void addSplit(Split split) { + splits.addLast(split); + } + + // protected Collection inverseImages(Collection block, Object symbol) { + // List inverseImages = new ArrayList(); + // for (Iterator nodeI = block.iterator(); nodeI.hasNext();) { + // Object node = nodeI.next(); + // inverseImages.addAll(getUnminimizedFA().getInboundArcs(node, symbol)); + // } + // return inverseImages; + // } + + protected Map sortIntoBlocks(Collection nodes) { + Map blockToMembers = new IdentityHashMap(); + for (Iterator nodeI = nodes.iterator(); nodeI.hasNext();) { + Object o = nodeI.next(); + Block block = getBlock(o); + Maps.putIntoValueHashSet(blockToMembers, block, o); + } + return blockToMembers; + } + + protected void makeBlock(Collection members) { + Block block = new Block(new HashSet(members)); + for (Iterator memberI = block.getMembers().iterator(); memberI.hasNext();) { + Object member = memberI.next(); + if (member != SINK_NODE) { + // System.out.println("putting in memberToBlock: " + member + " " + block); + memberToBlock.put(member, block); + } + } + addSplits(block); + } + + protected void addSplits(Block block) { + Map symbolToTarget = new HashMap(); + for (Iterator memberI = block.getMembers().iterator(); 
memberI.hasNext();) { + Object member = memberI.next(); + for (Iterator symbolI = getInverseArcs(member).iterator(); symbolI.hasNext();) { + TransducerGraph.Arc arc = (TransducerGraph.Arc) symbolI.next(); + Object symbol = arc.getInput(); + Object target = arc.getTargetNode(); + Maps.putIntoValueArrayList(symbolToTarget, symbol, target); + } + } + for (Iterator symbolI = symbolToTarget.keySet().iterator(); symbolI.hasNext();) { + Object symbol = symbolI.next(); + addSplit(new Split((List) symbolToTarget.get(symbol), symbol, block)); + } + } + + protected void removeAll(Collection block, Collection members) { + // this is because AbstractCollection/Set.removeAll() isn't always linear in members.size() + for (Iterator memberI = members.iterator(); memberI.hasNext();) { + Object member = memberI.next(); + block.remove(member); + } + } + + protected Collection difference(Collection block, Collection members) { + Set difference = new HashSet(); + for (Iterator memberI = block.iterator(); memberI.hasNext();) { + Object member = memberI.next(); + if (!members.contains(member)) { + difference.add(member); + } + } + return difference; + } + + protected Block getBlock(Object o) { + Block result = (Block) memberToBlock.get(o); + if (result == null) { + System.out.println("No block found for: " + o); // debug + System.out.println("But I do have blocks for: "); + for (Iterator i = memberToBlock.keySet().iterator(); i.hasNext();) { + System.out.println(i.next()); + } + throw new RuntimeException("FastExactAutomatonMinimizer: no block found"); + } + return result; + } + + protected Collection getInverseImages(Split split) { + List inverseImages = new ArrayList(); + Object symbol = split.getSymbol(); + Block block = split.getBlock(); + for (Iterator memberI = split.getMembers().iterator(); memberI.hasNext();) { + Object member = memberI.next(); + if (!block.getMembers().contains(member)) { + continue; + } + Collection arcs = getInverseArcs(member, symbol); + for (Iterator arcI = 
arcs.iterator(); arcI.hasNext();) { + TransducerGraph.Arc arc = (TransducerGraph.Arc) arcI.next(); + Object source = arc.getSourceNode(); + inverseImages.add(source); + } + } + return inverseImages; + } + + protected Collection getInverseArcs(Object member, Object symbol) { + if (member != SINK_NODE) { + return getUnminimizedFA().getArcsByTargetAndInput(member, symbol); + } + return getUnminimizedFA().getArcsByInput(symbol); + } + + protected Collection getInverseArcs(Object member) { + if (member != SINK_NODE) { + return getUnminimizedFA().getArcsByTarget(member); + } + return getUnminimizedFA().getArcs(); + } + + protected void makeInitialBlocks() { + // sink block (for if the automaton isn't complete + makeBlock(Collections.singleton(SINK_NODE)); + // accepting block + Set endNodes = getUnminimizedFA().getEndNodes(); + makeBlock(endNodes); + // main block + Collection nonFinalNodes = new HashSet(getUnminimizedFA().getNodes()); + nonFinalNodes.removeAll(endNodes); + makeBlock(nonFinalNodes); + } + + protected void minimize() { + makeInitialBlocks(); + while (hasSplit()) { + Split split = getSplit(); + Collection inverseImages = getInverseImages(split); + Map inverseImagesByBlock = sortIntoBlocks(inverseImages); + for (Iterator blockI = inverseImagesByBlock.keySet().iterator(); blockI.hasNext();) { + Block block = (Block) blockI.next(); + Collection members = (Collection) inverseImagesByBlock.get(block); + if (members.size() == 0 || members.size() == block.getMembers().size()) { + continue; + } + if (members.size() > block.getMembers().size() - members.size()) { + members = difference(block.getMembers(), members); + } + removeAll(block.getMembers(), members); + makeBlock(members); + } + } + } + + public static void main(String[] args) { + /* + TransducerGraph fa = new TransducerGraph(); + fa.addArc(fa.getStartNode(),"1","a",""); + fa.addArc(fa.getStartNode(),"2","b",""); + fa.addArc(fa.getStartNode(),"3","c",""); + fa.addArc("1","4","a",""); + 
fa.addArc("2","4","a",""); + fa.addArc("3","5","c",""); + fa.addArc("4",fa.getEndNode(),"c",""); + fa.addArc("5",fa.getEndNode(),"c",""); + System.out.println(fa); + ExactAutomatonMinimizer minimizer = new ExactAutomatonMinimizer(); + System.out.println(minimizer.minimizeFA(fa)); + */ + System.out.println("Starting minimizer test..."); + List pathList = new ArrayList(); + TransducerGraph randomFA = TransducerGraph.createRandomGraph(5000, 5, 1.0, 5, pathList); + List outputs = randomFA.getPathOutputs(pathList); + + TransducerGraph.GraphProcessor quasiDeterminizer = new QuasiDeterminizer(); + AutomatonMinimizer minimizer = new FastExactAutomatonMinimizer(); + TransducerGraph.NodeProcessor ntsp = new TransducerGraph.SetToStringNodeProcessor(new PennTreebankLanguagePack()); + TransducerGraph.ArcProcessor isp = new TransducerGraph.InputSplittingProcessor(); + TransducerGraph.ArcProcessor ocp = new TransducerGraph.OutputCombiningProcessor(); + + TransducerGraph detGraph = quasiDeterminizer.processGraph(randomFA); + TransducerGraph combGraph = new TransducerGraph(detGraph, ocp); // combine outputs into inputs + TransducerGraph result = minimizer.minimizeFA(combGraph); // minimize the thing + System.out.println("Minimized from " + randomFA.getNodes().size() + " to " + result.getNodes().size()); + result = new TransducerGraph(result, ntsp); // pull out strings from sets returned by minimizer + result = new TransducerGraph(result, isp); // split outputs from inputs + List minOutputs = result.getPathOutputs(pathList); + System.out.println("Equal? " + outputs.equals(minOutputs)); + + /* + randomFA = new TransducerGraph(randomFA, new TransducerGraph.OutputCombiningProcessor()); + System.out.print("Starting fast minimization..."); + FastExactAutomatonMinimizer minimizer2 = new FastExactAutomatonMinimizer(); + Timing.startTime(); + TransducerGraph minimizedRandomFA = minimizer2.minimizeFA(randomFA); + Timing.tick("done. 
( "+randomFA.getArcs().size()+" arcs to "+minimizedRandomFA.getArcs().size()+" arcs)"); + minimizedRandomFA = new TransducerGraph(minimizedRandomFA, new TransducerGraph.InputSplittingProcessor()); + List minOutputs = minimizedRandomFA.getPathOutputs(pathList); + System.out.println("Equal? "+outputs.equals(minOutputs)); + + System.out.print("Starting slow minimization..."); + ExactAutomatonMinimizer minimizer = new ExactAutomatonMinimizer(); + Timing.startTime(); + minimizedRandomFA = minimizer.minimizeFA(randomFA); + Timing.tick("done. ( "+randomFA.getArcs().size()+" arcs to "+minimizedRandomFA.getArcs().size()+" arcs)"); + minimizedRandomFA = new TransducerGraph(minimizedRandomFA, new TransducerGraph.InputSplittingProcessor()); + minOutputs = minimizedRandomFA.getPathOutputs(pathList); + System.out.println("Equal? "+outputs.equals(minOutputs)); + */ + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/QuasiDeterminizer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/QuasiDeterminizer.java new file mode 100644 index 0000000..55c262b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/QuasiDeterminizer.java @@ -0,0 +1,144 @@ +package edu.stanford.nlp.fsm; + +import edu.stanford.nlp.stats.ClassicCounter; +import java.util.*; + +/** + * QuasiDeterminizer. + *

    + * Class for performing quasi-determinization on TransducerGraphs. + * + * @author Teg Grenager + * @version 11/02/03 + */ +public class QuasiDeterminizer implements TransducerGraph.GraphProcessor { + + + public TransducerGraph processGraph(TransducerGraph graph) { + // compute lambda function + ClassicCounter lambda = computeLambda(graph); // not destructive + // do the pushing + TransducerGraph result = pushLambdas(graph, lambda); // creates a new one + return result; + } + + /** + * Takes time linear in number of arcs. + */ + public static ClassicCounter computeLambda(TransducerGraph graph) { + LinkedList queue = new LinkedList(); + ClassicCounter lambda = new ClassicCounter(); + ClassicCounter length = new ClassicCounter(); + Map first = new HashMap(); + Set nodes = graph.getNodes(); + for (Object node : nodes) { + lambda.setCount(node, 0); + length.setCount(node, Double.POSITIVE_INFINITY); + } + Set endNodes = graph.getEndNodes(); + for (Object o : endNodes) { + lambda.setCount(o, 0); + length.setCount(o, 0); + queue.addLast(o); + } + // Breadth first search + // get the first node from the queue + Object node = null; + try { + node = queue.removeFirst(); + } catch (NoSuchElementException e) { + } + while (node != null) { + double oldLen = length.getCount(node); + Set arcs = graph.getArcsByTarget(node); + if (arcs != null) { + for (Object arc1 : arcs) { + TransducerGraph.Arc arc = (TransducerGraph.Arc) arc1; + Object newNode = arc.getSourceNode(); + Comparable a = (Comparable) arc.getInput(); + double k = ((Double) arc.getOutput()).doubleValue(); + double newLen = length.getCount(newNode); + if (newLen == Double.POSITIVE_INFINITY) { + // we are discovering this + queue.addLast(newNode); + } + Comparable f = (Comparable) first.get(newNode); + if (newLen == Double.POSITIVE_INFINITY || (newLen == oldLen + 1 && a.compareTo(f) < 0)) { // f can't be null, since we have a newLen + // we do this to this to newNode when we have new info, possibly many times + 
first.put(newNode, a); // ejecting old one if necessary + length.setCount(newNode, oldLen + 1); // this may already be the case + lambda.setCount(newNode, k + lambda.getCount(node)); + } + } + } + // get a new node from the queue + node = null; + try { + node = queue.removeFirst(); + } catch (NoSuchElementException e) { + } + } + return lambda; + } + + /** + * Takes time linear in number of arcs. + */ + public TransducerGraph pushLambdas(TransducerGraph graph, ClassicCounter lambda) { + TransducerGraph result = graph.clone(); // arcs have been copied too so we don't mess up graph + Set arcs = result.getArcs(); + for (TransducerGraph.Arc arc : arcs) { + double sourceLambda = lambda.getCount(arc.getSourceNode()); + double targetLambda = lambda.getCount(arc.getTargetNode()); + double oldOutput = ((Double) arc.getOutput()).doubleValue(); + double newOutput = oldOutput + targetLambda - sourceLambda; + arc.setOutput(new Double(newOutput)); + } + // do initialOutput + double startLambda = lambda.getCount(result.getStartNode()); + if (startLambda != 0.0) { + // add it back to the outbound arcs from start (instead of adding it to the initialOutput) + Set startArcs = result.getArcsBySource(result.getStartNode()); + for (TransducerGraph.Arc arc : startArcs) { + double oldOutput = ((Double) arc.getOutput()).doubleValue(); + double newOutput = oldOutput + startLambda; + arc.setOutput(new Double(newOutput)); + } + } + // do finalOutput + for (Object o : result.getEndNodes()) { + double endLambda = lambda.getCount(o); + if (endLambda != 0.0) { + // subtract it from the inbound arcs to end (instead of subtracting it from the finalOutput) + Set endArcs = result.getArcsByTarget(o); + for (TransducerGraph.Arc arc : endArcs) { + double oldOutput = ((Double) arc.getOutput()).doubleValue(); + double newOutput = oldOutput - endLambda; + arc.setOutput(new Double(newOutput)); + } + } + + } + return result; + } + + public static void main(String[] args) { + TransducerGraph.GraphProcessor qd 
= new QuasiDeterminizer(); + List pathList = new ArrayList(); + TransducerGraph graph = TransducerGraph.createRandomGraph(1000, 10, 1.0, 10, pathList); + StringBuffer b = new StringBuffer(); + graph.depthFirstSearch(true, b); + System.out.println(b.toString()); + System.out.println("Done creating random graph"); + // TransducerGraph.printPathOutputs(pathList, graph, false); + //System.out.println("Depth first search from start node"); + //TransducerGraph.depthFirstSearch(graph, TransducerGraph.END_NODE, new HashSet(), 0, false); + TransducerGraph newGraph = qd.processGraph(graph); + System.out.println("Done quasi-determinizing"); + //TransducerGraph.printPathOutputs(pathList, newGraph, false); + //System.out.println("Depth first search from start node"); + //TransducerGraph.depthFirstSearch(newGraph, TransducerGraph.END_NODE, new HashSet(), 0, false); + TransducerGraph.testGraphPaths(graph, newGraph, 1000); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/TransducerGraph.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/TransducerGraph.java new file mode 100644 index 0000000..b80091b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/fsm/TransducerGraph.java @@ -0,0 +1,945 @@ +package edu.stanford.nlp.fsm; + +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.trees.TreebankLanguagePack; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Maps; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; + +/** + * TransducerGraph + *

    + * Class for representing a deterministic finite state automaton + * without epsilon transitions. + * + * @author Teg Grenager + * @version 11/02/03 + */ +//TODO: needs some work to make type-safe. (In several places, +//takes an Object and does instanceof to see what it is...) +public class TransducerGraph implements Cloneable { + + public static final String EPSILON_INPUT = "EPSILON"; + + private static final String DEFAULT_START_NODE = "START"; + + private static Random r = new Random(); + + // internal data structures + private Set arcs; + private Map> arcsBySource; + private Map> arcsByTarget; + private Map> arcsByInput; + private Map, Arc> arcsBySourceAndInput; + private Map> arcsByTargetAndInput; + private Object startNode; + private Set endNodes; + private boolean checkDeterminism = false; + + public void setDeterminism(boolean checkDeterminism) { + this.checkDeterminism = checkDeterminism; + } + + public TransducerGraph() { + arcs = Generics.newHashSet(); + arcsBySource = Generics.newHashMap(); + arcsByTarget = Generics.newHashMap(); + arcsByInput = Generics.newHashMap(); + arcsBySourceAndInput = Generics.newHashMap(); + arcsByTargetAndInput = Generics.newHashMap(); + endNodes = Generics.newHashSet(); + setStartNode(DEFAULT_START_NODE); + } + + public TransducerGraph(TransducerGraph other) { + this(other, (ArcProcessor) null); + } + + public TransducerGraph(TransducerGraph other, ArcProcessor arcProcessor) { + this(other.getArcs(), other.getStartNode(), other.getEndNodes(), arcProcessor, null); + } + + public TransducerGraph(TransducerGraph other, NodeProcessor nodeProcessor) { + this(other.getArcs(), other.getStartNode(), other.getEndNodes(), null, nodeProcessor); + } + + public TransducerGraph(Set newArcs, Object startNode, Set endNodes, ArcProcessor arcProcessor, NodeProcessor nodeProcessor) { + this(); + ArcProcessor arcProcessor2 = null; + if (nodeProcessor != null) { + arcProcessor2 = new NodeProcessorWrappingArcProcessor(nodeProcessor); + } + 
for (Arc a : newArcs) { + a = new Arc(a); // make a copy + if (arcProcessor != null) { + a = arcProcessor.processArc(a); + } + if (arcProcessor2 != null) { + a = arcProcessor2.processArc(a); + } + addArc(a); + } + if (nodeProcessor != null) { + this.startNode = nodeProcessor.processNode(startNode); + } else { + this.startNode = startNode; + } + if (nodeProcessor != null) { + if (endNodes != null) { + for (Iterator endIter = endNodes.iterator(); endIter.hasNext();) { + Object o = endIter.next(); + this.endNodes.add(nodeProcessor.processNode(o)); + } + } + } else { + if (endNodes != null) { + this.endNodes.addAll(endNodes); + } + } + } + + /** + * Uses the Arcs newArcs. + */ + public TransducerGraph(Set newArcs) { + this(newArcs, null, null, null, null); + } + + @Override + public TransducerGraph clone() { + TransducerGraph result = new TransducerGraph(this, (ArcProcessor) null); + return result; + } + + public Set getArcs() { + return arcs; + } + + /** + * Just does union of keysets of maps. + */ + public Set getNodes() { + Set result = Generics.newHashSet(); + result.addAll(arcsBySource.keySet()); + result.addAll(arcsByTarget.keySet()); + return result; + } + + public Set getInputs() { + return arcsByInput.keySet(); + } + + public void setStartNode(Object o) { + startNode = o; + } + + public void setEndNode(Object o) { + //System.out.println(this + " setting endNode to " + o); + endNodes.add(o); + } + + public Object getStartNode() { + return startNode; + } + + public Set getEndNodes() { + //System.out.println(this + " getting endNode " + endNode); + return endNodes; + } + + /** + * Returns a Set of type TransducerGraph.Arc. + */ + public Set getArcsByInput(Object node) { + return ensure(arcsByInput.get(node)); + } + + /** + * Returns a Set of type TransducerGraph.Arc. 
+ */ + public Set getArcsBySource(Object node) { + return ensure(arcsBySource.get(node)); + } + + protected Set ensure(Set s) { + if (s == null) { + return Collections.emptySet(); + } + return s; + } + + /** + * Returns a Set of type TransducerGraph.Arc. + */ + public Set getArcsByTarget(Object node) { + return ensure(arcsByTarget.get(node)); + } + + /** + * Can only be one because automaton is deterministic. + */ + public Arc getArcBySourceAndInput(Object node, Object input) { + return arcsBySourceAndInput.get(Generics.newPair(node, input)); + } + + /** + * Returns a Set of type TransducerGraph.Arc. + */ + public Set getArcsByTargetAndInput(Object node, Object input) { + return ensure(arcsByTargetAndInput.get(Generics.newPair(node, input))); + } + + /** + * Slow implementation. + */ + public Arc getArc(Object source, Object target) { + Set arcsFromSource = arcsBySource.get(source); + Set arcsToTarget = arcsByTarget.get(target); + Set result = Generics.newHashSet(); + result.addAll(arcsFromSource); + result.retainAll(arcsToTarget); // intersection + if (result.size() < 1) { + return null; + } + if (result.size() > 1) { + throw new RuntimeException("Problem in TransducerGraph data structures."); + } + // get the only member + Iterator iterator = result.iterator(); + return (Arc) iterator.next(); + } + + /** + * @return true if and only if it created a new Arc and added it to the graph. + */ + public boolean addArc(Object source, Object target, Object input, Object output) { + Arc a = new Arc(source, target, input, output); + return addArc(a); + } + + /** + * @return true if and only if it added Arc a to the graph. + * determinism. 
+ */ + protected boolean addArc(Arc a) { + Object source = a.getSourceNode(); + Object target = a.getTargetNode(); + Object input = a.getInput(); + if (source == null || target == null || input == null) { + return false; + } + // add to data structures + if (arcs.contains(a)) { + return false; + } + // it's new, so add to the rest of the data structures + // add to source and input map + Pair p = Generics.newPair(source, input); + if (arcsBySourceAndInput.containsKey(p) && checkDeterminism) { + throw new RuntimeException("Creating nondeterminism while inserting arc " + a + " because it already has arc " + arcsBySourceAndInput.get(p) + checkDeterminism); + } + arcsBySourceAndInput.put(p, a); + Maps.putIntoValueHashSet(arcsBySource, source, a); + p = Generics.newPair(target, input); + Maps.putIntoValueHashSet(arcsByTargetAndInput, p, a); + Maps.putIntoValueHashSet(arcsByTarget, target, a); + Maps.putIntoValueHashSet(arcsByInput, input, a); + // add to arcs + arcs.add(a); + return true; + } + + public boolean removeArc(Arc a) { + Object source = a.getSourceNode(); + Object target = a.getTargetNode(); + Object input = a.getInput(); + // remove from arcs + if (!arcs.remove(a)) { + return false; + } + // remove from arcsBySourceAndInput + Pair p = Generics.newPair(source, input); + if (!arcsBySourceAndInput.containsKey(p)) { + return false; + } + arcsBySourceAndInput.remove(p); + // remove from arcsBySource + Set s = arcsBySource.get(source); + if (s == null) { + return false; + } + if (!s.remove(a)) { + return false; + } + // remove from arcsByTargetAndInput + p = Generics.newPair(target, input); + s = arcsByTargetAndInput.get(p); + if (s == null) { + return false; + } + if (!s.remove(a)) { + return false; + } + // remove from arcsByTarget + s = arcsByTarget.get(target); + if (s == null) { + return false; + } + s = arcsByInput.get(input); + if (s == null) { + return false; + } + if (!s.remove(a)) { + return false; + } + return true; + } + + public boolean 
canAddArc(Object source, Object target, Object input, Object output) { + Arc a = new Arc(source, target, input, output); + if (arcs.contains(a)) // inexpensive check + { + return false; + } + Pair p = Generics.newPair(source, input); + return !arcsBySourceAndInput.containsKey(p); // expensive check + } + + + public static class Arc { + + protected NODE sourceNode; + protected NODE targetNode; + protected IN input; + protected OUT output; + + public NODE getSourceNode() { + return sourceNode; + } + + public NODE getTargetNode() { + return targetNode; + } + + public IN getInput() { + return input; + } + + public OUT getOutput() { + return output; + } + + public void setSourceNode(NODE o) { + sourceNode = o; + } + + public void setTargetNode(NODE o) { + targetNode = o; + } + + public void setInput(IN o) { + input = o; + } + + public void setOutput(OUT o) { + output = o; + } + + @Override + public int hashCode() { + return sourceNode.hashCode() ^ (targetNode.hashCode() << 16) ^ (input.hashCode() << 16); + } + + @Override + public boolean equals(Object o) { + if (o == this) { + return true; + } + if (!(o instanceof Arc)) { + return false; + } + Arc a = (Arc) o; + return ((sourceNode == null ? a.sourceNode == null : sourceNode.equals(a.sourceNode)) && (targetNode == null ? a.targetNode == null : targetNode.equals(a.targetNode)) && (input == null ? 
a.input == null : input.equals(a.input))); + } + + // makes a copy of Arc a + protected Arc(Arc a) { + this(a.getSourceNode(), a.getTargetNode(), a.getInput(), a.getOutput()); + } + + protected Arc(NODE sourceNode, NODE targetNode) { + this(sourceNode, targetNode, null, null); + } + + protected Arc(NODE sourceNode, NODE targetNode, IN input) { + this(sourceNode, targetNode, input, null); + } + + protected Arc(NODE sourceNode, NODE targetNode, IN input, OUT output) { + this.sourceNode = sourceNode; + this.targetNode = targetNode; + this.input = input; + this.output = output; + } + + @Override + public String toString() { + return sourceNode + " --> " + targetNode + " (" + input + " : " + output + ")"; + } + + } // end static class Arc + + + public static interface ArcProcessor { + /** + * Modifies Arc a. + */ + public Arc processArc(Arc a); + } + + public static class OutputCombiningProcessor implements ArcProcessor { + public Arc processArc(Arc a) { + a = new Arc(a); + a.setInput(Generics.newPair(a.getInput(), a.getOutput())); + a.setOutput(null); + return a; + } + } + + public static class InputSplittingProcessor implements ArcProcessor { + public Arc processArc(Arc a) { + a = new Arc(a); + Pair p = (Pair) a.getInput(); + a.setInput(p.first); + a.setOutput(p.second); + return a; + } + } + + public static class NodeProcessorWrappingArcProcessor implements ArcProcessor { + NodeProcessor nodeProcessor; + + public NodeProcessorWrappingArcProcessor(NodeProcessor nodeProcessor) { + this.nodeProcessor = nodeProcessor; + } + + public Arc processArc(Arc a) { + a = new Arc(a); + a.setSourceNode(nodeProcessor.processNode(a.getSourceNode())); + a.setTargetNode(nodeProcessor.processNode(a.getTargetNode())); + return a; + } + } + + public static interface NodeProcessor { + public Object processNode(Object node); + } + + public static class SetToStringNodeProcessor implements NodeProcessor { + private TreebankLanguagePack tlp; + + public 
SetToStringNodeProcessor(TreebankLanguagePack tlp) { + this.tlp = tlp; + } + + public Object processNode(Object node) { + Set s = null; + if (node instanceof Set) { + s = (Set) node; + } else { + if (node instanceof Block) { + Block b = (Block) node; + s = b.getMembers(); + } else { + throw new RuntimeException("Unexpected node class"); + } + } + Object sampleNode = s.iterator().next(); + if (s.size() == 1) { + if (sampleNode instanceof Block) { + return processNode(sampleNode); + } else { + return sampleNode; + } + } + // nope there's a set of things + if (sampleNode instanceof String) { + String str = (String) sampleNode; + if (str.charAt(0) != '@') { + // passive category... + return tlp.basicCategory(str) + "-" + s.hashCode(); // TODO remove b/c there could be collisions + // return tlp.basicCategory(str) + "-" + System.identityHashCode(s); + } + } + return "@NodeSet-" + s.hashCode(); // TODO remove b/c there could be collisions + // return sampleNode.toString(); + } + } + + public static class ObjectToSetNodeProcessor implements NodeProcessor { + public Object processNode(Object node) { + return Collections.singleton(node); + } + } + + public static interface GraphProcessor { + public TransducerGraph processGraph(TransducerGraph g); + } + + + public static class NormalizingGraphProcessor implements GraphProcessor { + boolean forward = true; + + public NormalizingGraphProcessor(boolean forwardNormalization) { + this.forward = forwardNormalization; + } + + public TransducerGraph processGraph(TransducerGraph g) { + g = new TransducerGraph(g); + Set nodes = g.getNodes(); + for (Object node : nodes) { + Set myArcs = null; + if (forward) { + myArcs = g.getArcsBySource(node); + } else { + myArcs = g.getArcsByTarget(node); + } + // compute a total + double total = 0.0; + for (Arc a : myArcs) { + total += ((Double) a.getOutput()).doubleValue(); + } + // divide each by total + for (Arc a : myArcs) { + a.setOutput(new Double(Math.log(((Double) 
a.getOutput()).doubleValue() / total))); + } + } + return g; + } + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + depthFirstSearch(true, sb); + return sb.toString(); + } + + + private boolean dotWeightInverted = false; + + public void setDotWeightingInverted(boolean inverted) { + dotWeightInverted = true; + } + + public String asDOTString() { + NumberFormat nf = NumberFormat.getNumberInstance(); + nf.setMaximumFractionDigits(3); + nf.setMinimumFractionDigits(1); + StringBuffer result = new StringBuffer(); + Set nodes = getNodes(); + result.append("digraph G {\n"); + // result.append("page = \"8.5,11\";\n"); + // result.append("margin = \"0.25\";\n"); + // Heuristic number of pages + int sz = arcs.size(); + int ht = 105; + int mag = 250; + while (sz > mag) { + ht += 105; + mag *= 2; + } + int wd = 8; + mag = 500; + while (sz > mag) { + wd += 8; + mag *= 4; + } + double htd = ht / 10.0; + result.append("size = \"" + wd + "," + htd + "\";\n"); + result.append("graph [rankdir = \"LR\"];\n"); + result.append("graph [ranksep = \"0.2\"];\n"); + for (Object node : nodes) { + String cleanString = StringUtils.fileNameClean(node.toString()); + result.append(cleanString); + result.append(" [ "); + // if (getEndNodes().contains(node)) { + // result.append("label=\"" + node.toString() + "\", style=filled, "); + // } else + result.append("label=\"" + node.toString() + "\""); + result.append("height=\"0.3\", width=\"0.3\""); + result.append(" ];\n"); + for (Arc arc : getArcsBySource(node)) { + result.append(StringUtils.fileNameClean(arc.getSourceNode().toString())); + result.append(" -> "); + result.append(StringUtils.fileNameClean(arc.getTargetNode().toString())); + result.append(" [ "); + result.append("label=\""); + result.append(arc.getInput()); + result.append(" : "); + // result.append(arc.getOutput()); + Object output = arc.getOutput(); + String wt = ""; + if (output instanceof Number) { + double dd = ((Number) output).doubleValue(); 
+ if (dd == -0.0d) { + result.append(nf.format(0.0d)); + } else { + result.append(nf.format(output)); + } + int weight; + if (dotWeightInverted) { + weight = (int) (20.0 - dd); + } else { + weight = (int) dd; + } + if (weight > 0) { + wt = ", weight = \"" + weight + "\""; + } + if (dotWeightInverted && dd <= 2.0 || (!dotWeightInverted) && dd >= 20.0) { + wt += ", style=bold"; + } + } else { + result.append(output); + } + result.append("\""); + result.append(wt); + // result.append("fontsize = 14 "); + if (arc.getInput().toString().equals("EPSILON")) { + result.append(", style = \"dashed\" "); + } else { + result.append(", style = \"solid\" "); + } + // result.append(", weight = \"" + arc.getOutput() + "\" "); + result.append("];\n"); + } + } + result.append("}\n"); + return result.toString(); + } + + public double inFlow(Object node) { + Set arcs = getArcsByTarget(node); + return sumOutputs(arcs); + } + + public double outFlow(Object node) { + Set arcs = getArcsBySource(node); + return sumOutputs(arcs); + } + + private double sumOutputs(Set arcs) { + double sum = 0.0; + for (Arc arc : arcs) { + sum += ((Double) arc.getOutput()).doubleValue(); + } + return sum; + } + + public double getSourceTotal(Object node) { + double result = 0.0; + Set arcs = getArcsBySource(node); + if (arcs.size() == 0) { + System.err.println("No outbound arcs from node."); + return result; + } + for (Arc arc : arcs) { + result += ((Double) arc.getOutput()).doubleValue(); + } + return result; + } + + /** + * For testing only. Doubles combined by addition. 
+ */ + public double getOutputOfPathInGraph(List path) { + double score = 0.0; + Object node = getStartNode(); + for (Object input : path) { + Arc arc = getArcBySourceAndInput(node, input); // next input in path + if (arc == null) { + System.out.println(" NOT ACCEPTED :" + path); + return Double.NEGATIVE_INFINITY; + } + score += ((Double) arc.getOutput()).doubleValue(); + node = arc.getTargetNode(); + } + return score; + } + + /** + * for testing only. doubles combined by addition. + */ + public List sampleUniformPathFromGraph() { + List list = new ArrayList(); + Object node = this.getStartNode(); + Set endNodes = this.getEndNodes(); + while (!endNodes.contains(node)) { + List arcs = new ArrayList(this.getArcsBySource(node)); + TransducerGraph.Arc arc = arcs.get(r.nextInt(arcs.size())); + list.add(arc.getInput()); + node = arc.getTargetNode(); + } + return list; + } + + public Map samplePathsFromGraph(int numPaths) { + Map result = Generics.newHashMap(); + for (int i = 0; i < numPaths; i++) { + List l = sampleUniformPathFromGraph(); + result.put(l, new Double(getOutputOfPathInGraph(l))); + } + return result; + } + + /** + * For testing only. + */ + public static void printPathOutputs(List pathList, TransducerGraph graph, boolean printPaths) { + int i = 0; + for (Iterator iter = pathList.iterator(); iter.hasNext();) { + List path = iter.next(); + if (printPaths) { + for (Iterator j = path.iterator(); j.hasNext();) { + System.out.print(j.next() + " "); + } + } else { + System.out.print(i++ + " "); + } + System.out.print("output: " + graph.getOutputOfPathInGraph(path)); + System.out.println(); + } + } + + /** + * For testing only. 
+ */ + public List getPathOutputs(List pathList) { + List outputList = new ArrayList(); + for (Iterator iter = pathList.iterator(); iter.hasNext();) { + List path = iter.next(); + outputList.add(new Double(getOutputOfPathInGraph(path))); + } + return outputList; + } + + public static boolean testGraphPaths(TransducerGraph sourceGraph, TransducerGraph testGraph, int numPaths) { + for (int i = 0; i < numPaths; i++) { + List path = sourceGraph.sampleUniformPathFromGraph(); + double score = sourceGraph.getOutputOfPathInGraph(path); + double newScore = testGraph.getOutputOfPathInGraph(path); + if ((score - newScore) / (score + newScore) > 1e-10) { + System.out.println("Problem: " + score + " vs. " + newScore + " on " + path); + return false; + } + } + return true; + } + + + /** + * For testing only. Doubles combined by multiplication. + */ + public boolean canAddPath(List path) { + Object node = this.getStartNode(); + for (int j = 0; j < path.size() - 1; j++) { + Object input = path.get(j); + Arc arc = this.getArcBySourceAndInput(node, input); // next input in path + if (arc == null) { + return true; + } + node = arc.getTargetNode(); + } + Object input = path.get(path.size() - 1); // last element + Arc arc = this.getArcBySourceAndInput(node, input); // next input in path + if (arc == null) { + return true; + } else { + if (getEndNodes().contains(arc.getTargetNode())) { + return true; + } else { + return false; + } + } + } + + /** + * If markovOrder is zero, we always transition back to the start state + * If markovOrder is negative, we assume that it is infinite + */ + public static TransducerGraph createGraphFromPaths(List paths, int markovOrder) { + ClassicCounter pathCounter = new ClassicCounter(); + for (Object o : paths) { + pathCounter.incrementCount(o); + } + return createGraphFromPaths(pathCounter, markovOrder); + } + + public static TransducerGraph createGraphFromPaths(ClassicCounter> pathCounter, int markovOrder) { + TransducerGraph graph = new 
TransducerGraph(); // empty + for (Iterator> pathIter = pathCounter.keySet().iterator(); pathIter.hasNext();) { + List path = pathIter.next(); + double count = pathCounter.getCount(path); + addOnePathToGraph(path, count, markovOrder, graph); + } + return graph; + } + + // assumes that the path already has EPSILON as the last element. + public static void addOnePathToGraph(List path, double count, int markovOrder, TransducerGraph graph) { + Object source = graph.getStartNode(); + Object input, target; + for (int j = 0; j < path.size(); j++) { + input = path.get(j); + Arc a = graph.getArcBySourceAndInput(source, input); + if (a != null) { + // increment the arc weight + a.output = new Double(((Double) a.output).doubleValue() + count); + } else { + if (input.equals(TransducerGraph.EPSILON_INPUT)) { + target = "END"; // to ensure they all share the same end node + } else if (markovOrder == 0) { + // we all transition back to the same state + target = source; + } else if (markovOrder > 0) { + // the state is described by the partial history + target = path.subList((j < markovOrder ? 0 : j - markovOrder + 1), j + 1); + } else { + // the state is described by the full history + target = path.subList(0, j + 1); + } + Double output = new Double(count); + a = new Arc(source, target, input, output); + graph.addArc(a); + } + source = a.getTargetNode(); + } + graph.setEndNode(source); + } + + /** + * For testing only. All paths will be added to pathList as Lists. + * // generate a bunch of paths through the graph with the input alphabet + * // and create new nodes for each one. + */ + public static TransducerGraph createRandomGraph(int numPaths, int pathLengthMean, double pathLengthVariance, int numInputs, List pathList) { + // compute the path length. 
Draw from a normal distribution + int pathLength = (int) (r.nextGaussian() * pathLengthVariance + pathLengthMean); + String input; + List path; + for (int i = 0; i < numPaths; i++) { + // make a path + path = new ArrayList(); + for (int j = 0; j < pathLength; j++) { + input = Integer.toString(r.nextInt(numInputs)); + path.add(input); + } + // TODO: createRandomPaths had the following difference: + // we're done, add one more arc to get to the endNode. + //input = TransducerGraph.EPSILON_INPUT; + //path.add(input); + pathList.add(path); + } + return createGraphFromPaths(pathList, -1); + } + + public static List createRandomPaths(int numPaths, int pathLengthMean, double pathLengthVariance, int numInputs) { + List pathList = new ArrayList(); + // make a bunch of paths, randomly + // compute the path length. Draw from a normal distribution + int pathLength = (int) (r.nextGaussian() * pathLengthVariance + pathLengthMean); + String input; + List path; + for (int i = 0; i < numPaths; i++) { + // make a path + path = new ArrayList(); + for (int j = 0; j < pathLength; j++) { + input = Integer.toString(r.nextInt(numInputs)); + path.add(input); + } + // we're done, add one more arc to get to the endNode. + input = TransducerGraph.EPSILON_INPUT; + path.add(input); + pathList.add(path); + } + return pathList; + } + + public void depthFirstSearch(boolean forward, StringBuffer b) { + if (forward) { + depthFirstSearchHelper(getStartNode(), new HashSet(), 0, true, b); + } else { + for (Iterator endIter = getEndNodes().iterator(); endIter.hasNext();) { + Object o = endIter.next(); + depthFirstSearchHelper(o, new HashSet(), 0, false, b); + } + } + } + + /** + * For testing only. 
+ */ + private void depthFirstSearchHelper(Object node, Set marked, int level, boolean forward, StringBuffer b) { + if (marked.contains(node)) { + return; + } + marked.add(node); + Set arcs; + if (forward) { + arcs = this.getArcsBySource(node); + } else { + arcs = this.getArcsByTarget(node); + } + if (arcs == null) { + return; + } + for (Iterator iter = arcs.iterator(); iter.hasNext();) { + Arc newArc = (Arc) iter.next(); + // print it out + for (int i = 0; i < level; i++) { + b.append(" "); + } + if (getEndNodes().contains(newArc.getTargetNode())) { + b.append(newArc + " END\n"); + } else { + b.append(newArc + "\n"); + } + if (forward) { + depthFirstSearchHelper(newArc.getTargetNode(), marked, level + 1, forward, b); + } else { + depthFirstSearchHelper(newArc.getSourceNode(), marked, level + 1, forward, b); + } + } + } + + /** + * For testing only. + */ + public static void main(String[] args) { + List pathList = new ArrayList(); + TransducerGraph graph = createRandomGraph(1000, 10, 0.0, 10, pathList); + System.out.println("Done creating random graph"); + printPathOutputs(pathList, graph, true); + System.out.println("Depth first search from start node"); + StringBuffer b = new StringBuffer(); + graph.depthFirstSearch(true, b); + System.out.println(b.toString()); + b = new StringBuffer(); + System.out.println("Depth first search back from end node"); + graph.depthFirstSearch(false, b); + System.out.println(b.toString()); + } + +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/ConnectedComponents.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/ConnectedComponents.java new file mode 100644 index 0000000..358b881 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/ConnectedComponents.java @@ -0,0 +1,50 @@ +package edu.stanford.nlp.graph; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; 
+import java.util.Set; + +import edu.stanford.nlp.util.CollectionUtils; +import edu.stanford.nlp.util.Generics; + +/** + * Finds connected components in the graph, currently uses inefficient list for + * variable 'verticesLeft'. It might give a problem for big graphs + * + * @author sonalg 08/08/11 + */ +public class ConnectedComponents { + + public static List> getConnectedComponents(DirectedMultiGraph graph) { + List> ccs = new ArrayList>(); + LinkedList todo = new LinkedList(); + // TODO: why not a set? + List verticesLeft = CollectionUtils.toList(graph.getAllVertices()); + while (verticesLeft.size() > 0) { + todo.add(verticesLeft.get(0)); + verticesLeft.remove(0); + ccs.add(bfs(todo, graph, verticesLeft)); + } + return ccs; + } + + private static Set bfs(LinkedList todo, DirectedMultiGraph graph, List verticesLeft) { + Set cc = Generics.newHashSet(); + while (todo.size() > 0) { + V node = todo.removeFirst(); + cc.add(node); + for (V neighbor : graph.getNeighbors(node)) { + if (verticesLeft.contains(neighbor)) { + cc.add(neighbor); + todo.add(neighbor); + verticesLeft.remove(neighbor); + } + } + } + + return cc; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/DijkstraShortestPath.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/DijkstraShortestPath.java new file mode 100644 index 0000000..89227fb --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/DijkstraShortestPath.java @@ -0,0 +1,71 @@ +package edu.stanford.nlp.graph; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.BinaryHeapPriorityQueue; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; + +public 
class DijkstraShortestPath { + private DijkstraShortestPath() {} // static method only + + public static List getShortestPath(DirectedMultiGraph graph, + V node1, V node2, + boolean directionSensitive) { + if (node1.equals(node2)) { + return Collections.singletonList(node2); + } + + Set visited = Generics.newHashSet(); + + Map previous = Generics.newHashMap(); + + BinaryHeapPriorityQueue unsettledNodes = + new BinaryHeapPriorityQueue(); + + unsettledNodes.add(node1, 0); + + while (unsettledNodes.size() > 0) { + double distance = unsettledNodes.getPriority(); + V u = unsettledNodes.removeFirst(); + visited.add(u); + + if (u.equals(node2)) + break; + + unsettledNodes.remove(u); + + Set candidates = ((directionSensitive) ? + graph.getChildren(u) : graph.getNeighbors(u)); + for (V candidate : candidates) { + double alt = distance - 1; + // nodes not already present will have a priority of -inf + if (alt > unsettledNodes.getPriority(candidate) && + !visited.contains(candidate)) { + unsettledNodes.relaxPriority(candidate, alt); + previous.put(candidate, u); + } + } + } + if (!previous.containsKey(node2)) + return null; + ArrayList path = new ArrayList(); + path.add(node2); + V n = node2; + while (previous.containsKey(n)) { + path.add(previous.get(n)); + n = previous.get(n); + } + Collections.reverse(path); + return path; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/DirectedMultiGraph.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/DirectedMultiGraph.java new file mode 100644 index 0000000..e51782c --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/DirectedMultiGraph.java @@ -0,0 +1,509 @@ +package edu.stanford.nlp.graph; + +import java.util.*; + +import edu.stanford.nlp.util.CollectionUtils; +import edu.stanford.nlp.util.Generics; + +/** + * Simple graph library; this is directed for now. 
This class focuses on time + * efficiency rather than memory efficiency. + * + * @author sonalg + * @author John Bauer + * + * @param + * Type of vertices + * @param + * Type of edges. + */ + +public class DirectedMultiGraph implements Graph /* Serializable */{ + + Map>> outgoingEdges = Generics.newHashMap(); + + Map>> incomingEdges = Generics.newHashMap(); + + public DirectedMultiGraph() { + } + + /** + * Be careful hashing these. They are mutable objects, and changing the object + * will throw off the hash code, messing up your hash table + */ + public int hashCode() { + return outgoingEdges.hashCode(); + } + + @SuppressWarnings("unchecked") + public boolean equals(Object that) { + if (that == this) + return true; + if (!(that instanceof DirectedMultiGraph)) + return false; + return outgoingEdges.equals(((DirectedMultiGraph) that).outgoingEdges); + } + + /** + * For adding a zero degree vertex + * + * @param v + */ + public boolean addVertex(V v) { + if (outgoingEdges.containsKey(v)) + return false; + outgoingEdges.put(v, Generics.>newHashMap()); + incomingEdges.put(v, Generics.>newHashMap()); + return true; + } + + /** + * adds vertices (if not already in the graph) and the edge between them + * + * @param source + * @param dest + * @param data + */ + public void add(V source, V dest, E data) { + addVertex(source); + addVertex(dest); + + Map> outgoingMap = outgoingEdges.get(source); + List outgoingList = outgoingMap.get(dest); + if (outgoingList == null) { + outgoingList = new ArrayList(); + outgoingMap.put(dest, outgoingList); + } + + Map> incomingMap = incomingEdges.get(dest); + List incomingList = incomingMap.get(source); + if (incomingList == null) { + incomingList = new ArrayList(); + incomingMap.put(source, incomingList); + } + + outgoingList.add(data); + incomingList.add(data); + } + + public boolean removeEdges(V source, V dest) { + if (!outgoingEdges.containsKey(source)) { + return false; + } + if (!incomingEdges.containsKey(dest)) { + return false; + } 
+ if (!outgoingEdges.get(source).containsKey(dest)) { + return false; + } + outgoingEdges.get(source).remove(dest); + incomingEdges.get(dest).remove(source); + return true; + } + + public boolean removeEdge(V source, V dest, E data) { + if (!outgoingEdges.containsKey(source)) { + return false; + } + if (!incomingEdges.containsKey(dest)) { + return false; + } + if (!outgoingEdges.get(source).containsKey(dest)) { + return false; + } + boolean foundOut = outgoingEdges.get(source).get(dest).remove(data); + boolean foundIn = incomingEdges.get(dest).get(source).remove(data); + if (foundOut && !foundIn) { + throw new AssertionError("Edge found in outgoing but not incoming"); + } + if (foundIn && !foundOut) { + throw new AssertionError("Edge found in incoming but not outgoing"); + } + // TODO: cut down the number of .get calls + if (outgoingEdges.get(source).get(dest).size() == 0) { + outgoingEdges.get(source).remove(dest); + } + if (incomingEdges.get(dest).get(source).size() == 0) { + incomingEdges.get(dest).remove(source); + } + return foundOut; + } + + /** + * remove a vertex (and its edges) from the graph. 
+ * + * @param vertex + * @return true if successfully removes the node + */ + public boolean removeVertex(V vertex) { + if (!outgoingEdges.containsKey(vertex)) { + return false; + } + for (V other : outgoingEdges.get(vertex).keySet()) { + incomingEdges.get(other).remove(vertex); + } + for (V other : incomingEdges.get(vertex).keySet()) { + outgoingEdges.get(other).remove(vertex); + } + outgoingEdges.remove(vertex); + incomingEdges.remove(vertex); + return true; + } + + public boolean removeVertices(Collection vertices) { + boolean changed = false; + for (V v : vertices) { + if (removeVertex(v)) { + changed = true; + } + } + return changed; + } + + public int getNumVertices() { + return outgoingEdges.size(); + } + + public List getOutgoingEdges(V v) { + return CollectionUtils.flatten(outgoingEdges.get(v).values()); + } + + public List getIncomingEdges(V v) { + return CollectionUtils.flatten(incomingEdges.get(v).values()); + } + + public int getNumEdges() { + int count = 0; + for (Map.Entry>> sourceEntry : outgoingEdges.entrySet()) { + for (Map.Entry> destEntry : sourceEntry.getValue().entrySet()) { + count += destEntry.getValue().size(); + } + } + return count; + } + + public Set getParents(V vertex) { + Map> parentMap = incomingEdges.get(vertex); + if (parentMap == null) + return null; + return Collections.unmodifiableSet(parentMap.keySet()); + } + + public Set getChildren(V vertex) { + Map> childMap = outgoingEdges.get(vertex); + if (childMap == null) + return null; + return Collections.unmodifiableSet(childMap.keySet()); + } + + /** + * Gets both parents and children nodes + * + * @param v + */ + public Set getNeighbors(V v) { + // TODO: pity we have to copy the sets... is there a combination set? 
+ Set children = getChildren(v); + Set parents = getParents(v); + + if (children == null && parents == null) + return null; + Set neighbors = Generics.newHashSet(); + neighbors.addAll(children); + neighbors.addAll(parents); + return neighbors; + } + + /** + * clears the graph, removes all edges and nodes + */ + public void clear() { + incomingEdges.clear(); + outgoingEdges.clear(); + } + + public boolean containsVertex(V v) { + return outgoingEdges.containsKey(v); + } + + /** + * only checks if there is an edge from source to dest. To check if it is + * connected in either direction, use isNeighbor + * + * @param source + * @param dest + */ + public boolean isEdge(V source, V dest) { + Map> childrenMap = outgoingEdges.get(source); + if (childrenMap == null || childrenMap.isEmpty()) + return false; + List edges = childrenMap.get(dest); + if (edges == null || edges.isEmpty()) + return false; + return edges.size() > 0; + } + + public boolean isNeighbor(V source, V dest) { + return isEdge(source, dest) || isEdge(dest, source); + } + + public Set getAllVertices() { + return Collections.unmodifiableSet(outgoingEdges.keySet()); + } + + public List getAllEdges() { + List edges = new ArrayList(); + for (Map> e : outgoingEdges.values()) { + for (List ee : e.values()) { + edges.addAll(ee); + } + } + return edges; + } + + /** + * False if there are any vertices in the graph, true otherwise. Does not care + * about the number of edges. 
+ */ + public boolean isEmpty() { + return outgoingEdges.isEmpty(); + } + + /** + * Deletes nodes with zero incoming and zero outgoing edges + */ + public void removeZeroDegreeNodes() { + List toDelete = new ArrayList(); + for (V vertex : outgoingEdges.keySet()) { + if (outgoingEdges.get(vertex).size() == 0 && incomingEdges.get(vertex).size() == 0) { + toDelete.add(vertex); + } + } + for (V vertex : toDelete) { + outgoingEdges.remove(vertex); + incomingEdges.remove(vertex); + } + } + + public List getEdges(V source, V dest) { + Map> childrenMap = outgoingEdges.get(source); + if (childrenMap == null) { + return Collections.emptyList(); + } + List edges = childrenMap.get(dest); + if (edges == null) { + return Collections.emptyList(); + } + return Collections.unmodifiableList(edges); + } + + /** + * direction insensitive (the paths can go "up" or through the parents) + */ + public List getShortestPath(V node1, V node2) { + if (!outgoingEdges.containsKey(node1) || !outgoingEdges.containsKey(node2)) { + return null; + } + return getShortestPath(node1, node2, false); + } + + public List getShortestPathEdges(V node1, V node2) { + return convertPath(getShortestPath(node1, node2), false); + } + + /** + * can specify the direction sensitivity + * + * @param node1 + * @param node2 + * @param directionSensitive + * - whether the path can go through the parents + * @return the list of nodes you get through to get there + */ + public List getShortestPath(V node1, V node2, boolean directionSensitive) { + if (!outgoingEdges.containsKey(node1) || !outgoingEdges.containsKey(node2)) { + return null; + } + return DijkstraShortestPath.getShortestPath(this, node1, node2, directionSensitive); + } + + public List getShortestPathEdges(V node1, V node2, boolean directionSensitive) { + return convertPath(getShortestPath(node1, node2, directionSensitive), directionSensitive); + } + + public List convertPath(List nodes, boolean directionSensitive) { + if (nodes == null) + return null; + + if 
(nodes.size() <= 1) + return Collections.emptyList(); + + List path = new ArrayList(); + Iterator nodeIterator = nodes.iterator(); + V previous = nodeIterator.next(); + while (nodeIterator.hasNext()) { + V next = nodeIterator.next(); + E connection = null; + List edges = getEdges(previous, next); + if (edges.size() == 0 && !directionSensitive) { + edges = getEdges(next, previous); + } + if (edges.size() > 0) { + connection = edges.get(0); + } else { + throw new IllegalArgumentException("Path given with missing " + "edge connection"); + } + path.add(connection); + previous = next; + } + return path; + } + + public int getInDegree(V vertex) { + if (!containsVertex(vertex)) { + throw new IllegalArgumentException("Unknown vertex " + vertex); + } + int result = 0; + Map> incoming = incomingEdges.get(vertex); + for (List edges : incoming.values()) { + result += edges.size(); + } + return result; + } + + public int getOutDegree(V vertex) { + int result = 0; + Map> outgoing = outgoingEdges.get(vertex); + if (outgoing == null) { + throw new IllegalArgumentException("Unknown vertex " + vertex); + } + for (List edges : outgoing.values()) { + result += edges.size(); + } + return result; + } + + public List> getConnectedComponents() { + return ConnectedComponents.getConnectedComponents(this); + } + + public Iterator incomingEdgeIterator(final V vertex) { + return new EdgeIterator(incomingEdges, vertex); + } + + public Iterable incomingEdgeIterable(final V vertex) { + return new Iterable() { + public Iterator iterator() { + return new EdgeIterator(incomingEdges, vertex); + } + }; + } + + public Iterator outgoingEdgeIterator(final V vertex) { + return new EdgeIterator(outgoingEdges, vertex); + } + + public Iterable outgoingEdgeIterable(final V vertex) { + return new Iterable() { + public Iterator iterator() { + return new EdgeIterator(outgoingEdges, vertex); + } + }; + } + + public Iterator edgeIterator() { + return new EdgeIterator(this); + } + + public Iterable edgeIterable() { 
+ return new Iterable() { + public Iterator iterator() { + return new EdgeIterator(DirectedMultiGraph.this); + } + }; + } + + static class EdgeIterator implements Iterator { + private Iterator>> vertexIterator; + private Iterator> connectionIterator; + private Iterator edgeIterator; + + E next; + + public EdgeIterator(DirectedMultiGraph graph) { + vertexIterator = graph.outgoingEdges.values().iterator(); + primeNext(); + } + + public EdgeIterator(Map>> source, V startVertex) { + Map> neighbors = source.get(startVertex); + if (neighbors == null) { + return; + } + connectionIterator = neighbors.values().iterator(); + primeNext(); + } + + public boolean hasNext() { + return next != null; + } + + public E next() { + if (next == null) { + throw new NoSuchElementException("Graph edge iterator exhausted."); + } + E value = next; + primeNext(); + return value; + } + + private void primeNext() { + while (true) { + if (edgeIterator != null && edgeIterator.hasNext()) { + next = edgeIterator.next(); + break; + } + + if (connectionIterator != null && connectionIterator.hasNext()) { + edgeIterator = connectionIterator.next().iterator(); + continue; + } + + if (vertexIterator != null && vertexIterator.hasNext()) { + connectionIterator = vertexIterator.next().values().iterator(); + continue; + } + + next = null; + break; + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + @Override + public String toString() { + StringBuilder s = new StringBuilder(); + s.append("{\n"); + s.append("Vertices:\n"); + for (V vertex : outgoingEdges.keySet()) { + s.append(" ").append(vertex).append('\n'); + } + s.append("Edges:\n"); + for (V source : outgoingEdges.keySet()) { + for (V dest : outgoingEdges.get(source).keySet()) { + for (E edge : outgoingEdges.get(source).get(dest)) { + s.append(" ").append(source).append(" -> ").append(dest).append(" : ").append(edge).append('\n'); + } + } + } + s.append('}'); + return s.toString(); + } + + private static final 
long serialVersionUID = 609823567298345145L; + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/Graph.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/Graph.java new file mode 100644 index 0000000..162572b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/graph/Graph.java @@ -0,0 +1,122 @@ +package edu.stanford.nlp.graph; + +import java.io.Serializable; + +import java.util.Collection; +import java.util.List; +import java.util.Set; + +public interface Graph extends Serializable { + + /** + * Adds vertices (if not already in the graph) and the edge between them. + * (If the graph is undirected, the choice of which vertex to call + * source and dest is arbitrary.) + * + * @param source + * @param dest + * @param data + */ + public abstract void add(V source, V dest, E data); + /** + * For adding a zero degree vertex + * + * @param v + */ + + public abstract boolean addVertex(V v); + + + + public abstract boolean removeEdges(V source, V dest); + + public abstract boolean removeEdge(V source, V dest, E data); + + /** + * remove a vertex (and its edges) from the graph. 
+ * + * @param vertex + * @return true if successfully removes the node + */ + public abstract boolean removeVertex(V vertex); + + public abstract boolean removeVertices(Collection vertices); + + public abstract int getNumVertices(); + + /** + * for undirected graph, it is just the edges from the node + * @param v + */ + public abstract List getOutgoingEdges(V v); + + /** + * for undirected graph, it is just the edges from the node + * @param v + */ + public abstract List getIncomingEdges(V v); + + public abstract int getNumEdges(); + + /** + * for undirected graph, it is just the neighbors + * @param vertex + */ + public abstract Set getParents(V vertex); + + /** + * for undirected graph, it is just the neighbors + * @param vertex + */ + + public abstract Set getChildren(V vertex); + + public abstract Set getNeighbors(V v); + + /** + * clears the graph, removes all edges and nodes + */ + public abstract void clear(); + + public abstract boolean containsVertex(V v); + + /** + * only checks if there is an edge from source to dest. To check if it is + * connected in either direction, use isNeighbor + * + * @param source + * @param dest + */ + public abstract boolean isEdge(V source, V dest); + + public abstract boolean isNeighbor(V source, V dest); + + public abstract Set getAllVertices(); + + public abstract List getAllEdges(); + + /** + * False if there are any vertices in the graph, true otherwise. Does not care + * about the number of edges. 
+ */ + public abstract boolean isEmpty(); + + /** + * Deletes nodes with zero incoming and zero outgoing edges + */ + public abstract void removeZeroDegreeNodes(); + + public abstract List getEdges(V source, V dest); + + + /** + * for undirected graph, it should just be the degree + * @param vertex + */ + public abstract int getInDegree(V vertex); + + public abstract int getOutDegree(V vertex); + + public abstract List> getConnectedComponents(); + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java new file mode 100644 index 0000000..f17df64 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java @@ -0,0 +1,1822 @@ +// AbstractSequenceClassifier -- a framework for probabilistic sequence models. +// Copyright (c) 2002-2008 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// Support/Questions: java-nlp-user@lists.stanford.edu +// Licensing: java-nlp-support@lists.stanford.edu +// http://nlp.stanford.edu/downloads/crf-classifier.shtml + +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.fsm.DFSA; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.io.RegExFileFilter; +import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.ling.CoreAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.objectbank.ResettableReaderIteratorFactory; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.process.CoreTokenFactory; +import edu.stanford.nlp.sequences.*; +import edu.stanford.nlp.sequences.FeatureFactory; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.stats.Counters; +import edu.stanford.nlp.stats.Sampler; +import edu.stanford.nlp.util.*; +import edu.stanford.nlp.util.concurrent.*; + +import java.io.*; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; + +/** + * This class provides common functionality for (probabilistic) sequence models. + * It is a superclass of our CMM and CRF sequence classifiers, and is even used + * in the (deterministic) NumberSequenceClassifier. See implementing classes for + * more information. + *

    + * A full implementation should implement these 5 abstract methods:
    + * {@code List classify(List document); }
    + * {@code void train(Collection> docs); }
    + * {@code printProbsDocument(List document); }
    + * {@code void serializeClassifier(String serializePath); }
    + * {@code void loadClassifier(ObjectInputStream in, Properties props) throws IOException, + * ClassCastException, ClassNotFoundException; }
    + * but a runtime (or rule-based) implementation can usefully implement just the first. + * + * @author Jenny Finkel + * @author Dan Klein + * @author Christopher Manning + * @author Dan Cer + * @author sonalg (made the class generic) + */ +public abstract class AbstractSequenceClassifier implements Function { + + public SeqClassifierFlags flags; + public Index classIndex; // = null; + public FeatureFactory featureFactory; + protected IN pad; + private CoreTokenFactory tokenFactory; + protected int windowSize; + // different threads can add or query knownLCWords at the same time, + // so we need a concurrent data structure + protected Set knownLCWords = Collections.newSetFromMap(new ConcurrentHashMap()); + + private boolean VERBOSE = true; + private DocumentReaderAndWriter defaultReaderAndWriter; + public DocumentReaderAndWriter defaultReaderAndWriter() { + return defaultReaderAndWriter; + } + + private AtomicInteger threadCompletionCounter = new AtomicInteger(0); + + private DocumentReaderAndWriter plainTextReaderAndWriter; + public DocumentReaderAndWriter plainTextReaderAndWriter() { + return plainTextReaderAndWriter; + } + + + /** + * Construct a SeqClassifierFlags object based on the passed in properties, + * and then call the other constructor. + * + * @param props + * See SeqClassifierFlags for known properties. + */ + public AbstractSequenceClassifier(Properties props) { + this(new SeqClassifierFlags(props)); + } + + /** + * Initialize the featureFactory and other variables based on the passed in + * flags. + * + * @param flags A specification of the AbstractSequenceClassifier to construct. 
+ */ + public AbstractSequenceClassifier(SeqClassifierFlags flags) { + this.flags = flags; + + // try { + this.featureFactory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); + // this.featureFactory = (FeatureFactory) Class.forName(flags.featureFactory).newInstance(); + if (flags.tokenFactory == null) { + tokenFactory = (CoreTokenFactory) new CoreLabelTokenFactory(); + } else { + this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs); + // this.tokenFactory = (CoreTokenFactory) Class.forName(flags.tokenFactory).newInstance(); + } + // } catch (Exception e) { + // throw new RuntimeException(e); + // } + pad = tokenFactory.makeToken(); + windowSize = flags.maxLeft + 1; + reinit(); + } + + /** + * This method should be called after there have been changes to the flags + * (SeqClassifierFlags) variable, such as after deserializing a classifier. It + * is called inside the loadClassifier methods. It assumes that the flags + * variable and the pad variable exist, but reinitializes things like the pad + * variable, featureFactory and readerAndWriter based on the flags. + *

    + * Implementation note: At the moment this variable doesn't set + * windowSize or featureFactory, since they are being serialized separately in + * the file, but we should probably stop serializing them and just + * reinitialize them from the flags? + */ + protected final void reinit() { + pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol); + pad.set(CoreAnnotations.GoldAnswerAnnotation.class, flags.backgroundSymbol); + + featureFactory.init(flags); + + defaultReaderAndWriter = makeReaderAndWriter(); + if (flags.readerAndWriter != null && + flags.readerAndWriter.equals(flags.plainTextDocumentReaderAndWriter)) { + plainTextReaderAndWriter = defaultReaderAndWriter; + } else { + plainTextReaderAndWriter = makePlainTextReaderAndWriter(); + } + } + + /** + * Makes a DocumentReaderAndWriter based on the flags the CRFClassifier + * was constructed with. Will create an instance of the class specified in + * the property flags.readerAndWriter and + * initialize it with the CRFClassifier's flags. + * + * @return The appropriate ReaderAndWriter for training/testing this classifier + */ + public DocumentReaderAndWriter makeReaderAndWriter() { + DocumentReaderAndWriter readerAndWriter; + try { + readerAndWriter = ReflectionLoading.loadByReflection(flags.readerAndWriter); + } catch (Exception e) { + throw new RuntimeException(String.format("Error loading flags.readerAndWriter: '%s'", flags.readerAndWriter), e); + } + readerAndWriter.init(flags); + return readerAndWriter; + } + + /** + * Makes a DocumentReaderAndWriter based on + * flags.plainTextReaderAndWriter. Useful for reading in + * untokenized text documents or reading plain text from the command + * line. An example of a way to use this would be to return a + * edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter for + * the Chinese Segmenter. 
+ */ + public DocumentReaderAndWriter makePlainTextReaderAndWriter() { + String readerClassName = flags.plainTextDocumentReaderAndWriter; + // We set this default here if needed because there may be models + // which don't have the reader flag set + if (readerClassName == null) { + readerClassName = SeqClassifierFlags.DEFAULT_PLAIN_TEXT_READER; + } + DocumentReaderAndWriter readerAndWriter; + try { + readerAndWriter = ReflectionLoading.loadByReflection(readerClassName); + } catch (Exception e) { + throw new RuntimeException(String.format("Error loading flags.plainTextDocumentReaderAndWriter: '%s'", flags.plainTextDocumentReaderAndWriter), e); + } + readerAndWriter.init(flags); + return readerAndWriter; + } + + /** + * Returns the background class for the classifier. + * + * @return The background class name + */ + public String backgroundSymbol() { + return flags.backgroundSymbol; + } + + public Set labels() { + return Generics.newHashSet(classIndex.objectsList()); + } + + /** + * Classify a List of IN. This method returns a new list of tokens, not + * the list of tokens passed in, and runs the new tokens through + * ObjectBankWrapper. (Both these behaviors are different from that of the + * classify(List) method. + * + * @param sentence The List of IN to be classified. + * @return The classified List of IN, where the classifier output for + * each token is stored in its + * {@link edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation} + * field. + */ + public List classifySentence(List sentence) { + List document = new ArrayList(); + int i = 0; + for (HasWord word : sentence) { + IN wi; // initialized below + if (word instanceof CoreMap) { + // copy all annotations! 
some are required later in + // AbstractSequenceClassifier.classifyWithInlineXML + // wi = (IN) new ArrayCoreMap((ArrayCoreMap) word); + wi = tokenFactory.makeToken((IN) word); + } else { + wi = tokenFactory.makeToken(); + wi.set(CoreAnnotations.TextAnnotation.class, word.word()); + // wi.setWord(word.word()); + } + wi.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(i)); + wi.set(CoreAnnotations.AnswerAnnotation.class, backgroundSymbol()); + document.add(wi); + i++; + } + + // TODO get rid of objectbankwrapper + ObjectBankWrapper wrapper = new ObjectBankWrapper(flags, null, knownLCWords); + wrapper.processDocument(document); + + classify(document); + + return document; + } + + /** + * Classify a List of IN using whatever additional information is passed in globalInfo. + * Used by SUTime (NumberSequenceClassifier), which requires the doc date to resolve relative dates + * + * @param tokenSequence + * The List of IN to be classified. + * @return The classified List of IN, where the classifier output for + * each token is stored in its "answer" field. + */ + public List classifySentenceWithGlobalInformation(List tokenSequence, final CoreMap doc, final CoreMap sentence) { + List document = new ArrayList(); + int i = 0; + for (HasWord word : tokenSequence) { + IN wi; // initialized straight below + if (word instanceof CoreMap) { + // copy all annotations! 
some are required later in + // AbstractSequenceClassifier.classifyWithInlineXML + // wi = (IN) new ArrayCoreMap((ArrayCoreMap) word); + wi = tokenFactory.makeToken((IN) word); + } else { + wi = tokenFactory.makeToken(); + wi.set(CoreAnnotations.TextAnnotation.class, word.word()); + // wi.setWord(word.word()); + } + wi.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(i)); + wi.set(CoreAnnotations.AnswerAnnotation.class, backgroundSymbol()); + document.add(wi); + i++; + } + + // TODO get rid of objectbankwrapper + ObjectBankWrapper wrapper = new ObjectBankWrapper(flags, null, knownLCWords); + wrapper.processDocument(document); + + classifyWithGlobalInformation(document, doc, sentence); + + return document; + } + + public SequenceModel getSequenceModel(List doc) { + throw new UnsupportedOperationException(); + } + + public Sampler> getSampler(final List input) { + return new Sampler>() { + SequenceModel model = getSequenceModel(input); + SequenceSampler sampler = new SequenceSampler(); + + @Override + public List drawSample() { + int[] sampleArray = sampler.bestSequence(model); + List sample = new ArrayList(); + int i = 0; + for (IN word : input) { + + IN newWord = tokenFactory.makeToken(word); + newWord.set(CoreAnnotations.AnswerAnnotation.class, classIndex.get(sampleArray[i++])); + sample.add(newWord); + } + return sample; + } + }; + } + + public Counter> classifyKBest(List doc, Class> answerField, int k) { + + if (doc.isEmpty()) { + return new ClassicCounter>(); + } + + // TODO get rid of ObjectBankWrapper + // i'm sorry that this is so hideous - JRF + ObjectBankWrapper obw = new ObjectBankWrapper(flags, null, knownLCWords); + doc = obw.processDocument(doc); + + SequenceModel model = getSequenceModel(doc); + + KBestSequenceFinder tagInference = new KBestSequenceFinder(); + Counter bestSequences = tagInference.kBestSequences(model, k); + + Counter> kBest = new ClassicCounter>(); + + for (int[] seq : bestSequences.keySet()) { + List kth = new 
ArrayList(); + int pos = model.leftWindow(); + for (IN fi : doc) { + IN newFL = tokenFactory.makeToken(fi); + String guess = classIndex.get(seq[pos]); + fi.remove(CoreAnnotations.AnswerAnnotation.class); // because fake answers will get + // added during testing + newFL.set(answerField, guess); + pos++; + kth.add(newFL); + } + kBest.setCount(kth, bestSequences.getCount(seq)); + } + + return kBest; + } + + public DFSA getViterbiSearchGraph(List doc, Class> answerField) { + if (doc.isEmpty()) { + return new DFSA(null); + } + // TODO get rid of objectbankwrapper + ObjectBankWrapper obw = new ObjectBankWrapper(flags, null, knownLCWords); + doc = obw.processDocument(doc); + SequenceModel model = getSequenceModel(doc); + return ViterbiSearchGraphBuilder.getGraph(model, classIndex); + } + + /** + * Classify the tokens in a String. Each sentence becomes a separate document. + * + * @param str + * A String with tokens in one or more sentences of text to be + * classified. + * @return {@link List} of classified sentences (each a List of something that + * extends {@link CoreMap}). + */ + public List> classify(String str) { + ObjectBank> documents = + makeObjectBankFromString(str, plainTextReaderAndWriter); + List> result = new ArrayList>(); + + for (List document : documents) { + classify(document); + + List sentence = new ArrayList(); + for (IN wi : document) { + // TaggedWord word = new TaggedWord(wi.word(), wi.answer()); + // sentence.add(word); + sentence.add(wi); + } + result.add(sentence); + } + return result; + } + + /** + * Classify the tokens in a String. Each sentence becomes a separate document. + * Doesn't override default readerAndWriter. + * + * @param str + * A String with tokens in one or more sentences of text to be + * classified. + * @return {@link List} of classified sentences (each a List of something that + * extends {@link CoreMap}). 
+ */ + public List> classifyRaw(String str, + DocumentReaderAndWriter readerAndWriter) { + ObjectBank> documents = + makeObjectBankFromString(str, readerAndWriter); + List> result = new ArrayList>(); + + for (List document : documents) { + classify(document); + + List sentence = new ArrayList(); + for (IN wi : document) { + // TaggedWord word = new TaggedWord(wi.word(), wi.answer()); + // sentence.add(word); + sentence.add(wi); + } + result.add(sentence); + } + return result; + } + + /** + * Classify the contents of a file. + * + * @param filename + * Contains the sentence(s) to be classified. + * @return {@link List} of classified List of IN. + */ + public List> classifyFile(String filename) { + ObjectBank> documents = + makeObjectBankFromFile(filename, plainTextReaderAndWriter); + List> result = new ArrayList>(); + + for (List document : documents) { + // System.err.println(document); + classify(document); + + List sentence = new ArrayList(); + for (IN wi : document) { + sentence.add(wi); + // System.err.println(wi); + } + result.add(sentence); + } + return result; + } + + /** + * Maps a String input to an XML-formatted rendition of applying NER to the + * String. Implements the Function interface. Calls + * classifyWithInlineXML(String) [q.v.]. + */ + @Override + public String apply(String in) { + return classifyWithInlineXML(in); + } + + /** + * Classify the contents of a {@link String} to one of several String + * representations that shows the classes. Plain text or XML input is expected + * and the {@link PlainTextDocumentReaderAndWriter} is used. The classifier + * will tokenize the text and treat each sentence as a separate document. 
The + * output can be specified to be in a choice of three formats: slashTags + * (e.g., Bill/PERSON Smith/PERSON died/O ./O), inlineXML (e.g., + * <PERSON>Bill Smith</PERSON> went to + * <LOCATION>Paris</LOCATION> .), or xml, for stand-off XML (e.g., + * <wi num="0" entity="PERSON">Sue</wi> <wi num="1" + * entity="O">shouted</wi> ). There is also a binary choice as to + * whether the spacing between tokens of the original is preserved or whether + * the (tagged) tokens are printed with a single space (for inlineXML or + * slashTags) or a single newline (for xml) between each one. + *

    + * Fine points: The slashTags and xml formats show tokens as + * transformed by any normalization processes inside the tokenizer, while + * inlineXML shows the tokens exactly as they appeared in the source text. + * When a period counts as both part of an abbreviation and as an end of + * sentence marker, it is included twice in the output String for slashTags or + * xml, but only once for inlineXML, where it is not counted as part of the + * abbreviation (or any named entity it is part of). For slashTags with + * preserveSpacing=true, there will be two successive periods such as "Jr.." + * The tokenized (preserveSpacing=false) output will have a space or a newline + * after the last token. + * + * @param sentences + * The String to be classified. It will be tokenized and + * divided into documents according to (heuristically + * determined) sentence boundaries. + * @param outputFormat + * The format to put the output in: one of "slashTags", "xml", or + * "inlineXML" + * @param preserveSpacing + * Whether to preserve the input spacing between tokens, which may + * sometimes be none (true) or whether to tokenize the text and print + * it with one space between each token (false) + * @return A {@link String} with annotated with classification information. 
+ */ + public String classifyToString(String sentences, String outputFormat, boolean preserveSpacing) { + PlainTextDocumentReaderAndWriter.OutputStyle outFormat = + PlainTextDocumentReaderAndWriter.OutputStyle.fromShortName(outputFormat); + + + ObjectBank> documents = + makeObjectBankFromString(sentences, plainTextReaderAndWriter); + + StringBuilder sb = new StringBuilder(); + for (List doc : documents) { + List docOutput = classify(doc); + if (plainTextReaderAndWriter instanceof PlainTextDocumentReaderAndWriter) { + // TODO: implement this particular method and its options in + // the other documentReaderAndWriters + sb.append(((PlainTextDocumentReaderAndWriter) plainTextReaderAndWriter).getAnswers(docOutput, outFormat, preserveSpacing)); + } else { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + plainTextReaderAndWriter.printAnswers(docOutput, pw); + pw.flush(); + sb.append(sw.toString()); + sb.append("\n"); + } + } + return sb.toString(); + } + + /** + * Classify the contents of a {@link String}. Plain text or XML is expected + * and the {@link PlainTextDocumentReaderAndWriter} is used by default. + * The classifier + * will treat each sentence as a separate document. The output can be + * specified to be in a choice of formats: Output is in inline XML format + * (e.g. <PERSON>Bill Smith</PERSON> went to + * <LOCATION>Paris</LOCATION> .) + * + * @param sentences + * The string to be classified + * @return A {@link String} with annotated with classification information. + */ + public String classifyWithInlineXML(String sentences) { + return classifyToString(sentences, "inlineXML", true); + } + + /** + * Classify the contents of a String to a tagged word/class String. Plain text + * or XML input is expected and the {@link PlainTextDocumentReaderAndWriter} + * is used by default. 
+ * Output looks like: My/O name/O is/O Bill/PERSON Smith/PERSON ./O + * + * @param sentences + * The String to be classified + * @return A String annotated with classification information. + */ + public String classifyToString(String sentences) { + return classifyToString(sentences, "slashTags", true); + } + + /** + * Classify the contents of a {@link String} to classified character offset + * spans. Plain text or XML input text is expected and the + * {@link PlainTextDocumentReaderAndWriter} is used by default. + * Output is a (possibly + * empty, but not null) List of Triples. Each Triple is an entity + * name, followed by beginning and ending character offsets in the original + * String. Character offsets can be thought of as fenceposts between the + * characters, or, like certain methods in the Java String class, as character + * positions, numbered starting from 0, with the end index pointing to the + * position AFTER the entity ends. That is, end - start is the length of the + * entity in characters. + *

    + * Fine points: Token offsets are true wrt the source text, even though + * the tokenizer may internally normalize certain tokens to String + * representations of different lengths (e.g., " becoming `` or ''). When a + * period counts as both part of an abbreviation and as an end of sentence + * marker, and that abbreviation is part of a named entity, the reported + * entity string excludes the period. + * + * @param sentences + * The string to be classified + * @return A {@link List} of {@link Triple}s, each of which gives an entity + * type and the beginning and ending character offsets. + */ + public List> classifyToCharacterOffsets(String sentences) { + ObjectBank> documents = + makeObjectBankFromString(sentences, plainTextReaderAndWriter); + + List> entities = + new ArrayList>(); + for (List doc : documents) { + String prevEntityType = flags.backgroundSymbol; + Triple prevEntity = null; + + classify(doc); + + for (IN fl : doc) { + String guessedAnswer = fl.get(CoreAnnotations.AnswerAnnotation.class); + if (guessedAnswer.equals(flags.backgroundSymbol)) { + if (prevEntity != null) { + entities.add(prevEntity); + prevEntity = null; + } + } else { + if (!guessedAnswer.equals(prevEntityType)) { + if (prevEntity != null) { + entities.add(prevEntity); + } + prevEntity = new Triple(guessedAnswer, fl + .get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), fl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); + } else { + assert prevEntity != null; // if you read the code carefully, this + // should always be true! + prevEntity.setThird(fl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); + } + } + prevEntityType = guessedAnswer; + } + + // include any entity at end of doc + if (prevEntity != null) { + entities.add(prevEntity); + } + + } + return entities; + } + + /** + * ONLY USE IF LOADED A CHINESE WORD SEGMENTER!!!!! 
+ * + * @param sentence + * The string to be classified + * @return List of words + */ + public List segmentString(String sentence) { + return segmentString(sentence, defaultReaderAndWriter); + } + + public List segmentString(String sentence, + DocumentReaderAndWriter readerAndWriter) { + ObjectBank> docs = makeObjectBankFromString(sentence, + readerAndWriter); + + StringWriter stringWriter = new StringWriter(); + PrintWriter stringPrintWriter = new PrintWriter(stringWriter); + for (List doc : docs) { + classify(doc); + readerAndWriter.printAnswers(doc, stringPrintWriter); + stringPrintWriter.println(); + } + stringPrintWriter.close(); + String segmented = stringWriter.toString(); + + return Arrays.asList(segmented.split("\\s")); + } + + /** + * Classify the contents of {@link SeqClassifierFlags scf.testFile}. The file + * should be in the format expected based on {@link SeqClassifierFlags + * scf.documentReader}. + * + * @return A {@link List} of {@link List}s of classified something that + * extends {@link CoreMap} where each {@link List} refers to a + * document/sentence. + */ + // public ObjectBank> test() { + // return test(flags.testFile); + // } + + /** + * Classify a {@link List} of something that extends{@link CoreMap}. + * The classifications are added in place to the items of the document, + * which is also returned by this method + * + * @param document A {@link List} of something that extends {@link CoreMap}. + * @return The same {@link List}, but with the elements annotated with their + * answers (stored under the + * {@link edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation} + * key). + */ + public abstract List classify(List document); + + /** + * Classify a {@link List} of something that extends {@link CoreMap} using as + * additional information whatever is stored in the document and sentence. + * This is needed for SUTime (NumberSequenceClassifier), which requires + * the document date to resolve relative dates. 
+ * + * @param tokenSequence + * @param document + * @param sentence + * @return Classified version of the input tokenSequence + */ + public abstract List classifyWithGlobalInformation(List tokenSequence, final CoreMap document, final CoreMap sentence); + + /** + * Train the classifier based on values in flags. It will use the first of + * these variables that is defined: trainFiles (and baseTrainDir), + * trainFileList, trainFile. + */ + public void train() { + if (flags.trainFiles != null) { + train(flags.baseTrainDir, flags.trainFiles, defaultReaderAndWriter); + } else if (flags.trainFileList != null) { + String[] files = flags.trainFileList.split(","); + train(files, defaultReaderAndWriter); + } else { + train(flags.trainFile, defaultReaderAndWriter); + } + } + + public void train(String filename) { + train(filename, defaultReaderAndWriter); + } + + public void train(String filename, + DocumentReaderAndWriter readerAndWriter) { + // only for the OCR data does this matter + flags.ocrTrain = true; + train(makeObjectBankFromFile(filename, readerAndWriter), readerAndWriter); + } + + public void train(String baseTrainDir, String trainFiles, + DocumentReaderAndWriter readerAndWriter) { + // only for the OCR data does this matter + flags.ocrTrain = true; + train(makeObjectBankFromFiles(baseTrainDir, trainFiles, readerAndWriter), + readerAndWriter); + } + + public void train(String[] trainFileList, + DocumentReaderAndWriter readerAndWriter) { + // only for the OCR data does this matter + flags.ocrTrain = true; + train(makeObjectBankFromFiles(trainFileList, readerAndWriter), + readerAndWriter); + } + + /** + * Trains a classifier from a Collection of sequences. + * Note that the Collection can be (and usually is) an ObjectBank. + * + * @param docs + * An Objectbank or a collection of sequences of IN + */ + public void train(Collection> docs) { + train(docs, defaultReaderAndWriter); + } + + /** + * Trains a classifier from a Collection of sequences. 
+ * Note that the Collection can be (and usually is) an ObjectBank. + * + * @param docs + * An ObjectBank or a collection of sequences of IN + * @param readerAndWriter + * A DocumentReaderAndWriter to use when loading test files + */ + public abstract void train(Collection> docs, + DocumentReaderAndWriter readerAndWriter); + + /** + * Reads a String into an ObjectBank object. NOTE: that the current + * implementation of ReaderIteratorFactory will first try to interpret each + * string as a filename, so this method will yield unwanted results if it + * applies to a string that is at the same time a filename. It prints out a + * warning, at least. + * + * @param string The String which will be the content of the ObjectBank + * @return The ObjectBank + */ + public ObjectBank> + makeObjectBankFromString(String string, + DocumentReaderAndWriter readerAndWriter) + { + if (flags.announceObjectBankEntries) { + System.err.print("Reading data using " + readerAndWriter.getClass()); + + if (flags.inputEncoding == null) { + System.err.println("Getting data from " + string + " (default encoding)"); + } else { + System.err.println("Getting data from " + string + " (" + flags.inputEncoding + " encoding)"); + } + } + // return new ObjectBank>(new + // ResettableReaderIteratorFactory(string), readerAndWriter); + // TODO + return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(string), + readerAndWriter), knownLCWords); + } + + public ObjectBank> makeObjectBankFromFile(String filename, + DocumentReaderAndWriter readerAndWriter) { + String[] fileAsArray = { filename }; + return makeObjectBankFromFiles(fileAsArray, readerAndWriter); + } + + public ObjectBank> makeObjectBankFromFiles(String[] trainFileList, + DocumentReaderAndWriter readerAndWriter) { + // try{ + Collection files = new ArrayList(); + for (String trainFile : trainFileList) { + File f = new File(trainFile); + files.add(f); + } + // System.err.printf("trainFileList contains %d file%s.\n", 
files.size(), + // files.size() == 1 ? "": "s"); + // TODO get rid of objectbankwrapper + // return new ObjectBank>(new + // ResettableReaderIteratorFactory(files), readerAndWriter); + return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files, flags.inputEncoding), + readerAndWriter), knownLCWords); + // } catch (IOException e) { + // throw new RuntimeException(e); + // } + } + + public ObjectBank> makeObjectBankFromFiles(String baseDir, String filePattern, + DocumentReaderAndWriter readerAndWriter) { + + File path = new File(baseDir); + FileFilter filter = new RegExFileFilter(Pattern.compile(filePattern)); + File[] origFiles = path.listFiles(filter); + Collection files = new ArrayList(); + for (File file : origFiles) { + if (file.isFile()) { + if (flags.announceObjectBankEntries) { + System.err.println("Getting data from " + file + " (" + flags.inputEncoding + " encoding)"); + } + files.add(file); + } + } + + if (files.isEmpty()) { + throw new RuntimeException("No matching files: " + baseDir + '\t' + filePattern); + } + // return new ObjectBank>(new + // ResettableReaderIteratorFactory(files, flags.inputEncoding), + // readerAndWriter); + // TODO get rid of objectbankwrapper + return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files, + flags.inputEncoding), readerAndWriter), knownLCWords); + } + + public ObjectBank> makeObjectBankFromFiles(Collection files, + DocumentReaderAndWriter readerAndWriter) { + if (files.isEmpty()) { + throw new RuntimeException("Attempt to make ObjectBank with empty file list"); + } + // return new ObjectBank>(new + // ResettableReaderIteratorFactory(files, flags.inputEncoding), + // readerAndWriter); + // TODO get rid of objectbankwrapper + return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files, + flags.inputEncoding), readerAndWriter), knownLCWords); + } + + /** + * Set up an ObjectBank that will allow one to iterate over a 
collection of + * documents obtained from the passed in Reader. Each document will be + * represented as a list of IN. If the ObjectBank iterator() is called until + * hasNext() returns false, then the Reader will be read till end of file, but + * no reading is done at the time of this call. Reading is done using the + * reading method specified in flags.documentReader, and for some + * reader choices, the column mapping given in flags.map. + * + * @param in + * Input data addNEWLCWords do we add new lowercase words from this + * data to the word shape classifier + * @return The list of documents + */ + public ObjectBank> makeObjectBankFromReader(BufferedReader in, + DocumentReaderAndWriter readerAndWriter) { + if (flags.announceObjectBankEntries) { + System.err.println("Reading data using " + readerAndWriter.getClass()); + } + // TODO get rid of objectbankwrapper + // return new ObjectBank>(new ResettableReaderIteratorFactory(in), + // readerAndWriter); + return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(in), + readerAndWriter), knownLCWords); + } + + /** + * Takes the file, reads it in, and prints out the likelihood of each possible + * label at each point. + * + * @param filename The path to the specified file + */ + public void printProbs(String filename, + DocumentReaderAndWriter readerAndWriter) { + // only for the OCR data does this matter + flags.ocrTrain = false; + + ObjectBank> docs = + makeObjectBankFromFile(filename, readerAndWriter); + printProbsDocuments(docs); + } + + /** + * Takes a {@link List} of documents and prints the likelihood of each + * possible label at each point. + * + * @param documents + * A {@link List} of {@link List} of something that extends + * {@link CoreMap}. 
+ */ + public void printProbsDocuments(ObjectBank> documents) { + for (List doc : documents) { + printProbsDocument(doc); + System.out.println(); + } + } + + public void classifyStdin() + throws IOException + { + classifyStdin(plainTextReaderAndWriter); + } + + public void classifyStdin(DocumentReaderAndWriter readerWriter) + throws IOException + { + BufferedReader is = new BufferedReader(new InputStreamReader(System.in, flags.inputEncoding)); + for (String line; (line = is.readLine()) != null; ) { + ObjectBank> documents = makeObjectBankFromString(line, readerWriter); + classifyAndWriteAnswers(documents, readerWriter); + } + } + + public abstract void printProbsDocument(List document); + + /** + * Load a test file, run the classifier on it, and then print the answers to + * stdout (with timing to stderr). This uses the value of flags.documentReader + * to determine testFile format. + * + * @param testFile The file to test on. + */ + public void classifyAndWriteAnswers(String testFile) + throws IOException + { + classifyAndWriteAnswers(testFile, plainTextReaderAndWriter); + } + + /** + * Load a test file, run the classifier on it, and then print the answers to + * stdout (with timing to stderr). This uses the value of flags.documentReader + * to determine testFile format. + * + * @param testFile The file to test on. + * @param readerWriter A reader and writer to use for the output + */ + public void classifyAndWriteAnswers(String testFile, + DocumentReaderAndWriter readerWriter) + throws IOException + { + ObjectBank> documents = + makeObjectBankFromFile(testFile, readerWriter); + classifyAndWriteAnswers(documents, readerWriter); + } + + /** If the flag + * outputEncoding is defined, the output is written in that + * character encoding, otherwise in the system default character encoding. 
+ */ + public void classifyAndWriteAnswers(String testFile, OutputStream outStream, + DocumentReaderAndWriter readerWriter) + throws IOException + { + ObjectBank> documents = + makeObjectBankFromFile(testFile, readerWriter); + PrintWriter pw = IOUtils.encodedOutputStreamPrintWriter(outStream, flags.outputEncoding, true); + classifyAndWriteAnswers(documents, pw, readerWriter); + } + + public void classifyAndWriteAnswers(String baseDir, String filePattern, + DocumentReaderAndWriter readerWriter) + throws IOException + { + ObjectBank> documents = + makeObjectBankFromFiles(baseDir, filePattern, readerWriter); + classifyAndWriteAnswers(documents, readerWriter); + } + + public void classifyAndWriteAnswers(Collection testFiles) + throws IOException + { + classifyAndWriteAnswers(testFiles, plainTextReaderAndWriter); + } + + public void classifyAndWriteAnswers(Collection testFiles, + DocumentReaderAndWriter readerWriter) + throws IOException + { + ObjectBank> documents = + makeObjectBankFromFiles(testFiles, readerWriter); + classifyAndWriteAnswers(documents, readerWriter); + } + + private void classifyAndWriteAnswers(ObjectBank> documents, + DocumentReaderAndWriter readerWriter) + throws IOException + { + classifyAndWriteAnswers(documents, + IOUtils.encodedOutputStreamPrintWriter(System.out, flags.outputEncoding, true), readerWriter); + } + + public void classifyAndWriteAnswers(Collection> documents, + PrintWriter printWriter, + DocumentReaderAndWriter readerWriter) + throws IOException + { + Timing timer = new Timing(); + + Counter entityTP = new ClassicCounter(); + Counter entityFP = new ClassicCounter(); + Counter entityFN = new ClassicCounter(); + boolean resultsCounted = true; + int numWords = 0; + int numDocs = 0; + + ThreadsafeProcessor, List> threadProcessor = + new ThreadsafeProcessor, List>() { + @Override + public List process(List doc) { + doc = classify(doc); + + int completedNo = threadCompletionCounter.incrementAndGet(); + if (VERBOSE) 
System.err.println(completedNo + " examples completed"); + return doc; + } + @Override + public ThreadsafeProcessor, List> newInstance() { + return this; + } + }; + + MulticoreWrapper, List> wrapper = null; + if (flags.multiThreadClassifier != 0) { + wrapper = new MulticoreWrapper, List>(flags.multiThreadClassifier, threadProcessor); + } + + for (List doc: documents) { + numWords += doc.size(); + numDocs++; + if (flags.multiThreadClassifier != 0) { + wrapper.put(doc); + while (wrapper.peek()) { + List results = wrapper.poll(); + writeAnswers(results, printWriter, readerWriter); + resultsCounted = resultsCounted && countResults(results, entityTP, entityFP, entityFN); + } + } else { + List results = threadProcessor.process(doc); + writeAnswers(results, printWriter, readerWriter); + resultsCounted = resultsCounted && countResults(results, entityTP, entityFP, entityFN); + } + } + if (flags.multiThreadClassifier != 0) { + wrapper.join(); + while (wrapper.peek()) { + List results = wrapper.poll(); + writeAnswers(results, printWriter, readerWriter); + resultsCounted = resultsCounted && countResults(results, entityTP, entityFP, entityFN); + } + } + + long millis = timer.stop(); + double wordspersec = numWords / (((double) millis) / 1000); + NumberFormat nf = new DecimalFormat("0.00"); // easier way! + System.err.println(StringUtils.getShortClassName(this) + + " tagged " + numWords + " words in " + numDocs + + " documents at " + nf.format(wordspersec) + + " words per second."); + if (resultsCounted) { + printResults(entityTP, entityFP, entityFN); + } + } + + /** + * Load a test file, run the classifier on it, and then print the answers to + * stdout (with timing to stderr). This uses the value of flags.documentReader + * to determine testFile format. + * + * @param testFile The filename to test on. 
+ */ + public void classifyAndWriteAnswersKBest(String testFile, int k, + DocumentReaderAndWriter readerAndWriter) + throws IOException + { + ObjectBank> documents = + makeObjectBankFromFile(testFile, readerAndWriter); + PrintWriter pw = IOUtils.encodedOutputStreamPrintWriter(System.out, flags.outputEncoding, true); + classifyAndWriteAnswersKBest(documents, k, pw, readerAndWriter); + } + + /** + * Run the classifier on the documents in an ObjectBank, and print the + * answers to a given PrintWriter (with timing to stderr). The value of + * flags.documentReader is used to determine testFile format. + * + * @param documents The ObjectBank to test on. + */ + public void classifyAndWriteAnswersKBest(ObjectBank> documents, int k, PrintWriter printWriter, + DocumentReaderAndWriter readerAndWriter) throws IOException { + Timing timer = new Timing(); + int numWords = 0; + int numSentences = 0; + + for (List doc : documents) { + Counter> kBest = classifyKBest(doc, CoreAnnotations.AnswerAnnotation.class, k); + numWords += doc.size(); + List> sorted = Counters.toSortedList(kBest); + int n = 1; + for (List l : sorted) { + System.out.println(""); + n++; + } + numSentences++; + } + + long millis = timer.stop(); + double wordspersec = numWords / (((double) millis) / 1000); + NumberFormat nf = new DecimalFormat("0.00"); // easier way! + System.err.println(this.getClass().getName() + " tagged " + numWords + " words in " + numSentences + + " documents at " + nf.format(wordspersec) + " words per second."); + } + + /** + * Load a test file, run the classifier on it, and then write a Viterbi search + * graph for each sequence. + * + * @param testFile The file to test on. 
+ */ + public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGraphPrefix, DocumentReaderAndWriter readerAndWriter) throws IOException { + Timing timer = new Timing(); + ObjectBank> documents = + makeObjectBankFromFile(testFile, readerAndWriter); + int numWords = 0; + int numSentences = 0; + + for (List doc : documents) { + DFSA tagLattice = getViterbiSearchGraph(doc, CoreAnnotations.AnswerAnnotation.class); + numWords += doc.size(); + PrintWriter latticeWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences + + ".wlattice")); + PrintWriter vsgWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences + ".lattice")); + if (readerAndWriter instanceof LatticeWriter) + ((LatticeWriter) readerAndWriter).printLattice(tagLattice, doc, latticeWriter); + tagLattice.printAttFsmFormat(vsgWriter); + latticeWriter.close(); + vsgWriter.close(); + numSentences++; + } + + long millis = timer.stop(); + double wordspersec = numWords / (((double) millis) / 1000); + NumberFormat nf = new DecimalFormat("0.00"); // easier way! + System.err.println(this.getClass().getName() + " tagged " + numWords + " words in " + numSentences + + " documents at " + nf.format(wordspersec) + " words per second."); + } + + /** + * Write the classifications of the Sequence classifier out to a writer in a + * format determined by the DocumentReaderAndWriter used. + * + * @param doc Documents to write out + * @param printWriter Writer to use for output + * @throws IOException If an IO problem + */ + public void writeAnswers(List doc, PrintWriter printWriter, + DocumentReaderAndWriter readerAndWriter) + throws IOException { + if (flags.lowerNewgeneThreshold) { + return; + } + if (flags.numRuns <= 1) { + readerAndWriter.printAnswers(doc, printWriter); + // out.println(); + printWriter.flush(); + } + } + + /** + * Count results using a method appropriate for the tag scheme being used. 
+ */ + public boolean countResults(List doc, + Counter entityTP, + Counter entityFP, + Counter entityFN) { + String bg = (flags.evaluateBackground ? null : flags.backgroundSymbol); + if (flags.entitySubclassification.equalsIgnoreCase("iob2")) { + bg = flags.backgroundSymbol; + return countResultsIOB2(doc, entityTP, entityFP, entityFN, bg); + } else if (flags.iobTags) { + bg = flags.backgroundSymbol; + return countResultsIOB(doc, entityTP, entityFP, entityFN, bg); + } else { + return countResults(doc, entityTP, entityFP, entityFN, bg); + } + } + + public static boolean countResultsIOB2(List doc, + Counter entityTP, + Counter entityFP, + Counter entityFN, + String background) { + boolean entityCorrect = true; + // the annotations + String previousGold = background; + String previousGuess = background; + // the part after the I- or B- in the annotation + String previousGoldEntity = ""; + String previousGuessEntity = ""; + + for (CoreMap word : doc) { + String gold = word.get(CoreAnnotations.GoldAnswerAnnotation.class); + String guess = word.get(CoreAnnotations.AnswerAnnotation.class); + String goldEntity = (!gold.equals(background)) ? gold.substring(2) : ""; + String guessEntity = (!guess.equals(background)) ? 
guess.substring(2) : ""; + + //System.out.println(gold + " (" + goldEntity + ") ; " + guess + " (" + guessEntity + ")"); + + boolean newGold = (!gold.equals(background) && + (!goldEntity.equals(previousGoldEntity)) || gold.startsWith("B-")); + boolean newGuess = (!guess.equals(background) && + (!guessEntity.equals(previousGuessEntity)) || guess.startsWith("B-")); + boolean goldEnded = (!previousGold.equals(background) && + (gold.startsWith("B-") || !goldEntity.equals(previousGoldEntity))); + boolean guessEnded = (!previousGuess.equals(background) && + (guess.startsWith("B-") || !guessEntity.equals(previousGuessEntity))); + + //System.out.println(" " + newGold + " " + newGuess + " " + goldEnded + " " + guessEnded); + + if (goldEnded && !guessEnded) { + entityFN.incrementCount(previousGoldEntity, 1.0); + entityCorrect = gold.equals(background) && guess.equals(background); + } + if (goldEnded && guessEnded) { + if (entityCorrect) { + entityTP.incrementCount(previousGoldEntity, 1.0); + } else { + entityFN.incrementCount(previousGoldEntity, 1.0); + entityFP.incrementCount(previousGuessEntity, 1.0); + } + entityCorrect = gold.equals(guess); + } + if (!goldEnded && guessEnded) { + entityCorrect = false; + entityFP.incrementCount(previousGuessEntity, 1.0); + } + // nothing to do if neither gold nor guess have ended + + if (newGold && !newGuess) { + entityCorrect = false; + } + if (newGold && newGuess) { + entityCorrect = guessEntity.equals(goldEntity); + } + if (!newGold && newGuess) { + entityCorrect = false; + } + + previousGold = gold; + previousGuess = guess; + previousGoldEntity = goldEntity; + previousGuessEntity = guessEntity; + } + + // At the end, we need to check the last entity + if (!previousGold.equals(background)) { + if (entityCorrect) { + entityTP.incrementCount(previousGoldEntity, 1.0); + } else { + entityFN.incrementCount(previousGoldEntity, 1.0); + } + } + if (!previousGuess.equals(background)) { + if (!entityCorrect) { + 
entityFP.incrementCount(previousGuessEntity, 1.0); + } + } + + return true; + } + + public static boolean countResultsIOB(List doc, + Counter entityTP, + Counter entityFP, + Counter entityFN, + String background) { + // first, check that all answers exist and are either O, B-, or I- + for (CoreMap line : doc) { + String gold = line.get(CoreAnnotations.GoldAnswerAnnotation.class); + String guess = line.get(CoreAnnotations.AnswerAnnotation.class); + + if (gold == null) { + System.err.println("Blank gold answer"); + return false; + } + if (guess == null) { + System.err.println("Blank guess"); + return false; + } + if (!gold.equals(background) && !gold.startsWith("B-") && + !gold.startsWith("I-")) { + System.err.println("Unexpected gold answer " + gold); + return false; + } + if (!guess.equals(background) && !guess.startsWith("B-") && + !guess.startsWith("I-")) { + System.err.println("Unexpected guess " + guess); + return false; + } + // todo: verify that gold entities are in a good pattern + } + + // We count entities in three categories. When we have an entity + // in the gold, it is a true positive if the start of the entity + // is correctly guessed, all of the I- tags are correctly guessed, + // and the next guess tag after the last gold I- tag is not the + // same I- tag. Otherwise, it is a false negative. We then apply + // the same logic to the guesses, counting false positives for + // anything that did not correctly match the gold. 
+ int index = 0; + while (index < doc.size()) { + index = tallyOneEntityIOB(doc, index, + CoreAnnotations.GoldAnswerAnnotation.class, + CoreAnnotations.AnswerAnnotation.class, + entityTP, entityFN, background); + } + index = 0; + while (index < doc.size()) { + index = tallyOneEntityIOB(doc, index, + CoreAnnotations.AnswerAnnotation.class, + CoreAnnotations.GoldAnswerAnnotation.class, + null, entityFP, background); + } + + return true; + } + + public static int tallyOneEntityIOB(List doc, + int index, + Class> source, + Class> target, + Counter positive, + Counter negative, + String background) { + CoreMap line = doc.get(index); + String gold = line.get(source); + String guess = line.get(target); + + // uninteresting + if (gold.equals(background)) { + return index + 1; + } + String entity = gold.substring(2); + boolean correct = gold.equals(guess); + ++index; + while (index < doc.size()) { + line = doc.get(index); + gold = line.get(source); + guess = line.get(target); + + if (!gold.equals("I-" + entity)) { + if (guess.equals("I-" + entity)) { + correct = false; + } + break; + } + if (!gold.equals(guess)) { + correct = false; + } + ++index; + } + if (correct) { + if (positive != null) { + positive.incrementCount(entity, 1.0); + } + } else { + negative.incrementCount(entity, 1.0); + } + return index; + } + + /** + * Count the successes and failures of the model on the given document. + * Fills numbers in to counters for true positives, false positives, + * and false negatives, and also keeps track of the entities seen. + *
    + * Returns false if we ever encounter null for gold or guess. + */ + public static boolean countResults(List doc, + Counter entityTP, + Counter entityFP, + Counter entityFN, + String background) { + int index = 0; + int goldIndex = 0, guessIndex = 0; + String lastGold = background, lastGuess = background; + + // As we go through the document, there are two events we might be + // interested in. One is when a gold entity ends, and the other + // is when a guessed entity ends. If the gold and guessed + // entities end at the same time, started at the same time, and + // match entity type, we have a true positive. Otherwise we + // either have a false positive or a false negative. + for (CoreMap line : doc) { + String gold = line.get(CoreAnnotations.GoldAnswerAnnotation.class); + String guess = line.get(CoreAnnotations.AnswerAnnotation.class); + + if (gold == null || guess == null) + return false; + + if (lastGold != null && !lastGold.equals(gold) && !lastGold.equals(background)) { + if (lastGuess.equals(lastGold) && !lastGuess.equals(guess) && goldIndex == guessIndex) { + entityTP.incrementCount(lastGold, 1.0); + } else { + entityFN.incrementCount(lastGold, 1.0); + } + } + + if (lastGuess != null && !lastGuess.equals(guess) && !lastGuess.equals(background)) { + if (lastGuess.equals(lastGold) && !lastGuess.equals(guess) && goldIndex == guessIndex && !lastGold.equals(gold)) { + // correct guesses already tallied + // only need to tally false positives + } else { + entityFP.incrementCount(lastGuess, 1.0); + } + } + + if (lastGold == null || !lastGold.equals(gold)) { + lastGold = gold; + goldIndex = index; + } + + if (lastGuess == null || !lastGuess.equals(guess)) { + lastGuess = guess; + guessIndex = index; + } + ++index; + } + + // We also have to account for entities at the very end of the + // document, since the above logic only occurs when we see + // something that tells us an entity has ended + if (lastGold != null && !lastGold.equals(background)) { + if 
(lastGold.equals(lastGuess) && goldIndex == guessIndex) { + entityTP.incrementCount(lastGold, 1.0); + } else { + entityFN.incrementCount(lastGold, 1.0); + } + } + if (lastGuess != null && !lastGuess.equals(background)) { + if (lastGold.equals(lastGuess) && goldIndex == guessIndex) { + // correct guesses already tallied + } else { + entityFP.incrementCount(lastGuess, 1.0); + } + } + return true; + } + + /** + * Given counters of true positives, false positives, and false + * negatives, prints out precision, recall, and f1 for each key. + */ + public static void printResults(Counter entityTP, Counter entityFP, + Counter entityFN) { + Set entities = new TreeSet(); + entities.addAll(entityTP.keySet()); + entities.addAll(entityFP.keySet()); + entities.addAll(entityFN.keySet()); + boolean printedHeader = false; + for (String entity : entities) { + double tp = entityTP.getCount(entity); + double fp = entityFP.getCount(entity); + double fn = entityFN.getCount(entity); + printedHeader = printPRLine(entity, tp, fp, fn, printedHeader); + } + double tp = entityTP.totalCount(); + double fp = entityFP.totalCount(); + double fn = entityFN.totalCount(); + printedHeader = printPRLine("Totals", tp, fp, fn, printedHeader); + } + + /** + * Print a line of precision, recall, and f1 scores, titled by entity, + * possibly printing a header if it hasn't already been printed. + * Returns whether or not the header has ever been printed. + */ + private static boolean printPRLine(String entity, double tp, double fp, double fn, + boolean printedHeader) { + if (tp == 0.0 && (fp == 0.0 || fn == 0.0)) + return printedHeader; + double precision = tp / (tp + fp); + double recall = tp / (tp + fn); + double f1 = ((precision == 0.0 || recall == 0.0) ? 
+ 0.0 : 2.0 / (1.0 / precision + 1.0 / recall)); + if (!printedHeader) { + System.err.println(" Entity\tP\tR\tF1\tTP\tFP\tFN"); + printedHeader = true; + } + System.err.format("%15s\t%.4f\t%.4f\t%.4f\t%.0f\t%.0f\t%.0f\n", + entity, precision, recall, f1, + tp, fp, fn); + return printedHeader; + } + + /** + * Serialize a sequence classifier to a file on the given path. + * + * @param serializePath The path/filename to write the classifier to. + */ + public abstract void serializeClassifier(String serializePath); + + /** + * Loads a classifier from the given input stream. + * Any exceptions are rethrown as unchecked exceptions. + * This method does not close the InputStream. + * + * @param in The InputStream to read from + */ + public void loadClassifierNoExceptions(InputStream in, Properties props) { + // load the classifier + try { + loadClassifier(in, props); + } catch (IOException e) { + throw new RuntimeIOException(e); + } catch (ClassNotFoundException cnfe) { + throw new RuntimeException(cnfe); + } + } + + /** + * Load a classifier from the specified InputStream. No extra properties are + * supplied. This does not close the InputStream. + * + * @param in The InputStream to load the serialized classifier from + * @throws IOException If there are problems accessing the input stream + * @throws ClassCastException If there are problems interpreting the serialized data + * @throws ClassNotFoundException If there are problems interpreting the serialized data + */ + public void loadClassifier(InputStream in) throws IOException, ClassCastException, ClassNotFoundException { + loadClassifier(in, null); + } + + /** + * Load a classifier from the specified InputStream. The classifier is + * reinitialized from the flags serialized in the classifier. This does not + * close the InputStream. 
+ * + * @param in + * The InputStream to load the serialized classifier from + * @param props + * This Properties object will be used to update the + * SeqClassifierFlags which are read from the serialized classifier + * @throws IOException + * If there are problems accessing the input stream + * @throws ClassCastException + * If there are problems interpreting the serialized data + * @throws ClassNotFoundException + * If there are problems interpreting the serialized data + */ + public void loadClassifier(InputStream in, Properties props) throws IOException, ClassCastException, + ClassNotFoundException { + loadClassifier(new ObjectInputStream(in), props); + } + + /** + * Load a classifier from the specified input stream. The classifier is + * reinitialized from the flags serialized in the classifier. + * + * @param in + * The InputStream to load the serialized classifier from + * @param props + * This Properties object will be used to update the + * SeqClassifierFlags which are read from the serialized classifier + * @throws IOException + * If there are problems accessing the input stream + * @throws ClassCastException + * If there are problems interpreting the serialized data + * @throws ClassNotFoundException + * If there are problems interpreting the serialized data + */ + public abstract void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, + ClassNotFoundException; + + private InputStream loadStreamFromClasspath(String path) { + InputStream is = getClass().getClassLoader().getResourceAsStream(path); + if (is == null) + return null; + try { + if (path.endsWith(".gz")) + is = new GZIPInputStream(new BufferedInputStream(is)); + else + is = new BufferedInputStream(is); + } catch (IOException e) { + System.err.println("CLASSPATH resource " + path + " is not a GZIP stream!"); + } + return is; + } + + /** + * Loads a classifier from the file specified by loadPath. 
If loadPath ends in + * .gz, uses a GZIPInputStream, else uses a regular FileInputStream. + */ + public void loadClassifier(String loadPath) throws ClassCastException, IOException, ClassNotFoundException { + loadClassifier(loadPath, null); + } + + /** + * Loads a classifier from the file specified by loadPath. If loadPath ends in + * .gz, uses a GZIPInputStream, else uses a regular FileInputStream. + */ + public void loadClassifier(String loadPath, Properties props) throws ClassCastException, IOException, ClassNotFoundException { + InputStream is; + // ms, 10-04-2010: check first is this path exists in our CLASSPATH. This + // takes priority over the file system. + if ((is = loadStreamFromClasspath(loadPath)) != null) { + Timing.startDoing("Loading classifier from " + loadPath); + loadClassifier(is, props); + is.close(); + Timing.endDoing(); + } else { + loadClassifier(new File(loadPath), props); + } + } + + public void loadClassifierNoExceptions(String loadPath) { + loadClassifierNoExceptions(loadPath, null); + } + + public void loadClassifierNoExceptions(String loadPath, Properties props) { + InputStream is; + // ms, 10-04-2010: check first is this path exists in our CLASSPATH. This + // takes priority over the file system. + if ((is = loadStreamFromClasspath(loadPath)) != null) { + Timing.startDoing("Loading classifier from " + loadPath); + loadClassifierNoExceptions(is, props); + try { + is.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + Timing.endDoing(); + } else { + loadClassifierNoExceptions(new File(loadPath), props); + } + } + + public void loadClassifier(File file) throws ClassCastException, IOException, ClassNotFoundException { + loadClassifier(file, null); + } + + /** + * Loads a classifier from the file specified. If the file's name ends in .gz, + * uses a GZIPInputStream, else uses a regular FileInputStream. This method + * closes the File when done. + * + * @param file + * Loads a classifier from this file. 
+ * @param props + * Properties in this object will be used to overwrite those + * specified in the serialized classifier + * + * @throws IOException + * If there are problems accessing the input stream + * @throws ClassCastException + * If there are problems interpreting the serialized data + * @throws ClassNotFoundException + * If there are problems interpreting the serialized data + */ + public void loadClassifier(File file, Properties props) throws ClassCastException, IOException, + ClassNotFoundException { + Timing.startDoing("Loading classifier from " + file.getAbsolutePath()); + BufferedInputStream bis; + if (file.getName().endsWith(".gz")) { + bis = new BufferedInputStream(new GZIPInputStream(new FileInputStream(file))); + } else { + bis = new BufferedInputStream(new FileInputStream(file)); + } + loadClassifier(bis, props); + bis.close(); + Timing.endDoing(); + } + + public void loadClassifierNoExceptions(File file) { + loadClassifierNoExceptions(file, null); + } + + public void loadClassifierNoExceptions(File file, Properties props) { + try { + loadClassifier(file, props); + } catch (Exception e) { + System.err.println("Error deserializing " + file.getAbsolutePath()); + throw new RuntimeException(e); + } + } + + /** + * This function will load a classifier that is stored inside a jar file (if + * it is so stored). The classifier should be specified as its full path + * in a jar. If the classifier is not stored in the jar file or this is not run + * from inside a jar file, then this function will throw a RuntimeException. + * + * @param modelName + * The name of the model file. Iff it ends in .gz, then it is assumed + * to be gzip compressed. + * @param props + * A Properties object which can override certain properties in the + * serialized file, such as the DocumentReaderAndWriter. You can pass + * in null to override nothing. 
+ */ + public void loadJarClassifier(String modelName, Properties props) { + Timing.startDoing("Loading JAR-internal classifier " + modelName); + try { + InputStream is = getClass().getResourceAsStream(modelName); + if (modelName.endsWith(".gz")) { + is = new GZIPInputStream(is); + } + is = new BufferedInputStream(is); + loadClassifier(is, props); + is.close(); + Timing.endDoing(); + } catch (Exception e) { + String msg = "Error loading classifier from jar file (most likely you are not running this code from a jar file or the named classifier is not stored in the jar file)"; + throw new RuntimeException(msg, e); + } + } + + private transient PrintWriter cliqueWriter; + private transient int writtenNum; // = 0; + + /** Print the String features generated from a IN */ + protected void printFeatures(IN wi, Collection features) { + if (flags.printFeatures == null || writtenNum >= flags.printFeaturesUpto) { + return; + } + if (cliqueWriter == null) { + cliqueWriter = IOUtils.getPrintWriterOrDie("feats-" + flags.printFeatures + ".txt"); + writtenNum = 0; + } + if (wi instanceof CoreLabel) { + cliqueWriter.print(wi.get(CoreAnnotations.TextAnnotation.class) + ' ' + wi.get(CoreAnnotations.PartOfSpeechAnnotation.class) + ' ' + + wi.get(CoreAnnotations.GoldAnswerAnnotation.class) + '\t'); + } else { + cliqueWriter.print(wi.get(CoreAnnotations.TextAnnotation.class) + + wi.get(CoreAnnotations.GoldAnswerAnnotation.class) + '\t'); + } + boolean first = true; + List featsList = new ArrayList(features); + Collections.sort(featsList); + for (String feat : featsList) { + if (first) { + first = false; + } else { + cliqueWriter.print(" "); + } + cliqueWriter.print(feat); + } + cliqueWriter.println(); + writtenNum++; + } + + /** Print the String features generated from a token */ + protected void printFeatureLists(IN wi, Collection> features) { + if (flags.printFeatures == null || writtenNum >= flags.printFeaturesUpto) { + return; + } + if (cliqueWriter == null) { + cliqueWriter = 
IOUtils.getPrintWriterOrDie("feats-" + flags.printFeatures + ".txt"); + writtenNum = 0; + } + if (wi instanceof CoreLabel) { + cliqueWriter.print(wi.get(CoreAnnotations.TextAnnotation.class) + ' ' + wi.get(CoreAnnotations.PartOfSpeechAnnotation.class) + ' ' + + wi.get(CoreAnnotations.GoldAnswerAnnotation.class) + '\t'); + } else { + cliqueWriter.print(wi.get(CoreAnnotations.TextAnnotation.class) + + wi.get(CoreAnnotations.GoldAnswerAnnotation.class) + '\t'); + } + boolean first = true; + for (List featList : features) { + List sortedFeatList = new ArrayList(featList); + Collections.sort(sortedFeatList); + for (String feat : sortedFeatList) { + if (first) { + first = false; + } else { + cliqueWriter.print(" "); + } + cliqueWriter.print(feat); + } + cliqueWriter.print(" "); + } + cliqueWriter.println(); + writtenNum++; + } + + public int windowSize() { + return windowSize; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/AcquisitionsPrior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/AcquisitionsPrior.java new file mode 100644 index 0000000..0a9f449 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/AcquisitionsPrior.java @@ -0,0 +1,264 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.StringUtils; + +import java.util.Set; +import java.util.List; +import java.util.ArrayList; + +/** + * @author Jenny Finkel + */ + +public class AcquisitionsPrior extends EntityCachingAbstractSequencePrior { + + double penalty = 4.0; + double penalty1 = 3.0; + double penalty2 = 4.0; + + public AcquisitionsPrior(String backgroundSymbol, Index classIndex, List doc) { + super(backgroundSymbol, classIndex, doc); + } + + public double scoreOf(int[] sequence) { + + Set purchasers = Generics.newHashSet(); + Set purchabrs = Generics.newHashSet(); 
+ Set sellers = Generics.newHashSet(); + Set sellerabrs = Generics.newHashSet(); + Set acquireds = Generics.newHashSet(); + Set acqabrs = Generics.newHashSet(); + + List purchasersL = new ArrayList(); + List purchabrsL = new ArrayList(); + List sellersL = new ArrayList(); + List sellerabrsL = new ArrayList(); + List acquiredsL = new ArrayList(); + List acqabrsL = new ArrayList(); + + double p = 0.0; + for (int i = 0; i < entities.length; i++) { + Entity entity = entities[i]; + if ((i == 0 || entities[i-1] != entity) && entity != null) { + + String type = classIndex.get(entity.type); + String phrase = StringUtils.join(entity.words, " ").toLowerCase(); + if (type.equals("purchaser")) { + purchasers.add(phrase); + purchasersL.add(entity); + } else if (type.equals("purchabr")) { + purchabrs.add(phrase); + purchabrsL.add(entity); + } else if (type.equals("seller")) { + sellers.add(phrase); + sellersL.add(entity); + } else if (type.equals("sellerabr")) { + sellerabrs.add(phrase); + sellerabrsL.add(entity); + } else if (type.equals("acquired")) { + acquireds.add(phrase); + acquiredsL.add(entity); + } else if (type.equals("acqabr")) { + acqabrs.add(phrase); + acqabrsL.add(entity); + } else { + System.err.println("unknown entity type: "+type); + System.exit(0); + } + } + } + + for (Entity purchaser : purchasersL) { + if (purchasers.size() > 1) { + p -= purchaser.words.size() * penalty; + } + String s = StringUtils.join(purchaser.words, "").toLowerCase(); + boolean match = false; + for (Entity purchabr : purchabrsL) { + String s1 = StringUtils.join(purchabr.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s1.length() - 2) { + if (s.indexOf(s1) >= 0) { + match = true; + break; + } + } + if (!match && purchabrs.size() > 0) { + p -= purchaser.words.size() * penalty; + } + } + + for (Entity seller : sellersL) { + if (sellers.size() > 1) { + p -= seller.words.size() * penalty; + } + String s = StringUtils.join(seller.words, 
"").toLowerCase(); + boolean match = false; + for (Entity sellerabr : sellerabrsL) { + String s1 = StringUtils.join(sellerabr.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s1.length() - 2) { + if (s.indexOf(s1) >= 0) { + match = true; + break; + } + } + if (!match && sellerabrs.size() > 0) { + p -= seller.words.size() * penalty; + } + } + + for (Entity acquired : acquiredsL) { + if (acquireds.size() > 1) { + p -= acquired.words.size() * penalty; + } + String s = StringUtils.join(acquired.words, "").toLowerCase(); + boolean match = false; + for (Entity acqabr : acqabrsL) { + String s1 = StringUtils.join(acqabr.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s1.length() - 2) { + if (s.indexOf(s1) >= 0) { + match = true; + break; + } + } + if (!match && acqabrs.size() > 0) { + p -= acquired.words.size() * penalty; + } + } + + + for (Entity purchabr : purchabrsL) { + //p -= purchabr.words.size() * penalty; + String s = StringUtils.join(purchabr.words, "").toLowerCase(); + boolean match = false; + for (Entity purchaser : purchasersL) { + String s1 = StringUtils.join(purchaser.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s1.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + if (!match) { + p -= purchabr.words.size() * penalty2; + } + + match = false; + for (Entity acquired : acquiredsL) { + String s1 = StringUtils.join(acquired.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + for (Entity seller : sellersL) { + String s1 = StringUtils.join(seller.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + if (match) { + p -= 
purchabr.words.size() * penalty1; + } + } + + for (Entity sellerabr : sellerabrsL) { + //p -= sellerabr.words.size() * penalty; + String s = StringUtils.join(sellerabr.words, "").toLowerCase(); + boolean match = false; + for (Entity seller : sellersL) { + String s1 = StringUtils.join(seller.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s1.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + if (!match) { + p -= sellerabr.words.size() * penalty2; + } + + + match = false; + for (Entity acquired : acquiredsL) { + String s1 = StringUtils.join(acquired.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + for (Entity purchaser : purchasersL) { + String s1 = StringUtils.join(purchaser.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + if (match) { + p -= sellerabr.words.size() * penalty1; + } + } + + + for (Entity acqabr : acqabrsL) { + //p -= acqabr.words.size() * penalty; + String s = StringUtils.join(acqabr.words, "").toLowerCase(); + boolean match = false; + for (Entity acquired : acquiredsL) { + String s1 = StringUtils.join(acquired.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s1.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + if (!match) { + p -= acqabr.words.size() * penalty2; + } + + match = false; + for (Entity seller : sellersL) { + String s1 = StringUtils.join(seller.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s.length() - 2) { + if (s1.indexOf(s) >= 0) { + //System.err.println(acqabr.toString(classIndex)+"\n"+seller.toString(classIndex)+"\n"); + match = true; + break; + } + } + for (Entity 
purchaser : purchasersL) { + String s1 = StringUtils.join(purchaser.words, "").toLowerCase(); + //int dist = StringUtils.longestCommonSubstring(s, s1); + //if (dist > s.length() - 2) { + if (s1.indexOf(s) >= 0) { + match = true; + break; + } + } + if (match) { + p -= acqabr.words.size() * penalty1; + } + } + + return p; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/BisequenceEmpiricalNERPrior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/BisequenceEmpiricalNERPrior.java new file mode 100644 index 0000000..5abc6fd --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/BisequenceEmpiricalNERPrior.java @@ -0,0 +1,249 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.util.ArrayUtils; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.HashIndex; +import edu.stanford.nlp.util.StringUtils; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.sequences.SequenceModel; +import edu.stanford.nlp.sequences.SequenceListener; +import edu.stanford.nlp.sequences.SeqClassifierFlags; + +import java.io.*; +import java.util.*; + + +/** + * @author Mengqiu Wang + */ + +public class BisequenceEmpiricalNERPrior { + + private Index tagIndex; + private int backgroundSymbolIndex; + private int numClasses; + private int numTags; + private int[] possibleValues; + private int[] currSequence; + private Index classIndex; + private List wordDoc; + private double[][] entityMatrix, subEntityMatrix; + private List entityList; + private SeqClassifierFlags flags; + + // protected double p1 = -Math.log(0.01); + + private boolean VERBOSE = false; + + public static List debugIndices = Arrays.asList(80, 265, 53+598, 162+598, 163+598); + public static boolean DEBUG = false; + + public 
BisequenceEmpiricalNERPrior(String backgroundSymbol, Index classIndex, Index tagIndex, List doc, Pair matrices, SeqClassifierFlags flags) { + this.flags = flags; + this.classIndex = classIndex; + this.tagIndex = tagIndex; + this.backgroundSymbolIndex = classIndex.indexOf(backgroundSymbol); + this.numClasses = classIndex.size(); + this.numTags = tagIndex.size(); + this.possibleValues = new int[numClasses]; + for (int i=0; i(doc.size()); + for (IN w: doc) { + wordDoc.add(w.get(CoreAnnotations.TextAnnotation.class)); + } + entityMatrix = matrices.first(); + subEntityMatrix = matrices.second(); + } + + public static Pair readEntityMatrices(String fileName, Index tagIndex) { + int numTags = tagIndex.size(); + double[][] matrix = new double[numTags-1][numTags-1]; + for (int i = 0; i < numTags-1; i++) + matrix[i] = new double[numTags-1]; + double[][] subMatrix = new double[numTags-1][numTags-1]; + for (int i = 0; i < numTags-1; i++) + subMatrix[i] = new double[numTags-1]; + + try { + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fileName)))); + String line = null; + int lineCount = 0; + while ((line = br.readLine()) != null) { + line = line.trim(); + String[] parts = line.split("\t"); + for (String part: parts) { + String[] subparts = part.split(" "); + String[] subsubparts = subparts[0].split(":"); + double counts = Double.parseDouble(subparts[1]); + if (counts == 0.0) // smoothing + counts = 1.0; + int tagIndex1 = tagIndex.indexOf(subsubparts[0]); + int tagIndex2 = tagIndex.indexOf(subsubparts[1]); + if (lineCount < numTags-1) + matrix[tagIndex1][tagIndex2] = counts; + else + subMatrix[tagIndex1][tagIndex2] = counts; + } + lineCount++; + } + } catch (Exception ex) { + ex.printStackTrace(); + System.exit(-1); + } + for (int i = 0; i < matrix.length; i++) { + double sum = ArrayMath.sum(matrix[i]); + for (int j = 0; j < matrix[i].length; j++) + matrix[i][j] = Math.log(matrix[i][j] / sum) / 2; + } + for (int i = 0; i < 
subMatrix.length; i++) { + double sum = ArrayMath.sum(subMatrix[i]); + for (int j = 0; j < subMatrix[i].length; j++) + subMatrix[i][j] = Math.log(subMatrix[i][j] / sum); + } + + System.err.println("Matrix: "); + System.err.println(ArrayUtils.toString(matrix)); + System.err.println("SubMatrix: "); + System.err.println(ArrayUtils.toString(subMatrix)); + + return new Pair(matrix, subMatrix); + } + + static class Entity { + public int startPosition; + public int wordsSize; + public String surface; + public int type; + public List subMatch; + public List exactMatch; + + public Entity(int startP, List words, int type) { + this.type = type; + this.startPosition = startP; + this.wordsSize = words.size(); + this.surface = StringUtils.join(words, " "); + } + + /** + * the begining index of other locations where this sequence of + * words appears. + */ + public int[] otherOccurrences; + + public String toString(Index tagIndex) { + StringBuffer sb = new StringBuffer(); + sb.append("\""); + sb.append(surface); + sb.append("\" start: "); + sb.append(startPosition); + sb.append(" type: "); + sb.append(tagIndex.get(type)); + sb.append(" exact matches: ["); + for (Entity exact: exactMatch) { + sb.append(exact.startPosition); + sb.append(":"); + sb.append(exact.surface); + sb.append(" "); + } + sb.append("],"); + sb.append(" sub matches: ["); + for (Entity sub: subMatch) { + sb.append(sub.startPosition); + sb.append(":"); + sb.append(sub.surface); + sb.append(" "); + } + sb.append("]"); + return sb.toString(); + } + } + + public static List extractEntities(int[] sequence, List wordDoc, Index tagIndex, Index classIndex, int backgroundSymbolIndex) { + String rawTag = null; + String[] parts = null; + String currTag = ""; + List currWords = new ArrayList(); + List entityList = new ArrayList(); + + for (int i = 0; i < sequence.length; i++) { + if (sequence[i] != backgroundSymbolIndex) { + rawTag = classIndex.get(sequence[i]); + parts = rawTag.split("-"); + if (parts[0].equals("B")) { // 
B- + if (currWords.size() > 0) { + entityList.add(new Entity(i-currWords.size(), currWords, tagIndex.indexOf(currTag))); + currWords.clear(); + } + currWords.add(wordDoc.get(i)); + currTag = parts[1]; + } else { // I- + if (currWords.size() > 0 && parts[1].equals(currTag)) { // matches proceeding tag + currWords.add(wordDoc.get(i)); + } else { // orphan I- without proceeding B- or mismatch previous tag + if (currWords.size() > 0) { + entityList.add(new Entity(i-currWords.size(), currWords, tagIndex.indexOf(currTag))); + currWords.clear(); + } + currWords.add(wordDoc.get(i)); + currTag = parts[1]; + } + } + } else { + if (currWords.size() > 0) { + entityList.add(new Entity(i-currWords.size(), currWords, tagIndex.indexOf(currTag))); + currWords.clear(); + currTag = ""; + } + } + } + if (currWords.size() > 0) { + entityList.add(new Entity(sequence.length-currWords.size(), currWords, tagIndex.indexOf(currTag))); + } + // build entity matching and sub-entity matching map + for (int i = 0; i < entityList.size(); i++) { + Entity curr = entityList.get(i); + List exact = new ArrayList(); + List subMatch = new ArrayList(); + String currStr = curr.surface; + + for (int j = 0; j < entityList.size(); j++) { + if (i == j) + continue; + Entity other = entityList.get(j); + if (other.surface.indexOf(currStr) != -1) { + if (other.surface.length() == currStr.length()) { + if (i < j) // avoid double-counting + exact.add(other); + } else { // sub-match has no double-counting problem, cause it's one-directional + subMatch.add(other); + } + } + } + + curr.exactMatch = exact; + curr.subMatch = subMatch; + } + + return entityList; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + for (Entity entity: entityList) { + sb.append(entity.startPosition); + sb.append("\t"); + sb.append(entity.surface); + sb.append("\t"); + sb.append(tagIndex.get(entity.type)); + sb.append("\n"); + } + return sb.toString(); + } +} diff --git 
a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/ClassifierCombiner.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/ClassifierCombiner.java new file mode 100644 index 0000000..20facea --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/ClassifierCombiner.java @@ -0,0 +1,358 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.ie.crf.CRFClassifier; +import edu.stanford.nlp.ie.ner.CMMClassifier; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.sequences.DocumentReaderAndWriter; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.ErasureUtils; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.StringUtils; + +import java.io.FileNotFoundException; +import java.io.ObjectInputStream; +import java.io.IOException; +import java.util.*; + +/** + * Merges the outputs of two or more AbstractSequenceClassifiers according to + * a simple precedence scheme: any given base classifier contributes only + * classifications of labels that do not exist in the base classifiers specified + * before, and that do not have any token overlap with labels assigned by + * higher priority classifiers. + *

    + * This is a pure AbstractSequenceClassifier, i.e., it sets the AnswerAnnotation label. + * If you work with NER classifiers, you should use NERClassifierCombiner. This class + * inherits from ClassifierCombiner, and takes care that all AnswerAnnotations are also + * copied to NERAnnotation. + *

    + * You can specify up to 10 base classifiers using the -loadClassifier1 to -loadClassifier10 + * properties. We also maintain the older usage when only two base classifiers were accepted, + * specified using -loadClassifier and -loadAuxClassifier. + *

    + * ms 2009: removed all NER functionality (see NERClassifierCombiner), changed code so it accepts an arbitrary number of base classifiers, removed dead code. + * + * @author Chris Cox + * @author Mihai Surdeanu + */ +public class ClassifierCombiner extends AbstractSequenceClassifier { + + private static final boolean DEBUG = false; + private List> baseClassifiers; + + private static final String DEFAULT_AUX_CLASSIFIER_PATH="/u/nlp/data/ner/goodClassifiers/english.muc.7class.distsim.crf.ser.gz"; + private static final String DEFAULT_CLASSIFIER_PATH="/u/nlp/data/ner/goodClassifiers/english.all.3class.distsim.crf.ser.gz"; + + /** + * @param p Properties File that specifies loadClassifier + * and loadAuxClassifier properties or, alternatively, loadClassifier[1-10] properties. + * @throws FileNotFoundException If classifier files not found + */ + public ClassifierCombiner(Properties p) throws FileNotFoundException { + super(p); + String loadPath1, loadPath2; + List paths = new ArrayList(); + + // + // preferred configuration: specify up to 10 base classifiers using loadClassifier1 to loadClassifier10 properties + // + if((loadPath1 = p.getProperty("loadClassifier1")) != null && (loadPath2 = p.getProperty("loadClassifier2")) != null) { + paths.add(loadPath1); + paths.add(loadPath2); + for(int i = 3; i <= 10; i ++){ + String path; + if ((path = p.getProperty("loadClassifier" + i)) != null) { + paths.add(path); + } + } + loadClassifiers(paths); + } + + // + // second accepted setup (backward compatible): two classifier given in loadClassifier and loadAuxClassifier + // + else if((loadPath1 = p.getProperty("loadClassifier")) != null && (loadPath2 = p.getProperty("loadAuxClassifier")) != null){ + paths.add(loadPath1); + paths.add(loadPath2); + loadClassifiers(paths); + } + + // + // fall back strategy: use the two default paths on NLP machines + // + else { + paths.add(DEFAULT_CLASSIFIER_PATH); + paths.add(DEFAULT_AUX_CLASSIFIER_PATH); + loadClassifiers(paths); + } + } 
+ + /** Loads a series of base classifiers from the paths specified. + * + * @param loadPaths Paths to the base classifiers + * @throws FileNotFoundException If classifier files not found + */ + public ClassifierCombiner(String... loadPaths) throws FileNotFoundException { + super(new Properties()); + List paths = new ArrayList(Arrays.asList(loadPaths)); + loadClassifiers(paths); + } + + + /** Combines a series of base classifiers + * + * @param classifiers The base classifiers + */ + public ClassifierCombiner(AbstractSequenceClassifier... classifiers) { + super(new Properties()); + baseClassifiers = new ArrayList>(Arrays.asList(classifiers)); + flags.backgroundSymbol = baseClassifiers.get(0).flags.backgroundSymbol; + } + + + private void loadClassifiers(List paths) throws FileNotFoundException { + baseClassifiers = new ArrayList>(); + for(String path: paths){ + AbstractSequenceClassifier cls = loadClassifierFromPath(path); + baseClassifiers.add(cls); + if(DEBUG){ + System.err.printf("Successfully loaded classifier #%d from %s.\n", baseClassifiers.size(), path); + } + } + if (baseClassifiers.size() > 0) { + flags.backgroundSymbol = baseClassifiers.get(0).flags.backgroundSymbol; + } + } + + + public static AbstractSequenceClassifier loadClassifierFromPath(String path) + throws FileNotFoundException { + //try loading as a CRFClassifier + try { + return ErasureUtils.uncheckedCast(CRFClassifier.getClassifier(path)); + } catch (Exception e) { + e.printStackTrace(); + } + //try loading as a CMMClassifier + try { + return ErasureUtils.uncheckedCast(CMMClassifier.getClassifier(path)); + } catch (Exception e) { + //fail + //System.err.println("Couldn't load classifier from path :"+path); + FileNotFoundException fnfe = new FileNotFoundException(); + fnfe.initCause(e); + throw fnfe; + } + } + + @Override + public Set labels() { + Set labs = Generics.newHashSet(); + for(AbstractSequenceClassifier cls: baseClassifiers) + labs.addAll(cls.labels()); + return labs; + } + + + /** + 
* Reads the Answer annotations in the given labellings (produced by the base models) + * and combines them using a priority ordering, i.e., for a given baseDocument all + * labellings seen before in the baseDocuments list have higher priority. + * Writes the answer to AnswerAnnotation in the labeling at position 0 + * (considered to be the main document). + * + * @param baseDocuments Results of all base AbstractSequenceClassifier models + * @return A List of IN with the combined annotations. (This is an + * updating of baseDocuments.get(0), not a new List.) + */ + private List mergeDocuments(List> baseDocuments){ + // we should only get here if there is something to merge + assert(! baseClassifiers.isEmpty() && ! baseDocuments.isEmpty()); + // all base outputs MUST have the same length (we generated them internally!) + for(int i = 1; i < baseDocuments.size(); i ++) + assert(baseDocuments.get(0).size() == baseDocuments.get(i).size()); + + // baseLabels.get(i) points to the labels assigned by baseClassifiers.get(i) + List> baseLabels = new ArrayList>(); + Set seenLabels = Generics.newHashSet(); + for (AbstractSequenceClassifier baseClassifier : baseClassifiers) { + Set labs = baseClassifier.labels(); + labs.removeAll(seenLabels); + seenLabels.addAll(labs); + baseLabels.add(labs); + } + String background = baseClassifiers.get(0).flags.backgroundSymbol; + + if (DEBUG) { + for(int i = 0; i < baseLabels.size(); i ++) + System.err.println("mergeDocuments: Using classifier #" + i + " for " + baseLabels.get(i)); + System.err.println("mergeDocuments: Background symbol is " + background); + + System.err.println("Base model outputs:"); + for( int i = 0; i < baseDocuments.size(); i ++){ + System.err.printf("Output of model #%d:", i); + for (IN l : baseDocuments.get(i)) { + System.err.print(' '); + System.err.print(l.get(CoreAnnotations.AnswerAnnotation.class)); + } + System.err.println(); + } + } + + // incrementally merge each additional model with the main model (i.e., 
baseDocuments.get(0)) + // this keeps adding labels from the additional models to mainDocument + // hence, when all is done, mainDocument contains the labels of all base models + List mainDocument = baseDocuments.get(0); + for (int i = 1; i < baseDocuments.size(); i ++) { + mergeTwoDocuments(mainDocument, baseDocuments.get(i), baseLabels.get(i), background); + } + + if (DEBUG) { + System.err.print("Output of combined model:"); + for (IN l: mainDocument) { + System.err.print(' '); + System.err.print(l.get(CoreAnnotations.AnswerAnnotation.class)); + } + System.err.println(); + System.err.println(); + } + + return mainDocument; + } + + + /** This merges in labels from the auxDocument into the mainDocument when + * tokens have one of the labels in auxLabels, and the subsequence + * labeled with this auxLabel does not conflict with any non-background + * labelling in the mainDocument. + */ + static void mergeTwoDocuments(List mainDocument, List auxDocument, Set auxLabels, String background) { + boolean insideAuxTag = false; + boolean auxTagValid = true; + String prevAnswer = background; + Collection constituents = new ArrayList(); + + Iterator auxIterator = auxDocument.listIterator(); + + for (INN wMain : mainDocument) { + String mainAnswer = wMain.get(CoreAnnotations.AnswerAnnotation.class); + INN wAux = auxIterator.next(); + String auxAnswer = wAux.get(CoreAnnotations.AnswerAnnotation.class); + boolean insideMainTag = !mainAnswer.equals(background); + + /* if the auxiliary classifier gave it one of the labels unique to + auxClassifier, we might set the mainLabel to that. */ + if (auxLabels.contains(auxAnswer)) { + if ( ! prevAnswer.equals(auxAnswer) && ! 
prevAnswer.equals(background)) { + if (auxTagValid){ + for (INN wi : constituents) { + wi.set(CoreAnnotations.AnswerAnnotation.class, prevAnswer); + } + } + auxTagValid = true; + constituents = new ArrayList(); + } + insideAuxTag = true; + if (insideMainTag) { auxTagValid = false; } + prevAnswer = auxAnswer; + constituents.add(wMain); + } else { + if (insideAuxTag) { + if (auxTagValid){ + for (INN wi : constituents) { + wi.set(CoreAnnotations.AnswerAnnotation.class, prevAnswer); + } + } + constituents = new ArrayList(); + } + insideAuxTag=false; + auxTagValid = true; + prevAnswer = background; + } + } + // deal with a sequence final auxLabel + if (auxTagValid){ + for (INN wi : constituents) { + wi.set(CoreAnnotations.AnswerAnnotation.class, prevAnswer); + } + } + } + + /** + * Generates the AnswerAnnotation labels of the combined model for the given + * tokens, storing them in place in the tokens. + * + * @param tokens A List of IN + * @return The passed in parameters, which will have the AnswerAnnotation field added/overwritten + */ + @Override + public List classify(List tokens) { + if (baseClassifiers.isEmpty()) { + return tokens; + } + List> baseOutputs = new ArrayList>(); + + // the first base model works in place, modifying the original tokens + List output = baseClassifiers.get(0).classifySentence(tokens); + // classify(List) is supposed to work in place, so add AnswerAnnotation to tokens! 
+ for (int i = 0, sz = output.size(); i < sz; i++) { + tokens.get(i).set(CoreAnnotations.AnswerAnnotation.class, output.get(i).get(CoreAnnotations.AnswerAnnotation.class)); + } + baseOutputs.add(tokens); + + for (int i = 1, sz = baseClassifiers.size(); i < sz; i ++) { + //List copy = deepCopy(tokens); + // no need for deep copy: classifySentence creates a copy of the input anyway + // List copy = tokens; + output = baseClassifiers.get(i).classifySentence(tokens); + baseOutputs.add(output); + } + assert(baseOutputs.size() == baseClassifiers.size()); + List finalAnswer = mergeDocuments(baseOutputs); + + return finalAnswer; + } + + + @SuppressWarnings("unchecked") + @Override + public void train(Collection> docs, + DocumentReaderAndWriter readerAndWriter) { + throw new UnsupportedOperationException(); + } + + @Override + public void printProbsDocument(List document) { + throw new UnsupportedOperationException(); + } + + @Override + public void serializeClassifier(String serializePath) { + throw new UnsupportedOperationException(); + } + + @Override + public void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException { + throw new UnsupportedOperationException(); + } + + @Override + public List classifyWithGlobalInformation(List tokenSeq, CoreMap doc, CoreMap sent) { + return classify(tokenSeq); + } + + /** + * Some basic testing of the ClassifierCombiner. + * + * @param args Command-line arguments as properties: -loadClassifier1 serializedFile -loadClassifier2 serializedFile + * @throws Exception If IO or serialization error loading classifiers + */ + public static void main(String[] args) throws Exception { + Properties props = StringUtils.argsToProperties(args); + ClassifierCombiner ec = new ClassifierCombiner(props); + + System.err.println(ec.classifyToString("Marketing : Sony Hopes to Win Much Bigger Market For Wide Range of Small-Video Products --- By Andrew B. 
Cohen Staff Reporter of The Wall Street Journal")); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EmpiricalNERPrior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EmpiricalNERPrior.java new file mode 100644 index 0000000..de4eff7 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EmpiricalNERPrior.java @@ -0,0 +1,284 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.ling.CoreLabel; + +import java.util.List; + + +/** + * @author Jenny Finkel + */ + +public class EmpiricalNERPrior extends EntityCachingAbstractSequencePrior { + + protected String ORG = "ORGANIZATION"; + protected String PER = "PERSON"; + protected String LOC = "LOCATION"; + protected String MISC = "MISC"; + + public EmpiricalNERPrior(String backgroundSymbol, Index classIndex, List doc) { + super(backgroundSymbol, classIndex, doc); + } + + protected double p1 = -Math.log(0.01); + + protected double dem1 = 6631.0; + protected double p2 = -Math.log(6436.0 / dem1)/2.0; + protected double p3 = -Math.log(188 / dem1)/2.0; + protected double p4 = -Math.log(4 / dem1)/2.0; + protected double p5 = -Math.log(3 / dem1)/2.0; + + protected double dem2 = 3169.0; + protected double p6 = -Math.log(188.0 / dem2)/2.0; + protected double p7 = -Math.log(2975 / dem2)/2.0; + protected double p8 = -Math.log(5 / dem2)/2.0; + protected double p9 = -Math.log(1 / dem2)/2.0; + + protected double dem3 = 3151.0; + protected double p10 = -Math.log(4.0 / dem3)/2.0; + protected double p11 = -Math.log(5 / dem3)/2.0; + protected double p12 = -Math.log(3141 / dem3)/2.0; + protected double p13 = -Math.log(1 / dem3)/2.0; + + protected double dem4 = 2035.0; + protected double p14 = -Math.log(3.0 / dem4)/2.0; + protected double p15 = -Math.log(1 / dem4)/2.0; + protected double p16 = -Math.log(1 / dem4)/2.0; + protected double p17 = 
-Math.log(2030 / dem4)/2.0; + + protected double dem5 = 724.0; + protected double p18 = -Math.log(167.0 / dem5); + protected double p19 = -Math.log(328.0 / dem5); + protected double p20 = -Math.log(5.0 / dem5); + protected double p21 = -Math.log(224.0 / dem5); + + protected double dem6 = 834.0; + protected double p22 = -Math.log(6.0 / dem6); + protected double p23 = -Math.log(819.0 / dem6); + protected double p24 = -Math.log(2.0 / dem6); + protected double p25 = -Math.log(7.0 / dem6); + + protected double dem7 = 1978.0; + protected double p26 = -Math.log(1.0 / dem7); + protected double p27 = -Math.log(22.0 / dem7); + protected double p28 = -Math.log(1941.0 / dem7); + protected double p29 = -Math.log(14.0 / dem7); + + protected double dem8 = 622.0; + protected double p30 = -Math.log(63.0 / dem8); + protected double p31 = -Math.log(191.0 / dem8); + protected double p32 = -Math.log(3.0 / dem8); + protected double p33 = -Math.log(365.0 / dem8); + + public double scoreOf(int[] sequence) { + double p = 0.0; + for (int i = 0; i < entities.length; i++) { + Entity entity = entities[i]; + //System.err.println(entity); + if ((i == 0 || entities[i-1] != entity) && entity != null) { + //System.err.println(1); + int length = entity.words.size(); + String tag1 = classIndex.get(entity.type); + + if (tag1.equals(LOC)) { tag1 = LOC; } + else if (tag1.equals(ORG)) { tag1 = ORG; } + else if (tag1.equals(PER)) { tag1 = PER; } + else if (tag1.equals(MISC)) { tag1 = MISC; } + + int[] other = entities[i].otherOccurrences; + for (int j = 0; j < other.length; j++) { + + Entity otherEntity = null; + for (int k = other[j]; k < other[j]+length && k < entities.length; k++) { + otherEntity = entities[k]; + if (otherEntity != null) { +// if (k > other[j]) { +// System.err.println(entity.words+" "+otherEntity.words); +// } + break; + } + } + // singleton + other instance null? 
+ if (otherEntity == null) { + //p -= length * Math.log(0.1); + //if (entity.words.size() == 1) { + //p -= length * p1; + //} + continue; + } + + int oLength = otherEntity.words.size(); + String tag2 = classIndex.get(otherEntity.type); + + if (tag2.equals(LOC)) { tag2 = LOC; } + else if (tag2.equals(ORG)) { tag2 = ORG; } + else if (tag2.equals(PER)) { tag2 = PER; } + else if (tag2.equals(MISC)) { tag2 = MISC; } + + // exact match?? + boolean exact = false; + int[] oOther = otherEntity.otherOccurrences; + for (int k = 0; k < oOther.length; k++) { + if (oOther[k] >= i && oOther[k] <= i+length-1) { + exact = true; + break; + } + } + + if (exact) { + // entity not complete + if (length != oLength) { + if (tag1 == (tag2)) {// || ((tag1 == LOC && tag2 == ORG) || (tag1 == ORG && tag2 == LOC))) { // || + //p -= Math.abs(oLength - length) * Math.log(0.1); + p -= Math.abs(oLength - length) * p1; + } else if (!(tag1.equals(ORG) && tag2.equals(LOC)) && + !(tag2.equals(LOC) && tag1.equals(ORG))) { + // shorter + p -= (oLength + length) * p1; + } + } + if (tag1 == (LOC)) { + if (tag2 == (LOC)) { + //p -= length * Math.log(6436.0 / dem); + //p -= length * p2; + } else if (tag2 == (ORG)) { + //p -= length * Math.log(188 / dem); + p -= length * p3; + } else if (tag2 == (PER)) { + //p -= length * Math.log(4 / dem); + p -= length * p4; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(3 / dem); + p -= length * p5; + } + } else if (tag1 == (ORG)) { + //double dem = 3169.0; + if (tag2 == (LOC)) { + //p -= length * Math.log(188.0 / dem); + p -= length * p6; + } else if (tag2 == (ORG)) { + //p -= length * Math.log(2975 / dem); + //p -= length * p7; + } else if (tag2 == (PER)) { + //p -= length * Math.log(5 / dem); + p -= length * p8; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(1 / dem); + p -= length * p9; + } + } else if (tag1 == (PER)) { + //double dem = 3151.0; + if (tag2 == (LOC)) { + //p -= length * Math.log(4.0 / dem); + p -= length * p10; + } else if (tag2 == 
(ORG)) { + //p -= length * Math.log(5 / dem); + p -= length * p11; + } else if (tag2 == (PER)) { + //p -= length * Math.log(3141 / dem); + //p -= length * p12; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(1 / dem); + p -= length * p13; + } + } else if (tag1 == (MISC)) { + //double dem = 2035.0; + if (tag2 == (LOC)) { + //p -= length * Math.log(3.0 / dem); + p -= length * p14; + } else if (tag2 == (ORG)) { + //p -= length * Math.log(1 / dem); + p -= length * p15; + } else if (tag2 == (PER)) { + //p -= length * Math.log(1 / dem); + p -= length * p16; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(2030 / dem); + //p -= length * p17; + } + } + } else { + if (tag1 == (LOC)) { + //double dem = 724.0; + if (tag2 == (LOC)) { + //p -= length * Math.log(167.0 / dem); + //p -= length * p18; + } else if (tag2 == (ORG)) { + //p -= length * Math.log(328.0 / dem); + //p -= length * p19; + } else if (tag2 == (PER)) { + //p -= length * Math.log(5.0 / dem); + p -= length * p20; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(224.0 / dem); + p -= length * p21; + } + } else if (tag1 == (ORG)) { + //double dem = 834.0; + if (tag2 == (LOC)) { + //p -= length * Math.log(6.0 / dem); + p -= length * p22; + } else if (tag2 == (ORG)) { + //p -= length * Math.log(819.0 / dem); + //p -= length * p23; + } else if (tag2 == (PER)) { + //p -= length * Math.log(2.0 / dem); + p -= length * p24; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(7.0 / dem); + p -= length * p25; + } + } else if (tag1 == (PER)) { + //double dem = 1978.0; + if (tag2 == (LOC)) { + //p -= length * Math.log(1.0 / dem); + p -= length * p26; + } else if (tag2 == (ORG)) { + //p -= length * Math.log(22.0 / dem); + p -= length * p27; + } else if (tag2 == (PER)) { + //p -= length * Math.log(1941.0 / dem); + //p -= length * p28; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(14.0 / dem); + p -= length * p29; + } + } else if (tag1 == (MISC)) { + //double dem = 622.0; + if (tag2 
== (LOC)) { + //p -= length * Math.log(63.0 / dem); + p -= length * p30; + } else if (tag2 == (ORG)) { + //p -= length * Math.log(191.0 / dem); + p -= length * p31; + } else if (tag2 == (PER)) { + //p -= length * Math.log(3.0 / dem); + p -= length * p32; + } else if (tag2 == (MISC)) { + //p -= length * Math.log(365.0 / dem); + p -= length * p33; + } + } + } + +// if (tag1 == PER) { +// int personIndex = classIndex.indexOf(PER); +// String lastName = entity.words.get(entity.words.size()-1); +// for (int k = 0; k < doc.size(); k++) { +// String w = doc.get(k).word(); +// if (w.equalsIgnoreCase(lastName)) { +// if (sequence[k] != personIndex) { +// p -= p1; +// } +// } +// } +// } + } + } + } + return p; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EmpiricalNERPriorBIO.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EmpiricalNERPriorBIO.java new file mode 100644 index 0000000..ebc1e58 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EmpiricalNERPriorBIO.java @@ -0,0 +1,111 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.sequences.SeqClassifierFlags; + +import java.util.List; + + +/** + * @author Mengqiu Wang + */ + +public class EmpiricalNERPriorBIO extends EntityCachingAbstractSequencePriorBIO { + + private double[][] entityMatrix, subEntityMatrix; + private SeqClassifierFlags flags; + + protected double p1 = Math.log(0.01); + protected double p2 = Math.log(2.0); + protected int ORGIndex, LOCIndex; + + public static boolean DEBUG = false; + + public EmpiricalNERPriorBIO(String backgroundSymbol, Index classIndex, Index tagIndex, List doc, Pair matrices, SeqClassifierFlags flags) { + super(backgroundSymbol, classIndex, tagIndex, doc); + entityMatrix = matrices.first(); + 
subEntityMatrix = matrices.second(); + this.flags = flags; + ORGIndex = tagIndex.indexOf("ORG"); + LOCIndex = tagIndex.indexOf("LOC"); + } + + public double scoreOf(int[] sequence) { + double p = 0.0; + for (int i = 0; i < entities.length; i++) { + EntityBIO entity = entities[i]; + if ((i == 0 || entities[i-1] != entity) && entity != null) { + int length = entity.words.size(); + int tag1 = entity.type; + // String tag1 = classIndex.get(entity.type); + + int[] other = entities[i].otherOccurrences; + for (int j = 0; j < other.length; j++) { + + EntityBIO otherEntity = null; + for (int k = other[j]; k < other[j]+length && k < entities.length; k++) { + otherEntity = entities[k]; + if (otherEntity != null) { + break; + } + } + // singleton + other instance null? + if (otherEntity == null) { + continue; + } + + int oLength = otherEntity.words.size(); + // String tag2 = classIndex.get(otherEntity.type); + int tag2 = otherEntity.type; + + // exact match?? + boolean exact = false; + int[] oOther = otherEntity.otherOccurrences; + for (int k = 0; k < oOther.length; k++) { + if (oOther[k] >= i && oOther[k] <= i+length-1) { + exact = true; + break; + } + } + + double factor = 0; + if (exact) { + if (DEBUG) + System.err.print("Exact match of tag1=" + tagIndex.get(tag1) + ", tag2=" + tagIndex.get(tag2)); + // entity not complete + if (length != oLength) { + // if (DEBUG) + // System.err.println("Entity Not Complete"); + if (tag1 == tag2) { + p += Math.abs(oLength - length) * p1; + } else if (!(tag1 == ORGIndex && tag2 == LOCIndex ) && + !(tag1 == LOCIndex && tag2 == ORGIndex)) { + // shorter + p += (oLength + length) * p1; + } + } + factor = entityMatrix[tag1][tag2]; + } else { + if (DEBUG) + System.err.print("Sub match of tag1=" + tagIndex.get(tag1) + ", tag2=" + tagIndex.get(tag2)); + factor = subEntityMatrix[tag1][tag2]; + } + if (tag1 == tag2) { + if (flags.matchNERIncentive) { + factor = p2; + // factor *= -1; + } else + factor = 0; + } + if (DEBUG) + System.err.println(" of 
factor=" + factor + ", p += " + (length * factor)); + p += length * factor; + } + } + } + return p; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EntityCachingAbstractSequencePrior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EntityCachingAbstractSequencePrior.java new file mode 100644 index 0000000..e7560f2 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EntityCachingAbstractSequencePrior.java @@ -0,0 +1,499 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.StringUtils; +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.sequences.SequenceModel; +import edu.stanford.nlp.sequences.SequenceListener; +import edu.stanford.nlp.ling.CoreAnnotations; + +import java.util.List; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * This class keeps track of all labeled entities and updates the + * its list whenever the label at a point gets changed. This allows + * you to not have to regereate the list everytime, which can be quite + * inefficient. 
+ * + * @author Jenny Finkel + **/ +public abstract class EntityCachingAbstractSequencePrior implements SequenceModel, SequenceListener { + + protected int[] sequence; + protected int backgroundSymbol; + protected int numClasses; + protected int[] possibleValues; + protected Index classIndex; + protected List doc; + + public EntityCachingAbstractSequencePrior(String backgroundSymbol, Index classIndex, List doc) { + this.classIndex = classIndex; + this.backgroundSymbol = classIndex.indexOf(backgroundSymbol); + this.numClasses = classIndex.size(); + this.possibleValues = new int[numClasses]; + for (int i=0; inot index+1) + **/ + public Entity extractEntity(int[] sequence, int position) { + Entity entity = new Entity(); + entity.type = sequence[position]; + entity.startPosition = position; + entity.words = new ArrayList(); + for ( ; position < sequence.length; position++) { + if (sequence[position] == entity.type) { + String word = doc.get(position).get(CoreAnnotations.TextAnnotation.class); + entity.words.add(word); + if (position == sequence.length - 1) { + entity.otherOccurrences = otherOccurrences(entity); + } + } else { + entity.otherOccurrences = otherOccurrences(entity); + break; + } + } + return entity; + } + + /** + * finds other locations in the sequence where the sequence of + * words in this entity occurs. 
+ */ + public int[] otherOccurrences(Entity entity){ + List other = new ArrayList(); + for (int i = 0; i < doc.size(); i++) { + if (i == entity.startPosition) { continue; } + if (matches(entity, i)) { + other.add(Integer.valueOf(i)); + } + } + return toArray(other); + } + + public static int[] toArray(List list) { + int[] arr = new int[list.size()]; + for (int i = 0; i < arr.length; i++) { + arr[i] = list.get(i); + } + return arr; + } + + public boolean matches(Entity entity, int position) { + String word = doc.get(position).get(CoreAnnotations.TextAnnotation.class); + if (word.equalsIgnoreCase(entity.words.get(0))) { + //boolean matches = true; + for (int j = 1; j < entity.words.size(); j++) { + if (position + j >= doc.size()) { + return false; + } + String nextWord = doc.get(position+j).get(CoreAnnotations.TextAnnotation.class); + if (!nextWord.equalsIgnoreCase(entity.words.get(j))) { + return false; + } + } + return true; + } + return false; + } + + + public boolean joiningTwoEntities(int[] sequence, int position) { + if (sequence[position] == backgroundSymbol) { return false; } + if (position > 0 && position < sequence.length - 1) { + return (sequence[position] == sequence[position - 1] && + sequence[position] == sequence[position + 1]); + } + return false; + } + + public boolean splittingTwoEntities(int[] sequence, int position) { + if (position > 0 && position < sequence.length - 1) { + return (entities[position - 1] == entities[position + 1] && + entities[position - 1] != null); + } + return false; + } + + public boolean appendingEntity(int[] sequence, int position) { + if (position > 0) { + if (entities[position - 1] == null) { return false; } + Entity prev = entities[position - 1]; + return (sequence[position] == sequence[position - 1] && + prev.startPosition + prev.words.size() == position); + } + return false; + } + + public boolean prependingEntity(int[] sequence, int position) { + if (position < sequence.length - 1) { + if (entities[position + 1] == 
null) { return false; } + return (sequence[position] == sequence[position + 1]); + } + return false; + } + + public boolean addingSingletonEntity(int[] sequence, int position) { + if (sequence[position] == backgroundSymbol) { return false; } + if (position > 0) { + if (sequence[position - 1] == sequence[position]) { return false; } + } + if (position < sequence.length - 1) { + if (sequence[position + 1] == sequence[position]) { return false; } + } + return true; + } + + public boolean removingEndOfEntity(int[] sequence, int position) { + if (position > 0) { + if (sequence[position - 1] == backgroundSymbol) { return false; } + Entity prev = entities[position - 1]; + if (prev != null) { + return (prev.startPosition + prev.words.size() > position); + } + } + return false; + } + + public boolean removingBeginningOfEntity(int[] sequence, int position) { + if (position < sequence.length - 1) { + if (sequence[position + 1] == backgroundSymbol) { return false; } + Entity next = entities[position + 1]; + if (next != null) { + return (next.startPosition <= position); + } + } + return false; + } + + public boolean noChange(int[] sequence, int position) { + if (position > 0) { + if (sequence[position - 1] == sequence[position]) { + return entities[position - 1] == entities[position]; + } + } + if (position < sequence.length - 1) { + if (sequence[position + 1] == sequence[position]) { + return entities[position] == entities[position + 1]; + } + } + // actually, can't tell. either no change, or singleton + // changed type + return false; + } + + public void updateSequenceElement(int[] sequence, int position, int oldVal) { + if (VERBOSE) System.out.println("changing position "+position+" from " +classIndex.get(oldVal)+" to "+classIndex.get(sequence[position])); + + this.sequence = sequence; + + // no change? + if (noChange(sequence, position)) { + if (VERBOSE) System.out.println("no change"); + if (VERBOSE) System.out.println(this); + return; + } + // are we joining 2 entities? 
+ else if (joiningTwoEntities(sequence, position)) { + if (VERBOSE) System.out.println("joining 2 entities"); + Entity newEntity = new Entity(); + Entity prev = entities[position - 1]; + Entity next = entities[position + 1]; + newEntity.startPosition = prev.startPosition; + newEntity.words = new ArrayList(); + newEntity.words.addAll(prev.words); + String word = doc.get(position).get(CoreAnnotations.TextAnnotation.class); + newEntity.words.add(word); + newEntity.words.addAll(next.words); + newEntity.type = sequence[position]; + List other = new ArrayList(); + for (int i = 0; i < prev.otherOccurrences.length; i++) { + int pos = prev.otherOccurrences[i]; + if (matches(newEntity, pos)) { + other.add(Integer.valueOf(pos)); + } + } + newEntity.otherOccurrences = toArray(other); + addEntityToEntitiesArray(newEntity); + if (VERBOSE) System.out.println(this); + return; + } + // are we splitting up an entity? + else if (splittingTwoEntities(sequence, position)) { + if (VERBOSE) System.out.println("splitting into 2 entities"); + Entity entity = entities[position]; + Entity prev = new Entity(); + prev.type = entity.type; + prev.startPosition = entity.startPosition; + prev.words = new ArrayList(entity.words.subList(0, position - entity.startPosition)); + prev.otherOccurrences = otherOccurrences(prev); + addEntityToEntitiesArray(prev); + Entity next = new Entity(); + next.type = entity.type; + next.startPosition = position + 1; + next.words = new ArrayList(entity.words.subList(position - entity.startPosition + 1, entity.words.size())); + next.otherOccurrences = otherOccurrences(next); + addEntityToEntitiesArray(next); + if (sequence[position] == backgroundSymbol) { + entities[position] = null; + } else { + Entity newEntity = new Entity(); + newEntity.startPosition = position; + newEntity.type = sequence[position]; + newEntity.words = new ArrayList(); + String word = doc.get(position).get(CoreAnnotations.TextAnnotation.class); + newEntity.words.add(word); + 
newEntity.otherOccurrences = otherOccurrences(newEntity); + entities[position] = newEntity; + } + if (VERBOSE) System.out.println(this); + return; + } + // are we prepending to an entity ? + else if (prependingEntity(sequence, position)) { + if (VERBOSE) System.out.println("prepending entity"); + Entity newEntity = new Entity(); + Entity next = entities[position + 1]; + newEntity.startPosition = position; + newEntity.words = new ArrayList(); + String word = doc.get(position).get(CoreAnnotations.TextAnnotation.class); + newEntity.words.add(word); + newEntity.words.addAll(next.words); + newEntity.type = sequence[position]; + //List other = new ArrayList(); + newEntity.otherOccurrences = otherOccurrences(newEntity); + addEntityToEntitiesArray(newEntity); + + if (removingEndOfEntity(sequence, position)) { + if (VERBOSE) System.out.println(" ... and removing end of previous entity."); + Entity prev = entities[position - 1]; + prev.words.remove(prev.words.size()-1); + prev.otherOccurrences = otherOccurrences(prev); + } + if (VERBOSE) System.out.println(this); + return; + } + // are we appending to an entity ? + else if (appendingEntity(sequence, position)) { + if (VERBOSE) System.out.println("appending entity"); + Entity newEntity = new Entity(); + Entity prev = entities[position - 1]; + newEntity.startPosition = prev.startPosition; + newEntity.words = new ArrayList(); + newEntity.words.addAll(prev.words); + String word = doc.get(position).get(CoreAnnotations.TextAnnotation.class); + newEntity.words.add(word); + newEntity.type = sequence[position]; + List other = new ArrayList(); + for (int i = 0; i < prev.otherOccurrences.length; i++) { + int pos = prev.otherOccurrences[i]; + if (matches(newEntity, pos)) { + other.add(Integer.valueOf(pos)); + } + } + newEntity.otherOccurrences = toArray(other); + addEntityToEntitiesArray(newEntity); + + if (removingBeginningOfEntity(sequence, position)) { + if (VERBOSE) System.out.println(" ... 
and removing beginning of next entity."); + entities[position + 1].words.remove(0); + entities[position + 1].startPosition++; + } + if (VERBOSE) System.out.println(this); + return; + } + // adding new singleton entity + else if (addingSingletonEntity(sequence, position)) { + Entity newEntity = new Entity(); + if (VERBOSE) System.out.println("adding singleton entity"); + newEntity.startPosition = position; + newEntity.words = new ArrayList(); + String word = doc.get(position).get(CoreAnnotations.TextAnnotation.class); + newEntity.words.add(word); + newEntity.type = sequence[position]; + newEntity.otherOccurrences = otherOccurrences(newEntity); + addEntityToEntitiesArray(newEntity); + + if (removingEndOfEntity(sequence, position)) { + if (VERBOSE) System.out.println(" ... and removing end of previous entity."); + Entity prev = entities[position - 1]; + prev.words.remove(prev.words.size()-1); + prev.otherOccurrences = otherOccurrences(prev); + } + + if (removingBeginningOfEntity(sequence, position)) { + if (VERBOSE) System.out.println(" ... and removing beginning of next entity."); + entities[position + 1].words.remove(0); + entities[position + 1].startPosition++; + } + + if (VERBOSE) System.out.println(this); + return; + } + // are splitting off the prev entity? + else if (removingEndOfEntity(sequence, position)) { + if (VERBOSE) System.out.println("splitting off prev entity"); + Entity prev = entities[position - 1]; + prev.words.remove(prev.words.size() - 1); + prev.otherOccurrences = otherOccurrences(prev); + entities[position] = null; + } + // are we splitting off the next entity? 
+ else if (removingBeginningOfEntity(sequence, position)) { + if (VERBOSE) System.out.println("splitting off next entity"); + Entity next = entities[position + 1]; + next.words.remove(0); + next.startPosition++; + next.otherOccurrences = otherOccurrences(next); + entities[position] = null; + } else { + entities[position] = null; + } + if (VERBOSE) System.out.println(this); + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < entities.length; i++) { + sb.append(i); + sb.append("\t"); + String word = doc.get(i).get(CoreAnnotations.TextAnnotation.class); + sb.append(word); + sb.append("\t"); + sb.append(classIndex.get(sequence[i])); + if (entities[i] != null) { + sb.append("\t"); + sb.append(entities[i].toString(classIndex)); + } + sb.append("\n"); + } + return sb.toString(); + } + + public String toString(int pos) { + StringBuffer sb = new StringBuffer(); + for (int i = Math.max(0, pos - 10); i < Math.min(entities.length, pos + 10); i++) { + sb.append(i); + sb.append("\t"); + String word = doc.get(i).get(CoreAnnotations.TextAnnotation.class); + sb.append(word); + sb.append("\t"); + sb.append(classIndex.get(sequence[i])); + if (entities[i] != null) { + sb.append("\t"); + sb.append(entities[i].toString(classIndex)); + } + sb.append("\n"); + } + return sb.toString(); + } +} + +class Entity { + public int startPosition; + public List words; + public int type; + + /** + * the begining index of other locations where this sequence of + * words appears. 
+ */ + public int[] otherOccurrences; + + public String toString(Index classIndex) { + StringBuffer sb = new StringBuffer(); + sb.append("\""); + sb.append(StringUtils.join(words, " ")); + sb.append("\" start: "); + sb.append(startPosition); + sb.append(" type: "); + sb.append(classIndex.get(type)); + sb.append(" other_occurrences: "); + sb.append(Arrays.toString(otherOccurrences)); + return sb.toString(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EntityCachingAbstractSequencePriorBIO.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EntityCachingAbstractSequencePriorBIO.java new file mode 100644 index 0000000..7beb72d --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/EntityCachingAbstractSequencePriorBIO.java @@ -0,0 +1,404 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.StringUtils; +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.sequences.SequenceModel; +import edu.stanford.nlp.sequences.SequenceListener; +import edu.stanford.nlp.ling.CoreAnnotations; + +import java.util.List; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * This class keeps track of all labeled entities and updates the + * its list whenever the label at a point gets changed. This allows + * you to not have to regereate the list everytime, which can be quite + * inefficient. 
+ * + * @author Mengqiu Wang + **/ +public abstract class EntityCachingAbstractSequencePriorBIO implements SequenceModel, SequenceListener { + + protected int[] sequence; + protected int backgroundSymbol; + protected int numClasses; + protected int[] possibleValues; + protected Index classIndex; + protected Index tagIndex; + private List wordDoc; + + public EntityCachingAbstractSequencePriorBIO(String backgroundSymbol, Index classIndex, Index tagIndex, List doc) { + this.classIndex = classIndex; + this.tagIndex = tagIndex; + this.backgroundSymbol = classIndex.indexOf(backgroundSymbol); + this.numClasses = classIndex.size(); + this.possibleValues = new int[numClasses]; + for (int i=0; i(doc.size()); + for (IN w: doc) { + wordDoc.add(w.get(CoreAnnotations.TextAnnotation.class)); + } + } + + private boolean VERBOSE = false; + + EntityBIO[] entities; + + public int leftWindow() { + return Integer.MAX_VALUE; // not Markovian! + } + + public int rightWindow() { + return Integer.MAX_VALUE; // not Markovian! + } + + public int[] getPossibleValues(int position) { + return possibleValues; + } + + public double scoreOf(int[] sequence, int pos) { + return scoresOf(sequence, pos)[sequence[pos]]; + } + + /** + * @return the length of the sequence + */ + public int length() { + return wordDoc.size(); + } + + /** + * get the number of classes in the sequence model. 
+ */ + public int getNumClasses() { + return classIndex.size(); + } + + public double[] getConditionalDistribution (int[] sequence, int position) { + double[] probs = scoresOf(sequence, position); + ArrayMath.logNormalize(probs); + probs = ArrayMath.exp(probs); + //System.out.println(this); + return probs; + } + + public double[] scoresOf (int[] sequence, int position) { + double[] probs = new double[numClasses]; + int origClass = sequence[position]; + int oldVal = origClass; + // if (BisequenceEmpiricalNERPrior.debugIndices.indexOf(position) != -1) + // EmpiricalNERPriorBIO.DEBUG = true; + for (int label = 0; label < numClasses; label++) { + if (label != origClass) { + sequence[position] = label; + updateSequenceElement(sequence, position, oldVal); + probs[label] = scoreOf(sequence); + oldVal = label; + // if (BisequenceEmpiricalNERPrior.debugIndices.indexOf(position) != -1) + // System.out.println(this); + } + } + sequence[position] = origClass; + updateSequenceElement(sequence, position, oldVal); + probs[origClass] = scoreOf(sequence); + // EmpiricalNERPriorBIO.DEBUG = false; + return probs; + } + + public void setInitialSequence(int[] initialSequence) { + this.sequence = initialSequence; + entities = new EntityBIO[initialSequence.length]; + Arrays.fill(entities, null); + String rawTag = null; + String[] parts = null; + for (int i = 0; i < initialSequence.length; i++) { + if (initialSequence[i] != backgroundSymbol) { + rawTag = classIndex.get(sequence[i]); + parts = rawTag.split("-"); + //TODO(mengqiu) this needs to be updated, so that initial can be I as well + if (parts[0].equals("B")) { // B- + EntityBIO entity = extractEntity(initialSequence, i, parts[1]); + addEntityToEntitiesArray(entity); + i += entity.words.size() - 1; + } + } + } + } + + private void addEntityToEntitiesArray(EntityBIO entity) { + for (int j = entity.startPosition; j < entity.startPosition + entity.words.size(); j++) { + entities[j] = entity; + } + } + + /** + * extracts the entity 
starting at the given position + * and adds it to the entity list. returns the index + * of the last element in the entity (not index+1) + **/ + public EntityBIO extractEntity(int[] sequence, int position, String tag) { + EntityBIO entity = new EntityBIO(); + entity.type = tagIndex.indexOf(tag); + entity.startPosition = position; + entity.words = new ArrayList(); + entity.words.add(wordDoc.get(position)); + int pos = position + 1; + String rawTag = null; + String[] parts = null; + for ( ; pos < sequence.length; pos++) { + rawTag = classIndex.get(sequence[pos]); + parts = rawTag.split("-"); + if (parts[0].equals("I") && parts[1].equals(tag)) { + String word = wordDoc.get(pos); + entity.words.add(word); + } else { + break; + } + } + entity.otherOccurrences = otherOccurrences(entity); + return entity; + } + + /** + * finds other locations in the sequence where the sequence of + * words in this entity occurs. + */ + public int[] otherOccurrences(EntityBIO entity){ + List other = new ArrayList(); + for (int i = 0; i < wordDoc.size(); i++) { + if (i == entity.startPosition) { continue; } + if (matches(entity, i)) { + other.add(Integer.valueOf(i)); + } + } + return toArray(other); + } + + public static int[] toArray(List list) { + int[] arr = new int[list.size()]; + for (int i = 0; i < arr.length; i++) { + arr[i] = list.get(i); + } + return arr; + } + + public boolean matches(EntityBIO entity, int position) { + String word = wordDoc.get(position); + if (word.equalsIgnoreCase(entity.words.get(0))) { + for (int j = 1; j < entity.words.size(); j++) { + if (position + j >= wordDoc.size()) { + return false; + } + String nextWord = wordDoc.get(position+j); + if (!nextWord.equalsIgnoreCase(entity.words.get(j))) { + return false; + } + } + return true; + } + return false; + } + + public void updateSequenceElement(int[] sequence, int position, int oldVal) { + this.sequence = sequence; + + if (sequence[position] == oldVal) + return; + + if (VERBOSE) System.err.println("changing 
position "+position+" from " +classIndex.get(oldVal)+" to "+classIndex.get(sequence[position])); + + if (sequence[position] == backgroundSymbol) { // new tag is O + String oldRawTag = classIndex.get(oldVal); + String[] oldParts = oldRawTag.split("-"); + if (oldParts[0].equals("B")) { // old tag was a B, current entity definitely affected, also check next one + EntityBIO entity = entities[position]; + if (entity == null) + throw new RuntimeException("oldTag starts with B, entity at position should not be null"); + // remove entities for all words affected by this entity + for (int i=0; i < entity.words.size(); i++) { + entities[position+i] = null; + } + } else { // old tag was a I, check previous one + if (entities[position] != null) { // this was part of an entity, shortened + if (VERBOSE) System.err.println("splitting off prev entity"); + EntityBIO oldEntity = entities[position]; + int oldLen = oldEntity.words.size(); + int offset = position - oldEntity.startPosition; + List newWords = new ArrayList(); + for (int i=0; i 0) + System.err.println("position:" + position +", entities[position-1] = " + entities[position-1].toString(tagIndex)); + } // otherwise, non-entity part I-xxx -> O, no enitty affected + } + } else { + String rawTag = classIndex.get(sequence[position]); + String[] parts = rawTag.split("-"); + if (parts[0].equals("B")) { // new tag is B + if (oldVal == backgroundSymbol) { // start a new entity, may merge with the next word + EntityBIO entity = extractEntity(sequence, position, parts[1]); + addEntityToEntitiesArray(entity); + } else { + String oldRawTag = classIndex.get(oldVal); + String[] oldParts = oldRawTag.split("-"); + if (oldParts[0].equals("B")) { // was a different B-xxx + EntityBIO oldEntity = entities[position]; + if (oldEntity.words.size() > 1) { // remove all old entity, add new singleton + for (int i=0; i< oldEntity.words.size(); i++) + entities[position+i] = null; + EntityBIO entity = extractEntity(sequence, position, parts[1]); + 
addEntityToEntitiesArray(entity); + } else { // extract entity + EntityBIO entity = extractEntity(sequence, position, parts[1]); + addEntityToEntitiesArray(entity); + } + } else { // was I + EntityBIO oldEntity = entities[position]; + if (oldEntity != null) {// break old entity + int oldLen = oldEntity.words.size(); + int offset = position - oldEntity.startPosition; + List newWords = new ArrayList(); + for (int i=0; i 0) { + if (entities[position-1] != null) { + String oldTag = tagIndex.get(entities[position-1].type); + EntityBIO entity = extractEntity(sequence, position-1-entities[position-1].words.size()+1, oldTag); + addEntityToEntitiesArray(entity); + } + } + } else { + String oldRawTag = classIndex.get(oldVal); + String[] oldParts = oldRawTag.split("-"); + if (oldParts[0].equals("B")) { // was a B, clean the B entity first, then check if previous is an entity + EntityBIO oldEntity = entities[position]; + for (int i=0; i 0) { + if (entities[position-1] != null) { + String oldTag = tagIndex.get(entities[position-1].type); + if (VERBOSE) + System.err.println("position:" + position +", entities[position-1] = " + entities[position-1].toString(tagIndex)); + EntityBIO entity = extractEntity(sequence, position-1-entities[position-1].words.size()+1, oldTag); + addEntityToEntitiesArray(entity); + } + } + } else { // was a differnt I-xxx, + if (entities[position] != null) { // shorten the previous one, remove any additional parts + EntityBIO oldEntity = entities[position]; + int oldLen = oldEntity.words.size(); + int offset = position - oldEntity.startPosition; + List newWords = new ArrayList(); + for (int i=0; i 0) { + if (entities[position-1] != null) { + String oldTag = tagIndex.get(entities[position-1].type); + EntityBIO entity = extractEntity(sequence, position-1-entities[position-1].words.size()+1, oldTag); + addEntityToEntitiesArray(entity); + } + } + } + } + } + } + } + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + for 
(int i = 0; i < entities.length; i++) { + sb.append(i); + sb.append("\t"); + String word = wordDoc.get(i); + sb.append(word); + sb.append("\t"); + sb.append(classIndex.get(sequence[i])); + if (entities[i] != null) { + sb.append("\t"); + sb.append(entities[i].toString(tagIndex)); + } + sb.append("\n"); + } + return sb.toString(); + } + + public String toString(int pos) { + StringBuffer sb = new StringBuffer(); + for (int i = Math.max(0, pos - 3); i < Math.min(entities.length, pos + 3); i++) { + sb.append(i); + sb.append("\t"); + String word = wordDoc.get(i); + sb.append(word); + sb.append("\t"); + sb.append(classIndex.get(sequence[i])); + if (entities[i] != null) { + sb.append("\t"); + sb.append(entities[i].toString(tagIndex)); + } + sb.append("\n"); + } + return sb.toString(); + } +} + +class EntityBIO { + public int startPosition; + public List words; + public int type; + + /** + * the begining index of other locations where this sequence of + * words appears. + */ + public int[] otherOccurrences; + + public String toString(Index tagIndex) { + StringBuffer sb = new StringBuffer(); + sb.append("\""); + sb.append(StringUtils.join(words, " ")); + sb.append("\" start: "); + sb.append(startPosition); + sb.append(" type: "); + sb.append(tagIndex.get(type)); + sb.append(" other_occurrences: "); + sb.append(Arrays.toString(otherOccurrences)); + return sb.toString(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NERClassifierCombiner.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NERClassifierCombiner.java new file mode 100644 index 0000000..781429a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NERClassifierCombiner.java @@ -0,0 +1,170 @@ +package edu.stanford.nlp.ie; + +import java.io.FileNotFoundException; +import java.util.List; +import java.util.Properties; + +import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier; +import 
edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.PropertiesUtils; +import edu.stanford.nlp.util.StringUtils; + +/** + * Subclass of ClassifierCombiner that behaves like a NER, by copying the AnswerAnnotation labels to NERAnnotation + * Also, it runs an additional classifier (QuantifiableEntityNormalizer) to recognize numeric entities + * @author Mihai Surdeanu + * + */ +public class NERClassifierCombiner extends ClassifierCombiner { + + private final boolean applyNumericClassifiers; + public static final boolean APPLY_NUMERIC_CLASSIFIERS_DEFAULT = true; + public static final String APPLY_NUMERIC_CLASSIFIERS_PROPERTY = "ner.applyNumericClassifiers"; + + private final boolean useSUTime; + + private final AbstractSequenceClassifier nsc; + + public NERClassifierCombiner(Properties props) + throws FileNotFoundException + { + super(props); + applyNumericClassifiers = PropertiesUtils.getBool(props, APPLY_NUMERIC_CLASSIFIERS_PROPERTY, APPLY_NUMERIC_CLASSIFIERS_DEFAULT); + useSUTime = PropertiesUtils.getBool(props, NumberSequenceClassifier.USE_SUTIME_PROPERTY, NumberSequenceClassifier.USE_SUTIME_DEFAULT); + nsc = new NumberSequenceClassifier(new Properties(), useSUTime, props); + } + + public NERClassifierCombiner(String... loadPaths) + throws FileNotFoundException + { + this(APPLY_NUMERIC_CLASSIFIERS_DEFAULT, NumberSequenceClassifier.USE_SUTIME_DEFAULT, loadPaths); + } + + public NERClassifierCombiner(boolean applyNumericClassifiers, + boolean useSUTime, + String... loadPaths) + throws FileNotFoundException + { + super(loadPaths); + this.applyNumericClassifiers = applyNumericClassifiers; + this.useSUTime = useSUTime; + this.nsc = new NumberSequenceClassifier(useSUTime); + } + + public NERClassifierCombiner(boolean applyNumericClassifiers, + boolean useSUTime, + Properties nscProps, + String... 
loadPaths) + throws FileNotFoundException + { + super(loadPaths); + this.applyNumericClassifiers = applyNumericClassifiers; + this.useSUTime = useSUTime; + this.nsc = new NumberSequenceClassifier(new Properties(), useSUTime, nscProps); + } + + public NERClassifierCombiner(AbstractSequenceClassifier... classifiers) + throws FileNotFoundException + { + this(APPLY_NUMERIC_CLASSIFIERS_DEFAULT, NumberSequenceClassifier.USE_SUTIME_DEFAULT, classifiers); + } + + public NERClassifierCombiner(boolean applyNumericClassifiers, + boolean useSUTime, + AbstractSequenceClassifier... classifiers) + throws FileNotFoundException + { + super(classifiers); + this.applyNumericClassifiers = applyNumericClassifiers; + this.useSUTime = useSUTime; + this.nsc = new NumberSequenceClassifier(useSUTime); + } + + public boolean appliesNumericClassifiers() { + return applyNumericClassifiers; + } + + public boolean usesSUTime() { + return useSUTime; + } + + private static void copyAnswerFieldsToNERField(List l) { + for (INN m: l) { + m.set(CoreAnnotations.NamedEntityTagAnnotation.class, m.get(CoreAnnotations.AnswerAnnotation.class)); + } + } + + @Override + public List classify(List tokens) { + return classifyWithGlobalInformation(tokens, null, null); + } + + @Override + public List classifyWithGlobalInformation(List tokens, final CoreMap document, final CoreMap sentence) { + List output = super.classify(tokens); + if (applyNumericClassifiers) { + try { + // recognizes additional MONEY, TIME, DATE, and NUMBER using a set of deterministic rules + // note: some DATE and TIME entities are recognized by our statistical NER based on MUC + // note: this includes SUTime + // note: requires TextAnnotation, PartOfSpeechTagAnnotation, and AnswerAnnotation + // note: this sets AnswerAnnotation! 
+ recognizeNumberSequences(output, document, sentence); + } catch (Exception e) { + System.err.println("Ignored an exception in NumberSequenceClassifier: (result is that some numbers were not classified)"); + System.err.println("Tokens: " + StringUtils.joinWords(tokens, " ")); + e.printStackTrace(System.err); + } + + // AnswerAnnotation -> NERAnnotation + copyAnswerFieldsToNERField(output); + + try { + // normalizes numeric entities such as MONEY, TIME, DATE, or PERCENT + // note: this uses and sets NamedEntityTagAnnotation! + QuantifiableEntityNormalizer.addNormalizedQuantitiesToEntities(output); + } catch (Exception e) { + System.err.println("Ignored an exception in QuantifiableEntityNormalizer: (result is that entities were not normalized)"); + System.err.println("Tokens: " + StringUtils.joinWords(tokens, " ")); + e.printStackTrace(System.err); + } catch(AssertionError e){ + System.err.println("Ignored an assertion in QuantifiableEntityNormalizer: (result is that entities were not normalized)"); + System.err.println("Tokens: " + StringUtils.joinWords(tokens, " ")); + e.printStackTrace(System.err); + } + } else { + // AnswerAnnotation -> NERAnnotation + copyAnswerFieldsToNERField(output); + } + return output; + } + + private void recognizeNumberSequences(List words, final CoreMap document, final CoreMap sentence) { + // we need to copy here because NumberSequenceClassifier overwrites the AnswerAnnotation + List newWords = NumberSequenceClassifier.copyTokens(words, sentence); + + nsc.classifyWithGlobalInformation(newWords, document, sentence); + + // copy AnswerAnnotation back. Do not overwrite! 
+ // also, copy all the additional annotations generated by SUTime and NumberNormalizer + for (int i = 0, sz = words.size(); i < sz; i++){ + CoreLabel origWord = words.get(i); + CoreLabel newWord = newWords.get(i); + + // System.err.println(newWord.word() + " => " + newWord.get(CoreAnnotations.AnswerAnnotation.class) + " " + origWord.ner()); + + String before = origWord.get(CoreAnnotations.AnswerAnnotation.class); + String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class); + if ((before == null || before.equals(nsc.flags.backgroundSymbol) || before.equals("MISC")) && !newGuess.equals(nsc.flags.backgroundSymbol)) { + origWord.set(CoreAnnotations.AnswerAnnotation.class, newGuess); + } + + // transfer other annotations generated by SUTime or NumberNormalizer + NumberSequenceClassifier.transferAnnotations(newWord, origWord); + + } + } +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NERFeatureFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NERFeatureFactory.java new file mode 100644 index 0000000..d289e54 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NERFeatureFactory.java @@ -0,0 +1,2227 @@ +// NERFeatureFactory -- features for a probabilistic Named Entity Recognizer +// Copyright (c) 2002-2008 Leland Stanford Junior University +// Additional features (c) 2003 The University of Edinburgh +// +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// Support/Questions: java-nlp-user@lists.stanford.edu +// Licensing: java-nlp-support@lists.stanford.edu +// http://nlp.stanford.edu/downloads/crf-classifier.shtml + +package edu.stanford.nlp.ie; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreLabel.GenericAnnotation; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.process.WordShapeClassifier; +import edu.stanford.nlp.sequences.Clique; +import edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter; +import edu.stanford.nlp.sequences.FeatureFactory; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.trees.international.pennchinese.RadicalMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.PaddedList; +import edu.stanford.nlp.util.StringUtils; +import edu.stanford.nlp.util.Timing; + + +/** + * Features for Named Entity Recognition. The code here creates the features + * by processing Lists of CoreLabels. + * Look at {@link SeqClassifierFlags} to see where the flags are set for + * what options to use for what flags. + *

    + * To add a new feature extractor, you should do the following: + *

      + *
    1. Add a variable (boolean, int, String, etc. as appropriate) to + * SeqClassifierFlags to mark if the new extractor is turned on or + * its value, etc. Add it at the bottom of the list of variables + * currently in the class (this avoids problems with older serialized + * files breaking). Make the default value of the variable false/null/0 + * (this is again for backwards compatibility).
    2. + *
    3. Add a clause to the big if/then/else of setProperties(Properties) in + * SeqClassifierFlags. Unless it is a macro option, make the option name + * the same as the variable name used in step 1.
    4. + *
    5. Add code to NERFeatureFactory for this feature. First decide which + * classes (hidden states) are involved in the feature. If only the + * current class, you add the feature extractor to the + * featuresC code, if both the current and previous class, + * then featuresCpC, etc.
    6. + *
    + *

    Parameters can be defined using a Properties file + * (specified on the command-line with -prop propFile), + * or directly on the command line. The following properties are recognized: + *

    + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *

    + *

    + * + * + * If true, a gazette feature fires when all tokens of a gazette entry match + *

    + *

    + * + * + * + * + * + * + * + * + * + * + *

    + *

    + * + * + * + *

    + *

    + * + *

    + *

    + * + * + * + * + * + *

    + *

    + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
    Property NameTypeDefault ValueDescription
    loadClassifier Stringn/aPath to serialized classifier to load
    loadAuxClassifier Stringn/aPath to auxiliary classifier to load.
    serializeToStringn/aPath to serialize classifier to
    trainFileStringn/aPath of file to use as training data
    testFileStringn/aPath of file to use as training data
    mapStringsee belowThis applies at training time or if testing on tab-separated column data. It says what is in each column. It doesn't apply when running on plain text data. The simplest scenario for training is having words and classes in two column. word=0,answer=1 is the default if conllNoTags is specified; otherwise word=0,tag=1,answer=2 is the default. But you can add other columns, such as for a part-of-speech tag, presences in a lexicon, etc. That would only be useful at runtime if you have part-of-speech information or whatever available and are passing it in with the tokens (that is, you can pass to classify CoreLabel tokens with additional fields stored in them).
    useWordbooleantrueGives you feature for w
    useBinnedLengthStringnullIf non-null, treat as a sequence of comma separated integer bounds, where items above the previous bound up to the next bound are binned Len-range
    useNGramsbooleanfalseMake features from letter n-grams, i.e., substrings of the word
    lowercaseNGramsbooleanfalseMake features from letter n-grams only lowercase
    dehyphenateNGramsbooleanfalseRemove hyphens before making features from letter n-grams
    conjoinShapeNGramsbooleanfalseConjoin word shape and n-gram features
    useNeighborNGramsbooleanfalseUse letter n-grams for the previous and current words in the CpC clique. This feature helps languages such as Chinese, but not so much for English
    usePrevbooleanfalseGives you feature for (pw,c), and together with other options enables other previous features, such as (pt,c) [with useTags)
    useNextbooleanfalseGives you feature for (nw,c), and together with other options enables other next features, such as (nt,c) [with useTags)
    useTagsbooleanfalseGives you features for (t,c), (pt,c) [if usePrev], (nt,c) [if useNext]
    useWordPairsbooleanfalseGives you + * features for (pw, w, c) and (w, nw, c)
    useGazettesbooleanfalseIf true, use gazette features (defined by other flags)
    gazetteStringnullThe value can be one or more filenames (names separated by a comma, semicolon or space). + * If provided gazettes are loaded from these files. Each line should be an entity class name, followed by whitespace followed by an entity (which might be a phrase of several tokens with a single space between words). + * Giving this property turns on useGazettes, so you normally don't need to specify it (but can use it to turn off gazettes specified in a properties file).
    sloppyGazettebooleanfalseIf true, a gazette feature fires when any token of a gazette entry matches
    cleanGazettebooleanfalse
    wordShapeStringnoneEither "none" for no wordShape use, or the name of a word shape function recognized by {@link WordShapeClassifier#lookupShaper(String)}
    useSequencesbooleantrueDoes not use any class combination features if this is false
    usePrevSequencesbooleanfalseDoes not use any class combination features using previous classes if this is false
    useNextSequencesbooleanfalseDoes not use any class combination features using next classes if this is false
    useLongSequencesbooleanfalseUse plain higher-order state sequences out to minimum of length or maxLeft
    useBoundarySequencesbooleanfalseUse extra second order class sequence features when previous is CoNLL boundary, so entity knows it can span boundary.
    useTaggySequencesbooleanfalseUse first, second, and third order class and tag sequence interaction features
    useExtraTaggySequencesbooleanfalseAdd in sequences of tags with just current class features
    useTaggySequencesShapeInteractionbooleanfalseAdd in terms that join sequences of 2 or 3 tags with the current shape
    strictlyFirstOrderbooleanfalseAs an override to whatever other options are in effect, deletes all features other than C and CpC clique features when building the classifier
    entitySubclassificationString"IO"If + * set, convert the labeling of classes (but not the background) into + * one of several alternate encodings (IO, IOB1, IOB2, IOE1, IOE2, SBIEO, with + * a S(ingle), B(eginning), + * E(nding), I(nside) 4-way classification for each class. By default, we + * either do no re-encoding, or the CoNLLDocumentIteratorFactory does a + * lossy encoding as IO. Note that this is all CoNLL-specific, and depends on + * their way of prefix encoding classes, and is only implemented by + * the CoNLLDocumentIteratorFactory.
    useSumbooleanfalse
    tolerancedouble1e-4Convergence tolerance in optimization
    printFeaturesStringnullprint out the features of the classifier to a file based on this name (starting with feat-, suffixed "-1" and "-2")
    printFeaturesUptoint-1Print out features for only the first this many datums, if the value is positive.
    useSymTagsbooleanfalseGives you + * features (pt, t, nt, c), (t, nt, c), (pt, t, c)
    useSymWordPairsbooleanfalseGives you + * features (pw, nw, c)
    printClassifierStringnullStyle in which to print the classifier. One of: HighWeight, HighMagnitude, Collection, AllWeights, WeightHistogram
    printClassifierParamint100A parameter + * to the printing style, which may give, for example the number of parameters + * to print
    internbooleanfalseIf true, + * (String) intern read in data and classes and feature (pre-)names such + * as substring features
    intern2booleanfalseIf true, intern all (final) feature names (if only current word and ngram features are used, these will already have been interned by intern, and this is an unnecessary no-op)
    cacheNGramsbooleanfalseIf true, + * record the NGram features that correspond to a String (under the current + * option settings) and reuse rather than recalculating if the String is seen + * again.
    selfTestbooleanfalse
    noMidNGramsbooleanfalseDo not include character n-gram features for n-grams that contain neither the beginning or end of the word
    maxNGramLengint-1If this number is + * positive, n-grams above this size will not be used in the model
    useReversebooleanfalse
    retainEntitySubclassificationbooleanfalseIf true, rather than undoing a recoding of entity tag subtypes (such as BIO variants), just leave them in the output.
    useLemmasbooleanfalseInclude the lemma of a word as a feature.
    usePrevNextLemmasbooleanfalseInclude the previous/next lemma of a word as a feature.
    useLemmaAsWordbooleanfalseInclude the lemma of a word as a feature.
    normalizeTermsbooleanfalseIf this is true, some words are normalized: day and month names are lowercased (as for normalizeTimex) and some British spellings are mapped to American English spellings (e.g., -our/-or, etc.).
    normalizeTimexbooleanfalseIf this is true, capitalization of day and month names is normalized to lowercase
    useNBbooleanfalse
    useTypeSeqsbooleanfalseUse basic zeroeth order word shape features.
    useTypeSeqs2booleanfalseAdd additional first and second order word shape features
    useTypeSeqs3booleanfalseAdds one more first order shape sequence
    useDisjunctivebooleanfalseInclude in features giving disjunctions of words anywhere in the left or right disjunctionWidth words (preserving direction but not position)
    disjunctionWidthint4The number of words on each side of the current word that are included in the disjunction features
    useDisjunctiveShapeInteractionbooleanfalseInclude in features giving disjunctions of words anywhere in the left or right disjunctionWidth words (preserving direction but not position) interacting with the word shape of the current word
    useWideDisjunctivebooleanfalseInclude in features giving disjunctions of words anywhere in the left or right wideDisjunctionWidth words (preserving direction but not position)
    wideDisjunctionWidthint4The number of words on each side of the current word that are included in the disjunction features
    usePositionbooleanfalseUse combination of position in sentence and class as a feature
    useBeginSentbooleanfalseUse combination of initial position in sentence and class (and word shape) as a feature. (Doesn't seem to help.)
    useDisjShapebooleanfalseInclude features giving disjunctions of word shapes anywhere in the left or right disjunctionWidth words (preserving direction but not position)
    useClassFeaturebooleanfalseInclude a feature for the class (as a class marginal). Puts a prior on the classes which is equivalent to how often the feature appeared in the training data.
    useShapeConjunctionsbooleanfalseConjoin shape with tag or position
    useWordTagbooleanfalseInclude word and tag pair features
    useLastRealWordbooleanfalseIff the prev word is of length 3 or less, add an extra feature that combines the word two back and the current word's shape. Weird!
    useNextRealWordbooleanfalseIff the next word is of length 3 or less, add an extra feature that combines the word after next and the current word's shape. Weird!
    useTitlebooleanfalseMatch a word against a list of name titles (Mr, Mrs, etc.)
    useOccurrencePatternsbooleanfalseThis is a very engineered feature designed to capture multiple references to names. If the current word isn't capitalized, followed by a non-capitalized word, and preceded by a word with alphabetic characters, it returns NO-OCCURRENCE-PATTERN. Otherwise, if the previous word is a capitalized NNP, then if in the next 150 words you find this PW-W sequence, you get XY-NEXT-OCCURRENCE-XY, else if you find W you get XY-NEXT-OCCURRENCE-Y. Similarly for backwards and XY-PREV-OCCURRENCE-XY and XY-PREV-OCCURRENCE-Y. Else (if the previous word isn't a capitalized NNP), under analogous rules you get one or more of X-NEXT-OCCURRENCE-YX, X-NEXT-OCCURRENCE-XY, X-NEXT-OCCURRENCE-X, X-PREV-OCCURRENCE-YX, X-PREV-OCCURRENCE-XY, X-PREV-OCCURRENCE-X.
    useTypeySequencesbooleanfalseSome first order word shape patterns.
    useGenericFeaturesbooleanfalseIf true, any features you include in the map will be incorporated into the model with values equal to those given in the file; values are treated as strings unless you use the "realValued" option (described below)
    justifybooleanfalsePrint out all + * feature/class pairs and their weight, and then for each input data + * point, print justification (weights) for active features
    normalizebooleanfalseFor the CMMClassifier (only) if this is true then the Scorer normalizes scores as probabilities.
    useHuberbooleanfalseUse a Huber loss prior rather than the default quadratic loss.
    useQuarticbooleanfalseUse a Quartic prior rather than the default quadratic loss.
    sigmadouble1.0
    epsilondouble0.01Used only as a parameter in the Huber loss: this is the distance from 0 at which the loss changes from quadratic to linear
    beamSizeint30
    maxLeftint2The number of things to the left that have to be cached to run the Viterbi algorithm: the maximum context of class features used.
    maxRightint2The number of things to the right that have to be cached to run the Viterbi algorithm: the maximum context of class features used. The maximum possible clique size to use is (maxLeft + maxRight + 1)
    dontExtendTaggybooleanfalseDon't extend the range of useTaggySequences when maxLeft is increased.
    numFolds int1The number of folds to use for cross-validation.
    startFold int1The starting fold to run.
    numFoldsToRun int1The number of folds to run.
    mergeTags booleanfalseWhether to merge B- and I- tags.
    splitDocumentsbooleantrueWhether or not to split the data into separate documents for training/testing
    maxDocSizeint10000If this number is greater than 0, attempt to split documents bigger than this value into multiple documents at sentence boundaries during testing; otherwise do nothing.
    + *

    + * Note: flags/properties overwrite left to right. That is, the parameter + * setting specified last is the one used. + *

    + *

    + * DOCUMENTATION ON FEATURE TEMPLATES
    + * 

    + * w = word + * t = tag + * p = position (word index in sentence) + * c = class + * p = paren + * g = gazette + * a = abbrev + * s = shape + * r = regent (dependency governor) + * h = head word of phrase + * n(w) = ngrams from w + * g(w) = gazette entries containing w + * l(w) = length of w + * o(...) = occurrence patterns of words + *

    + * useReverse reverses meaning of prev, next everywhere below (on in macro) + *

    + * "Prolog" booleans: , = AND and ; = OR + *

    + * Mac: Y = turned on in -macro, + * + = additional positive things relative to -macro for CoNLL NERFeatureFactory + * (perhaps none...) + * - = Known negative for CoNLL NERFeatureFactory relative to -macro + *

    p + * Bio: + = additional things that are positive for BioCreative + * - = things negative relative to -macro + *

    + * HighMagnitude: There are no (0) to a few (+) to many (+++) high weight + * features of this template. (? = not used in goodCoNLL, but usually = 0) + *

    + * Feature Mac Bio CRFFlags HighMagnitude + * --------------------------------------------------------------------- + * w,c Y useWord 0 (useWord is almost useless with unlimited ngram features, but helps a fraction in goodCoNLL, if only because of prior fiddling + * p,c usePosition ? + * p=0,c useBeginSent ? + * p=0,s,c useBeginSent ? + * t,c Y useTags ++ + * pw,c Y usePrev + + * pt,c Y usePrev,useTags 0 + * nw,c Y useNext ++ + * nt,c Y useNext,useTags 0 + * pw,w,c Y useWordPairs + + * w,nw,c Y useWordPairs + + * pt,t,nt,c useSymTags ? + * t,nt,c useSymTags ? + * pt,t,c useSymTags ? + * pw,nw,c useSymWordPairs ? + *

    + * pc,c Y usePrev,useSequences,usePrevSequences +++ + * pc,w,c Y usePrev,useSequences,usePrevSequences 0 + * nc,c useNext,useSequences,useNextSequences ? + * w,nc,c useNext,useSequences,useNextSequences ? + * pc,nc,c useNext,usePrev,useSequences,usePrevSequences,useNextSequences ? + * w,pc,nc,c useNext,usePrev,useSequences,usePrevSequences,useNextSequences ? + *

    + * (pw;p2w;p3w;p4w),c + useDisjunctive (out to disjunctionWidth now) +++ + * (nw;n2w;n3w;n4w),c + useDisjunctive (out to disjunctionWidth now) ++++ + * (pw;p2w;p3w;p4w),s,c + useDisjunctiveShapeInteraction ? + * (nw;n2w;n3w;n4w),s,c + useDisjunctiveShapeInteraction ? + * (pw;p2w;p3w;p4w),c + useWideDisjunctive (to wideDisjunctionWidth) ? + * (nw;n2w;n3w;n4w),c + useWideDisjunctive (to wideDisjunctionWidth) ? + * (ps;p2s;p3s;p4s),c useDisjShape (out to disjunctionWidth now) ? + * (ns;n2s;n3s;n4s),c useDisjShape (out to disjunctionWidth now) ? + *

    + * pt,pc,t,c Y useTaggySequences + + * p2t,p2c,pt,pc,t,c Y useTaggySequences,maxLeft>=2 + + * p3t,p3c,p2t,p2c,pt,pc,t,c Y useTaggySequences,maxLeft>=3,!dontExtendTaggy ? + * p2c,pc,c Y useLongSequences ++ + * p3c,p2c,pc,c Y useLongSequences,maxLeft>=3 ? + * p4c,p3c,p2c,pc,c Y useLongSequences,maxLeft>=4 ? + * p2c,pc,c,pw=BOUNDARY useBoundarySequences 0 (OK, but!) + *

    + * p2t,pt,t,c - useExtraTaggySequences ? + * p3t,p2t,pt,t,c - useExtraTaggySequences ? + *

    + * p2t,pt,t,s,p2c,pc,c - useTaggySequencesShapeInteraction ? + * p3t,p2t,pt,t,s,p3c,p2c,pc,c useTaggySequencesShapeInteraction ? + *

    + * s,pc,c Y useTypeySequences ++ + * ns,pc,c Y useTypeySequences // error for ps? not? 0 + * ps,pc,s,c Y useTypeySequences 0 + * // p2s,p2c,ps,pc,s,c Y useTypeySequences,maxLeft>=2 // duplicated a useTypeSeqs2 feature + *

    + * n(w),c Y useNGrams (noMidNGrams, MaxNGramLeng, lowercaseNGrams, dehyphenateNGrams) +++ + * n(w),s,c useNGrams,conjoinShapeNGrams ? + *

    + * g,c + useGazFeatures // test refining this? ? + * pg,pc,c + useGazFeatures ? + * ng,c + useGazFeatures ? + * // pg,g,c useGazFeatures ? + * // pg,g,ng,c useGazFeatures ? + * // p2g,p2c,pg,pc,g,c useGazFeatures ? + * g,w,c useMoreGazFeatures ? + * pg,pc,g,c useMoreGazFeatures ? + * g,ng,c useMoreGazFeatures ? + *

    + * g(w),c useGazette,sloppyGazette (contains same word) ? + * g(w),[pw,nw,...],c useGazette,cleanGazette (entire entry matches) ? + *

    + * s,c Y wordShape >= 0 +++ + * ps,c Y wordShape >= 0,useTypeSeqs + + * ns,c Y wordShape >= 0,useTypeSeqs + + * pw,s,c Y wordShape >= 0,useTypeSeqs + + * s,nw,c Y wordShape >= 0,useTypeSeqs + + * ps,s,c Y wordShape >= 0,useTypeSeqs 0 + * s,ns,c Y wordShape >= 0,useTypeSeqs ++ + * ps,s,ns,c Y wordShape >= 0,useTypeSeqs ++ + * pc,ps,s,c Y wordShape >= 0,useTypeSeqs,useTypeSeqs2 0 + * p2c,p2s,pc,ps,s,c Y wordShape >= 0,useTypeSeqs,useTypeSeqs2,maxLeft>=2 +++ + * pc,ps,s,ns,c wordShape >= 0,useTypeSeqs,useTypeSeqs3 ? + *

    + * p2w,s,c if l(pw) <= 3 Y useLastRealWord // weird features, but work 0 + * n2w,s,c if l(nw) <= 3 Y useNextRealWord ++ + * o(pw,w,nw),c Y useOccurrencePatterns // don't fully grok but has to do with capitalized name patterns ++ + *

    + * a,c useAbbr;useMinimalAbbr + * pa,a,c useAbbr + * a,na,c useAbbr + * pa,a,na,c useAbbr + * pa,pc,a,c useAbbr;useMinimalAbbr + * p2a,p2c,pa,pc,a useAbbr + * w,a,c useMinimalAbbr + * p2a,p2c,a,c useMinimalAbbr + *

    + * RESTR. w,(pw,pc;p2w,p2c;p3w,p3c;p4w,p4c) + useParenMatching,maxLeft>=n + *

    + * c - useClassFeature + *

    + * p,s,c - useShapeConjunctions + * t,s,c - useShapeConjunctions + *

    + * w,t,c + useWordTag ? + * w,pt,c + useWordTag ? + * w,nt,c + useWordTag ? + *

    + * r,c useNPGovernor (only for baseNP words) + * r,t,c useNPGovernor (only for baseNP words) + * h,c useNPHead (only for baseNP words) + * h,t,c useNPHead (only for baseNP words) + *

    + *

 *
 * @author Dan Klein
 * @author Jenny Finkel
 * @author Christopher Manning
 * @author Shipra Dingare
 * @author Huy Nguyen
 */
// NOTE(review): this chunk was recovered from a mangled diff; generic type
// parameters (e.g. Collection<String>, PaddedList<IN>) were stripped by the
// mangling and are reproduced as-is — restore them against upstream CoreNLP.
public class NERFeatureFactory extends FeatureFactory {

  private static final long serialVersionUID = -2329726064739185544L;

  public NERFeatureFactory() {
    super();
  }

  /**
   * Initializes the factory from the run-time flags: always loads the
   * gazette, and additionally loads the distributional-similarity lexicon
   * when distsim features are enabled.
   */
  public void init(SeqClassifierFlags flags) {
    super.init(flags);
    initGazette();
    if (flags.useDistSim) {
      initLexicon(flags);
    }
  }

  /**
   * Extracts all the features from the input data at a certain index.
   *
   * @param cInfo The complete data set as a List of WordInfo
   * @param loc The index at which to extract features.
   */
  @Override
  public Collection getCliqueFeatures(PaddedList cInfo, int loc, Clique clique) {
    Collection features = Generics.newHashSet();

    // When the first token carries a DomainAnnotation, every feature is also
    // emitted a second time with a domain-specific suffix.
    // NOTE(review): "doFE" presumably means "do feature expansion" — confirm.
    boolean doFE = cInfo.get(0).containsKey(CoreAnnotations.DomainAnnotation.class);
    String domain = (doFE ? cInfo.get(0).get(CoreAnnotations.DomainAnnotation.class) : null);

// System.err.println(doFE+"\t"+domain);

    // Dispatch on the clique being scored; each branch delegates to the
    // matching featuresXxx() generator and suffixes the results.
    if (clique == cliqueC) {
      //200710: tried making this clique null; didn't improve performance (rafferty)
      Collection c = featuresC(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-C");
      }
    } else if (clique == cliqueCpC) {
      Collection c = featuresCpC(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "CpC");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-CpC");
      }

      // "Next"-class features are produced by asking for CpC features one
      // position to the left.
      c = featuresCnC(cInfo, loc-1);
      addAllInterningAndSuffixing(features, c, "CnC");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-CnC");
      }
    } else if (clique == cliqueCp2C) {
      Collection c = featuresCp2C(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "Cp2C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-Cp2C");
      }
    } else if (clique == cliqueCp3C) {
      Collection c = featuresCp3C(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "Cp3C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-Cp3C");
      }
    } else if (clique == cliqueCp4C) {
      Collection c = featuresCp4C(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "Cp4C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-Cp4C");
      }
    } else if (clique == cliqueCp5C) {
      Collection c = featuresCp5C(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "Cp5C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-Cp5C");
      }
    } else if (clique == cliqueCpCp2C) {
      Collection c = featuresCpCp2C(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "CpCp2C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-CpCp2C");
      }

      c = featuresCpCnC(cInfo, loc-1);
      addAllInterningAndSuffixing(features, c, "CpCnC");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-CpCnC");
      }
    } else if (clique == cliqueCpCp2Cp3C) {
      Collection c = featuresCpCp2Cp3C(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "CpCp2Cp3C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-CpCp2Cp3C");
      }
    } else if (clique == cliqueCpCp2Cp3Cp4C) {
      Collection c = featuresCpCp2Cp3Cp4C(cInfo, loc);
      addAllInterningAndSuffixing(features, c, "CpCp2Cp3Cp4C");
      if (doFE) {
        addAllInterningAndSuffixing(features, c, domain+"-CpCp2Cp3Cp4C");
      }
    }

    // System.err.println(StringUtils.join(features,"\n")+"\n");
    return features;
  }


  // TODO: when breaking serialization, it seems like it would be better to
  // move the lexicon into (Abstract)SequenceClassifier and to do this
  // annotation as part of the ObjectBankWrapper.
  // But note that it is
  // serialized in this object currently and it would then need to be
  // serialized elsewhere or loaded each time

  // word -> distsim cluster id; loaded lazily by initLexicon().
  // NOTE(review): generic parameters were stripped by the diff mangling
  // (upstream this is Map<String,String>) — restore against upstream CoreNLP.
  private Map lexicon;

  /**
   * Loads the distributional-similarity lexicon named by
   * flags.distSimLexicon into {@code lexicon}.  No-op if no lexicon file is
   * configured or if a lexicon has already been loaded.  Two line formats
   * are supported: "terryKoo" (class TAB word) and the default "alexClark"
   * (word WHITESPACE class).
   */
  private void initLexicon(SeqClassifierFlags flags) {
    if (flags.distSimLexicon == null) {
      return;
    }
    if (lexicon != null) {
      return; // already loaded
    }
    Timing.startDoing("Loading distsim lexicon from " + flags.distSimLexicon);
    lexicon = Generics.newHashMap();
    boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat);
    for (String line : ObjectBank.getLineIterator(flags.distSimLexicon,
                                                  flags.inputEncoding)) {
      String word;
      String wordClass;
      if (terryKoo) {
        // terryKoo format: <class>\t<word>; optionally truncate the class
        // id to the first distSimMaxBits characters.
        String[] bits = line.split("\\t");
        word = bits[1];
        wordClass = bits[0];
        if (flags.distSimMaxBits > 0 && wordClass.length() > flags.distSimMaxBits) {
          wordClass = wordClass.substring(0, flags.distSimMaxBits);
        }
      } else {
        // "alexClark"
        String[] bits = line.split("\\s+");
        word = bits[0];
        wordClass = bits[1];
      }
      if ( ! flags.casedDistSim) {
        word = word.toLowerCase();
      }
      if (flags.numberEquivalenceDistSim) {
        // Collapse digits so e.g. "1999" and "2001" share a lexicon entry.
        word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
      }
      lexicon.put(word, wordClass);
    }
    Timing.endDoing();
  }


  /**
   * Annotates every token in {@code info} with its distsim class, falling
   * back to flags.unknownWordDistSimClass for out-of-lexicon words.
   * Note: returns immediately for the WHOLE list as soon as any token is
   * found already annotated — assumes annotation is all-or-nothing.
   */
  private void distSimAnnotate(PaddedList info) {
    for (CoreLabel fl : info) {
      if (fl.has(CoreAnnotations.DistSimAnnotation.class)) { return; }
      String word = getWord(fl);
      if ( ! flags.casedDistSim) {
        word = word.toLowerCase();
      }
      if (flags.numberEquivalenceDistSim) {
        // must mirror the normalization applied in initLexicon()
        word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
      }
      String distSim = lexicon.get(word);
      if (distSim == null) {
        distSim = flags.unknownWordDistSimClass;
      }
      fl.set(CoreAnnotations.DistSimAnnotation.class, distSim);
    }
  }


  // cache: word -> its character n-gram substring features (see useNGrams)
  private Map> wordToSubstrings = Generics.newHashMap();

  /** Drops the n-gram cache and the distsim lexicon to free memory. */
  public void clearMemory() {
    wordToSubstrings = Generics.newHashMap();
    lexicon = null;
  }

  /** Removes internal hyphens from {@code str} (never the first or last
   *  character positions; inputs are padded with &lt; &gt; characters). */
  private static String dehyphenate(String str) {
    // don't take out leading or ending ones, just internal
    // and remember padded with < > characters
    String retStr = str;
    int leng = str.length();
    int hyphen = 2; // start past the '<' pad and the first real character
    do {
      hyphen = retStr.indexOf('-', hyphen);
      if (hyphen >= 0 && hyphen < leng - 2) {
        retStr = retStr.substring(0, hyphen) + retStr.substring(hyphen + 1);
      } else {
        hyphen = -1; // not found, or too close to the end: stop
      }
    } while (hyphen >= 0);
    return retStr;
  }

  /** Replaces spelled-out Greek letter names in {@code str} with "~" so
   *  that e.g. biomedical entity variants share features. */
  private static String greekify(String str) {
    // don't take out leading or ending ones, just internal
    // and remember padded with < > characters

    String pattern = "(alpha)|(beta)|(gamma)|(delta)|(epsilon)|(zeta)|(kappa)|(lambda)|(rho)|(sigma)|(tau)|(upsilon)|(omega)";

    Pattern p = Pattern.compile(pattern);
    Matcher m = p.matcher(str);
    return m.replaceAll("~");
  }

  /* end methods that do transformations */

  /*
   * static booleans that check strings for certain qualities *
   */

  // cdm: this could be improved to handle more name types, such as
  // O'Reilly, DeGuzman, etc. (need a little classifier?!?)
  /** True iff {@code str} is at least two chars, starts with an upper/title
   *  case letter, and has no other uppercase letters (so "Smith" yes,
   *  "McKay" no — see the classifier caveat above). */
  private static boolean isNameCase(String str) {
    if (str.length() < 2) {
      return false;
    }
    if (!(Character.isUpperCase(str.charAt(0)) || Character.isTitleCase(str.charAt(0)))) {
      return false;
    }
    for (int i = 1; i < str.length(); i++) {
      if (Character.isUpperCase(str.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /** True iff {@code str} is non-empty and contains no uppercase letter.
   *  (Empty strings deliberately return false.) */
  private static boolean noUpperCase(String str) {
    if (str.length() < 1) {
      return false;
    }
    for (int i = 0; i < str.length(); i++) {
      if (Character.isUpperCase(str.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /** True iff {@code str} contains at least one letter character. */
  private static boolean hasLetter(String str) {
    if (str.length() < 1) {
      return false;
    }
    for (int i = 0; i < str.length(); i++) {
      if (Character.isLetter(str.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  // Spelled-out or numeric ordinals ("twenty-first", "3rd"), case-insensitive.
  private static final Pattern ordinalPattern = Pattern.compile("(?:(?:first|second|third|fourth|fifth|"+
                                                                "sixth|seventh|eighth|ninth|tenth|"+
                                                                "eleventh|twelfth|thirteenth|"+
                                                                "fourteenth|fifteenth|sixteenth|"+
                                                                "seventeenth|eighteenth|nineteenth|"+
                                                                "twenty|twentieth|thirty|thirtieth|"+
                                                                "forty|fortieth|fifty|fiftieth|"+
                                                                "sixty|sixtieth|seventy|seventieth|"+
                                                                "eighty|eightieth|ninety|ninetieth|"+
                                                                "one|two|three|four|five|six|seven|"+
                                                                "eight|nine|hundred|hundredth)-?)+|[0-9]+(?:st|nd|rd|th)", Pattern.CASE_INSENSITIVE);


  private static final Pattern numberPattern = Pattern.compile("[0-9]+");
  private static final Pattern ordinalEndPattern = Pattern.compile("(?:st|nd|rd|th)", Pattern.CASE_INSENSITIVE);

  /**
   * True iff the token at {@code pos} is (part of) an ordinal: a full
   * ordinal word/number, a number followed by an ordinal suffix token, a
   * suffix token preceded by a number, or a "-" joining two ordinals.
   * NOTE(review): generic parameter stripped by the diff mangling — upstream
   * this takes List&lt;? extends CoreLabel&gt;.
   */
  private boolean isOrdinal(List wordInfos, int pos) {
    CoreLabel c = wordInfos.get(pos);
    String cWord = getWord(c);
    Matcher m = ordinalPattern.matcher(cWord);
    if (m.matches()) { return true; }
    m = numberPattern.matcher(cWord);
    if (m.matches()) {
      // bare number: ordinal only if the NEXT token is an ordinal suffix
      if (pos+1 < wordInfos.size()) {
        CoreLabel n = wordInfos.get(pos+1);
        m = ordinalEndPattern.matcher(getWord(n));
        if (m.matches()) { return true; }
      }
      return false;
    }

    m = ordinalEndPattern.matcher(cWord);
    if (m.matches()) {
      // bare suffix ("th"): ordinal only if the PREVIOUS token is a number
      if (pos > 0) {
        CoreLabel p = wordInfos.get(pos-1);
        m = numberPattern.matcher(getWord(p));
        if (m.matches()) { return true; }
      }
    }
    if (cWord.equals("-")) {
      // hyphen between two ordinals, e.g. "twenty" "-" "first"
      if (pos+1 < wordInfos.size() && pos > 0) {
        CoreLabel p = wordInfos.get(pos-1);
        CoreLabel n = wordInfos.get(pos+1);
        m = ordinalPattern.matcher(getWord(p));
        if (m.matches()) {
          m = ordinalPattern.matcher(getWord(n));
          if (m.matches()) {
            return true;
          }
        }
      }
    }
    return false;
  }

  /* end static booleans that check strings for certain qualities */

  /**
   * Gazette Stuff.
   */

  /** One gazette entry: the feature string to emit, the position of the
   *  matched word within the entry, and the entry's full word sequence. */
  private static class GazetteInfo implements Serializable {
    final String feature;
    final int loc;
    final String[] words;
    private static final long serialVersionUID = -5903728481621584810L;
    public GazetteInfo(String feature, int loc, String[] words) {
      this.feature = feature;
      this.loc = loc;
      this.words = words;
    }
  } // end class GazetteInfo

  // sloppyGazette: word -> feature strings; cleanGazette: word -> entry infos
  private Map> wordToGazetteEntries = Generics.newHashMap();
  private Map> wordToGazetteInfos = Generics.newHashMap();

  /** Reads a gazette file.  Each line of it consists of a class name
   *  (a String not containing whitespace characters), followed by whitespace
   *  characters followed by a phrase, which is one or more tokens separated
   *  by a single space.
   *
   * @param in Where to read the gazette from
   * @throws IOException If IO errors
   */
  private void readGazette(BufferedReader in) throws IOException {
    Pattern p = Pattern.compile("^(\\S+)\\s+(.+)$");
    for (String line; (line = in.readLine()) != null; ) {
      Matcher m = p.matcher(line);
      if (m.matches()) {
        String type = intern(m.group(1));
        String phrase = m.group(2);
        String[] words = phrase.split(" ");
        for (int i = 0; i < words.length; i++) {
          String word = intern(words[i]);
          if (flags.sloppyGazette) {
            // sloppy matching: any single word of an entry fires the feature
            Collection entries = wordToGazetteEntries.get(word);
            if (entries == null) {
              entries = Generics.newHashSet();
              wordToGazetteEntries.put(word, entries);
            }
            // emit both a length-specific and a generic gazette feature
            String feature = intern(type + "-GAZ" + words.length);
            entries.add(feature);
            feature = intern(type + "-GAZ");
            entries.add(feature);
          }
          if (flags.cleanGazette) {
            // clean matching: keep the full entry so the whole phrase can be
            // verified at feature-extraction time
            Collection infos = wordToGazetteInfos.get(word);
            if (infos == null) {
              infos = Generics.newHashSet();
              wordToGazetteInfos.put(word, infos);
            }
            GazetteInfo info = new GazetteInfo(intern(type + "-GAZ" + words.length), i, words);
            infos.add(info);
            info = new GazetteInfo(intern(type + "-GAZ"), i, words);
            infos.add(info);
          }
        }
      }
    }
  }

  // NOTE(review): "Set>>" below is diff-mangling residue — upstream this is
  // Set<Class<? extends GenericAnnotation>>; restore against upstream CoreNLP.
  private Set>> genericAnnotationKeys; // = null; //cache which keys are generic annotations so we don't have to do too many instanceof checks

  /** Populates {@code genericAnnotationKeys} from the keys present on one
   *  representative token {@code c}. */
  @SuppressWarnings({"unchecked", "SuspiciousMethodCalls"})
  private void makeGenericKeyCache(CoreLabel c) {
    genericAnnotationKeys = Generics.newHashSet();
    for (Class key : c.keySet()) {
      if (CoreLabel.genericValues.containsKey(key)) {
        Class> genKey = (Class>) key;
        genericAnnotationKeys.add(genKey);
      }
    }
  }

  // name lists, loaded lazily by featuresC when flags.checkNameList is set
  private Set lastNames; // = null;
  private Set maleNames; // = null;
  private Set femaleNames; // = null;

  private final Pattern titlePattern = Pattern.compile("(Mr|Ms|Mrs|Dr|Miss|Sen|Judge|Sir)\\.?"); // todo: should make static final and add more titles


  protected Collection featuresC(PaddedList 
cInfo, int loc) { + CoreLabel p3 = cInfo.get(loc - 3); + CoreLabel p2 = cInfo.get(loc - 2); + CoreLabel p = cInfo.get(loc - 1); + CoreLabel c = cInfo.get(loc); + CoreLabel n = cInfo.get(loc + 1); + CoreLabel n2 = cInfo.get(loc + 2); + + String cWord = getWord(c); + String pWord = getWord(p); + String nWord = getWord(n); + String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class); + String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class); + String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class); + + Collection featuresC = new ArrayList(); + + if (flags.useDistSim) { + distSimAnnotate(cInfo); + } + + if (flags.useBagOfWords) { + for (IN word : cInfo) { + featuresC.add(getWord(word) + "-BAGOFWORDS"); + } + } + + if (flags.useDistSim && flags.useMoreTags) { + featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD"); + } + + + if (flags.useDistSim) { + featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM"); + } + + + if (flags.useTitle) { + Matcher m = titlePattern.matcher(cWord); + if (m.matches()) { + featuresC.add("IS_TITLE"); + } + } + + + if (flags.useInternal && flags.useExternal ) { + + if (flags.useWord) { + featuresC.add(cWord + "-WORD"); + } + + if (flags.use2W) { + featuresC.add(getWord(p2) + "-P2W"); + featuresC.add(getWord(n2) + "-N2W"); + } + + if (flags.useLC) { + featuresC.add(cWord.toLowerCase() + "-CL"); + featuresC.add(pWord.toLowerCase() + "-PL"); + featuresC.add(nWord.toLowerCase() + "-NL"); + } + + if (flags.useUnknown) { // for true casing + featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class)+"-UNKNOWN"); + featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class)+"-PUNKNOWN"); + featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class)+"-NUNKNOWN"); + } + + if (flags.useLemmas) { + String lem = c.getString(CoreAnnotations.LemmaAnnotation.class); + if (! 
"".equals(lem)) { + featuresC.add(lem + "-LEM"); + } + } + if (flags.usePrevNextLemmas) { + String plem = p.getString(CoreAnnotations.LemmaAnnotation.class); + String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class); + if (! "".equals(plem)) { + featuresC.add(plem + "-PLEM"); + } + if (! "".equals(nlem)) { + featuresC.add(nlem + "-NLEM"); + } + } + + if (flags.checkNameList) { + try { + if (lastNames == null) { + lastNames = Generics.newHashSet(); + + for (String line : ObjectBank.getLineIterator(flags.lastNameList)) { + String[] cols = line.split("\\s+"); + lastNames.add(cols[0]); + } + } + if (maleNames == null) { + maleNames = Generics.newHashSet(); + for (String line : ObjectBank.getLineIterator(flags.maleNameList)) { + String[] cols = line.split("\\s+"); + maleNames.add(cols[0]); + } + } + if (femaleNames == null) { + femaleNames = Generics.newHashSet(); + for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) { + String[] cols = line.split("\\s+"); + femaleNames.add(cols[0]); + } + } + + String name = cWord.toUpperCase(); + if (lastNames.contains(name)) { + featuresC.add("LAST_NAME"); + } + + if (maleNames.contains(name)) { + featuresC.add("MALE_NAME"); + } + + if (femaleNames.contains(name)) { + featuresC.add("FEMALE_NAME"); + } + + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + if (flags.binnedLengths != null) { + int len = cWord.length(); + String featureName = null; + for (int i = 0; i <= flags.binnedLengths.length; i++) { + if (i == flags.binnedLengths.length) { + featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf"; + } else if (len <= flags.binnedLengths[i]) { + featureName = "Len-" + ((i == 0) ? 
1 : flags.binnedLengths[i - 1]) + '-' + flags.binnedLengths[i]; + break; + } + } + featuresC.add(featureName); + } + + if (flags.useABGENE) { + featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE"); + featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE"); + featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE"); + } + + if (flags.useABSTRFreqDict) { + featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); + featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); + featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); + } + + if (flags.useABSTR) { + featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"); + featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT"); + featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT"); + } + + if (flags.useGENIA) { + featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA"); + featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA"); + featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA"); + } + if (flags.useWEBFreqDict) { + featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); + featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + 
c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); + featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); + } + + if (flags.useWEB) { + featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"); + featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB"); + featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB"); + } + + if (flags.useIsURL) { + featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL"); + } + if (flags.useEntityRule) { + featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class)+"-ENTITYRULE"); + } + if (flags.useEntityTypes) { + featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE"); + } + if (flags.useIsDateRange) { + featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE"); + } + + if (flags.useABSTRFreq) { + featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); + } + + if (flags.useFREQ) { + featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); + } + + if (flags.useMoreTags) { + featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD"); + } + + if (flags.usePosition) { + featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION"); + } + if (flags.useBeginSent) { + String pos = c.get(CoreAnnotations.PositionAnnotation.class); + if ("0".equals(pos)) { + featuresC.add("BEGIN-SENT"); + featuresC.add(cShape + "-BEGIN-SENT"); + } else if (Integer.toString(cInfo.size() - 1).equals(pos)) { + featuresC.add("END-SENT"); + featuresC.add(cShape + "-END-SENT"); + } else { + featuresC.add("IN-SENT"); + featuresC.add(cShape + "-IN-SENT"); + } + } + if (flags.useTags) { + 
featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); + } + + if (flags.useOrdinal) { + if (isOrdinal(cInfo, loc)) { + featuresC.add("C_ORDINAL"); + if (isOrdinal(cInfo, loc-1)) { + //System.err.print(getWord(p) + " "); + featuresC.add("PC_ORDINAL"); + } + //System.err.println(cWord); + } + if (isOrdinal(cInfo, loc-1)) { + featuresC.add("P_ORDINAL"); + } + } + + if (flags.usePrev) { + featuresC.add(pWord + "-PW"); + if (flags.useTags) { + featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG"); + } + if (flags.useDistSim) { + featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM"); + } + if (flags.useIsURL) { + featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL"); + } + if (flags.useEntityTypes) { + featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE"); + } + } + + if (flags.useNext) { + featuresC.add(nWord + "-NW"); + if (flags.useTags) { + featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG"); + } + if (flags.useDistSim) { + featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM"); + } + if (flags.useIsURL) { + featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL"); + } + if (flags.useEntityTypes) { + featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE"); + } + } + /*here, entityTypes refers to the type in the PASCAL IE challenge: + * i.e. 
certain words are tagged "Date" or "Location" */ + + + if (flags.useEitherSideWord) { + featuresC.add(pWord + "-EW"); + featuresC.add(nWord + "-EW"); + } + + if (flags.useWordPairs) { + featuresC.add(cWord + '-' + pWord + "-W-PW"); + featuresC.add(cWord + '-' + nWord + "-W-NW"); + } + + if (flags.useSymTags) { + if (flags.useTags) { + featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS"); + featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS"); + featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS"); + } + if (flags.useDistSim) { + featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM"); + featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM"); + featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM"); + } + + } + + if (flags.useSymWordPairs) { + featuresC.add(pWord + '-' + nWord + "-SWORDS"); + } + + if (flags.useGazFeatures) { + if (!c.get(CoreAnnotations.GazAnnotation.class).equals(flags.dropGaz)) { + featuresC.add(c.get(CoreAnnotations.GazAnnotation.class) + "-GAZ"); + } + if (!n.get(CoreAnnotations.GazAnnotation.class).equals(flags.dropGaz)) { + featuresC.add(n.get(CoreAnnotations.GazAnnotation.class) + "-NGAZ"); + } + if (!p.get(CoreAnnotations.GazAnnotation.class).equals(flags.dropGaz)) { + featuresC.add(p.get(CoreAnnotations.GazAnnotation.class) + "-PGAZ"); + } + } + + if (flags.useMoreGazFeatures) { + if 
(!c.get(CoreAnnotations.GazAnnotation.class).equals(flags.dropGaz)) { + featuresC.add(c.get(CoreAnnotations.GazAnnotation.class) + '-' + cWord + "-CG-CW-GAZ"); + if (!n.get(CoreAnnotations.GazAnnotation.class).equals(flags.dropGaz)) { + featuresC.add(c.get(CoreAnnotations.GazAnnotation.class) + '-' + n.get(CoreAnnotations.GazAnnotation.class) + "-CNGAZ"); + } + if (!p.get(CoreAnnotations.GazAnnotation.class).equals(flags.dropGaz)) { + featuresC.add(p.get(CoreAnnotations.GazAnnotation.class) + '-' + c.get(CoreAnnotations.GazAnnotation.class) + "-PCGAZ"); + } + } + } + + if (flags.useAbbr || flags.useMinimalAbbr) { + featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); + } + + if (flags.useAbbr1 || flags.useMinimalAbbr1) { + if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { + featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); + } + } + + if (flags.useAbbr) { + featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); + featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); + featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); + } + + if (flags.useAbbr1) { + if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { + featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); + featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); + featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); + } + } + + if (flags.useChunks) { + featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + 
c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK"); + featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK"); + featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK"); + } + + if (flags.useMinimalAbbr) { + featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); + } + + if (flags.useMinimalAbbr1) { + if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { + featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); + } + } + + String prevVB = "", nextVB = ""; + if (flags.usePrevVB) { + for (int j = loc - 1; ; j--) { + CoreLabel wi = cInfo.get(j); + if (wi == cInfo.getPad()) { + prevVB = "X"; + featuresC.add("X-PVB"); + break; + } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { + featuresC.add(getWord(wi) + "-PVB"); + prevVB = getWord(wi); + break; + } + } + } + + if (flags.useNextVB) { + for (int j = loc + 1; ; j++) { + CoreLabel wi = cInfo.get(j); + if (wi == cInfo.getPad()) { + featuresC.add("X-NVB"); + nextVB = "X"; + break; + } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { + featuresC.add(getWord(wi) + "-NVB"); + nextVB = getWord(wi); + break; + } + } + } + + if (flags.useVB) { + featuresC.add(prevVB + '-' + nextVB + "-PNVB"); + } + + if (flags.useShapeConjunctions) { + featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH"); + if (flags.useTags) { + featuresC.add(c.tag() + cShape + "-TAG-SH"); + } + if (flags.useDistSim) { + featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH"); + } + + } + + if (flags.useWordTag) { + featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T"); + featuresC.add(cWord + '-' + 
p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT"); + featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT"); + } + + if (flags.useNPHead) { + featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW"); + if (flags.useTags) { + featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T"); + } + if (flags.useDistSim) { + featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM"); + } + } + + if (flags.useNPGovernor) { + featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW"); + if (flags.useTags) { + featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T"); + } + if (flags.useDistSim) { + featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1"); + } + } + + if (flags.useHeadGov) { + featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW"); + } + + if (flags.useClassFeature) { + featuresC.add("###"); + } + + if (flags.useFirstWord) { + String firstWord = getWord(cInfo.get(0)); + featuresC.add(firstWord); + } + + if (flags.useNGrams) { + Collection subs = null; + if (flags.cacheNGrams) { + subs = wordToSubstrings.get(cWord); + } + if (subs == null) { + subs = new ArrayList(); + String word = '<' + cWord + '>'; + if (flags.lowercaseNGrams) { + word = word.toLowerCase(); + } + if (flags.dehyphenateNGrams) { + word = dehyphenate(word); + } + if (flags.greekifyNGrams) { + word = greekify(word); + } + // minimum length substring is 2 letters (hardwired) + // hoist flags.noMidNGrams so only linear in word length for that case + if (flags.noMidNGrams) { + int max = flags.maxNGramLeng >= 0 ? 
Math.min(flags.maxNGramLeng, word.length()) : + word.length(); + for (int j = 2; j <= max; j++) { + subs.add(intern('#' + word.substring(0, j) + '#')); + } + int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : + 0; + int lenM1 = word.length() - 1; + for (int i = start; i < lenM1; i++) { + subs.add(intern('#' + word.substring(i) + '#')); + } + } else { + for (int i = 0; i < word.length(); i++) { + for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) { + if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { + continue; + } + subs.add(intern('#' + word.substring(i, j) + '#')); + } + } + } + if (flags.cacheNGrams) { + wordToSubstrings.put(cWord, subs); + } + } + featuresC.addAll(subs); + if (flags.conjoinShapeNGrams) { + for (String str : subs) { + String feat = str + '-' + cShape + "-CNGram-CS"; + featuresC.add(feat); + } + } + } + + if (flags.useGazettes) { + if (flags.sloppyGazette) { + Collection entries = wordToGazetteEntries.get(cWord); + if (entries != null) { + featuresC.addAll(entries); + } + } + if (flags.cleanGazette) { + Collection infos = wordToGazetteInfos.get(cWord); + if (infos != null) { + for (GazetteInfo gInfo : infos) { + boolean ok = true; + for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) { + ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc))); + } + if (ok) { + featuresC.add(gInfo.feature); + } + } + } + } + } + + if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { + featuresC.add(cShape + "-TYPE"); + if (flags.useTypeSeqs) { + featuresC.add(pShape + "-PTYPE"); + featuresC.add(nShape + "-NTYPE"); + featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); + featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); + featuresC.add(pShape + "..." + cShape + "-PCTYPE"); + featuresC.add(cShape + "..." + nShape + "-CNTYPE"); + featuresC.add(pShape + "..." + cShape + "..." 
+ nShape + "-PCNTYPE"); + } + } + + if (flags.useLastRealWord) { + if (pWord.length() <= 3) { + // extending this to check for 2 short words doesn't seem to help.... + featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE"); + } + } + + if (flags.useNextRealWord) { + if (nWord.length() <= 3) { + // extending this to check for 2 short words doesn't seem to help.... + featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE"); + } + } + + if (flags.useOccurrencePatterns) { + featuresC.addAll(occurrencePatterns(cInfo, loc)); + } + + if (flags.useDisjunctive) { + for (int i = 1; i <= flags.disjunctionWidth; i++) { + CoreLabel dn = cInfo.get(loc + i); + CoreLabel dp = cInfo.get(loc - i); + featuresC.add(getWord(dn) + "-DISJN"); + if (flags.useDisjunctiveShapeInteraction) { + featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); + } + featuresC.add(getWord(dp) + "-DISJP"); + if (flags.useDisjunctiveShapeInteraction) { + featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); + } + } + } + + if (flags.useWideDisjunctive) { + for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { + featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); + featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); + } + } + + if (flags.useEitherSideDisjunctive) { + for (int i = 1; i <= flags.disjunctionWidth; i++) { + featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE"); + featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE"); + } + } + + if (flags.useDisjShape) { + for (int i = 1; i <= flags.disjunctionWidth; i++) { + featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); + // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); + featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); + // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); + } + } + + if 
(flags.useExtraTaggySequences) { + if (flags.useTags) { + featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); + featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS"); + } + if (flags.useDistSim) { + featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); + featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1"); + } + } + + if (flags.useMUCFeatures) { + featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class)+"-SECTION"); + featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class)+"-WORD_POSITION"); + featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class)+"-SENT_POSITION"); + featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class)+"-PARA_POSITION"); + featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class)+ '-' +c.get(CoreAnnotations.ShapeAnnotation.class)+"-WORD_POSITION_SHAPE"); + } + } else if (flags.useInternal) { + + if (flags.useWord) { + featuresC.add(cWord + "-WORD"); + } + + if (flags.useNGrams) { + Collection subs = wordToSubstrings.get(cWord); + if (subs == null) { + subs = new ArrayList(); + String word = '<' + cWord + '>'; + if (flags.lowercaseNGrams) { + word = word.toLowerCase(); + } + if (flags.dehyphenateNGrams) { + word = dehyphenate(word); + } + if (flags.greekifyNGrams) { + word = 
greekify(word); + } + for (int i = 0; i < word.length(); i++) { + for (int j = i + 2; j <= word.length(); j++) { + if (flags.noMidNGrams && i != 0 && j != word.length()) { + continue; + } + if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { + continue; + } + //subs.add(intern("#" + word.substring(i, j) + "#")); + subs.add(intern('#' + word.substring(i, j) + '#')); + } + } + if (flags.cacheNGrams) { + wordToSubstrings.put(cWord, subs); + } + } + featuresC.addAll(subs); + if (flags.conjoinShapeNGrams) { + String shape = c.get(CoreAnnotations.ShapeAnnotation.class); + for (String str : subs) { + String feat = str + '-' + shape + "-CNGram-CS"; + featuresC.add(feat); + } + } + } + + if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { + featuresC.add(cShape + "-TYPE"); + } + + if (flags.useOccurrencePatterns) { + featuresC.addAll(occurrencePatterns(cInfo, loc)); + } + + } else if (flags.useExternal) { + + if (flags.usePrev) { + featuresC.add(pWord + "-PW"); + } + + if (flags.useNext) { + featuresC.add(nWord + "-NW"); + } + + if (flags.useWordPairs) { + featuresC.add(cWord + '-' + pWord + "-W-PW"); + featuresC.add(cWord + '-' + nWord + "-W-NW"); + } + + if (flags.useSymWordPairs) { + featuresC.add(pWord + '-' + nWord + "-SWORDS"); + } + + if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { + if (flags.useTypeSeqs) { + featuresC.add(pShape + "-PTYPE"); + featuresC.add(nShape + "-NTYPE"); + featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); + featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); + if (flags.maxLeft > 0) featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps. Might be useful 0th-order + featuresC.add(cShape + "..." + nShape + "-CNTYPE"); + featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE"); + } + } + + if (flags.useLastRealWord) { + if (pWord.length() <= 3) { + featuresC.add(getWord(p2) + "..." 
+ cShape + "-PPW_CTYPE"); + } + } + + if (flags.useNextRealWord) { + if (nWord.length() <= 3) { + featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE"); + } + } + + if (flags.useDisjunctive) { + for (int i = 1; i <= flags.disjunctionWidth; i++) { + CoreLabel dn = cInfo.get(loc + i); + CoreLabel dp = cInfo.get(loc - i); + featuresC.add(getWord(dn) + "-DISJN"); + if (flags.useDisjunctiveShapeInteraction) { + featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); + } + featuresC.add(getWord(dp) + "-DISJP"); + if (flags.useDisjunctiveShapeInteraction) { + featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); + } + } + } + + if (flags.useWideDisjunctive) { + for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { + featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); + featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); + } + } + + if (flags.useDisjShape) { + for (int i = 1; i <= flags.disjunctionWidth; i++) { + featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); + // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); + featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); + // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); + } + } + + } + + // Stuff to add binary features from the additional columns + if (flags.twoStage) { + featuresC.add(c.get(Bin1Annotation.class) + "-BIN1"); + featuresC.add(c.get(Bin2Annotation.class) + "-BIN2"); + featuresC.add(c.get(Bin3Annotation.class) + "-BIN3"); + featuresC.add(c.get(Bin4Annotation.class) + "-BIN4"); + featuresC.add(c.get(Bin5Annotation.class) + "-BIN5"); + featuresC.add(c.get(Bin6Annotation.class) + "-BIN6"); + } + + if(flags.useIfInteger){ + try { + int val = Integer.parseInt(cWord); + if(val > 0) featuresC.add("POSITIVE_INTEGER"); + 
else if(val < 0) featuresC.add("NEGATIVE_INTEGER"); + // System.err.println("FOUND INTEGER"); + } catch(NumberFormatException e){ + // not an integer value, nothing to do + } + } + + //Stuff to add arbitrary features + if (flags.useGenericFeatures) { + //see if we need to cach the keys + if (genericAnnotationKeys == null) { + makeGenericKeyCache(c); + } + //now look through the cached keys + for (Class key : genericAnnotationKeys) { + //System.err.println("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key)); + if(c.get(key) != null && c.get(key) instanceof Collection){ + for(Object ob: (Collection)c.get(key)) + featuresC.add(ob + "-" + CoreLabel.genericValues.get(key)); + }else + featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key)); + } + } + + if(flags.useTopics){ + //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + cWord + "--CWORD"); + featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class)+ "-TopicID"); + featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + "-PTopicID"); + featuresC.add(n.get(CoreAnnotations.TopicAnnotation.class) + "-NTopicID"); + //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-PCNTopicID"); + //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-CNTopicID"); + //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + "-PCTopicID"); + //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + cShape + "-TopicID-SH"); + //asdasd + } + + // NER tag annotations from a previous NER system + if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) { + featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)+ "-CStackedNERTag"); + featuresC.add(cWord + "-" + 
c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)+ "-WCStackedNERTag"); + + if (flags.useNext) { + featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag"); + featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag"); + + if (flags.usePrev) { + featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag"); + featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag"); + } + } + if (flags.usePrev) { + featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag"); + } + } + if(flags.useWordnetFeatures) + featuresC.add(c.get(CoreAnnotations.WordnetSynAnnotation.class)+"-WordnetSyn"); + if(flags.useProtoFeatures) + featuresC.add(c.get(CoreAnnotations.ProtoAnnotation.class)+"-Proto"); + if(flags.usePhraseWordTags) + featuresC.add(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class)+"-PhraseTag"); + if(flags.usePhraseWords) + { + for(String w: c.get(CoreAnnotations.PhraseWordsAnnotation.class)) + featuresC.add(w+"-PhraseWord"); + } + if(flags.useCommonWordsFeature) + featuresC.add(c.get(CoreAnnotations.CommonWordsAnnotation.class)); + + if (flags.useRadical && cWord.length() > 0) { + if (cWord.length() == 1) { + featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + + "-SINGLE-CHAR-RADICAL"); + } else { + featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + + 
"-START-RADICAL"); + featuresC.add(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1)) + + "-END-RADICAL"); + } + for (int i = 0; i < cWord.length(); ++i) { + featuresC.add(RadicalMap.getRadical(cWord.charAt(i)) + + "-RADICAL"); + } + } + + if(flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()){ + String[] ws = c.word().split(flags.splitWordRegex); + for(String s: ws){ + featuresC.add(s+"-SPLITWORD"); + } + } + return featuresC; + } // end featuresC() + + /** + * Binary feature annotations + */ + private static class Bin1Annotation implements CoreAnnotation { + public Class getType() { return String.class; } } + + private static class Bin2Annotation implements CoreAnnotation { + public Class getType() { return String.class; } } + + private static class Bin3Annotation implements CoreAnnotation { + public Class getType() { return String.class; } } + + private static class Bin4Annotation implements CoreAnnotation { + public Class getType() { return String.class; } } + + private static class Bin5Annotation implements CoreAnnotation { + public Class getType() { return String.class; } } + + private static class Bin6Annotation implements CoreAnnotation { + public Class getType() { return String.class; } } + + + + protected Collection featuresCpC(PaddedList cInfo, int loc) { + CoreLabel p = cInfo.get(loc - 1); + CoreLabel c = cInfo.get(loc); + CoreLabel n = cInfo.get(loc + 1); + + String cWord = getWord(c); + String pWord = getWord(p); + String cDS = c.getString(CoreAnnotations.DistSimAnnotation.class); + String pDS = p.getString(CoreAnnotations.DistSimAnnotation.class); + String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class); + String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class); + Collection featuresCpC = new ArrayList(); + + if (flags.noEdgeFeature) + return featuresCpC; + + if (flags.transitionEdgeOnly) { + featuresCpC.add("PSEQ"); + return featuresCpC; + } + + if (flags.useNeighborNGrams) { + int maxLen = pWord.length(); + 
if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) { + maxLen = flags.maxNGramLeng; + } + for (int len = 1; len <= maxLen; ++len) { + featuresCpC.add(pWord.substring(0, len) + "-PREVIOUS-PREFIX"); + } + for (int pos = pWord.length() - maxLen; pos < pWord.length(); ++pos) { + featuresCpC.add(pWord.substring(pos, pWord.length()) + + "-PREVIOUS-SUFFIX"); + } + + maxLen = cWord.length(); + if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) { + maxLen = flags.maxNGramLeng; + } + for (int len = 1; len <= maxLen; ++len) { + featuresCpC.add(cWord.substring(0, len) + "-CURRENT-PREFIX"); + } + for (int pos = cWord.length() - maxLen; pos < cWord.length(); ++pos) { + featuresCpC.add(cWord.substring(pos, cWord.length()) + + "-CURRENT-SUFFIX"); + } + } + + if (flags.useInternal && flags.useExternal ) { + + if (flags.useOrdinal) { + if (isOrdinal(cInfo, loc)) { + featuresCpC.add("C_ORDINAL"); + if (isOrdinal(cInfo, loc-1)) { + featuresCpC.add("PC_ORDINAL"); + } + } + if (isOrdinal(cInfo, loc-1)) { + featuresCpC.add("P_ORDINAL"); + } + } + + if (flags.useAbbr || flags.useMinimalAbbr) { + featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS"); + } + + if (flags.useAbbr1 || flags.useMinimalAbbr1) { + if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { + featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS"); + } + } + + if (flags.useChunkySequences) { + featuresCpC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK"); + } + + if (flags.usePrev) { + if (flags.useSequences && flags.usePrevSequences) { + featuresCpC.add("PSEQ"); + featuresCpC.add(cWord + "-PSEQW"); + featuresCpC.add(pWord+ '-' +cWord + "-PSEQW2"); + + featuresCpC.add(pWord + "-PSEQpW"); + + featuresCpC.add(pDS + "-PSEQpDS"); + 
featuresCpC.add(cDS + "-PSEQcDS"); + featuresCpC.add(pDS+ '-' +cDS + "-PSEQpcDS"); + + if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings)) { + featuresCpC.add(pShape + "-PSEQpS"); + featuresCpC.add(cShape + "-PSEQcS"); + featuresCpC.add(pShape+ '-' +cShape + "-PSEQpcS"); + } + } + } + + if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || + flags.useShapeStrings) + && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) { + if (flags.useTypeSeqs3) { + featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES"); + } + if (flags.useTypeSeqs2) { + featuresCpC.add(pShape + '-' + cShape + "-TYPES"); + } + + if (flags.useYetMoreCpCShapes) { + String p2Shape = cInfo.get(loc - 2).getString(CoreAnnotations.ShapeAnnotation.class); + featuresCpC.add(p2Shape + '-' + pShape + '-' + cShape + "-YMS"); + featuresCpC.add(pShape + '-' + cShape + "-" + n.getString(CoreAnnotations.ShapeAnnotation.class) + "-YMSPCN"); + } + } + + if (flags.useTypeySequences) { + featuresCpC.add(cShape + "-TPS2"); + featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1"); + // featuresCpC.add(pShape) + "-" + cShape) + "-TPS"); // duplicates -TYPES, so now omitted; you may need to slighly increase sigma to duplicate previous results, however. 
+ } + + if (flags.useTaggySequences) { + if (flags.useTags) { + featuresCpC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TS"); + } + if (flags.useDistSim) { + featuresCpC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TS1"); + } + } + + if (flags.useParenMatching) { + if (flags.useReverse) { + if (cWord.equals("(") || cWord.equals("[") || cWord.equals("-LRB-")) { + if (pWord.equals(")") || pWord.equals("]") || pWord.equals("-RRB-")) { + featuresCpC.add("PAREN-MATCH"); + } + } + } else { + if (cWord.equals(")") || cWord.equals("]") || cWord.equals("-RRB-")) { + if (pWord.equals("(") || pWord.equals("[") || pWord.equals("-LRB-")) { + featuresCpC.add("PAREN-MATCH"); + } + } + } + } + if (flags.useEntityTypeSequences) { + featuresCpC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + '-' + c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ETSEQ"); + } + if (flags.useURLSequences) { + featuresCpC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + '-' + c.get(CoreAnnotations.IsURLAnnotation.class) + "-URLSEQ"); + } + } else if (flags.useInternal) { + + if (flags.useSequences && flags.usePrevSequences) { + featuresCpC.add("PSEQ"); + featuresCpC.add(cWord + "-PSEQW"); + } + + if (flags.useTypeySequences) { + featuresCpC.add(cShape + "-TPS2"); + } + + } else if (flags.useExternal) { + + if( ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || + flags.useShapeStrings) + && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) { + if (flags.useTypeSeqs3) { + featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES"); + } + if (flags.useTypeSeqs2) { + featuresCpC.add(pShape + '-' + cShape + "-TYPES"); + } + } + + if (flags.useTypeySequences) { + featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1"); + featuresCpC.add(pShape + '-' + 
c.get(CoreAnnotations.ShapeAnnotation.class) + "-TPS"); + } + } + + return featuresCpC; + } + + protected Collection featuresCp2C(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + CoreLabel p = cInfo.get(loc - 1); + CoreLabel p2 = cInfo.get(loc - 2); + + String cWord = getWord(c); + String pWord = getWord(p); + String p2Word = getWord(p2); + Collection featuresCp2C = new ArrayList(); + + if (flags.useMoreAbbr) { + featuresCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-P2ABBRANS"); + } + + if (flags.useMinimalAbbr) { + featuresCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-P2AP2CABB"); + } + + if (flags.useMinimalAbbr1) { + if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { + featuresCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-P2AP2CABB"); + } + } + + if (flags.useParenMatching) { + if (flags.useReverse) { + if (cWord.equals("(") || cWord.equals("[") || cWord.equals("-LRB-")) { + if ((p2Word.equals(")") || p2Word.equals("]") || p2Word.equals("-RRB-")) && ! (pWord.equals(")") || pWord.equals("]") || pWord.equals("-RRB-"))) { + featuresCp2C.add("PAREN-MATCH"); + } + } + } else { + if (cWord.equals(")") || cWord.equals("]") || cWord.equals("-RRB-")) { + if ((p2Word.equals("(") || p2Word.equals("[") || p2Word.equals("-LRB-")) && ! 
(pWord.equals("(") || pWord.equals("[") || pWord.equals("-LRB-"))) { + featuresCp2C.add("PAREN-MATCH"); + } + } + } + } + + return featuresCp2C; + } + + protected Collection featuresCp3C(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + CoreLabel p = cInfo.get(loc - 1); + CoreLabel p2 = cInfo.get(loc - 2); + CoreLabel p3 = cInfo.get(loc - 3); + + String cWord = getWord(c); + String pWord = getWord(p); + String p2Word = getWord(p2); + String p3Word = getWord(p3); + Collection featuresCp3C = new ArrayList(); + + if (flags.useParenMatching) { + if (flags.useReverse) { + if (cWord.equals("(") || cWord.equals("[")) { + if ((flags.maxLeft >= 3) && (p3Word.equals(")") || p3Word.equals("]")) && !(p2Word.equals(")") || p2Word.equals("]") || pWord.equals(")") || pWord.equals("]"))) { + featuresCp3C.add("PAREN-MATCH"); + } + } + } else { + if (cWord.equals(")") || cWord.equals("]")) { + if ((flags.maxLeft >= 3) && (p3Word.equals("(") || p3Word.equals("[")) && !(p2Word.equals("(") || p2Word.equals("[") || pWord.equals("(") || pWord.equals("["))) { + featuresCp3C.add("PAREN-MATCH"); + } + } + } + } + + return featuresCp3C; + } + + protected Collection featuresCp4C(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + CoreLabel p = cInfo.get(loc - 1); + CoreLabel p2 = cInfo.get(loc - 2); + CoreLabel p3 = cInfo.get(loc - 3); + CoreLabel p4 = cInfo.get(loc - 4); + + String cWord = getWord(c); + String pWord = getWord(p); + String p2Word = getWord(p2); + String p3Word = getWord(p3); + String p4Word = getWord(p4); + + Collection featuresCp4C = new ArrayList(); + + if (flags.useParenMatching) { + if (flags.useReverse) { + if (cWord.equals("(") || cWord.equals("[")) { + if ((flags.maxLeft >= 4) && (p4Word.equals(")") || p4Word.equals("]")) && !(p3Word.equals(")") || p3Word.equals("]") || p2Word.equals(")") || p2Word.equals("]") || pWord.equals(")") || pWord.equals("]"))) { + featuresCp4C.add("PAREN-MATCH"); + } + } + } else { + if (cWord.equals(")") || 
cWord.equals("]")) { + if ((flags.maxLeft >= 4) && (p4Word.equals("(") || p4Word.equals("[")) && !(p3Word.equals("(") || p3Word.equals("[") || p2Word.equals("(") || p2Word.equals("[") || pWord.equals("(") || pWord.equals("["))) { + featuresCp4C.add("PAREN-MATCH"); + } + } + } + } + + return featuresCp4C; + } + + protected Collection featuresCp5C(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + CoreLabel p = cInfo.get(loc - 1); + CoreLabel p2 = cInfo.get(loc - 2); + CoreLabel p3 = cInfo.get(loc - 3); + CoreLabel p4 = cInfo.get(loc - 4); + CoreLabel p5 = cInfo.get(loc - 5); + + String cWord = getWord(c); + String pWord = getWord(p); + String p2Word = getWord(p2); + String p3Word = getWord(p3); + String p4Word = getWord(p4); + String p5Word = getWord(p5); + Collection featuresCp5C = new ArrayList(); + + if (flags.useParenMatching) { + if (flags.useReverse) { + if (cWord.equals("(") || cWord.equals("[")) { + if ((flags.maxLeft >= 5) && (p5Word.equals(")") || p5Word.equals("]")) && !(p4Word.equals(")") || p4Word.equals("]") || p3Word.equals(")") || p3Word.equals("]") || p2Word.equals(")") || p2Word.equals("]") || pWord.equals(")") || pWord.equals("]"))) { + featuresCp5C.add("PAREN-MATCH"); + } + } + } else { + if (cWord.equals(")") || cWord.equals("]")) { + if ((flags.maxLeft >= 5) && (p5Word.equals("(") || p5Word.equals("[")) && !(p4Word.equals("(") || p4Word.equals("[") || p3Word.equals("(") || p3Word.equals("[") || p2Word.equals("(") || p2Word.equals("[") || pWord.equals("(") || pWord.equals("["))) { + featuresCp5C.add("PAREN-MATCH"); + } + } + } + } + return featuresCp5C; + } + + + protected Collection featuresCpCp2C(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + CoreLabel p = cInfo.get(loc - 1); + CoreLabel p2 = cInfo.get(loc - 2); + + String pWord = getWord(p); + String p2Word = getWord(p2); + + Collection featuresCpCp2C = new ArrayList(); + + if (flags.useInternal && flags.useExternal) { + + if (false && flags.useTypeySequences && 
flags.maxLeft >= 2) { // this feature duplicates -TYPETYPES one below, so don't include it (hurts to duplicate)!!! + featuresCpCp2C.add(p2.get(CoreAnnotations.ShapeAnnotation.class) + '-' + p.get(CoreAnnotations.ShapeAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTPS"); + } + + if (flags.useAbbr) { + featuresCpCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-2PABBRANS"); + } + + if (flags.useChunks) { + featuresCpCp2C.add(p2.get(CoreAnnotations.ChunkAnnotation.class) + '-' + p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-2PCHUNKS"); + } + + if (flags.useLongSequences) { + featuresCpCp2C.add("PPSEQ"); + } + if (flags.useBoundarySequences && pWord.equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) { + featuresCpCp2C.add("BNDRY-SPAN-PPSEQ"); + } + // This more complex consistency checker didn't help! 
+ // if (flags.useBoundarySequences) { + // // try enforce consistency over "and" and "," as well as boundary + // if (pWord.equals(CoNLLDocumentIteratorFactory.BOUNDARY) || + // pWord.equalsIgnoreCase("and") || pWord.equalsIgnoreCase("or") || + // pWord.equals(",")) { + // } + // } + + if (flags.useTaggySequences) { + if (flags.useTags) { + featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); + if (flags.useTaggySequencesShapeInteraction) { + featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTS-CS"); + } + } + if (flags.useDistSim) { + featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); + if (flags.useTaggySequencesShapeInteraction) { + featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTS1-CS"); + } + } + } + + if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || + flags.useShapeStrings) + && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { + String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); + String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); + String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); + featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); + } + } else if (flags.useInternal) { + + if (flags.useLongSequences) { + featuresCpCp2C.add("PPSEQ"); + } + } else if 
(flags.useExternal) { + + if (flags.useLongSequences) { + featuresCpCp2C.add("PPSEQ"); + } + + if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || + flags.useShapeStrings) + && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { + String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); + String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); + String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); + featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); + } + } + + return featuresCpCp2C; + } + + + protected Collection featuresCpCp2Cp3C(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + CoreLabel p = cInfo.get(loc - 1); + CoreLabel p2 = cInfo.get(loc - 2); + CoreLabel p3 = cInfo.get(loc - 3); + + Collection featuresCpCp2Cp3C = new ArrayList(); + + if (flags.useTaggySequences) { + if (flags.useTags) { + if (flags.maxLeft >= 3 && !flags.dontExtendTaggy) { + featuresCpCp2Cp3C.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS"); + if (flags.useTaggySequencesShapeInteraction) { + featuresCpCp2Cp3C.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTTS-CS"); + } + } + } + if (flags.useDistSim) { + if (flags.maxLeft >= 3 && !flags.dontExtendTaggy) { + featuresCpCp2Cp3C.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1"); + if 
(flags.useTaggySequencesShapeInteraction) { + featuresCpCp2Cp3C.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTTS1-CS"); + } + } + } + } + + if (flags.maxLeft >= 3) { + if (flags.useLongSequences) { + featuresCpCp2Cp3C.add("PPPSEQ"); + } + if (flags.useBoundarySequences && getWord(p).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) { + featuresCpCp2Cp3C.add("BNDRY-SPAN-PPPSEQ"); + } + } + + return featuresCpCp2Cp3C; + } + + protected Collection featuresCpCp2Cp3Cp4C(PaddedList cInfo, int loc) { + Collection featuresCpCp2Cp3Cp4C = new ArrayList(); + + CoreLabel p = cInfo.get(loc - 1); + + if (flags.maxLeft >= 4) { + if (flags.useLongSequences) { + featuresCpCp2Cp3Cp4C.add("PPPPSEQ"); + } + if (flags.useBoundarySequences && getWord(p).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) { + featuresCpCp2Cp3Cp4C.add("BNDRY-SPAN-PPPPSEQ"); + } + } + + return featuresCpCp2Cp3Cp4C; + } + + + protected Collection featuresCnC(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + + Collection featuresCnC = new ArrayList(); + + if (flags.useNext) { + if (flags.useSequences && flags.useNextSequences) { + featuresCnC.add("NSEQ"); + featuresCnC.add(getWord(c) + "-NSEQW"); + } + } + + return featuresCnC; + } + + + protected Collection featuresCpCnC(PaddedList cInfo, int loc) { + CoreLabel c = cInfo.get(loc); + + Collection featuresCpCnC = new ArrayList(); + + if (flags.useNext && flags.usePrev) { + if (flags.useSequences && flags.usePrevSequences && flags.useNextSequences) { + featuresCpCnC.add("PNSEQ"); + featuresCpCnC.add(getWord(c) + "-PNSEQW"); + } + } + + return featuresCpCnC; + } + + + int reverse(int i) { + return (flags.useReverse ? 
-1 * i : i); + } + + private Collection occurrencePatterns(PaddedList cInfo, int loc) { + // features on last Cap + String word = getWord(cInfo.get(loc)); + String nWord = getWord(cInfo.get(loc + reverse(1))); + CoreLabel p = cInfo.get(loc - reverse(1)); + String pWord = getWord(p); + // System.err.println(word+" "+nWord); + if (!(isNameCase(word) && noUpperCase(nWord) && hasLetter(nWord) && hasLetter(pWord) && p != cInfo.getPad())) { + return Collections.singletonList("NO-OCCURRENCE-PATTERN"); + } + // System.err.println("LOOKING"); + Set l = Generics.newHashSet(); + if (cInfo.get(loc - reverse(1)).getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null && isNameCase(pWord) && cInfo.get(loc - reverse(1)).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP")) { + for (int jump = 3; jump < 150; jump++) { + if (getWord(cInfo.get(loc + reverse(jump))).equals(word)) { + if (getWord(cInfo.get(loc + reverse(jump - 1))).equals(pWord)) { + l.add("XY-NEXT-OCCURRENCE-XY"); + } else { + l.add("XY-NEXT-OCCURRENCE-Y"); + } + } + } + for (int jump = -3; jump > -150; jump--) { + if (getWord(cInfo.get(loc + reverse(jump))).equals(word)) { + if (getWord(cInfo.get(loc + reverse(jump - 1))).equals(pWord)) { + l.add("XY-PREV-OCCURRENCE-XY"); + } else { + l.add("XY-PREV-OCCURRENCE-Y"); + } + } + } + } else { + for (int jump = 3; jump < 150; jump++) { + if (getWord(cInfo.get(loc + reverse(jump))).equals(word)) { + if (isNameCase(getWord(cInfo.get(loc + reverse(jump - 1)))) && (cInfo.get(loc + reverse(jump - 1))).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP")) { + l.add("X-NEXT-OCCURRENCE-YX"); + // System.err.println(getWord(cInfo.get(loc+reverse(jump-1)))); + } else if (isNameCase(getWord(cInfo.get(loc + reverse(jump + 1)))) && (cInfo.get(loc + reverse(jump + 1))).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP")) { + // System.err.println(getWord(cInfo.get(loc+reverse(jump+1)))); + l.add("X-NEXT-OCCURRENCE-XY"); + } 
else { + l.add("X-NEXT-OCCURRENCE-X"); + } + } + } + for (int jump = -3; jump > -150; jump--) { + if (getWord(cInfo.get(loc + jump)) != null && getWord(cInfo.get(loc + jump)).equals(word)) { + if (isNameCase(getWord(cInfo.get(loc + reverse(jump + 1)))) && (cInfo.get(loc + reverse(jump + 1))).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP")) { + l.add("X-PREV-OCCURRENCE-YX"); + // System.err.println(getWord(cInfo.get(loc+reverse(jump+1)))); + } else if (isNameCase(getWord(cInfo.get(loc + reverse(jump - 1)))) && cInfo.get(loc + reverse(jump - 1)).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP")) { + l.add("X-PREV-OCCURRENCE-XY"); + // System.err.println(getWord(cInfo.get(loc+reverse(jump-1)))); + } else { + l.add("X-PREV-OCCURRENCE-X"); + } + } + } + } + /* + if (!l.isEmpty()) { + System.err.println(pWord+" "+word+" "+nWord+" "+l); + } + */ + return l; + } + + String intern(String s) { + if (flags.intern) { + return s.intern(); + } else { + return s; + } + } + + public void initGazette() { + try { + // read in gazettes + if (flags.gazettes == null) { flags.gazettes = new ArrayList(); } + List gazettes = flags.gazettes; + for (String gazetteFile : gazettes) { + BufferedReader r = IOUtils.readerFromString(gazetteFile, flags.inputEncoding); + readGazette(r); + r.close(); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + +} // end class NERFeatureFactory diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NumberNormalizer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NumberNormalizer.java new file mode 100644 index 0000000..0be8cfe --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/NumberNormalizer.java @@ -0,0 +1,801 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier; +import edu.stanford.nlp.ling.CoreAnnotations; +import 
edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.tokensregex.Env; +import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; +import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; +import edu.stanford.nlp.pipeline.ChunkAnnotationUtils; +import edu.stanford.nlp.pipeline.CoreMapAggregator; +import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator; +import edu.stanford.nlp.util.*; + +import java.math.BigDecimal; +import java.util.*; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Provides functions for converting words to numbers + * Unlike QuantifiableEntityNormalizer that normalizes various + * types of quantifiable entities like money and dates, + * NumberNormalizer only normalizes numeric expressions + * (e.g. one => 1, two hundred => 200.0 ) + * + *
    + * This code is somewhat hacked together, so should be reworked. + * + *
    + * There is a library in perl for parsing english numbers: + * http://blog.cordiner.net/2010/01/02/parsing-english-numbers-with-perl/ + * + *

    + * TODO: To be merged into QuantifiableEntityNormalizer. + * It can be used by QuantifiableEntityNormalizer + * to first convert numbers expressed as words + * into numeric quantities before figuring + * out how to do higher level combos + * (like one hundred dollars and five cents) + *
    + * TODO: Known to not handle the following: + * oh: two oh one + * non-integers: one and a half, one point five, three fifth + * funky numbers: pi + *
    + * TODO: This class is very language dependent + * Should really be AmericanEnglishNumberNormalizer + *
    + * TODO: Make things not static + * + * @author Angel Chang + */ +public class NumberNormalizer { + + private NumberNormalizer() {} // static class + + private static final Logger logger = Logger.getLogger(NumberNormalizer.class.getName()); + // TODO: make this not static, let different NumberNormalizers use + // different loggers + public static void setVerbose(boolean verbose) { + if (verbose) { + logger.setLevel(Level.FINE); + } else { + logger.setLevel(Level.SEVERE); + } + } + + // Need these in order - first must come after 21st + //public static final Pattern teOrdinalWords = Pattern.compile("(?i)(tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)"); + //static final Pattern teNumOrds = Pattern.compile("(?i)([23]?1-?st|11-?th|[23]?2-?nd|12-?th|[12]?3-?rd|13-?th|[12]?[4-90]-?th|30-?th)"); + //static final Pattern unitNumsPattern = Pattern.compile("(?i)(one|two|three|four|five|six|seven|eight|nine)"); + //static final Pattern uniqueNumsPattern = Pattern.compile("(?i)(ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen)"); + //static final Pattern tensNumsPattern = Pattern.compile("(?i)(twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)"); + private static final Pattern numUnitPattern = Pattern.compile("(?i)(hundred|thousand|million|billion|trillion)"); + private static final Pattern numEndUnitPattern = Pattern.compile("(?i)(gross|dozen|score)"); + + /***********************/ + + private static final Pattern numberTermPattern = 
Pattern.compile("(?i)(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|thirtieth|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth|hundred?th|thousandth|millionth|billionth|trillionth)"); + private static final Pattern numberTermPattern2 = Pattern.compile("(?i)(" + numberTermPattern.pattern() + "(-" + numberTermPattern.pattern() + ")?)"); + private static final Pattern ordinalUnitPattern = Pattern.compile("(?i)(hundredth|thousandth|millionth)"); + + // private static final String[] unitWords = {"trillion", "billion", "million", "thousand", "hundred"}; + // private static final String[] endUnitWords = {"gross", "dozen", "score"}; + + // Converts numbers in words to numeric form + // works through trillions + protected static final Pattern digitsPattern = Pattern.compile("\\d+"); + private static final Pattern numPattern = Pattern.compile("[-+]?(?:\\d+(?:,\\d\\d\\d)*(?:\\.\\d*)?|\\.\\d+)"); + private static final Pattern numRangePattern = Pattern.compile("(" + numPattern.pattern() + ")-(" + numPattern.pattern() + ")"); + // private static final Pattern[] endUnitWordsPattern = new Pattern[endUnitWords.length]; + // private static final Pattern[] unitWordsPattern = new Pattern[unitWords.length]; + // static { + // int i = 0; + // for (String uw:endUnitWords) { + // endUnitWordsPattern[i] = Pattern.compile("(.*)\\s*" + Pattern.quote(uw) + "\\s*(.*)"); + // i++; + // } + // int ii = 0; + // for (String uw:unitWords) { + // unitWordsPattern[ii] = Pattern.compile("(.*)\\s*" + Pattern.quote(uw) + "\\s*(.*)"); + // ii++; + // } + // } + + // TODO: similar to QuantifiableEntityNormalizer.wordsToValues + // 
QuantifiableEntityNormalizer also has bn (for billion) + // should consolidate + // here we use Number representation instead of double... + private static final Map word2NumMap = Generics.newHashMap(); + static + { + // Special words for numbers + word2NumMap.put("dozen", 12); + word2NumMap.put("score", 20); + word2NumMap.put("gross", 144); + word2NumMap.put("quarter", 0.25); + word2NumMap.put("half", 0.5); + word2NumMap.put("oh", 0); + word2NumMap.put("a" , 1); + word2NumMap.put("an" , 1); + + // Standard words for numbers + word2NumMap.put("zero", 0); + word2NumMap.put("one", 1); + word2NumMap.put("two", 2); + word2NumMap.put("three", 3); + word2NumMap.put("four", 4); + word2NumMap.put("five", 5); + word2NumMap.put("six", 6); + word2NumMap.put("seven", 7); + word2NumMap.put("eight", 8); + word2NumMap.put("nine", 9); + word2NumMap.put("ten", 10); + word2NumMap.put("eleven", 11); + word2NumMap.put("twelve", 12); + word2NumMap.put("thirteen", 13); + word2NumMap.put("fourteen", 14); + word2NumMap.put("fifteen", 15); + word2NumMap.put("sixteen", 16); + word2NumMap.put("seventeen", 17); + word2NumMap.put("eighteen", 18); + word2NumMap.put("nineteen", 19); + word2NumMap.put("twenty", 20); + word2NumMap.put("thirty", 30); + word2NumMap.put("forty", 40); + word2NumMap.put("fifty", 50); + word2NumMap.put("sixty", 60); + word2NumMap.put("seventy", 70); + word2NumMap.put("eighty", 80); + word2NumMap.put("ninety", 90); + word2NumMap.put("hundred", 100); + word2NumMap.put("thousand", 1000); + word2NumMap.put("million", 1000000); + word2NumMap.put("billion", 1000000000); + word2NumMap.put("trillion", 1000000000000L); + } + + // similar to QuantifiableEntityNormalizer.ordinalsToValues + private static final Map ordWord2NumMap = Generics.newHashMap(); + static { + ordWord2NumMap.put("zeroth", 0); + ordWord2NumMap.put("first", 1); + ordWord2NumMap.put("second", 2); + ordWord2NumMap.put("third", 3); + ordWord2NumMap.put("fourth", 4); + ordWord2NumMap.put("fifth", 5); + 
ordWord2NumMap.put("sixth", 6); + ordWord2NumMap.put("seventh", 7); + ordWord2NumMap.put("eighth", 8); + ordWord2NumMap.put("ninth", 9); + ordWord2NumMap.put("tenth", 10); + ordWord2NumMap.put("eleventh", 11); + ordWord2NumMap.put("twelfth", 12); + ordWord2NumMap.put("thirteenth", 13); + ordWord2NumMap.put("fourteenth", 14); + ordWord2NumMap.put("fifteenth", 15); + ordWord2NumMap.put("sixteenth", 16); + ordWord2NumMap.put("seventeenth", 17); + ordWord2NumMap.put("eighteenth", 18); + ordWord2NumMap.put("nineteenth", 19); + ordWord2NumMap.put("twentieth", 20); + ordWord2NumMap.put("thirtieth", 30); + ordWord2NumMap.put("fortieth", 40); + ordWord2NumMap.put("fiftieth", 50); + ordWord2NumMap.put("sixtieth", 60); + ordWord2NumMap.put("seventieth", 70); + ordWord2NumMap.put("eightieth", 80); + ordWord2NumMap.put("ninetieth", 90); + ordWord2NumMap.put("hundredth", 100); + ordWord2NumMap.put("hundreth", 100); // really a spelling error + ordWord2NumMap.put("thousandth", 1000); + ordWord2NumMap.put("millionth", 1000000); + ordWord2NumMap.put("billionth", 1000000000); + ordWord2NumMap.put("trillionth", 1000000000000L); + } + + // Seems to work better than quantifiable entity normalizer's numeric conversion + private static final Pattern alphaPattern = Pattern.compile("([a-zA-Z]+)"); + private static final Pattern wsPattern = Pattern.compile("\\s+"); + + /** + * Fairly generous utility function to convert a string representing + * a number (hopefully) to a Number. + * Assumes that something else has somehow determined that the string + * makes ONE suitable number. + * The value of the number is determined by: + * 0. Breaking up the string into pieces using whitespace + * (stuff like "and", "-", "," is turned into whitespace); + * 1. Determining the numeric value of the pieces; + * 2. Finding the numeric value of each piece; + * 3. Combining the pieces together to form the overall value: + * a. Find the largest component and its value (X), + * b. 
Let B = overall value of pieces to the left (recursive), + * c. Let C = overall value of pieces to the right recursive), + * d. The overall value = B*X + C. + * + * @param str The String to convert + * @return numeric value of string + */ + public static Number wordToNumber(String str){ + if (str.trim().equals("")) { + return null; + } + + boolean neg = false; + + String originalString = str; + + // Trims and lowercases stuff + str = str.trim(); + str = str.toLowerCase(); + + if (str.startsWith("-")) { + neg = true; + } + + // eliminate hyphens, commas, and the word "and" + str = str.replaceAll("\\band\\b", " "); + str = str.replaceAll("-", " "); + str = str.replaceAll("(\\d),(\\d)", "$1$2"); // Maybe something like 4,233,000 ?? + str = str.replaceAll(",", " "); +// str = str.replaceAll("(\\d)(\\w)","$1 $2"); + + // Trims again (do we need this?) + str = str.trim(); + + // TODO: error checking.... + //if string starts with "a ", as in "a hundred", replace it with "one" + if (str.startsWith("a ")) { + str = str.replace("a", "one"); + } + + // cut off some trailing s + if (str.endsWith("sands")) { + // thousands + str = str.substring(0, str.length() - 1); + } else if (str.endsWith("ions")) { + // millions, billions, etc + str = str.substring(0, str.length() - 1); + } + + // now count words + String[] fields = wsPattern.split(str); + Number[] numFields = new Number[fields.length]; + int numWords = fields.length; + + // get numeric value of each word piece + for (int curIndex = 0; curIndex < numWords; curIndex++) { + String curPart = fields[curIndex]; + Matcher m = alphaPattern.matcher(curPart); + if (m.find()) { + // Some part of the word has alpha characters + Number curNum; + if (word2NumMap.containsKey(curPart)) { + curNum = word2NumMap.get(curPart); + } else if (ordWord2NumMap.containsKey(curPart)) { + if (curIndex == numWords-1){ + curNum = ordWord2NumMap.get(curPart); + } else { + throw new NumberFormatException("Error in wordToNumber function."); + } + } else 
if (curIndex > 0 && (curPart.endsWith("ths") || curPart.endsWith("rds"))) { + // Fractions? + curNum = ordWord2NumMap.get(curPart.substring(0, curPart.length()-1)); + if (curNum != null) { + curNum = 1/curNum.doubleValue(); + } else { + throw new NumberFormatException("Bad number put into wordToNumber. Word is: \"" + curPart + "\", originally part of \"" + originalString + "\", piece # " + curIndex); + } + } else if (Character.isDigit(curPart.charAt(0))) { + if (curPart.endsWith("th") || curPart.endsWith("rd") || curPart.endsWith("nd") || curPart.endsWith("st")) { + curPart = curPart.substring(0, curPart.length()-2); + } + if (digitsPattern.matcher(curPart).matches()) { + curNum = Long.parseLong(curPart); + } else{ + throw new NumberFormatException("Bad number put into wordToNumber. Word is: \"" + curPart + "\", originally part of \"" + originalString + "\", piece # " + curIndex); + } + } else { + throw new NumberFormatException("Bad number put into wordToNumber. Word is: \"" + curPart + "\", originally part of \"" + originalString + "\", piece # " + curIndex); + } + numFields[curIndex] = curNum; + } else { + // Word is all numeric + if (digitsPattern.matcher(curPart).matches()) { + numFields[curIndex] = Long.parseLong(curPart); + } else if (numPattern.matcher(curPart).matches()) { + numFields[curIndex] = new BigDecimal(curPart); + } else { + // Hmm, strange number + throw new NumberFormatException("Bad number put into wordToNumber. Word is: \"" + curPart + "\", originally part of \"" + originalString + "\", piece # " + curIndex); + } + } + } + Number n = wordToNumberRecurse(numFields); + return (neg)? 
-n.doubleValue():n; + } + + private static Number wordToNumberRecurse(Number[] numFields) + { + return wordToNumberRecurse(numFields, 0, numFields.length); + } + + private static Number wordToNumberRecurse(Number[] numFields, int start, int end) + { + // return solitary number + if (end <= start) return 0; + if (end - start == 1) { + return numFields[start]; + } + + // first, find highest number in string + Number highestNum = Double.NEGATIVE_INFINITY; + int highestNumIndex = start; + for (int i = start; i < end; i++) { + Number curNum = numFields[i]; + if (curNum != null && curNum.doubleValue() >= highestNum.doubleValue()){ + highestNum = curNum; + highestNumIndex = i; + } + } + + Number beforeNum = 1; + if (highestNumIndex > start) { + beforeNum = wordToNumberRecurse(numFields, start, highestNumIndex); + if (beforeNum == null) beforeNum = 1; + } + Number afterNum = wordToNumberRecurse(numFields, highestNumIndex+1, end); + if (afterNum == null) afterNum = 0; + + // TODO: Everything is treated as double... losing precision information here + // Sufficient for now + // Should we usually use BigDecimal to do our calculations? + // There are also fractions to consider. 
+ Number evaluatedNumber = ((beforeNum.doubleValue() * highestNum.doubleValue()) + afterNum.doubleValue()); + return evaluatedNumber; + } + + public static Env getNewEnv() + { + Env env = TokenSequencePattern.getNewEnv(); + + // Do case insensitive matching + env.setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE); + + initEnv(env); + return env; + } + + public static void initEnv(Env env) + { + // Custom binding for numeric values expressions + env.bind("numtype", CoreAnnotations.NumericTypeAnnotation.class); + env.bind("numvalue", CoreAnnotations.NumericValueAnnotation.class); + env.bind("numcomptype", CoreAnnotations.NumericCompositeTypeAnnotation.class); + env.bind("numcompvalue", CoreAnnotations.NumericCompositeValueAnnotation.class); + env.bind("$NUMCOMPTERM", " [ { numcomptype::EXISTS } & !{ numcomptype:NUMBER_RANGE } ] "); + env.bind("$NUMTERM", " [ { numtype::EXISTS } & !{ numtype:NUMBER_RANGE } ] "); + env.bind("$NUMRANGE", " [ { numtype:NUMBER_RANGE } ] "); + // TODO: Improve code to only recognize integers + env.bind("$INTTERM", " [ { numtype::EXISTS } & !{ numtype:NUMBER_RANGE } & !{ word:/.*\\.\\d+.*/} ] "); + env.bind("$POSINTTERM", " [ { numvalue>0 } & !{ word:/.*\\.\\d+.*/} ] "); + env.bind("$ORDTERM", " [ { numtype:ORDINAL } ] "); + env.bind("$BEFORE_WS", " [ { before:/\\s*/ } | !{ before::EXISTS} ]"); + env.bind("$AFTER_WS", " [ { after:/\\s*/ } | !{ after::EXISTS} ]"); + env.bind("$BEFORE_AFTER_WS", " [ $BEFORE_WS & $AFTER_WS ]"); + } + + private static final Env env = getNewEnv(); + + /** + * Find and mark numbers (does not need NumberSequenceClassifier) + * Each token is annotated with the numeric value and type + * - CoreAnnotations.NumericTypeAnnotation.class: ORDINAL, UNIT (hundred, thousand,..., dozen, gross,...), NUMBER + * - CoreAnnotations.NumericValueAnnotation.class: Number representing the numeric value of the token + * ( two thousand => 2 1000 ) + * + * Tries also to separate individual numbers like four five six, + * while 
keeping numbers like four hundred and seven together + * Annotate tokens belonging to each composite number with + * - CoreAnnotations.NumericCompositeTypeAnnotation.class: ORDINAL (1st, 2nd), NUMBER (one hundred) + * - CoreAnnotations.NumericCompositeValueAnnotation.class: Number representing the composite numeric value + * ( two thousand => 2000 2000 ) + * + * Also returns list of CoreMap representing the identified numbers + * + * The function is overly aggressive in marking possible numbers + * - should either do more checks or use in conjunction with NumberSequenceClassifier + * to avoid marking certain tokens (like second/NN) as numbers... + * + * @param annotation The annotation structure + * @return list of CoreMap representing the identified numbers + */ + public static List findNumbers(CoreMap annotation) + { + List tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); + for (CoreLabel token:tokens) { + String w = token.word(); + w = w.trim().toLowerCase(); + + if (/*("CD".equals(token.get(CoreAnnotations.PartOfSpeechAnnotation.class)) || */ + NumberNormalizer.numPattern.matcher(w).matches() || NumberNormalizer.numberTermPattern2.matcher(w).matches() || + NumberSequenceClassifier.ORDINAL_PATTERN.matcher(w).matches() || NumberNormalizer.numEndUnitPattern.matcher(w).matches()) { + // TODO: first ADVERB and second NN shouldn't be marked as ordinals + // But maybe we don't care, this can just mark the potential numbers, something else can disregard those + try { + token.set(CoreAnnotations.NumericValueAnnotation.class, NumberNormalizer.wordToNumber(w)); + if (NumberSequenceClassifier.ORDINAL_PATTERN.matcher(w).find()) { + token.set(CoreAnnotations.NumericTypeAnnotation.class, "ORDINAL"); + } else if (NumberNormalizer.numUnitPattern.matcher(w).matches()) { + token.set(CoreAnnotations.NumericTypeAnnotation.class, "UNIT"); + } else if (NumberNormalizer.numEndUnitPattern.matcher(w).matches()) { + token.set(CoreAnnotations.NumericTypeAnnotation.class, 
"UNIT"); + } else { + token.set(CoreAnnotations.NumericTypeAnnotation.class, "NUMBER"); + } + } catch (Exception ex) { + logger.warning("Error interpreting number " + w + ": " + ex.getMessage()); + } + } + } + // TODO: Should we allow "," in written out numbers? + // TODO: Handle "-" that is not with token? + TokenSequencePattern pattern = TokenSequencePattern.compile( + env, "$NUMTERM ( [/,/ & $BEFORE_WS]? [$POSINTTERM & $BEFORE_WS] )* ( [/,/ & $BEFORE_WS]? [/and/ & $BEFORE_WS] [$POSINTTERM & $BEFORE_WS]+ )? "); +// env, "$NUMTERM ( $POSINTTERM /,/? )* ( /and/ $POSINTTERM+ )? "); + TokenSequenceMatcher matcher = pattern.getMatcher(tokens); + List numbers = new ArrayList(); + while (matcher.find()) { + @SuppressWarnings("unused") + List matchedTokens = matcher.groupNodes(); + int numStart = matcher.start(); + int possibleNumEnd = -1; + int lastUnitPos = -1; + int possibleNumStart = -1; + Number possibleNumEndUnit = null; + Number lastUnit = null; + // Check if we need to split matched chunk up more + for (int i = matcher.start(); i < matcher.end(); i++) { + CoreLabel token = tokens.get(i); + CoreLabel prev = (i > matcher.start())? tokens.get(i - 1): null; + Number num = token.get(CoreAnnotations.NumericValueAnnotation.class); + Number prevNum = (prev != null)? 
prev.get(CoreAnnotations.NumericValueAnnotation.class):null; + String w = token.word(); + w = w.trim().toLowerCase(); + if (",".equals(w)) { + if (lastUnit != null && lastUnitPos == i-1) { + // OKAY, this may be one big number + possibleNumEnd = i; + possibleNumEndUnit = lastUnit; + } else { + // Not one big number + if (numStart < i) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i)); + numStart = i+1; + possibleNumEnd = -1; + possibleNumEndUnit = null; + lastUnit = null; + lastUnitPos = -1; + } + } + if (numStart == i) { + numStart = i+1; + } + } else if ("and".equals(w)) { + // Check if number before and was unit + String prevWord = prev.word(); + if (lastUnitPos == i-1 || (lastUnitPos == i-2 && ",".equals(prevWord))) { + // Okay + } else { + // Two separate numbers + if (numStart < possibleNumEnd) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd)); + if (possibleNumStart >= possibleNumEnd) { + numStart = possibleNumStart; + } else { + numStart = i+1; + } + } else if (numStart < i) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i)); + numStart = i+1; + } + if (lastUnitPos < numStart) { + lastUnit = null; + lastUnitPos = -1; + } + possibleNumEnd = -1; + possibleNumEndUnit = null; + } + } else { + // NUMBER or ORDINAL + String numType = token.get(CoreAnnotations.NumericTypeAnnotation.class); + if ("UNIT".equals(numType)) { + // Compare this unit with previous + if (lastUnit == null || lastUnit.longValue() > num.longValue()) { + // lastUnit larger than this unit + // maybe four thousand two hundred? 
+ // OKAY, probably one big number + } else { + if (numStart < possibleNumEnd) { + // Units are increasing - check if this unit is >= unit before "," (if so, need to split into chunks) + // Not one big number ( had a comma ) + if (num.longValue() >= possibleNumEndUnit.longValue()) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd)); + if (possibleNumStart >= possibleNumEnd) { + numStart = possibleNumStart; + } else { + numStart = i; + } + possibleNumEnd = -1; + possibleNumEndUnit = null; + } + } else { + // unit is increasing - can be okay, maybe five hundred thousand? + // what about four hundred five thousand + // unit might also be the same, as in thousand thousand, + // which we convert to million + } + } + lastUnit = num; + lastUnitPos = i; + } else { + // Normal number + if (num == null) { + logger.warning("NO NUMBER: " + token.word()); + continue; + } + if (prevNum != null) { + if (num.doubleValue() > 0) { + if (num.doubleValue() < 10) { + // This number is a digit + // Treat following as two separate numbers + // \d+ [0-9] + // [one to nine] [0-9] + if (NumberNormalizer.numPattern.matcher(prev.word()).matches() || + prevNum.longValue() < 10 || prevNum.longValue() % 10 != 0 ) { + // two separate numbers + if (numStart < i) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i)); + } + numStart = i; + possibleNumEnd = -1; + possibleNumEndUnit = null; + lastUnit = null; + lastUnitPos = -1; + } + } else { + String prevNumType = prev.get(CoreAnnotations.NumericTypeAnnotation.class); + if ("UNIT".equals(prevNumType)) { + // OKAY + } else if (!ordinalUnitPattern.matcher(w).matches()) { + // Start of new number + if (numStart < i) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i)); + } + numStart = i; + possibleNumEnd = -1; + possibleNumEndUnit = null; + lastUnit = null; + lastUnitPos = -1; + } + } + } + } + if ("ORDINAL".equals(numType)) { + if (possibleNumEnd >= 0) { 
+ if (numStart < possibleNumEnd) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd)); + } + if (possibleNumStart > possibleNumEnd) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumStart, i+1)); + } else { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumEnd+1, i+1)); + } + } else { + if (numStart < i+1) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i+1)); + } + } + numStart = i+1; + possibleNumEnd = -1; + possibleNumEndUnit = null; + lastUnit = null; + lastUnitPos = -1; + } + if (possibleNumStart < possibleNumEnd) { + possibleNumStart = i; + } + } + } + } + if (numStart < matcher.end()) { + numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, matcher.end())); + } + } + for (CoreMap n:numbers) { + String exp = n.get(CoreAnnotations.TextAnnotation.class); + List ts = n.get(CoreAnnotations.TokensAnnotation.class); + String label = ts.get(ts.size() - 1).get(CoreAnnotations.NumericTypeAnnotation.class); + if ("UNIT".equals(label)) { + label = "NUMBER"; + } + try { + Number num = NumberNormalizer.wordToNumber(exp); + if (num == null) { + logger.warning("NO NUMBER FOR: \"" + exp + "\""); + } + n.set(CoreAnnotations.NumericCompositeValueAnnotation.class, num); + n.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, label); + for (CoreLabel t:ts) { + t.set(CoreAnnotations.NumericCompositeValueAnnotation.class, num); + t.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, label); + } + } catch (NumberFormatException ex) { + logger.log(Level.WARNING, "Invalid number for: \"" + exp + "\"", ex); + } + } + return numbers; + } + + /** + * Find and mark number ranges + * Ranges are NUM1 [-|to] NUM2 where NUM2 > NUM1 + * + * Each number range is marked with + * - CoreAnnotations.NumericTypeAnnotation.class: NUMBER_RANGE + * - CoreAnnotations.NumericObjectAnnotation.class: {@code Pair} representing the start/end of the range 
+ * + * @param annotation - annotation where numbers have already been identified + * @return list of CoreMap representing the identified number ranges + */ + public static List findNumberRanges(CoreMap annotation) + { + List numerizedTokens = annotation.get(CoreAnnotations.NumerizedTokensAnnotation.class); + for (CoreMap token:numerizedTokens) { + String w = token.get(CoreAnnotations.TextAnnotation.class); + w = w.trim().toLowerCase(); + Matcher rangeMatcher = NumberNormalizer.numRangePattern.matcher(w); + if (rangeMatcher.matches()) { + try { + String w1 = rangeMatcher.group(1); + String w2 = rangeMatcher.group(2); + Number v1 = NumberNormalizer.wordToNumber(w1); + Number v2 = NumberNormalizer.wordToNumber(w2); + if (v2.doubleValue() > v1.doubleValue()) { + token.set(CoreAnnotations.NumericTypeAnnotation.class, "NUMBER_RANGE"); + token.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, "NUMBER_RANGE"); + Pair range = new Pair(v1,v2); + token.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range); + } + } catch (Exception ex) { + logger.warning("Error interpreting number range " + w + ": " + ex.getMessage()); + } + } + } + List numberRanges = new ArrayList(); + TokenSequencePattern pattern = TokenSequencePattern.compile(env, "(?:$NUMCOMPTERM /-|to/ $NUMCOMPTERM) | $NUMRANGE"); + TokenSequenceMatcher matcher = pattern.getMatcher(numerizedTokens); + while (matcher.find()) { + List matched = matcher.groupNodes(); + if (matched.size() == 1) { + numberRanges.add(matched.get(0)); + } else { + Number v1 = matched.get(0).get(CoreAnnotations.NumericCompositeValueAnnotation.class); + Number v2 = matched.get(matched.size()-1).get(CoreAnnotations.NumericCompositeValueAnnotation.class); + if (v2.doubleValue() > v1.doubleValue()) { + CoreMap newChunk = ChunkAnnotationUtils.getMergedChunk(numerizedTokens, matcher.start(), matcher.end(), + CoreMapAttributeAggregator.getDefaultAggregators()); + newChunk.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, 
"NUMBER_RANGE"); + Pair range = new Pair(v1,v2); + newChunk.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range); + numberRanges.add(newChunk); + } + } + } + return numberRanges; + } + + /** + * Takes annotation and identifies numbers in the annotation + * Returns a list of tokens (as CoreMaps) with numbers merged + * As by product, also marks each individual token with the TokenBeginAnnotation and TokenEndAnnotation + * - this is mainly to make it easier to the rest of the code to figure out what the token offsets are. + * + * Note that this copies the annotation, since it modifies token offsets in the original + * @param annotationRaw The annotation to find numbers in + * @return list of CoreMap representing the identified numbers + */ + public static List findAndMergeNumbers(CoreMap annotationRaw){ + //copy annotation to preserve its integrity + CoreMap annotation = new ArrayCoreMap(annotationRaw); + // Find and label numbers + List numbers = NumberNormalizer.findNumbers(annotation); + CoreMapAggregator numberAggregator = CoreMapAggregator.getAggregator(CoreMapAttributeAggregator.DEFAULT_NUMERIC_AGGREGATORS, CoreAnnotations.TokensAnnotation.class); + + // We are going to mark the token begin and token end for each token so we can more easily deal with + // ensuring correct token offsets for merging + //get sentence offset + Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class); + if (startTokenOffset == null) { + startTokenOffset = 0; + } + //set token offsets + int i = 0; + List savedTokenBegins = new LinkedList(); + List savedTokenEnds = new LinkedList(); + for (CoreMap c:annotation.get(CoreAnnotations.TokensAnnotation.class)) { + //set token begin + if( (i==0 && c.get(CoreAnnotations.TokenBeginAnnotation.class) != null) || (i > 0 && !savedTokenBegins.isEmpty()) ){ + savedTokenBegins.add(c.get(CoreAnnotations.TokenBeginAnnotation.class)); + } + c.set(CoreAnnotations.TokenBeginAnnotation.class, i+startTokenOffset); 
+ i++; + //set token end + if( (i==1 && c.get(CoreAnnotations.TokenEndAnnotation.class) != null) || (i > 1 && !savedTokenEnds.isEmpty()) ){ + savedTokenEnds.add(c.get(CoreAnnotations.TokenEndAnnotation.class)); + } + c.set(CoreAnnotations.TokenEndAnnotation.class, i+startTokenOffset); + } + //merge numbers + final Integer startTokenOffsetFinal = startTokenOffset; + List mergedNumbers = numberAggregator.merge(annotation.get(CoreAnnotations.TokensAnnotation.class), numbers, + new Function>() { + @Override + public Interval apply(CoreMap in) { + return Interval.toInterval( + in.get(CoreAnnotations.TokenBeginAnnotation.class) - startTokenOffsetFinal, + in.get(CoreAnnotations.TokenEndAnnotation.class) - startTokenOffsetFinal); + } + }); + //restore token offsets + if (!savedTokenBegins.isEmpty() && !savedTokenEnds.isEmpty()) { + for (CoreMap c : mergedNumbers) { + // get new indices + int newBegin = c.get(CoreAnnotations.TokenBeginAnnotation.class) - startTokenOffset; + int newEnd = c.get(CoreAnnotations.TokenEndAnnotation.class) - startTokenOffset; + // get token offsets for those indices + c.set(CoreAnnotations.TokenBeginAnnotation.class, savedTokenBegins.get(newBegin)); + c.set(CoreAnnotations.TokenEndAnnotation.class, savedTokenEnds.get(newEnd-1)); + } + } + //return + return mergedNumbers; + } + + public static List findAndAnnotateNumericExpressions(CoreMap annotation) + { + List mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation); + annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers); + return mergedNumbers; + } + + public static List findAndAnnotateNumericExpressionsWithRanges(CoreMap annotation) + { + Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class); + if (startTokenOffset == null) { + startTokenOffset = 0; + } + List mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation); + annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers); + // Find and label 
number ranges + List numberRanges = NumberNormalizer.findNumberRanges(annotation); + final Integer startTokenOffsetFinal = startTokenOffset; + List mergedNumbersWithRanges = CollectionUtils.mergeListWithSortedMatchedPreAggregated( + annotation.get(CoreAnnotations.NumerizedTokensAnnotation.class), numberRanges, + new Function>() { + @Override + public Interval apply(CoreMap in) { + return Interval.toInterval( + in.get(CoreAnnotations.TokenBeginAnnotation.class) - startTokenOffsetFinal, + in.get(CoreAnnotations.TokenEndAnnotation.class) - startTokenOffsetFinal); + } + }); + annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbersWithRanges); + return mergedNumbersWithRanges; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/QuantifiableEntityNormalizer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/QuantifiableEntityNormalizer.java new file mode 100644 index 0000000..35a9f5d --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/QuantifiableEntityNormalizer.java @@ -0,0 +1,1434 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.ie.pascal.ISODateInstance; +import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.time.TimeAnnotations; +import edu.stanford.nlp.time.Timex; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.EditDistance; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; + +import static java.lang.System.err; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * Various methods for normalizing Money, Date, Percent, Time, and + * Number, Ordinal amounts. 
+ * These matchers are generous in that they try to quantify something + * that's already been labelled by an NER system; don't use them to make + * classification decisions. This class has a twin in the pipeline world: + * {@link edu.stanford.nlp.pipeline.QuantifiableEntityNormalizingAnnotator}. + * Please keep the substantive content here, however, so as to lessen code + * duplication. + *

    + * Implementation note: The extensive test code for this class is + * now in a separate JUnit Test class. This class depends on the background + * symbol for NER being the default background symbol. This should be fixed + * at some point. + * + * @author Chris Cox + * @author Christopher Manning (extended for RTE) + * @author Anna Rafferty + */ +public class QuantifiableEntityNormalizer { + + private static final boolean DEBUG = false; + private static final boolean DEBUG2 = false; // String normlz functions + + public static String BACKGROUND_SYMBOL = SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL; // this isn't a constant; it's set by the QuantifiableEntityNormalizingAnnotator + + private static final Pattern timePattern = Pattern.compile("([0-2]?[0-9])((?::[0-5][0-9]){0,2})([PpAa]\\.?[Mm]\\.?)?"); + + private static final Pattern moneyPattern = Pattern.compile("([$\u00A3\u00A5\u20AC#]?)(-?[0-9,]*)(\\.[0-9]*)?+"); + private static final Pattern scorePattern = Pattern.compile(" *([0-9]+) *- *([0-9]+) *"); + + //Collections of entity types + private static final Set quantifiable; //Entity types that are quantifiable + private static final Set collapseBeforeParsing; + private static final Set timeUnitWords; + private static final Map moneyMultipliers; + private static final Map moneyMultipliers2; + private static final Map currencyWords; + public static final ClassicCounter wordsToValues; + public static final ClassicCounter ordinalsToValues; + + static { + + quantifiable = Generics.newHashSet(); + quantifiable.add("MONEY"); + quantifiable.add("TIME"); + quantifiable.add("DATE"); + quantifiable.add("PERCENT"); + quantifiable.add("NUMBER"); + quantifiable.add("ORDINAL"); + quantifiable.add("DURATION"); + + collapseBeforeParsing = Generics.newHashSet(); + collapseBeforeParsing.add("PERSON"); + collapseBeforeParsing.add("ORGANIZATION"); + collapseBeforeParsing.add("LOCATION"); + + timeUnitWords = Generics.newHashSet(); + timeUnitWords.add("second"); + 
timeUnitWords.add("seconds"); + timeUnitWords.add("minute"); + timeUnitWords.add("minutes"); + timeUnitWords.add("hour"); + timeUnitWords.add("hours"); + timeUnitWords.add("day"); + timeUnitWords.add("days"); + timeUnitWords.add("week"); + timeUnitWords.add("weeks"); + timeUnitWords.add("month"); + timeUnitWords.add("months"); + timeUnitWords.add("year"); + timeUnitWords.add("years"); + + currencyWords = Generics.newHashMap(); + currencyWords.put("dollars?", '$'); + currencyWords.put("cents?", '$'); + currencyWords.put("pounds?", '\u00A3'); + currencyWords.put("pence|penny", '\u00A3'); + currencyWords.put("yen", '\u00A5'); + currencyWords.put("euros?", '\u20AC'); + currencyWords.put("won", '\u20A9'); + currencyWords.put("\\$", '$'); + currencyWords.put("\u00A2", '$'); // cents + currencyWords.put("\u00A3", '\u00A3'); // pounds + currencyWords.put("#", '\u00A3'); // for Penn treebank + currencyWords.put("\u00A5", '\u00A5'); // Yen + currencyWords.put("\u20AC", '\u20AC'); // Euro + currencyWords.put("\u20A9", '\u20A9'); // Won + currencyWords.put("yuan", '\u5143'); // Yuan + + moneyMultipliers = Generics.newHashMap(); + moneyMultipliers.put("trillion", 1000000000000.0); // can't be an integer + moneyMultipliers.put("billion",1000000000.0); + moneyMultipliers.put("bn",1000000000.0); + moneyMultipliers.put("million", 1000000.0); + moneyMultipliers.put("thousand", 1000.0); + moneyMultipliers.put("hundred", 100.0); + moneyMultipliers.put("b.", 1000000000.0); + moneyMultipliers.put("m.", 1000000.0); + moneyMultipliers.put(" m ",1000000.0); + moneyMultipliers.put(" k ",1000.0); + + moneyMultipliers2 = Generics.newHashMap(); + moneyMultipliers2.put("[0-9](m)(?:[^a-zA-Z]|$)", 1000000); + moneyMultipliers2.put("[0-9](b)(?:[^a-zA-Z]|$)", 1000000000); + + wordsToValues = new ClassicCounter(); + wordsToValues.setCount("zero", 0.0); + wordsToValues.setCount("one", 1.0); + wordsToValues.setCount("two", 2.0); + wordsToValues.setCount("three", 3.0); + wordsToValues.setCount("four", 
4.0); + wordsToValues.setCount("five", 5.0); + wordsToValues.setCount("six", 6.0); + wordsToValues.setCount("seven", 7.0); + wordsToValues.setCount("eight", 8.0); + wordsToValues.setCount("nine", 9.0); + wordsToValues.setCount("ten", 10.0); + wordsToValues.setCount("eleven", 11.0); + wordsToValues.setCount("twelve", 12.0); + wordsToValues.setCount("thirteen", 13.0); + wordsToValues.setCount("fourteen", 14.0); + wordsToValues.setCount("fifteen", 15.0); + wordsToValues.setCount("sixteen", 16.0); + wordsToValues.setCount("seventeen", 17.0); + wordsToValues.setCount("eighteen", 18.0); + wordsToValues.setCount("nineteen", 19.0); + wordsToValues.setCount("twenty", 20.0); + wordsToValues.setCount("thirty", 30.0); + wordsToValues.setCount("forty", 40.0); + wordsToValues.setCount("fifty", 50.0); + wordsToValues.setCount("sixty", 60.0); + wordsToValues.setCount("seventy", 70.0); + wordsToValues.setCount("eighty", 80.0); + wordsToValues.setCount("ninety", 90.0); + wordsToValues.setCount("hundred", 100.0); + wordsToValues.setCount("thousand", 1000.0); + wordsToValues.setCount("million", 1000000.0); + wordsToValues.setCount("billion", 1000000000.0); + wordsToValues.setCount("bn", 1000000000.0); + wordsToValues.setCount("trillion", 1000000000000.0); + wordsToValues.setCount("dozen", 12.0); + + ordinalsToValues = new ClassicCounter(); + ordinalsToValues.setCount("zeroth", 0.0); + ordinalsToValues.setCount("first", 1.0); + ordinalsToValues.setCount("second", 2.0); + ordinalsToValues.setCount("third", 3.0); + ordinalsToValues.setCount("fourth", 4.0); + ordinalsToValues.setCount("fifth", 5.0); + ordinalsToValues.setCount("sixth", 6.0); + ordinalsToValues.setCount("seventh", 7.0); + ordinalsToValues.setCount("eighth", 8.0); + ordinalsToValues.setCount("ninth", 9.0); + ordinalsToValues.setCount("tenth", 10.0); + ordinalsToValues.setCount("eleventh", 11.0); + ordinalsToValues.setCount("twelfth", 12.0); + ordinalsToValues.setCount("thirteenth", 13.0); + 
ordinalsToValues.setCount("fourteenth", 14.0); + ordinalsToValues.setCount("fifteenth", 15.0); + ordinalsToValues.setCount("sixteenth", 16.0); + ordinalsToValues.setCount("seventeenth", 17.0); + ordinalsToValues.setCount("eighteenth", 18.0); + ordinalsToValues.setCount("nineteenth", 19.0); + ordinalsToValues.setCount("twentieth", 20.0); + ordinalsToValues.setCount("twenty-first", 21.0); + ordinalsToValues.setCount("twenty-second", 22.0); + ordinalsToValues.setCount("twenty-third", 23.0); + ordinalsToValues.setCount("twenty-fourth", 24.0); + ordinalsToValues.setCount("twenty-fifth", 25.0); + ordinalsToValues.setCount("twenty-sixth", 26.0); + ordinalsToValues.setCount("twenty-seventh", 27.0); + ordinalsToValues.setCount("twenty-eighth", 28.0); + ordinalsToValues.setCount("twenty-ninth", 29.0); + ordinalsToValues.setCount("thirtieth", 30.0); + ordinalsToValues.setCount("thirty-first", 31.0); + ordinalsToValues.setCount("fortieth", 40.0); + ordinalsToValues.setCount("fiftieth", 50.0); + ordinalsToValues.setCount("sixtieth", 60.0); + ordinalsToValues.setCount("seventieth", 70.0); + ordinalsToValues.setCount("eightieth", 80.0); + ordinalsToValues.setCount("ninetieth", 90.0); + ordinalsToValues.setCount("hundredth", 100.0); + ordinalsToValues.setCount("thousandth", 1000.0); + ordinalsToValues.setCount("millionth", 1000000.0); + ordinalsToValues.setCount("billionth", 1000000000.0); + ordinalsToValues.setCount("trillionth", 1000000000000.0); + } + + private QuantifiableEntityNormalizer() {} // this is all static + + /** + * This method returns the closest match in set such that the match + * has more than three letters and differs from word only by one substitution, + * deletion, or insertion. If not match exists, returns null. + */ + private static String getOneSubstitutionMatch(String word, Set set) { + // TODO (?) pass the EditDistance around more places to make this + // more efficient. May not really matter. 
+ EditDistance ed = new EditDistance(); + for(String cur : set) { + if(isOneSubstitutionMatch(word, cur, ed)) + return cur; + } + return null; + } + + private static boolean isOneSubstitutionMatch(String word, String match, + EditDistance ed) { + if(word.equalsIgnoreCase(match)) + return true; + if(match.length() > 3) { + if(ed.score(word, match) <= 1) + return true; + } + return false; + } + + /** Convert the content of a List of CoreMaps to a single + * space-separated String. This grabs stuff based on the get(CoreAnnotations.NamedEntityTagAnnotation.class) field. + * [CDM: Changed to look at NamedEntityTagAnnotation not AnswerClass Jun 2010, hoping that will fix a bug.] + * + * @param l The List + * @return one string containing all words in the list, whitespace separated + */ + public static String singleEntityToString(List l) { + String entityType = l.get(0).get(CoreAnnotations.NamedEntityTagAnnotation.class); + StringBuilder sb = new StringBuilder(); + for (E w : l) { + assert(w.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(entityType)); + sb.append(w.get(CoreAnnotations.TextAnnotation.class)); + sb.append(' '); + } + return sb.toString(); + } + + + /** + * Currently this populates a List<CoreLabel> with words from the passed List, + * but NER entities are collapsed and {@link CoreLabel} constituents of entities have + * NER information in their "quantity" fields. + *

    + * NOTE: This now seems to be used nowhere. The collapsing is done elsewhere. + * That's probably appropriate; it doesn't seem like this should be part of + * QuantifiableEntityNormalizer, since it's set to collapse non-quantifiable + * entities.... + * + * @param l a list of CoreLabels with NER labels, + * @return a Sentence where PERSON, ORG, LOC, entities are collapsed. + */ + public static List collapseNERLabels(List l){ + if(DEBUG) { + for (CoreLabel w: l) { + System.err.println("<<"+w.get(CoreAnnotations.TextAnnotation.class)+"::"+w.get(CoreAnnotations.PartOfSpeechAnnotation.class)+"::"+w.get(CoreAnnotations.NamedEntityTagAnnotation.class)+">>"); + } + } + + List s = new ArrayList(); + String lastEntity = BACKGROUND_SYMBOL; + StringBuilder entityStringCollector = null; + + //Iterate through each word.... + for (CoreLabel w: l) { + String entityType = w.get(CoreAnnotations.NamedEntityTagAnnotation.class); + //if we've just completed an entity and we're looking at a non-continuation, + //we want to add that now. + if (entityStringCollector != null && ! entityType.equals(lastEntity)) { + CoreLabel nextWord = new CoreLabel(); + nextWord.setWord(entityStringCollector.toString()); + nextWord.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP"); + nextWord.set(CoreAnnotations.NamedEntityTagAnnotation.class, lastEntity); + s.add(nextWord); + if (DEBUG) { + err.print("Quantifiable: Collapsing "); + err.println(entityStringCollector.toString()); + } + entityStringCollector = null; + } + //If its not to be collapsed, toss it onto the sentence. + if ( ! collapseBeforeParsing.contains(entityType)) { + s.add(w); + } else { //If it is to be collapsed.... + //if its a continuation of the last entity, add it to the + //current buffer. 
+ if (entityType.equals(lastEntity)){ + assert entityStringCollector != null; + entityStringCollector.append('_'); + entityStringCollector.append(w.get(CoreAnnotations.TextAnnotation.class)); + } else { + //and its NOT a continuation, make a new buffer. + entityStringCollector = new StringBuilder(); + entityStringCollector.append(w.get(CoreAnnotations.TextAnnotation.class)); + } + } + lastEntity=entityType; + } + // if the last token was a named-entity, we add it here. + if (entityStringCollector!=null) { + CoreLabel nextWord = new CoreLabel(); + nextWord.setWord(entityStringCollector.toString()); + nextWord.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP"); + nextWord.set(CoreAnnotations.NamedEntityTagAnnotation.class, lastEntity); + s.add(nextWord); + } + for (CoreLabel w : s) { + System.err.println("<<"+w.get(CoreAnnotations.TextAnnotation.class)+"::"+w.get(CoreAnnotations.PartOfSpeechAnnotation.class)+"::"+w.get(CoreAnnotations.NamedEntityTagAnnotation.class)+">>"); + } + return s; + } + + + /** + * Provided for backwards compatibility; see normalizedDateString(s, openRangeMarker) + */ + static String normalizedDateString(String s, Timex timexFromSUTime) { + return normalizedDateString(s, ISODateInstance.NO_RANGE, timexFromSUTime); + } + + /** + * Returns a string that represents either a single date or a range of + * dates. Representation pattern is roughly ISO8601, with some extensions + * for greater expressivity; see {@link ISODateInstance} for details. 
+ * @param s Date string to normalize + * @param openRangeMarker a marker for whether this date is not involved in + * an open range, is involved in an open range that goes forever backward and + * stops at s, or is involved in an open range that goes forever forward and + * starts at s + * @return A yyyymmdd format normalized date + */ + static String normalizedDateString(String s, String openRangeMarker, Timex timexFromSUTime) { + if(timexFromSUTime != null) { + if(timexFromSUTime.value() != null){ + // fully disambiguated temporal + return timexFromSUTime.value(); + } else { + // this is a relative date, e.g., "yesterday" + return timexFromSUTime.altVal(); + } + } + + ISODateInstance d = new ISODateInstance(s, openRangeMarker); + if (DEBUG2) err.println("normalizeDate: " + s + " to " + d.getDateString()); + return (d.getDateString()); + } + + /** + * Tries to heuristically determine if the given word is a year + */ + static boolean isYear(CoreMap word) { + String wordString = word.get(CoreAnnotations.TextAnnotation.class); + if(word.get(CoreAnnotations.PartOfSpeechAnnotation.class) == null || word.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) { + //one possibility: it's a two digit year with an apostrophe: '90 + if(wordString.length() == 3 && wordString.startsWith("'")) { + if (DEBUG) { + System.err.println("Found potential two digit year: " + wordString); + } + wordString = wordString.substring(1); + try { + Integer.parseInt(wordString); + return true; + } catch(Exception e) { + return false; + } + } + //if it is 4 digits, with first one <3 (usually we're not talking about + //the far future, say it's a year + if(wordString.length() == 4) { + try { + int num = Integer.parseInt(wordString); + if(num < 3000) + return true; + } catch(Exception e) { + return false; + } + } + } + return false; + } + + + private static final String dateRangeAfterOneWord = "after|since"; + private static final String dateRangeBeforeOneWord = "before|until"; + private 
static final List> dateRangeBeforePairedOneWord; + static { + dateRangeBeforePairedOneWord = new ArrayList>(); + dateRangeBeforePairedOneWord.add(new Pair("between", "and")); + dateRangeBeforePairedOneWord.add(new Pair("from", "to")); + dateRangeBeforePairedOneWord.add(new Pair("from", "-")); + } + + private static final String datePrepositionAfterWord = "in|of"; + + + /** + * Takes the strings of the one previous and 3 next words to a date to + * detect date range modifiers like "before" or "between \ and \ + * @param + */ + private static String detectDateRangeModifier(List date, List list, int beforeIndex, int afterIndex) { + E prev = (beforeIndex >= 0) ? list.get(beforeIndex) : null; + int sz = list.size(); + E next = (afterIndex < sz) ? list.get(afterIndex) : null; + E next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1) : null; + E next3 = (afterIndex + 2 < sz) ? list.get(afterIndex + 2) : null; + + + if (DEBUG) { + err.println("DateRange: previous: " + prev); + err.println("Quantifiable: next: " + next + ' ' + next2 + ' ' + next3); + } + + //sometimes the year gets tagged as CD but not as a date - if this happens, we want to add it in + if (next != null && isYear(next)) { + date.add(next); + next.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE"); + afterIndex++; + } + if (next2 != null && isYear(next2)) { + date.add(next); + assert(next != null); // keep the static analysis happy. + next.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE"); + date.add(next2); + next2.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE"); + afterIndex += 2; + } + + //sometimes the date will be stated in a form like "June of 1984" -> we'd like this to be 198406 + if(next != null && next.get(CoreAnnotations.TextAnnotation.class).matches(datePrepositionAfterWord)) { + //check if the next next word is a year or month + if(next2 != null && (isYear(next2))) {//TODO: implement month! 
+ date.add(next); + date.add(next2); + afterIndex += 2; + } + } + + //String range = detectTwoSidedRangeModifier(date.get(0), list, beforeIndex, afterIndex); + //if(range !=ISODateInstance.NO_RANGE) return range; + //check if it's an open range - two sided ranges get checked elsewhere + //based on the prev word + if(prev != null) { + String prevWord = prev.get(CoreAnnotations.TextAnnotation.class).toLowerCase(); + if(prevWord.matches(dateRangeBeforeOneWord)) { + //we have an open range of the before type - e.g., Before June 6, John was 5 + prev.set(CoreAnnotations.PartOfSpeechAnnotation.class, "DATE_MOD"); + return ISODateInstance.OPEN_RANGE_BEFORE; + } else if(prevWord.matches(dateRangeAfterOneWord)) { + //we have an open range of the after type - e.g., After June 6, John was 6 + prev.set(CoreAnnotations.PartOfSpeechAnnotation.class, "DATE_MOD"); + return ISODateInstance.OPEN_RANGE_AFTER; + } + } + + + return ISODateInstance.NO_RANGE; + } + + /** + * This should detect things like "between 5 and 5 million" and "from April 3 to June 6" + * Each side of the range is tagged with the correct numeric quantity (e.g., 5/5x10E6 or + * ****0403/****0606) and the other words (e.g., "between", "and", "from", "to") are + * tagged as quantmod to avoid penalizing them for lack of alignment/matches. + * + * This method should be called after other collapsing is complete (e.g. 5 million should already be + * concatenated) + * @param + */ + private static List detectTwoSidedRangeModifier(E firstDate, List list, int beforeIndex, int afterIndex, boolean concatenate) { + E prev = (beforeIndex >= 0) ? list.get(beforeIndex) : null; + //E cur = list.get(0); + int sz = list.size(); + E next = (afterIndex < sz) ? list.get(afterIndex) : null; + E next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1) : null; + List toRemove = new ArrayList(); + + String curNER = (firstDate == null ? 
"" : firstDate.get(CoreAnnotations.NamedEntityTagAnnotation.class)); + if(curNER == null) curNER = ""; + if(firstDate == null || firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) == null) return toRemove; + //TODO: make ranges actually work + //first check if it's of the form "between and "/etc + + if (prev != null) { + for (Pair ranges : dateRangeBeforePairedOneWord) { + if (prev.get(CoreAnnotations.TextAnnotation.class).matches(ranges.first())) { + if (next != null && next2 != null) { + String nerNext2 = next2.get(CoreAnnotations.NamedEntityTagAnnotation.class); + if (next.get(CoreAnnotations.TextAnnotation.class).matches(ranges.second()) && nerNext2 != null && nerNext2.equals(curNER)) { + //Add rest in + prev.set(CoreAnnotations.PartOfSpeechAnnotation.class, "QUANT_MOD"); + String rangeString; + if(curNER.equals("DATE")) { + ISODateInstance c = new ISODateInstance(new ISODateInstance(firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class)), + new ISODateInstance(next2.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class))); + rangeString = c.getDateString(); + } else { + rangeString = firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + '-' + next2.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class); + } + if (DEBUG) { + System.err.println("#1: Changing normalized NER from " + firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + rangeString + " at index " + beforeIndex); + } + firstDate.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, rangeString); + if (DEBUG) { + System.err.println("#2: Changing normalized NER from " + next2.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + rangeString + " at index " + afterIndex); + } + next2.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, rangeString); + next.set(CoreAnnotations.NamedEntityTagAnnotation.class, nerNext2); + if (DEBUG) { + System.err.println("#3: Changing 
normalized NER from " + next.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + rangeString + " at index " + (afterIndex + 1)); + } + next.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, rangeString); + if (concatenate) { + List numberWords = new ArrayList(); + numberWords.add(firstDate); + numberWords.add(next); + numberWords.add(next2); + concatenateNumericString(numberWords, toRemove); + + } + } + } + } + } + } + return toRemove; + } + + /** + * Concatenates separate words of a date or other numeric quantity into one node (e.g., 3 November -> 3_November) + * Tag is CD or NNP, and other words are added to the remove list + */ + static void concatenateNumericString(List words, List toRemove) { + if (words.size() <= 1) return; + boolean first = true; + StringBuilder newText = new StringBuilder(); + E foundEntity = null; + for (E word : words) { + if (foundEntity == null && (word.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") || word.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP"))) { + foundEntity = word; + } + if (first) { + first = false; + } else { + newText.append('_'); + } + newText.append(word.get(CoreAnnotations.TextAnnotation.class)); + } + if (foundEntity == null) { + foundEntity = words.get(0);//if we didn't find one with the appropriate tag, just take the first one + } + toRemove.addAll(words); + toRemove.remove(foundEntity); + foundEntity.set(CoreAnnotations.PartOfSpeechAnnotation.class, "CD"); // cdm 2008: is this actually good for dates?? 
+ String collapsed = newText.toString(); + foundEntity.set(CoreAnnotations.TextAnnotation.class, collapsed); + foundEntity.set(CoreAnnotations.OriginalTextAnnotation.class, collapsed); + } + + + public static String normalizedTimeString(String s, Timex timexFromSUTime) { + return normalizedTimeString(s, null, timexFromSUTime); + } + + public static String normalizedTimeString(String s, String ampm, Timex timexFromSUTime) { + if(timexFromSUTime != null){ + if(timexFromSUTime.value() != null){ + // this timex is fully disambiguated + return timexFromSUTime.value(); + } else { + // not disambiguated; contains some relative date + return timexFromSUTime.altVal(); + } + } + + if (DEBUG2) err.println("normalizingTime: " + s); + s = s.replaceAll("[ \t\n\0\f\r]", ""); + Matcher m = timePattern.matcher(s); + if (s.equalsIgnoreCase("noon")) { + return "12:00pm"; + } else if (s.equalsIgnoreCase("midnight")) { + return "00:00am"; // or "12:00am" ? + } else if (s.equalsIgnoreCase("morning")) { + return "M"; + } else if (s.equalsIgnoreCase("afternoon")) { + return "A"; + } else if (s.equalsIgnoreCase("evening")) { + return "EN"; + } else if (s.equalsIgnoreCase("night")) { + return "N"; + } else if (s.equalsIgnoreCase("day")) { + return "D"; + } else if (s.equalsIgnoreCase("suppertime")) { + return "EN"; + } else if (s.equalsIgnoreCase("lunchtime")) { + return "MD"; + } else if (s.equalsIgnoreCase("midday")) { + return "MD"; + } else if (s.equalsIgnoreCase("teatime")) { + return "A"; + } else if (s.equalsIgnoreCase("dinnertime")) { + return "EN"; + } else if (s.equalsIgnoreCase("dawn")) { + return "EM"; + } else if (s.equalsIgnoreCase("dusk")) { + return "EN"; + } else if (s.equalsIgnoreCase("sundown")) { + return "EN"; + } else if (s.equalsIgnoreCase("sunup")) { + return "EM"; + } else if (s.equalsIgnoreCase("daybreak")) { + return "EM"; + } else if (m.matches()) { + if (DEBUG2) { + err.printf("timePattern matched groups: |%s| |%s| |%s| |%s|\n", m.group(0), m.group(1), 
m.group(2), m.group(3)); + } + // group 1 is hours, group 2 is minutes and maybe seconds; group 3 is am/pm + StringBuilder sb = new StringBuilder(); + sb.append(m.group(1)); + if (m.group(2) == null || "".equals(m.group(2))) { + sb.append(":00"); + } else { + sb.append(m.group(2)); + } + if (m.group(3) != null) { + String suffix = m.group(3); + suffix = suffix.replaceAll("\\.", ""); + suffix = suffix.toLowerCase(); + sb.append(suffix); + } else if (ampm != null) { + sb.append(ampm); + } else { + // Do nothing; leave ambiguous + // sb.append("pm"); + } + if (DEBUG2) { + err.println("normalizedTimeString new str: " + sb.toString()); + } + return sb.toString(); + } else if (DEBUG) { + err.println("Quantifiable: couldn't normalize " + s); + } + return null; + } + + /** + * Heuristically decides if s is in American (42.33) or European (42,33) number format + * and tries to turn European version into American. + * + */ + private static String convertToAmerican(String s) { + if(s.contains(",")) { + //turn all but the last into blanks - this isn't really correct, but it's close enough for now + while(s.indexOf(',') != s.lastIndexOf(',')) + s = s.replaceFirst(",", ""); + int place = s.lastIndexOf(','); + //if it's american, should have at least three characters after it + if (place >= s.length() - 3 && place != s.length() - 1) { + s = s.substring(0, place) + '.' 
+ s.substring(place + 1); + } else { + s = s.replace(",", ""); + } + } + return s; + } + + static String normalizedMoneyString(String s, Number numberFromSUTime) { + //first, see if it looks like european style + s = convertToAmerican(s); + // clean up string + s = s.replaceAll("[ \t\n\0\f\r,]", ""); + s = s.toLowerCase(); + if (DEBUG2) { + err.println("normalizedMoneyString: Normalizing "+s); + } + + double multiplier = 1.0; + + // do currency words + char currencySign = '$'; + for (String currencyWord : currencyWords.keySet()) { + if (StringUtils.find(s, currencyWord)) { + if (DEBUG2) { err.println("Found units: " + currencyWord); } + if (currencyWord.equals("pence|penny") || currencyWord.equals("cents?") || currencyWord.equals("\u00A2")) { + multiplier *= 0.01; + } + // if(DEBUG){err.println("Quantifiable: Found "+ currencyWord);} + s = s.replaceAll(currencyWord, ""); + currencySign = currencyWords.get(currencyWord); + } + } + + // process rest as number + String value = normalizedNumberStringQuiet(s, multiplier, "", numberFromSUTime); + if (value == null) { + return null; + } else { + return currencySign + value; + } + } + + public static String normalizedNumberString(String s, String nextWord, Number numberFromSUTime) { + if (DEBUG2) { err.println("normalizedNumberString: normalizing "+s); } + return normalizedNumberStringQuiet(s, 1.0, nextWord, numberFromSUTime); + } + + + private static final Pattern allSpaces = Pattern.compile(" *"); + + + public static String normalizedNumberStringQuiet(String s, + double multiplier, + String nextWord, + Number numberFromSUTime) { + // normalizations from SUTime take precedence, if available + if(numberFromSUTime != null){ + double v = Double.valueOf(numberFromSUTime.toString()); + return Double.toString(v * multiplier); + } + + // clean up string + String origSClean = s.replaceAll("[\t\n\0\f\r]", ""); + if (allSpaces.matcher(origSClean).matches()) { + return s; + } + String[] origSSplit = origSClean.split(" "); + s = 
s.replaceAll("[ \t\n\0\f\r]", ""); + //see if it looks like european style + s = convertToAmerican(s); + // remove parenthesis around numbers + // if PTBTokenized, this next bit should be a no-op + // in some contexts parentheses might indicate a negative number, but ignore that. + if (s.startsWith("(") && s.endsWith(")")) { + s = s.substring(1, s.length() - 1); + if (DEBUG2) err.println("Deleted (): " + s); + } + s = s.toLowerCase(); + + // get multipliers like "billion" + boolean foundMultiplier = false; + for (String moneyTag : moneyMultipliers.keySet()) { + if (s.contains(moneyTag)) { + // if (DEBUG) {err.println("Quantifiable: Found "+ moneyTag);} + //special case check: m can mean either meters or million - if nextWord is high or long, we assume meters - this is a huge and bad hack!!! + if(moneyTag.equals("m") && (nextWord.equals("high") || nextWord.equals("long") )) continue; + s = s.replaceAll(moneyTag, ""); + multiplier *= moneyMultipliers.get(moneyTag); + foundMultiplier = true; + } + } + for (String moneyTag : moneyMultipliers2.keySet()) { + Matcher m = Pattern.compile(moneyTag).matcher(s); + if (m.find()) { + // if(DEBUG){err.println("Quantifiable: Found "+ moneyTag);} + multiplier *= moneyMultipliers2.get(moneyTag); + foundMultiplier = true; + int start = m.start(1); + int end = m.end(1); + // err.print("Deleting from " + s); + s = s.substring(0, start) + s.substring(end); + // err.println("; Result is " + s); + } + } + if(!foundMultiplier) { + EditDistance ed = new EditDistance(); + for (String moneyTag : moneyMultipliers.keySet()) { + if(isOneSubstitutionMatch(origSSplit[origSSplit.length - 1], + moneyTag, ed)) { + s = s.replaceAll(moneyTag, ""); + multiplier *= moneyMultipliers.get(moneyTag); + } + } + } + + if (DEBUG2) err.println("Looking for number words in |" + s + "|; multiplier is " + multiplier); + + // handle numbers written in words + String[] parts = s.split("[ -]"); + boolean processed = false; + double dd = 0.0; + for (String part : 
parts) { + if (wordsToValues.containsKey(part)) { + dd += wordsToValues.getCount(part); + processed = true; + } else { + String partMatch = getOneSubstitutionMatch(part, wordsToValues.keySet()); + if(partMatch != null) { + dd += wordsToValues.getCount(partMatch); + processed = true; + } + } + } + if (processed) { + dd *= multiplier; + return Double.toString(dd); + } + + // handle numbers written as numbers + // s = s.replaceAll("-", ""); //This is bad: it lets 22-7 be the number 227! + s = s.replaceAll("[A-Za-z]", ""); + + // handle scores or range + Matcher m2 = scorePattern.matcher(s); + if (m2.matches()) { + double d1 = Double.parseDouble(m2.group(1)); + double d2 = Double.parseDouble(m2.group(2)); + return Double.toString(d1) + " - " + Double.toString(d2); + } + + // check for hyphenated word like 4-Ghz: delete final - + if (s.endsWith("-")) { + s = s.substring(0, s.length() - 1); + } + + Matcher m = moneyPattern.matcher(s); + if (m.matches()) { + if (DEBUG2) { + err.println("Number matched with |" + m.group(2) + "| |" + + m.group(3) + '|'); + } + try { + double d = 0.0; + if (m.group(2) != null && ! m.group(2).equals("")) { + d = Double.parseDouble(m.group(2)); + } + if (m.group(3) != null && ! 
m.group(3).equals("")) { + d += Double.parseDouble(m.group(3)); + } + if (d == 0.0 && multiplier != 1.0) { + // we'd found a multiplier + d = 1.0; + } + d *= multiplier; + return Double.toString(d); + } catch (Exception e) { + if (DEBUG2) { + e.printStackTrace(); + } + return null; + } + } else if (multiplier != 1.0) { + // we found a multiplier, so we have something + return Double.toString(multiplier); + } else { + return null; + } + } + + public static String normalizedOrdinalString(String s, Number numberFromSUTime) { + if (DEBUG2) { err.println("normalizedOrdinalString: normalizing "+s); } + return normalizedOrdinalStringQuiet(s, numberFromSUTime); + } + + public static final Pattern numberPattern = Pattern.compile("([0-9.]+)"); + + public static String normalizedOrdinalStringQuiet(String s, Number numberFromSUTime) { + // clean up string + s = s.replaceAll("[ \t\n\0\f\r,]", ""); + // remove parenthesis around numbers + // if PTBTokenized, this next bit should be a no-op + // in some contexts parentheses might indicate a negative number, but ignore that. 
+ if (s.startsWith("(") && s.endsWith(")")) { + s = s.substring(1, s.length() - 1); + if (DEBUG2) err.println("Deleted (): " + s); + } + s = s.toLowerCase(); + + if (DEBUG2) err.println("Looking for ordinal words in |" + s + '|'); + if (Character.isDigit(s.charAt(0))) { + Matcher matcher = numberPattern.matcher(s); + matcher.find(); + // just parse number part, assuming last two letters are st/nd/rd + return normalizedNumberStringQuiet(matcher.group(), 1.0, "", numberFromSUTime); + } else if (ordinalsToValues.containsKey(s)) { + return Double.toString(ordinalsToValues.getCount(s)); + } else { + String val = getOneSubstitutionMatch(s, ordinalsToValues.keySet()); + if(val != null) + return Double.toString(ordinalsToValues.getCount(val)); + else + return null; + } + } + + public static String normalizedPercentString(String s, Number numberFromSUTime) { + if (DEBUG2) { + err.println("normalizedPercentString: " + s); + } + s = s.replaceAll("\\s", ""); + s = s.toLowerCase(); + if (s.contains("%") || s.contains("percent")) { + s = s.replaceAll("percent|%", ""); + } + String norm = normalizedNumberStringQuiet(s, 1.0, "", numberFromSUTime); + if (norm == null) { + return null; + } + return '%' + norm; + } + + /** Fetches the first encountered Number set by SUTime */ + private static Number fetchNumberFromSUTime(List l) { + for(E e: l) { + if(e.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class)){ + return e.get(CoreAnnotations.NumericCompositeValueAnnotation.class); + } + } + return null; + } + + private static Timex fetchTimexFromSUTime(List l) { + for(E e: l) { + if(e.containsKey(TimeAnnotations.TimexAnnotation.class)){ + return e.get(TimeAnnotations.TimexAnnotation.class); + } + } + return null; + } + + private static List processEntity(List l, + String entityType, String compModifier, String nextWord) { + assert(quantifiable.contains(entityType)); + if (DEBUG) { + System.err.println("Quantifiable.processEntity: " + l); + } + String s; + if 
(entityType.equals("TIME")) { + s = timeEntityToString(l); + } else { + s = singleEntityToString(l); + } + + Number numberFromSUTime = fetchNumberFromSUTime(l); + Timex timexFromSUTime = fetchTimexFromSUTime(l); + + if (DEBUG) System.err.println("Quantifiable: working on " + s); + String p = null; + if (entityType.equals("NUMBER")) { + p = ""; + if (compModifier != null) { + p = compModifier; + } + String q = normalizedNumberString(s, nextWord, numberFromSUTime); + if (q != null) { + p = p.concat(q); + } else { + p = null; + } + } else if (entityType.equals("ORDINAL")) { + p = normalizedOrdinalString(s, numberFromSUTime); + } else if (entityType.equals("DURATION")) { + // SUTime marks some ordinals, e.g., "22nd time", as durations + p = normalizedOrdinalString(s, numberFromSUTime); + } else if (entityType.equals("MONEY")) { + p = ""; + if(compModifier!=null) { + p = compModifier; + } + String q = normalizedMoneyString(s, numberFromSUTime); + if (q != null) { + p = p.concat(q); + } else { + p = null; + } + } else if (entityType.equals("DATE")) { + p = normalizedDateString(s, timexFromSUTime); + } else if (entityType.equals("TIME")) { + p = ""; + if (compModifier != null && ! compModifier.matches("am|pm")) { + p = compModifier; + } + String q = normalizedTimeString(s, compModifier != null ? 
compModifier : "", timexFromSUTime); + if (q != null && q.length() == 1 && !q.equals("D")) { + p = p.concat(q); + } else { + p = q; + } + } else if (entityType.equals("PERCENT")) { + p = ""; + if (compModifier != null) { + p = compModifier; + } + String q = normalizedPercentString(s, numberFromSUTime); + if (q != null) { + p = p.concat(q); + } else { + p = null; + } + } + if (DEBUG) { + err.println("Quantifiable: Processed '" + s + "' as '" + p + '\''); + } + + int i = 0; + for (E wi : l) { + if (p != null) { + if (DEBUG) { + System.err.println("#4: Changing normalized NER from " + wi.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + p + " at index " + i); + } + wi.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, p); + } + //currently we also write this into the answers; + //wi.setAnswer(wi.get(CoreAnnotations.AnswerAnnotation.class)+"("+p+")"); + i++; + } + return l; + } + + + /** @param l The list of tokens in a time entity + * @return the word in the time word list that should be normalized + */ + private static String timeEntityToString(List l) { + String entityType = l.get(0).get(CoreAnnotations.AnswerAnnotation.class); + int size = l.size(); + for (E w : l) { + assert(w.get(CoreAnnotations.AnswerAnnotation.class) == null || + w.get(CoreAnnotations.AnswerAnnotation.class).equals(entityType)); + Matcher m = timePattern.matcher(w.get(CoreAnnotations.TextAnnotation.class)); + if (m.matches()) + return w.get(CoreAnnotations.TextAnnotation.class); + } + if (DEBUG) { + System.err.println("default: " + l.get(size-1).get(CoreAnnotations.TextAnnotation.class)); + } + return l.get(size-1).get(CoreAnnotations.TextAnnotation.class); + } + + + /** + * Takes the output of an {@link AbstractSequenceClassifier} and marks up + * each document by normalizing quantities. Each {@link CoreLabel} in any + * of the documents which is normalizable will receive a "normalizedQuantity" + * attribute. 
+ * + * @param l a {@link List} of {@link List}s of {@link CoreLabel}s + * @return The list with normalized entity fields filled in + */ + public static List> normalizeClassifierOutput(List> l){ + for (List doc: l) { + addNormalizedQuantitiesToEntities(doc); + } + return l; + } + + private static final String lessEqualThreeWords = "no (?:more|greater|higher) than|as (?:many|much) as"; + private static final String greaterEqualThreeWords = "no (?:less|fewer) than|as few as"; + + private static final String greaterThanTwoWords = "(?:more|greater|larger|higher) than"; + private static final String lessThanTwoWords = "(?:less|fewer|smaller) than|at most"; + private static final String lessEqualTwoWords = "no (?:more|greater)_than|or less|up to"; + private static final String greaterEqualTwoWords = "no (?:less|fewer)_than|or more|at least"; + private static final String approxTwoWords = "just (?:over|under)|or so"; + + private static final String greaterThanOneWord = "(?:above|over|more_than|greater_than)"; + private static final String lessThanOneWord = "(?:below|under|less_than)"; + private static final String lessEqualOneWord = "(?:up_to|within)"; + // note that ones like "nearly" or "almost" can be above or below: + // "almost 500 killed", "almost zero inflation" + private static final String approxOneWord = "(?:approximately|estimated|nearly|around|about|almost|just_over|just_under)"; + private static final String other = "other"; + + /** + * Takes the strings of the three previous and next words to a quantity and + * detects a + * quantity modifier like "less than", "more than", etc. + * Any of these words may be null or an empty String. + */ + private static String detectQuantityModifier(List list, int beforeIndex, int afterIndex) { + String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): ""; + String prev2 = (beforeIndex - 1 >= 0) ? 
list.get(beforeIndex - 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): ""; + String prev3 = (beforeIndex - 2 >= 0) ? list.get(beforeIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): ""; + int sz = list.size(); + String next = (afterIndex < sz) ? list.get(afterIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): ""; + String next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): ""; + String next3 = (afterIndex + 2 < sz) ? list.get(afterIndex + 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): ""; + + if (DEBUG) { + err.println("Quantifiable: previous: " + prev3 + ' ' + prev2+ ' ' + prev); + err.println("Quantifiable: next: " + next + ' ' + next2 + ' ' + next3); + } + + String longPrev = prev3 + ' ' + prev2 + ' ' + prev; + if (longPrev.matches(lessEqualThreeWords)) { return "<="; } + if (longPrev.matches(greaterEqualThreeWords)) { return ">="; } + + longPrev = prev2 + ' ' + prev; + if (longPrev.matches(greaterThanTwoWords)) { return ">"; } + if (longPrev.matches(lessEqualTwoWords)) { return "<="; } + if (longPrev.matches(greaterEqualTwoWords)) { return ">="; } + if (longPrev.matches(lessThanTwoWords)) { return "<"; } + if (longPrev.matches(approxTwoWords)) { return "~"; } + + String longNext = next + ' ' + next2; + if (longNext.matches(greaterEqualTwoWords)) { return ">="; } + if (longNext.matches(lessEqualTwoWords)) { return "<="; } + + if (prev.matches(greaterThanOneWord)) { return ">"; } + if (prev.matches(lessThanOneWord)) { return "<"; } + if (prev.matches(lessEqualOneWord)) { return "<="; } + if (prev.matches(approxOneWord)) { return "~"; } + + if (next.matches(other)) { return ">="; } + + if (DEBUG) { err.println("Quantifiable: not a quantity modifier"); } + return null; + } + + + private static String earlyOneWord = "early"; + private static String earlyTwoWords = "(?:dawn|eve|beginning) of"; + private static String earlyThreeWords = "early in the"; 
+ private static String lateOneWord = "late"; + private static String lateTwoWords = "late at|end of"; + private static String lateThreeWords = "end of the"; + private static String middleTwoWords = "(?:middle|midst) of"; + private static String middleThreeWords = "(?:middle|midst) of the"; + + private static String amOneWord = "[Aa]\\.?[Mm]\\.?"; + private static String pmOneWord = "[Pp]\\.?[Mm]\\.?"; + private static String amThreeWords = "in the morning"; + private static String pmTwoWords = "at night"; + private static String pmThreeWords = "in the (?:afternoon|evening)"; + + + /** + * Takes the strings of the three previous words to a quantity and detects a + * quantity modifier like "less than", "more than", etc. + * Any of these words may be null or an empty String. + */ + private static String detectTimeOfDayModifier(List list, int beforeIndex, int afterIndex) { + String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : ""; + String prev2 = (beforeIndex - 1 >= 0) ? list.get(beforeIndex - 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : ""; + String prev3 = (beforeIndex - 2 >= 0) ? list.get(beforeIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : ""; + int sz = list.size(); + String next = (afterIndex < sz) ? list.get(afterIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : ""; + String next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : ""; + String next3 = (afterIndex + 2 < sz) ? 
list.get(afterIndex + 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : ""; + + String longPrev = prev3 + ' ' + prev2 + ' ' + prev; + if (longPrev.matches(earlyThreeWords)) { + return "E"; + } + else if (longPrev.matches(lateThreeWords)) { + return "L"; + } + else if (longPrev.matches(middleThreeWords)) { + return "M"; + } + + longPrev = prev2 + ' ' + prev; + if (longPrev.matches(earlyTwoWords)) { + return "E"; + } + else if (longPrev.matches(lateTwoWords)) { + return "L"; + } + else if (longPrev.matches(middleTwoWords)) { + return "M"; + } + + if (prev.matches(earlyOneWord) || prev2.matches(earlyOneWord)) { + return "E"; + } + else if (prev.matches(lateOneWord) || prev2.matches(lateOneWord)) { + return "L"; + } + + String longNext = next3 + ' ' + next2 + ' ' + next; + if (longNext.matches(pmThreeWords)) { + return "pm"; + } + if (longNext.matches(amThreeWords)) { + return "am"; + } + + longNext = next2 + ' ' + next; + if (longNext.matches(pmTwoWords)) { + return "pm"; + } + + if (next.matches(amOneWord) || next2.matches("morning") || next3.matches("morning")) { + return "am"; + } + if (next.matches(pmOneWord) || next2.matches("afternoon") || next3.matches("afternoon") + || next2.matches("night") || next3.matches("night") + || next2.matches("evening") || next3.matches("evening")) { + return "pm"; + } + + return ""; + } + + /** + * Identifies contiguous MONEY, TIME, DATE, or PERCENT entities + * and tags each of their consitituents with a "normalizedQuantity" + * label which contains the appropriate normalized string corresponding to + * the full quantity. Quantities are not concatenated + * + * @param l A list of {@link CoreMap}s representing a single + * document. Note: the Labels are updated in place. 
+ */ + public static void addNormalizedQuantitiesToEntities(List l) { + addNormalizedQuantitiesToEntities(l, false); + } + + /** + * Identifies contiguous MONEY, TIME, DATE, or PERCENT entities + * and tags each of their consitituents with a "normalizedQuantity" + * label which contains the appropriate normalized string corresponding to + * the full quantity. + * + * @param list A list of {@link CoreMap}s representing a single + * document. Note: the Labels are updated in place. + * @param concatenate true if quantities should be concatenated into one label, false otherwise + */ + public static void addNormalizedQuantitiesToEntities(List list, boolean concatenate) { + List toRemove = new ArrayList(); // list for storing those objects we're going to remove at the end (e.g., if concatenate, we replace 3 November with 3_November, have to remove one of the originals) + + String lastEntity = BACKGROUND_SYMBOL; + String timeModifier = ""; + int beforeIndex = -1; + ArrayList collector = new ArrayList(); + for (int i = 0, sz = list.size(); i < sz; i++) { + E wi = list.get(i); + if (DEBUG) { System.err.println("addNormalizedQuantitiesToEntities: wi is " + wi + "; collector is " + collector); } + // repairs commas in between dates... String constant first in equals() in case key has null value.... + if ((i+1) < sz && ",".equals(wi.get(CoreAnnotations.TextAnnotation.class)) && "DATE".equals(lastEntity)) { + E nextWord = list.get(i+1); + String nextNER = nextWord.get(CoreAnnotations.NamedEntityTagAnnotation.class); + if (nextNER != null && nextNER.equals("DATE")) { + wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE"); + } + } + + //repairs mistagged multipliers after a numeric quantity + String curWord = (wi.get(CoreAnnotations.TextAnnotation.class) != null ? 
wi.get(CoreAnnotations.TextAnnotation.class) : ""); + String nextWord = ""; + if ((i+1) < sz) { + nextWord = list.get(i+1).get(CoreAnnotations.TextAnnotation.class); + if(nextWord == null) + nextWord = ""; + } + + if (!curWord.equals("") && (moneyMultipliers.containsKey(curWord) || (getOneSubstitutionMatch(curWord, moneyMultipliers.keySet()) != null)) && lastEntity != null && (lastEntity.equals("MONEY") || lastEntity.equals("NUMBER"))) { + wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, lastEntity); + } + + //repairs four digit ranges (2002-2004) that have not been tagged as years - maybe bad? (empirically useful) + if (curWord.contains("-")) { + String[] sides = curWord.split("-"); + if (sides.length == 2) { + try { + int first = Integer.parseInt(sides[0]); + int second = Integer.parseInt(sides[1]); + //they're both integers, see if they're both between 1000-3000 (likely years) + if (1000 <= first && first <= 3000 && 1000 <= second && second <= 3000) { + wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE"); + String dateStr = new ISODateInstance(new ISODateInstance(sides[0]), new ISODateInstance(sides[1])).getDateString(); + if (DEBUG) { + System.err.println("#5: Changing normalized NER from " + wi.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + dateStr + " at index " + i); + } + wi.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, dateStr); + continue; + } + } catch (Exception e) { + // they weren't numbers. + } + } + } + + // Marks time units as NUMBER if they are preceded by a CD tag. e.g. "two years" or "5 minutes" + String prevTag = (i-1 > 0 ? 
list.get(i-1).get(CoreAnnotations.PartOfSpeechAnnotation.class) : null); + if ( timeUnitWords.contains(curWord) && + (wi.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null || !wi.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("DATE")) && + (prevTag != null && prevTag.equals("CD")) ) { + wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "NUMBER"); + } + + String currEntity = wi.get(CoreAnnotations.NamedEntityTagAnnotation.class); + if (currEntity != null && currEntity.equals("TIME")) { + if (timeModifier.equals("")) { + timeModifier = detectTimeOfDayModifier(list, i-1, i+1); + } + } + + // if the current wi is a non-continuation and the last one was a + // quantity, we close and process the last segment. + if ((currEntity == null || ! currEntity.equals(lastEntity)) && quantifiable.contains(lastEntity)) { + String compModifier = null; + // special handling of TIME + if (lastEntity.equals("TIME")) { + processEntity(collector, lastEntity, timeModifier, nextWord); + } else if (lastEntity.equals(("DATE"))) { + //detect date range modifiers by looking at nearby words + compModifier = detectDateRangeModifier(collector, list, beforeIndex, i); + if (!compModifier.equals(ISODateInstance.BOUNDED_RANGE)) + processEntity(collector, lastEntity, compModifier, nextWord); + //now repair this date if it's more than one word + //doesn't really matter which one we keep ideally we should be doing lemma/etc matching anyway + //but we vaguely try to deal with this by choosing the NNP or the CD + if (concatenate) + concatenateNumericString(collector, toRemove); + } else { + // detect "more than", "nearly", etc. by looking at nearby words. 
+ if (lastEntity.equals("MONEY") || lastEntity.equals("NUMBER") || + lastEntity.equals("PERCENT")) { + compModifier = detectQuantityModifier(list, beforeIndex, i); + } + processEntity(collector, lastEntity, compModifier, nextWord); + if (concatenate) { + concatenateNumericString(collector, toRemove); + } + } + + collector = new ArrayList(); + timeModifier = ""; + } + // if the current wi is a quantity, we add it to the collector. + // if its the first word in a quantity, we record index before it + if (quantifiable.contains(currEntity)) { + if (collector.isEmpty()) { + beforeIndex = i - 1; + } + collector.add(wi); + } + lastEntity=currEntity; + } + // process any final entity + if (quantifiable.contains(lastEntity)) { + String compModifier = null; + if (lastEntity.equals("TIME")) { + processEntity(collector, lastEntity, timeModifier, ""); + } else if(lastEntity.equals(("DATE"))) { + compModifier = detectDateRangeModifier(collector, list, beforeIndex, list.size()); + processEntity(collector, lastEntity, compModifier, ""); + //now repair this date if it's more than one word + //doesn't really matter which one we keep ideally we should be doing lemma/etc matching anyway + //but we vaguely try to deal with this by choosing the NNP or the CD + if (concatenate) { + concatenateNumericString(collector,toRemove); + } + } else { + // detect "more than", "nearly", etc. by looking at nearby words. 
+ if (lastEntity.equals("MONEY") || lastEntity.equals("NUMBER") || + lastEntity.equals("PERCENT")) { + compModifier = detectQuantityModifier(list, beforeIndex, list.size()); + } + processEntity(collector, lastEntity, compModifier, ""); + if(concatenate) { + concatenateNumericString(collector, toRemove); + } + } + } + if (concatenate) { + list.removeAll(toRemove); + } + List moreRemoves = new ArrayList(); + for (int i = 0, sz = list.size(); i < sz; i++) { + E wi = list.get(i); + moreRemoves.addAll(detectTwoSidedRangeModifier(wi, list, i-1, i+1, concatenate)); + } + if (concatenate) { + list.removeAll(moreRemoves); + } + } + + + /** + * Runs a deterministic named entity classifier which is good at recognizing + * numbers and money and date expressions not recognized by our statistical + * NER. It then changes any BACKGROUND_SYMBOL's from the list to + * the value tagged by this deterministic NER. + * It then adds normalized values for quantifiable entities. + * + * @param l A document to label + * @return The list with results of 'specialized' (rule-governed) NER filled in + */ + public static List applySpecializedNER(List l) { + int sz = l.size(); + // copy l + List copyL = new ArrayList(sz); + for (int i = 0; i < sz; i++) { + if (DEBUG2) { + if (i == 1) { + String tag = l.get(i).get(CoreAnnotations.PartOfSpeechAnnotation.class); + if (tag == null || tag.equals("")) { + err.println("Quantifiable: error! 
tag is " + tag); + } + } + } + copyL.add(new CoreLabel(l.get(i))); + } + // run NumberSequenceClassifier + AbstractSequenceClassifier nsc = new NumberSequenceClassifier(); + copyL = nsc.classify(copyL); + // update entity only if it was not O + for (int i = 0; i < sz; i++) { + E before = l.get(i); + CoreLabel nscAnswer = copyL.get(i); + if (before.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null && before.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(BACKGROUND_SYMBOL) && + (nscAnswer.get(CoreAnnotations.AnswerAnnotation.class) != null && !nscAnswer.get(CoreAnnotations.AnswerAnnotation.class).equals(BACKGROUND_SYMBOL))) { + System.err.println("Quantifiable: updating class for " + + before.get(CoreAnnotations.TextAnnotation.class) + '/' + + before.get(CoreAnnotations.NamedEntityTagAnnotation.class) + " to " + nscAnswer.get(CoreAnnotations.AnswerAnnotation.class)); + before.set(CoreAnnotations.NamedEntityTagAnnotation.class, nscAnswer.get(CoreAnnotations.AnswerAnnotation.class)); + } + } + + addNormalizedQuantitiesToEntities(l); + return l; + } // end applySpecializedNER + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/SeminarsPrior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/SeminarsPrior.java new file mode 100644 index 0000000..0515c66 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/SeminarsPrior.java @@ -0,0 +1,187 @@ +package edu.stanford.nlp.ie; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; +import edu.stanford.nlp.ie.pascal.AcronymModel; +import edu.stanford.nlp.ling.CoreAnnotations; + + +import java.util.*; + +/** + * @author Jenny Finkel + */ + +public class SeminarsPrior extends EntityCachingAbstractSequencePrior { + + //double penalty = 4.0; + double penalty = 2.3; 
+ //double penalty1 = 3.0; + //double penalty2 = 4.0; + + public SeminarsPrior(String backgroundSymbol, Index classIndex, List doc) { + super(backgroundSymbol, classIndex, doc); + init(doc); + } + + private void init(List doc) { + + interned = new String[doc.size()]; + int i = 0; + for (IN wi : doc) { + interned[i++] = wi.get(CoreAnnotations.TextAnnotation.class).toLowerCase().intern(); + } + + } + + private String[] interned; + + public double scoreOf(int[] sequence) { + + Set speakers = Generics.newHashSet(); + Set locations = Generics.newHashSet(); + Set stimes = Generics.newHashSet(); + Set etimes = Generics.newHashSet(); + + List speakersL = new ArrayList(); + List locationsL = new ArrayList(); + List stimesL = new ArrayList(); + List etimesL = new ArrayList(); + + double p = 0.0; + for (int i = 0; i < entities.length; i++) { + Entity entity = entities[i]; + if ((i == 0 || entities[i-1] != entity) && entity != null) { + + String type = classIndex.get(entity.type); + String phrase = StringUtils.join(entity.words, " ").toLowerCase(); + if (type.equalsIgnoreCase("SPEAKER")) { + speakers.add(phrase); + speakersL.add(entity); + } else if (type.equalsIgnoreCase("LOCATION")) { + locations.add(phrase); + locationsL.add(entity); + } else if (type.equals("STIME")) { + stimes.add(phrase); + stimesL.add(entity); + } else if (type.equals("ETIME")) { + etimes.add(phrase); + etimesL.add(entity); + } else { + System.err.println("unknown entity type: "+type); + System.exit(0); + } + } + } + + for (Entity stimeE : stimesL) { + if (stimes.size() == 1) { break; } + String stime = StringUtils.join(stimeE.words, " "); + String time = ""; + for (char c : stime.toCharArray()) { + if (c >= '0' && c <= '9') { + time += c; + } + } + if (time.length() == 1 || time.length() == 2) { time = time+"00"; } + boolean match = false; + for (String stime1 : stimes) { + String time1 = ""; + for (char c : stime1.toCharArray()) { + if (c >= '0' && c <= '9') { + time1 += c; + } + } + if 
(time1.length() == 1 || time1.length() == 2) { time1 = time1+"00"; } + if (!time.equals(time1)) { + p -= stimeE.words.size() * penalty; + //System.err.println(time+" ("+s+") "+time1+" ("+s1+") "+stimes); + } + } + } + + + for (Entity etimeE : etimesL) { + if (etimes.size() == 1) { break; } + String etime = StringUtils.join(etimeE.words, " "); + String time = ""; + for (char c : etime.toCharArray()) { + if (c >= '0' && c <= '9') { + time += c; + } + } + if (time.length() == 1 || time.length() == 2) { time = time+"00"; } + boolean match = false; + for (String etime1 : etimes) { + String time1 = ""; + for (char c : etime1.toCharArray()) { + if (c >= '0' && c <= '9') { + time1 += c; + } + } + if (time1.length() == 1 || time1.length() == 2) { time1 = time1+"00"; } + if (!time.equals(time1)) { + p -= etimeE.words.size() * penalty; + //System.err.println(time+" ("+s+") "+time1+" ("+s1+") "+etimes); + } + } + } + +// for (Entity locationE : locationsL) { +// String location = StringUtils.join(locationE.words, " "); +// for (String location1 : locations) { +// String s1 = location; +// String s2 = location1; +// if (s2.length() > s1.length()) { +// String tmp = s2; +// s2 = s1; +// s1 = tmp; +// } +// Pair pair = new Pair(s1, s2); +// Boolean b = aliasLocCache.get(pair); +// if (b == null) { +// double d = acronymModel.HearstSimilarity(s1, s2); +// b = (d >= 0.7); +// aliasLocCache.put(pair, b); +// } +// if (!b) { +// p -= locationE.words.size() * penalty; +// } +// } +// } + + int speakerIndex = classIndex.indexOf("SPEAKER"); + + for (Entity speakerE : speakersL) { + //String lastName = speakerE.words.get(speakerE.words.size()-1); + String lastName = interned[speakerE.startPosition+speakerE.words.size()-1]; + + for (int i = 0; i < interned.length; i++) { + String w = interned[i]; + if (w == lastName) { + if (sequence[i] != speakerIndex) { + p -= penalty; + } + } + } + } + + return p; + } + + private static Map, Boolean> aliasLocCache = Generics.newHashMap(); + + private 
static AcronymModel acronymModel; + + static { + try { + acronymModel = new AcronymModel(); + } catch (Exception e) { + throw new RuntimeException(e.getMessage()); + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/UniformPrior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/UniformPrior.java new file mode 100644 index 0000000..5ae102b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/UniformPrior.java @@ -0,0 +1,71 @@ +package edu.stanford.nlp.ie; + +import java.util.List; + +import edu.stanford.nlp.sequences.SequenceListener; +import edu.stanford.nlp.sequences.SequenceModel; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Index; + +/** + * Uniform prior to be used for generic Gibbs inference in the ie.crf.CRFClassifier + * @author Mihai + * + */ +public class UniformPrior implements SequenceModel, SequenceListener { + + protected int[] sequence; + protected int backgroundSymbol; + protected int numClasses; + protected int[] possibleValues; + protected Index classIndex; + protected List doc; + + public UniformPrior(String backgroundSymbol, Index classIndex, List doc) { + this.classIndex = classIndex; + this.backgroundSymbol = classIndex.indexOf(backgroundSymbol); + this.numClasses = classIndex.size(); + this.possibleValues = new int[numClasses]; + for (int i=0; i + * java -server -mx500m edu.stanford.nlp.ie.crf.CRFBiasedClassifier -loadClassifier model.gz -testFile test.txt -classBias A:0.5,B:1.5 + * + *

    The command above sets a bias of 0.5 towards class A and a bias of + * 1.5 towards class B. These biases (which internally are treated as + * feature weights in the log-linear model underpinning the CRF + * classifier) can take any real value. As the weight of A tends to plus + * infinity, the classifier will only predict A labels, and as it tends + * towards minus infinity, it will never predict A labels. + * + * @author Michel Galley + * @author Sonal Gupta (made the class generic) + */ + +public class CRFBiasedClassifier extends CRFClassifier { + + private static final String BIAS = "@@@DECODING_CLASS_BIAS@@@"; + private boolean testTime = false; + + + public CRFBiasedClassifier(Properties props) { + super(props); + } + + @Override + public CRFDatum, CRFLabel> makeDatum(List info, int loc, FeatureFactory featureFactory) { + + pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol); + PaddedList pInfo = new PaddedList(info, pad); + + List> features = new ArrayList>(); + Collection done = Generics.newHashSet(); + for (int i = 0; i < windowSize; i++) { + List featuresC = new ArrayList(); + List windowCliques = featureFactory.getCliques(i, 0); + windowCliques.removeAll(done); + done.addAll(windowCliques); + for (Clique c : windowCliques) { + featuresC.addAll(featureFactory.getCliqueFeatures(pInfo, loc, c)); + if(testTime && i==0) + // this feature is only present at test time and only appears + // in cliques of size 1 (i.e., cliques with window=0) + featuresC.add(BIAS); + } + features.add(featuresC); + } + + int[] labels = new int[windowSize]; + for (int i = 0; i < windowSize; i++) { + String answer = pInfo.get(loc + i - windowSize + 1).get(CoreAnnotations.AnswerAnnotation.class); + labels[i] = classIndex.indexOf(answer); + } + + return new CRFDatum, CRFLabel>(features, new CRFLabel(labels), null); + } + + void addBiasFeature() { + if(!featureIndex.contains(BIAS)) { + featureIndex.add(BIAS); + double[][] newWeights = new 
double[weights.length+1][]; + System.arraycopy (weights,0,newWeights,0,weights.length); + newWeights[weights.length] = new double[classIndex.size()]; + weights = newWeights; + } + } + + public void setBiasWeight(String cname, double weight) { + int ci = classIndex.indexOf(cname); + setBiasWeight(ci, weight); + } + + public void setBiasWeight(int cindex, double weight) { + addBiasFeature(); + int fi = featureIndex.indexOf(BIAS); + weights[fi][cindex] = weight; + } + + @Override + public List classify(List document) { + testTime = true; + List l = super.classify(document); + testTime = false; + return l; + } + + class CRFBiasedClassifierOptimizer implements Function { + CRFBiasedClassifier crf; + Function evalFunction; + + CRFBiasedClassifierOptimizer(CRFBiasedClassifier c, Function e) { + crf = c; + evalFunction = e; + } + + public Double apply(Double w) { + crf.setBiasWeight(0,w); + return evalFunction.apply(w); + } + } + + /** + * Adjust the bias parameter to optimize some objective function. + * Note that this function only tunes the bias parameter of one class + * (class of index 0), and is thus only useful for binary classification + * problems. + */ + public void adjustBias(List> develData, Function evalFunction, double low, double high) { + LineSearcher ls = new GoldenSectionLineSearch(true,1e-2,low,high); + CRFBiasedClassifierOptimizer optimizer = new CRFBiasedClassifierOptimizer(this, evalFunction); + double optVal = ls.minimize(optimizer); + int bi = featureIndex.indexOf(BIAS); + System.err.println("Class bias of "+weights[bi][0]+" reaches optimal value "+optVal); + } + + /** The main method, which is essentially the same as in CRFClassifier. See the class documentation. 
*/ + public static void main(String[] args) throws Exception { + System.err.println("CRFBiasedClassifier invoked at " + new Date() + + " with arguments:"); + for (String arg : args) { + System.err.print(" " + arg); + } + System.err.println(); + + Properties props = StringUtils.argsToProperties(args); + CRFBiasedClassifier crf = new CRFBiasedClassifier(props); + String testFile = crf.flags.testFile; + String loadPath = crf.flags.loadClassifier; + + if (loadPath != null) { + crf.loadClassifierNoExceptions(loadPath, props); + } else if (crf.flags.loadJarClassifier != null) { + crf.loadJarClassifier(crf.flags.loadJarClassifier, props); + } else { + crf.loadDefaultClassifier(); + } + if(crf.flags.classBias != null) { + StringTokenizer biases = new java.util.StringTokenizer(crf.flags.classBias,","); + while (biases.hasMoreTokens()) { + StringTokenizer bias = new java.util.StringTokenizer(biases.nextToken(),":"); + String cname = bias.nextToken(); + double w = Double.parseDouble(bias.nextToken()); + crf.setBiasWeight(cname,w); + System.err.println("Setting bias for class "+cname+" to "+w); + } + } + + if (testFile != null) { + DocumentReaderAndWriter readerAndWriter = crf.makeReaderAndWriter(); + if (crf.flags.printFirstOrderProbs) { + crf.printFirstOrderProbs(testFile, readerAndWriter); + } else if (crf.flags.printProbs) { + crf.printProbs(testFile, readerAndWriter); + } else if (crf.flags.useKBest) { + int k = crf.flags.kBest; + crf.classifyAndWriteAnswersKBest(testFile, k, readerAndWriter); + } else { + crf.classifyAndWriteAnswers(testFile, readerAndWriter); + } + } + } // end main + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFClassifier.java new file mode 100644 index 0000000..7a059d7 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFClassifier.java @@ -0,0 +1,3530 @@ +// 
CRFClassifier -- a probabilistic (CRF) sequence model, mainly used for NER. +// Copyright (c) 2002-2008 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +// For more information, bug reports, fixes, contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// Support/Questions: java-nlp-user@lists.stanford.edu +// Licensing: java-nlp-support@lists.stanford.edu + +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.ie.*; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.util.ConvertByteArray; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.optimization.*; +import edu.stanford.nlp.optimization.Function; +import edu.stanford.nlp.sequences.*; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.*; + +import java.io.*; +import java.lang.reflect.InvocationTargetException; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.*; +import java.util.zip.GZIPInputStream; +import 
java.util.zip.GZIPOutputStream; + +/** + * Class for Sequence Classification using a Conditional Random Field model. + * The code has functionality for different document formats, but when + * using the standard {@link edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter} for training + * or testing models, input files are expected to + * be one token per line with the columns indicating things like the word, + * POS, chunk, and answer class. The default for + * ColumnDocumentReaderAndWriter training data is 3 column input, + * with the columns containing a word, its POS, and its gold class, but + * this can be specified via the map property. + *

    + * When run on a file with -textFile, + * the file is assumed to be plain English text (or perhaps simple HTML/XML), + * and a reasonable attempt is made at English tokenization by + * {@link PlainTextDocumentReaderAndWriter}. The class used to read + * the text can be changed with -plainTextDocumentReaderAndWriter. + * Extra options can be supplied to the tokenizer using the + * -tokenizeOptions flag. + *

    + * To read from stdin, use the flag -readStdin. The same + * reader/writer will be used as for -textFile. + *

    + * Typical command-line usage + *

    For running a trained model with a provided serialized classifier on a + * text file:

    + * + * java -mx500m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier + * conll.ner.gz -textFile samplesentences.txt + * + *

    + * When specifying all parameters in a properties file (train, test, or + * runtime): + *

    + * + * java -mx1g edu.stanford.nlp.ie.crf.CRFClassifier -prop propFile + * + *

    + * To train and test a simple NER model from the command line:
    + * java -mx1000m edu.stanford.nlp.ie.crf.CRFClassifier + * -trainFile trainFile -testFile testFile -macro > output + *

    + *

    + * To train with multiple files:
    + * java -mx1000m edu.stanford.nlp.ie.crf.CRFClassifier + * -trainFileList file1,file2,... -testFile testFile -macro > output + *

    + *

    + * To test on multiple files, use the -testFiles option and a comma + * separated list. + *

    + * Features are defined by a {@link edu.stanford.nlp.sequences.FeatureFactory}. + * {@link NERFeatureFactory} is used by default, and you should look + * there for feature templates and properties or flags that will cause + * certain features to be used when training an NER classifier. There + * are also various feature factories for Chinese word segmentation + * such as {@link edu.stanford.nlp.wordseg.ChineseSegmenterFeatureFactory}. + * Features are specified either + * by a Properties file (which is the recommended method) or by flags on the + * command line. The flags are read into a {@link SeqClassifierFlags} object, + * which the user need not be concerned with, unless wishing to add new + * features.

    CRFClassifier may also be used programmatically. When creating + * a new instance, you must specify a Properties object. You may then + * call train methods to train a classifier, or load a classifier. The other way + * to get a CRFClassifier is to deserialize one via the static + * {@link CRFClassifier#getClassifier(String)} methods, which return a + * deserialized classifier. You may then tag (classify the items of) documents + * using either the assorted classify() or the assorted + * classify methods in {@link AbstractSequenceClassifier}. + * Probabilities assigned by the CRF can be interrogated using either the + * printProbsDocument() or getCliqueTrees() methods. + * + * @author Jenny Finkel + * @author Sonal Gupta (made the class generic) + * @author Mengqiu Wang (LOP implementation and non-linear CRF implementation) + * TODO(mengqiu) need to move the embedding lookup and capitalization features into a FeatureFactory + */ +public class CRFClassifier extends AbstractSequenceClassifier { + + List> labelIndices; + Index tagIndex; + Pair entityMatrices; + + CliquePotentialFunction cliquePotentialFunction; + HasCliquePotentialFunction cliquePotentialFunctionHelper; + + /** Parameter weights of the classifier. */ + double[][] weights; + double[][] linearWeights; + double[][] inputLayerWeights4Edge; + double[][] outputLayerWeights4Edge; + double[][] inputLayerWeights; + double[][] outputLayerWeights; + + /** index the features of CRF */ + Index featureIndex; + /** caches the featureIndex */ + int[] map; + List> featureIndicesSetArray; + List> featureIndicesListArray; + Random random = new Random(2147483647L); + Index nodeFeatureIndicesMap; + Index edgeFeatureIndicesMap; + Map embeddings = null; + + /** + * Name of default serialized classifier resource to look for in a jar file. 
+ */ + public static final String DEFAULT_CLASSIFIER = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"; + private static final boolean VERBOSE = false; + + // List selftraindatums = new ArrayList(); + + protected CRFClassifier() { + super(new SeqClassifierFlags()); + } + + public CRFClassifier(Properties props) { + super(props); + } + + public CRFClassifier(SeqClassifierFlags flags) { + super(flags); + } + + /** + * Makes a copy of the crf classifier + */ + public CRFClassifier(CRFClassifier crf) { + super(crf.flags); + this.windowSize = crf.windowSize; + this.featureFactory = crf.featureFactory; + this.pad = crf.pad; + this.knownLCWords = (crf.knownLCWords != null) ? Generics.newHashSet(crf.knownLCWords) : null; + this.featureIndex = (crf.featureIndex != null) ? new HashIndex(crf.featureIndex.objectsList()) : null; + if (crf.flags.nonLinearCRF) { + this.nodeFeatureIndicesMap = (crf.nodeFeatureIndicesMap != null) ? new HashIndex(crf.nodeFeatureIndicesMap.objectsList()) : null; + this.edgeFeatureIndicesMap = (crf.edgeFeatureIndicesMap != null) ? new HashIndex(crf.edgeFeatureIndicesMap.objectsList()) : null; + } + this.classIndex = (crf.classIndex != null) ? new HashIndex(crf.classIndex.objectsList()) : null; + if (crf.labelIndices != null) { + this.labelIndices = new ArrayList>(crf.labelIndices.size()); + for (int i = 0; i < crf.labelIndices.size(); i++) { + this.labelIndices.add((crf.labelIndices.get(i) != null) ? new HashIndex(crf.labelIndices.get(i).objectsList()) : null); + } + } else { + this.labelIndices = null; + } + int numFeatures = featureIndex != null ? 
featureIndex.size() : 0; + this.cliquePotentialFunction = crf.cliquePotentialFunction; + /* + weights = new double[numFeatures][]; + for (int i = 0; i < numFeatures; i++) { + String feature = featureIndex.get(i); + int index = crf.featureIndex.indexOf(feature); + weights[i] = new double[crf.weights[index].length]; + System.arraycopy(crf.weights[index], 0, weights[i], 0, weights[i].length); + } + */ + } + + /** + * Returns the total number of weights associated with this classifier. + * + * @return number of weights + */ + public int getNumWeights() { + if (weights == null) return 0; + int numWeights = 0; + for (double[] wts : weights) { + numWeights += wts.length; + } + return numWeights; + } + + /** + * Get index of featureType for feature indexed by i. (featureType index is + * used to index labelIndices to get labels.) + * + * @param i + * feature index + * @return index of featureType + */ + private int getFeatureTypeIndex(int i) { + return getFeatureTypeIndex(featureIndex.get(i)); + } + + /** + * Get index of featureType for feature based on the feature string + * (featureType index used to index labelIndices to get labels) + * + * @param feature + * feature string + * @return index of featureType + */ + private static int getFeatureTypeIndex(String feature) { + if (feature.endsWith("|C")) { + return 0; + } else if (feature.endsWith("|CpC")) { + return 1; + } else if (feature.endsWith("|Cp2C")) { + return 2; + } else if (feature.endsWith("|Cp3C")) { + return 3; + } else if (feature.endsWith("|Cp4C")) { + return 4; + } else if (feature.endsWith("|Cp5C")) { + return 5; + } else { + throw new RuntimeException("Unknown feature type " + feature); + } + } + + /** + * Scales the weights of this crfclassifier by the specified weight + * + * @param scale + */ + public void scaleWeights(double scale) { + for (int i = 0; i < weights.length; i++) { + for (int j = 0; j < weights[i].length; j++) { + weights[i][j] *= scale; + } + } + } + + /** + * Combines weights from 
another crf (scaled by weight) into this crf's + * weights (assumes that this crf's indices have already been updated to + * include features/labels from the other crf) + * + * @param crf + * Other crf whose weights to combine into this crf + * @param weight + * amount to scale the other crf's weights by + */ + private void combineWeights(CRFClassifier crf, double weight) { + int numFeatures = featureIndex.size(); + int oldNumFeatures = weights.length; + + // Create a map of other crf labels to this crf labels + Map crfLabelMap = Generics.newHashMap(); + for (int i = 0; i < crf.labelIndices.size(); i++) { + for (int j = 0; j < crf.labelIndices.get(i).size(); j++) { + CRFLabel labels = crf.labelIndices.get(i).get(j); + int[] newLabelIndices = new int[i + 1]; + for (int ci = 0; ci <= i; ci++) { + String classLabel = crf.classIndex.get(labels.getLabel()[ci]); + newLabelIndices[ci] = this.classIndex.indexOf(classLabel); + } + CRFLabel newLabels = new CRFLabel(newLabelIndices); + crfLabelMap.put(labels, newLabels); + int k = this.labelIndices.get(i).indexOf(newLabels); // the indexing is needed, even when not printed out! 
+ // System.err.println("LabelIndices " + i + " " + labels + ": " + j + + // " mapped to " + k); + } + } + + // Create map of featureIndex to featureTypeIndex + map = new int[numFeatures]; + for (int i = 0; i < numFeatures; i++) { + map[i] = getFeatureTypeIndex(i); + } + + // Create new weights + double[][] newWeights = new double[numFeatures][]; + for (int i = 0; i < numFeatures; i++) { + int length = labelIndices.get(map[i]).size(); + newWeights[i] = new double[length]; + if (i < oldNumFeatures) { + assert (length >= weights[i].length); + System.arraycopy(weights[i], 0, newWeights[i], 0, weights[i].length); + } + } + weights = newWeights; + + // Get original weight indices from other crf and weight them in + // depending on the type of the feature, different number of weights is + // associated with it + for (int i = 0; i < crf.weights.length; i++) { + String feature = crf.featureIndex.get(i); + int newIndex = featureIndex.indexOf(feature); + // Check weights are okay dimension + if (weights[newIndex].length < crf.weights[i].length) { + throw new RuntimeException("Incompatible CRFClassifier: weight length mismatch for feature " + newIndex + ": " + + featureIndex.get(newIndex) + " (also feature " + i + ": " + crf.featureIndex.get(i) + ") " + ", len1=" + + weights[newIndex].length + ", len2=" + crf.weights[i].length); + } + int featureTypeIndex = map[newIndex]; + for (int j = 0; j < crf.weights[i].length; j++) { + CRFLabel labels = crf.labelIndices.get(featureTypeIndex).get(j); + CRFLabel newLabels = crfLabelMap.get(labels); + int k = this.labelIndices.get(featureTypeIndex).indexOf(newLabels); + weights[newIndex][k] += crf.weights[i][j] * weight; + } + } + } + + /** + * Combines weighted crf with this crf + * + * @param crf + * @param weight + */ + public void combine(CRFClassifier crf, double weight) { + Timing timer = new Timing(); + + // Check the CRFClassifiers are compatible + if (!this.pad.equals(crf.pad)) { + throw new RuntimeException("Incompatible 
CRFClassifier: pad does not match"); + } + if (this.windowSize != crf.windowSize) { + throw new RuntimeException("Incompatible CRFClassifier: windowSize does not match"); + } + if (this.labelIndices.size() != crf.labelIndices.size()) { + // Should match since this should be same as the windowSize + throw new RuntimeException("Incompatible CRFClassifier: labelIndices length does not match"); + } + this.classIndex.addAll(crf.classIndex.objectsList()); + + // Combine weights of the other classifier with this classifier, + // weighing the other classifier's weights by weight + // First merge the feature indicies + int oldNumFeatures1 = this.featureIndex.size(); + int oldNumFeatures2 = crf.featureIndex.size(); + int oldNumWeights1 = this.getNumWeights(); + int oldNumWeights2 = crf.getNumWeights(); + this.featureIndex.addAll(crf.featureIndex.objectsList()); + this.knownLCWords.addAll(crf.knownLCWords); + assert (weights.length == oldNumFeatures1); + + // Combine weights of this classifier with other classifier + for (int i = 0; i < labelIndices.size(); i++) { + this.labelIndices.get(i).addAll(crf.labelIndices.get(i).objectsList()); + } + System.err.println("Combining weights: will automatically match labelIndices"); + combineWeights(crf, weight); + + int numFeatures = featureIndex.size(); + int numWeights = getNumWeights(); + long elapsedMs = timer.stop(); + System.err.println("numFeatures: orig1=" + oldNumFeatures1 + ", orig2=" + oldNumFeatures2 + ", combined=" + + numFeatures); + System.err + .println("numWeights: orig1=" + oldNumWeights1 + ", orig2=" + oldNumWeights2 + ", combined=" + numWeights); + System.err.println("Time to combine CRFClassifier: " + Timing.toSecondsString(elapsedMs) + " seconds"); + } + + public void dropFeaturesBelowThreshold(double threshold) { + Index newFeatureIndex = new HashIndex(); + for (int i = 0; i < weights.length; i++) { + double smallest = weights[i][0]; + double biggest = weights[i][0]; + for (int j = 1; j < weights[i].length; j++) { 
+ if (weights[i][j] > biggest) { + biggest = weights[i][j]; + } + if (weights[i][j] < smallest) { + smallest = weights[i][j]; + } + if (biggest - smallest > threshold) { + newFeatureIndex.add(featureIndex.get(i)); + break; + } + } + } + + int[] newMap = new int[newFeatureIndex.size()]; + for (int i = 0; i < newMap.length; i++) { + int index = featureIndex.indexOf(newFeatureIndex.get(i)); + newMap[i] = map[index]; + } + map = newMap; + featureIndex = newFeatureIndex; + } + + /** + * Convert a document List into arrays storing the data features and labels. + * This is used at test time. + * + * @param document Testing documents + * @return A Triple, where the first element is an int[][][] representing the + * data, the second element is an int[] representing the labels, and + * the third element is a double[][][] representing the feature values (optionally null) + */ + public Triple documentToDataAndLabels(List document) { + return documentToDataAndLabels(document, false); + } + + /** + * Convert a document List into arrays storing the data features and labels. + * This is used at both training and test time. + * + * @param document Training documents + * @return A Triple, where the first element is an int[][][] representing the + * data, the second element is an int[] representing the labels, the third + * element is an (optional, could be null) double[][][] representing the + * feature real values. + */ + public Triple documentToDataAndLabels(List document, + boolean trainTime) { + boolean droppedFeature = false; // todo: remove me + int docSize = document.size(); + // first index is position in the document also the index of the + // clique/factor table + // second index is the number of elements in the clique/window these + // features are for (starting with last element) + // third index is position of the feature in the array that holds them. 
+ // An element in data[j][k][m] is the index of the mth feature occurring in + // position k of the jth clique + int[][][] data = new int[docSize][windowSize][]; + double[][][] featureVals = new double[docSize][windowSize][]; + // index is the position in the document. + // element in labels[j] is the index of the correct label (if it exists) at + // position j of document + int[] labels = new int[docSize]; + + if (flags.useReverse) { + Collections.reverse(document); + } + + // System.err.println("docSize:"+docSize); + for (int j = 0; j < docSize; j++) { + CRFDatum, CRFLabel> d = makeDatum(document, j, featureFactory); + + List> features = d.asFeatures(); + List featureValList = d.asFeatureVals(); + for (int k = 0, fSize = features.size(); k < fSize; k++) { + Collection cliqueFeatures = features.get(k); + data[j][k] = new int[cliqueFeatures.size()]; + if(featureValList != null) { // CRFBiasedClassifier.makeDatum causes null + featureVals[j][k] = featureValList.get(k); + } + int m = 0; + for (String feature : cliqueFeatures) { + if (trainTime && flags.inputDropOut != 0.0) { + //TODO(mengqiu) droppedFeature code below appears to be buggy, also + // would not work with featureVals + double rand = Math.random(); + if (rand < flags.inputDropOut) { + if ( ! 
droppedFeature) { + System.err.printf("Dropped feature %s%n", feature); + droppedFeature = true; + } + continue; + } + } + + int index = featureIndex.indexOf(feature); + if (index >= 0) { + data[j][k][m] = index; + m++; + } else { + // this is where we end up when we do feature threshold cutoffs + } + } + + if (m < data[j][k].length) { + int[] f = new int[m]; + System.arraycopy(data[j][k], 0, f, 0, m); + data[j][k] = f; + if (featureVals[j][k] != null) { + double[] fVal = new double[m]; + System.arraycopy(featureVals[j][k], 0, fVal, 0, m); + featureVals[j][k] = fVal; + } + } + } + + IN wi = document.get(j); + labels[j] = classIndex.indexOf(wi.get(CoreAnnotations.AnswerAnnotation.class)); + } + + if (flags.useReverse) { + Collections.reverse(document); + } + + if (flags.nonLinearCRF) { + data = transformDocData(data); + } + + return new Triple(data, labels, featureVals); + } + + private int[][][] transformDocData(int[][][] docData) { + int[][][] transData = new int[docData.length][][]; + for (int i = 0; i < docData.length; i++) { + transData[i] = new int[docData[i].length][]; + for (int j = 0; j < docData[i].length; j++) { + int[] cliqueFeatures = docData[i][j]; + transData[i][j] = new int[cliqueFeatures.length]; + for (int n = 0; n < cliqueFeatures.length; n++) { + int transFeatureIndex = -1; + if (j == 0) { + transFeatureIndex = nodeFeatureIndicesMap.indexOf(cliqueFeatures[n]); + if (transFeatureIndex == -1) + throw new RuntimeException("node cliqueFeatures[n]="+cliqueFeatures[n]+" not found, nodeFeatureIndicesMap.size="+nodeFeatureIndicesMap.size()); + } else { + transFeatureIndex = edgeFeatureIndicesMap.indexOf(cliqueFeatures[n]); + if (transFeatureIndex == -1) + throw new RuntimeException("edge cliqueFeatures[n]="+cliqueFeatures[n]+" not found, edgeFeatureIndicesMap.size="+edgeFeatureIndicesMap.size()); + } + transData[i][j][n] = transFeatureIndex; + } + } + } + return transData; + } + + public void printLabelInformation(String testFile, DocumentReaderAndWriter 
readerAndWriter) throws Exception { + ObjectBank> documents = makeObjectBankFromFile(testFile, readerAndWriter); + for (List document : documents) { + printLabelValue(document); + } + } + + public void printLabelValue(List document) { + + if (flags.useReverse) { + Collections.reverse(document); + } + + NumberFormat nf = new DecimalFormat(); + + List classes = new ArrayList(); + for (int i = 0; i < classIndex.size(); i++) { + classes.add(classIndex.get(i)); + } + String[] columnHeaders = classes.toArray(new String[classes.size()]); + + // System.err.println("docSize:"+docSize); + for (int j = 0; j < document.size(); j++) { + + System.out.println("--== " + document.get(j).get(CoreAnnotations.TextAnnotation.class) + " ==--"); + + List lines = new ArrayList(); + List rowHeaders = new ArrayList(); + List line = new ArrayList(); + + for (int p = 0; p < labelIndices.size(); p++) { + if (j + p >= document.size()) { + continue; + } + CRFDatum, CRFLabel> d = makeDatum(document, j + p, featureFactory); + + List> features = d.asFeatures(); + for (int k = p, fSize = features.size(); k < fSize; k++) { + Collection cliqueFeatures = features.get(k); + for (String feature : cliqueFeatures) { + int index = featureIndex.indexOf(feature); + if (index >= 0) { + // line.add(feature+"["+(-p)+"]"); + rowHeaders.add(feature + '[' + (-p) + ']'); + double[] values = new double[labelIndices.get(0).size()]; + for (CRFLabel label : labelIndices.get(k)) { + int[] l = label.getLabel(); + double v = weights[index][labelIndices.get(k).indexOf(label)]; + values[l[l.length - 1 - p]] += v; + } + for (double value : values) { + line.add(nf.format(value)); + } + lines.add(line.toArray(new String[line.size()])); + line = new ArrayList(); + } + } + } + // lines.add(Collections.emptyList()); + System.out.println(StringUtils.makeAsciiTable(lines.toArray(new String[lines.size()][0]), rowHeaders + .toArray(new String[rowHeaders.size()]), columnHeaders, 0, 1, true)); + System.out.println(); + } + // 
System.err.println(edu.stanford.nlp.util.StringUtils.join(lines,"\n")); + } + + if (flags.useReverse) { + Collections.reverse(document); + } + } + + /** + * Convert an ObjectBank to arrays of data features and labels. + * This version is used at training time. + * + * @return A Triple, where the first element is an int[][][][] representing the + * data, the second element is an int[][] representing the labels, and + * the third element is a double[][][][] representing the feature values + * which could be optionally left as null. + */ + public Triple documentsToDataAndLabels(Collection> documents) { + + // first index is the number of the document + // second index is position in the document also the index of the + // clique/factor table + // third index is the number of elements in the clique/window these features + // are for (starting with last element) + // fourth index is position of the feature in the array that holds them + // element in data[i][j][k][m] is the index of the mth feature occurring in + // position k of the jth clique of the ith document + // int[][][][] data = new int[documentsSize][][][]; + List data = new ArrayList(); + List featureVal = new ArrayList(); + + // first index is the number of the document + // second index is the position in the document + // element in labels[i][j] is the index of the correct label (if it exists) + // at position j in document i + // int[][] labels = new int[documentsSize][]; + List labels = new ArrayList(); + + int numDatums = 0; + + for (List doc : documents) { + Triple docTriple = documentToDataAndLabels(doc, true); + data.add(docTriple.first()); + labels.add(docTriple.second()); + if (flags.useEmbedding) + featureVal.add(docTriple.third()); + numDatums += doc.size(); + } + + System.err.println("numClasses: " + classIndex.size() + ' ' + classIndex); + System.err.println("numDocuments: " + data.size()); + System.err.println("numDatums: " + numDatums); + System.err.println("numFeatures: " + 
featureIndex.size()); + printFeatures(); + + double[][][][] featureValArr = null; + if (flags.useEmbedding) + featureValArr = featureVal.toArray(new double[data.size()][][][]); + + + return new Triple( + data.toArray(new int[data.size()][][][]), + labels.toArray(new int[labels.size()][]), + featureValArr); + } + + /** + * Convert an ObjectBank to corresponding collection of data features and + * labels. This version is used at test time. + * + * @return A List of pairs, one for each document, where the first element is + * an int[][][] representing the data and the second element is an + * int[] representing the labels. + */ + public List> documentsToDataAndLabelsList(Collection> documents) { + int numDatums = 0; + + List> docList = new ArrayList>(); + for (List doc : documents) { + Triple docTriple = documentToDataAndLabels(doc); + docList.add(docTriple); + numDatums += doc.size(); + } + + System.err.println("numClasses: " + classIndex.size() + ' ' + classIndex); + System.err.println("numDocuments: " + docList.size()); + System.err.println("numDatums: " + numDatums); + System.err.println("numFeatures: " + featureIndex.size()); + return docList; + } + + protected void printFeatures() { + if (flags.printFeatures == null) { + return; + } + try { + String enc = flags.inputEncoding; + if (flags.inputEncoding == null) { + System.err.println("flags.inputEncoding doesn't exist, Use UTF-8 as default"); + enc = "UTF-8"; + } + + PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("features-" + flags.printFeatures + + ".txt"), enc), true); + for (String feat : featureIndex) { + pw.println(feat); + } + pw.close(); + } catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + /** + * This routine builds the labelIndices which give the + * empirically legal label sequences (of length (order) at most + * windowSize) and the classIndex, which indexes + * known answer classes. 
+ * + * @param ob The training data: Read from an ObjectBank, each item in it is a + * {@code List}. + */ + protected void makeAnswerArraysAndTagIndex(Collection> ob) { + + Set[] featureIndices = new HashSet[windowSize]; + for (int i = 0; i < windowSize; i++) { + featureIndices[i] = Generics.newHashSet(); + } + + labelIndices = new ArrayList>(windowSize); + for (int i = 0; i < windowSize; i++) { + labelIndices.add(new HashIndex()); + } + + Index labelIndex = labelIndices.get(windowSize - 1); + + classIndex = new HashIndex(); + // classIndex.add("O"); + classIndex.add(flags.backgroundSymbol); + + Set[] seenBackgroundFeatures = new HashSet[2]; + seenBackgroundFeatures[0] = Generics.newHashSet(); + seenBackgroundFeatures[1] = Generics.newHashSet(); + + int wordCount = 0; + + for (List doc : ob) { + if (flags.useReverse) { + Collections.reverse(doc); + } + + int docSize = doc.size(); + // create the full set of labels in classIndex + // note: update to use addAll later + for (int j = 0; j < docSize; j++) { + wordCount++; + String ans = doc.get(j).get(CoreAnnotations.AnswerAnnotation.class); + if (ans == null || ans.equals("")) { + throw new IllegalArgumentException("Word " + wordCount + " (\"" + doc.get(j).get(CoreAnnotations.TextAnnotation.class) + "\") has a blank answer"); + } + classIndex.add(ans); + } + + for (int j = 0; j < docSize; j++) { + CRFDatum, CRFLabel> d = makeDatum(doc, j, featureFactory); + labelIndex.add(d.label()); + + List> features = d.asFeatures(); + for (int k = 0, fsize = features.size(); k < fsize; k++) { + Collection cliqueFeatures = features.get(k); + if (k < 2 && flags.removeBackgroundSingletonFeatures) { + String ans = doc.get(j).get(CoreAnnotations.AnswerAnnotation.class); + boolean background = ans.equals(flags.backgroundSymbol); + if (k == 1 && j > 0 && background) { + ans = doc.get(j - 1).get(CoreAnnotations.AnswerAnnotation.class); + background = ans.equals(flags.backgroundSymbol); + } + if (background) { + for (String f : 
cliqueFeatures) { + if (!featureIndices[k].contains(f)) { + if (seenBackgroundFeatures[k].contains(f)) { + seenBackgroundFeatures[k].remove(f); + featureIndices[k].add(f); + } else { + seenBackgroundFeatures[k].add(f); + } + } + } + } else { + seenBackgroundFeatures[k].removeAll(cliqueFeatures); + featureIndices[k].addAll(cliqueFeatures); + } + } else { + featureIndices[k].addAll(cliqueFeatures); + } + } + } + + if (flags.useReverse) { + Collections.reverse(doc); + } + } + + int numFeatures = 0; + for (int i = 0; i < windowSize; i++) { + numFeatures += featureIndices[i].size(); + } + + featureIndex = new HashIndex(); + map = new int[numFeatures]; + + for (int i = 0; i < windowSize; i++) { + Index featureIndexMap = new HashIndex(); + + featureIndex.addAll(featureIndices[i]); + for (String str : featureIndices[i]) { + int index = featureIndex.indexOf(str); + map[index] = i; + featureIndexMap.add(index); + } + if (i == 0) { + nodeFeatureIndicesMap = featureIndexMap; + System.err.println("setting nodeFeatureIndicesMap, size="+nodeFeatureIndicesMap.size()); + } else { + edgeFeatureIndicesMap = featureIndexMap; + System.err.println("setting edgeFeatureIndicesMap, size="+edgeFeatureIndicesMap.size()); + } + } + + if (flags.numOfFeatureSlices > 0) { + System.err.println("Taking " + flags.numOfFeatureSlices + " out of " + flags.totalFeatureSlice + " slices of node features for training"); + pruneNodeFeatureIndices(flags.totalFeatureSlice, flags.numOfFeatureSlices); + } + + if (flags.useObservedSequencesOnly) { + for (int i = 0, liSize = labelIndex.size(); i < liSize; i++) { + CRFLabel label = labelIndex.get(i); + for (int j = windowSize - 2; j >= 0; j--) { + label = label.getOneSmallerLabel(); + labelIndices.get(j).add(label); + } + } + } else { + for (int i = 0; i < labelIndices.size(); i++) { + labelIndices.set(i, allLabels(i + 1, classIndex)); + } + } + + if (VERBOSE) { + for (int i = 0, fiSize = featureIndex.size(); i < fiSize; i++) { + System.out.println(i + ": " + 
featureIndex.get(i)); + } + } + } + + protected static Index allLabels(int window, Index classIndex) { + int[] label = new int[window]; + // cdm july 2005: below array initialization isn't necessary: JLS (3rd ed.) + // 4.12.5 + // Arrays.fill(label, 0); + int numClasses = classIndex.size(); + Index labelIndex = new HashIndex(); + OUTER: while (true) { + CRFLabel l = new CRFLabel(label); + labelIndex.add(l); + int[] label1 = new int[window]; + System.arraycopy(label, 0, label1, 0, label.length); + label = label1; + for (int j = 0; j < label.length; j++) { + label[j]++; + if (label[j] >= numClasses) { + label[j] = 0; + if (j == label.length - 1) { + break OUTER; + } + } else { + break; + } + } + } + return labelIndex; + } + + /** + * Makes a CRFDatum by producing features and a label from input data at a + * specific position, using the provided factory. + * + * @param info The input data + * @param loc The position to build a datum at + * @param featureFactory The FeatureFactory to use to extract features + * @return The constructed CRFDatum + */ + public CRFDatum, CRFLabel> makeDatum(List info, int loc, + edu.stanford.nlp.sequences.FeatureFactory featureFactory) { + // pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol); // cdm: isn't this unnecessary, as this is how it's initialized in AbstractSequenceClassifier.reinit? + PaddedList pInfo = new PaddedList(info, pad); + + ArrayList> features = new ArrayList>(); + List featureVals = new ArrayList(); + + // for (int i = 0; i < windowSize; i++) { + // List featuresC = new ArrayList(); + // for (int j = 0; j < FeatureFactory.win[i].length; j++) { + // featuresC.addAll(featureFactory.features(info, loc, + // FeatureFactory.win[i][j])); + // } + // features.add(featuresC); + // } + + // todo [cdm Aug 2012]: Since getCliques returns all cliques within its bounds, can't the for loop here be eliminated? But my first attempt to removed failed to produce identical results.... 
+ Collection done = Generics.newHashSet(); + for (int i = 0; i < windowSize; i++) { + List featuresC = new ArrayList(); + List windowCliques = FeatureFactory.getCliques(i, 0); + windowCliques.removeAll(done); + done.addAll(windowCliques); + double[] featureValArr = null; + if (flags.useEmbedding && i == 0) {// only activated for node features + List embeddingList = new ArrayList(); + int concatEmbeddingLen = 0; + String currentWord = null; + for (int currLoc = loc-2; currLoc <= loc+2; currLoc++) { + double[] embedding = null; + if (currLoc >=0 && currLoc < info.size()) { + currentWord = info.get(loc).get(CoreAnnotations.TextAnnotation.class); + String word = currentWord.toLowerCase(); + word = word.replaceAll("(-)?\\d+(\\.\\d*)?", "0"); + if (embeddings.containsKey(word)) + embedding = embeddings.get(word); + else + embedding = embeddings.get("UNKNOWN"); + } else { + embedding = embeddings.get("PADDING"); + } + + for (int e = 0; e < embedding.length; e++) { + featuresC.add("EMBEDDING-(" + (currLoc-loc) + ")-" + e); + } + + if (flags.addCapitalFeatures) { + int numOfCapitalFeatures = 4; + double[] newEmbedding = new double[embedding.length + numOfCapitalFeatures]; + int currLen = embedding.length; + System.arraycopy(embedding, 0, newEmbedding, 0, currLen); + for (int e = 0; e < numOfCapitalFeatures; e++) + featuresC.add("CAPITAL-(" + (currLoc-loc) + ")-" + e); + + if (currLoc >=0 && currLoc < info.size()) { // skip PADDING + // check if word is all caps + if (currentWord.toUpperCase().equals(currentWord)) + newEmbedding[currLen] = 1; + else { + currLen += 1; + // check if word is all lower + if (currentWord.toLowerCase().equals(currentWord)) + newEmbedding[currLen] = 1; + else { + currLen += 1; + // check first letter cap + if (Character.isUpperCase(currentWord.charAt(0))) + newEmbedding[currLen] = 1; + else { + currLen += 1; + // check if at least one non-initial letter is cap + String remainder = currentWord.substring(1); + if 
(!remainder.toLowerCase().equals(remainder)) + newEmbedding[currLen] = 1; + } + } + } + } + embedding = newEmbedding; + } + + embeddingList.add(embedding); + concatEmbeddingLen += embedding.length; + } + double[] concatEmbedding = new double[concatEmbeddingLen]; + int currPos = 0; + for (double[] em: embeddingList) { + System.arraycopy(em, 0, concatEmbedding, currPos, em.length); + currPos += em.length; + } + + if (flags.prependEmbedding) { + int additionalFeatureCount = 0; + for (Clique c : windowCliques) { + Collection fCol = featureFactory.getCliqueFeatures(pInfo, loc, c); //todo useless copy because of typing reasons + featuresC.addAll(fCol); + additionalFeatureCount += fCol.size(); + } + featureValArr = new double[concatEmbedding.length + additionalFeatureCount]; + System.arraycopy(concatEmbedding, 0, featureValArr, 0, concatEmbedding.length); + Arrays.fill(featureValArr, concatEmbedding.length, featureValArr.length, 1.0); + } else { + featureValArr = concatEmbedding; + } + + if (flags.addBiasToEmbedding) { + featuresC.add("BIAS-FEATURE"); + double[] newFeatureValArr = new double[featureValArr.length + 1]; + System.arraycopy(featureValArr, 0, newFeatureValArr, 0, featureValArr.length); + newFeatureValArr[newFeatureValArr.length-1] = 1; + featureValArr = newFeatureValArr; + } + + } else { + for (Clique c : windowCliques) { + featuresC.addAll(featureFactory.getCliqueFeatures(pInfo, loc, c)); //todo useless copy because of typing reasons + } + } + // [This is the version that didn't work.] 
+ // List windowCliques = FeatureFactory.getCliques(windowSize - 1, 0); // -1 as <= in getCliques() + // List> features = new ArrayList>(windowCliques.size()); + // for (Clique c : windowCliques) { + // List featuresC = new ArrayList(featureFactory.getCliqueFeatures(pInfo, loc, c)); //todo useless copy because of typing reasons + features.add(featuresC); + featureVals.add(featureValArr); + } + + int[] labels = new int[windowSize]; + + for (int i = 0; i < windowSize; i++) { + String answer = pInfo.get(loc + i - windowSize + 1).get(CoreAnnotations.AnswerAnnotation.class); + labels[i] = classIndex.indexOf(answer); + } + + printFeatureLists(pInfo.get(loc), features); + + CRFDatum, CRFLabel> d = new CRFDatum, CRFLabel>(features, new CRFLabel(labels), featureVals); + // System.err.println(d); + return d; + } + + public static class TestSequenceModel implements SequenceModel { + + private final int window; + private final int numClasses; + // private final FactorTable[] factorTables; + private final CRFCliqueTree cliqueTree; + private final int[] tags; + private final int[] backgroundTag; + + // public Scorer(FactorTable[] factorTables) { + public TestSequenceModel(CRFCliqueTree cliqueTree) { + // this.factorTables = factorTables; + this.cliqueTree = cliqueTree; + // this.window = factorTables[0].windowSize(); + this.window = cliqueTree.window(); + // this.numClasses = factorTables[0].numClasses(); + this.numClasses = cliqueTree.getNumClasses(); + tags = new int[numClasses]; + for (int i = 0; i < tags.length; i++) { + tags[i] = i; + } + backgroundTag = new int[] { cliqueTree.backgroundIndex() }; + } + + @Override + public int length() { + return cliqueTree.length(); + } + + @Override + public int leftWindow() { + return window - 1; + } + + @Override + public int rightWindow() { + return 0; + } + + @Override + public int[] getPossibleValues(int pos) { + if (pos < window - 1) { + return backgroundTag; + } + return tags; + } + + /** + * Return the score of the proposed tags 
for position given. + * @param tags is an array indicating the assignment of labels to score. + * @param pos is the position to return a score for. + */ + @Override + public double scoreOf(int[] tags, int pos) { + int[] previous = new int[window - 1]; + int realPos = pos - window + 1; + for (int i = 0; i < window - 1; i++) { + previous[i] = tags[realPos + i]; + } + return cliqueTree.condLogProbGivenPrevious(realPos, tags[pos], previous); + } + + @Override + public double[] scoresOf(int[] tags, int pos) { + int realPos = pos - window + 1; + double[] scores = new double[numClasses]; + int[] previous = new int[window - 1]; + for (int i = 0; i < window - 1; i++) { + previous[i] = tags[realPos + i]; + } + for (int i = 0; i < numClasses; i++) { + scores[i] = cliqueTree.condLogProbGivenPrevious(realPos, i, previous); + } + return scores; + } + + @Override + public double scoreOf(int[] sequence) { + throw new UnsupportedOperationException(); + } + + } // end class TestSequenceModel + + @Override + public List classify(List document) { + if (flags.doGibbs) { + try { + return classifyGibbs(document); + } catch (Exception e) { + System.err.println("Error running testGibbs inference!"); + e.printStackTrace(); + return null; + } + } else if (flags.crfType.equalsIgnoreCase("maxent")) { + return classifyMaxEnt(document); + } else { + throw new RuntimeException("Unsupported inference type: " + flags.crfType); + } + } + + private List classify(List document, Triple documentDataAndLabels) { + if (flags.doGibbs) { + try { + return classifyGibbs(document, documentDataAndLabels); + } catch (Exception e) { + System.err.println("Error running testGibbs inference!"); + e.printStackTrace(); + return null; + } + } else if (flags.crfType.equalsIgnoreCase("maxent")) { + return classifyMaxEnt(document, documentDataAndLabels); + } else { + throw new RuntimeException("Unsupported inference type: " + flags.crfType); + } + } + + /** + * This method is supposed to be used by CRFClassifierEvaluator 
only, should not have global visibility + * The generic classifyAndWriteAnswers omits the second argument documentDataAndLables + */ + void classifyAndWriteAnswers(Collection> documents, + List> documentDataAndLabels, + PrintWriter printWriter, + DocumentReaderAndWriter readerAndWriter) throws IOException { + Timing timer = new Timing(); + + Counter entityTP = new ClassicCounter(); + Counter entityFP = new ClassicCounter(); + Counter entityFN = new ClassicCounter(); + boolean resultsCounted = true; + + int numWords = 0; + int numDocs = 0; + for (List doc : documents) { + classify(doc, documentDataAndLabels.get(numDocs)); + numWords += doc.size(); + writeAnswers(doc, printWriter, readerAndWriter); + resultsCounted = resultsCounted && countResults(doc, entityTP, entityFP, entityFN); + numDocs++; + } + long millis = timer.stop(); + double wordspersec = numWords / (((double) millis) / 1000); + NumberFormat nf = new DecimalFormat("0.00"); // easier way! + if (!flags.suppressTestDebug) + System.err.println(StringUtils.getShortClassName(this) + " tagged " + numWords + " words in " + numDocs + + " documents at " + nf.format(wordspersec) + " words per second."); + if (resultsCounted && !flags.suppressTestDebug) { + printResults(entityTP, entityFP, entityFN); + } + } + + @Override + public SequenceModel getSequenceModel(List doc) { + Triple p = documentToDataAndLabels(doc); + return getSequenceModel(doc, p); + } + + public SequenceModel getSequenceModel(List doc, Triple documentDataAndLabels) { + Triple p = documentDataAndLabels; + return new TestSequenceModel(getCliqueTree(p)); + } + + private CliquePotentialFunction getCliquePotentialFunction() { + if (cliquePotentialFunction == null) { + if (flags.nonLinearCRF) { + if (flags.secondOrderNonLinear) + cliquePotentialFunction = new NonLinearSecondOrderCliquePotentialFunction(inputLayerWeights4Edge, outputLayerWeights4Edge, inputLayerWeights, outputLayerWeights, flags); + else + cliquePotentialFunction = new 
NonLinearCliquePotentialFunction(linearWeights, inputLayerWeights, outputLayerWeights, flags); + } else { + cliquePotentialFunction = new LinearCliquePotentialFunction(weights); + } + } + return cliquePotentialFunction; + } + + public void updateWeights(double[] x) { + cliquePotentialFunction = cliquePotentialFunctionHelper.getCliquePotentialFunction(x); + } + + /** + * Do standard sequence inference, using either Viterbi or Beam inference + * depending on the value of flags.inferenceType. + * + * @param document + * Document to classify. Classification happens in place. This + * document is modified. + * @return The classified document + */ + public List classifyMaxEnt(List document) { + if (document.isEmpty()) { + return document; + } + + SequenceModel model = getSequenceModel(document); + return classifyMaxEnt(document, model); + } + + private List classifyMaxEnt(List document, Triple documentDataAndLabels) { + if (document.isEmpty()) { + return document; + } + SequenceModel model = getSequenceModel(document, documentDataAndLabels); + return classifyMaxEnt(document, model); + } + + private List classifyMaxEnt(List document, SequenceModel model) { + if (document.isEmpty()) { + return document; + } + + if (flags.inferenceType == null) { + flags.inferenceType = "Viterbi"; + } + + BestSequenceFinder tagInference; + if (flags.inferenceType.equalsIgnoreCase("Viterbi")) { + tagInference = new ExactBestSequenceFinder(); + } else if (flags.inferenceType.equalsIgnoreCase("Beam")) { + tagInference = new BeamBestSequenceFinder(flags.beamSize); + } else { + throw new RuntimeException("Unknown inference type: " + flags.inferenceType + ". 
Your options are Viterbi|Beam."); + } + + int[] bestSequence = tagInference.bestSequence(model); + + if (flags.useReverse) { + Collections.reverse(document); + } + for (int j = 0, docSize = document.size(); j < docSize; j++) { + IN wi = document.get(j); + String guess = classIndex.get(bestSequence[j + windowSize - 1]); + wi.set(CoreAnnotations.AnswerAnnotation.class, guess); + } + if (flags.useReverse) { + Collections.reverse(document); + } + return document; + } + + public List classifyGibbs(List document) throws ClassNotFoundException, SecurityException, + NoSuchMethodException, IllegalArgumentException, InstantiationException, IllegalAccessException, + InvocationTargetException { + Triple p = documentToDataAndLabels(document); + return classifyGibbs(document, p); + } + + public List classifyGibbs(List document, Triple documentDataAndLabels) + throws ClassNotFoundException, SecurityException, NoSuchMethodException, IllegalArgumentException, + InstantiationException, IllegalAccessException, InvocationTargetException { + // System.err.println("Testing using Gibbs sampling."); + Triple p = documentDataAndLabels; + List newDocument = document; // reversed if necessary + if (flags.useReverse) { + Collections.reverse(document); + newDocument = new ArrayList(document); + Collections.reverse(document); + } + + CRFCliqueTree cliqueTree = getCliqueTree(p); + + SequenceModel model = cliqueTree; + SequenceListener listener = cliqueTree; + + SequenceModel priorModel = null; + SequenceListener priorListener = null; + + if (flags.useNERPrior) { + EntityCachingAbstractSequencePrior prior = new EmpiricalNERPrior(flags.backgroundSymbol, classIndex, + newDocument); + // SamplingNERPrior prior = new SamplingNERPrior(flags.backgroundSymbol, + // classIndex, newDocument); + priorModel = prior; + priorListener = prior; + } else if (flags.useNERPriorBIO) { + /* + if (tagIndex == null) { + tagIndex = new HashIndex(); + for (String tag: classIndex.objectsList()) { + String[] parts = 
tag.split("-"); + if (parts.length > 1) + tagIndex.add(parts[parts.length-1]); + } + tagIndex.add(flags.backgroundSymbol); + } + if (entityMatrices == null) + entityMatrices = BisequenceEmpiricalNERPrior.readEntityMatrices(flags.entityMatrix, tagIndex); + */ + EntityCachingAbstractSequencePriorBIO prior = new EmpiricalNERPriorBIO(flags.backgroundSymbol, classIndex, tagIndex, newDocument, entityMatrices, flags); + priorModel = prior; + priorListener = prior; + } else if (flags.useAcqPrior) { + EntityCachingAbstractSequencePrior prior = new AcquisitionsPrior(flags.backgroundSymbol, classIndex, + newDocument); + priorModel = prior; + priorListener = prior; + } else if (flags.useSemPrior) { + EntityCachingAbstractSequencePrior prior = new SeminarsPrior(flags.backgroundSymbol, classIndex, + newDocument); + priorModel = prior; + priorListener = prior; + } else if (flags.useUniformPrior) { + // System.err.println("Using uniform prior!"); + UniformPrior uniPrior = new UniformPrior(flags.backgroundSymbol, classIndex, newDocument); + priorModel = uniPrior; + priorListener = uniPrior; + } else { + throw new RuntimeException("no prior specified"); + } + + model = new FactoredSequenceModel(model, priorModel); + listener = new FactoredSequenceListener(listener, priorListener); + + SequenceGibbsSampler sampler = new SequenceGibbsSampler(0, 0, listener); + int[] sequence = new int[cliqueTree.length()]; + + if (flags.initViterbi) { + TestSequenceModel testSequenceModel = new TestSequenceModel(cliqueTree); + ExactBestSequenceFinder tagInference = new ExactBestSequenceFinder(); + int[] bestSequence = tagInference.bestSequence(testSequenceModel); + System.arraycopy(bestSequence, windowSize - 1, sequence, 0, sequence.length); + } else { + int[] initialSequence = SequenceGibbsSampler.getRandomSequence(model); + System.arraycopy(initialSequence, 0, sequence, 0, sequence.length); + } + + sampler.verbose = 0; + + if (flags.annealingType.equalsIgnoreCase("linear")) { + sequence = 
sampler.findBestUsingAnnealing(model, CoolingSchedule.getLinearSchedule(1.0, flags.numSamples), + sequence); + } else if (flags.annealingType.equalsIgnoreCase("exp") || flags.annealingType.equalsIgnoreCase("exponential")) { + sequence = sampler.findBestUsingAnnealing(model, CoolingSchedule.getExponentialSchedule(1.0, flags.annealingRate, + flags.numSamples), sequence); + } else { + throw new RuntimeException("No annealing type specified"); + } + + // System.err.println(ArrayMath.toString(sequence)); + + if (flags.useReverse) { + Collections.reverse(document); + } + + for (int j = 0, dsize = newDocument.size(); j < dsize; j++) { + IN wi = document.get(j); + if (wi == null) throw new RuntimeException(""); + if (classIndex == null) throw new RuntimeException(""); + wi.set(CoreAnnotations.AnswerAnnotation.class, classIndex.get(sequence[j])); + } + + if (flags.useReverse) { + Collections.reverse(document); + } + + return document; + } + + /** + * + * @param sentence + * @param priorModels + * an array of prior models + * @param priorListeners + * an array of prior listeners + * @param modelWts + * an array of model weights: IMPORTANT: this includes the weight of + * CRF classifier as well at position 0, and therefore is longer than + * priorListeners/priorModels array by 1. 
+ * @return A list of INs with + * @throws ClassNotFoundException + * @throws SecurityException + * @throws NoSuchMethodException + * @throws IllegalArgumentException + * @throws InstantiationException + * @throws IllegalAccessException + * @throws InvocationTargetException + * TODO(mengqiu) refactor this method to re-use classifyGibbs + */ + public List classifyGibbsUsingPrior(List sentence, SequenceModel[] priorModels, + SequenceListener[] priorListeners, double[] modelWts) throws ClassNotFoundException, SecurityException, + NoSuchMethodException, IllegalArgumentException, InstantiationException, IllegalAccessException, + InvocationTargetException { + + if ((priorModels.length + 1) != modelWts.length) + throw new RuntimeException( + "modelWts array should be longer than the priorModels array by 1 unit since it also includes the weight of the CRF model at position 0."); + + // System.err.println("Testing using Gibbs sampling."); + Triple p = documentToDataAndLabels(sentence); + + List newDocument = sentence; // reversed if necessary + if (flags.useReverse) { + Collections.reverse(sentence); + newDocument = new ArrayList(sentence); + Collections.reverse(sentence); + } + + CRFCliqueTree cliqueTree = getCliqueTree(p); + + SequenceModel model = cliqueTree; + SequenceListener listener = cliqueTree; + + SequenceModel[] models = new SequenceModel[priorModels.length + 1]; + models[0] = model; + for (int i = 1; i < models.length; i++) + models[i] = priorModels[i - 1]; + model = new FactoredSequenceModel(models, modelWts); + + SequenceListener[] listeners = new SequenceListener[priorListeners.length + 1]; + listeners[0] = listener; + for (int i = 1; i < listeners.length; i++) + listeners[i] = priorListeners[i - 1]; + listener = new FactoredSequenceListener(listeners); + + SequenceGibbsSampler sampler = new SequenceGibbsSampler(0, 0, listener); + int[] sequence = new int[cliqueTree.length()]; + + if (flags.initViterbi) { + TestSequenceModel testSequenceModel = new 
TestSequenceModel(cliqueTree); + ExactBestSequenceFinder tagInference = new ExactBestSequenceFinder(); + int[] bestSequence = tagInference.bestSequence(testSequenceModel); + System.arraycopy(bestSequence, windowSize - 1, sequence, 0, sequence.length); + } else { + int[] initialSequence = SequenceGibbsSampler.getRandomSequence(model); + System.arraycopy(initialSequence, 0, sequence, 0, sequence.length); + } + + SequenceGibbsSampler.verbose = 0; + + if (flags.annealingType.equalsIgnoreCase("linear")) { + sequence = sampler.findBestUsingAnnealing(model, CoolingSchedule.getLinearSchedule(1.0, flags.numSamples), + sequence); + } else if (flags.annealingType.equalsIgnoreCase("exp") || flags.annealingType.equalsIgnoreCase("exponential")) { + sequence = sampler.findBestUsingAnnealing(model, CoolingSchedule.getExponentialSchedule(1.0, flags.annealingRate, + flags.numSamples), sequence); + } else { + throw new RuntimeException("No annealing type specified"); + } + + // System.err.println(ArrayMath.toString(sequence)); + + if (flags.useReverse) { + Collections.reverse(sentence); + } + + for (int j = 0, dsize = newDocument.size(); j < dsize; j++) { + IN wi = sentence.get(j); + if (wi == null) throw new RuntimeException(""); + if (classIndex == null) throw new RuntimeException(""); + wi.set(CoreAnnotations.AnswerAnnotation.class, classIndex.get(sequence[j])); + } + + if (flags.useReverse) { + Collections.reverse(sentence); + } + + return sentence; + } + + + //TODO(mengqiu) refactor this method to re-use classifyGibbs + public List classifyGibbsUsingPrior(List sentence, SequenceModel priorModel, SequenceListener priorListener, + double model1Wt, double model2Wt) throws ClassNotFoundException, SecurityException, NoSuchMethodException, + IllegalArgumentException, InstantiationException, IllegalAccessException, InvocationTargetException { + // System.err.println("Testing using Gibbs sampling."); + Triple p = documentToDataAndLabels(sentence); + List newDocument = sentence; // 
reversed if necessary + if (flags.useReverse) { + newDocument = new ArrayList(sentence); + Collections.reverse(newDocument); + } + + CRFCliqueTree cliqueTree = getCliqueTree(p); + + SequenceModel model = cliqueTree; + SequenceListener listener = cliqueTree; + + model = new FactoredSequenceModel(model, priorModel, model1Wt, model2Wt); + listener = new FactoredSequenceListener(listener, priorListener); + + SequenceGibbsSampler sampler = new SequenceGibbsSampler(0, 0, listener); + int[] sequence = new int[cliqueTree.length()]; + + if (flags.initViterbi) { + TestSequenceModel testSequenceModel = new TestSequenceModel(cliqueTree); + ExactBestSequenceFinder tagInference = new ExactBestSequenceFinder(); + int[] bestSequence = tagInference.bestSequence(testSequenceModel); + System.arraycopy(bestSequence, windowSize - 1, sequence, 0, sequence.length); + } else { + int[] initialSequence = SequenceGibbsSampler.getRandomSequence(model); + System.arraycopy(initialSequence, 0, sequence, 0, sequence.length); + } + + SequenceGibbsSampler.verbose = 0; + + if (flags.annealingType.equalsIgnoreCase("linear")) { + sequence = sampler.findBestUsingAnnealing(model, CoolingSchedule.getLinearSchedule(1.0, flags.numSamples), + sequence); + } else if (flags.annealingType.equalsIgnoreCase("exp") || flags.annealingType.equalsIgnoreCase("exponential")) { + sequence = sampler.findBestUsingAnnealing(model, CoolingSchedule.getExponentialSchedule(1.0, flags.annealingRate, + flags.numSamples), sequence); + } else { + throw new RuntimeException("No annealing type specified"); + } + + // System.err.println(ArrayMath.toString(sequence)); + + if (flags.useReverse) { + Collections.reverse(sentence); + } + + for (int j = 0, dsize = newDocument.size(); j < dsize; j++) { + IN wi = sentence.get(j); + if (wi == null) throw new RuntimeException(""); + if (classIndex == null) throw new RuntimeException(""); + wi.set(CoreAnnotations.AnswerAnnotation.class, classIndex.get(sequence[j])); + } + + if 
(flags.useReverse) { + Collections.reverse(sentence); + } + + return sentence; + } + + /** + * Takes a {@link List} of something that extends {@link CoreMap} and prints + * the likelihood of each possible label at each point. + * + * @param document + * A {@link List} of something that extends CoreMap. + */ + @Override + public void printProbsDocument(List document) { + + Triple p = documentToDataAndLabels(document); + + CRFCliqueTree cliqueTree = getCliqueTree(p); + + // for (int i = 0; i < factorTables.length; i++) { + for (int i = 0; i < cliqueTree.length(); i++) { + IN wi = document.get(i); + System.out.print(wi.get(CoreAnnotations.TextAnnotation.class) + '\t'); + for (Iterator iter = classIndex.iterator(); iter.hasNext();) { + String label = iter.next(); + int index = classIndex.indexOf(label); + // double prob = Math.pow(Math.E, factorTables[i].logProbEnd(index)); + double prob = cliqueTree.prob(i, index); + System.out.print(label + '=' + prob); + if (iter.hasNext()) { + System.out.print("\t"); + } else { + System.out.print("\n"); + } + } + } + } + + /** + * Takes the file, reads it in, and prints out the likelihood of each possible + * label at each point. This gives a simple way to examine the probability + * distributions of the CRF. See getCliqueTrees() for more. + * + * @param filename + * The path to the specified file + */ + public void printFirstOrderProbs(String filename, DocumentReaderAndWriter readerAndWriter) { + // only for the OCR data does this matter + flags.ocrTrain = false; + + ObjectBank> docs = makeObjectBankFromFile(filename, readerAndWriter); + printFirstOrderProbsDocuments(docs); + } + + /** + * Takes a {@link List} of documents and prints the likelihood of each + * possible label at each point. + * + * @param documents + * A {@link List} of {@link List} of INs. 
+ */ + public void printFirstOrderProbsDocuments(ObjectBank> documents) { + for (List doc : documents) { + printFirstOrderProbsDocument(doc); + System.out.println(); + } + } + + /** + * Want to make arbitrary probability queries? Then this is the method for + * you. Given the filename, it reads it in and breaks it into documents, and + * then makes a CRFCliqueTree for each document. you can then ask the clique + * tree for marginals and conditional probabilities of almost anything you + * want. + */ + public List> getCliqueTrees(String filename, DocumentReaderAndWriter readerAndWriter) { + // only for the OCR data does this matter + flags.ocrTrain = false; + + List> cts = new ArrayList>(); + ObjectBank> docs = makeObjectBankFromFile(filename, readerAndWriter); + for (List doc : docs) { + cts.add(getCliqueTree(doc)); + } + + return cts; + } + + public CRFCliqueTree getCliqueTree(Triple p) { + int[][][] data = p.first(); + double[][][] featureVal = p.third(); + + return CRFCliqueTree.getCalibratedCliqueTree(data, labelIndices, classIndex.size(), classIndex, + flags.backgroundSymbol, getCliquePotentialFunction(), featureVal); + } + + public CRFCliqueTree getCliqueTree(List document) { + Triple p = documentToDataAndLabels(document); + return getCliqueTree(p); + } + + /** + * Takes a {@link List} of something that extends {@link CoreMap} and prints + * the likelihood of each possible label at each point. + * + * @param document + * A {@link List} of something that extends {@link CoreMap}. 
+ */ + public void printFirstOrderProbsDocument(List document) { + + CRFCliqueTree cliqueTree = getCliqueTree(document); + + // for (int i = 0; i < factorTables.length; i++) { + for (int i = 0; i < cliqueTree.length(); i++) { + IN wi = document.get(i); + System.out.print(wi.get(CoreAnnotations.TextAnnotation.class) + '\t'); + for (Iterator iter = classIndex.iterator(); iter.hasNext();) { + String label = iter.next(); + int index = classIndex.indexOf(label); + if (i == 0) { + // double prob = Math.pow(Math.E, factorTables[i].logProbEnd(index)); + double prob = cliqueTree.prob(i, index); + System.out.print(label + '=' + prob); + if (iter.hasNext()) { + System.out.print("\t"); + } else { + System.out.print("\n"); + } + } else { + for (Iterator iter1 = classIndex.iterator(); iter1.hasNext();) { + String label1 = iter1.next(); + int index1 = classIndex.indexOf(label1); + // double prob = Math.pow(Math.E, factorTables[i].logProbEnd(new + // int[]{index1, index})); + double prob = cliqueTree.prob(i, new int[] { index1, index }); + System.out.print(label1 + '_' + label + '=' + prob); + if (iter.hasNext() || iter1.hasNext()) { + System.out.print("\t"); + } else { + System.out.print("\n"); + } + } + } + } + } + } + + /** + * Train a classifier from documents. 
+ * + * @param docs A Collection (perhaps ObjectBank) of documents + */ + @Override + public void train(Collection> docs, DocumentReaderAndWriter readerAndWriter) { + Timing timer = new Timing(); + timer.start(); + + if (flags.numOfSlices > 0) { + System.err.println("Taking " + flags.numOfSlices + " out of " + flags.totalDataSlice + " slices of data for training"); + List> docsToShuffle = new ArrayList>(); + for (List doc : docs) { + docsToShuffle.add(doc); + } + Collections.shuffle(docsToShuffle, random); + int cutOff = (int)(docsToShuffle.size() / (flags.totalDataSlice + 0.0) * flags.numOfSlices); + docs = docsToShuffle.subList(0, cutOff); + } + + makeAnswerArraysAndTagIndex(docs); + long elapsedMs = timer.stop(); + System.err.println("Time to convert docs to feature indices: " + Timing.toSecondsString(elapsedMs) + " seconds"); + if (flags.exportFeatures != null) { + timer.start(); + CRFFeatureExporter featureExporter = new CRFFeatureExporter(this); + featureExporter.printFeatures(flags.exportFeatures, docs); + elapsedMs = timer.stop(); + System.err.println("Time to export features: " + Timing.toSecondsString(elapsedMs) + " seconds"); + } + + for (int i = 0; i <= flags.numTimesPruneFeatures; i++) { + timer.start(); + Triple dataAndLabelsAndFeatureVals = documentsToDataAndLabels(docs); + elapsedMs = timer.stop(); + System.err.println("Time to convert docs to data/labels: " + Timing.toSecondsString(elapsedMs) + " seconds"); + + Evaluator[] evaluators = null; + if (flags.evaluateIters > 0 || flags.terminateOnEvalImprovement) { + List evaluatorList = new ArrayList(); + if (flags.useMemoryEvaluator) + evaluatorList.add(new MemoryEvaluator()); + if (flags.evaluateTrain) { + CRFClassifierEvaluator crfEvaluator = new CRFClassifierEvaluator("Train set", this); + List> trainDataAndLabels = new ArrayList>(); + int[][][][] data = dataAndLabelsAndFeatureVals.first(); + int[][] labels = dataAndLabelsAndFeatureVals.second(); + double[][][][] featureVal = 
dataAndLabelsAndFeatureVals.third(); + for (int j = 0; j < data.length; j++) { + Triple p = new Triple(data[j], labels[j], featureVal[j]); + trainDataAndLabels.add(p); + } + crfEvaluator.setTestData(docs, trainDataAndLabels); + if (flags.evalCmd.length() > 0) + crfEvaluator.setEvalCmd(flags.evalCmd); + evaluatorList.add(crfEvaluator); + } + if (flags.testFile != null) { + CRFClassifierEvaluator crfEvaluator = new CRFClassifierEvaluator("Test set (" + flags.testFile + ")", + this); + ObjectBank> testObjBank = makeObjectBankFromFile(flags.testFile, readerAndWriter); + List> testDocs = new ArrayList>(); + for (List doc : testObjBank) { + testDocs.add(doc); + } + List> testDataAndLabels = documentsToDataAndLabelsList(testDocs); + crfEvaluator.setTestData(testDocs, testDataAndLabels); + if (flags.evalCmd.length() > 0) + crfEvaluator.setEvalCmd(flags.evalCmd); + evaluatorList.add(crfEvaluator); + } + if (flags.testFiles != null) { + String[] testFiles = flags.testFiles.split(","); + for (String testFile : testFiles) { + CRFClassifierEvaluator crfEvaluator = new CRFClassifierEvaluator("Test set (" + + testFile + ")", this); + ObjectBank> testObjBank = makeObjectBankFromFile(testFile, readerAndWriter); + List> testDataAndLabels = documentsToDataAndLabelsList(testObjBank); + crfEvaluator.setTestData(testObjBank, testDataAndLabels); + if (flags.evalCmd.length() > 0) + crfEvaluator.setEvalCmd(flags.evalCmd); + evaluatorList.add(crfEvaluator); + } + } + evaluators = new Evaluator[evaluatorList.size()]; + evaluatorList.toArray(evaluators); + } + + if (flags.numTimesPruneFeatures == i) { + docs = null; // hopefully saves memory + } + // save feature index to disk and read in later + File featIndexFile = null; + + int numFeatures = featureIndex.size(); + if (flags.saveFeatureIndexToDisk) { + try { + System.err.println("Writing feature index to temporary file."); + featIndexFile = IOUtils.writeObjectToTempFile(featureIndex, "featIndex" + i + ".tmp"); + featureIndex = null; + } 
catch (IOException e) { + throw new RuntimeException("Could not open temporary feature index file for writing."); + } + } + + // first index is the number of the document + // second index is position in the document also the index of the + // clique/factor table + // third index is the number of elements in the clique/window thase + // features are for (starting with last element) + // fourth index is position of the feature in the array that holds them + // element in data[i][j][k][m] is the index of the mth feature occurring + // in position k of the jth clique of the ith document + int[][][][] data = dataAndLabelsAndFeatureVals.first(); + // first index is the number of the document + // second index is the position in the document + // element in labels[i][j] is the index of the correct label (if it + // exists) at position j in document i + int[][] labels = dataAndLabelsAndFeatureVals.second(); + double[][][][] featureVals = dataAndLabelsAndFeatureVals.third(); + + if (flags.loadProcessedData != null) { + List, String>>> processedData = loadProcessedData(flags.loadProcessedData); + if (processedData != null) { + // enlarge the data and labels array + int[][][][] allData = new int[data.length + processedData.size()][][][]; + double[][][][] allFeatureVals = new double[featureVals.length + processedData.size()][][][]; + int[][] allLabels = new int[labels.length + processedData.size()][]; + System.arraycopy(data, 0, allData, 0, data.length); + System.arraycopy(labels, 0, allLabels, 0, labels.length); + System.arraycopy(featureVals, 0, allFeatureVals, 0, featureVals.length); + // add to the data and labels array + addProcessedData(processedData, allData, allLabels, allFeatureVals, data.length); + data = allData; + labels = allLabels; + featureVals = allFeatureVals; + } + } + + if (flags.nonLinearCRF) { + if (flags.secondOrderNonLinear) { + CRFNonLinearSecondOrderLogConditionalObjectiveFunction func = new CRFNonLinearSecondOrderLogConditionalObjectiveFunction(data, 
labels, + windowSize, classIndex, labelIndices, map, flags, nodeFeatureIndicesMap.size(), edgeFeatureIndicesMap.size()); + cliquePotentialFunctionHelper = func; + + double[] allWeights = trainWeightsUsingNonLinearCRF(func, evaluators); + Quadruple params = func.separateWeights(allWeights); + this.inputLayerWeights4Edge = params.first(); + this.outputLayerWeights4Edge = params.second(); + this.inputLayerWeights = params.third(); + this.outputLayerWeights = params.fourth(); + System.err.println("Edge Output Layer Weights:"); + for (int ii = 0; ii < outputLayerWeights4Edge.length; ii++) { + System.err.print("[ "); + for (int jj = 0; jj < outputLayerWeights4Edge[ii].length; jj++) { + System.err.print(outputLayerWeights4Edge[ii][jj] + " "); + } + System.err.println("]"); + } + System.err.println("Node Output Layer Weights:"); + for (int ii = 0; ii < outputLayerWeights.length; ii++) { + System.err.print("[ "); + for (int jj = 0; jj < outputLayerWeights[ii].length; jj++) { + System.err.print(outputLayerWeights[ii][jj] + " "); + } + System.err.println("]"); + } + } else { + CRFNonLinearLogConditionalObjectiveFunction func = new CRFNonLinearLogConditionalObjectiveFunction(data, labels, + windowSize, classIndex, labelIndices, map, flags, nodeFeatureIndicesMap.size(), edgeFeatureIndicesMap.size(), featureVals); + cliquePotentialFunctionHelper = func; + + double[] allWeights = trainWeightsUsingNonLinearCRF(func, evaluators); + Triple params = func.separateWeights(allWeights); + this.linearWeights = params.first(); + this.inputLayerWeights = params.second(); + this.outputLayerWeights = params.third(); + if (flags.printWeights) { + System.err.println("Linear Layer Weights:"); + for (int ii = 0; ii < linearWeights.length; ii++) { + // for (int ii = 0; ii < Math.min(1, linearWeights.length); ii++) { + System.err.print("[ "); + for (int jj = 0; jj < linearWeights[ii].length; jj++) { + System.err.print(linearWeights[ii][jj] + " "); + } + System.err.println("]"); + } + 
System.err.println("Input Layer Weights:"); + for (int ii = 0; ii < inputLayerWeights.length; ii++) { + // for (int ii = 0; ii < Math.min(1, inputLayerWeights.length); ii++) { + System.err.print("[ "); + for (int jj = 0; jj < inputLayerWeights[ii].length; jj++) { + System.err.print(inputLayerWeights[ii][jj] + " "); + } + System.err.println("]"); + } + System.err.println("Output Layer Weights:"); + for (int ii = 0; ii < outputLayerWeights.length; ii++) { + System.err.print("[ "); + for (int jj = 0; jj < outputLayerWeights[ii].length; jj++) { + System.err.print(outputLayerWeights[ii][jj] + " "); + } + System.err.println("]"); + } + } + } + } else { + double[] oneDimWeights = null; + if (flags.useFloat) { + oneDimWeights = trainWeightsUsingFloatCRF(data, labels, i); + } else if (flags.numLopExpert > 1) { + oneDimWeights = trainWeightsUsingLopCRF(numFeatures, data, labels, evaluators, i); + } else { + oneDimWeights = trainWeightsUsingDoubleCRF(data, labels, evaluators, i, featureVals); + } + this.weights = CRFLogConditionalObjectiveFunction.to2D(oneDimWeights, labelIndices, map); + } + + // save feature index to disk and read in later + if (flags.saveFeatureIndexToDisk) { + try { + System.err.println("Reading temporary feature index file."); + featureIndex = (Index) IOUtils.readObjectFromFile(featIndexFile); + } catch (Exception e) { + throw new RuntimeException("Could not open temporary feature index file for reading."); + } + } + + if (i != flags.numTimesPruneFeatures) { + dropFeaturesBelowThreshold(flags.featureDiffThresh); + System.err.println("Removing features with weight below " + flags.featureDiffThresh + " and retraining..."); + } + } + } + + protected double[] trainWeightsUsingFloatCRF(int[][][][] data, int[][] labels, int pruneFeatureItr) { + CRFLogConditionalObjectiveFloatFunction func = new CRFLogConditionalObjectiveFloatFunction(data, labels, + featureIndex, windowSize, classIndex, labelIndices, map, flags.backgroundSymbol, flags.sigma); + 
cliquePotentialFunctionHelper = func; + + QNMinimizer minimizer; + if (flags.interimOutputFreq != 0) { + FloatFunction monitor = new ResultStoringFloatMonitor(flags.interimOutputFreq, flags.serializeTo); + minimizer = new QNMinimizer(monitor); + } else { + minimizer = new QNMinimizer(); + } + + if (pruneFeatureItr == 0) { + minimizer.setM(flags.QNsize); + } else { + minimizer.setM(flags.QNsize2); + } + + float[] initialWeights; + if (flags.initialWeights == null) { + initialWeights = func.initial(); + } else { + try { + System.err.println("Reading initial weights from file " + flags.initialWeights); + DataInputStream dis = new DataInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream( + flags.initialWeights)))); + initialWeights = ConvertByteArray.readFloatArr(dis); + } catch (IOException e) { + throw new RuntimeException("Could not read from float initial weight file " + flags.initialWeights); + } + } + System.err.println("numWeights: " + initialWeights.length); + float[] weights = minimizer.minimize(func, (float) flags.tolerance, initialWeights); + return ArrayMath.floatArrayToDoubleArray(weights); + } + + protected void pruneNodeFeatureIndices(int totalNumOfFeatureSlices, int numOfFeatureSlices) { + int numOfNodeFeatures = nodeFeatureIndicesMap.size(); + int beginIndex = 0; + int endIndex = Math.min( (int)(numOfNodeFeatures / (totalNumOfFeatureSlices+0.0) * numOfFeatureSlices), numOfNodeFeatures); + List nodeFeatureOriginalIndices = nodeFeatureIndicesMap.objectsList(); + List edgeFeatureOriginalIndices = edgeFeatureIndicesMap.objectsList(); + + Index newNodeFeatureIndex = new HashIndex(); + Index newEdgeFeatureIndex = new HashIndex(); + Index newFeatureIndex = new HashIndex(); + + for (int i = beginIndex; i < endIndex; i++) { + int oldIndex = nodeFeatureOriginalIndices.get(i); + String f = featureIndex.get(oldIndex); + int index = newFeatureIndex.indexOf(f, true); + newNodeFeatureIndex.add(index); + } + for (Integer edgeFIndex: 
edgeFeatureOriginalIndices) { + String f = featureIndex.get(edgeFIndex); + int index = newFeatureIndex.indexOf(f, true); + newEdgeFeatureIndex.add(index); + } + + nodeFeatureIndicesMap = newNodeFeatureIndex; + edgeFeatureIndicesMap = newEdgeFeatureIndex; + + int[] newMap = new int[newFeatureIndex.size()]; + for (int i = 0; i < newMap.length; i++) { + int index = featureIndex.indexOf(newFeatureIndex.get(i)); + newMap[i] = map[index]; + } + map = newMap; + + featureIndex = newFeatureIndex; + } + + /* + protected int[][][][] pruneNodeFeaturesBySlice(int totalNumOfFeatureSlices, int numOfFeatureSlices, int[][][][] data) { + + int[] oldFeatures = null; + int oldFeatureIndex = -1; + + ArrayList newFeatureList = new ArrayList(1000); + + int[][][][] newData = new int[data.length][][][]; + for (int m = 0; m < data.length; m++) { + newData[m] = new int[data[m].length][][]; + for (int i = 0; i < data[m].length; i++) { + newData[m][i] = new int[data[m][i].length][]; + for (int j = 0; j < data[m][i].length; m++) { + oldFeatures = data[m][i][j]; + newFeatureList.clear(); + for (int k = 0; k < oldFeatures.length; k++) { + oldFeatureIndex = oldFeatures[k]; + if (oldToNewFeatureIndexMap.containsKey(oldFeatureIndex)) { + newFeatureList.add(oldToNewFeatureIndexMap.get(oldFeatureIndex)); + } + } + newData[m][i][j] = new int[newFeatureList.size()]; + for (int k = 0; k < newFeatureList.size(); k++) { + newData[m][i][j][k] = newFeatureList.get(k); + } + } + } + } + + return newData; + } + */ + + protected int[][][][] createPartialDataForLOP(int lopIter, int[][][][] data) { + int[] oldFeatures = null; + int oldFeatureIndex = -1; + ArrayList newFeatureList = new ArrayList(1000); + Set featureIndicesSet = featureIndicesSetArray.get(lopIter); + + int[][][][] newData = new int[data.length][][][]; + for (int i = 0; i < data.length; i++) { + newData[i] = new int[data[i].length][][]; + for (int j = 0; j < data[i].length; j++) { + newData[i][j] = new int[data[i][j].length][]; + for (int k = 0; k 
< data[i][j].length; k++) { + oldFeatures = data[i][j][k]; + newFeatureList.clear(); + for (int l = 0; l < oldFeatures.length; l++) { + oldFeatureIndex = oldFeatures[l]; + if (featureIndicesSet.contains(oldFeatureIndex)) { + newFeatureList.add(oldFeatureIndex); + } + } + newData[i][j][k] = new int[newFeatureList.size()]; + for (int l = 0; l < newFeatureList.size(); ++l) { + newData[i][j][k][l] = newFeatureList.get(l); + } + } + } + } + + return newData; + } + + /* + protected int[][] getFeatureBoundaryIndices(int numFeatures, int numLopExpert) { + // first find begin/end feature index for each expert + int interval = numFeatures / numLopExpert; + int[] beginFeatureIndices = new int[numLopExpert]; + int[] begin1DParamIndices = new int[numLopExpert]; + int[] endFeatureIndices = new int[numLopExpert]; + int[] end1DParamIndices = new int[numLopExpert]; + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + int beginIndex = lopIter * interval; + int endIndex = (lopIter+1) * interval; + if (lopIter == numLopExpert - 1) { + endIndex = numFeatures; + } + int begin1D = 0; + for (int fIndex = 0; fIndex < beginIndex; fIndex++) { + begin1D += labelIndices[map[fIndex]].size(); + } + int end1D = begin1D; + for (int fIndex = beginIndex; fIndex < endIndex; fIndex++) { + end1D += labelIndices[map[fIndex]].size(); + } + beginFeatureIndices[lopIter] = beginIndex; + endFeatureIndices[lopIter] = endIndex; + begin1DParamIndices[lopIter] = begin1D; + end1DParamIndices[lopIter] = end1D; + } + int[][] boundaryIndices = new int[4][]; + boundaryIndices[0] = beginFeatureIndices; + boundaryIndices[1] = endFeatureIndices; + boundaryIndices[2] = begin1DParamIndices; + boundaryIndices[3] = end1DParamIndices; + return boundaryIndices; + } + */ + + protected void getFeatureBoundaryIndices(int numFeatures, int numLopExpert) { + // first find begin/end feature index for each expert + int interval = numFeatures / numLopExpert; + featureIndicesSetArray = new ArrayList>(numLopExpert); + 
featureIndicesListArray = new ArrayList>(numLopExpert); + for (int i = 0; i < numLopExpert; i++) { + featureIndicesSetArray.add(Generics.newHashSet(interval)); + featureIndicesListArray.add(Generics.newArrayList(interval)); + } + if (flags.randomLopFeatureSplit) { + for (int fIndex = 0; fIndex < numFeatures; fIndex++) { + int lopIter = random.nextInt(numLopExpert); + featureIndicesSetArray.get(lopIter).add(fIndex); + featureIndicesListArray.get(lopIter).add(fIndex); + } + } else { + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + int beginIndex = lopIter * interval; + int endIndex = (lopIter+1) * interval; + if (lopIter == numLopExpert - 1) { + endIndex = numFeatures; + } + for (int fIndex = beginIndex; fIndex < endIndex; fIndex++) { + featureIndicesSetArray.get(lopIter).add(fIndex); + featureIndicesListArray.get(lopIter).add(fIndex); + } + } + } + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + Collections.sort(featureIndicesListArray.get(lopIter)); + } + } + + protected double[] trainWeightsUsingLopCRF(int numFeatures, int[][][][] data, int[][] labels, Evaluator[] evaluators, int pruneFeatureItr) { + int numLopExpert = flags.numLopExpert; + double[][] lopExpertWeights = new double[numLopExpert][]; + + getFeatureBoundaryIndices(numFeatures, numLopExpert); + + if (flags.initialLopWeights != null) { + try { + System.err.println("Reading initial LOP weights from file " + flags.initialLopWeights + " ..."); + BufferedReader br = IOUtils.readerFromString(flags.initialLopWeights); + List listOfWeights = new ArrayList(numLopExpert); + for (String line; (line = br.readLine()) != null; ) { + line = line.trim(); + String[] parts = line.split("\t"); + double[] wArr = new double[parts.length]; + for (int i = 0; i < parts.length; i++) { + wArr[i] = Double.parseDouble(parts[i]); + } + listOfWeights.add(wArr); + } + assert(listOfWeights.size() == numLopExpert); + System.err.println("Done!"); + for (int i = 0; i < numLopExpert; i++) + lopExpertWeights[i] 
= listOfWeights.get(i); + // DataInputStream dis = new DataInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream( + // flags.initialLopWeights)))); + // initialScales = Convert.readDoubleArr(dis); + } catch (IOException e) { + throw new RuntimeException("Could not read from double initial LOP weights file " + flags.initialLopWeights); + } + } else { + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + int[][][][] partialData = createPartialDataForLOP(lopIter, data); + if (flags.randomLopWeights) { + lopExpertWeights[lopIter] = initWeightsUsingDoubleCRF(partialData, labels, evaluators, pruneFeatureItr); + } else { + lopExpertWeights[lopIter] = trainWeightsUsingDoubleCRF(partialData, labels, evaluators, pruneFeatureItr, null); + } + } + if (flags.includeFullCRFInLOP) { + double[][] newLopExpertWeights = new double[numLopExpert+1][]; + System.arraycopy(lopExpertWeights, 0, newLopExpertWeights, 0, lopExpertWeights.length); + if (flags.randomLopWeights) { + newLopExpertWeights[numLopExpert] = initWeightsUsingDoubleCRF(data, labels, evaluators, pruneFeatureItr); + } else { + newLopExpertWeights[numLopExpert] = trainWeightsUsingDoubleCRF(data, labels, evaluators, pruneFeatureItr, null); + } + + Set newSet = Generics.newHashSet(numFeatures); + List newList = new ArrayList(numFeatures); + for (int fIndex = 0; fIndex < numFeatures; fIndex++) { + newSet.add(fIndex); + newList.add(fIndex); + } + featureIndicesSetArray.add(newSet); + featureIndicesListArray.add(newList); + + numLopExpert += 1; + lopExpertWeights = newLopExpertWeights; + } + } + + // Dumb scales + // double[] lopScales = new double[numLopExpert]; + // Arrays.fill(lopScales, 1.0); + CRFLogConditionalObjectiveFunctionForLOP func = new CRFLogConditionalObjectiveFunctionForLOP(data, labels, lopExpertWeights, + windowSize, classIndex, labelIndices, map, flags.backgroundSymbol, numLopExpert, featureIndicesSetArray, featureIndicesListArray, + flags.backpropLopTraining); + 
cliquePotentialFunctionHelper = func; + + Minimizer minimizer = getMinimizer(0, evaluators); + + double[] initialScales; + //TODO(mengqiu) clean this part up when backpropLogTraining == true + if (flags.initialLopScales == null) { + initialScales = func.initial(); + } else { + try { + System.err.println("Reading initial LOP scales from file " + flags.initialLopScales); + DataInputStream dis = new DataInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream( + flags.initialLopScales)))); + initialScales = ConvertByteArray.readDoubleArr(dis); + } catch (IOException e) { + throw new RuntimeException("Could not read from double initial LOP scales file " + flags.initialLopScales); + } + } + + double[] learnedParams = minimizer.minimize(func, flags.tolerance, initialScales); + double[] rawScales = func.separateLopScales(learnedParams); + double[] lopScales = ArrayMath.softmax(rawScales); + System.err.println("After SoftMax Transformation, learned scales are:"); + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + System.err.println("lopScales[" + lopIter + "] = " + lopScales[lopIter]); + } + double[][] learnedLopExpertWeights = lopExpertWeights; + if (flags.backpropLopTraining) { + learnedLopExpertWeights = func.separateLopExpertWeights(learnedParams); + } + return CRFLogConditionalObjectiveFunctionForLOP.combineAndScaleLopWeights(numLopExpert, learnedLopExpertWeights, lopScales); + } + + protected double[] initWeightsUsingDoubleCRF(int[][][][] data, int[][] labels, Evaluator[] evaluators, int pruneFeatureItr) { + CRFLogConditionalObjectiveFunction func = new CRFLogConditionalObjectiveFunction(data, labels, windowSize, classIndex, + labelIndices, map, flags.priorType, flags.backgroundSymbol, flags.sigma, null); + return func.initial(); + } + + protected double[] trainWeightsUsingNonLinearCRF(AbstractCachingDiffFunction func, Evaluator[] evaluators) { + Minimizer minimizer = getMinimizer(0, evaluators); + + double[] initialWeights; + if 
(flags.initialWeights == null) { + initialWeights = func.initial(); + } else { + try { + System.err.println("Reading initial weights from file " + flags.initialWeights); + DataInputStream dis = new DataInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream( + flags.initialWeights)))); + initialWeights = ConvertByteArray.readDoubleArr(dis); + } catch (IOException e) { + throw new RuntimeException("Could not read from double initial weight file " + flags.initialWeights); + } + } + System.err.println("numWeights: " + initialWeights.length); + + if (flags.testObjFunction) { + StochasticDiffFunctionTester tester = new StochasticDiffFunctionTester(func); + if (tester.testSumOfBatches(initialWeights, 1e-4)) { + System.err.println("Testing complete... exiting"); + System.exit(1); + } else { + System.err.println("Testing failed....exiting"); + System.exit(1); + } + + } + //check gradient + if (flags.checkGradient) { + if (func.gradientCheck()) { + System.err.println("gradient check passed"); + } else { + throw new RuntimeException("gradient check failed"); + } + } + double[] w = minimizer.minimize(func, flags.tolerance, initialWeights); + return w; + } + + protected double[] trainWeightsUsingDoubleCRF(int[][][][] data, int[][] labels, Evaluator[] evaluators, int pruneFeatureItr, double[][][][] featureVals) { + + CRFLogConditionalObjectiveFunction func = new CRFLogConditionalObjectiveFunction(data, labels, + windowSize, classIndex, labelIndices, map, flags.priorType, flags.backgroundSymbol, flags.sigma, featureVals); + cliquePotentialFunctionHelper = func; + + Minimizer minimizer = getMinimizer(pruneFeatureItr, evaluators); + + double[] initialWeights; + if (flags.initialWeights == null) { + initialWeights = func.initial(); + } else { + try { + System.err.println("Reading initial weights from file " + flags.initialWeights); + DataInputStream dis = new DataInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream( + 
flags.initialWeights)))); + initialWeights = ConvertByteArray.readDoubleArr(dis); + } catch (IOException e) { + throw new RuntimeException("Could not read from double initial weight file " + flags.initialWeights); + } + } + System.err.println("numWeights: " + initialWeights.length); + + if (flags.testObjFunction) { + StochasticDiffFunctionTester tester = new StochasticDiffFunctionTester(func); + if (tester.testSumOfBatches(initialWeights, 1e-4)) { + System.err.println("Testing complete... exiting"); + System.exit(1); + } else { + System.err.println("Testing failed....exiting"); + System.exit(1); + } + + } + //check gradient + if (flags.checkGradient) { + if (func.gradientCheck()) { + System.err.println("gradient check passed"); + } else { + throw new RuntimeException("gradient check failed"); + } + } + double[] ws = minimizer.minimize(func, flags.tolerance, initialWeights); + if (flags.inputDropOut != 0.0) { + // scale the weights since they won't be dropped at test time + ArrayMath.multiplyInPlace(ws, 1.0/(1.0 - flags.inputDropOut)); + System.err.printf("Scaled weights by %f", 1.0/(1.0 - flags.inputDropOut)); + } + return ws; + } + + + protected Minimizer getMinimizer() { + return getMinimizer(0, null); + } + + protected Minimizer getMinimizer(int featurePruneIteration, Evaluator[] evaluators) { + Minimizer minimizer = null; + if (flags.useQN) { + int QNmem; + if (featurePruneIteration == 0) { + QNmem = flags.QNsize; + } else { + QNmem = flags.QNsize2; + } + + if (flags.interimOutputFreq != 0) { + Function monitor = new ResultStoringMonitor(flags.interimOutputFreq, flags.serializeTo); + minimizer = new QNMinimizer(monitor, QNmem, flags.useRobustQN); + } else { + minimizer = new QNMinimizer(QNmem, flags.useRobustQN); + } + + ((QNMinimizer) minimizer).terminateOnEvalImprovement(flags.terminateOnEvalImprovement); + ((QNMinimizer) minimizer).setTerminateOnEvalImprovementNumOfEpoch(flags.terminateOnEvalImprovementNumOfEpoch); + ((QNMinimizer) 
minimizer).suppressTestPrompt(flags.suppressTestDebug); + if (flags.useOWLQN) { + ((QNMinimizer) minimizer).useOWLQN(flags.useOWLQN, flags.priorL1Lambda); + } + } else if (flags.useInPlaceSGD) { + StochasticInPlaceMinimizer sgdMinimizer = + new StochasticInPlaceMinimizer(flags.sigma, flags.SGDPasses, flags.tuneSampleSize, flags.stochasticBatchSize); + if (flags.useSGDtoQN) { + QNMinimizer qnMinimizer; + int QNmem; + if (featurePruneIteration == 0) { + QNmem = flags.QNsize; + } else { + QNmem = flags.QNsize2; + } + if (flags.interimOutputFreq != 0) { + Function monitor = new ResultStoringMonitor(flags.interimOutputFreq, flags.serializeTo); + qnMinimizer = new QNMinimizer(monitor, QNmem, flags.useRobustQN); + } else { + qnMinimizer = new QNMinimizer(QNmem, flags.useRobustQN); + } + minimizer = new HybridMinimizer(sgdMinimizer, qnMinimizer, flags.SGDPasses); + } else { + minimizer = sgdMinimizer; + } + } else if (flags.useSGDtoQN) { + minimizer = new SGDToQNMinimizer(flags.initialGain, flags.stochasticBatchSize, + flags.SGDPasses, flags.QNPasses, flags.SGD2QNhessSamples, + flags.QNsize, flags.outputIterationsToFile); + } else if (flags.useSMD) { + minimizer = new SMDMinimizer(flags.initialGain, flags.stochasticBatchSize, flags.stochasticMethod, + flags.SGDPasses); + } else if (flags.useSGD) { + minimizer = new SGDMinimizer(flags.initialGain, flags.stochasticBatchSize); + } else if (flags.useScaledSGD) { + minimizer = new ScaledSGDMinimizer(flags.initialGain, flags.stochasticBatchSize, flags.SGDPasses, + flags.scaledSGDMethod); + } else if (flags.l1reg > 0.0) { + minimizer = ReflectionLoading.loadByReflection("edu.stanford.nlp.optimization.OWLQNMinimizer", flags.l1reg); + } + + if (minimizer instanceof HasEvaluators) { + ((HasEvaluators) minimizer).setEvaluators(flags.evaluateIters, evaluators); + } + if (minimizer == null) { + throw new RuntimeException("No minimizer assigned!"); + } + + return minimizer; + } + + /** + * Creates a new CRFDatum from the preprocessed 
allData format, given the + * document number, position number, and a List of Object labels. + * + * @return A new CRFDatum + */ + protected List, ? extends CharSequence>> extractDatumSequence(int[][][] allData, int beginPosition, int endPosition, + List labeledWordInfos) { + List, ? extends CharSequence>> result = new ArrayList, ? extends CharSequence>>(); + int beginContext = beginPosition - windowSize + 1; + if (beginContext < 0) { + beginContext = 0; + } + // for the beginning context, add some dummy datums with no features! + // TODO: is there any better way to do this? + for (int position = beginContext; position < beginPosition; position++) { + List> cliqueFeatures = new ArrayList>(); + List featureVals = new ArrayList(); + for (int i = 0; i < windowSize; i++) { + // create a feature list + cliqueFeatures.add(Collections.emptyList()); + featureVals.add(null); + } + CRFDatum, String> datum = new CRFDatum, String>(cliqueFeatures, + labeledWordInfos.get(position).get(CoreAnnotations.AnswerAnnotation.class), featureVals); + result.add(datum); + } + // now add the real datums + for (int position = beginPosition; position <= endPosition; position++) { + List> cliqueFeatures = new ArrayList>(); + List featureVals = new ArrayList(); + for (int i = 0; i < windowSize; i++) { + // create a feature list + Collection features = new ArrayList(); + for (int j = 0; j < allData[position][i].length; j++) { + features.add(featureIndex.get(allData[position][i][j])); + } + cliqueFeatures.add(features); + featureVals.add(null); + } + CRFDatum,String> datum = new CRFDatum,String>(cliqueFeatures, + labeledWordInfos.get(position).get(CoreAnnotations.AnswerAnnotation.class), featureVals); + result.add(datum); + } + return result; + } + + /** + * Adds the List of Lists of CRFDatums to the data and labels arrays, treating + * each datum as if it were its own document. 
Adds context labels in addition + * to the target label for each datum, meaning that for a particular document, + * the number of labels will be windowSize-1 greater than the number of + * datums. + * + * @param processedData + * a List of Lists of CRFDatums + */ + protected void addProcessedData(List, String>>> processedData, int[][][][] data, + int[][] labels, double[][][][] featureVals, int offset) { + for (int i = 0, pdSize = processedData.size(); i < pdSize; i++) { + int dataIndex = i + offset; + List, String>> document = processedData.get(i); + int dsize = document.size(); + labels[dataIndex] = new int[dsize]; + data[dataIndex] = new int[dsize][][]; + if (featureVals != null) + featureVals[dataIndex] = new double[dsize][][]; + for (int j = 0; j < dsize; j++) { + CRFDatum, String> crfDatum = document.get(j); + // add label, they are offset by extra context + labels[dataIndex][j] = classIndex.indexOf(crfDatum.label()); + // add featureVals + List featureValList = null; + if (featureVals != null) + featureValList = crfDatum.asFeatureVals(); + // add features + List> cliques = crfDatum.asFeatures(); + int csize = cliques.size(); + data[dataIndex][j] = new int[csize][]; + if (featureVals != null) + featureVals[dataIndex][j] = new double[csize][]; + for (int k = 0; k < csize; k++) { + Collection features = cliques.get(k); + + data[dataIndex][j][k] = new int[features.size()]; + if (featureVals != null) + featureVals[dataIndex][j][k] = featureValList.get(k); + + int m = 0; + try { + for (String feature : features) { + // System.err.println("feature " + feature); + // if (featureIndex.indexOf(feature)) ; + if (featureIndex == null) { + System.out.println("Feature is NULL!"); + } + data[dataIndex][j][k][m] = featureIndex.indexOf(feature); + m++; + } + } catch (Exception e) { + e.printStackTrace(); + System.err.printf("[index=%d, j=%d, k=%d, m=%d]\n", dataIndex, j, k, m); + System.err.println("data.length " + data.length); + System.err.println("data[dataIndex].length " 
+ data[dataIndex].length); + System.err.println("data[dataIndex][j].length " + data[dataIndex][j].length); + System.err.println("data[dataIndex][j][k].length " + data[dataIndex][j].length); + System.err.println("data[dataIndex][j][k][m] " + data[dataIndex][j][k][m]); + return; + } + } + } + } + } + + protected static void saveProcessedData(List datums, String filename) { + System.err.print("Saving processsed data of size " + datums.size() + " to serialized file..."); + ObjectOutputStream oos = null; + try { + oos = new ObjectOutputStream(new FileOutputStream(filename)); + oos.writeObject(datums); + } catch (IOException e) { + // do nothing + } finally { + IOUtils.closeIgnoringExceptions(oos); + } + System.err.println("done."); + } + + protected static List, String>>> loadProcessedData(String filename) { + System.err.print("Loading processed data from serialized file..."); + ObjectInputStream ois = null; + List, String>>> result = Collections.emptyList(); + try { + ois = new ObjectInputStream(new FileInputStream(filename)); + result = (List, String>>>) ois.readObject(); + } catch (Exception e) { + e.printStackTrace(); + } finally { + IOUtils.closeIgnoringExceptions(ois); + } + System.err.println("done. 
Got " + result.size() + " datums."); + return result; + } + + public void loadTextClassifier(String text, Properties props) throws ClassCastException, IOException, + ClassNotFoundException, InstantiationException, IllegalAccessException { + // System.err.println("DEBUG: in loadTextClassifier"); + System.err.println("Loading Text Classifier from " + text); + BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(text)))); + + String line = br.readLine(); + // first line should be this format: + // labelIndices.size()=\t%d + String[] toks = line.split("\\t"); + if (!toks[0].equals("labelIndices.length=")) { + throw new RuntimeException("format error"); + } + int size = Integer.parseInt(toks[1]); + labelIndices = new ArrayList>(size); + for (int labelIndicesIdx = 0; labelIndicesIdx < size; labelIndicesIdx++) { + line = br.readLine(); + // first line should be this format: + // labelIndices.length=\t%d + // labelIndices[0].size()=\t%d + toks = line.split("\\t"); + if (!(toks[0].startsWith("labelIndices[") && toks[0].endsWith("].size()="))) { + throw new RuntimeException("format error"); + } + int labelIndexSize = Integer.parseInt(toks[1]); + labelIndices.add(new HashIndex()); + int count = 0; + while (count < labelIndexSize) { + line = br.readLine(); + toks = line.split("\\t"); + int idx = Integer.parseInt(toks[0]); + if (count != idx) { + throw new RuntimeException("format error"); + } + + String[] crflabelstr = toks[1].split(" "); + int[] crflabel = new int[crflabelstr.length]; + for (int i = 0; i < crflabelstr.length; i++) { + crflabel[i] = Integer.parseInt(crflabelstr[i]); + } + CRFLabel crfL = new CRFLabel(crflabel); + + labelIndices.get(labelIndicesIdx).add(crfL); + count++; + } + } + + /**************************************/ + System.err.printf("DEBUG: labelIndices.length=\t%d\n", labelIndices.size()); + for (int i = 0; i < labelIndices.size(); i++) { + System.err.printf("DEBUG: labelIndices[%d].size()=\t%d\n", i, 
labelIndices.get(i).size()); + for (int j = 0; j < labelIndices.get(i).size(); j++) { + int[] label = labelIndices.get(i).get(j).getLabel(); + List list = new ArrayList(); + for (int l : label) { + list.add(l); + } + System.err.printf("DEBUG: %d\t%s\n", j, StringUtils.join(list, " ")); + } + } + /**************************************/ + + line = br.readLine(); + toks = line.split("\\t"); + if (!toks[0].equals("classIndex.size()=")) { + throw new RuntimeException("format error"); + } + int classIndexSize = Integer.parseInt(toks[1]); + classIndex = new HashIndex(); + int count = 0; + while (count < classIndexSize) { + line = br.readLine(); + toks = line.split("\\t"); + int idx = Integer.parseInt(toks[0]); + if (count != idx) { + throw new RuntimeException("format error"); + } + classIndex.add(toks[1]); + count++; + } + + /******************************************/ + System.err.printf("DEBUG: classIndex.size()=\t%d\n", classIndex.size()); + for (int i = 0; i < classIndex.size(); i++) { + System.err.printf("DEBUG: %d\t%s\n", i, classIndex.get(i)); + } + /******************************************/ + + line = br.readLine(); + toks = line.split("\\t"); + if (!toks[0].equals("featureIndex.size()=")) { + throw new RuntimeException("format error"); + } + int featureIndexSize = Integer.parseInt(toks[1]); + featureIndex = new HashIndex(); + count = 0; + while (count < featureIndexSize) { + line = br.readLine(); + toks = line.split("\\t"); + int idx = Integer.parseInt(toks[0]); + if (count != idx) { + throw new RuntimeException("format error"); + } + featureIndex.add(toks[1]); + count++; + } + + /***************************************/ + System.err.printf("DEBUG: featureIndex.size()=\t%d\n", featureIndex.size()); + /***************************************/ + + /* + for(int i = 0; i < featureIndex.size(); i++) { + System.err.printf("DEBUG: %d\t%s\n", i, featureIndex.get(i)); + } + */ + /***************************************/ + + line = br.readLine(); + if 
(!line.equals("")) { + throw new RuntimeException("format error"); + } + Properties p = new Properties(); + line = br.readLine(); + + while (!line.equals("")) { + // System.err.println("DEBUG: flags line: "+line); + String[] keyValue = line.split("="); + // System.err.printf("DEBUG: p.setProperty(%s,%s)\n", keyValue[0], + // keyValue[1]); + p.setProperty(keyValue[0], keyValue[1]); + line = br.readLine(); + } + + // System.err.println("DEBUG: out from flags"); + flags = new SeqClassifierFlags(p); + System.err.println("DEBUG: "); + System.err.print(flags.toString()); + System.err.println("DEBUG: "); + + if (flags.useEmbedding) { + line = br.readLine(); + toks = line.split("\\t"); + if (!toks[0].equals("embeddings.size()=")) { + throw new RuntimeException("format error in embeddings"); + } + int embeddingSize = Integer.parseInt(toks[1]); + embeddings = Generics.newHashMap(embeddingSize); + count = 0; + while (count < embeddingSize) { + line = br.readLine().trim(); + toks = line.split("\\t"); + String word = toks[0]; + double[] arr = ArrayUtils.toDoubleArray(toks[1].split(" ")); + embeddings.put(word, arr); + count++; + } + } + + if (flags.nonLinearCRF) { + line = br.readLine(); + toks = line.split("\\t"); + if (!toks[0].equals("nodeFeatureIndicesMap.size()=")) { + throw new RuntimeException("format error in nodeFeatureIndicesMap"); + } + int nodeFeatureIndicesMapSize = Integer.parseInt(toks[1]); + nodeFeatureIndicesMap = new HashIndex(); + count = 0; + while (count < nodeFeatureIndicesMapSize) { + line = br.readLine(); + toks = line.split("\\t"); + int idx = Integer.parseInt(toks[0]); + if (count != idx) { + throw new RuntimeException("format error"); + } + nodeFeatureIndicesMap.add(Integer.parseInt(toks[1])); + count++; + } + + /***************************************/ + System.err.printf("DEBUG: nodeFeatureIndicesMap.size()=\t%d\n", nodeFeatureIndicesMap.size()); + /***************************************/ + + line = br.readLine(); + toks = line.split("\\t"); + if 
(!toks[0].equals("edgeFeatureIndicesMap.size()=")) { + throw new RuntimeException("format error"); + } + int edgeFeatureIndicesMapSize = Integer.parseInt(toks[1]); + edgeFeatureIndicesMap = new HashIndex(); + count = 0; + while (count < edgeFeatureIndicesMapSize) { + line = br.readLine(); + toks = line.split("\\t"); + int idx = Integer.parseInt(toks[0]); + if (count != idx) { + throw new RuntimeException("format error"); + } + edgeFeatureIndicesMap.add(Integer.parseInt(toks[1])); + count++; + } + + /***************************************/ + System.err.printf("DEBUG: edgeFeatureIndicesMap.size()=\t%d\n", edgeFeatureIndicesMap.size()); + /***************************************/ + + int weightsLength = -1; + if (flags.secondOrderNonLinear) { + line = br.readLine(); + toks = line.split("\\t"); + if (!toks[0].equals("inputLayerWeights4Edge.length=")) { + throw new RuntimeException("format error"); + } + weightsLength = Integer.parseInt(toks[1]); + inputLayerWeights4Edge = new double[weightsLength][]; + count = 0; + while (count < weightsLength) { + line = br.readLine(); + + toks = line.split("\\t"); + int weights2Length = Integer.parseInt(toks[0]); + inputLayerWeights4Edge[count] = new double[weights2Length]; + String[] weightsValue = toks[1].split(" "); + if (weights2Length != weightsValue.length) { + throw new RuntimeException("weights format error"); + } + + for (int i2 = 0; i2 < weights2Length; i2++) { + inputLayerWeights4Edge[count][i2] = Double.parseDouble(weightsValue[i2]); + } + count++; + } + /***************************************/ + System.err.printf("DEBUG: double[%d][] inputLayerWeights4Edge loaded\n", weightsLength); + /***************************************/ + + line = br.readLine(); + + toks = line.split("\\t"); + if (!toks[0].equals("outputLayerWeights4Edge.length=")) { + throw new RuntimeException("format error"); + } + weightsLength = Integer.parseInt(toks[1]); + outputLayerWeights4Edge = new double[weightsLength][]; + count = 0; + while (count < 
weightsLength) { + line = br.readLine(); + + toks = line.split("\\t"); + int weights2Length = Integer.parseInt(toks[0]); + outputLayerWeights4Edge[count] = new double[weights2Length]; + String[] weightsValue = toks[1].split(" "); + if (weights2Length != weightsValue.length) { + throw new RuntimeException("weights format error"); + } + + for (int i2 = 0; i2 < weights2Length; i2++) { + outputLayerWeights4Edge[count][i2] = Double.parseDouble(weightsValue[i2]); + } + count++; + } + /***************************************/ + System.err.printf("DEBUG: double[%d][] outputLayerWeights loaded\n", weightsLength); + /***************************************/ + + } else { + line = br.readLine(); + + toks = line.split("\\t"); + if (!toks[0].equals("linearWeights.length=")) { + throw new RuntimeException("format error"); + } + weightsLength = Integer.parseInt(toks[1]); + linearWeights = new double[weightsLength][]; + count = 0; + while (count < weightsLength) { + line = br.readLine(); + + toks = line.split("\\t"); + int weights2Length = Integer.parseInt(toks[0]); + linearWeights[count] = new double[weights2Length]; + String[] weightsValue = toks[1].split(" "); + if (weights2Length != weightsValue.length) { + throw new RuntimeException("weights format error"); + } + + for (int i2 = 0; i2 < weights2Length; i2++) { + linearWeights[count][i2] = Double.parseDouble(weightsValue[i2]); + } + count++; + } + /***************************************/ + System.err.printf("DEBUG: double[%d][] linearWeights loaded\n", weightsLength); + /***************************************/ + } + + line = br.readLine(); + + toks = line.split("\\t"); + if (!toks[0].equals("inputLayerWeights.length=")) { + throw new RuntimeException("format error"); + } + weightsLength = Integer.parseInt(toks[1]); + inputLayerWeights = new double[weightsLength][]; + count = 0; + while (count < weightsLength) { + line = br.readLine(); + + toks = line.split("\\t"); + int weights2Length = Integer.parseInt(toks[0]); + 
inputLayerWeights[count] = new double[weights2Length]; + String[] weightsValue = toks[1].split(" "); + if (weights2Length != weightsValue.length) { + throw new RuntimeException("weights format error"); + } + + for (int i2 = 0; i2 < weights2Length; i2++) { + inputLayerWeights[count][i2] = Double.parseDouble(weightsValue[i2]); + } + count++; + } + /***************************************/ + System.err.printf("DEBUG: double[%d][] inputLayerWeights loaded\n", weightsLength); + /***************************************/ + + line = br.readLine(); + + toks = line.split("\\t"); + if (!toks[0].equals("outputLayerWeights.length=")) { + throw new RuntimeException("format error"); + } + weightsLength = Integer.parseInt(toks[1]); + outputLayerWeights = new double[weightsLength][]; + count = 0; + while (count < weightsLength) { + line = br.readLine(); + + toks = line.split("\\t"); + int weights2Length = Integer.parseInt(toks[0]); + outputLayerWeights[count] = new double[weights2Length]; + String[] weightsValue = toks[1].split(" "); + if (weights2Length != weightsValue.length) { + throw new RuntimeException("weights format error"); + } + + for (int i2 = 0; i2 < weights2Length; i2++) { + outputLayerWeights[count][i2] = Double.parseDouble(weightsValue[i2]); + } + count++; + } + /***************************************/ + System.err.printf("DEBUG: double[%d][] outputLayerWeights loaded\n", weightsLength); + /***************************************/ + } + + // + // edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory + // + line = br.readLine(); + + String[] featureFactoryName = line.split(" "); + if (!featureFactoryName[0].equals("") || !featureFactoryName[2].equals("")) { + throw new RuntimeException("format error"); + } + featureFactory = (edu.stanford.nlp.sequences.FeatureFactory) Class.forName(featureFactoryName[1]).newInstance(); + featureFactory.init(flags); + + reinit(); + + // 2 + line = br.readLine(); + + String[] windowSizeName = line.split(" "); + if 
(!windowSizeName[0].equals("") || !windowSizeName[2].equals("")) { + throw new RuntimeException("format error"); + } + windowSize = Integer.parseInt(windowSizeName[1]); + + // weights.length= 2655170 + line = br.readLine(); + + toks = line.split("\\t"); + if (!toks[0].equals("weights.length=")) { + throw new RuntimeException("format error"); + } + int weightsLength = Integer.parseInt(toks[1]); + weights = new double[weightsLength][]; + count = 0; + while (count < weightsLength) { + line = br.readLine(); + + toks = line.split("\\t"); + int weights2Length = Integer.parseInt(toks[0]); + weights[count] = new double[weights2Length]; + String[] weightsValue = toks[1].split(" "); + if (weights2Length != weightsValue.length) { + throw new RuntimeException("weights format error"); + } + + for (int i2 = 0; i2 < weights2Length; i2++) { + weights[count][i2] = Double.parseDouble(weightsValue[i2]); + } + count++; + } + System.err.printf("DEBUG: double[%d][] weights loaded\n", weightsLength); + line = br.readLine(); + + if (line != null) { + throw new RuntimeException("weights format error"); + } + } + + /** + * Serialize the model to a human readable format. It's not yet complete. It + * should now work for Chinese segmenter though. TODO: check things in + * serializeClassifier and add other necessary serialization back + * + * @param serializePath + * File to write text format of classifier to. 
+ */ + public void serializeTextClassifier(String serializePath) { + System.err.print("Serializing Text classifier to " + serializePath + "..."); + try { + PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(serializePath))); + + pw.printf("labelIndices.length=\t%d\n", labelIndices.size()); + for (int i = 0; i < labelIndices.size(); i++) { + pw.printf("labelIndices[%d].size()=\t%d\n", i, labelIndices.get(i).size()); + for (int j = 0; j < labelIndices.get(i).size(); j++) { + int[] label = labelIndices.get(i).get(j).getLabel(); + List list = new ArrayList(); + for (int l : label) { + list.add(l); + } + pw.printf("%d\t%s\n", j, StringUtils.join(list, " ")); + } + } + + pw.printf("classIndex.size()=\t%d\n", classIndex.size()); + for (int i = 0; i < classIndex.size(); i++) { + pw.printf("%d\t%s\n", i, classIndex.get(i)); + } + // pw.printf("\n"); + + pw.printf("featureIndex.size()=\t%d\n", featureIndex.size()); + for (int i = 0; i < featureIndex.size(); i++) { + pw.printf("%d\t%s\n", i, featureIndex.get(i)); + } + // pw.printf("\n"); + + pw.println(""); + pw.print(flags.toString()); + pw.println(""); + + if (flags.useEmbedding) { + pw.printf("embeddings.size()=\t%d\n", embeddings.size()); + for (String word: embeddings.keySet()) { + double[] arr = embeddings.get(word); + Double[] arrUnboxed = new Double[arr.length]; + for(int i = 0; i < arr.length; i++) + arrUnboxed[i] = arr[i]; + pw.printf("%s\t%s\n", word, StringUtils.join(arrUnboxed, " ")); + } + } + + if (flags.nonLinearCRF) { + pw.printf("nodeFeatureIndicesMap.size()=\t%d\n", nodeFeatureIndicesMap.size()); + for (int i = 0; i < nodeFeatureIndicesMap.size(); i++) { + pw.printf("%d\t%d\n", i, nodeFeatureIndicesMap.get(i)); + } + + pw.printf("edgeFeatureIndicesMap.size()=\t%d\n", edgeFeatureIndicesMap.size()); + for (int i = 0; i < edgeFeatureIndicesMap.size(); i++) { + pw.printf("%d\t%d\n", i, edgeFeatureIndicesMap.get(i)); + } + + if (flags.secondOrderNonLinear) { + 
pw.printf("inputLayerWeights4Edge.length=\t%d\n", inputLayerWeights4Edge.length); + for (double[] ws : inputLayerWeights4Edge) { + ArrayList list = new ArrayList(); + for (double w : ws) { + list.add(w); + } + pw.printf("%d\t%s\n", ws.length, StringUtils.join(list, " ")); + } + pw.printf("outputLayerWeights4Edge.length=\t%d\n", outputLayerWeights4Edge.length); + for (double[] ws : outputLayerWeights4Edge) { + ArrayList list = new ArrayList(); + for (double w : ws) { + list.add(w); + } + pw.printf("%d\t%s\n", ws.length, StringUtils.join(list, " ")); + } + } else { + pw.printf("linearWeights.length=\t%d\n", linearWeights.length); + for (double[] ws : linearWeights) { + ArrayList list = new ArrayList(); + for (double w : ws) { + list.add(w); + } + pw.printf("%d\t%s\n", ws.length, StringUtils.join(list, " ")); + } + } + pw.printf("inputLayerWeights.length=\t%d\n", inputLayerWeights.length); + for (double[] ws : inputLayerWeights) { + ArrayList list = new ArrayList(); + for (double w : ws) { + list.add(w); + } + pw.printf("%d\t%s\n", ws.length, StringUtils.join(list, " ")); + } + pw.printf("outputLayerWeights.length=\t%d\n", outputLayerWeights.length); + for (double[] ws : outputLayerWeights) { + ArrayList list = new ArrayList(); + for (double w : ws) { + list.add(w); + } + pw.printf("%d\t%s\n", ws.length, StringUtils.join(list, " ")); + } + } + + + pw.printf(" %s \n", featureFactory.getClass().getName()); + + pw.printf(" %d \n", windowSize); + + pw.printf("weights.length=\t%d\n", weights.length); + for (double[] ws : weights) { + ArrayList list = new ArrayList(); + for (double w : ws) { + list.add(w); + } + pw.printf("%d\t%s\n", ws.length, StringUtils.join(list, " ")); + } + + pw.close(); + System.err.println("done."); + + } catch (Exception e) { + System.err.println("Failed"); + e.printStackTrace(); + // don't actually exit in case they're testing too + // System.exit(1); + } + } + + /** + * {@inheritDoc} + */ + @Override + public void serializeClassifier(String 
serializePath) { + System.err.print("Serializing classifier to " + serializePath + "..."); + + ObjectOutputStream oos = null; + try { + oos = IOUtils.writeStreamFromString(serializePath); + + oos.writeObject(labelIndices); + oos.writeObject(classIndex); + oos.writeObject(featureIndex); + oos.writeObject(flags); + if (flags.useEmbedding) + oos.writeObject(embeddings); + if (flags.nonLinearCRF) { + oos.writeObject(nodeFeatureIndicesMap); + oos.writeObject(edgeFeatureIndicesMap); + if (flags.secondOrderNonLinear) { + oos.writeObject(inputLayerWeights4Edge); + oos.writeObject(outputLayerWeights4Edge); + } else { + oos.writeObject(linearWeights); + } + oos.writeObject(inputLayerWeights); + oos.writeObject(outputLayerWeights); + } + oos.writeObject(featureFactory); + oos.writeInt(windowSize); + oos.writeObject(weights); + // oos.writeObject(WordShapeClassifier.getKnownLowerCaseWords()); + + oos.writeObject(knownLCWords); + + System.err.println("done."); + + } catch (Exception e) { + System.err.println("Failed"); + e.printStackTrace(); + // don't actually exit in case they're testing too + // System.exit(1); + } finally { + IOUtils.closeIgnoringExceptions(oos); + } + } + + /** + * Loads a classifier from the specified InputStream. This version works + * quietly (unless VERBOSE is true). If props is non-null then any properties + * it specifies override those in the serialized file. However, only some + * properties are sensible to change (you shouldn't change how features are + * defined). + *

    + * Note: This method does not close the ObjectInputStream. (But earlier + * versions of the code used to, so beware....) + */ + @Override + @SuppressWarnings( { "unchecked" }) + // can't have right types in deserialization + public void loadClassifier(ObjectInputStream ois, Properties props) throws ClassCastException, IOException, + ClassNotFoundException { + Object o = ois.readObject(); + // TODO: when we next break serialization, get rid of this fork and only read the List + if (o instanceof List) { + labelIndices = (List>) o; + } else { + Index[] indexArray = (Index[]) o; + labelIndices = new ArrayList>(indexArray.length); + for (int i = 0; i < indexArray.length; ++i) { + labelIndices.add(indexArray[i]); + } + } + classIndex = (Index) ois.readObject(); + featureIndex = (Index) ois.readObject(); + flags = (SeqClassifierFlags) ois.readObject(); + if (flags.useEmbedding) { + embeddings = (Map) ois.readObject(); + } + if (flags.nonLinearCRF) { + nodeFeatureIndicesMap = (Index) ois.readObject(); + edgeFeatureIndicesMap = (Index) ois.readObject(); + if (flags.secondOrderNonLinear) { + inputLayerWeights4Edge = (double[][]) ois.readObject(); + outputLayerWeights4Edge = (double[][]) ois.readObject(); + } else { + linearWeights = (double[][]) ois.readObject(); + } + inputLayerWeights = (double[][]) ois.readObject(); + outputLayerWeights = (double[][]) ois.readObject(); + } + featureFactory = (edu.stanford.nlp.sequences.FeatureFactory) ois.readObject(); + + if (props != null) { + flags.setProperties(props, false); + } + reinit(); + + windowSize = ois.readInt(); + weights = (double[][]) ois.readObject(); + + // WordShapeClassifier.setKnownLowerCaseWords((Set) ois.readObject()); + knownLCWords = (Set) ois.readObject(); + + if (VERBOSE) { + System.err.println("windowSize=" + windowSize); + System.err.println("flags=\n" + flags); + } + } + + /** + * This is used to load the default supplied classifier stored within the jar + * file. 
THIS FUNCTION WILL ONLY WORK IF THE CODE WAS LOADED FROM A JAR FILE + * WHICH HAS A SERIALIZED CLASSIFIER STORED INSIDE IT. + */ + public void loadDefaultClassifier() { + loadJarClassifier(DEFAULT_CLASSIFIER, null); + } + + public void loadTagIndex() { + if (flags.useNERPriorBIO) { + if (tagIndex == null) { + tagIndex = new HashIndex(); + for (String tag: classIndex.objectsList()) { + String[] parts = tag.split("-"); + if (parts.length > 1) + tagIndex.add(parts[parts.length-1]); + } + tagIndex.add(flags.backgroundSymbol); + } + if (entityMatrices == null) + entityMatrices = BisequenceEmpiricalNERPrior.readEntityMatrices(flags.entityMatrix, tagIndex); + } + } + + /** + * This is used to load the default supplied classifier stored within the jar + * file. THIS FUNCTION WILL ONLY WORK IF THE CODE WAS LOADED FROM A JAR FILE + * WHICH HAS A SERIALIZED CLASSIFIER STORED INSIDE IT. + */ + public void loadDefaultClassifier(Properties props) { + loadJarClassifier(DEFAULT_CLASSIFIER, props); + } + + /** + * Used to get the default supplied classifier inside the jar file. THIS + * FUNCTION WILL ONLY WORK IF THE CODE WAS LOADED FROM A JAR FILE WHICH HAS A + * SERIALIZED CLASSIFIER STORED INSIDE IT. + * + * @return The default CRFClassifier in the jar file (if there is one) + */ + public static CRFClassifier getDefaultClassifier() { + CRFClassifier crf = new CRFClassifier(); + crf.loadDefaultClassifier(); + return crf; + } + + /** + * Used to get the default supplied classifier inside the jar file. THIS + * FUNCTION WILL ONLY WORK IF THE CODE WAS LOADED FROM A JAR FILE WHICH HAS A + * SERIALIZED CLASSIFIER STORED INSIDE IT. + * + * @return The default CRFClassifier in the jar file (if there is one) + */ + public static CRFClassifier getDefaultClassifier(Properties props) { + CRFClassifier crf = new CRFClassifier(); + crf.loadDefaultClassifier(props); + return crf; + } + + /** + * Used to load a classifier stored as a resource inside a jar file. 
THIS + * FUNCTION WILL ONLY WORK IF THE CODE WAS LOADED FROM A JAR FILE WHICH HAS A + * SERIALIZED CLASSIFIER STORED INSIDE IT. + * + * @param resourceName + * Name of clasifier resource inside the jar file. + * @return A CRFClassifier stored in the jar file + */ + public static CRFClassifier getJarClassifier(String resourceName, Properties props) { + CRFClassifier crf = new CRFClassifier(); + crf.loadJarClassifier(resourceName, props); + return crf; + } + + /** + * Loads a CRF classifier from a filepath, and returns it. + * + * @param file + * File to load classifier from + * @return The CRF classifier + * + * @throws IOException + * If there are problems accessing the input stream + * @throws ClassCastException + * If there are problems interpreting the serialized data + * @throws ClassNotFoundException + * If there are problems interpreting the serialized data + */ + public static CRFClassifier getClassifier(File file) throws IOException, ClassCastException, + ClassNotFoundException { + CRFClassifier crf = new CRFClassifier(); + crf.loadClassifier(file); + return crf; + } + + /** + * Loads a CRF classifier from an InputStream, and returns it. This method + * does not buffer the InputStream, so you should have buffered it before + * calling this method. 
+ * + * @param in + * InputStream to load classifier from + * @return The CRF classifier + * + * @throws IOException + * If there are problems accessing the input stream + * @throws ClassCastException + * If there are problems interpreting the serialized data + * @throws ClassNotFoundException + * If there are problems interpreting the serialized data + */ + public static CRFClassifier getClassifier(InputStream in) throws IOException, ClassCastException, + ClassNotFoundException { + CRFClassifier crf = new CRFClassifier(); + crf.loadClassifier(in); + return crf; + } + + public static CRFClassifier getClassifierNoExceptions(String loadPath) { + CRFClassifier crf = new CRFClassifier(); + crf.loadClassifierNoExceptions(loadPath); + return crf; + } + + public static CRFClassifier getClassifier(String loadPath) throws IOException, ClassCastException, + ClassNotFoundException { + CRFClassifier crf = new CRFClassifier(); + crf.loadClassifier(loadPath); + return crf; + } + + public static CRFClassifier getClassifier(String loadPath, Properties props) throws IOException, ClassCastException, + ClassNotFoundException { + CRFClassifier crf = new CRFClassifier(); + crf.loadClassifier(loadPath, props); + return crf; + } + + /** The main method. See the class documentation. 
*/ + public static void main(String[] args) throws Exception { + StringUtils.printErrInvocationString("CRFClassifier", args); + + Properties props = StringUtils.argsToProperties(args); + CRFClassifier crf = new CRFClassifier(props); + String testFile = crf.flags.testFile; + String testFiles = crf.flags.testFiles; + String textFile = crf.flags.textFile; + String textFiles = crf.flags.textFiles; + String loadPath = crf.flags.loadClassifier; + String loadTextPath = crf.flags.loadTextClassifier; + String serializeTo = crf.flags.serializeTo; + String serializeToText = crf.flags.serializeToText; + + if (crf.flags.useEmbedding && crf.flags.embeddingWords != null && crf.flags.embeddingVectors != null) { + System.err.println("Reading Embedding Files"); + BufferedReader br = IOUtils.readerFromString(crf.flags.embeddingWords); + String line = null; + List wordList = new ArrayList(); + while ((line = br.readLine()) != null) { + wordList.add(line.trim()); + } + System.err.println("Found a dictionary of size " + wordList.size()); + br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(crf.flags.embeddingVectors)))); + crf.embeddings = Generics.newHashMap(); + double[] vector = null; + int count = 0; + while ((line = br.readLine()) != null) { + vector = ArrayUtils.toDoubleArray(line.trim().split(" ")); + crf.embeddings.put(wordList.get(count++), vector); + } + System.err.println("Found " + count + " matching embeddings of dimension " + vector.length); + } + + if (loadPath != null) { + crf.loadClassifierNoExceptions(loadPath, props); + } else if (loadTextPath != null) { + System.err.println("Warning: this is now only tested for Chinese Segmenter"); + System.err.println("(Sun Dec 23 00:59:39 2007) (pichuan)"); + try { + crf.loadTextClassifier(loadTextPath, props); + // System.err.println("DEBUG: out from crf.loadTextClassifier"); + } catch (Exception e) { + throw new RuntimeException("error loading " + loadTextPath, e); + } + } else if 
(crf.flags.loadJarClassifier != null) { + crf.loadJarClassifier(crf.flags.loadJarClassifier, props); + } else if (crf.flags.trainFile != null || crf.flags.trainFileList != null) { + crf.train(); + } else { + crf.loadDefaultClassifier(); + } + + crf.loadTagIndex(); + + // System.err.println("Using " + crf.flags.featureFactory); + // System.err.println("Using " + + // StringUtils.getShortClassName(crf.readerAndWriter)); + + if (serializeTo != null) { + crf.serializeClassifier(serializeTo); + } + + if (serializeToText != null) { + crf.serializeTextClassifier(serializeToText); + } + + if (testFile != null) { + DocumentReaderAndWriter readerAndWriter = crf.defaultReaderAndWriter(); + if (crf.flags.searchGraphPrefix != null) { + crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix, crf.makeReaderAndWriter()); + } else if (crf.flags.printFirstOrderProbs) { + crf.printFirstOrderProbs(testFile, readerAndWriter); + } else if (crf.flags.printProbs) { + crf.printProbs(testFile, readerAndWriter); + } else if (crf.flags.useKBest) { + int k = crf.flags.kBest; + crf.classifyAndWriteAnswersKBest(testFile, k, readerAndWriter); + } else if (crf.flags.printLabelValue) { + crf.printLabelInformation(testFile, readerAndWriter); + } else { + crf.classifyAndWriteAnswers(testFile, readerAndWriter); + } + } + + if (testFiles != null) { + List files = new ArrayList(); + for (String filename : testFiles.split(",")) { + files.add(new File(filename)); + } + crf.classifyAndWriteAnswers(files, crf.defaultReaderAndWriter()); + } + + if (textFile != null) { + crf.classifyAndWriteAnswers(textFile); + } + + if (textFiles != null) { + List files = new ArrayList(); + for (String filename : textFiles.split(",")) { + files.add(new File(filename)); + } + crf.classifyAndWriteAnswers(files); + } + + if (crf.flags.readStdin) { + crf.classifyStdin(); + } + } // end main + + @Override + public List classifyWithGlobalInformation(List tokenSeq, final CoreMap doc, final CoreMap sent) { + 
return classify(tokenSeq); + } + + public void writeWeights(PrintStream p) { + for (String feature : featureIndex) { + int index = featureIndex.indexOf(feature); + // line.add(feature+"["+(-p)+"]"); + // rowHeaders.add(feature + '[' + (-p) + ']'); + double[] v = weights[index]; + Index l = this.labelIndices.get(0); + p.println(feature + "\t\t"); + for (CRFLabel label : l) { + p.print(label.toString(classIndex) + ":" + v[l.indexOf(label)] + "\t"); + } + p.println(); + + } + } + + public Map> topWeights() { + Map> w = new HashMap>(); + for (String feature : featureIndex) { + int index = featureIndex.indexOf(feature); + // line.add(feature+"["+(-p)+"]"); + // rowHeaders.add(feature + '[' + (-p) + ']'); + double[] v = weights[index]; + Index l = this.labelIndices.get(0); + for (CRFLabel label : l) { + if(!w.containsKey(label.toString(classIndex))) + w.put(label.toString(classIndex), new ClassicCounter()); + w.get(label.toString(classIndex)).setCount(feature, v[l.indexOf(label)]); + } + } + return w; + } + +} // end class CRFClassifier diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFClassifierEvaluator.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFClassifierEvaluator.java new file mode 100644 index 0000000..df851f3 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFClassifierEvaluator.java @@ -0,0 +1,149 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.optimization.CmdEvaluator; +import edu.stanford.nlp.stats.MultiClassChunkEvalStats; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Triple; + +import java.io.*; +import java.util.Collection; +import java.util.List; + +/** + * Evaluates CRFClassifier on a set of data + * - called by QNMinimizer periodically + * - If evalCmd is set, runs command line specified by evalCmd + 
* otherwise does evaluation internally + * NOTE: when running conlleval with exec on Linux, linux will first + * fork process by duplicating memory of current process. So if + * JVM has lots of memory, it will all be duplicated when + * child process is initially forked. + * @author Angel Chang + */ +public class CRFClassifierEvaluator extends CmdEvaluator { + private CRFClassifier classifier; + // NOTE: Defalt uses -r, specify without -r if IOB + private String cmdStr = "/u/nlp/bin/conlleval -r"; + private String[] cmd; + + // TODO: Use data structure to hold data + features + // Cache already featurized documents + // Original object bank + Collection> data; + // Featurized data + List> featurizedData; + + public CRFClassifierEvaluator(String description, + CRFClassifier classifier, + Collection> data, + List> featurizedData) + { + this.description = description; + this.classifier = classifier; + this.data = data; + this.featurizedData = featurizedData; + cmd = getCmd(cmdStr); + saveOutput = true; + } + + public CRFClassifierEvaluator(String description, + CRFClassifier classifier) + { + this.description = description; + this.classifier = classifier; + saveOutput = true; + } + + /** + * Set the data to test on + */ + public void setTestData(Collection> data, List> featurizedData) + { + this.data = data; + this.featurizedData = featurizedData; + } + + /** + * Set the evaluation command (set to null to skip evaluation using command line) + * @param evalCmd + */ + public void setEvalCmd(String evalCmd) + { + System.err.println("setEvalCmd to " + evalCmd); + this.cmdStr = evalCmd; + if (cmdStr != null) { + cmdStr = cmdStr.trim(); + if (cmdStr.length() == 0) { cmdStr = null; } + } + cmd = getCmd(cmdStr); + } + + public void setValues(double[] x) + { + classifier.updateWeights(x); + } + + public String[] getCmd() + { + return cmd; + } + + private double interpretCmdOutput() { + String output = getOutput(); + String[] parts = output.split("\\s+"); + int fScoreIndex = 0; 
+ for (; fScoreIndex < parts.length; fScoreIndex++) + if (parts[fScoreIndex].equals("FB1:")) + break; + fScoreIndex += 1; + if (fScoreIndex < parts.length) + return Double.parseDouble(parts[fScoreIndex]); + else { + System.err.println("ERROR in CRFClassifierEvaluator.interpretCmdOutput(), cannot find FB1 score in output:\n"+output); + return -1; + } + } + + @Override + public void outputToCmd(OutputStream outputStream) + { + try { + PrintWriter pw = IOUtils.encodedOutputStreamPrintWriter(outputStream, null, true); + classifier.classifyAndWriteAnswers(data, featurizedData, pw, + classifier.makeReaderAndWriter()); + } catch (IOException ex) { + throw new RuntimeIOException(ex); + } + } + + public double evaluate(double[] x) { + double score = 0; + setValues(x); + if (getCmd() != null) { + evaluateCmd(getCmd()); + score = interpretCmdOutput(); + } else { + try { + // TODO: Classify in memory instead of writing to tmp file + File f = File.createTempFile("CRFClassifierEvaluator","txt"); + f.deleteOnExit(); + OutputStream outputStream = new BufferedOutputStream(new FileOutputStream(f)); + PrintWriter pw = IOUtils.encodedOutputStreamPrintWriter(outputStream, null, true); + classifier.classifyAndWriteAnswers(data, featurizedData, pw, + classifier.makeReaderAndWriter()); + outputStream.close(); + BufferedReader br = new BufferedReader(new FileReader(f)); + MultiClassChunkEvalStats stats = new MultiClassChunkEvalStats("O"); + score = stats.score(br, "\t"); + System.err.println(stats.getConllEvalString()); + f.delete(); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + return score; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFCliqueTree.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFCliqueTree.java new file mode 100644 index 0000000..a0dd4a8 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFCliqueTree.java @@ 
-0,0 +1,707 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.sequences.SequenceListener; +import edu.stanford.nlp.sequences.SequenceModel; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.stats.GeneralizedCounter; +import edu.stanford.nlp.util.Index; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Builds a CliqueTree (an array of FactorTable) and does message passing + * inference along it. + * + * @param The type of the label (usually String in our uses) + * @author Jenny Finkel + */ +public class CRFCliqueTree implements SequenceModel, SequenceListener { + + protected final FactorTable[] factorTables; + protected final double z; // norm constant + protected final Index classIndex; + private final E backgroundSymbol; + private final int backgroundIndex; + // the window size, which is also the clique size + protected final int windowSize; + // the number of possible classes for each label + private final int numClasses; + private final int[] possibleValues; + + /** Initialize a clique tree */ + public CRFCliqueTree(FactorTable[] factorTables, Index classIndex, E backgroundSymbol) { + this(factorTables, classIndex, backgroundSymbol, factorTables[0].totalMass()); + } + + /** This extra constructor was added to support the CRFCliqueTreeForPartialLabels */ + CRFCliqueTree(FactorTable[] factorTables, Index classIndex, E backgroundSymbol, double z) { + this.factorTables = factorTables; + this.z = z; + this.classIndex = classIndex; + this.backgroundSymbol = backgroundSymbol; + backgroundIndex = classIndex.indexOf(backgroundSymbol); + windowSize = factorTables[0].windowSize(); + numClasses = classIndex.size(); + possibleValues = new int[numClasses]; + for (int i = 0; i < numClasses; i++) { + possibleValues[i] = i; + } + + // Debug only + // System.out.println("CRFCliqueTree constructed::numClasses: " + + // 
numClasses); + } + + public FactorTable[] getFactorTables() { + return this.factorTables; + } + + public Index classIndex() { + return classIndex; + } + + // SEQUENCE MODEL METHODS + + public int length() { + return factorTables.length; + } + + public int leftWindow() { + return windowSize; + } + + public int rightWindow() { + return 0; + } + + public int[] getPossibleValues(int position) { + return possibleValues; + } + + public double scoreOf(int[] sequence, int pos) { + return scoresOf(sequence, pos)[sequence[pos]]; + } + + /** + * Computes the unnormalized log conditional distribution over values of the + * element at position pos in the sequence, conditioned on the values of the + * elements in all other positions of the provided sequence. + * + * @param sequence + * the sequence containing the rest of the values to condition on + * @param position + * the position of the element to give a distribution for + * @return an array of type double, representing a probability distribution; + * sums to 1.0 + */ + public double[] scoresOf(int[] sequence, int position) { + if (position >= factorTables.length) throw new RuntimeException("Index out of bounds: " + position); + // DecimalFormat nf = new DecimalFormat("#0.000"); + // if (position>0 && position= length()) { + nextLength = length() - position - 1; + } + FactorTable nextFactorTable = factorTables[position + nextLength]; + if (nextLength != windowSize - 1) { + for (int j = 0; j < windowSize - 1 - nextLength; j++) { + nextFactorTable = nextFactorTable.sumOutFront(); + } + } + if (nextLength == 0) { // we are asking about the prob of no sequence + Arrays.fill(probNextGivenThis, 1.0); + } else { + int[] next = new int[nextLength]; + System.arraycopy(sequence, position + 1, next, 0, nextLength); + for (int label = 0; label < numClasses; label++) { + // ask the factor table such that pos is the first position in the + // window + // probNextGivenThis[label] = + // 
factorTables[position+nextLength].conditionalLogProbGivenFirst(label, + // next); + // probNextGivenThis[label] = + // nextFactorTable.conditionalLogProbGivenFirst(label, next); + probNextGivenThis[label] = nextFactorTable.unnormalizedConditionalLogProbGivenFirst(label, next); + } + } + + // pointwise multiply + return ArrayMath.pairwiseAdd(probThisGivenPrev, probNextGivenThis); + } + + /** + * Returns the log probability of this sequence given the CRF. Does so by + * computing the marginal of the first windowSize tags, and then computing the + * conditional probability for the rest of them, conditioned on the previous + * tags. + * + * @param sequence + * the sequence to compute a score for + * @return the score for the sequence + */ + public double scoreOf(int[] sequence) { + + int[] given = new int[window() - 1]; + Arrays.fill(given, classIndex.indexOf(backgroundSymbol)); + double logProb = 0; + for (int i = 0; i < length(); i++) { + int label = sequence[i]; + logProb += condLogProbGivenPrevious(i, label, given); + System.arraycopy(given, 1, given, 0, given.length - 1); + given[given.length - 1] = label; + } + return logProb; + } + + // OTHER + + public int window() { + return windowSize; + } + + public int getNumClasses() { + return numClasses; + } + + public double totalMass() { + return z; + } + + public int backgroundIndex() { + return backgroundIndex; + } + + public E backgroundSymbol() { + return backgroundSymbol; + } + + // + // MARGINAL PROB OF TAG AT SINGLE POSITION + // + + public double logProb(int position, int label) { + double u = factorTables[position].unnormalizedLogProbEnd(label); + return u - z; + } + + public double prob(int position, int label) { + return Math.exp(logProb(position, label)); + } + + public double logProb(int position, E label) { + return logProb(position, classIndex.indexOf(label)); + } + + public double prob(int position, E label) { + return Math.exp(logProb(position, label)); + } + + public Counter probs(int position) { + 
Counter c = new ClassicCounter(); + for (int i = 0; i < classIndex.size(); i++) { + E label = classIndex.get(i); + c.incrementCount(label, prob(position, i)); + } + return c; + } + + public Counter logProbs(int position) { + Counter c = new ClassicCounter(); + for (int i = 0; i < classIndex.size(); i++) { + E label = classIndex.get(i); + c.incrementCount(label, logProb(position, i)); + } + return c; + } + + // + // MARGINAL PROBS OF TAGS AT MULTIPLE POSITIONS + // + + /** + * returns the log probability for the given labels (indexed using + * classIndex), where the last label corresponds to the label at the specified + * position. For instance if you called logProb(5, {1,2,3}) it will return the + * marginal log prob that the label at position 3 is 1, the label at position + * 4 is 2 and the label at position 5 is 3. + */ + public double logProb(int position, int[] labels) { + if (labels.length < windowSize) { + return factorTables[position].unnormalizedLogProbEnd(labels) - z; + } else if (labels.length == windowSize) { + return factorTables[position].unnormalizedLogProb(labels) - z; + } else { + int[] l = new int[windowSize]; + System.arraycopy(labels, 0, l, 0, l.length); + int position1 = position - labels.length + windowSize; + double p = factorTables[position1].unnormalizedLogProb(l) - z; + l = new int[windowSize - 1]; + System.arraycopy(labels, 1, l, 0, l.length); + position1++; + for (int i = windowSize; i < labels.length; i++) { + p += condLogProbGivenPrevious(position1++, labels[i], l); + System.arraycopy(l, 1, l, 0, l.length - 1); + l[windowSize - 2] = labels[i]; + } + return p; + } + } + + /** + * Returns the probability for the given labels (indexed using classIndex), + * where the last label corresponds to the label at the specified position. + * For instance if you called prob(5, {1,2,3}) it will return the marginal + * prob that the label at position 3 is 1, the label at position 4 is 2 and + * the label at position 5 is 3. 
+ */ + public double prob(int position, int[] labels) { + return Math.exp(logProb(position, labels)); + } + + /** + * returns the log probability for the given labels, where the last label + * corresponds to the label at the specified position. For instance if you + * called logProb(5, {"O", "PER", "ORG"}) it will return the marginal log prob + * that the label at position 3 is "O", the label at position 4 is "PER" and + * the label at position 5 is "ORG". + */ + public double logProb(int position, E[] labels) { + return logProb(position, objectArrayToIntArray(labels)); + } + + /** + * returns the probability for the given labels, where the last label + * corresponds to the label at the specified position. For instance if you + * called logProb(5, {"O", "PER", "ORG"}) it will return the marginal prob + * that the label at position 3 is "O", the label at position 4 is "PER" and + * the label at position 5 is "ORG". + */ + public double prob(int position, E[] labels) { + return Math.exp(logProb(position, labels)); + } + + public GeneralizedCounter logProbs(int position, int window) { + GeneralizedCounter gc = new GeneralizedCounter(window); + int[] labels = new int[window]; + // cdm july 2005: below array initialization isn't necessary: JLS (3rd ed.) + // 4.12.5 + // Arrays.fill(labels, 0); + + OUTER: while (true) { + List labelsList = intArrayToListE(labels); + gc.incrementCount(labelsList, logProb(position, labels)); + for (int i = 0; i < labels.length; i++) { + labels[i]++; + if (labels[i] < numClasses) { + break; + } + if (i == labels.length - 1) { + break OUTER; + } + labels[i] = 0; + } + } + return gc; + } + + public GeneralizedCounter probs(int position, int window) { + GeneralizedCounter gc = new GeneralizedCounter(window); + int[] labels = new int[window]; + // cdm july 2005: below array initialization isn't necessary: JLS (3rd ed.) 
+ // 4.12.5 + // Arrays.fill(labels, 0); + + OUTER: while (true) { + List labelsList = intArrayToListE(labels); + gc.incrementCount(labelsList, prob(position, labels)); + for (int i = 0; i < labels.length; i++) { + labels[i]++; + if (labels[i] < numClasses) { + break; + } + if (i == labels.length - 1) { + break OUTER; + } + labels[i] = 0; + } + } + return gc; + } + + // + // HELPER METHODS + // + + private int[] objectArrayToIntArray(E[] os) { + int[] is = new int[os.length]; + for (int i = 0; i < os.length; i++) { + is[i] = classIndex.indexOf(os[i]); + } + return is; + } + + private List intArrayToListE(int[] is) { + List os = new ArrayList(is.length); + for (int i = 0; i < is.length; i++) { + os.add(classIndex.get(is[i])); + } + return os; + } + + /** + * Gives the probability of a tag at a single position conditioned on a + * sequence of previous labels. + * + * @param position + * Index in sequence + * @param label + * Label of item at index + * @param prevLabels + * @return conditional log probability + */ + public double condLogProbGivenPrevious(int position, int label, int[] prevLabels) { + if (prevLabels.length + 1 == windowSize) { + return factorTables[position].conditionalLogProbGivenPrevious(prevLabels, label); + } else if (prevLabels.length + 1 < windowSize) { + FactorTable ft = factorTables[position].sumOutFront(); + while (ft.windowSize() > prevLabels.length + 1) { + ft = ft.sumOutFront(); + } + return ft.conditionalLogProbGivenPrevious(prevLabels, label); + } else { + int[] p = new int[windowSize - 1]; + System.arraycopy(prevLabels, prevLabels.length - p.length, p, 0, p.length); + return factorTables[position].conditionalLogProbGivenPrevious(p, label); + } + } + + public double condLogProbGivenPrevious(int position, E label, E[] prevLabels) { + return condLogProbGivenPrevious(position, classIndex.indexOf(label), objectArrayToIntArray(prevLabels)); + } + + public double condProbGivenPrevious(int position, int label, int[] prevLabels) { + return 
Math.exp(condLogProbGivenPrevious(position, label, prevLabels)); + } + + public double condProbGivenPrevious(int position, E label, E[] prevLabels) { + return Math.exp(condLogProbGivenPrevious(position, label, prevLabels)); + } + + public Counter condLogProbsGivenPrevious(int position, int[] prevlabels) { + Counter c = new ClassicCounter(); + for (int i = 0; i < classIndex.size(); i++) { + E label = classIndex.get(i); + c.incrementCount(label, condLogProbGivenPrevious(position, i, prevlabels)); + } + return c; + } + + public Counter condLogProbsGivenPrevious(int position, E[] prevlabels) { + Counter c = new ClassicCounter(); + for (int i = 0; i < classIndex.size(); i++) { + E label = classIndex.get(i); + c.incrementCount(label, condLogProbGivenPrevious(position, label, prevlabels)); + } + return c; + } + + // + // PROB OF TAG AT SINGLE POSITION CONDITIONED ON FOLLOWING SEQUENCE OF LABELS + // + + public double condLogProbGivenNext(int position, int label, int[] nextLabels) { + position = position + nextLabels.length; + if (nextLabels.length + 1 == windowSize) { + return factorTables[position].conditionalLogProbGivenNext(nextLabels, label); + } else if (nextLabels.length + 1 < windowSize) { + FactorTable ft = factorTables[position].sumOutFront(); + while (ft.windowSize() > nextLabels.length + 1) { + ft = ft.sumOutFront(); + } + return ft.conditionalLogProbGivenPrevious(nextLabels, label); + } else { + int[] p = new int[windowSize - 1]; + System.arraycopy(nextLabels, 0, p, 0, p.length); + return factorTables[position].conditionalLogProbGivenPrevious(p, label); + } + } + + public double condLogProbGivenNext(int position, E label, E[] nextLabels) { + return condLogProbGivenNext(position, classIndex.indexOf(label), objectArrayToIntArray(nextLabels)); + } + + public double condProbGivenNext(int position, int label, int[] nextLabels) { + return Math.exp(condLogProbGivenNext(position, label, nextLabels)); + } + + public double condProbGivenNext(int position, E label, E[] 
nextLabels) { + return Math.exp(condLogProbGivenNext(position, label, nextLabels)); + } + + public Counter condLogProbsGivenNext(int position, int[] nextlabels) { + Counter c = new ClassicCounter(); + for (int i = 0; i < classIndex.size(); i++) { + E label = classIndex.get(i); + c.incrementCount(label, condLogProbGivenNext(position, i, nextlabels)); + } + return c; + } + + public Counter condLogProbsGivenNext(int position, E[] nextlabels) { + Counter c = new ClassicCounter(); + for (int i = 0; i < classIndex.size(); i++) { + E label = classIndex.get(i); + c.incrementCount(label, condLogProbGivenNext(position, label, nextlabels)); + } + return c; + } + + // + // PROB OF TAG AT SINGLE POSITION CONDITIONED ON PREVIOUS AND FOLLOWING + // SEQUENCE OF LABELS + // + + // public double condProbGivenPreviousAndNext(int position, int label, int[] + // prevLabels, int[] nextLabels) { + + // } + + + + // + // JOINT CONDITIONAL PROBS + // + /** + * @return a new CRFCliqueTree for the weights on the data + */ + public static CRFCliqueTree getCalibratedCliqueTree(int[][][] data, List> labelIndices, + int numClasses, Index classIndex, E backgroundSymbol, CliquePotentialFunction cliquePotentialFunc, double[][][] featureVals) { + + FactorTable[] factorTables = new FactorTable[data.length]; + FactorTable[] messages = new FactorTable[data.length - 1]; + + for (int i = 0; i < data.length; i++) { + double[][] featureValByCliqueSize = null; + if (featureVals != null) + featureValByCliqueSize = featureVals[i]; + factorTables[i] = getFactorTable(data[i], labelIndices, numClasses, cliquePotentialFunc, featureValByCliqueSize); + + if (i > 0) { + messages[i - 1] = factorTables[i - 1].sumOutFront(); + factorTables[i].multiplyInFront(messages[i - 1]); + } + } + + for (int i = factorTables.length - 2; i >= 0; i--) { + FactorTable summedOut = factorTables[i + 1].sumOutEnd(); + summedOut.divideBy(messages[i]); + factorTables[i].multiplyInEnd(summedOut); + } + + return new 
CRFCliqueTree(factorTables, classIndex, backgroundSymbol); + } + + /** + * @return a new CRFCliqueTree for the weights on the data + */ + public static CRFCliqueTree getCalibratedCliqueTree(double[] weights, double wscale, int[][] weightIndices, + int[][][] data, List> labelIndices, int numClasses, Index classIndex, E backgroundSymbol) { + + FactorTable[] factorTables = new FactorTable[data.length]; + FactorTable[] messages = new FactorTable[data.length - 1]; + + for (int i = 0; i < data.length; i++) { + + factorTables[i] = getFactorTable(weights, wscale, weightIndices, data[i], labelIndices, numClasses); + + if (i > 0) { + messages[i - 1] = factorTables[i - 1].sumOutFront(); + factorTables[i].multiplyInFront(messages[i - 1]); + } + } + + for (int i = factorTables.length - 2; i >= 0; i--) { + + FactorTable summedOut = factorTables[i + 1].sumOutEnd(); + summedOut.divideBy(messages[i]); + factorTables[i].multiplyInEnd(summedOut); + } + + return new CRFCliqueTree(factorTables, classIndex, backgroundSymbol); + } + + private static FactorTable getFactorTable(double[] weights, double wscale, int[][] weightIndices, int[][] data, + List> labelIndices, int numClasses) { + + FactorTable factorTable = null; + + for (int j = 0; j < labelIndices.size(); j++) { + Index labelIndex = labelIndices.get(j); + FactorTable ft = new FactorTable(numClasses, j + 1); + + // ... 
and each possible labeling for that clique + for (int k = 0, liSize = labelIndex.size(); k < liSize; k++) { + int[] label = ((CRFLabel) labelIndex.get(k)).getLabel(); + double weight = 0.0; + for (int m = 0; m < data[j].length; m++) { + int wi = weightIndices[data[j][m]][k]; + weight += wscale * weights[wi]; + } + // try{ + ft.setValue(label, weight); + // } catch (Exception e) { + // System.out.println("CRFCliqueTree::getFactorTable"); + // System.out.println("NumClasses: " + numClasses + " j+1: " + (j+1)); + // System.out.println("k: " + k+" label: " +label+" labelIndexSize: " + + // labelIndex.size()); + // throw new RunTimeException(e.toString()); + // } + + } + if (j > 0) { + ft.multiplyInEnd(factorTable); + } + factorTable = ft; + + } + + return factorTable; + } + + public static FactorTable getFactorTable(double[][] weights, int[][] data, List> labelIndices, int numClasses) { + CliquePotentialFunction cliquePotentialFunc = new LinearCliquePotentialFunction(weights); + return getFactorTable(data, labelIndices, numClasses, cliquePotentialFunc); + } + + + public static FactorTable getFactorTable(int[][] data, List> labelIndices, int numClasses, CliquePotentialFunction cliquePotentialFunc) { + return getFactorTable(data, labelIndices, numClasses, cliquePotentialFunc, null); + } + + public static FactorTable getFactorTable(int[][] data, List> labelIndices, int numClasses, CliquePotentialFunction cliquePotentialFunc, double[][] featureValByCliqueSize) { + FactorTable factorTable = null; + + for (int j = 0; j < labelIndices.size(); j++) { + Index labelIndex = labelIndices.get(j); + FactorTable ft = new FactorTable(numClasses, j + 1); + double[] featureVal = null; + if (featureValByCliqueSize != null) + featureVal = featureValByCliqueSize[j]; + + // ... 
and each possible labeling for that clique + for (int k = 0, liSize = labelIndex.size(); k < liSize; k++) { + int[] label = ((CRFLabel) labelIndex.get(k)).getLabel(); + double cliquePotential = cliquePotentialFunc.computeCliquePotential(j+1, k, data[j], featureVal); + // for (int m = 0; m < data[j].length; m++) { + // weight += weights[data[j][m]][k]; + // } + // try{ + ft.setValue(label, cliquePotential); + // } catch (Exception e) { + // System.out.println("CRFCliqueTree::getFactorTable"); + // System.out.println("NumClasses: " + numClasses + " j+1: " + (j+1)); + // System.out.println("k: " + k+" label: " +label+" labelIndexSize: " + + // labelIndex.size()); + // throw new RunTimeException(e.toString()); + // } + + } + if (j > 0) { + ft.multiplyInEnd(factorTable); + } + factorTable = ft; + + } + + return factorTable; + } + + // SEQUENCE MODEL METHODS + + /** + * Computes the distribution over values of the element at position pos in the + * sequence, conditioned on the values of the elements in all other positions + * of the provided sequence. + * + * @param sequence + * the sequence containing the rest of the values to condition on + * @param position + * the position of the element to give a distribution for + * @return an array of type double, representing a probability distribution; + * sums to 1.0 + */ + public double[] getConditionalDistribution(int[] sequence, int position) { + double[] result = scoresOf(sequence, position); + ArrayMath.logNormalize(result); + // System.out.println("marginal: " + ArrayMath.toString(marginal, + // nf)); + // System.out.println("conditional: " + ArrayMath.toString(result, + // nf)); + result = ArrayMath.exp(result); + // System.out.println("conditional: " + ArrayMath.toString(result, + // nf)); + return result; + } + + /** + * Informs this sequence model that the value of the element at position pos + * has changed. This allows this sequence model to update its internal model + * if desired. 
+ * + */ + public void updateSequenceElement(int[] sequence, int pos, int oldVal) { + // do nothing; we don't change this model + } + + /** + * Informs this sequence model that the value of the whole sequence is + * initialized to sequence + * + */ + public void setInitialSequence(int[] sequence) { + // do nothing + } + + /** + * @return the number of possible values for each element; it is assumed to be + * the same for the element at each position + */ + public int getNumValues() { + return numClasses; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFDatum.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFDatum.java new file mode 100644 index 0000000..8fecfb9 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFDatum.java @@ -0,0 +1,114 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.ling.Datum; + +import java.io.Serializable; +import java.util.List; + + +/** + * The representation of Datums used internally in CRFClassifier. + * + * @author Jenny Finkel + */ + +public class CRFDatum implements Serializable { + + /** + * Features for this Datum. + */ + @SuppressWarnings({"NonSerializableFieldInSerializableClass"}) + private final List features; + @SuppressWarnings({"NonSerializableFieldInSerializableClass"}) + private final LAB label; + @SuppressWarnings({"NonSerializableFieldInSerializableClass"}) + // featureVals holds the (optional) feature value for non-boolean features + // such as the ones used in continuous vector space embeddings + private final List featureVals; + + /** + * Constructs a new BasicDatum with the given features and label. 
+ * + * @param features The features of the CRFDatum + * @param label The label of the CRFDatum + */ + public CRFDatum(List features, LAB label, List featureVals) { + this.features = features; + this.label = label; + this.featureVals = featureVals; + } + + /** + * Returns the collection that this BasicDatum was constructed with. + * + * @return the collection that this BasicDatum was constructed with. + */ + public List asFeatures() { + return features; + } + + /** + * Returns the double array containing the feature values + * + * @return the double array that contains the feature values matching each feature as + * returned by asFeatures() + */ + public List asFeatureVals() { + return featureVals; + } + + + /** + * Returns the label for this Datum, or null if none have been set. + * @return The label for this Datum, or null if none have been set. + */ + + public LAB label() { + return label; + } + + /** + * Returns a String representation of this BasicDatum (lists features and labels). + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder("CRFDatum[\n"); + sb.append(" label=").append(label).append('\n'); + for (int i = 0, sz = features.size(); i < sz; i++) { + sb.append(" features(").append(i).append("):").append(features.get(i)); + sb.append(", val=").append(featureVals.get(i)); + sb.append('\n'); + } + sb.append(']'); + return sb.toString(); + } + + + /** + * Returns whether the given Datum contains the same features as this Datum. + * Doesn't check the labels, should we change this? + * (CDM Feb 2012: Also doesn't correctly respect the contract for equals, + * since it gives one way equality with other Datum's.) 
+ * + * @param o The object to test equality with + * @return Whether it is equal to this CRFDatum in terms of features + */ + @Override + public boolean equals(Object o) { + if (!(o instanceof Datum)) { + return (false); + } + + Datum d = (Datum) o; + return features.equals(d.asFeatures()); + } + + @Override + public int hashCode() { + return features.hashCode(); + } + + private static final long serialVersionUID = -8345554365027671190L; + +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java new file mode 100644 index 0000000..732abb0 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java @@ -0,0 +1,176 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.StringUtils; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Properties; + +/** + * Exports CRF features for use with other programs + * - Usage: CRFFeatureExporter -prop -trainFile -exportFeatures + * - Output file is automatically gzipped/b2zipped if ending in gz/bz2 + * - bzip2 requires that bzip2 is available via command line + * - Currently exports features in a format that can be read by a modified crfsgd + * (crfsgd assumes features are gzipped) + * TODO: Support other formats (like crfsuite) + * @author Angel Chang + */ +public class CRFFeatureExporter { + private char delimiter = '\t'; + private static String eol = System.getProperty("line.separator"); + private CRFClassifier classifier; + + public CRFFeatureExporter(CRFClassifier classifier) + { + this.classifier = classifier; + } + 
+ /** + * Prefix features with U- (for unigram) features + * or B- (for bigram) features + * @param feat String representing the feature + * @return new prefixed feature string + */ + private static String ubPrefixFeatureString(String feat) + { + if (feat.endsWith("|C")) { + return "U-" + feat; + } else if (feat.endsWith("|CpC")) { + return "B-" + feat; + } else { + return feat; + } + } + + /** + * Constructs a big string representing the input list of CoreLabel, + * with one line per token using the following format + * word label feat1 feat2 ... + * (where each space is actually a tab) + * Assume that CoreLabel has both TextAnnotation and AnswerAnnotation + * @param document List of CoreLabel + * (does not have to represent a "document", just a sequence of text, + * like a sentence or a paragraph) + * @return String representation of features + */ + private String getFeatureString(List document) { + int docSize = document.size(); + if (classifier.flags.useReverse) { + Collections.reverse(document); + } + + StringBuilder sb = new StringBuilder(); + for (int j = 0; j < docSize; j++) { + IN token = document.get(j); + sb.append(token.get(CoreAnnotations.TextAnnotation.class)); + sb.append(delimiter); + sb.append(token.get(CoreAnnotations.AnswerAnnotation.class)); + + CRFDatum d = classifier.makeDatum(document, j, classifier.featureFactory); + + List features = d.asFeatures(); + for (int k = 0, fSize = features.size(); k < fSize; k++) { + Collection cliqueFeatures = (Collection) features.get(k); + for (String feat: cliqueFeatures) { + feat = ubPrefixFeatureString(feat); + sb.append(delimiter); + sb.append(feat); + } + } + sb.append(eol); + } + if (classifier.flags.useReverse) { + Collections.reverse(document); + } + return sb.toString(); + } + + /** + * Output features that have already been converted into features + * (using documentToDataAndLabels) in format suitable for CRFSuite + * Format is with one line per token using the following format + * label feat1 feat2 
... + * (where each space is actually a tab) + * Each document is separated by an empty line + * @param exportFile file to export the features to + * @param docsData array of document features + * @param labels correct labels indexed by document, and position within document + */ + public void printFeatures(String exportFile, int[][][][] docsData, int[][] labels) { + try { + PrintWriter pw = IOUtils.getPrintWriter(exportFile); + for (int i = 0; i < docsData.length; i++) { + for (int j = 0; j < docsData[i].length; j++) { + StringBuilder sb = new StringBuilder(); + int label = labels[i][j]; + sb.append(classifier.classIndex.get(label)); + for (int k = 0; k < docsData[i][j].length; k++) { + for (int m = 0; m < docsData[i][j][k].length; m++) { + String feat = classifier.featureIndex.get(docsData[i][j][k][m]); + feat = ubPrefixFeatureString(feat); + sb.append(delimiter); + sb.append(feat); + } + } + pw.println(sb.toString()); + } + pw.println(); + } + pw.close(); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + /** + * Output features from a collection of documents to a file + * Format is with one line per token using the following format + * word label feat1 feat2 ... 
+ * (where each space is actually a tab) + * Each document is separated by an empty line + * This format is suitable for modified crfsgd + * @param exportFile file to export the features to + * @param documents input collection of documents + */ + public void printFeatures(String exportFile, Collection> documents) { + try { + PrintWriter pw = IOUtils.getPrintWriter(exportFile); + for (List doc:documents) { + String str = getFeatureString(doc); + pw.println(str); + } + pw.close(); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + public static void main(String[] args) throws Exception { + StringUtils.printErrInvocationString("CRFFeatureExporter", args); + Properties props = StringUtils.argsToProperties(args); + CRFClassifier crf = new CRFClassifier(props); + String inputFile = crf.flags.trainFile; + if (inputFile == null) { + System.err.println("Please provide input file using -trainFile"); + System.exit(-1); + } + String outputFile = crf.flags.exportFeatures; + if (outputFile == null) { + System.err.println("Please provide output file using -exportFeatures"); + System.exit(-1); + } + CRFFeatureExporter featureExporter = new CRFFeatureExporter(crf); + Collection> docs = + crf.makeObjectBankFromFile(inputFile, crf.makeReaderAndWriter()); + crf.makeAnswerArraysAndTagIndex(docs); + featureExporter.printFeatures(outputFile, docs); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLabel.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLabel.java new file mode 100644 index 0000000..effd7b5 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLabel.java @@ -0,0 +1,90 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.util.Index; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + + +/** + * @author Jenny Finkel + */ + +public class CRFLabel implements Serializable { + 
+ private static final long serialVersionUID = 7403010868396790276L; + + private final int[] label; + int hashCode = -1; + + // todo: When rebuilding, change this to a better hash function like 31 + private static final int maxNumClasses = 10; + + public CRFLabel(int[] label) { + this.label = label; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof CRFLabel)) { + return false; + } + CRFLabel other = (CRFLabel) o; + + if (other.label.length != label.length) { + return false; + } + for (int i = 0; i < label.length; i++) { + if (label[i] != other.label[i]) { + return false; + } + } + + return true; + } + + public CRFLabel getSmallerLabel(int size) { + int[] newLabel = new int[size]; + System.arraycopy(label, label.length - size, newLabel, 0, size); + return new CRFLabel(newLabel); + } + + public CRFLabel getOneSmallerLabel() { + return getSmallerLabel(label.length - 1); + } + + public int[] getLabel() { + return label; + } + + public String toString(Index classIndex) { + List l = new ArrayList(); + for (int i = 0; i < label.length; i++) { + l.add(classIndex.get(label[i])); + } + return l.toString(); + } + + @Override + public String toString() { + List l = new ArrayList(); + for (int i = 0; i < label.length; i++) { + l.add(Integer.valueOf(label[i])); + } + return l.toString(); + } + + @Override + public int hashCode() { + if (hashCode < 0) { + hashCode = 0; + for (int i = 0; i < label.length; i++) { + hashCode *= maxNumClasses; + hashCode += label[i]; + } + } + return hashCode; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFloatFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFloatFunction.java new file mode 100644 index 0000000..43a7e23 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFloatFunction.java @@ -0,0 +1,611 @@ 
+package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.optimization.AbstractCachingDiffFloatFunction; +import edu.stanford.nlp.util.Index; + +import java.util.Arrays; +import java.util.List; + + +/** + * @author Jenny Finkel + */ + +public class CRFLogConditionalObjectiveFloatFunction extends AbstractCachingDiffFloatFunction implements HasCliquePotentialFunction { + + public static final int NO_PRIOR = 0; + public static final int QUADRATIC_PRIOR = 1; + /* Use a Huber robust regression penalty (L1 except very near 0) not L2 */ + public static final int HUBER_PRIOR = 2; + public static final int QUARTIC_PRIOR = 3; + + protected int prior; + protected float sigma; + protected float epsilon; + + List> labelIndices; + Index classIndex; + Index featureIndex; + float[][] Ehat; // empirical counts of all the features [feature][class] + int window; + int numClasses; + int[] map; + int[][][][] data; + int[][] labels; + int domainDimension = -1; + + String backgroundSymbol; + + public static boolean VERBOSE = false; + + CRFLogConditionalObjectiveFloatFunction(int[][][][] data, int[][] labels, Index featureIndex, int window, Index classIndex, List> labelIndices, int[] map, String backgroundSymbol) { + this(data, labels, featureIndex, window, classIndex, labelIndices, map, QUADRATIC_PRIOR, backgroundSymbol); + } + + CRFLogConditionalObjectiveFloatFunction(int[][][][] data, int[][] labels, Index featureIndex, int window, Index classIndex, List> labelIndices, int[] map, String backgroundSymbol, double sigma) { + this(data, labels, featureIndex, window, classIndex, labelIndices, map, QUADRATIC_PRIOR, backgroundSymbol, sigma); + } + + CRFLogConditionalObjectiveFloatFunction(int[][][][] data, int[][] labels, Index featureIndex, int window, Index classIndex, List> labelIndices, int[] map, int prior, String backgroundSymbol) { + this(data, labels, featureIndex, window, classIndex, labelIndices, map, prior, backgroundSymbol, 1.0f); + } + 
+ CRFLogConditionalObjectiveFloatFunction(int[][][][] data, int[][] labels, Index featureIndex, int window, Index classIndex, List> labelIndices, int[] map, int prior, String backgroundSymbol, double sigma) { + this.featureIndex = featureIndex; + this.window = window; + this.classIndex = classIndex; + this.numClasses = classIndex.size(); + this.labelIndices = labelIndices; + this.map = map; + this.data = data; + this.labels = labels; + this.prior = prior; + this.backgroundSymbol = backgroundSymbol; + this.sigma = (float) sigma; + empiricalCounts(data, labels); + } + + @Override + public int domainDimension() { + if (domainDimension < 0) { + domainDimension = 0; + for (int i = 0; i < map.length; i++) { + domainDimension += labelIndices.get(map[i]).size(); + } + } + return domainDimension; + } + + public CliquePotentialFunction getCliquePotentialFunction(double[] x) { + throw new UnsupportedOperationException("CRFLogConditionalObjectiveFloatFunction is not clique potential compatible yet"); + } + + public float[][] to2D(float[] weights) { + float[][] newWeights = new float[map.length][]; + int index = 0; + for (int i = 0; i < map.length; i++) { + newWeights[i] = new float[labelIndices.get(map[i]).size()]; + System.arraycopy(weights, index, newWeights[i], 0, labelIndices.get(map[i]).size()); + index += labelIndices.get(map[i]).size(); + } + return newWeights; + } + + public float[] to1D(float[][] weights) { + float[] newWeights = new float[domainDimension()]; + int index = 0; + for (int i = 0; i < weights.length; i++) { + System.arraycopy(weights[i], 0, newWeights, index, weights[i].length); + index += weights[i].length; + } + return newWeights; + } + + public float[][] empty2D() { + float[][] d = new float[map.length][]; + int index = 0; + for (int i = 0; i < map.length; i++) { + d[i] = new float[labelIndices.get(map[i]).size()]; + Arrays.fill(d[i], 0); + index += labelIndices.get(map[i]).size(); + } + return d; + } + + private void empiricalCounts(int[][][][] data, 
int[][] labels) { + Ehat = empty2D(); + + for (int m = 0; m < data.length; m++) { + int[][][] dataDoc = data[m]; + int[] labelsDoc = labels[m]; + int[] label = new int[window]; + //Arrays.fill(label, classIndex.indexOf("O")); + Arrays.fill(label, classIndex.indexOf(backgroundSymbol)); + for (int i = 0; i < dataDoc.length; i++) { + System.arraycopy(label, 1, label, 0, window - 1); + label[window - 1] = labelsDoc[i]; + for (int j = 0; j < dataDoc[i].length; j++) { + int[] cliqueLabel = new int[j + 1]; + System.arraycopy(label, window - 1 - j, cliqueLabel, 0, j + 1); + CRFLabel crfLabel = new CRFLabel(cliqueLabel); + int labelIndex = labelIndices.get(j).indexOf(crfLabel); + //System.err.println(crfLabel + " " + labelIndex); + for (int k = 0; k < dataDoc[i][j].length; k++) { + Ehat[dataDoc[i][j][k]][labelIndex]++; + } + } + } + } + } + + public static FloatFactorTable getFloatFactorTable(float[][] weights, int[][] data, List> labelIndices, int numClasses) { + + FloatFactorTable factorTable = null; + + for (int j = 0; j < labelIndices.size(); j++) { + Index labelIndex = labelIndices.get(j); + FloatFactorTable ft = new FloatFactorTable(numClasses, j + 1); + + // ...and each possible labeling for that clique + for (int k = 0; k < labelIndex.size(); k++) { + int[] label = ((CRFLabel) labelIndex.get(k)).getLabel(); + float weight = 0.0f; + for (int m = 0; m < data[j].length; m++) { + //System.err.println("**"+weights[data[j][m]][k]); + weight += weights[data[j][m]][k]; + } + ft.setValue(label, weight); + //System.err.println(">>"+ft); + } + //System.err.println("::"+ft); + if (j > 0) { + ft.multiplyInEnd(factorTable); + } + //System.err.println("::"+ft); + factorTable = ft; + + } + + return factorTable; + + } + + + + public static FloatFactorTable[] getCalibratedCliqueTree(float[][] weights, int[][][] data, List> labelIndices, int numClasses) { + + // for (int i = 0; i < weights.length; i++) { + // for (int j = 0; j < weights[i].length; j++) { + // System.err.println(i+" 
"+j+": "+weights[i][j]); + // } + // } + + //System.err.println("calibrating clique tree"); + + FloatFactorTable[] factorTables = new FloatFactorTable[data.length]; + FloatFactorTable[] messages = new FloatFactorTable[data.length - 1]; + + for (int i = 0; i < data.length; i++) { + + factorTables[i] = getFloatFactorTable(weights, data[i], labelIndices, numClasses); + if (VERBOSE) { + System.err.println(i + ": " + factorTables[i]); + } + + if (i > 0) { + messages[i - 1] = factorTables[i - 1].sumOutFront(); + if (VERBOSE) { + System.err.println(messages[i - 1]); + } + factorTables[i].multiplyInFront(messages[i - 1]); + if (VERBOSE) { + System.err.println(factorTables[i]); + if (i == data.length - 1) { + System.err.println(i + ": " + factorTables[i].toProbString()); + } + } + } + } + + for (int i = factorTables.length - 2; i >= 0; i--) { + + FloatFactorTable summedOut = factorTables[i + 1].sumOutEnd(); + if (VERBOSE) { + System.err.println((i + 1) + "-->" + i + ": " + summedOut); + } + summedOut.divideBy(messages[i]); + if (VERBOSE) { + System.err.println((i + 1) + "-->" + i + ": " + summedOut); + } + factorTables[i].multiplyInEnd(summedOut); + if (VERBOSE) { + System.err.println(i + ": " + factorTables[i]); + System.err.println(i + ": " + factorTables[i].toProbString()); + } + + + } + + return factorTables; + } + + @Override + public void calculate(float[] x) { + + // if (crfType.equalsIgnoreCase("weird")) { + // calculateWeird(x); + // return; + // } + + float[][] weights = to2D(x); + float prob = 0; + + float[][] E = empty2D(); + + for (int m = 0; m < data.length; m++) { + + FloatFactorTable[] factorTables = getCalibratedCliqueTree(weights, data[m], labelIndices, numClasses); + // System.err.println("calibrated:"); + // for (int i = 0; i < factorTables.length; i++) { + // System.out.println(factorTables[i]); + // System.out.println("+++++++++++++++++++++++++++++"); + + // } + // System.exit(0); + float z = factorTables[0].totalMass(); + + int[] given = new 
int[window - 1]; + Arrays.fill(given, classIndex.indexOf(backgroundSymbol)); + for (int i = 0; i < data[m].length; i++) { + float p = factorTables[i].conditionalLogProb(given, labels[m][i]); + if (VERBOSE) { + System.err.println("P(" + labels[m][i] + "|" + Arrays.toString(given) + ")=" + p); + } + prob += p; + System.arraycopy(given, 1, given, 0, given.length - 1); + given[given.length - 1] = labels[m][i]; + } + + // get predicted count + for (int i = 0; i < data[m].length; i++) { + // go through each clique... + for (int j = 0; j < data[m][i].length; j++) { + Index labelIndex = labelIndices.get(j); + // ...and each possible labeling for that clique + for (int k = 0; k < labelIndex.size(); k++) { + int[] label = ((CRFLabel) labelIndex.get(k)).getLabel(); + + // float p = Math.pow(Math.E, factorTables[i].logProbEnd(label)); + float p = (float) Math.exp(factorTables[i].unnormalizedLogProbEnd(label) - z); + for (int n = 0; n < data[m][i][j].length; n++) { + E[data[m][i][j][n]][k] += p; + } + } + } + } + } + + if (Float.isNaN(prob)) { + System.exit(0); + } + value = -prob; + + // compute the partial derivative for each feature + int index = 0; + for (int i = 0; i < E.length; i++) { + for (int j = 0; j < E[i].length; j++) { + derivative[index++] = (E[i][j] - Ehat[i][j]); + if (VERBOSE) { + System.err.println("deriv(" + i + "," + j + ") = " + E[i][j] + " - " + Ehat[i][j] + " = " + derivative[index - 1]); + } + } + } + + + // priors + if (prior == QUADRATIC_PRIOR) { + float sigmaSq = sigma * sigma; + for (int i = 0; i < x.length; i++) { + float k = 1.0f; + float w = x[i]; + value += k * w * w / 2.0 / sigmaSq; + derivative[i] += k * w / sigmaSq; + } + } else if (prior == HUBER_PRIOR) { + float sigmaSq = sigma * sigma; + for (int i = 0; i < x.length; i++) { + float w = x[i]; + float wabs = Math.abs(w); + if (wabs < epsilon) { + value += w * w / 2.0 / epsilon / sigmaSq; + derivative[i] += w / epsilon / sigmaSq; + } else { + value += (wabs - epsilon / 2) / sigmaSq; + 
derivative[i] += ((w < 0.0) ? -1.0 : 1.0) / sigmaSq; + } + } + } else if (prior == QUARTIC_PRIOR) { + float sigmaQu = sigma * sigma * sigma * sigma; + for (int i = 0; i < x.length; i++) { + float k = 1.0f; + float w = x[i]; + value += k * w * w * w * w / 2.0 / sigmaQu; + derivative[i] += k * w / sigmaQu; + } + } + + + } + + public void calculateWeird1(float[] x) { + + float[][] weights = to2D(x); + float[][] E = empty2D(); + + value = 0.0f; + Arrays.fill(derivative, 0.0f); + float[][] sums = new float[labelIndices.size()][]; + float[][] probs = new float[labelIndices.size()][]; + float[][] counts = new float[labelIndices.size()][]; + + for (int i = 0; i < sums.length; i++) { + int size = labelIndices.get(i).size(); + sums[i] = new float[size]; + probs[i] = new float[size]; + counts[i] = new float[size]; + Arrays.fill(counts[i], 0.0f); + } + + for (int d = 0; d < data.length; d++) { + int[] llabels = labels[d]; + for (int e = 0; e < data[d].length; e++) { + int[][] ddata = this.data[d][e]; + + for (int cl = 0; cl < ddata.length; cl++) { + int[] features = ddata[cl]; + // activation + Arrays.fill(sums[cl], 0.0f); + int numClasses = labelIndices.get(cl).size(); + for (int c = 0; c < numClasses; c++) { + for (int f = 0; f < features.length; f++) { + sums[cl][c] += weights[features[f]][c]; + } + } + } + + + for (int cl = 0; cl < ddata.length; cl++) { + + int[] label = new int[cl + 1]; + //Arrays.fill(label, classIndex.indexOf("O")); + Arrays.fill(label, classIndex.indexOf(backgroundSymbol)); + int index1 = label.length - 1; + for (int pos = e; pos >= 0 && index1 >= 0; pos--) { + //System.err.println(index1+" "+pos); + label[index1--] = llabels[pos]; + } + CRFLabel crfLabel = new CRFLabel(label); + int labelIndex = labelIndices.get(cl).indexOf(crfLabel); + + float total = ArrayMath.logSum(sums[cl]); + // int[] features = ddata[cl]; + int numClasses = labelIndices.get(cl).size(); + for (int c = 0; c < numClasses; c++) { + probs[cl][c] = (float) Math.exp(sums[cl][c] - 
total); + } + // for (int f=0; f> labelIndices; + private final Index classIndex; // didn't have before. Added since that's what is assumed everywhere. + private final double[][] Ehat; // empirical counts of all the features [feature][class] + private final int window; + private final int numClasses; + private final int[] map; + private final int[][][][] data; // data[docIndex][tokenIndex][][] + private final double[][][][] featureVal; // featureVal[docIndex][tokenIndex][][] + private final int[][] labels; // labels[docIndex][tokenIndex] + private final int domainDimension; + private double[][] eHat4Update, e4Update; + + private int[][] weightIndices; + + private final String backgroundSymbol; + + public static boolean VERBOSE = false; + + public static int getPriorType(String priorTypeStr) { + if (priorTypeStr == null) return QUADRATIC_PRIOR; // default + if ("QUADRATIC".equalsIgnoreCase(priorTypeStr)) { + return QUADRATIC_PRIOR; + } else if ("HUBER".equalsIgnoreCase(priorTypeStr)) { + return HUBER_PRIOR; + } else if ("QUARTIC".equalsIgnoreCase(priorTypeStr)) { + return QUARTIC_PRIOR; + } else if ("NONE".equalsIgnoreCase(priorTypeStr)) { + return NO_PRIOR; + } else { + throw new IllegalArgumentException("Unknown prior type: " + priorTypeStr); + } + } + + CRFLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, Index classIndex, List> labelIndices, int[] map, String backgroundSymbol) { + this(data, labels, window, classIndex, labelIndices, map, "QUADRATIC", backgroundSymbol); + } + + CRFLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, Index classIndex, List> labelIndices, int[] map, String priorType, String backgroundSymbol) { + this(data, labels, window, classIndex, labelIndices, map, priorType, backgroundSymbol, 1.0, null); + } + + CRFLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, Index classIndex, List> labelIndices, int[] map, String backgroundSymbol, double sigma, 
double[][][][] featureVal) { + this(data, labels, window, classIndex, labelIndices, map, "QUADRATIC", backgroundSymbol, sigma, featureVal); + } + + CRFLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, Index classIndex, List> labelIndices, int[] map, String priorType, String backgroundSymbol, double sigma, double[][][][] featureVal) { + this.window = window; + this.classIndex = classIndex; + this.numClasses = classIndex.size(); + this.labelIndices = labelIndices; + this.map = map; + this.data = data; + this.featureVal = featureVal; + this.labels = labels; + this.prior = getPriorType(priorType); + this.backgroundSymbol = backgroundSymbol; + this.sigma = sigma; + Ehat = empty2D(); + empiricalCounts(Ehat); + int myDomainDimension = 0; + for (int dim : map) { + myDomainDimension += labelIndices.get(dim).size(); + } + domainDimension = myDomainDimension; + } + + // this used to be computed lazily, but that was clearly erroneous for multithreading! + @Override + public int domainDimension() { + return domainDimension; + } + + /** + * Takes a double array of weights and creates a 2D array where: + * + * the first element is the mapped index of featuresIndex + * the second element is the index of the of the element + * + * @return a 2D weight array + */ + public static double[][] to2D(double[] weights, List> labelIndices, int[] map) { + double[][] newWeights = new double[map.length][]; + int index = 0; + for (int i = 0; i < map.length; i++) { + newWeights[i] = new double[labelIndices.get(map[i]).size()]; + System.arraycopy(weights, index, newWeights[i], 0, labelIndices.get(map[i]).size()); + index += labelIndices.get(map[i]).size(); + } + return newWeights; + } + + public double[][] to2D(double[] weights) { + return to2D(weights, this.labelIndices, this.map); + } + + public double[][] to2D(double[] weights, double wscale) { + for (int i = 0; i < weights.length; i++) + weights[i] = weights[i] * wscale; + + return to2D(weights, this.labelIndices, 
this.map); + } + + public static double[] to1D(double[][] weights, int domainDimension) { + double[] newWeights = new double[domainDimension]; + int index = 0; + for (int i = 0; i < weights.length; i++) { + System.arraycopy(weights[i], 0, newWeights, index, weights[i].length); + index += weights[i].length; + } + return newWeights; + } + + public double[] to1D(double[][] weights) { + return to1D(weights, domainDimension()); + } + + public int[][] getWeightIndices() + { + if (weightIndices == null) { + weightIndices = new int[map.length][]; + int index = 0; + for (int i = 0; i < map.length; i++) { + weightIndices[i] = new int[labelIndices.get(map[i]).size()]; + for (int j = 0; j < labelIndices.get(map[i]).size(); j++) { + weightIndices[i][j] = index; + index++; + } + } + } + return weightIndices; + } + + private double[][] empty2D() { + double[][] d = new double[map.length][]; + // int index = 0; + for (int i = 0; i < map.length; i++) { + d[i] = new double[labelIndices.get(map[i]).size()]; + } + return d; + } + + private void empiricalCounts(double[][] eHat) { + for (int m = 0; m < data.length; m++) { + empiricalCountsForADoc(eHat, m); + } + } + + private void empiricalCountsForADoc(double[][] eHat, int docIndex) { + int[][][] docData = data[docIndex]; + int[] docLabels = labels[docIndex]; + int[] windowLabels = new int[window]; + Arrays.fill(windowLabels, classIndex.indexOf(backgroundSymbol)); + double[][][] featureValArr = null; + if (featureVal != null) + featureValArr = featureVal[docIndex]; + + if (docLabels.length>docData.length) { // only true for self-training + // fill the windowLabel array with the extra docLabels + System.arraycopy(docLabels, 0, windowLabels, 0, windowLabels.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + for (int i = 0; i < docData.length; i++) { + 
System.arraycopy(windowLabels, 1, windowLabels, 0, window - 1); + windowLabels[window - 1] = docLabels[i]; + for (int j = 0; j < docData[i].length; j++) { + int[] cliqueLabel = new int[j + 1]; + System.arraycopy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1); + CRFLabel crfLabel = new CRFLabel(cliqueLabel); + int labelIndex = labelIndices.get(j).indexOf(crfLabel); + //System.err.println(crfLabel + " " + labelIndex); + for (int n = 0; n < docData[i][j].length; n++) { + double fVal = 1.0; + if (featureValArr != null && j == 0) // j == 0 because only node features gets feature values + fVal = featureValArr[i][j][n]; + eHat[docData[i][j][n]][labelIndex] += fVal; + } + } + } + } + + public double valueForADoc(double[][] weights, int docIndex) { + return expectedCountsAndValueForADoc(weights, null, docIndex, true); + } + + private double expectedCountsAndValueForADoc(double[][] weights, double[][] E, int docIndex) { + return expectedCountsAndValueForADoc(weights, E, docIndex, false); + } + + public CliquePotentialFunction getCliquePotentialFunction(double[] x) { + double[][] weights = to2D(x); + return new LinearCliquePotentialFunction(weights); + } + + private double expectedCountsAndValueForADoc(double[][] weights, double[][] E, int docIndex, boolean skipExpectedCountCal) { + double prob = 0; + int[][][] docData = data[docIndex]; + int[] docLabels = labels[docIndex]; + + double[][][] featureVal3DArr = null; + if (featureVal != null) + featureVal3DArr = featureVal[docIndex]; + + CliquePotentialFunction cliquePotentialFunc = new LinearCliquePotentialFunction(weights); + // make a clique tree for this document + CRFCliqueTree cliqueTree = CRFCliqueTree.getCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunc, featureVal3DArr); + + // compute the log probability of the document given the model with the parameters x + int[] given = new int[window - 1]; + Arrays.fill(given, classIndex.indexOf(backgroundSymbol)); + if 
(docLabels.length>docData.length) { // only true for self-training + // fill the given array with the extra docLabels + System.arraycopy(docLabels, 0, given, 0, given.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + int label = docLabels[i]; + double p = cliqueTree.condLogProbGivenPrevious(i, label, given); + if (VERBOSE) { + System.err.println("P(" + label + "|" + ArrayMath.toString(given) + ")=" + p); + } + prob += p; + System.arraycopy(given, 1, given, 0, given.length - 1); + given[given.length - 1] = label; + } + + if (!skipExpectedCountCal) { + // compute the expected counts for this document, which we will need to compute the derivative + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + // for each possible clique at this position + for (int j = 0; j < docData[i].length; j++) { + Index labelIndex = labelIndices.get(j); + // for each possible labeling for that clique + for (int k = 0; k < labelIndex.size(); k++) { + int[] label = labelIndex.get(k).getLabel(); + double p = cliqueTree.prob(i, label); // probability of these labels occurring in this clique with these features + for (int n = 0; n < docData[i][j].length; n++) { + double fVal = 1.0; + if (j == 0 && featureVal3DArr != null) // j == 0 because only node features gets feature values + fVal = featureVal3DArr[i][j][n]; + E[docData[i][j][n]][k] += p * fVal; + } + } + } + } + } + + return prob; + } + + /** + * Calculates both value and partial derivatives at the point x, and save them internally. 
+ */ + @Override + public void calculate(double[] x) { + + double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point + double[][] weights = to2D(x); + + // the expectations over counts + // first index is feature index, second index is of possible labeling + double[][] E = empty2D(); + + // iterate over all the documents + for (int m = 0; m < data.length; m++) { + prob += expectedCountsAndValueForADoc(weights, E, m); + } + + if (Double.isNaN(prob)) { // shouldn't be the case + throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunction.calculate()" + + " - this may well indicate numeric underflow due to overly long documents."); + } + + value = -prob; + if (VERBOSE) { + System.err.println("value is " + value); + } + + // compute the partial derivative for each feature by comparing expected counts to empirical counts + int index = 0; + for (int i = 0; i < E.length; i++) { + for (int j = 0; j < E[i].length; j++) { + derivative[index++] = (E[i][j] - Ehat[i][j]); + if (VERBOSE) { + System.err.println("deriv(" + i + "," + j + ") = " + E[i][j] + " - " + Ehat[i][j] + " = " + derivative[index - 1]); + } + } + } + + applyPrior(x, 1.0); + } + + private void applyPrior(double[] x, double batchScale) { + // incorporate priors + if (prior == QUADRATIC_PRIOR) { + double sigmaSq = sigma * sigma; + double lambda = 1 / 2.0 / sigmaSq; + for (int i = 0; i < x.length; i++) { + double w = x[i]; + value += batchScale * w * w * lambda; + derivative[i] += batchScale * w / sigmaSq; + } + } else if (prior == HUBER_PRIOR) { + double sigmaSq = sigma * sigma; + for (int i = 0; i < x.length; i++) { + double w = x[i]; + double wabs = Math.abs(w); + if (wabs < epsilon) { + value += batchScale*w * w / 2.0 / epsilon / sigmaSq; + derivative[i] += batchScale*w / epsilon / sigmaSq; + } else { + value += batchScale*(wabs - epsilon / 2) / sigmaSq; + derivative[i] += batchScale*((w < 0.0) ? 
-1.0 : 1.0) / sigmaSq; + } + } + } else if (prior == QUARTIC_PRIOR) { + double sigmaQu = sigma * sigma * sigma * sigma; + double lambda = 1 / 2.0 / sigmaQu; + for (int i = 0; i < x.length; i++) { + double w = x[i]; + value += batchScale * w * w * w * w * lambda; + derivative[i] += batchScale * w / sigmaQu; + } + } + } + + @Override + public void calculateStochastic(double[] x, double [] v, int[] batch){ + calculateStochasticGradientOnly(x,batch); + } + + @Override + public int dataDimension(){ + return data.length; + } + + + //TODO(mengqiu) SGD based methods are not yet compatible with featureVals + public void calculateStochasticGradientOnly(double[] x, int[] batch) { + + double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point + double[][] weights = to2D(x); + + double batchScale = ((double) batch.length)/((double) this.dataDimension()); + + // the expectations over counts + // first index is feature index, second index is of possible labeling + double[][] E = empty2D(); + // iterate over all the documents + for (int ind : batch) { + prob += expectedCountsAndValueForADoc(weights, E, ind); + } + + if (Double.isNaN(prob)) { // shouldn't be the case + throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunction.calculate()"); + } + + value = -prob; + + // compute the partial derivative for each feature by comparing expected counts to empirical counts + int index = 0; + for (int i = 0; i < E.length; i++) { + for (int j = 0; j < E[i].length; j++) { + derivative[index++] = (E[i][j] - batchScale*Ehat[i][j]); + if (VERBOSE) { + System.err.println("deriv(" + i + "," + j + ") = " + E[i][j] + " - " + Ehat[i][j] + " = " + derivative[index - 1]); + } + } + } + + applyPrior(x, batchScale); + } + + // re-inititalization is faster than Arrays.fill(arr, 0) + private void clearUpdateEs() { + for (int i = 0; i < eHat4Update.length; i++) + eHat4Update[i] = new double[eHat4Update[i].length]; + for (int i = 
0; i < e4Update.length; i++) + e4Update[i] = new double[e4Update[i].length]; + } + + /** + * Performs stochastic update of weights x (scaled by xscale) based + * on samples indexed by batch. + * NOTE: This function does not do regularization (regularization is done by the minimizer). + * + * @param x - unscaled weights + * @param xscale - how much to scale x by when performing calculations + * @param batch - indices of which samples to compute function over + * @param gscale - how much to scale adjustments to x + * @return value of function at specified x (scaled by xscale) for samples + */ + @Override + public double calculateStochasticUpdate(double[] x, double xscale, int[] batch, double gscale) { + double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point + // int[][] wis = getWeightIndices(); + double[][] weights = to2D(x, xscale); + + if (eHat4Update == null) { + eHat4Update = empty2D(); + e4Update = new double[eHat4Update.length][]; + for (int i = 0; i < e4Update.length; i++) + e4Update[i] = new double[eHat4Update[i].length]; + } else { + clearUpdateEs(); + } + + // Adjust weight by -gscale*gradient + // gradient is expected count - empirical count + // so we adjust by + gscale(empirical count - expected count) + + // iterate over all the documents + for (int ind : batch) { + // clearUpdateEs(); + + empiricalCountsForADoc(eHat4Update, ind); + prob += expectedCountsAndValueForADoc(weights, e4Update, ind); + + /* the commented out code below is to iterate over the batch docs instead of iterating over all + parameters at the end, which is more efficient; but it would also require us to clearUpdateEs() + for each document, which is likely to out-weight the cost of iterating over params once at the end + + for (int i = 0; i < data[ind].length; i++) { + // for each possible clique at this position + for (int j = 0; j < data[ind][i].length; j++) { + Index labelIndex = labelIndices.get(j); + // for each possible 
labeling for that clique + for (int k = 0; k < labelIndex.size(); k++) { + for (int n = 0; n < data[ind][i][j].length; n++) { + // Adjust weight by (eHat-e)*gscale (empirical count minus expected count scaled) + int fIndex = docData[i][j][n]; + x[wis[fIndex][k]] += (eHat4Update[fIndex][k] - e4Update[fIndex][k]) * gscale; + } + } + } + } + */ + } + + if (Double.isNaN(prob)) { // shouldn't be the case + throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunction.calculate()"); + } + + value = -prob; + + int index = 0; + for (int i = 0; i < e4Update.length; i++) { + for (int j = 0; j < e4Update[i].length; j++) { + x[index++] += (eHat4Update[i][j] - e4Update[i][j]) * gscale; + } + } + + return value; + } + + /** + * Computes value of function for specified value of x (scaled by xscale) + * only over samples indexed by batch. + * NOTE: This function does not do regularization (regularization is done by the minimizer). + * + * @param x - unscaled weights + * @param xscale - how much to scale x by when performing calculations + * @param batch - indices of which samples to compute function over + * @return value of function at specified x (scaled by xscale) for samples + */ + @Override + public double valueAt(double[] x, double xscale, int[] batch) { + double prob = 0; // the log prob of the sequence given the model, which is the negation of value at this point + // int[][] wis = getWeightIndices(); + double[][] weights = to2D(x, xscale); + + // iterate over all the documents + for (int ind : batch) { + prob += valueForADoc(weights, ind); + } + + if (Double.isNaN(prob)) { // shouldn't be the case + throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunction.calculate()"); + } + + value = -prob; + return value; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunctionForLOP.java 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunctionForLOP.java new file mode 100644 index 0000000..0bb6e3f --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunctionForLOP.java @@ -0,0 +1,441 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; +import edu.stanford.nlp.util.Index; + +import java.util.*; + +/** + * @author Mengqiu Wang + * TODO(mengqiu) currently only works with disjoint feature sets + * for non-disjoint feature sets, need to recompute EHat each iteration, and multiply in the scale + * in EHat and E calculations for each lopExpert + */ + +public class CRFLogConditionalObjectiveFunctionForLOP extends AbstractCachingDiffFunction implements HasCliquePotentialFunction { + + /** label indices - for all possible label sequences - for each feature */ + List> labelIndices; + Index classIndex; // didn't have before. Added since that's what is assumed everywhere. 
+ double[][][] Ehat; // empirical counts of all the features [lopIter][feature][class] + double[] sumOfObservedLogPotential; // empirical sum of all log potentials [lopIter] + double[][][][][] sumOfExpectedLogPotential; // sumOfExpectedLogPotential[m][i][j][lopIter][k] m-docNo;i-position;j-cliqueNo;k-label + List> featureIndicesSetArray; + List> featureIndicesListArray; + int window; + int numClasses; + int[] map; + int[][][][] data; // data[docIndex][tokenIndex][][] + double[][] lopExpertWeights; // lopExpertWeights[expertIter][weightIndex] + double[][][] lopExpertWeights2D; + int[][] labels; // labels[docIndex][tokenIndex] + int[][] learnedParamsMapping; + int numLopExpert; + boolean backpropTraining; + int domainDimension = -1; + + String crfType = "maxent"; + String backgroundSymbol; + + public static boolean VERBOSE = false; + + CRFLogConditionalObjectiveFunctionForLOP(int[][][][] data, int[][] labels, double[][] lopExpertWeights, int window, + Index classIndex, List> labelIndices, int[] map, String backgroundSymbol, int numLopExpert, + List> featureIndicesSetArray, List> featureIndicesListArray, boolean backpropTraining) { + this.window = window; + this.classIndex = classIndex; + this.numClasses = classIndex.size(); + this.labelIndices = labelIndices; + this.map = map; + this.data = data; + this.lopExpertWeights = lopExpertWeights; + this.labels = labels; + this.backgroundSymbol = backgroundSymbol; + this.numLopExpert = numLopExpert; + this.featureIndicesSetArray = featureIndicesSetArray; + this.featureIndicesListArray = featureIndicesListArray; + this.backpropTraining = backpropTraining; + initialize2DWeights(); + if (backpropTraining) { + computeEHat(); + } else { + logPotential(lopExpertWeights2D); + } + } + + @Override + public int domainDimension() { + if (domainDimension < 0) { + domainDimension = numLopExpert; + if (backpropTraining) { + // for (int i = 0; i < map.length; i++) { + // domainDimension += labelIndices[map[i]].size(); + // } + for (int i = 
0; i < numLopExpert; i++) { + List featureIndicesList = featureIndicesListArray.get(i); + double[][] expertWeights2D = lopExpertWeights2D[i]; + for (int fIndex: featureIndicesList) { + int len = expertWeights2D[fIndex].length; + domainDimension += len; + } + } + } + } + return domainDimension; + } + + @Override + public double[] initial() { + double[] initial = new double[domainDimension()]; + if (backpropTraining) { + learnedParamsMapping = new int[domainDimension()][3]; + int index = 0; + for (; index < numLopExpert; index++) { + initial[index] = 1.0; + } + for (int i = 0; i < numLopExpert; i++) { + List featureIndicesList = featureIndicesListArray.get(i); + double[][] expertWeights2D = lopExpertWeights2D[i]; + for (int fIndex: featureIndicesList) { + for (int j = 0; j < expertWeights2D[fIndex].length; j++) { + initial[index] = expertWeights2D[fIndex][j]; + learnedParamsMapping[index] = new int[]{i, fIndex, j}; + index++; + } + } + } + } else { + Arrays.fill(initial, 1.0); + } + return initial; + } + + public double[][][] empty2D() { + double[][][] d2 = new double[numLopExpert][][]; + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + double[][] d = new double[map.length][]; + // int index = 0; + for (int i = 0; i < map.length; i++) { + d[i] = new double[labelIndices.get(map[i]).size()]; + // cdm july 2005: below array initialization isn't necessary: JLS (3rd ed.) 
4.12.5 + // Arrays.fill(d[i], 0.0); + // index += labelIndices[map[i]].size(); + } + d2[lopIter] = d; + } + return d2; + } + + private void initialize2DWeights() { + lopExpertWeights2D = new double[numLopExpert][][]; + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + lopExpertWeights2D[lopIter] = CRFLogConditionalObjectiveFunction.to2D(lopExpertWeights[lopIter], labelIndices, map); + } + } + + private void computeEHat() { + Ehat = empty2D(); + + for (int m = 0; m < data.length; m++) { + int[][][] docData = data[m]; + int[] docLabels = labels[m]; + int[] windowLabels = new int[window]; + Arrays.fill(windowLabels, classIndex.indexOf(backgroundSymbol)); + + if (docLabels.length>docData.length) { // only true for self-training + // fill the windowLabel array with the extra docLabels + System.arraycopy(docLabels, 0, windowLabels, 0, windowLabels.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + for (int i = 0; i < docData.length; i++) { + System.arraycopy(windowLabels, 1, windowLabels, 0, window - 1); + windowLabels[window - 1] = docLabels[i]; + int[][] docDataI = docData[i]; + + for (int j = 0; j < docDataI.length; j++) { // j iterates over cliques + int[] docDataIJ = docDataI[j]; + int[] cliqueLabel = new int[j + 1]; + System.arraycopy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1); + CRFLabel crfLabel = new CRFLabel(cliqueLabel); + Index labelIndex = labelIndices.get(j); + + int observedLabelIndex = labelIndex.indexOf(crfLabel); + //System.err.println(crfLabel + " " + observedLabelIndex); + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + double[][] ehatOfIter = Ehat[lopIter]; + Set indicesSet = featureIndicesSetArray.get(lopIter); + for (int k = 0; k < docDataIJ.length; k++) { // k iterates over features + int featureIdx = docDataIJ[k]; + if 
(indicesSet.contains(featureIdx)) { + ehatOfIter[featureIdx][observedLabelIndex]++; + } + } + } + } + } + } + } + + private void logPotential(double[][][] learnedLopExpertWeights2D) { + sumOfExpectedLogPotential = new double[data.length][][][][]; + sumOfObservedLogPotential = new double[numLopExpert]; + + for (int m = 0; m < data.length; m++) { + int[][][] docData = data[m]; + int[] docLabels = labels[m]; + int[] windowLabels = new int[window]; + Arrays.fill(windowLabels, classIndex.indexOf(backgroundSymbol)); + + double[][][][] sumOfELPm = new double[docData.length][][][]; + + if (docLabels.length>docData.length) { // only true for self-training + // fill the windowLabel array with the extra docLabels + System.arraycopy(docLabels, 0, windowLabels, 0, windowLabels.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + for (int i = 0; i < docData.length; i++) { + System.arraycopy(windowLabels, 1, windowLabels, 0, window - 1); + windowLabels[window - 1] = docLabels[i]; + + double[][][] sumOfELPmi = new double[docData[i].length][][]; + int[][] docDataI = docData[i]; + + for (int j = 0; j < docDataI.length; j++) { // j iterates over cliques + int[] docDataIJ = docDataI[j]; + int[] cliqueLabel = new int[j + 1]; + System.arraycopy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1); + CRFLabel crfLabel = new CRFLabel(cliqueLabel); + Index labelIndex = labelIndices.get(j); + + double[][] sumOfELPmij = new double[numLopExpert][]; + + int observedLabelIndex = labelIndex.indexOf(crfLabel); + //System.err.println(crfLabel + " " + observedLabelIndex); + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + double[] sumOfELPmijIter = new double[labelIndex.size()]; + Set indicesSet = featureIndicesSetArray.get(lopIter); + for (int k = 0; k < docDataIJ.length; k++) { // k iterates over features + 
int featureIdx = docDataIJ[k]; + if (indicesSet.contains(featureIdx)) { + sumOfObservedLogPotential[lopIter] += learnedLopExpertWeights2D[lopIter][featureIdx][observedLabelIndex]; + // sum over potential of this clique over all possible labels, used later in calculating expected counts + for (int l = 0; l < labelIndex.size(); l++) { + sumOfELPmijIter[l] += learnedLopExpertWeights2D[lopIter][featureIdx][l]; + } + } + } + sumOfELPmij[lopIter] = sumOfELPmijIter; + } + sumOfELPmi[j] = sumOfELPmij; + } + sumOfELPm[i] = sumOfELPmi; + } + sumOfExpectedLogPotential[m] = sumOfELPm; + } + } + + public static double[] combineAndScaleLopWeights(int numLopExpert, double[][] lopExpertWeights, double[] lopScales) { + double[] newWeights = new double[lopExpertWeights[0].length]; + for (int i = 0; i < newWeights.length; i++) { + double tempWeight = 0; + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + tempWeight += lopExpertWeights[lopIter][i] * lopScales[lopIter]; + } + newWeights[i] = tempWeight; + } + return newWeights; + } + + public static double[][] combineAndScaleLopWeights2D(int numLopExpert, double[][][] lopExpertWeights2D, double[] lopScales) { + double[][] newWeights = new double[lopExpertWeights2D[0].length][]; + for (int i = 0; i < newWeights.length; i++) { + int innerDim = lopExpertWeights2D[0][i].length; + double[] innerWeights = new double[innerDim]; + for (int j = 0; j < innerDim; j++) { + double tempWeight = 0; + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + tempWeight += lopExpertWeights2D[lopIter][i][j] * lopScales[lopIter]; + } + innerWeights[j] = tempWeight; + } + newWeights[i] = innerWeights; + } + return newWeights; + } + + public double[][][] separateLopExpertWeights2D(double[] learnedParams) { + double[][][] learnedWeights2D = empty2D(); + for (int paramIndex = numLopExpert; paramIndex < learnedParams.length; paramIndex++) { + int[] mapping = learnedParamsMapping[paramIndex]; + learnedWeights2D[mapping[0]][mapping[1]][mapping[2]] 
= learnedParams[paramIndex]; + } + return learnedWeights2D; + } + + public double[][] separateLopExpertWeights(double[] learnedParams) { + double[][] learnedWeights = new double[numLopExpert][]; + double[][][] learnedWeights2D = separateLopExpertWeights2D(learnedParams); + for (int i = 0; i < numLopExpert; i++) { + learnedWeights[i] = CRFLogConditionalObjectiveFunction.to1D(learnedWeights2D[i], lopExpertWeights[i].length); + } + return learnedWeights; + } + + public double[] separateLopScales(double[] learnedParams) { + double[] rawScales = new double[numLopExpert]; + System.arraycopy(learnedParams, 0, rawScales, 0, numLopExpert); + return rawScales; + } + + public CliquePotentialFunction getCliquePotentialFunction(double[] x) { + double[] rawScales = separateLopScales(x); + double[] scales = ArrayMath.softmax(rawScales); + double[][][] learnedLopExpertWeights2D = lopExpertWeights2D; + if (backpropTraining) { + learnedLopExpertWeights2D = separateLopExpertWeights2D(x); + } + + double[][] combinedWeights2D = combineAndScaleLopWeights2D(numLopExpert, learnedLopExpertWeights2D, scales); + return new LinearCliquePotentialFunction(combinedWeights2D); + } + + // todo [cdm]: Below data[m] --> docData + /** + * Calculates both value and partial derivatives at the point x, and save them internally. 
+ */ + @Override + public void calculate(double[] x) { + + double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point + double[][][] E = empty2D(); + double[] eScales = new double[numLopExpert]; + + double[] rawScales = separateLopScales(x); + double[] scales = ArrayMath.softmax(rawScales); + double[][][] learnedLopExpertWeights2D = lopExpertWeights2D; + if (backpropTraining) { + learnedLopExpertWeights2D = separateLopExpertWeights2D(x); + logPotential(learnedLopExpertWeights2D); + } + + double[][] combinedWeights2D = combineAndScaleLopWeights2D(numLopExpert, learnedLopExpertWeights2D, scales); + // iterate over all the documents + for (int m = 0; m < data.length; m++) { + int[][][] docData = data[m]; + int[] docLabels = labels[m]; + double[][][][] sumOfELPm = sumOfExpectedLogPotential[m]; // sumOfExpectedLogPotential[m][i][j][lopIter][k] m-docNo;i-position;j-cliqueNo;k-label + + // make a clique tree for this document + CliquePotentialFunction cliquePotentialFunc = new LinearCliquePotentialFunction(combinedWeights2D); + CRFCliqueTree cliqueTree = CRFCliqueTree.getCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunc, null); + + // compute the log probability of the document given the model with the parameters x + int[] given = new int[window - 1]; + Arrays.fill(given, classIndex.indexOf(backgroundSymbol)); + if (docLabels.length > docData.length) { // only true for self-training + // fill the given array with the extra docLabels + System.arraycopy(docLabels, 0, given, 0, given.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + int label = docLabels[i]; + double p = 
cliqueTree.condLogProbGivenPrevious(i, label, given); + if (VERBOSE) { + System.err.println("P(" + label + "|" + ArrayMath.toString(given) + ")=" + p); + } + prob += p; + System.arraycopy(given, 1, given, 0, given.length - 1); + given[given.length - 1] = label; + } + + // compute the expected counts for this document, which we will need to compute the derivative + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + // for each possible clique at this position + double[][][] sumOfELPmi = sumOfELPm[i]; + for (int j = 0; j < docData[i].length; j++) { + double[][] sumOfELPmij = sumOfELPmi[j]; + Index labelIndex = labelIndices.get(j); + // for each possible labeling for that clique + for (int l = 0; l < labelIndex.size(); l++) { + int[] label = labelIndex.get(l).getLabel(); + double p = cliqueTree.prob(i, label); // probability of these labels occurring in this clique with these features + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + Set indicesSet = featureIndicesSetArray.get(lopIter); + double scale = scales[lopIter]; + double expected = sumOfELPmij[lopIter][l]; + for (int innerLopIter = 0; innerLopIter < numLopExpert; innerLopIter++) { + expected -= scales[innerLopIter] * sumOfELPmij[innerLopIter][l]; + } + expected *= scale; + eScales[lopIter] += (p * expected); + + double[][] eOfIter = E[lopIter]; + if (backpropTraining) { + for (int k = 0; k < docData[i][j].length; k++) { // k iterates over features + int featureIdx = docData[i][j][k]; + if (indicesSet.contains(featureIdx)) { + eOfIter[featureIdx][l] += p; + } + } + } + } + } + } + } + } + + if (Double.isNaN(prob)) { // shouldn't be the case + throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunctionForLOP.calculate()"); + } + + value = -prob; + if(VERBOSE){ + System.err.println("value is " + value); + } + // compute the partial derivative for each feature by comparing expected counts to empirical counts + for (int lopIter = 0; 
lopIter < numLopExpert; lopIter++) { + double scale = scales[lopIter]; + double observed = sumOfObservedLogPotential[lopIter]; + for (int j = 0; j < numLopExpert; j++) { + observed -= scales[j] * sumOfObservedLogPotential[j]; + } + observed *= scale; + double expected = eScales[lopIter]; + + derivative[lopIter] = (expected - observed); + if (VERBOSE) { + System.err.println("deriv(" + lopIter + ") = " + expected + " - " + observed + " = " + derivative[lopIter]); + } + } + if (backpropTraining) { + int dIndex = numLopExpert; + for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { + double scale = scales[lopIter]; + double[][] eOfExpert = E[lopIter]; + double[][] ehatOfExpert = Ehat[lopIter]; + List featureIndicesList = featureIndicesListArray.get(lopIter); + for (int fIndex: featureIndicesList) { + for (int j = 0; j < eOfExpert[fIndex].length; j++) { + derivative[dIndex++] = scale * (eOfExpert[fIndex][j] - ehatOfExpert[fIndex][j]); + if (VERBOSE) { + System.err.println("deriv[" + lopIter+ "](" + fIndex + "," + j + ") = " + scale + " * (" + eOfExpert[fIndex][j] + " - " + ehatOfExpert[fIndex][j] + ") = " + derivative[dIndex - 1]); + } + } + } + } + assert(dIndex == domainDimension()); + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFNonLinearLogConditionalObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFNonLinearLogConditionalObjectiveFunction.java new file mode 100644 index 0000000..b121720 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFNonLinearLogConditionalObjectiveFunction.java @@ -0,0 +1,906 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; +import edu.stanford.nlp.optimization.HasL1ParamRange; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.util.Generics; +import 
edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Triple; + +import java.util.*; + +/** + * @author Mengqiu Wang + */ + +public class CRFNonLinearLogConditionalObjectiveFunction extends AbstractCachingDiffFunction implements HasCliquePotentialFunction, HasL1ParamRange { + + public static final int NO_PRIOR = 0; + public static final int QUADRATIC_PRIOR = 1; + /* Use a Huber robust regression penalty (L1 except very near 0) not L2 */ + public static final int HUBER_PRIOR = 2; + public static final int QUARTIC_PRIOR = 3; + public static final int L1_PRIOR = 4; + public static final int L1_NODE_L2_EDGE_PRIOR = 5; + public static final int L1_SPARSENODE_L2_EDGE_PRIOR = 6; + boolean useOutputLayer; + boolean useHiddenLayer; + boolean useSigmoid; + SeqClassifierFlags flags; + + int count = 0; + protected int prior; + protected double sigma; + protected double priorL1Lambda; //used for L1 Prior, L2's lambda calculated from sigma (1/(2*sigma*sigma)) + protected double epsilon; + Random random = new Random(2147483647L); + /** label indices - for all possible label sequences - for each feature */ + List> labelIndices; + Index classIndex; // didn't have before. Added since that's what is assumed everywhere. 
+ double[][] Ehat; // empirical counts of all the linear features [feature][class] + double[][] Uhat; // empirical counts of all the output layer features [num of class][input layer size] + double[][] What; // empirical counts of all the input layer features [input layer size][featureIndex.size()] + int window; + int numClasses; + // hidden layer number of neuron = numHiddenUnits * numClasses + int numHiddenUnits; + int[] map; + int[][][][] data; // data[docIndex][tokenIndex][][] + double[][][][] featureVal; // featureVal[docIndex][tokenIndex][][] + int[][] docWindowLabels; + + int[][] labels; // labels[docIndex][tokenIndex] + int domainDimension = -1; + int inputLayerSize = -1; + int outputLayerSize = -1; + int edgeParamCount = -1; + int numNodeFeatures = -1; + int numEdgeFeatures = -1; + int beforeOutputWeights = -1; + int originalFeatureCount = -1; + + int[][] weightIndices; + + String backgroundSymbol; + + public static boolean VERBOSE = false; + + public static int getPriorType(String priorTypeStr) + { + if (priorTypeStr == null) return QUADRATIC_PRIOR; // default + if ("QUADRATIC".equalsIgnoreCase(priorTypeStr)) { + return QUADRATIC_PRIOR; + } else if ("L1".equalsIgnoreCase(priorTypeStr)) { + return L1_PRIOR; + } else if ("L1_NODE_L2_EDGE".equalsIgnoreCase(priorTypeStr)) { + return L1_NODE_L2_EDGE_PRIOR; + } else if ("L1_SPARSENODE_L2_EDGE".equalsIgnoreCase(priorTypeStr)) { + return L1_SPARSENODE_L2_EDGE_PRIOR; + } else if ("HUBER".equalsIgnoreCase(priorTypeStr)) { + return HUBER_PRIOR; + } else if ("QUARTIC".equalsIgnoreCase(priorTypeStr)) { + return QUARTIC_PRIOR; + } else if ("NONE".equalsIgnoreCase(priorTypeStr)) { + return NO_PRIOR; + } else { + throw new IllegalArgumentException("Unknown prior type: " + priorTypeStr); + } + } + + CRFNonLinearLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, Index classIndex, List> labelIndices, int[] map, SeqClassifierFlags flags, int numNodeFeatures, int numEdgeFeatures, double[][][][] 
featureVal) { + this.window = window; + this.classIndex = classIndex; + this.numClasses = classIndex.size(); + this.labelIndices = labelIndices; + this.data = data; + this.featureVal = featureVal; + this.flags = flags; + this.map = map; + this.labels = labels; + this.prior = getPriorType(flags.priorType); + this.backgroundSymbol = flags.backgroundSymbol; + this.sigma = flags.sigma; + this.priorL1Lambda = flags.priorL1Lambda; + this.outputLayerSize = numClasses; + this.numHiddenUnits = flags.numHiddenUnits; + if (flags.arbitraryInputLayerSize != -1) + this.inputLayerSize = flags.arbitraryInputLayerSize; + else + this.inputLayerSize = numHiddenUnits * numClasses; + this.numNodeFeatures = numNodeFeatures; + this.numEdgeFeatures = numEdgeFeatures; + System.err.println("numOfEdgeFeatures: " + numEdgeFeatures); + this.useOutputLayer = flags.useOutputLayer; + this.useHiddenLayer = flags.useHiddenLayer; + this.useSigmoid = flags.useSigmoid; + this.docWindowLabels = new int[data.length][]; + if (!useOutputLayer) { + System.err.println("Output layer not activated, inputLayerSize must be equal to numClasses, setting it to " + numClasses); + this.inputLayerSize = numClasses; + } else if (flags.softmaxOutputLayer && !(flags.sparseOutputLayer || flags.tieOutputLayer)) { + throw new RuntimeException("flags.softmaxOutputLayer == true, but neither flags.sparseOutputLayer or flags.tieOutputLayer is true"); + } + empiricalCounts(); + } + + @Override + public int domainDimension() { + if (domainDimension < 0) { + domainDimension = 0; + edgeParamCount = numEdgeFeatures * labelIndices.get(1).size(); + + originalFeatureCount = 0; + for (int i = 0; i < map.length; i++) { + int s = labelIndices.get(map[i]).size(); + originalFeatureCount += s; + } + + domainDimension += edgeParamCount; + domainDimension += inputLayerSize * numNodeFeatures; + beforeOutputWeights = domainDimension; + // TODO(mengqiu) temporary fix for debugging + if (useOutputLayer) { + if (flags.sparseOutputLayer) { + 
domainDimension += outputLayerSize * numHiddenUnits; + } else if (flags.tieOutputLayer) { + domainDimension += 1 * numHiddenUnits; + } else { + domainDimension += outputLayerSize * inputLayerSize; + } + } + System.err.println("edgeParamCount: "+edgeParamCount); + System.err.println("originalFeatureCount: "+originalFeatureCount); + System.err.println("beforeOutputWeights: "+beforeOutputWeights); + System.err.println("domainDimension: "+domainDimension); + } + return domainDimension; + } + + @Override + //TODO(mengqiu) initialize edge feature weights to be weights from CRF + public double[] initial() { + double[] initial = new double[domainDimension()]; + // randomly initialize weights + if (useHiddenLayer || useOutputLayer) { + double epsilon = 0.1; + double twoEpsilon = epsilon * 2; + int count = 0; + double val = 0; + + // init edge param weights + for (int i = 0; i < edgeParamCount; i++) { + val = random.nextDouble() * twoEpsilon - epsilon; + initial[count++] = val; + } + + if (flags.blockInitialize) { + double fanIn = 1/Math.sqrt(numNodeFeatures+0.0); + double twoFanIn = 2.0 * fanIn; + int interval = numNodeFeatures / numHiddenUnits; + for (int i = 0; i < numHiddenUnits; i++) { + int lower = i * interval; + int upper = (i + 1) * interval; + if (i == numHiddenUnits - 1) + upper = numNodeFeatures; + for (int j = 0; j < outputLayerSize; j++) { + for (int k = 0; k < numNodeFeatures; k++) { + val = 0; + if (k >= lower && k < upper) { + val = random.nextDouble() * twoFanIn - fanIn; + } + initial[count++] = val; + } + } + } + if (count != beforeOutputWeights) { + throw new RuntimeException("after blockInitialize, param Index (" + count + ") not equal to beforeOutputWeights (" + beforeOutputWeights + ")"); + } + } else { + double fanIn = 1 / Math.sqrt(numNodeFeatures+0.0); + double twoFanIn = 2.0 * fanIn; + for (int i = edgeParamCount; i < beforeOutputWeights; i++) { + val = random.nextDouble() * twoFanIn - fanIn; + initial[count++] = val; + } + } + + // init output 
layer weights + if (flags.sparseOutputLayer) { + for (int i = 0; i < outputLayerSize; i++) { + double total = 1; + for (int j = 0; j < numHiddenUnits-1; j++) { + val = random.nextDouble() * total; + initial[count++] = val; + total -= val; + } + initial[count++] = total; + } + } else if (flags.tieOutputLayer) { + double total = 1; + double sum = 0; + for (int j = 0; j < numHiddenUnits-1; j++) { + if (flags.hardcodeSoftmaxOutputWeights) + val = 1.0 / numHiddenUnits; + else { + val = random.nextDouble() * total; + total -= val; + } + initial[count++] = val; + } + if (flags.hardcodeSoftmaxOutputWeights) + initial[count++] = 1.0 / numHiddenUnits; + else + initial[count++] = total; + } else { + for (int i = beforeOutputWeights; i < domainDimension(); i++) { + val = random.nextDouble() * twoEpsilon - epsilon; + initial[count++] = val; + } + } + if (count != domainDimension()) { + throw new RuntimeException("after param initialization, param Index (" + count + ") not equal to domainDimension (" + domainDimension() + ")"); + } + } + return initial; + } + + private void empiricalCounts() { + Ehat = empty2D(); + + for (int m = 0; m < data.length; m++) { + int[][][] docData = data[m]; + int[] docLabels = labels[m]; + int[] windowLabels = new int[window]; + Arrays.fill(windowLabels, classIndex.indexOf(backgroundSymbol)); + + if (docLabels.length>docData.length) { // only true for self-training + // fill the windowLabel array with the extra docLabels + System.arraycopy(docLabels, 0, windowLabels, 0, windowLabels.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + for (int i = 0; i < docData.length; i++) { + System.arraycopy(windowLabels, 1, windowLabels, 0, window - 1); + windowLabels[window - 1] = docLabels[i]; + // for (int j = 1; j < docData[i].length; j++) { // j starting from 1, skip all node 
features + //TODO(mengqiu) generalize this for bigger cliques + int j = 1; + int[] cliqueLabel = new int[j + 1]; + System.arraycopy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1); + CRFLabel crfLabel = new CRFLabel(cliqueLabel); + int labelIndex = labelIndices.get(j).indexOf(crfLabel); + int[] cliqueFeatures = docData[i][j]; + //System.err.println(crfLabel + " " + labelIndex); + for (int n = 0; n < cliqueFeatures.length; n++) { + Ehat[cliqueFeatures[n]][labelIndex]++; + } + } + } + } + + private double[][] emptyU() { + int innerSize = inputLayerSize; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + innerSize = numHiddenUnits; + } + int outerSize = outputLayerSize; + if (flags.tieOutputLayer) { + outerSize = 1; + } + + double[][] temp = new double[outerSize][innerSize]; + for (int i = 0; i < outerSize; i++) { + temp[i] = new double[innerSize]; + } + return temp; + } + + private double[][] emptyW() { + // TODO(mengqiu) temporary fix for debugging + double[][] temp = new double[inputLayerSize][numNodeFeatures]; + for (int i = 0; i < inputLayerSize; i++) { + temp[i] = new double[numNodeFeatures]; + } + return temp; + } + + public Triple separateWeights(double[] x) { + double[] linearWeights = new double[edgeParamCount]; + System.arraycopy(x, 0, linearWeights, 0, edgeParamCount); + double[][] linearWeights2D = to2D(linearWeights); + int index = edgeParamCount; + + double[][] inputLayerWeights = emptyW(); + for (int i = 0; i < inputLayerWeights.length; i++) { + for (int j = 0; j < inputLayerWeights[i].length; j++) { + inputLayerWeights[i][j] = x[index++]; + } + } + + double[][] outputLayerWeights = emptyU(); + for (int i = 0; i < outputLayerWeights.length; i++) { + for (int j = 0; j < outputLayerWeights[i].length; j++) { + if (useOutputLayer) { + if (flags.hardcodeSoftmaxOutputWeights) + outputLayerWeights[i][j] = 1.0 / numHiddenUnits; + else + outputLayerWeights[i][j] = x[index++]; + } else + outputLayerWeights[i][j] = 1; + } + } + assert(index == 
x.length); + return new Triple(linearWeights2D, inputLayerWeights, outputLayerWeights); + } + + public CliquePotentialFunction getCliquePotentialFunction(double[] x) { + Triple allParams = separateWeights(x); + double[][] linearWeights = allParams.first(); + double[][] W = allParams.second(); // inputLayerWeights + double[][] U = allParams.third(); // outputLayerWeights + return new NonLinearCliquePotentialFunction(linearWeights, W, U, flags); + } + + /** + * Calculates both value and partial derivatives at the point x, and save them internally. + */ + @Override + public void calculate(double[] x) { + + double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point + Triple allParams = separateWeights(x); + double[][] linearWeights = allParams.first(); + double[][] W = allParams.second(); // inputLayerWeights + double[][] U = allParams.third(); // outputLayerWeights + + double[][] Y = null; + if (flags.softmaxOutputLayer) { + Y = new double[U.length][]; + for (int i = 0; i < U.length; i++) { + Y[i] = ArrayMath.softmax(U[i]); + } + } + + double[][] What = emptyW(); + double[][] Uhat = emptyU(); + + // the expectations over counts + // first index is feature index, second index is of possible labeling + double[][] E = empty2D(); + double[][] eW = emptyW(); + double[][] eU = emptyU(); + + // iterate over all the documents + for (int m = 0; m < data.length; m++) { + int[][][] docData = data[m]; + int[] docLabels = labels[m]; + + double[][][] featureVal3DArr = null; + if (featureVal != null) + featureVal3DArr = featureVal[m]; + + // make a clique tree for this document + CRFCliqueTree cliqueTree = CRFCliqueTree.getCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, + backgroundSymbol, new NonLinearCliquePotentialFunction(linearWeights, W, U, flags), featureVal3DArr); + + // compute the log probability of the document given the model with the parameters x + int[] given = new int[window - 1]; + 
Arrays.fill(given, classIndex.indexOf(backgroundSymbol)); + int[] windowLabels = new int[window]; + Arrays.fill(windowLabels, classIndex.indexOf(backgroundSymbol)); + + if (docLabels.length>docData.length) { // only true for self-training + // fill the given array with the extra docLabels + System.arraycopy(docLabels, 0, given, 0, given.length); + System.arraycopy(docLabels, 0, windowLabels, 0, windowLabels.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + int label = docLabels[i]; + double p = cliqueTree.condLogProbGivenPrevious(i, label, given); + if (VERBOSE) { + System.err.println("P(" + label + "|" + ArrayMath.toString(given) + ")=" + p); + } + prob += p; + System.arraycopy(given, 1, given, 0, given.length - 1); + given[given.length - 1] = label; + } + + // compute the expected counts for this document, which we will need to compute the derivative + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + // for each possible clique at this position + System.arraycopy(windowLabels, 1, windowLabels, 0, window - 1); + windowLabels[window - 1] = docLabels[i]; + for (int j = 0; j < docData[i].length; j++) { + Index labelIndex = labelIndices.get(j); + // for each possible labeling for that clique + int[] cliqueFeatures = docData[i][j]; + double[] As = null; + double[] fDeriv = null; + double[][] yTimesA = null; + double[] sumOfYTimesA = null; + + // calculating empirical counts of node features + if (j == 0) { + double[] featureValArr = null; + if (featureVal3DArr != null) + featureValArr = featureVal3DArr[i][j]; + As = NonLinearCliquePotentialFunction.hiddenLayerOutput(W, cliqueFeatures, flags, featureValArr); + fDeriv = new double[inputLayerSize]; + 
double fD = 0; + for (int q = 0; q < inputLayerSize; q++) { + if (useSigmoid) { + fD = As[q] * (1 - As[q]); + } else { + fD = 1 - As[q] * As[q]; + } + fDeriv[q] = fD; + } + + // calculating yTimesA for softmax + if (flags.softmaxOutputLayer) { + double val = 0; + + yTimesA = new double[outputLayerSize][numHiddenUnits]; + for (int ii = 0; ii < outputLayerSize; ii++) { + yTimesA[ii] = new double[numHiddenUnits]; + } + sumOfYTimesA = new double[outputLayerSize]; + + for (int k = 0; k < outputLayerSize; k++) { + double[] Yk = null; + if (flags.tieOutputLayer) { + Yk = Y[0]; + } else { + Yk = Y[k]; + } + double sum = 0; + for (int q = 0; q < inputLayerSize; q++) { + if (q % outputLayerSize == k) { + int hiddenUnitNo = q / outputLayerSize; + val = As[q] * Yk[hiddenUnitNo]; + yTimesA[k][hiddenUnitNo] = val; + sum += val; + } + } + sumOfYTimesA[k] = sum; + } + } + + // calculating Uhat What + int[] cliqueLabel = new int[j + 1]; + System.arraycopy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1); + + CRFLabel crfLabel = new CRFLabel(cliqueLabel); + int givenLabelIndex = labelIndex.indexOf(crfLabel); + double[] Uk = null; + double[] UhatK = null; + double[] Yk = null; + double[] yTimesAK = null; + double sumOfYTimesAK = 0; + if (flags.tieOutputLayer) { + Uk = U[0]; + UhatK = Uhat[0]; + if (flags.softmaxOutputLayer) { + Yk = Y[0]; + } + } else { + Uk = U[givenLabelIndex]; + UhatK = Uhat[givenLabelIndex]; + if (flags.softmaxOutputLayer) { + Yk = Y[givenLabelIndex]; + } + } + + if (flags.softmaxOutputLayer) { + yTimesAK = yTimesA[givenLabelIndex]; + sumOfYTimesAK = sumOfYTimesA[givenLabelIndex]; + } + + for (int k = 0; k < inputLayerSize; k++) { + double deltaK = 1; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (k % outputLayerSize == givenLabelIndex) { + int hiddenUnitNo = k / outputLayerSize; + if (flags.softmaxOutputLayer) { + UhatK[hiddenUnitNo] += (yTimesAK[hiddenUnitNo] - Yk[hiddenUnitNo] * sumOfYTimesAK); + deltaK *= Yk[hiddenUnitNo]; + } else { + 
UhatK[hiddenUnitNo] += As[k]; + deltaK *= Uk[hiddenUnitNo]; + } + } + } else { + UhatK[k] += As[k]; + if (useOutputLayer) { + deltaK *= Uk[k]; + } + } + if (useHiddenLayer) + deltaK *= fDeriv[k]; + if (useOutputLayer) { + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (k % outputLayerSize == givenLabelIndex) { + double[] WhatK = What[k]; + for (int n = 0; n < cliqueFeatures.length; n++) { + double fVal = 1.0; + if (featureVal3DArr != null) + fVal = featureVal3DArr[i][j][n]; + WhatK[cliqueFeatures[n]] += deltaK * fVal; + } + } + } else { + double[] WhatK = What[k]; + double fVal = 1.0; + for (int n = 0; n < cliqueFeatures.length; n++) { + fVal = 1.0; + if (featureVal3DArr != null) + fVal = featureVal3DArr[i][j][n]; + WhatK[cliqueFeatures[n]] += deltaK * fVal; + } + } + } else { + if (k == givenLabelIndex) { + double[] WhatK = What[k]; + double fVal = 1.0; + for (int n = 0; n < cliqueFeatures.length; n++) { + fVal = 1.0; + if (featureVal3DArr != null) + fVal = featureVal3DArr[i][j][n]; + WhatK[cliqueFeatures[n]] += deltaK * fVal; + } + } + } + } + } + + // calculate expected count of features + for (int k = 0; k < labelIndex.size(); k++) { // labelIndex.size() == numClasses + int[] label = labelIndex.get(k).getLabel(); + double p = cliqueTree.prob(i, label); // probability of these labels occurring in this clique with these features + if (j == 0) { // for node features + double[] Uk = null; + double[] eUK = null; + double[] Yk = null; + if (flags.tieOutputLayer) { + Uk = U[0]; + eUK = eU[0]; + if (flags.softmaxOutputLayer) { + Yk = Y[0]; + } + } else { + Uk = U[k]; + eUK = eU[k]; + if (flags.softmaxOutputLayer) { + Yk = Y[k]; + } + } + if (useOutputLayer) { + for (int q = 0; q < inputLayerSize; q++) { + double deltaQ = 1; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (q % outputLayerSize == k) { + int hiddenUnitNo = q / outputLayerSize; + if (flags.softmaxOutputLayer) { + eUK[hiddenUnitNo] += (yTimesA[k][hiddenUnitNo] - Yk[hiddenUnitNo] * 
sumOfYTimesA[k]) * p; + deltaQ = Yk[hiddenUnitNo]; + } else { + eUK[hiddenUnitNo] += As[q] * p; + deltaQ = Uk[hiddenUnitNo]; + } + } + } else { + eUK[q] += As[q] * p; + deltaQ = Uk[q]; + } + if (useHiddenLayer) + deltaQ *= fDeriv[q]; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (q % outputLayerSize == k) { + double[] eWq = eW[q]; + double fVal = 1.0; + for (int n = 0; n < cliqueFeatures.length; n++) { + fVal = 1.0; + if (featureVal3DArr != null) + fVal = featureVal3DArr[i][j][n]; + eWq[cliqueFeatures[n]] += deltaQ * p * fVal; + } + } + } else { + double[] eWq = eW[q]; + double fVal = 1.0; + for (int n = 0; n < cliqueFeatures.length; n++) { + fVal = 1.0; + if (featureVal3DArr != null) + fVal = featureVal3DArr[i][j][n]; + eWq[cliqueFeatures[n]] += deltaQ * p * fVal; + } + } + } + } else { + double deltaK = 1; + if (useHiddenLayer) + deltaK *= fDeriv[k]; + double[] eWK = eW[k]; + double fVal = 1.0; + for (int n = 0; n < cliqueFeatures.length; n++) { + fVal = 1.0; + if (featureVal3DArr != null) + fVal = featureVal3DArr[i][j][n]; + eWK[cliqueFeatures[n]] += deltaK * p * fVal; + } + } + } else { // for edge features + for (int n = 0; n < cliqueFeatures.length; n++) { + E[cliqueFeatures[n]][k] += p; + } + } + } + } + } + } + + if (Double.isNaN(prob)) { // shouldn't be the case + throw new RuntimeException("Got NaN for prob in CRFNonLinearLogConditionalObjectiveFunction.calculate()"); + } + + value = -prob; + if(VERBOSE){ + System.err.println("value is " + value); + } + + // compute the partial derivative for each feature by comparing expected counts to empirical counts + int index = 0; + for (int i = 0; i < E.length; i++) { + for (int j = 0; j < E[i].length; j++) { + derivative[index++] = (E[i][j] - Ehat[i][j]); + if (VERBOSE) { + System.err.println("linearWeights deriv(" + i + "," + j + ") = " + E[i][j] + " - " + Ehat[i][j] + " = " + derivative[index - 1]); + } + } + } + if (index != edgeParamCount) + throw new RuntimeException("after edge derivative, 
index("+index+") != edgeParamCount("+edgeParamCount+")"); + + for (int i = 0; i < eW.length; i++) { + for (int j = 0; j < eW[i].length; j++) { + derivative[index++] = (eW[i][j] - What[i][j]); + if (VERBOSE) { + System.err.println("inputLayerWeights deriv(" + i + "," + j + ") = " + eW[i][j] + " - " + What[i][j] + " = " + derivative[index - 1]); + } + } + } + + if (index != beforeOutputWeights) + throw new RuntimeException("after W derivative, index("+index+") != beforeOutputWeights("+beforeOutputWeights+")"); + + if (useOutputLayer) { + for (int i = 0; i < eU.length; i++) { + for (int j = 0; j < eU[i].length; j++) { + if (flags.hardcodeSoftmaxOutputWeights) + derivative[index++] = 0; + else + derivative[index++] = (eU[i][j] - Uhat[i][j]); + if (VERBOSE) { + System.err.println("outputLayerWeights deriv(" + i + "," + j + ") = " + eU[i][j] + " - " + Uhat[i][j] + " = " + derivative[index - 1]); + } + } + } + } + + if (index != x.length) + throw new RuntimeException("after W derivative, index("+index+") != x.length("+x.length+")"); + + int regSize = x.length; + if (flags.skipOutputRegularization || flags.softmaxOutputLayer || flags.hardcodeSoftmaxOutputWeights) { + regSize = beforeOutputWeights; + } + + // incorporate priors + if (prior == QUADRATIC_PRIOR) { + double sigmaSq = sigma * sigma; + double twoSigmaSq = 2.0 * sigmaSq; + double w = 0; + double valueSum = 0; + for (int i = 0; i < regSize; i++) { + w = x[i]; + valueSum += w * w; + derivative[i] += w / sigmaSq; + } + value += valueSum / twoSigmaSq; + } else if (prior == L1_PRIOR) { // Do nothing, as the prior will be applied in OWL-QN + } else if (prior == L1_NODE_L2_EDGE_PRIOR) { + int paramIndex = 0; + double sigmaSq = sigma * sigma; + double lambda = 1 / 2.0 / sigmaSq; + double w = 0; + double valueSum = 0; + for (; paramIndex < edgeParamCount; paramIndex++) { + w = x[paramIndex]; + valueSum += w * w; + derivative[paramIndex] += w / sigmaSq; + } + value += valueSum * lambda; + } else if (prior == 
L1_SPARSENODE_L2_EDGE_PRIOR) { + double sigmaSq = sigma * sigma; + double lambda = 1 / 2.0 / sigmaSq; + double w = 0; + double valueSum = 0; + for (int paramIndex = 0; paramIndex < edgeParamCount; paramIndex++) { + w = x[paramIndex]; + valueSum += w * w; + derivative[paramIndex] += w / sigmaSq; + } + value += valueSum * lambda; + for (int nodeFeatureIndex = 0; nodeFeatureIndex < numNodeFeatures; nodeFeatureIndex++) { // for each node feature, we enforce the sparsity + for (int outputClassIndex = 0; outputClassIndex < numClasses; outputClassIndex++) { + double maxParamAbsVal = 0; + int maxHiddenUnitIndex = 0; + for (int hiddenUnitIndex = 0; hiddenUnitIndex < numHiddenUnits; hiddenUnitIndex++) { + int firstLayerIndex = hiddenUnitIndex * numClasses + outputClassIndex; + double absWeight = Math.abs(W[firstLayerIndex][nodeFeatureIndex]); + if (absWeight > maxParamAbsVal) { + maxParamAbsVal = absWeight; + maxHiddenUnitIndex = hiddenUnitIndex; + } + } + for (int hiddenUnitIndex = 0; hiddenUnitIndex < numHiddenUnits; hiddenUnitIndex++) { + if (hiddenUnitIndex == maxHiddenUnitIndex) {// only performs L2 regularization on max param, the rest will be applied L1 in OWL-QN + int firstLayerIndex = hiddenUnitIndex * numClasses + outputClassIndex; + int oneDIndex = firstLayerIndex * numNodeFeatures + nodeFeatureIndex + edgeParamCount; + w = x[oneDIndex]; + value += w * w * lambda; + derivative[oneDIndex] += w / sigmaSq; + } + } + } + } + } else if (prior == HUBER_PRIOR) { + double sigmaSq = sigma * sigma; + for (int i = 0; i < regSize; i++) { + double w = x[i]; + double wabs = Math.abs(w); + if (wabs < epsilon) { + value += w * w / 2.0 / epsilon / sigmaSq; + derivative[i] += w / epsilon / sigmaSq; + } else { + value += (wabs - epsilon / 2) / sigmaSq; + derivative[i] += ((w < 0.0) ? 
-1.0 : 1.0) / sigmaSq; + } + } + } else if (prior == QUARTIC_PRIOR) { + double sigmaQu = sigma * sigma * sigma * sigma; + for (int i = 0; i < regSize; i++) { + double k = 1.0; + double w = x[i]; + value += k * w * w * w * w / 2.0 / sigmaQu; + derivative[i] += k * w / sigmaQu; + } + } + + if (flags.regularizeSoftmaxTieParam && + flags.softmaxOutputLayer && !flags.hardcodeSoftmaxOutputWeights) { + // lambda is 1/(2*sigma*sigma) + double softmaxLambda = flags.softmaxTieLambda; + double oneDividedByTwoSigmaSq = softmaxLambda * 2; + double y = 0; + double mean = 1.0 / numHiddenUnits; + /* + double[] Yk = Y[0]; + for (int i = 0; i < Yk.length; i++) { + y = Yk[i]; + value += (y-mean) * (y-mean) * softmaxLambda; + double derivAdd = 0; + for (int j = 0; j < Yk.length; j++) { + if (j == i) { + derivAdd += (y - mean) * y * (1-y); + } else { + derivAdd -= (Yk[j] - mean) * Yk[j] * y; + } + } + double grad = oneDividedByTwoSigmaSq * derivAdd; + System.err.println("U["+i+"]="+x[beforeOutputWeights+i]+", Y["+i+"]="+y+", grad="+grad); + y = x[beforeOutputWeights+i]; + } + */ + int count = 0; + for (int i = 0; i < U.length; i++) { + for (int j = 0; j < U[i].length; j++) { + y = U[i][j]; + value += (y-mean) * (y-mean) * softmaxLambda; + double grad = (y-mean) * oneDividedByTwoSigmaSq; + // System.err.println("U["+i+"]["+j+"]="+x[beforeOutputWeights+count]+", Y["+i+"]["+j+"]="+Y[i][j]+", grad="+grad); + derivative[beforeOutputWeights+count] += grad; + count++; + } + } + } + } + + public Set getL1ParamRange(double[] x) { + if (prior == L1_PRIOR) { + Set paramRange = Generics.newHashSet(x.length); + for (int i = 0; i < x.length; i++) + paramRange.add(i); + return paramRange; + } else if (prior == L1_NODE_L2_EDGE_PRIOR) { + Set paramRange = Generics.newHashSet(beforeOutputWeights - edgeParamCount); + for (int i = edgeParamCount; i < beforeOutputWeights; i++) + paramRange.add(i); + return paramRange; + } else if (prior == L1_SPARSENODE_L2_EDGE_PRIOR) { + double[][] W = 
separateWeights(x).second(); // inputLayerWeights + Set paramRange = Generics.newHashSet(); + for (int nodeFeatureIndex = 0; nodeFeatureIndex < numNodeFeatures; nodeFeatureIndex++) { // for each node feature, we enforce the sparsity + for (int outputClassIndex = 0; outputClassIndex < numClasses; outputClassIndex++) { + double maxParamAbsVal = 0; + int maxHiddenUnitIndex = 0; + for (int hiddenUnitIndex = 0; hiddenUnitIndex < numHiddenUnits; hiddenUnitIndex++) { + int firstLayerIndex = hiddenUnitIndex * numClasses + outputClassIndex; + double absWeight = Math.abs(W[firstLayerIndex][nodeFeatureIndex]); + if (absWeight > maxParamAbsVal) { + maxParamAbsVal = absWeight; + maxHiddenUnitIndex = hiddenUnitIndex; + } + } + for (int hiddenUnitIndex = 0; hiddenUnitIndex < numHiddenUnits; hiddenUnitIndex++) { + if (hiddenUnitIndex != maxHiddenUnitIndex) {// do not penalize the max param + int firstLayerIndex = hiddenUnitIndex * numClasses + outputClassIndex; + int oneDIndex = firstLayerIndex * numNodeFeatures + nodeFeatureIndex + edgeParamCount; + paramRange.add(oneDIndex); + } + } + } + } + return paramRange; + } else { + return Generics.newHashSet(); + } + } + + public double[][] to2D(double[] linearWeights) { + double[][] newWeights = new double[numEdgeFeatures][]; + int index = 0; + int labelIndicesSize = labelIndices.get(1).size(); + for (int i = 0; i < numEdgeFeatures; i++) { + newWeights[i] = new double[labelIndicesSize]; + System.arraycopy(linearWeights, index, newWeights[i], 0, labelIndicesSize); + index += labelIndicesSize; + } + return newWeights; + } + + public double[][] empty2D() { + double[][] d = new double[numEdgeFeatures][]; + // int index = 0; + int labelIndicesSize = labelIndices.get(1).size(); + for (int i = 0; i < numEdgeFeatures; i++) { + d[i] = new double[labelIndicesSize]; + // cdm july 2005: below array initialization isn't necessary: JLS (3rd ed.) 
4.12.5 + // Arrays.fill(d[i], 0.0); + // index += labelIndices.get(map[i]).size(); + } + return d; + } + + public double[][] emptyFull2D() { + double[][] d = new double[map.length][]; + // int index = 0; + for (int i = 0; i < map.length; i++) { + d[i] = new double[labelIndices.get(map[i]).size()]; + // cdm july 2005: below array initialization isn't necessary: JLS (3rd ed.) 4.12.5 + // Arrays.fill(d[i], 0.0); + // index += labelIndices.get(map[i]).size(); + } + return d; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFNonLinearSecondOrderLogConditionalObjectiveFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFNonLinearSecondOrderLogConditionalObjectiveFunction.java new file mode 100644 index 0000000..a4edc4c --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CRFNonLinearSecondOrderLogConditionalObjectiveFunction.java @@ -0,0 +1,821 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.optimization.AbstractCachingDiffFunction; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Triple; +import edu.stanford.nlp.util.Quadruple; + +import java.util.*; + +/** + * @author Mengqiu Wang + */ + +public class CRFNonLinearSecondOrderLogConditionalObjectiveFunction extends AbstractCachingDiffFunction implements HasCliquePotentialFunction { + + public static final int NO_PRIOR = 0; + public static final int QUADRATIC_PRIOR = 1; + /* Use a Huber robust regression penalty (L1 except very near 0) not L2 */ + public static final int HUBER_PRIOR = 2; + public static final int QUARTIC_PRIOR = 3; + + boolean useOutputLayer; + boolean useHiddenLayer; + boolean useSigmoid; + SeqClassifierFlags flags; + + int count = 0; + protected int prior; + protected double sigma; + protected double 
epsilon; + Random random = new Random(2147483647L); + /** label indices - for all possible label sequences - for each feature */ + List> labelIndices; + Index classIndex; // didn't have before. Added since that's what is assumed everywhere. + double[][] Ehat; // empirical counts of all the linear features [feature][class] + double[][] Uhat; // empirical counts of all the output layer features [num of class][input layer size] + double[][] What; // empirical counts of all the input layer features [input layer size][featureIndex.size()] + int window; + int numClasses; + // hidden layer number of neuron = numHiddenUnits * numClasses + int numHiddenUnits; + int[] map; + int[][][][] data; // data[docIndex][tokenIndex][][] + int[][] docWindowLabels; + + int[][] labels; // labels[docIndex][tokenIndex] + int domainDimension = -1; + int inputLayerSize = -1; + int outputLayerSize = -1; + int inputLayerSize4Edge= -1; + int outputLayerSize4Edge = -1; + + int edgeParamCount = -1; + int numNodeFeatures = -1; + int numEdgeFeatures = -1; + int beforeOutputWeights = -1; + + // for debugging + int originalFeatureCount = -1; + + int[][] weightIndices; + + String crfType = "maxent"; + String backgroundSymbol; + + public static boolean VERBOSE = false; + + public static int getPriorType(String priorTypeStr) + { + if (priorTypeStr == null) return QUADRATIC_PRIOR; // default + if ("QUADRATIC".equalsIgnoreCase(priorTypeStr)) { + return QUADRATIC_PRIOR; + } else if ("HUBER".equalsIgnoreCase(priorTypeStr)) { + return HUBER_PRIOR; + } else if ("QUARTIC".equalsIgnoreCase(priorTypeStr)) { + return QUARTIC_PRIOR; + } else if ("NONE".equalsIgnoreCase(priorTypeStr)) { + return NO_PRIOR; + } else { + throw new IllegalArgumentException("Unknown prior type: " + priorTypeStr); + } + } + + CRFNonLinearSecondOrderLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, Index classIndex, List> labelIndices, int[] map, SeqClassifierFlags flags, int numNodeFeatures, int 
numEdgeFeatures) { + this(data, labels, window, classIndex, labelIndices, map, QUADRATIC_PRIOR, flags, numNodeFeatures, numEdgeFeatures); + } + + CRFNonLinearSecondOrderLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, Index classIndex, List> labelIndices, int[] map, int prior, SeqClassifierFlags flags, int numNodeFeatures, int numEdgeFeatures) { + this.window = window; + this.classIndex = classIndex; + this.numClasses = classIndex.size(); + this.labelIndices = labelIndices; + this.data = data; + this.flags = flags; + this.map = map; + this.labels = labels; + this.prior = prior; + this.backgroundSymbol = flags.backgroundSymbol; + this.sigma = flags.sigma; + this.outputLayerSize = numClasses; + this.outputLayerSize4Edge = numClasses * numClasses; + this.numHiddenUnits = flags.numHiddenUnits; + this.inputLayerSize = numHiddenUnits * numClasses; + this.inputLayerSize4Edge = numHiddenUnits * numClasses * numClasses; + this.numNodeFeatures = numNodeFeatures; + this.numEdgeFeatures = numEdgeFeatures; + this.useOutputLayer = flags.useOutputLayer; + this.useHiddenLayer = flags.useHiddenLayer; + this.useSigmoid = flags.useSigmoid; + this.docWindowLabels = new int[data.length][]; + if (!useOutputLayer) { + System.err.println("Output layer not activated, inputLayerSize must be equal to numClasses, setting it to " + numClasses); + this.inputLayerSize = numClasses; + this.inputLayerSize4Edge = numClasses * numClasses; + } else if (flags.softmaxOutputLayer && !(flags.sparseOutputLayer || flags.tieOutputLayer)) { + throw new RuntimeException("flags.softmaxOutputLayer == true, but neither flags.sparseOutputLayer or flags.tieOutputLayer is true"); + } + // empiricalCounts(); + } + + @Override + public int domainDimension() { + if (domainDimension < 0) { + originalFeatureCount = 0; + for (int i = 0; i < map.length; i++) { + int s = labelIndices.get(map[i]).size(); + originalFeatureCount += s; + } + domainDimension = 0; + domainDimension += 
inputLayerSize4Edge * numEdgeFeatures; + domainDimension += inputLayerSize * numNodeFeatures; + beforeOutputWeights = domainDimension; + if (useOutputLayer) { + if (flags.sparseOutputLayer) { + domainDimension += outputLayerSize4Edge * numHiddenUnits; + domainDimension += outputLayerSize * numHiddenUnits; + } else if (flags.tieOutputLayer) { + domainDimension += 1 * numHiddenUnits; + domainDimension += 1 * numHiddenUnits; + } else { + domainDimension += outputLayerSize4Edge * inputLayerSize4Edge; + domainDimension += outputLayerSize * inputLayerSize; + } + } + System.err.println("originalFeatureCount: "+originalFeatureCount); + System.err.println("beforeOutputWeights: "+beforeOutputWeights); + System.err.println("domainDimension: "+domainDimension); + } + return domainDimension; + } + + @Override + public double[] initial() { + double[] initial = new double[domainDimension()]; + // randomly initialize weights + if (useHiddenLayer || useOutputLayer) { + double epsilon = 0.1; + double twoEpsilon = epsilon * 2; + int count = 0; + double val = 0; + + if (flags.blockInitialize) { + int interval4Edge = numEdgeFeatures / numHiddenUnits; + for (int i = 0; i < numHiddenUnits; i++) { + int lower = i * interval4Edge; + int upper = (i + 1) * interval4Edge; + if (i == numHiddenUnits - 1) + upper = numEdgeFeatures; + for (int j = 0; j < outputLayerSize4Edge; j++) { + for (int k = 0; k < numEdgeFeatures; k++) { + val = 0; + if (k >= lower && k < upper) { + val = random.nextDouble() * twoEpsilon - epsilon; + } + initial[count++] = val; + } + } + } + + int interval = numNodeFeatures / numHiddenUnits; + for (int i = 0; i < numHiddenUnits; i++) { + int lower = i * interval; + int upper = (i + 1) * interval; + if (i == numHiddenUnits - 1) + upper = numNodeFeatures; + for (int j = 0; j < outputLayerSize; j++) { + for (int k = 0; k < numNodeFeatures; k++) { + val = 0; + if (k >= lower && k < upper) { + val = random.nextDouble() * twoEpsilon - epsilon; + } + initial[count++] = val; + } + 
} + } + if (count != beforeOutputWeights) { + throw new RuntimeException("after blockInitialize, param Index (" + count + ") not equal to beforeOutputWeights (" + beforeOutputWeights + ")"); + } + } else { + for (int i = 0; i < beforeOutputWeights; i++) { + val = random.nextDouble() * twoEpsilon - epsilon; + initial[count++] = val; + } + } + + if (flags.sparseOutputLayer) { + for (int i = 0; i < outputLayerSize4Edge; i++) { + double total = 1; + for (int j = 0; j < numHiddenUnits-1; j++) { + val = random.nextDouble() * total; + initial[count++] = val; + total -= val; + } + initial[count++] = total; + } + for (int i = 0; i < outputLayerSize; i++) { + double total = 1; + for (int j = 0; j < numHiddenUnits-1; j++) { + val = random.nextDouble() * total; + initial[count++] = val; + total -= val; + } + initial[count++] = total; + } + } else if (flags.tieOutputLayer) { + double total = 1; + double sum = 0; + for (int j = 0; j < numHiddenUnits-1; j++) { + val = random.nextDouble() * total; + initial[count++] = val; + total -= val; + } + initial[count++] = total; + total = 1; + sum = 0; + for (int j = 0; j < numHiddenUnits-1; j++) { + val = random.nextDouble() * total; + initial[count++] = val; + total -= val; + } + initial[count++] = total; + } else { + for (int i = beforeOutputWeights; i < domainDimension(); i++) { + val = random.nextDouble() * twoEpsilon - epsilon; + initial[count++] = val; + } + } + if (count != domainDimension()) { + throw new RuntimeException("after param initialization, param Index (" + count + ") not equal to domainDimension (" + domainDimension() + ")"); + } + } + return initial; + } + + private double[][] emptyU4Edge() { + int innerSize = inputLayerSize4Edge; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + innerSize = numHiddenUnits; + } + int outerSize = outputLayerSize4Edge; + if (flags.tieOutputLayer) { + outerSize = 1; + } + + double[][] temp = new double[outerSize][innerSize]; + for (int i = 0; i < outerSize; i++) { + temp[i] = new 
double[innerSize]; + } + return temp; + } + + private double[][] emptyW4Edge() { + double[][] temp = new double[inputLayerSize4Edge][numEdgeFeatures]; + for (int i = 0; i < inputLayerSize; i++) { + temp[i] = new double[numEdgeFeatures]; + } + return temp; + } + + private double[][] emptyU() { + int innerSize = inputLayerSize; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + innerSize = numHiddenUnits; + } + int outerSize = outputLayerSize; + if (flags.tieOutputLayer) { + outerSize = 1; + } + + double[][] temp = new double[outerSize][innerSize]; + for (int i = 0; i < outerSize; i++) { + temp[i] = new double[innerSize]; + } + return temp; + } + + private double[][] emptyW() { + double[][] temp = new double[inputLayerSize][numNodeFeatures]; + for (int i = 0; i < inputLayerSize; i++) { + temp[i] = new double[numNodeFeatures]; + } + return temp; + } + + public Quadruple separateWeights(double[] x) { + int index = 0; + double[][] inputLayerWeights4Edge = emptyW4Edge(); + for (int i = 0; i < inputLayerWeights4Edge.length; i++) { + for (int j = 0; j < inputLayerWeights4Edge[i].length; j++) { + inputLayerWeights4Edge[i][j] = x[index++]; + } + } + + double[][] inputLayerWeights = emptyW(); + for (int i = 0; i < inputLayerWeights.length; i++) { + for (int j = 0; j < inputLayerWeights[i].length; j++) { + inputLayerWeights[i][j] = x[index++]; + } + } + + double[][] outputLayerWeights4Edge = emptyU4Edge(); + for (int i = 0; i < outputLayerWeights4Edge.length; i++) { + for (int j = 0; j < outputLayerWeights4Edge[i].length; j++) { + if (useOutputLayer) + outputLayerWeights4Edge[i][j] = x[index++]; + else + outputLayerWeights4Edge[i][j] = 1; + } + } + + double[][] outputLayerWeights = emptyU(); + for (int i = 0; i < outputLayerWeights.length; i++) { + for (int j = 0; j < outputLayerWeights[i].length; j++) { + if (useOutputLayer) + outputLayerWeights[i][j] = x[index++]; + else + outputLayerWeights[i][j] = 1; + } + } + assert(index == x.length); + return new 
Quadruple(inputLayerWeights4Edge, outputLayerWeights4Edge, inputLayerWeights, outputLayerWeights); + } + + public CliquePotentialFunction getCliquePotentialFunction(double[] x) { + Quadruple allParams = separateWeights(x); + double[][] W4Edge = allParams.first(); // inputLayerWeights4Edge + double[][] U4Edge = allParams.second(); // outputLayerWeights4Edge + double[][] W = allParams.third(); // inputLayerWeights + double[][] U = allParams.fourth(); // outputLayerWeights + + return new NonLinearSecondOrderCliquePotentialFunction(W4Edge, U4Edge, W, U, flags); + } + + + // todo [cdm]: Below data[m] --> docData + /** + * Calculates both value and partial derivatives at the point x, and save them internally. + */ + @Override + public void calculate(double[] x) { + + double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point + Quadruple allParams = separateWeights(x); + double[][] W4Edge = allParams.first(); // inputLayerWeights4Edge + double[][] U4Edge = allParams.second(); // outputLayerWeights4Edge + double[][] W = allParams.third(); // inputLayerWeights + double[][] U = allParams.fourth(); // outputLayerWeights + + double[][] Y4Edge = null; + double[][] Y = null; + if (flags.softmaxOutputLayer) { + Y4Edge = new double[U4Edge.length][]; + for (int i = 0; i < U4Edge.length; i++) { + Y4Edge[i] = ArrayMath.softmax(U4Edge[i]); + } + Y = new double[U.length][]; + for (int i = 0; i < U.length; i++) { + Y[i] = ArrayMath.softmax(U[i]); + } + } + + double[][] What4Edge = emptyW4Edge(); + double[][] Uhat4Edge = emptyU4Edge(); + double[][] What = emptyW(); + double[][] Uhat = emptyU(); + + // the expectations over counts + // first index is feature index, second index is of possible labeling + double[][] eW4Edge = emptyW4Edge(); + double[][] eU4Edge = emptyU4Edge(); + double[][] eW = emptyW(); + double[][] eU = emptyU(); + + // iterate over all the documents + for (int m = 0; m < data.length; m++) { + int[][][] docData = 
data[m]; + int[] docLabels = labels[m]; + + // make a clique tree for this document + CRFCliqueTree cliqueTree = CRFCliqueTree.getCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, + backgroundSymbol, new NonLinearSecondOrderCliquePotentialFunction(W4Edge, U4Edge, W, U, flags), null); + + // compute the log probability of the document given the model with the parameters x + int[] given = new int[window - 1]; + Arrays.fill(given, classIndex.indexOf(backgroundSymbol)); + int[] windowLabels = new int[window]; + Arrays.fill(windowLabels, classIndex.indexOf(backgroundSymbol)); + + if (docLabels.length>docData.length) { // only true for self-training + // fill the given array with the extra docLabels + System.arraycopy(docLabels, 0, given, 0, given.length); + System.arraycopy(docLabels, 0, windowLabels, 0, windowLabels.length); + // shift the docLabels array left + int[] newDocLabels = new int[docData.length]; + System.arraycopy(docLabels, docLabels.length-newDocLabels.length, newDocLabels, 0, newDocLabels.length); + docLabels = newDocLabels; + } + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + int label = docLabels[i]; + double p = cliqueTree.condLogProbGivenPrevious(i, label, given); + if (VERBOSE) { + System.err.println("P(" + label + "|" + ArrayMath.toString(given) + ")=" + p); + } + prob += p; + System.arraycopy(given, 1, given, 0, given.length - 1); + given[given.length - 1] = label; + } + + // compute the expected counts for this document, which we will need to compute the derivative + // iterate over the positions in this document + for (int i = 0; i < docData.length; i++) { + // for each possible clique at this position + System.arraycopy(windowLabels, 1, windowLabels, 0, window - 1); + windowLabels[window - 1] = docLabels[i]; + for (int j = 0; j < docData[i].length; j++) { + Index labelIndex = labelIndices.get(j); + // for each possible labeling for that clique + int[] cliqueFeatures = 
docData[i][j]; + double[] As = null; + double[] fDeriv = null; + double[][] yTimesA = null; + double[] sumOfYTimesA = null; + + int inputSize, outputSize = -1; + if (j == 0) { + inputSize = inputLayerSize; + outputSize = outputLayerSize; + As = NonLinearCliquePotentialFunction.hiddenLayerOutput(W, cliqueFeatures, flags, null); + } else { + inputSize = inputLayerSize4Edge; + outputSize = outputLayerSize4Edge; + As = NonLinearCliquePotentialFunction.hiddenLayerOutput(W4Edge, cliqueFeatures, flags, null); + } + + fDeriv = new double[inputSize]; + double fD = 0; + for (int q = 0; q < inputSize; q++) { + if (useSigmoid) { + fD = As[q] * (1 - As[q]); + } else { + fD = 1 - As[q] * As[q]; + } + fDeriv[q] = fD; + } + + // calculating yTimesA for softmax + if (flags.softmaxOutputLayer) { + double val = 0; + + yTimesA = new double[outputSize][numHiddenUnits]; + for (int ii = 0; ii < outputSize; ii++) { + yTimesA[ii] = new double[numHiddenUnits]; + } + sumOfYTimesA = new double[outputSize]; + + for (int k = 0; k < outputSize; k++) { + double[] Yk = null; + if (flags.tieOutputLayer) { + if (j == 0) { + Yk = Y[0]; + } else { + Yk = Y4Edge[0]; + } + } else { + if (j == 0) { + Yk = Y[k]; + } else { + Yk = Y4Edge[k]; + } + } + double sum = 0; + for (int q = 0; q < inputSize; q++) { + if (q % outputSize == k) { + int hiddenUnitNo = q / outputSize; + val = As[q] * Yk[hiddenUnitNo]; + yTimesA[k][hiddenUnitNo] = val; + sum += val; + } + } + sumOfYTimesA[k] = sum; + } + } + + // calculating Uhat What + int[] cliqueLabel = new int[j + 1]; + System.arraycopy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1); + + CRFLabel crfLabel = new CRFLabel(cliqueLabel); + int givenLabelIndex = labelIndex.indexOf(crfLabel); + double[] Uk = null; + double[] UhatK = null; + double[] Yk = null; + double[] yTimesAK = null; + double sumOfYTimesAK = 0; + + if (flags.tieOutputLayer) { + if (j == 0) { + Uk = U[0]; + UhatK = Uhat[0]; + } else { + Uk = U4Edge[0]; + UhatK = Uhat4Edge[0]; + } + if 
(flags.softmaxOutputLayer) { + if (j == 0) { + Yk = Y[0]; + } else { + Yk = Y4Edge[0]; + } + } + } else { + if (j == 0) { + Uk = U[givenLabelIndex]; + UhatK = Uhat[givenLabelIndex]; + } else { + Uk = U4Edge[givenLabelIndex]; + UhatK = Uhat4Edge[givenLabelIndex]; + } + if (flags.softmaxOutputLayer) { + if (j == 0) { + Yk = Y[givenLabelIndex]; + } else { + Yk = Y4Edge[givenLabelIndex]; + } + } + } + + if (flags.softmaxOutputLayer) { + yTimesAK = yTimesA[givenLabelIndex]; + sumOfYTimesAK = sumOfYTimesA[givenLabelIndex]; + } + + for (int k = 0; k < inputSize; k++) { + double deltaK = 1; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (k % outputSize == givenLabelIndex) { + int hiddenUnitNo = k / outputSize; + if (flags.softmaxOutputLayer) { + UhatK[hiddenUnitNo] += (yTimesAK[hiddenUnitNo] - Yk[hiddenUnitNo] * sumOfYTimesAK); + deltaK *= Yk[hiddenUnitNo]; + } else { + UhatK[hiddenUnitNo] += As[k]; + deltaK *= Uk[hiddenUnitNo]; + } + } + } else { + UhatK[k] += As[k]; + if (useOutputLayer) { + deltaK *= Uk[k]; + } + } + if (useHiddenLayer) + deltaK *= fDeriv[k]; + if (useOutputLayer) { + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (k % outputSize == givenLabelIndex) { + double[] WhatK = null; + if (j == 0) { + WhatK = What[k]; + } else { + WhatK = What4Edge[k]; + } + for (int n = 0; n < cliqueFeatures.length; n++) { + WhatK[cliqueFeatures[n]] += deltaK; + } + } + } else { + double[] WhatK = null; + if (j == 0) { + WhatK = What[k]; + } else { + WhatK = What4Edge[k]; + } + for (int n = 0; n < cliqueFeatures.length; n++) { + WhatK[cliqueFeatures[n]] += deltaK; + } + } + } else { + if (k == givenLabelIndex) { + double[] WhatK = null; + if (j == 0) { + WhatK = What[k]; + } else { + WhatK = What4Edge[k]; + } + for (int n = 0; n < cliqueFeatures.length; n++) { + WhatK[cliqueFeatures[n]] += deltaK; + } + } + } + } + + for (int k = 0; k < labelIndex.size(); k++) { // labelIndex.size() == numClasses + int[] label = labelIndex.get(k).getLabel(); + 
double p = cliqueTree.prob(i, label); // probability of these labels occurring in this clique with these features + double[] Uk2 = null; + double[] eUK = null; + double[] Yk2 = null; + + if (flags.tieOutputLayer) { + if (j == 0) { // for node features + Uk2 = U[0]; + eUK = eU[0]; + } else { + Uk2 = U4Edge[0]; + eUK = eU4Edge[0]; + } + if (flags.softmaxOutputLayer) { + if (j == 0) { + Yk2 = Y[0]; + } else { + Yk2 = Y4Edge[0]; + } + } + } else { + if (j == 0) { + Uk2 = U[k]; + eUK = eU[k]; + } else { + Uk2 = U4Edge[k]; + eUK = eU4Edge[k]; + } + if (flags.softmaxOutputLayer) { + if (j == 0) { + Yk2 = Y[k]; + } else { + Yk2 = Y4Edge[k]; + } + } + } + if (useOutputLayer) { + for (int q = 0; q < inputSize; q++) { + double deltaQ = 1; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (q % outputSize == k) { + int hiddenUnitNo = q / outputSize; + if (flags.softmaxOutputLayer) { + eUK[hiddenUnitNo] += (yTimesA[k][hiddenUnitNo] - Yk2[hiddenUnitNo] * sumOfYTimesA[k]) * p; + deltaQ = Yk2[hiddenUnitNo]; + } else { + eUK[hiddenUnitNo] += As[q] * p; + deltaQ = Uk2[hiddenUnitNo]; + } + } + } else { + eUK[q] += As[q] * p; + deltaQ = Uk2[q]; + } + if (useHiddenLayer) + deltaQ *= fDeriv[q]; + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (q % outputSize == k) { + double[] eWq = null; + if (j == 0) { + eWq = eW[q]; + } else { + eWq = eW4Edge[q]; + } + for (int n = 0; n < cliqueFeatures.length; n++) { + eWq[cliqueFeatures[n]] += deltaQ * p; + } + } + } else { + double[] eWq = null; + if (j == 0) { + eWq = eW[q]; + } else { + eWq = eW4Edge[q]; + } + for (int n = 0; n < cliqueFeatures.length; n++) { + eWq[cliqueFeatures[n]] += deltaQ * p; + } + } + } + } else { + double deltaK = 1; + if (useHiddenLayer) + deltaK *= fDeriv[k]; + double[] eWK = null; + if (j == 0) { + eWK = eW[k]; + } else { + eWK = eW4Edge[k]; + } + for (int n = 0; n < cliqueFeatures.length; n++) { + eWK[cliqueFeatures[n]] += deltaK * p; + } + } + } + } + } + } + + if (Double.isNaN(prob)) { // 
shouldn't be the case + throw new RuntimeException("Got NaN for prob in CRFNonLinearSecondOrderLogConditionalObjectiveFunction.calculate()"); + } + + value = -prob; + if(VERBOSE){ + System.err.println("value is " + value); + } + + // compute the partial derivative for each feature by comparing expected counts to empirical counts + int index = 0; + for (int i = 0; i < eW4Edge.length; i++) { + for (int j = 0; j < eW4Edge[i].length; j++) { + derivative[index++] = (eW4Edge[i][j] - What4Edge[i][j]); + if (VERBOSE) { + System.err.println("inputLayerWeights4Edge deriv(" + i + "," + j + ") = " + eW4Edge[i][j] + " - " + What4Edge[i][j] + " = " + derivative[index - 1]); + } + } + } + + for (int i = 0; i < eW.length; i++) { + for (int j = 0; j < eW[i].length; j++) { + derivative[index++] = (eW[i][j] - What[i][j]); + if (VERBOSE) { + System.err.println("inputLayerWeights deriv(" + i + "," + j + ") = " + eW[i][j] + " - " + What[i][j] + " = " + derivative[index - 1]); + } + } + } + + if (index != beforeOutputWeights) + throw new RuntimeException("after W derivative, index("+index+") != beforeOutputWeights("+beforeOutputWeights+")"); + + if (useOutputLayer) { + for (int i = 0; i < eU4Edge.length; i++) { + for (int j = 0; j < eU4Edge[i].length; j++) { + derivative[index++] = (eU4Edge[i][j] - Uhat4Edge[i][j]); + if (VERBOSE) { + System.err.println("outputLayerWeights4Edge deriv(" + i + "," + j + ") = " + eU4Edge[i][j] + " - " + Uhat4Edge[i][j] + " = " + derivative[index - 1]); + } + } + } + for (int i = 0; i < eU.length; i++) { + for (int j = 0; j < eU[i].length; j++) { + derivative[index++] = (eU[i][j] - Uhat[i][j]); + if (VERBOSE) { + System.err.println("outputLayerWeights deriv(" + i + "," + j + ") = " + eU[i][j] + " - " + Uhat[i][j] + " = " + derivative[index - 1]); + } + } + } + } + + if (index != x.length) + throw new RuntimeException("after W derivative, index("+index+") != x.length("+x.length+")"); + + int regSize = x.length; + if (flags.skipOutputRegularization || 
flags.softmaxOutputLayer) { + regSize = beforeOutputWeights; + } + + // incorporate priors + if (prior == QUADRATIC_PRIOR) { + double sigmaSq = sigma * sigma; + for (int i = 0; i < regSize; i++) { + double k = 1.0; + double w = x[i]; + value += k * w * w / 2.0 / sigmaSq; + derivative[i] += k * w / sigmaSq; + } + } else if (prior == HUBER_PRIOR) { + double sigmaSq = sigma * sigma; + for (int i = 0; i < regSize; i++) { + double w = x[i]; + double wabs = Math.abs(w); + if (wabs < epsilon) { + value += w * w / 2.0 / epsilon / sigmaSq; + derivative[i] += w / epsilon / sigmaSq; + } else { + value += (wabs - epsilon / 2) / sigmaSq; + derivative[i] += ((w < 0.0) ? -1.0 : 1.0) / sigmaSq; + } + } + } else if (prior == QUARTIC_PRIOR) { + double sigmaQu = sigma * sigma * sigma * sigma; + for (int i = 0; i < regSize; i++) { + double k = 1.0; + double w = x[i]; + value += k * w * w * w * w / 2.0 / sigmaQu; + derivative[i] += k * w / sigmaQu; + } + } + } + + public double[][] emptyFull2D() { + double[][] d = new double[map.length][]; + // int index = 0; + for (int i = 0; i < map.length; i++) { + d[i] = new double[labelIndices.get(map[i]).size()]; + // cdm july 2005: below array initialization isn't necessary: JLS (3rd ed.) 
4.12.5 + // Arrays.fill(d[i], 0.0); + // index += labelIndices.get(map[i]).size(); + } + return d; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CliquePotentialFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CliquePotentialFunction.java new file mode 100644 index 0000000..7652da8 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/CliquePotentialFunction.java @@ -0,0 +1,19 @@ +package edu.stanford.nlp.ie.crf; + +/** + * @author Mengqiu Wang + */ + +public interface CliquePotentialFunction { + + /** + * @param cliqueSize 1 if node clique, 2 if edge clique, etc + * @param labelIndex the index of the output class label + * @param cliqueFeatures an int array containing the feature indices that are active in this clique + * @param featureVal a double array containing the feature values corresponding to feature indices in cliqueFeatures + * + * @return clique potential value + */ + public double computeCliquePotential(int cliqueSize, int labelIndex, int[] cliqueFeatures, double[] featureVal); + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/FactorTable.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/FactorTable.java new file mode 100644 index 0000000..4459462 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/FactorTable.java @@ -0,0 +1,599 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.math.SloppyMath; +import edu.stanford.nlp.util.Index; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Stores a factor table as a one dimensional array of doubles. + * This class supports a restricted form of factor table where each + * variable has the same set of values, but supports cliques of + * arbitrary size. 
+ * + * @author Jenny Finkel + */ +public class FactorTable { + + private final int numClasses; + private final int windowSize; + + private final double[] table; + + + public FactorTable(int numClasses, int windowSize) { + this.numClasses = numClasses; + this.windowSize = windowSize; + + table = new double[SloppyMath.intPow(numClasses, windowSize)]; + Arrays.fill(table, Double.NEGATIVE_INFINITY); + } + + public FactorTable(FactorTable t) { + numClasses = t.numClasses(); + windowSize = t.windowSize(); + table = new double[t.size()]; + System.arraycopy(t.table, 0, table, 0, t.size()); + } + + public boolean hasNaN() { + return ArrayMath.hasNaN(table); + } + + public String toProbString() { + StringBuilder sb = new StringBuilder("{\n"); + for (int i = 0; i < table.length; i++) { + sb.append(Arrays.toString(toArray(i))); + sb.append(": "); + sb.append(prob(toArray(i))); + sb.append("\n"); + } + sb.append("}"); + return sb.toString(); + } + + public String toNonLogString() { + StringBuilder sb = new StringBuilder("{\n"); + for (int i = 0; i < table.length; i++) { + sb.append(Arrays.toString(toArray(i))); + sb.append(": "); + sb.append(Math.exp(getValue(i))); + sb.append("\n"); + } + sb.append("}"); + return sb.toString(); + } + + public String toString(Index classIndex) { + StringBuilder sb = new StringBuilder("{\n"); + for (int i = 0; i < table.length; i++) { + sb.append(toString(toArray(i), classIndex)); + sb.append(": "); + sb.append(getValue(i)); + sb.append("\n"); + } + sb.append("}"); + return sb.toString(); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("{\n"); + for (int i = 0; i < table.length; i++) { + sb.append(Arrays.toString(toArray(i))); + sb.append(": "); + sb.append(getValue(i)); + sb.append("\n"); + } + sb.append("}"); + return sb.toString(); + } + + private static String toString(int[] array, Index classIndex) { + List l = new ArrayList(array.length); + for (int i = 0; i < array.length; i++) { + 
l.add(classIndex.get(array[i])); + } + return l.toString(); + } + + public int[] toArray(int index) { + int[] indices = new int[windowSize]; + for (int i = indices.length - 1; i >= 0; i--) { + indices[i] = index % numClasses; + index /= numClasses; + } + return indices; + } + + private int indexOf(int[] entry) { + int index = 0; + for (int i = 0; i < entry.length; i++) { + index *= numClasses; + index += entry[i]; + } + // if (index < 0) throw new RuntimeException("index=" + index + " entry=" + Arrays.toString(entry)); // only if overflow + return index; + } + + private int indexOf(int[] front, int end) { + int index = 0; + for (int i = 0; i < front.length; i++) { + index *= numClasses; + index += front[i]; + } + index *= numClasses; + index += end; + return index; + } + + private int indexOf(int front, int[] end) { + int index = front; + for (int i = 0; i < end.length; i++) { + index *= numClasses; + index += end[i]; + } + return index; + } + + private int[] indicesEnd(int[] entries) { + int index = 0; + for (int i = 0; i < entries.length; i++) { + index *= numClasses; + index += entries[i]; + } + int[] indices = new int[SloppyMath.intPow(numClasses, windowSize - entries.length)]; + final int offset = SloppyMath.intPow(numClasses, entries.length); + for (int i = 0; i < indices.length; i++) { + indices[i] = index; + index += offset; + } + // System.err.println("indicesEnd returning: " + Arrays.toString(indices)); + return indices; + } + + + /** This now returns the first index of the requested entries. + * The run of numClasses ^ (windowSize - entries.length) + * successive entries will give all of them. 
+ * + * @param entries The class indices of size windowsSize + * @return First index of requested entries + */ + private int indicesFront(int[] entries) { + int start = 0; + for (int entry : entries) { + start *= numClasses; + start += entry; + } + int offset = SloppyMath.intPow(numClasses, windowSize - entries.length); + return start * offset; + } + + public int windowSize() { + return windowSize; + } + + public int numClasses() { + return numClasses; + } + + public int size() { + return table.length; + } + + public double totalMass() { + return ArrayMath.logSum(table); + } + + /** Returns a single clique potential. */ + public double unnormalizedLogProb(int[] label) { + return getValue(label); + } + + public double logProb(int[] label) { + return unnormalizedLogProb(label) - totalMass(); + } + + public double prob(int[] label) { + return Math.exp(unnormalizedLogProb(label) - totalMass()); + } + + /** + * Computes the probability of the tag OF being at the end of the table given + * that the previous tag sequence in table is GIVEN. given is at the beginning, + * of is at the end. + * + * @return the probability of the tag OF being at the end of the table + */ + public double conditionalLogProbGivenPrevious(int[] given, int of) { + if (given.length != windowSize - 1) { + throw new IllegalArgumentException("conditionalLogProbGivenPrevious requires given one less than clique size (" + + windowSize + ") but was " + Arrays.toString(given)); + } + // Note: other similar methods could be optimized like this one, but this is the one the CRF uses.... 
+ /* + int startIndex = indicesFront(given); + int numCellsToSum = SloppyMath.intPow(numClasses, windowSize - given.length); + double z = ArrayMath.logSum(table, startIndex, startIndex + numCellsToSum); + int i = indexOf(given, of); + System.err.printf("startIndex is %d, numCellsToSum is %d, i is %d (of is %d)%n", startIndex, numCellsToSum, i, of); + */ + int startIndex = indicesFront(given); + double z = ArrayMath.logSum(table, startIndex, startIndex + numClasses); + int i = startIndex + of; + // System.err.printf("startIndex is %d, numCellsToSum is %d, i is %d (of is %d)%n", startIndex, numClasses, i, of); + + return table[i] - z; + } + +// public double conditionalLogProbGivenPreviousForPartial(int[] given, int of) { +// if (given.length != windowSize - 1) { +// System.err.println("error computing conditional log prob"); +// System.exit(0); +// } +// // int[] label = indicesFront(given); +// // double[] masses = new double[label.length]; +// // for (int i = 0; i < masses.length; i++) { +// // masses[i] = table[label[i]]; +// // } +// // double z = ArrayMath.logSum(masses); +// +// int i = indexOf(given, of); +// // if (SloppyMath.isDangerous(z) || SloppyMath.isDangerous(table[i])) { +// // System.err.println("z="+z); +// // System.err.println("t="+table[i]); +// // } +// +// return table[i]; +// } + + /** + * Computes the probabilities of the tag at the end of the table given that + * the previous tag sequence in table is GIVEN. 
given is at the beginning, + * position in question is at the end + * + * @return the probabilities of the tag at the end of the table + */ + public double[] conditionalLogProbsGivenPrevious(int[] given) { + if (given.length != windowSize - 1) { + throw new IllegalArgumentException("conditionalLogProbsGivenPrevious requires given one less than clique size (" + + windowSize + ") but was " + Arrays.toString(given)); + } + double[] result = new double[numClasses]; + for (int i = 0; i < numClasses; i++) { + int index = indexOf(given, i); + result[i] = table[index]; + } + ArrayMath.logNormalize(result); + return result; + } + + /** + * Computes the probability of the sequence OF being at the end of the table + * given that the first tag in table is GIVEN. given is at the beginning, of is + * at the end + * + * @return the probability of the sequence of being at the end of the table + */ + public double conditionalLogProbGivenFirst(int given, int[] of) { + if (of.length != windowSize - 1) { + throw new IllegalArgumentException("conditionalLogProbGivenFirst requires of one less than clique size (" + + windowSize + ") but was " + Arrays.toString(of)); + } + // compute P(given, of) + int[] labels = new int[windowSize]; + labels[0] = given; + System.arraycopy(of, 0, labels, 1, windowSize - 1); + // double probAll = logProb(labels); + double probAll = unnormalizedLogProb(labels); + + // compute P(given) + // double probGiven = logProbFront(given); + double probGiven = unnormalizedLogProbFront(given); + + // compute P(given, of) / P(given) + return probAll - probGiven; + } + + /** + * Computes the probability of the sequence OF being at the end of the table + * given that the first tag in table is GIVEN. given is at the beginning, of is + * at the end. 
+ * + * @return the probability of the sequence of being at the end of the table + */ + public double unnormalizedConditionalLogProbGivenFirst(int given, int[] of) { + if (of.length != windowSize - 1) { + throw new IllegalArgumentException("unnormalizedConditionalLogProbGivenFirst requires of one less than clique size (" + + windowSize + ") but was " + Arrays.toString(of)); + } + // compute P(given, of) + int[] labels = new int[windowSize]; + labels[0] = given; + System.arraycopy(of, 0, labels, 1, windowSize - 1); + // double probAll = logProb(labels); + double probAll = unnormalizedLogProb(labels); + + // compute P(given) + // double probGiven = logProbFront(given); + // double probGiven = unnormalizedLogProbFront(given); + + // compute P(given, of) / P(given) + // return probAll - probGiven; + return probAll; + } + + /** + * Computes the probability of the tag OF being at the beginning of the table + * given that the tag sequence GIVEN is at the end of the table. given is at + * the end, of is at the beginning + * + * @return the probability of the tag of being at the beginning of the table + */ + public double conditionalLogProbGivenNext(int[] given, int of) { + if (given.length != windowSize - 1) { + throw new IllegalArgumentException("conditionalLogProbGivenNext requires given one less than clique size (" + + windowSize + ") but was " + Arrays.toString(given)); + } + int[] label = indicesEnd(given); + double[] masses = new double[label.length]; + for (int i = 0; i < masses.length; i++) { + masses[i] = table[label[i]]; + } + double z = ArrayMath.logSum(masses); + + return table[indexOf(of, given)] - z; + } + + public double unnormalizedLogProbFront(int[] labels) { + int startIndex = indicesFront(labels); + int numCellsToSum = SloppyMath.intPow(numClasses, windowSize - labels.length); + // double[] masses = new double[labels.length]; + // for (int i = 0; i < masses.length; i++) { + // masses[i] = table[labels[i]]; + // } + return ArrayMath.logSum(table, 
startIndex, startIndex + numCellsToSum); + } + + public double logProbFront(int[] label) { + return unnormalizedLogProbFront(label) - totalMass(); + } + + public double unnormalizedLogProbFront(int label) { + int[] labels = { label }; + return unnormalizedLogProbFront(labels); + } + + public double logProbFront(int label) { + return unnormalizedLogProbFront(label) - totalMass(); + } + + public double unnormalizedLogProbEnd(int[] labels) { + labels = indicesEnd(labels); + double[] masses = new double[labels.length]; + for (int i = 0; i < masses.length; i++) { + masses[i] = table[labels[i]]; + } + return ArrayMath.logSum(masses); + } + + public double logProbEnd(int[] labels) { + return unnormalizedLogProbEnd(labels) - totalMass(); + } + + public double unnormalizedLogProbEnd(int label) { + int[] labels = { label }; + return unnormalizedLogProbEnd(labels); + } + + public double logProbEnd(int label) { + return unnormalizedLogProbEnd(label) - totalMass(); + } + + public double getValue(int index) { + return table[index]; + } + + public double getValue(int[] label) { + return table[indexOf(label)]; + } + + public void setValue(int index, double value) { + table[index] = value; + } + + public void setValue(int[] label, double value) { + // try{ + table[indexOf(label)] = value; + // } catch (Exception e) { + // e.printStackTrace(); + // System.err.println("Table length: " + table.length + " indexOf(label): " + // + indexOf(label)); + // throw new ArrayIndexOutOfBoundsException(e.toString()); + // // System.exit(1); + // } + } + + public void incrementValue(int[] label, double value) { + incrementValue(indexOf(label), value); + } + + public void incrementValue(int index, double value) { + table[index] += value; + } + + void logIncrementValue(int index, double value) { + table[index] = SloppyMath.logAdd(table[index], value); + } + + public void logIncrementValue(int[] label, double value) { + logIncrementValue(indexOf(label), value); + } + + public void 
multiplyInFront(FactorTable other) { + int divisor = SloppyMath.intPow(numClasses, windowSize - other.windowSize()); + for (int i = 0; i < table.length; i++) { + table[i] += other.getValue(i / divisor); + } + } + + public void multiplyInEnd(FactorTable other) { + int divisor = SloppyMath.intPow(numClasses, other.windowSize()); + for (int i = 0; i < table.length; i++) { + table[i] += other.getValue(i % divisor); + } + } + + public FactorTable sumOutEnd() { + FactorTable ft = new FactorTable(numClasses, windowSize - 1); + for (int i = 0, sz = ft.size(); i < sz; i++) { + ft.table[i] = ArrayMath.logSum(table, i * numClasses, (i+1) * numClasses); + } + /* + for (int i = 0; i < table.length; i++) { + ft.logIncrementValue(i / numClasses, table[i]); + } + */ + return ft; + } + + public FactorTable sumOutFront() { + FactorTable ft = new FactorTable(numClasses, windowSize - 1); + int stride = ft.size(); + for (int i = 0; i < stride; i++) { + ft.setValue(i, ArrayMath.logSum(table, i, table.length, stride)); + } + return ft; + } + + public void divideBy(FactorTable other) { + for (int i = 0; i < table.length; i++) { + if (table[i] != Double.NEGATIVE_INFINITY || other.table[i] != Double.NEGATIVE_INFINITY) { + table[i] -= other.table[i]; + } + } + } + + + public static void main(String[] args) { + int numClasses = 6; + final int cliqueSize = 3; + System.err.printf("Creating factor table with %d classes and window (clique) size %d%n", numClasses, cliqueSize); + FactorTable ft = new FactorTable(numClasses, cliqueSize); + + /** + * for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { for (int k = + * 0; k < 2; k++) { int[] a = new int[]{i, j, k}; + * System.out.print(ft.toString(a)+": "+ft.indexOf(a)); } } } for (int i = + * 0; i < 2; i++) { int[] b = new int[]{i}; + * System.out.print(ft.toString(b)+": "+ft.toString(ft.indicesFront(b))); } + * for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { int[] b = new + * int[]{i, j}; + * System.out.print(ft.toString(b)+": 
"+ft.toString(ft.indicesFront(b))); } + * } for (int i = 0; i < 2; i++) { int[] b = new int[]{i}; + * System.out.print(ft.toString(b)+": "+ft.toString(ft.indicesBack(b))); } + * for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { int[] b = new + * int[]{i, j}; ft2.setValue(b, (i*2)+j); } } for (int i = 0; i < 2; i++) { + * for (int j = 0; j < 2; j++) { int[] b = new int[]{i, j}; + * System.out.print(ft.toString(b)+": "+ft.toString(ft.indicesBack(b))); } } + * + * System.out.println("##########################################"); + **/ + + for (int i = 0; i < numClasses; i++) { + for (int j = 0; j < numClasses; j++) { + for (int k = 0; k < numClasses; k++) { + int[] b = { i, j, k }; + ft.setValue(b, (i * 4) + (j * 2) + k); + } + } + } + + System.err.println(ft); + double normalization = 0.0; + for (int i = 0; i < numClasses; i++) { + for (int j = 0; j < numClasses; j++) { + for (int k = 0; k < numClasses; k++) { + normalization += ft.unnormalizedLogProb(new int[] {i, j, k}); + } + } + } + System.err.println("Normalization Z = " + normalization); + + System.err.println(ft.sumOutFront()); + + FactorTable ft2 = new FactorTable(numClasses, 2); + for (int i = 0; i < numClasses; i++) { + for (int j = 0; j < numClasses; j++) { + int[] b = { i, j }; + ft2.setValue(b, i * numClasses + j); + } + } + + System.err.println(ft2); + // FactorTable ft3 = ft2.sumOutFront(); + // System.err.println(ft3); + + for (int i = 0; i < numClasses; i++) { + for (int j = 0; j < numClasses; j++) { + int[] b = { i, j }; + double t = 0; + for (int k = 0; k < numClasses; k++) { + t += Math.exp(ft.conditionalLogProbGivenPrevious(b, k)); + System.err + .println(k + "|" + i + "," + j + " : " + Math.exp(ft.conditionalLogProbGivenPrevious(b, k))); + } + System.err.println(t); + } + } + + System.err.println("conditionalLogProbGivenFirst"); + for (int j = 0; j < numClasses; j++) { + for (int k = 0; k < numClasses; k++) { + int[] b = { j, k }; + double t = 0.0; + for (int i = 0; i < numClasses; 
i++) { + t += ft.unnormalizedConditionalLogProbGivenFirst(i, b); + System.err + .println(i + "|" + j + "," + k + " : " + ft.unnormalizedConditionalLogProbGivenFirst(i, b)); + } + System.err.println(t); + } + } + + System.err.println("conditionalLogProbGivenFirst"); + for (int i = 0; i < numClasses; i++) { + for (int j = 0; j < numClasses; j++) { + int[] b = { i, j }; + double t = 0.0; + for (int k = 0; k < numClasses; k++) { + t += ft.conditionalLogProbGivenNext(b, k); + System.err + .println(i + "," + j + "|" + k + " : " + ft.conditionalLogProbGivenNext(b, k)); + } + System.err.println(t); + } + } + + numClasses = 2; + FactorTable ft3 = new FactorTable(numClasses, cliqueSize); + ft3.setValue(new int[] {0, 0, 0}, Math.log(0.25)); + ft3.setValue(new int[] {0, 0, 1}, Math.log(0.35)); + ft3.setValue(new int[] {0, 1, 0}, Math.log(0.05)); + ft3.setValue(new int[] {0, 1, 1}, Math.log(0.07)); + ft3.setValue(new int[] {1, 0, 0}, Math.log(0.08)); + ft3.setValue(new int[] {1, 0, 1}, Math.log(0.16)); + ft3.setValue(new int[] {1, 1, 0}, Math.log(1e-50)); + ft3.setValue(new int[] {1, 1, 1}, Math.log(1e-50)); + + FactorTable ft4 = ft3.sumOutFront(); + System.err.println(ft4.toNonLogString()); + FactorTable ft5 = ft3.sumOutEnd(); + System.err.println(ft5.toNonLogString()); + } // end main + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/FloatFactorTable.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/FloatFactorTable.java new file mode 100644 index 0000000..13790be --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/FloatFactorTable.java @@ -0,0 +1,380 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.math.SloppyMath; +import edu.stanford.nlp.util.Index; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +/** Stores a factor table as a one dimensional array of floats. 
+ * + * @author Jenny Finkel + */ + +public class FloatFactorTable { + + private final int numClasses; + private final int windowSize; + + private final float[] table; + + public FloatFactorTable(int numClasses, int windowSize) { + this.numClasses = numClasses; + this.windowSize = windowSize; + + table = new float[SloppyMath.intPow(numClasses, windowSize)]; + Arrays.fill(table, Float.NEGATIVE_INFINITY); + } + + public boolean hasNaN() { + return ArrayMath.hasNaN(table); + } + + public String toProbString() { + StringBuilder sb = new StringBuilder("{\n"); + for (int i = 0; i < table.length; i++) { + sb.append(Arrays.toString(toArray(i))); + sb.append(": "); + sb.append(prob(toArray(i))); + sb.append("\n"); + } + sb.append("}"); + return sb.toString(); + } + + public String toString(Index classIndex) { + StringBuilder sb = new StringBuilder("{\n"); + for (int i = 0; i < table.length; i++) { + sb.append(toString(toArray(i), classIndex)); + sb.append(": "); + sb.append(getValue(i)); + sb.append("\n"); + } + sb.append("}"); + return sb.toString(); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("{\n"); + for (int i = 0; i < table.length; i++) { + sb.append(Arrays.toString(toArray(i))); + sb.append(": "); + sb.append(getValue(i)); + sb.append("\n"); + } + sb.append("}"); + return sb.toString(); + } + + private String toString(int[] array, Index classIndex) { + List l = new ArrayList(); + for (int i = 0; i < array.length; i++) { + l.add(classIndex.get(array[i])); + } + return l.toString(); + } + + private int[] toArray(int index) { + int[] indices = new int[windowSize]; + for (int i = indices.length - 1; i >= 0; i--) { + indices[i] = index % numClasses; + index /= numClasses; + } + return indices; + } + + private int indexOf(int[] entry) { + int index = 0; + for (int i = 0; i < entry.length; i++) { + index *= numClasses; + index += entry[i]; + } + return index; + } + + private int indexOf(int[] front, int end) { + int index = 0; + for 
(int i = 0; i < front.length; i++) { + index *= numClasses; + index += front[i]; + } + index *= numClasses; + index += end; + return index; + } + + private int[] indicesEnd(int[] entries) { + int[] indices = new int[SloppyMath.intPow(numClasses, windowSize - entries.length)]; + int offset = SloppyMath.intPow(numClasses, entries.length); + int index = 0; + for (int i = 0; i < entries.length; i++) { + index *= numClasses; + index += entries[i]; + } + for (int i = 0; i < indices.length; i++) { + indices[i] = index; + index += offset; + } + return indices; + } + + private int[] indicesFront(int[] entries) { + int[] indices = new int[SloppyMath.intPow(numClasses, windowSize - entries.length)]; + int offset = SloppyMath.intPow(numClasses, windowSize - entries.length); + int start = 0; + for (int i = 0; i < entries.length; i++) { + start *= numClasses; + start += entries[i]; + } + start *= offset; + int end = 0; + for (int i = 0; i < entries.length; i++) { + end *= numClasses; + end += entries[i]; + if (i == entries.length - 1) { + end += 1; + } + } + end *= offset; + for (int i = start; i < end; i++) { + indices[i - start] = i; + } + return indices; + } + + public int windowSize() { + return windowSize; + } + + public int numClasses() { + return numClasses; + } + + private int size() { + return table.length; + } + + public float totalMass() { + return ArrayMath.logSum(table); + } + + public float unnormalizedLogProb(int[] label) { + return getValue(label); + } + + public float logProb(int[] label) { + return unnormalizedLogProb(label) - totalMass(); + } + + + public float prob(int[] label) { + return (float) Math.exp(unnormalizedLogProb(label) - totalMass()); + } + + // given is at the begining, of is at the end + public float conditionalLogProb(int[] given, int of) { + if (given.length != windowSize - 1) { + System.err.println("error computing conditional log prob"); + System.exit(0); + } + int[] label = indicesFront(given); + float[] masses = new float[label.length]; + 
for (int i = 0; i < masses.length; i++) { + masses[i] = table[label[i]]; + } + float z = ArrayMath.logSum(masses); + + return table[indexOf(given, of)] - z; + } + + public float unnormalizedLogProbFront(int[] label) { + label = indicesFront(label); + float[] masses = new float[label.length]; + for (int i = 0; i < masses.length; i++) { + masses[i] = table[label[i]]; + } + return ArrayMath.logSum(masses); + } + + public float logProbFront(int[] label) { + return unnormalizedLogProbFront(label) - totalMass(); + } + + public float unnormalizedLogProbEnd(int[] label) { + label = indicesEnd(label); + float[] masses = new float[label.length]; + for (int i = 0; i < masses.length; i++) { + masses[i] = table[label[i]]; + } + return ArrayMath.logSum(masses); + } + + public float logProbEnd(int[] label) { + return unnormalizedLogProbEnd(label) - totalMass(); + } + + public float unnormalizedLogProbEnd(int label) { + int[] l = {label}; + l = indicesEnd(l); + float[] masses = new float[l.length]; + for (int i = 0; i < masses.length; i++) { + masses[i] = table[l[i]]; + } + return ArrayMath.logSum(masses); + } + + public float logProbEnd(int label) { + return unnormalizedLogProbEnd(label) - totalMass(); + } + + private float getValue(int index) { + return table[index]; + } + + public float getValue(int[] label) { + return table[indexOf(label)]; + } + + private void setValue(int index, float value) { + table[index] = value; + } + + public void setValue(int[] label, float value) { + table[indexOf(label)] = value; + } + + public void incrementValue(int[] label, float value) { + table[indexOf(label)] += value; + } + + private void logIncrementValue(int index, float value) { + table[index] = SloppyMath.logAdd(table[index], value); + } + + public void logIncrementValue(int[] label, float value) { + int index = indexOf(label); + table[index] = SloppyMath.logAdd(table[index], value); + } + + public void multiplyInFront(FloatFactorTable other) { + int divisor = 
SloppyMath.intPow(numClasses, windowSize - other.windowSize()); + for (int i = 0; i < table.length; i++) { + table[i] += other.getValue(i / divisor); + } + } + + public void multiplyInEnd(FloatFactorTable other) { + int divisor = SloppyMath.intPow(numClasses, other.windowSize()); + for (int i = 0; i < table.length; i++) { + table[i] += other.getValue(i % divisor); + } + } + + public FloatFactorTable sumOutEnd() { + FloatFactorTable ft = new FloatFactorTable(numClasses, windowSize - 1); + for (int i = 0; i < table.length; i++) { + ft.logIncrementValue(i / numClasses, table[i]); + } + return ft; + } + + public FloatFactorTable sumOutFront() { + FloatFactorTable ft = new FloatFactorTable(numClasses, windowSize - 1); + int mod = SloppyMath.intPow(numClasses, windowSize - 1); + for (int i = 0; i < table.length; i++) { + ft.logIncrementValue(i % mod, table[i]); + } + return ft; + } + + public void divideBy(FloatFactorTable other) { + for (int i = 0; i < table.length; i++) { + if (table[i] != Float.NEGATIVE_INFINITY || other.table[i] != Float.NEGATIVE_INFINITY) { + table[i] -= other.table[i]; + } + } + } + + public static void main(String[] args) { + FloatFactorTable ft = new FloatFactorTable(6, 3); + + /** + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + for (int k = 0; k < 2; k++) { + int[] a = new int[]{i, j, k}; + System.out.print(ft.toString(a)+": "+ft.indexOf(a)); + } + } + } + for (int i = 0; i < 2; i++) { + int[] b = new int[]{i}; + System.out.print(ft.toString(b)+": "+ft.toString(ft.indicesFront(b))); + } + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + int[] b = new int[]{i, j}; + System.out.print(ft.toString(b)+": "+ft.toString(ft.indicesFront(b))); + } + } + for (int i = 0; i < 2; i++) { + int[] b = new int[]{i}; + System.out.print(ft.toString(b)+": "+ft.toString(ft.indicesBack(b))); + } for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + int[] b = new int[]{i, j}; + ft2.setValue(b, (i*2)+j); + } + } + for (int i = 
0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + int[] b = new int[]{i, j}; + System.out.print(ft.toString(b)+": "+ft.toString(ft.indicesBack(b))); + } + } + + System.out.println("##########################################"); + + **/ + + for (int i = 0; i < 6; i++) { + for (int j = 0; j < 6; j++) { + for (int k = 0; k < 6; k++) { + int[] b = new int[]{i, j, k}; + ft.setValue(b, (i * 4) + (j * 2) + k); + } + } + } + + //System.out.println(ft); + //System.out.println(ft.sumOutFront()); + + FloatFactorTable ft2 = new FloatFactorTable(6, 2); + for (int i = 0; i < 6; i++) { + for (int j = 0; j < 6; j++) { + int[] b = new int[]{i, j}; + ft2.setValue(b, i * 6 + j); + } + } + + System.out.println(ft); + //FloatFactorTable ft3 = ft2.sumOutFront(); + //System.out.println(ft3); + + for (int i = 0; i < 6; i++) { + for (int j = 0; j < 6; j++) { + int[] b = new int[]{i, j}; + float t = 0; + for (int k = 0; k < 6; k++) { + t += Math.exp(ft.conditionalLogProb(b, k)); + System.err.println(k + "|" + i + "," + j + " : " + Math.exp(ft.conditionalLogProb(b, k))); + } + System.out.println(t); + } + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/HasCliquePotentialFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/HasCliquePotentialFunction.java new file mode 100644 index 0000000..c01f04e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/HasCliquePotentialFunction.java @@ -0,0 +1,10 @@ +package edu.stanford.nlp.ie.crf; + +/** + * Indicates that this function can build a clique potential function for external use + * + * @author Mengqiu Wang + */ +public interface HasCliquePotentialFunction { + public CliquePotentialFunction getCliquePotentialFunction(double[] x); +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/LinearCliquePotentialFunction.java 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/LinearCliquePotentialFunction.java new file mode 100644 index 0000000..ed37f38 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/LinearCliquePotentialFunction.java @@ -0,0 +1,27 @@ +package edu.stanford.nlp.ie.crf; + +/** + * @author Mengqiu Wang + */ + +public class LinearCliquePotentialFunction implements CliquePotentialFunction { + + double[][] weights; + + LinearCliquePotentialFunction(double[][] weights) { + this.weights = weights; + } + + @Override + public double computeCliquePotential(int cliqueSize, int labelIndex, int[] cliqueFeatures, double[] featureVal) { + double output = 0.0; + double dotProd = 0; + for (int m = 0; m < cliqueFeatures.length; m++) { + dotProd = weights[cliqueFeatures[m]][labelIndex]; + if (featureVal != null) + dotProd *= featureVal[m]; + output += dotProd; + } + return output; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/NonLinearCliquePotentialFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/NonLinearCliquePotentialFunction.java new file mode 100644 index 0000000..da14577 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/NonLinearCliquePotentialFunction.java @@ -0,0 +1,97 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.util.Index; + +/** + * @author Mengqiu Wang + */ + +public class NonLinearCliquePotentialFunction implements CliquePotentialFunction { + + double[][] linearWeights; + double[][] inputLayerWeights; // first index is number of hidden units in layer one, second index is the input feature indices + double[][] outputLayerWeights; // first index is the output class, second index is the number of hidden units + SeqClassifierFlags flags; + + private 
static double sigmoid(double x) { + return 1 / (1 + Math.exp(-x)); + } + + public NonLinearCliquePotentialFunction(double[][] linearWeights, double[][] inputLayerWeights, double[][] outputLayerWeights, SeqClassifierFlags flags) { + this.linearWeights = linearWeights; + this.inputLayerWeights = inputLayerWeights; + this.outputLayerWeights = outputLayerWeights; + this.flags = flags; + } + + public static double[] hiddenLayerOutput(double[][] inputLayerWeights, int[] nodeCliqueFeatures, SeqClassifierFlags aFlag, double[] featureVal) { + int layerOneSize = inputLayerWeights.length; + double[] layerOne = new double[layerOneSize]; + for (int i = 0; i < layerOneSize; i++) { + double[] ws = inputLayerWeights[i]; + double lOneW = 0; + double dotProd = 0; + for (int m = 0; m < nodeCliqueFeatures.length; m++) { + dotProd = ws[nodeCliqueFeatures[m]]; + if (featureVal != null) + dotProd *= featureVal[m]; + lOneW += dotProd; + } + layerOne[i] = lOneW; + } + // transform layer one through hidden + double[] hiddenLayer = new double[layerOneSize]; + for (int i = 0; i < layerOneSize; i++) { + if (aFlag.useHiddenLayer) { + if (aFlag.useSigmoid) { + hiddenLayer[i] = sigmoid(layerOne[i]); + } else { + hiddenLayer[i] = Math.tanh(layerOne[i]); + } + } else { + hiddenLayer[i] = layerOne[i]; + } + } + return hiddenLayer; + } + + @Override + public double computeCliquePotential(int cliqueSize, int labelIndex, int[] cliqueFeatures, double[] featureVal) { + double output = 0.0; + if (cliqueSize > 1) { // linear potential for edge cliques + for (int m = 0; m < cliqueFeatures.length; m++) { + output += linearWeights[cliqueFeatures[m]][labelIndex]; + } + } else { // non-linear potential for node cliques + double[] hiddenLayer = hiddenLayerOutput(inputLayerWeights, cliqueFeatures, flags, featureVal); + int outputLayerSize = inputLayerWeights.length / outputLayerWeights[0].length; + + // transform the hidden layer to output layer through linear transformation + if (flags.useOutputLayer) { + 
double[] outputWs = null; + if (flags.tieOutputLayer) { + outputWs = outputLayerWeights[0]; + } else { + outputWs = outputLayerWeights[labelIndex]; + } + if (flags.softmaxOutputLayer) { + outputWs = ArrayMath.softmax(outputWs); + } + for (int i = 0; i < inputLayerWeights.length; i++) { + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (i % outputLayerSize == labelIndex) { + output += outputWs[ i / outputLayerSize ] * hiddenLayer[i]; + } + } else { + output += outputWs[i] * hiddenLayer[i]; + } + } + } else { + output = hiddenLayer[labelIndex]; + } + } + return output; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/NonLinearSecondOrderCliquePotentialFunction.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/NonLinearSecondOrderCliquePotentialFunction.java new file mode 100644 index 0000000..75d5a16 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/crf/NonLinearSecondOrderCliquePotentialFunction.java @@ -0,0 +1,68 @@ +package edu.stanford.nlp.ie.crf; + +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.util.Index; + +/** + * @author Mengqiu Wang + */ + +public class NonLinearSecondOrderCliquePotentialFunction implements CliquePotentialFunction { + + double[][] inputLayerWeights4Edge; // first index is number of hidden units in layer one, second index is the input feature indices + double[][] outputLayerWeights4Edge; // first index is the output class, second index is the number of hidden units + double[][] inputLayerWeights; // first index is number of hidden units in layer one, second index is the input feature indices + double[][] outputLayerWeights; // first index is the output class, second index is the number of hidden units + SeqClassifierFlags flags; + + public NonLinearSecondOrderCliquePotentialFunction(double[][] inputLayerWeights4Edge, double[][] 
outputLayerWeights4Edge, double[][] inputLayerWeights, double[][] outputLayerWeights, SeqClassifierFlags flags) { + this.inputLayerWeights4Edge = inputLayerWeights4Edge; + this.outputLayerWeights4Edge = outputLayerWeights4Edge; + this.inputLayerWeights = inputLayerWeights; + this.outputLayerWeights = outputLayerWeights; + this.flags = flags; + } + + @Override + public double computeCliquePotential(int cliqueSize, int labelIndex, int[] cliqueFeatures, double[] featureVal) { + double output = 0.0; + double[][] inputWeights, outputWeights = null; + if (cliqueSize > 1) { + inputWeights = inputLayerWeights4Edge; + outputWeights = outputLayerWeights4Edge; + } else { + inputWeights = inputLayerWeights; + outputWeights = outputLayerWeights; + } + + double[] hiddenLayer = NonLinearCliquePotentialFunction.hiddenLayerOutput(inputWeights, cliqueFeatures, flags, featureVal); + int outputLayerSize = inputWeights.length / outputWeights[0].length; + + // transform the hidden layer to output layer through linear transformation + if (flags.useOutputLayer) { + double[] outputWs = null; + if (flags.tieOutputLayer) { + outputWs = outputWeights[0]; + } else { + outputWs = outputWeights[labelIndex]; + } + if (flags.softmaxOutputLayer) { + outputWs = ArrayMath.softmax(outputWs); + } + for (int i = 0; i < inputWeights.length; i++) { + if (flags.sparseOutputLayer || flags.tieOutputLayer) { + if (i % outputLayerSize == labelIndex) { + output += outputWs[ i / outputLayerSize ] * hiddenLayer[i]; + } + } else { + output += outputWs[i] * hiddenLayer[i]; + } + } + } else { + output = hiddenLayer[labelIndex]; + } + + return output; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/GenericDataSetReader.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/GenericDataSetReader.java new file mode 100644 index 0000000..367976b --- /dev/null +++ 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/GenericDataSetReader.java @@ -0,0 +1,537 @@ +package edu.stanford.nlp.ie.machinereading; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Pattern; + +import edu.stanford.nlp.ie.machinereading.common.NoPunctuationHeadFinder; +import edu.stanford.nlp.ie.machinereading.structure.EntityMention; +import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations; +import edu.stanford.nlp.ie.machinereading.structure.Span; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.Label; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.parser.lexparser.ParserConstraint; +import edu.stanford.nlp.parser.lexparser.ParserAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.Annotator; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.HeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.CoreMap; + +/** + * + * @author Andrey Gusev + * @author Mihai + * + */ +public class GenericDataSetReader { + protected Logger logger; + + /** Finds the syntactic head of a syntactic constituent */ + protected final HeadFinder headFinder = new NoPunctuationHeadFinder(); + + /** NL processor to use for sentence pre-processing */ + protected StanfordCoreNLP processor; + + /** + * Additional NL processor that implements only syntactic parsing (needed for head detection) + * We need this processor to detect heads of predicted entities that cannot be matched to an existing constituent. 
+ * This is created on demand, only when necessary + */ + protected Annotator parserProcessor; + + /** If true, we perform syntactic analysis of the dataset sentences and annotations */ + protected final boolean preProcessSentences; + + /** + * If true, sets the head span to match the syntactic head of the extent. + * Otherwise, the head span is not modified. + * This is enabled for the NFL domain, where head spans are not given. + */ + protected final boolean calculateHeadSpan; + + /** If true, it regenerates the index spans for all tree nodes (useful for KBP) */ + protected final boolean forceGenerationOfIndexSpans; + + /** Only around for legacy results */ + protected boolean useNewHeadFinder = true; + + public GenericDataSetReader() { + this(null, false, false, false); + } + + public GenericDataSetReader(StanfordCoreNLP processor, boolean preProcessSentences, boolean calculateHeadSpan, boolean forceGenerationOfIndexSpans) { + this.logger = Logger.getLogger(GenericDataSetReader.class.getName()); + this.logger.setLevel(Level.SEVERE); + + if(processor != null) setProcessor(processor); + parserProcessor = null; + /* old parser options + parser.setOptionFlags(new String[] { + "-outputFormat", "penn,typedDependenciesCollapsed", + "-maxLength", "100", + "-retainTmpSubcategories" + }); + */ + + this.preProcessSentences = preProcessSentences; + this.calculateHeadSpan = calculateHeadSpan; + this.forceGenerationOfIndexSpans = forceGenerationOfIndexSpans; + } + + public void setProcessor(StanfordCoreNLP p) { + this.processor = p; + } + + public void setUseNewHeadFinder(boolean useNewHeadFinder) { + this.useNewHeadFinder = useNewHeadFinder; + } + + public Annotator getParser() { + if(parserProcessor == null){ + parserProcessor = StanfordCoreNLP.getExistingAnnotator("parse"); + assert(parserProcessor != null); + } + return parserProcessor; + } + + public void setLoggerLevel(Level level) { + logger.setLevel(level); + } + public Level getLoggerLevel() { + return 
logger.getLevel(); + } + + /** + * Parses one file or directory with data from one domain + * @param path + * @throws IOException + */ + public final Annotation parse(String path) throws IOException { + Annotation retVal; // set below or exceptions + + try { + // + // this must return a dataset Annotation. each sentence in this dataset must contain: + // - TokensAnnotation + // - EntityMentionAnnotation + // - RelationMentionAnnotation + // - EventMentionAnnotation + // the other annotations (parse, NER) are generated in preProcessSentences + // + retVal = this.read(path); + } catch (Exception ex) { + IOException iox = new IOException(); + iox.initCause(ex); + throw iox; + } + + if (preProcessSentences) { + preProcessSentences(retVal); + } + + return retVal; + } + + public Annotation read(String path) throws Exception { + return null; + } + + private static String sentenceToString(List tokens) { + StringBuilder os = new StringBuilder(); + + // + // Print text and tokens + // + if(tokens != null){ + boolean first = true; + for(CoreLabel token: tokens) { + if(! first) os.append(" "); + os.append(token.word()); + first = false; + } + } + + return os.toString(); + } + + + /** + * Find the index of the head of an entity. + * + * @param ent The entity mention + * @param tree The Tree for the entire sentence in which it occurs. + * @param tokens The Sentence in which it occurs + * @param setHeadSpan Whether to set the head span in the entity mention. 
+ * @return The index of the entity head + */ + public int assignSyntacticHead(EntityMention ent, Tree tree, List tokens, boolean setHeadSpan) { + if (ent.getSyntacticHeadTokenPosition() != -1) { + return ent.getSyntacticHeadTokenPosition(); + } + + logger.finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.toString()); + logger.finest("Flat sentence is: " + tokens); + Tree sh = null; + try { + sh = findSyntacticHead(ent, tree, tokens); + } catch(Exception e) { + logger.severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + sentenceToString(tokens)); + e.printStackTrace(); + } catch(AssertionError e) { + logger.severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + sentenceToString(tokens)); + e.printStackTrace(); + } + + int headPos = ent.getExtentTokenEnd() - 1; + if(sh != null){ + CoreLabel label = (CoreLabel) sh.label(); + headPos = label.get(CoreAnnotations.BeginIndexAnnotation.class); + } else { + logger.fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree); + logger.fine("Fallback strategy: will set head to last token in mention: " + tokens.get(headPos)); + } + ent.setHeadTokenPosition(headPos); + + if (setHeadSpan){ + // set the head span to match exactly the syntactic head + // this is needed for some corpora where the head span is not given + ent.setHeadTokenSpan(new Span(headPos, headPos + 1)); + } + + return headPos; + } + + /** + * Take a dataset Annotation, generate their parse trees and identify syntactic heads (and head spans, if necessary) + */ + public void preProcessSentences(Annotation dataset) { + logger.severe("GenericDataSetReader: Started pre-processing the corpus..."); + // run the processor, i.e., NER, parse etc. 
+ if (processor != null) { + // we might already have syntactic annotation from offline files + List sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class); + if (sentences.size() > 0 && !sentences.get(0).containsKey(TreeCoreAnnotations.TreeAnnotation.class)) { + logger.info("Annotating dataset with " + processor); + processor.annotate(dataset); + } else { + logger.info("Found existing syntactic annotations. Will not use the NLP processor."); + } + } + /* + List sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class); + for(int i = 0; i < sentences.size(); i ++){ + CoreMap sent = sentences.get(i); + List tokens = sent.get(CoreAnnotations.TokensAnnotation.class); + logger.info("Tokens for sentence #" + i + ": " + tokens); + logger.info("Parse tree for sentence #" + i + ": " + sent.get(TreeCoreAnnotations.TreeAnnotation.class).pennString()); + } + */ + + List sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class); + logger.fine("Extracted " + sentences.size() + " sentences."); + for (CoreMap sentence : sentences) { + List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); + logger.fine("Processing sentence " + tokens); + Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + if(tree == null) throw new RuntimeException("ERROR: MR requires full syntactic analysis!"); + + // convert tree labels to CoreLabel if necessary + // we need this because we store additional info in the CoreLabel, such as the spans of each tree + convertToCoreLabels(tree); + + // store the tree spans, if not present already + CoreLabel l = (CoreLabel) tree.label(); + if(forceGenerationOfIndexSpans || (! l.containsKey(CoreAnnotations.BeginIndexAnnotation.class) && ! 
l.containsKey(CoreAnnotations.EndIndexAnnotation.class))){ + tree.indexSpans(0); + logger.fine("Index spans were generated."); + } else { + logger.fine("Index spans were NOT generated."); + } + logger.fine("Parse tree using CoreLabel:\n" + tree.pennString()); + + // + // now match all entity mentions against the syntactic tree + // + if (sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class) != null) { + for (EntityMention ent : sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class)) { + logger.fine("Finding head for entity: " + ent); + int headPos = assignSyntacticHead(ent, tree, tokens, calculateHeadSpan); + logger.fine("Syntactic head of mention \"" + ent + "\" is: " + tokens.get(headPos).word()); + + assert(ent.getExtent() != null); + assert(ent.getHead() != null); + assert(ent.getSyntacticHeadTokenPosition() >= 0); + } + } + } + logger.severe("GenericDataSetReader: Pre-processing complete."); + } + + /** + * Converts the tree labels to CoreLabels. + * We need this because we store additional info in the CoreLabel, like token span. + * @param tree + */ + public static void convertToCoreLabels(Tree tree) { + Label l = tree.label(); + if(! (l instanceof CoreLabel)){ + CoreLabel cl = new CoreLabel(); + cl.setValue(l.value()); + tree.setLabel(cl); + } + + for (Tree kid : tree.children()) { + convertToCoreLabels(kid); + } + } + + private static String printTree(Tree tree) { + StringBuilder sb = new StringBuilder(); + return tree.toStringBuilder(sb, true).toString(); + } + + private Tree safeHead(Tree top) { + Tree head = top.headTerminal(headFinder); + if (head != null) return head; + // if no head found return the right-most leaf + List leaves = top.getLeaves(); + if(leaves.size() > 0) return leaves.get(leaves.size() - 1); + // fallback: return top + return top; + } + + /** + * Finds the syntactic head of the given entity mention. 
+ * + * @param ent The entity mention + * @param root The Tree for the entire sentence in which it occurs. + * @param tokens The Sentence in which it occurs + * @return The tree object corresponding to the head. This MUST be a child of root. + * It will be a leaf in the parse tree. + */ + public Tree findSyntacticHead(EntityMention ent, Tree root, List tokens) { + if (!useNewHeadFinder) { + return originalFindSyntacticHead(ent, root, tokens); + } + + logger.fine("Searching for tree matching " + ent); + Tree exactMatch = findTreeWithSpan(root, ent.getExtentTokenStart(), ent.getExtentTokenEnd()); + + // + // found an exact match + // + if (exactMatch != null) { + logger.fine("Mention \"" + ent + "\" mapped to tree: " + printTree(exactMatch)); + return safeHead(exactMatch); + } + + // no exact match found + // in this case, we parse the actual extent of the mention, embedded in a sentence + // context, so as to make the parser work better :-) + + int approximateness = 0; + List extentTokens = new ArrayList(); + extentTokens.add(initCoreLabel("It")); + extentTokens.add(initCoreLabel("was")); + final int ADDED_WORDS = 2; + for (int i = ent.getExtentTokenStart(); i < ent.getExtentTokenEnd(); i++) { + // Add everything except separated dashes! The separated dashes mess with the parser too badly. + CoreLabel label = tokens.get(i); + if ( ! "-".equals(label.word())) { + extentTokens.add(tokens.get(i)); + } else { + approximateness++; + } + } + extentTokens.add(initCoreLabel(".")); + + // constrain the parse to the part we're interested in. + // Starting from ADDED_WORDS comes from skipping "It was". + // -1 to exclude the period. 
+ // We now let it be any kind of nominal constituent, since there + // are VP and S ones + ParserConstraint constraint = new ParserConstraint(); + constraint.start = ADDED_WORDS; + constraint.end = extentTokens.size() - 1; + constraint.state = Pattern.compile(".*"); + List constraints = Collections.singletonList(constraint); + Tree tree = parse(extentTokens, constraints); + logger.fine("No exact match found. Local parse:\n" + tree.pennString()); + convertToCoreLabels(tree); + tree.indexSpans(ent.getExtentTokenStart() - ADDED_WORDS); // remember it has ADDED_WORDS extra words at the beginning + Tree subtree = findPartialSpan(tree, ent.getExtentTokenStart()); + Tree extentHead = safeHead(subtree); + logger.fine("Head is: " + extentHead); + assert(extentHead != null); + // extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree + // Because we deleted dashes, it's index will be >= the index in the extent parse tree + CoreLabel l = (CoreLabel) extentHead.label(); + // Tree realHead = findTreeWithSpan(root, l.get(CoreAnnotations.BeginIndexAnnotation.class), l.get(CoreAnnotations.EndIndexAnnotation.class)); + Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness); + if(realHead != null) logger.fine("Chosen head: " + realHead); + return realHead; + } + + private Tree findPartialSpan(Tree current, int start) { + CoreLabel label = (CoreLabel) current.label(); + int startIndex = label.get(CoreAnnotations.BeginIndexAnnotation.class); + if (startIndex == start) { + logger.fine("findPartialSpan: Returning " + current); + return current; + } + for (Tree kid : current.children()) { + CoreLabel kidLabel = (CoreLabel) kid.label(); + int kidStart = kidLabel.get(CoreAnnotations.BeginIndexAnnotation.class); + int kidEnd = kidLabel.get(CoreAnnotations.EndIndexAnnotation.class); + // System.err.println("findPartialSpan: Examining " + kidLabel.value() + " 
from " + kidStart + " to " + kidEnd); + if (kidStart <= start && kidEnd > start) { + return findPartialSpan(kid, start); + } + } + throw new RuntimeException("Shouldn't happen: " + start + " " + current); + } + + private Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) { + logger.fine("Looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.pennString()); + List leaves = root.getLeaves(); + for (Tree leaf : leaves) { + CoreLabel label = CoreLabel.class.cast(leaf.label()); + int ind = label.get(CoreAnnotations.BeginIndexAnnotation.class); + // System.err.println("Token #" + ind + ": " + leaf.value()); + if (token.equals(leaf.value()) && ind >= index && ind <= index + approximateness) { + return leaf; + } + } + // this shouldn't happen + // but it does happen (VERY RARELY) on some weird web text that includes SGML tags with spaces + // TODO: does this mean that somehow tokenization is different for the parser? check this by throwing an Exception in KBP + logger.severe("GenericDataSetReader: WARNING: Failed to find head token"); + logger.severe(" when looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.pennString()); + return null; + } + + /** + * This is the original version of {@link #findSyntacticHead} before Chris's modifications. + * There's no good reason to use it except for producing historical results. + * It Finds the syntactic head of the given entity mention. + * + * @param ent The entity mention + * @param root The Tree for the entire sentence in which it occurs. + * @param tokens The Sentence in which it occurs + * @return The tree object corresponding to the head. This MUST be a child of root. + * It will be a leaf in the parse tree. 
+ */ + public Tree originalFindSyntacticHead(EntityMention ent, Tree root, List tokens) { + logger.fine("Searching for tree matching " + ent); + Tree exactMatch = findTreeWithSpan(root, ent.getExtentTokenStart(), ent.getExtentTokenEnd()); + + // + // found an exact match + // + if (exactMatch != null) { + logger.fine("Mention \"" + ent + "\" mapped to tree: " + printTree(exactMatch)); + return safeHead(exactMatch); + } + + // + // no exact match found + // in this case, we parse the actual extent of the mention + // + List extentTokens = new ArrayList(); + for (int i = ent.getExtentTokenStart(); i < ent.getExtentTokenEnd(); i++) + extentTokens.add(tokens.get(i)); + + Tree tree = parse(extentTokens); + logger.fine("No exact match found. Local parse:\n" + tree.pennString()); + convertToCoreLabels(tree); + tree.indexSpans(ent.getExtentTokenStart()); + Tree extentHead = safeHead(tree); + assert (extentHead != null); + // extentHead is a child in the local extent parse tree. we need to find the + // corresponding node in the main tree + CoreLabel l = (CoreLabel) extentHead.label(); + Tree realHead = findTreeWithSpan(root, l.get(CoreAnnotations.BeginIndexAnnotation.class), l.get(CoreAnnotations.EndIndexAnnotation.class)); + assert (realHead != null); + + return realHead; + } + + private static CoreLabel initCoreLabel(String token) { + CoreLabel label = new CoreLabel(); + label.setWord(token); + label.set(CoreAnnotations.TextAnnotation.class, token); + return label; + } + + protected Tree parseStrings(List tokens) { + List labels = new ArrayList(); + for (String t : tokens) { + CoreLabel l = initCoreLabel(t); + labels.add(l); + } + return parse(labels); + } + + protected Tree parse(List tokens) { + return parse(tokens, null); + } + + protected Tree parse(List tokens, + List constraints) { + CoreMap sent = new Annotation(""); + sent.set(CoreAnnotations.TokensAnnotation.class, tokens); + sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints); + Annotation doc = 
new Annotation(""); + List sents = new ArrayList(); + sents.add(sent); + doc.set(CoreAnnotations.SentencesAnnotation.class, sents); + getParser().annotate(doc); + sents = doc.get(CoreAnnotations.SentencesAnnotation.class); + return sents.get(0).get(TreeCoreAnnotations.TreeAnnotation.class); + } + + /** + * Finds the tree with the given token span. + * The tree must have CoreLabel labels and Tree.indexSpans must be called before this method. + * + * @param tree The tree to search in + * @param start The beginning index + * @param end + * @return A child of tree if match; otherwise null + */ + private static Tree findTreeWithSpan(Tree tree, int start, int end) { + CoreLabel l = (CoreLabel) tree.label(); + if (l != null && l.has(CoreAnnotations.BeginIndexAnnotation.class) && l.has(CoreAnnotations.EndIndexAnnotation.class)) { + int myStart = l.get(CoreAnnotations.BeginIndexAnnotation.class); + int myEnd = l.get(CoreAnnotations.EndIndexAnnotation.class); + if (start == myStart && end == myEnd){ + // found perfect match + return tree; + } else if (end < myStart) { + return null; + } else if (start >= myEnd) { + return null; + } + } + + // otherwise, check inside children - a match is possible + for (Tree kid : tree.children()) { + if (kid == null) continue; + Tree ret = findTreeWithSpan(kid, start, end); + // found matching child + if (ret != null) return ret; + } + + // no match + return null; + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/DomReader.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/DomReader.java new file mode 100644 index 0000000..c105c42 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/DomReader.java @@ -0,0 +1,159 @@ +package edu.stanford.nlp.ie.machinereading.common; + + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import 
java.util.List; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +/** + * Generic DOM reader for an XML file + */ +public class DomReader { + + /** + * Searches (recursively) for the first child that has the given name + */ + protected static Node getChildByName(Node node, String name) { + NodeList children = node.getChildNodes(); + + // this node matches + if (node.getNodeName().equals(name)) + return node; + + // search children + for (int i = 0; i < children.getLength(); i++) { + Node found = getChildByName(children.item(i), name); + if (found != null) + return found; + } + + // failed + return null; + } + + /** + * Searches for all immediate children with the given name + */ + protected static List getChildrenByName(Node node, String name) { + List matches = new ArrayList(); + NodeList children = node.getChildNodes(); + + // search children + for (int i = 0; i < children.getLength(); i++) { + Node child = children.item(i); + if (child.getNodeName().equals(name)) { + matches.add(child); + } + } + + return matches; + } + + /** + * Searches for children that have the given attribute + */ + protected static Node getChildByAttribute(Node node, String attributeName, String attributeValue) { + NodeList children = node.getChildNodes(); + NamedNodeMap attribs = node.getAttributes(); + Node attribute = null; + + // this node matches + if (attribs != null && (attribute = attribs.getNamedItem(attributeName)) != null + && attribute.getNodeValue().equals(attributeValue)) + return node; + + // search children + for (int i = 0; i < children.getLength(); i++) { + Node found = getChildByAttribute(children.item(i), attributeName, attributeValue); + if (found != null) + return found; + } + + // failed + return null; + } + + 
/** + * Searches for children that have the given name and attribute + */ + protected static Node getChildByNameAndAttribute(Node node, String name, String attributeName, String attributeValue) { + NodeList children = node.getChildNodes(); + NamedNodeMap attribs = node.getAttributes(); + Node attribute = null; + + // this node matches + if (node.getNodeName().equals(name) && attribs != null + && (attribute = attribs.getNamedItem(attributeName)) != null + && attribute.getNodeValue().equals(attributeValue)) + return node; + + // search children + for (int i = 0; i < children.getLength(); i++) { + Node found = getChildByAttribute(children.item(i), attributeName, attributeValue); + if (found != null) + return found; + } + + // failed + return null; + } + + /** + * Fetches the value of a given attribute + */ + public static String getAttributeValue(Node node, String attributeName) { + try { + return node.getAttributes().getNamedItem(attributeName).getNodeValue(); + } catch (Exception e) { + } + + return null; + } + + /** + * Constructs one Document from an XML file + */ + public static Document readDocument(File f) throws IOException, SAXException, ParserConfigurationException { + Document document = null; + + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + // factory.setValidating(true); + // factory.setNamespaceAware(true); + + try { + DocumentBuilder builder = factory.newDocumentBuilder(); + document = builder.parse(f); + + // displayDocument(document); + + } catch (SAXException sxe) { + // Error generated during parsing) + Exception x = sxe; + if (sxe.getException() != null) + x = sxe.getException(); + x.printStackTrace(); + throw sxe; + } catch (ParserConfigurationException pce) { + // Parser with specified options can't be built + pce.printStackTrace(); + throw pce; + } catch (IOException ioe) { + // I/O error + ioe.printStackTrace(); + throw ioe; + } + + return document; + } // readDocument +} diff --git 
a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/NoPunctuationHeadFinder.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/NoPunctuationHeadFinder.java new file mode 100644 index 0000000..681de7b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/NoPunctuationHeadFinder.java @@ -0,0 +1,66 @@ +package edu.stanford.nlp.ie.machinereading.common; + +import edu.stanford.nlp.ling.CategoryWordTag; +import edu.stanford.nlp.trees.DiskTreebank; +import edu.stanford.nlp.trees.HeadFinder; +import edu.stanford.nlp.trees.ModCollinsHeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeVisitor; +import edu.stanford.nlp.trees.Treebank; + +/** + * Simple variant of the ModCollinsHeadFinder avoids supplying punctuation tags + * as heads whenever possible. + * + * @author David McClosky (mcclosky@stanford.edu) + * + */ +public class NoPunctuationHeadFinder extends ModCollinsHeadFinder { + + private static final long serialVersionUID = 1201891305937180385L; + + /** + * Returns whether a part of speech tag is the tag for a punctuation mark (by + * checking whether the first character is a letter. 
+ * + * @param label + * part of speech tag + * @return whether the tag is (typically) assigned to punctuation + */ + private boolean isPunctuationLabel(String label) { + return !Character.isLetter(label.charAt(0)) + && !(label.equals("$") || label.equals("%")); + } + + protected int postOperationFix(int headIdx, Tree[] daughterTrees) { + int index = super.postOperationFix(headIdx, daughterTrees); + // if the current index is a punctuation mark, we search left until we + // find a non-punctuation mark tag or hit the left end of the sentence + while (index > 0) { + String label = daughterTrees[index].label().value(); + if (isPunctuationLabel(label)) { + index--; + } else { + break; + } + } + + return index; + } + + public static void main(String[] args) { + // simple testing code + Treebank treebank = new DiskTreebank(); + CategoryWordTag.suppressTerminalDetails = true; + treebank.loadPath(args[0]); + final HeadFinder chf = new NoPunctuationHeadFinder(); + treebank.apply(new TreeVisitor() { + public void visitTree(Tree pt) { + pt.percolateHeads(chf); + pt.pennPrint(); + System.out.println(); + } + }); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/SimpleTokenize.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/SimpleTokenize.java new file mode 100644 index 0000000..f9c413d --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/SimpleTokenize.java @@ -0,0 +1,137 @@ + +package edu.stanford.nlp.ie.machinereading.common; + +import java.util.ArrayList; +import java.util.StringTokenizer; + +/** + * Simple string tokenization + */ +public class SimpleTokenize { + /** Basic string tokenization, skipping over white spaces */ + public static ArrayList tokenize(String line) { + ArrayList tokens = new ArrayList(); + StringTokenizer tokenizer = new StringTokenizer(line); + while 
(tokenizer.hasMoreElements()) { + tokens.add(tokenizer.nextToken()); + } + return tokens; + } + + /** Basic string tokenization, skipping over white spaces */ + public static ArrayList tokenize(String line, String separators) { + ArrayList tokens = new ArrayList(); + StringTokenizer tokenizer = new StringTokenizer(line, separators); + while (tokenizer.hasMoreElements()) { + tokens.add(tokenizer.nextToken()); + } + return tokens; + } + + /** + * Finds the first non-whitespace character starting at start + */ + private static int findNonWhitespace(String s, int start) { + for (; start < s.length(); start++) { + if (Character.isWhitespace(s.charAt(start)) == false) + return start; + } + return -1; + } + + private static int findWhitespace(String s, int start) { + for (; start < s.length(); start++) { + if (Character.isWhitespace(s.charAt(start))) + return start; + } + return -1; + } + + /** + * Replaces all occurences of \" with " + */ + private static String normalizeQuotes(String str) { + StringBuffer buffer = new StringBuffer(); + for (int i = 0; i < str.length(); i++) { + // do not include \ if followed by " + if (str.charAt(i) == '\\' && i < str.length() - 1 && str.charAt(i + 1) == '\"') { + continue; + } else { + buffer.append(str.charAt(i)); + } + } + return buffer.toString(); + } + + /** + * String tokenization, considering everything within quotes as 1 token + * Regular quotes inside tokens MUST be preceded by \ + */ + public static ArrayList tokenizeWithQuotes(String line) { + ArrayList tokens = new ArrayList(); + int position = 0; + + while ((position = findNonWhitespace(line, position)) != -1) { + int end = -1; + + // found quoted token (not preceded by \) + if (line.charAt(position) == '\"' && (position == 0 || line.charAt(position - 1) != '\\')) { + + // find the first quote not preceded by \ + int current = position; + for (;;) { + // found end of string first + if ((end = line.indexOf('\"', current + 1)) == -1) { + end = line.length(); + break; + } 
else { // found a quote + if (line.charAt(end - 1) != '\\') { // valid quote + end++; + break; + } else { // quote preceded by \ + current = end; + } + } + } + + // do not include the quotes in the token + tokens.add(normalizeQuotes(line.substring(position + 1, end - 1))); + } + + // regular token + else { + if ((end = findWhitespace(line, position + 1)) == -1) + end = line.length(); + + tokens.add(new String(line.substring(position, end))); + } + + position = end; + } + + return tokens; + } + + /** + * Constructs a valid quote-surrounded token All inside quotes are preceded by + * \ + */ + public static String quotify(String str) { + StringBuffer buffer = new StringBuffer(); + buffer.append('\"'); + for (int i = 0; i < str.length(); i++) { + if (str.charAt(i) == '\"') + buffer.append('\\'); + buffer.append(str.charAt(i)); + } + buffer.append('\"'); + return buffer.toString(); + } + + /** Implements a simple test */ + public static void main(String[] argv) { + String in = "T \"Athens \\\"the beautiful\\\"\" \"Athens\" \"\" \"Greece\""; + System.err.println("Input: " + in); + System.err.println(tokenizeWithQuotes(in)); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/StringDictionary.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/StringDictionary.java new file mode 100644 index 0000000..1f16896 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/common/StringDictionary.java @@ -0,0 +1,186 @@ +package edu.stanford.nlp.ie.machinereading.common; + +import java.io.BufferedReader; +import java.util.ArrayList; +import java.util.Map; +import java.util.Set; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.util.Generics; + +public class StringDictionary { + + public static class IndexAndCount { + + public final int mIndex; + public int mCount; + + IndexAndCount(int i, int c) { + mIndex 
= i; + mCount = c; + } + } + + /** Name of this dictionary */ + private final String mName; + + /** + * Access type: If true, create a dictionary entry if the entry does not exist + * in get Otherwise, return -1 if the entry does not exist in get + */ + private boolean mCreate; + + /** The actual dictionary */ + private Map mDict; + + /** Inverse mapping from integer keys to the string values */ + private Map mInverse; + + public StringDictionary(String name) { + mName = name; + mCreate = false; + mDict = Generics.newHashMap(); + mInverse = Generics.newHashMap(); + } + + public void setMode(boolean mode) { + mCreate = mode; + } + + public int size() { + return mDict.size(); + } + + public int get(String s) { + return get(s, true); + } + + public IndexAndCount getIndexAndCount(String s) { + IndexAndCount ic = mDict.get(s); + if (mCreate) { + if (ic == null) { + ic = new IndexAndCount(mDict.size(), 0); + mDict.put(s, ic); + mInverse.put(Integer.valueOf(ic.mIndex), s); + } + ic.mCount++; + } + return ic; + } + + /** + * Fetches the index of this string If mCreate is true, the entry is created + * if it does not exist. 
If mCreate is true, the count of the entry is + * incremented for every get If no entry found throws an exception if + * shouldThrow == true + */ + public int get(String s, boolean shouldThrow) { + IndexAndCount ic = mDict.get(s); + if (mCreate) { + if (ic == null) { + ic = new IndexAndCount(mDict.size(), 0); + mDict.put(s, ic); + mInverse.put(Integer.valueOf(ic.mIndex), s); + } + ic.mCount++; + } + if (ic != null) + return ic.mIndex; + + if (shouldThrow) { + throw new RuntimeException("Unknown entry \"" + s + "\" in dictionary \"" + mName + "\"!"); + } else { + return -1; + } + } + + public static final String NIL_VALUE = "nil"; + + /** + * Reverse mapping from integer key to string value + */ + public String get(int idx) { + if (idx == -1) + return NIL_VALUE; + + String s = mInverse.get(idx); + if (s == null) + throw new RuntimeException("Unknown index \"" + idx + "\" in dictionary \"" + mName + "\"!"); + return s; + } + + public int getCount(int idx) { + if (idx == -1) + return 0; + + String s = mInverse.get(idx); + if (s == null) + throw new RuntimeException("Unknown index \"" + idx + "\" in dictionary \"" + mName + "\"!"); + + return getIndexAndCount(s).mCount; + } + + /** + * Saves all dictionary entries that appeared > threshold times Note: feature + * indices are changed to contiguous values starting at 0. This is needed in + * order to minimize the memory allocated for the expanded feature vectors + * (average perceptron). + */ + public void save(String path, String prefix, int threshold) throws java.io.IOException { + + String fileName = path + java.io.File.separator + prefix + "." 
+ mName; + java.io.PrintStream os = new java.io.PrintStream(new java.io.FileOutputStream(fileName)); + + Set keys = mDict.keySet(); + int index = 0; + for (String key : keys) { + IndexAndCount ic = mDict.get(key); + if (ic.mCount > threshold) { + os.println(key + " " + index + " " + ic.mCount); + index++; + } + } + + os.close(); + System.err.println("Saved " + index + "/" + mDict.size() + " entries for dictionary \"" + mName + "\"."); + } + + public void clear() { + mDict.clear(); + mInverse.clear(); + } + + public Set keySet() { + return mDict.keySet(); + } + + /** Loads all saved dictionary entries from disk */ + public void load(String path, String prefix) throws java.io.IOException { + + String fileName = path + java.io.File.separator + prefix + "." + mName; + BufferedReader is = IOUtils.readerFromString(fileName); + + for (String line; (line = is.readLine()) != null; ) { + ArrayList tokens = SimpleTokenize.tokenize(line); + if (tokens.size() != 3) { + throw new RuntimeException("Invalid dictionary line: " + line); + } + int index = Integer.parseInt(tokens.get(1)); + int count = Integer.parseInt(tokens.get(2)); + if (index < 0 || count <= 0) { + throw new RuntimeException("Invalid dictionary line: " + line); + } + + IndexAndCount ic = new IndexAndCount(index, count); + mDict.put(tokens.get(0), ic); + mInverse.put(Integer.valueOf(index), tokens.get(0)); + } + + is.close(); + System.err.println("Loaded " + mDict.size() + " entries for dictionary \"" + mName + "\"."); + } + + public java.util.Set keys() { + return mDict.keySet(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/AceReader.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/AceReader.java new file mode 100644 index 0000000..6f2516e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/AceReader.java @@ -0,0 
+1,495 @@ +package edu.stanford.nlp.ie.machinereading.domains.ace; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import edu.stanford.nlp.ie.machinereading.GenericDataSetReader; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceCharSeq; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceDocument; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntity; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMentionArgument; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken; +import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils; +import edu.stanford.nlp.ie.machinereading.structure.EntityMention; +import edu.stanford.nlp.ie.machinereading.structure.EventMention; +import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject; +import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations; +import edu.stanford.nlp.ie.machinereading.structure.RelationMention; +import edu.stanford.nlp.ie.machinereading.structure.Span; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.CoreMap; +import 
edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.StringUtils; + +/** + * + * Simple wrapper of Mihai's ACE code to ie.machinereading.structure objects + * + * @author David McClosky + * + */ +public class AceReader extends GenericDataSetReader { + + private final Counter entityCounts; + private final Counter adjacentEntityMentions; + private final Counter relationCounts; + private final Counter nameRelationCounts; + private final Counter eventCounts; + private final Counter mentionTypeCounts; + private final String aceVersion; + private static final boolean VERBOSE = false; + + /** + * Make an AceReader. + */ + public AceReader() { + this(null, true); + } + + public AceReader(StanfordCoreNLP processor, boolean preprocess) { + this(processor, preprocess, "ACE2005"); + } + + public AceReader(StanfordCoreNLP processor, boolean preprocess, String version) { + super(processor, preprocess, false, true); + + entityCounts = new ClassicCounter(); + adjacentEntityMentions = new ClassicCounter(); + nameRelationCounts = new ClassicCounter(); + relationCounts = new ClassicCounter(); + eventCounts = new ClassicCounter(); + mentionTypeCounts = new ClassicCounter(); + + logger = Logger.getLogger(AceReader.class.getName()); + // run quietly by default + logger.setLevel(Level.SEVERE); + + aceVersion = version; + } + + /** + * Reads in ACE*.apf.xml files and converts them to RelationSentence objects. + * Note that you probably should call parse() instead. + * + * Currently, this ignores document boundaries (the list returned will include + * sentences from all documents). + * + * @param path directory containing ACE files to read (e.g. + * "/home/mcclosky/scr/data/ACE2005/english_test"). This can also be + * the path to a single file. 
* + * @return list of RelationSentence objects + */ + @Override + public Annotation read(String path) throws IOException, SAXException, ParserConfigurationException { + List allSentences = new ArrayList(); + File basePath = new File(path); + assert basePath.exists(); + Annotation corpus = new Annotation(""); + + if (basePath.isDirectory()) { + for (File aceFile : IOUtils.iterFilesRecursive(basePath, ".apf.xml")) { + if (aceFile.getName().endsWith(".UPC1.apf.xml")) { + continue; + } + allSentences.addAll(readDocument(aceFile, corpus)); + } + } else { + // in case it's a file + allSentences.addAll(readDocument(basePath, corpus)); + } + + AnnotationUtils.addSentences(corpus, allSentences); + + // quick stats + if (VERBOSE) { + printCounter(entityCounts, "entity mention"); + printCounter(relationCounts, "relation mention"); + printCounter(eventCounts, "event mention"); + } + + + for(CoreMap sent: allSentences){ + // check for entity mentions of the same type that are adjacent + countAdjacentMentions(sent); + // count relations between two proper nouns + countNameRelations(sent); + // count types of mentions + countMentionTypes(sent); + } + if (VERBOSE) { + printCounter(adjacentEntityMentions, "adjacent entity mention"); + printCounter(nameRelationCounts, "name relation mention"); + printCounter(mentionTypeCounts, "mention type counts"); + } + + return corpus; + } + + private void countMentionTypes(CoreMap sent) { + List mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + if(mentions != null){ + for(EntityMention m: mentions){ + mentionTypeCounts.incrementCount(m.getMentionType()); + } + } + } + + private void countNameRelations(CoreMap sent) { + List mentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); + if(mentions != null){ + for(RelationMention m: mentions) { + List args = m.getEntityMentionArgs(); + if(args.size() == 2 && args.get(0).getMentionType().equals("NAM") && 
args.get(1).getMentionType().equals("NAM")){ + nameRelationCounts.incrementCount(m.getType() + "." + m.getSubType()); + } + } + } + } + + private void countAdjacentMentions(CoreMap sent) { + List mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + if(mentions != null){ + for(EntityMention m1: mentions){ + for(EntityMention m2: mentions){ + if(m1 == m2) continue; + if(m1.getHeadTokenEnd() == m2.getHeadTokenStart() && m1.getType().equals(m2.getType())){ + adjacentEntityMentions.incrementCount(m1.getType()); + } + } + } + } + } + + // todo: Change to use a counters print method (get sorting for free!) + private void printCounter(Counter c, String h) { + StringBuilder b = new StringBuilder(); + b.append(h).append(" counts:\n"); + Set keys = c.keySet(); + for(String k: keys){ + b.append("\t").append(k).append(": ").append(c.getCount(k)).append("\n"); + } + logger.info(b.toString()); + } + + /** + * Reads in a single ACE*.apf.xml file and convert it to RelationSentence + * objects. However, you probably should call parse() instead. + * + * @param file A file object of an ACE file + * @return list of RelationSentence objects + */ + private List readDocument(File file, Annotation corpus) throws IOException, SAXException, + ParserConfigurationException { + // remove the extension to make it into a prefix + String aceFilename = file.getAbsolutePath().replace(".apf.xml", ""); + List sentencesFromFile = readDocument(aceFilename, corpus); + return sentencesFromFile; + } + + /** + * Reads in a single ACE*.apf.xml file and convert it to RelationSentence + * objects. However, you probably should call parse() instead. + * + * @param prefix prefix of ACE filename to read (e.g. 
+ * "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01" + * ) (no ".apf.xml" extension) + * @return list of RelationSentence objects + */ + private List readDocument(String prefix, Annotation corpus) throws IOException, SAXException, + ParserConfigurationException { + logger.info("Reading document: " + prefix); + List results = new ArrayList(); + AceDocument aceDocument; + if(aceVersion.equals("ACE2004")){ + aceDocument = AceDocument.parseDocument(prefix, false, aceVersion); + } else { + aceDocument = AceDocument.parseDocument(prefix, false); + } + String docId = aceDocument.getId(); + + // map entity mention ID strings to their EntityMention counterparts + Map entityMentionMap = Generics.newHashMap(); + + /* + for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) { + List tokens = aceDocument.getSentence(sentenceIndex); + StringBuffer b = new StringBuffer(); + for(AceToken t: tokens) b.append(t.getLiteral() + " " ); + logger.info("SENTENCE: " + b.toString()); + } + */ + + int tokenOffset = 0; + for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) { + List tokens = aceDocument.getSentence(sentenceIndex); + + List words = new ArrayList(); + StringBuilder textContent = new StringBuilder(); + for(int i = 0; i < tokens.size(); i ++){ + CoreLabel l = new CoreLabel(); + l.setWord(tokens.get(i).getLiteral()); + l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart()); + l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd()); + words.add(l); + if(i > 0) textContent.append(" "); + textContent.append(tokens.get(i).getLiteral()); + } + + // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer) + if (words.size() == 1) { + String word = words.get(0).word(); + if (word.startsWith("<") && word.endsWith(">")) { + tokenOffset += tokens.size(); + continue; + } + } + + CoreMap sentence 
= new Annotation(textContent.toString()); + sentence.set(CoreAnnotations.DocIDAnnotation.class, docId); + sentence.set(CoreAnnotations.TokensAnnotation.class, words); + logger.info("Reading sentence: \"" + textContent + "\""); + + List entityMentions = aceDocument.getEntityMentions(sentenceIndex); + List relationMentions = aceDocument.getRelationMentions(sentenceIndex); + List eventMentions = aceDocument.getEventMentions(sentenceIndex); + + // convert entity mentions + for (AceEntityMention aceEntityMention : entityMentions) { + String corefID=""; + for(String entityID : aceDocument.getKeySetEntities()){ + AceEntity e = aceDocument.getEntity(entityID); + if(e.getMentions().contains(aceEntityMention)){ + corefID = entityID; + break; + } + } + EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID); +// EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset); + entityCounts.incrementCount(convertedMention.getType()); + logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead()); + logger.info("CONVERTED ENTITY MENTION: " + convertedMention); + AnnotationUtils.addEntityMention(sentence, convertedMention); + entityMentionMap.put(aceEntityMention.getId(), convertedMention); + + // TODO: make Entity objects as needed + } + + // convert relation mentions + for (AceRelationMention aceRelationMention : relationMentions) { + RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap); + if(convertedMention != null){ + relationCounts.incrementCount(convertedMention.getType()); + logger.info("CONVERTED RELATION MENTION: " + convertedMention); + AnnotationUtils.addRelationMention(sentence, convertedMention); + } + + // TODO: make Relation objects + } + + // convert EventMentions + for(AceEventMention aceEventMention: eventMentions){ + EventMention convertedMention = 
convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset); + if(convertedMention != null){ + eventCounts.incrementCount(convertedMention.getType()); + logger.info("CONVERTED EVENT MENTION: " + convertedMention); + AnnotationUtils.addEventMention(sentence, convertedMention); + } + + // TODO: make Event objects + } + + results.add(sentence); + tokenOffset += tokens.size(); + } + return results; + } + + private EventMention convertAceEventMention( + AceEventMention aceEventMention, String docId, + CoreMap sentence, Map entityMap, + int tokenOffset) { + Set roleSet = aceEventMention.getRoles(); + List roles = new ArrayList(); + for(String role: roleSet) roles.add(role); + List convertedArgs = new ArrayList(); + + int left = Integer.MAX_VALUE; + int right = Integer.MIN_VALUE; + for(String role: roles){ + AceEntityMention arg = aceEventMention.getArg(role); + ExtractionObject o = entityMap.get(arg.getId()); + if(o == null){ + logger.severe("READER ERROR: Failed to find event argument with id " + arg.getId()); + logger.severe("This happens because a few event mentions illegally span multiple sentences. 
Will ignore this mention."); + return null; + } + convertedArgs.add(o); + if(o.getExtentTokenStart() < left) left = o.getExtentTokenStart(); + if(o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd(); + } + + AceCharSeq anchor = aceEventMention.getAnchor(); + ExtractionObject anchorObject = new ExtractionObject( + aceEventMention.getId() + "-anchor", + sentence, + new Span(anchor.getTokenStart() - tokenOffset, anchor.getTokenEnd() + 1 - tokenOffset), + "ANCHOR", + null); + + EventMention em = new EventMention( + aceEventMention.getId(), + sentence, + new Span(left, right), + aceEventMention.getParent().getType(), + aceEventMention.getParent().getSubtype(), + anchorObject, + convertedArgs, + roles); + return em; + } + + private RelationMention convertAceRelationMention(AceRelationMention aceRelationMention, String docId, + CoreMap sentence, Map entityMap) { + List args = Arrays.asList(aceRelationMention.getArgs()); + List convertedArgs = new ArrayList(); + List argNames = new ArrayList(); + + // the arguments are already stored in semantic order. Make sure we preserve the same ordering! + int left = Integer.MAX_VALUE; + int right = Integer.MIN_VALUE; + for (AceRelationMentionArgument arg : args) { + ExtractionObject o = entityMap.get(arg.getContent().getId()); + if(o == null){ + logger.severe("READER ERROR: Failed to find relation argument with id " + arg.getContent().getId()); + logger.severe("This happens because a few relation mentions illegally span multiple sentences. Will ignore this mention."); + return null; + } + convertedArgs.add(o); + argNames.add(arg.getRole()); + if(o.getExtentTokenStart() < left) left = o.getExtentTokenStart(); + if(o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd(); + } + + if(argNames.size() != 2 || ! argNames.get(0).equalsIgnoreCase("arg-1") || ! 
argNames.get(1).equalsIgnoreCase("arg-2")){ + logger.severe("READER ERROR: Invalid succession of arguments in relation mention: " + argNames); + logger.severe("ACE relations must have two arguments. Will ignore this mention."); + return null; + } + + RelationMention relation = new RelationMention( + aceRelationMention.getId(), + sentence, + new Span(left, right), + aceRelationMention.getParent().getType(), + aceRelationMention.getParent().getSubtype(), + convertedArgs, + null); + return relation; + } + + /** + * Convert an {@link AceEntityMention} to an {@link EntityMention}. + * + * @param entityMention {@link AceEntityMention} to convert + * @param docId ID of the document containing this entity mention + * @param sentence + * @param tokenOffset An offset in the calculations of position of the extent to sentence boundary + * (the ace.reader stores absolute token offset from the beginning of the document, but + * we need token offsets from the beginning of the sentence => adjust by tokenOffset) + * @return entity as an {@link EntityMention} + */ + private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset) { + //System.err.println("TYPE is " + entityMention.getParent().getType()); + //System.err.println("SUBTYPE is " + entityMention.getParent().getSubtype()); + //System.err.println("LDCTYPE is " + entityMention.getLdctype()); + + AceCharSeq ext = entityMention.getExtent(); + AceCharSeq head = entityMention.getHead(); + + int extStart = ext.getTokenStart() - tokenOffset; + int extEnd = ext.getTokenEnd() - tokenOffset + 1; + if (extStart < 0) { + logger.severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence); + logger.severe("This may happen due to incorrect EOS detection. 
Adjusting entity extent."); + extStart = 0; + } + if (extEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) { + logger.severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence); + logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent."); + extEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size(); + } + + int headStart = head.getTokenStart() - tokenOffset; + int headEnd = head.getTokenEnd() - tokenOffset + 1; + if (headStart < 0) { + logger.severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence); + logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span."); + headStart = 0; + } + if(headEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()){ + logger.severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence); + logger.severe("This may happen due to incorrect EOS detection. 
Adjusting entity head span."); + headEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size(); + } + + // must adjust due to possible incorrect EOS detection + if(headStart < extStart){ + headStart = extStart; + } + if(headEnd > extEnd){ + headEnd = extEnd; + } + assert(headStart < headEnd); + + // note: the ace.reader stores absolute token offset from the beginning of the document, but + // we need token offsets from the beginning of the sentence => adjust by tokenOffset + // note: in ace.reader the end token position is inclusive, but + // in our setup the end token position is exclusive => add 1 to end + EntityMention converted = new EntityMention( + entityMention.getId(), + sentence, + new Span(extStart, extEnd), + new Span(headStart, headEnd), + entityMention.getParent().getType(), + entityMention.getParent().getSubtype(), + entityMention.getLdctype()); + return converted; + } + + private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset, String corefID) { + EntityMention converted = convertAceEntityMention(entityMention, docId, sentence, tokenOffset); + converted.setCorefID(corefID); + return converted; + } + + // simple testing code + public static void main(String[] args) throws IOException { + Properties props = StringUtils.argsToProperties(args); + AceReader r = new AceReader(new StanfordCoreNLP(props, false), false); + r.setLoggerLevel(Level.INFO); + r.parse("/scr/nlp/data/ACE2005/"); + System.out.println("done"); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceCharSeq.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceCharSeq.java new file mode 100644 index 0000000..8d77e77 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceCharSeq.java @@ -0,0 +1,179 
@@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.util.Vector; + +import edu.stanford.nlp.trees.Span; + +/** + * Implements the ACE construct + */ +public class AceCharSeq { + /** The exact text matched by this sequence */ + private String mText; + + /** Offset in the document stream */ + private Span mByteOffset; + + /** Span of tokens that match this char sequence */ + private Span mTokenOffset; + + /** + * Token that incorporates this whole char sequence, e.g. + * "George_Bush/NNP_NNP" for the text "George Bush" XXX: not used anymore + */ + // private AceToken mPhrase; + + public AceCharSeq(String text, int start, int end) { + mText = text; + mByteOffset = new Span(start, end); + mTokenOffset = null; + // mPhrase = null; + } + + public String toXml(String label, int offset) { + StringBuffer buffer = new StringBuffer(); + AceElement.appendOffset(buffer, offset); + buffer.append("<" + label + ">\n"); + + AceElement.appendOffset(buffer, offset + 2); + buffer.append("" + mText + + ""); + buffer.append("\n"); + + AceElement.appendOffset(buffer, offset); + buffer.append(""); + return buffer.toString(); + } + + public String toXml(int offset) { + StringBuffer buffer = new StringBuffer(); + + AceElement.appendOffset(buffer, offset + 2); + buffer.append("" + mText + + ""); + + return buffer.toString(); + } + + public String getText() { + return mText; + } + + public int getByteStart() { + return mByteOffset.start(); + } + + public int getByteEnd() { + return mByteOffset.end(); + } + + public Span getByteOffset() { + return mByteOffset; + } + + public int getTokenStart() { + if (mTokenOffset == null) + return -1; + return mTokenOffset.start(); + } + + public int getTokenEnd() { + if (mTokenOffset == null) + return -1; + return mTokenOffset.end(); + } + + public Span getTokenOffset() { + return mTokenOffset; + } + + // public AceToken getPhrase() { return mPhrase; } + + /** + * Matches this char seq against the full token stream As a 
result of this + * method mTokenOffset is initialized + */ + public void match(Vector tokens) throws MatchException { + int start = -1; + int end = -1; + + for (int i = 0; i < tokens.size(); i++) { + // + // we found the starting token + // + if (tokens.get(i).getByteOffset().start() == mByteOffset.start()) { + start = i; + } + + // + // we do not tokenize dashed-words, hence the start may be inside a token + // e.g. Saddam => pro-Saddam + // the same situation will happen due to (uncommon) annotation errors + // + else if (mByteOffset.start() > tokens.get(i).getByteOffset().start() + && mByteOffset.start() < tokens.get(i).getByteOffset().end()) { + start = i; + } + + // + // we found the ending token + // Note: ACE is inclusive for the end position, my tokenization is not + // in ACE: end position == position of last byte in token + // in .sgm.pre: end position == position of last byte + 1 + // + if (tokens.get(i).getByteOffset().end() == mByteOffset.end() + 1) { + end = i; + break; + } + + // + // we do not tokenize dashed-words, hence the end may be inside a token + // e.g. 
Conference => Conference-leading + // the same situation will happen due to (uncommon) annotation errors + // + else if (mByteOffset.end() >= tokens.get(i).getByteOffset().start() + && mByteOffset.end() < tokens.get(i).getByteOffset().end() - 1) { + end = i; + break; + } + } + + if (start >= 0 && end >= 0) { + mTokenOffset = new Span(start, end); + // mPhrase = makePhrase(tokens, mTokenOffset); + } else { + throw new MatchException("Match failed!"); + } + } + + @Override + public String toString() { + return "AceCharSeq [mByteOffset=" + mByteOffset + ", mText=" + mText + + ", mTokenOffset=" + mTokenOffset + "]"; + } + + /* + * private AceToken makePhrase(Vector tokens, Span span) { + * StringBuffer word = new StringBuffer(); StringBuffer lemma = new + * StringBuffer(); StringBuffer pos = new StringBuffer(); StringBuffer chunk = + * new StringBuffer(); StringBuffer nerc = new StringBuffer(); + * + * for(int i = span.mStart; i <= span.mEnd; i ++){ if(i > span.mStart){ + * word.append("_"); lemma.append("_"); pos.append("_"); chunk.append("_"); + * nerc.append("_"); } + * + * AceToken tok = tokens.get(i); + * word.append(AceToken.WORDS.get(tok.getWord())); + * lemma.append(AceToken.LEMMAS.get(tok.getLemma())); + * pos.append(AceToken.OTHERS.get(tok.getPos())); + * chunk.append(AceToken.OTHERS.get(tok.getChunk())); + * nerc.append(AceToken.OTHERS.get(tok.getNerc())); } + * + * AceToken phrase = new AceToken(word.toString(), lemma.toString(), + * pos.toString(), chunk.toString(), nerc.toString(), null, null, -1); + * + * //System.err.println("Constructed phrase: " + phrase.display()); return + * phrase; } + */ +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceDocument.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceDocument.java new file mode 100644 index 0000000..cd3ec3e --- /dev/null +++ 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceDocument.java @@ -0,0 +1,863 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Vector; +import java.util.logging.Logger; + +import edu.stanford.nlp.ie.machinereading.common.SimpleTokenize; +import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader; +import edu.stanford.nlp.util.Generics; + +/** + * Stores the ACE elements annotated in this document + */ +public class AceDocument extends AceElement { + /** Prefix of the files from where this doc was created */ + private String mPrefix; + + /** Value of the SOURCE XML field */ + private String mSource; + + /** All entities */ + private Map mEntities; + /** All entity mentions */ + private Map mEntityMentions; + /** All entity mentions in a given sentence, sorted in textual order */ + private ArrayList> mSentenceEntityMentions; + + /** All relations */ + private Map mRelations; + /** All relation mentions */ + private Map mRelationMentions; + /** All relation mentions in a given sentence, sorted in textual order */ + private ArrayList> mSentenceRelationMentions; + + /** All events */ + private Map mEvents; + /** All event mentions */ + private Map mEventMentions; + /** All event mentions in a given sentence, sorted in textual order */ + private ArrayList> mSentenceEventMentions; + + /** The list of all tokens in the document, sorted in textual order */ + private Vector mTokens; + + /** List of all sentences in the document */ + private List> mSentences; + + /** The raw byte document, no preprocessing */ + private String mRawBuffer; + + static Logger mLog = Logger.getLogger(AceReader.class.getName()); + + public AceDocument(String id) { + super(id); + + 
mEntities = Generics.newHashMap(); + mEntityMentions = Generics.newHashMap(); + mSentenceEntityMentions = new ArrayList>(); + + mRelations = Generics.newHashMap(); + mRelationMentions = Generics.newHashMap(); + mSentenceRelationMentions = new ArrayList>(); + + mEvents = Generics.newHashMap(); + mEventMentions = Generics.newHashMap(); + mSentenceEventMentions = new ArrayList>(); + + mTokens = new Vector(); + } + + public void setPrefix(String p) { + mPrefix = p; + setSource(mPrefix); + } + + public String getPrefix() { + return mPrefix; + } + + public void setSource(String p) { + if (p.indexOf("bc/") >= 0) + mSource = "broadcast conversation"; + else if (p.indexOf("bn/") >= 0) + mSource = "broadcast news"; + else if (p.indexOf("cts/") >= 0) + mSource = "telephone"; + else if (p.indexOf("nw/") >= 0) + mSource = "newswire"; + else if (p.indexOf("un/") >= 0) + mSource = "usenet"; + else if (p.indexOf("wl/") >= 0) + mSource = "weblog"; + else { + System.err.println("WARNING: Unknown source for doc: " + p); + mSource = "none"; + } + } + + public int getSentenceCount() { + return mSentenceEntityMentions.size(); + } + + public ArrayList getEntityMentions(int sent) { + return mSentenceEntityMentions.get(sent); + } + + public ArrayList> getAllEntityMentions() { + return mSentenceEntityMentions; + } + + public ArrayList getRelationMentions(int sent) { + return mSentenceRelationMentions.get(sent); + } + + public ArrayList> getAllRelationMentions() { + return mSentenceRelationMentions; + } + + public ArrayList getEventMentions(int sent) { + return mSentenceEventMentions.get(sent); + } + + public ArrayList> getAllEventMentions() { + return mSentenceEventMentions; + } + + public AceEntity getEntity(String id) { + return mEntities.get(id); + } + + public Set getKeySetEntities() { + return mEntities.keySet(); + } + + public void addEntity(AceEntity e) { + mEntities.put(e.getId(), e); + } + + public Map getEntityMentions() { + return mEntityMentions; + } + + public AceEntityMention 
getEntityMention(String id) { + return mEntityMentions.get(id); + } + + public void addEntityMention(AceEntityMention em) { + mEntityMentions.put(em.getId(), em); + } + + public AceRelation getRelation(String id) { + return mRelations.get(id); + } + + public void addRelation(AceRelation r) { + mRelations.put(r.getId(), r); + } + + public Map getRelationMentions() { + return mRelationMentions; + } + + public AceRelationMention getRelationMention(String id) { + return mRelationMentions.get(id); + } + + public void addRelationMention(AceRelationMention e) { + mRelationMentions.put(e.getId(), e); + } + + public AceEvent getEvent(String id) { + return mEvents.get(id); + } + + public void addEvent(AceEvent r) { + mEvents.put(r.getId(), r); + } + + public Map getEventMentions() { + return mEventMentions; + } + + public AceEventMention getEventMention(String id) { + return mEventMentions.get(id); + } + + public void addEventMention(AceEventMention e) { + mEventMentions.put(e.getId(), e); + } + + public void addToken(AceToken t) { + mTokens.add(t); + } + + public int getTokenCount() { + return mTokens.size(); + } + + public AceToken getToken(int i) { + return mTokens.get(i); + } + + public List getSentence(int index) { + return mSentences.get(index); + } + + public List> getSentences() { + return mSentences; + } + + public void setSentences(List> sentences) { + mSentences = sentences; + } + + public String toString() { + return toXml(0); + } + + public String toXml(int offset) { + StringBuffer buffer = new StringBuffer(); + appendOffset(buffer, offset); + buffer.append("\n"); + appendOffset(buffer, offset); + buffer.append("\n"); + appendOffset(buffer, offset); + buffer.append("\n"); + appendOffset(buffer, offset); + buffer.append("\n"); + + // display all entities + Set entKeys = mEntities.keySet(); + for (String key : entKeys) { + AceEntity e = mEntities.get(key); + buffer.append(e.toXml(offset)); + buffer.append("\n"); + } + + // display all relations + Set relKeys = 
mRelations.keySet(); + for (String key : relKeys) { + AceRelation r = mRelations.get(key); + if (!r.getType().equals(AceRelation.NIL_LABEL)) { + buffer.append(r.toXml(offset)); + buffer.append("\n"); + } + } + + // TODO: display all events + + appendOffset(buffer, offset); + buffer.append("\n"); + appendOffset(buffer, offset); + buffer.append("\n"); + return buffer.toString(); + } + + private String tokensWithByteSpan(int start, int end) { + StringBuffer buf = new StringBuffer(); + boolean doPrint = false; + buf.append("..."); + for (int i = 0; i < mTokens.size(); i++) { + // start printing + if (doPrint == false && mTokens.get(i).getByteOffset().start() > start - 20 + && mTokens.get(i).getByteOffset().end() < end) { + doPrint = true; + } + + // end printing + else if (doPrint == true && mTokens.get(i).getByteOffset().start() > end + 20) { + doPrint = false; + } + + if (doPrint) { + buf.append(" " + mTokens.get(i).display()); + } + } + buf.append("..."); + return buf.toString(); + } + + /** + * Matches all relevant mentions, i.e. entities and anchors, to tokens Note: + * entity mentions may match with multiple tokens! 
+ */ + public void matchCharSeqs(String filePrefix) { + // + // match the head and extent of entity mentions + // + Set keys = mEntityMentions.keySet(); + for (String key : keys) { + AceEntityMention m = mEntityMentions.get(key); + + // + // match the head charseq to 1+ phrase(s) + // + try { + m.getHead().match(mTokens); + } catch (MatchException e) { + mLog.severe("READER ERROR: Failed to match entity mention head: " + "[" + m.getHead().getText() + ", " + + m.getHead().getByteStart() + ", " + m.getHead().getByteEnd() + "]"); + mLog.severe("Document tokens: " + tokensWithByteSpan(m.getHead().getByteStart(), m.getHead().getByteEnd())); + mLog.severe("Document prefix: " + filePrefix); + System.exit(1); + } + + // + // match the extent charseq to 1+ phrase(s) + // + try { + m.getExtent().match(mTokens); + } catch (MatchException e) { + mLog.severe("READER ERROR: Failed to match entity mention extent: " + "[" + m.getExtent().getText() + ", " + + m.getExtent().getByteStart() + ", " + m.getExtent().getByteEnd() + "]"); + mLog.severe("Document tokens: " + tokensWithByteSpan(m.getExtent().getByteStart(), m.getExtent().getByteEnd())); + System.exit(1); + } + + // + // set the head word of the mention + // + m.detectHeadToken(this); + } + + // we need to do this for events as well since they may not have any AceEntityMentions associated with them (if they have no arguments) + Set eventKeys = mEventMentions.keySet(); + for (String key : eventKeys) { + AceEventMention m = mEventMentions.get(key); + + // + // match the extent charseq to 1+ phrase(s) + // + try { + m.getExtent().match(mTokens); + } catch (MatchException e) { + mLog.severe("READER ERROR: Failed to match event mention extent: " + "[" + m.getExtent().getText() + ", " + + m.getExtent().getByteStart() + ", " + m.getExtent().getByteEnd() + "]"); + mLog.severe("Document tokens: " + tokensWithByteSpan(m.getExtent().getByteStart(), m.getExtent().getByteEnd())); + System.exit(1); + } + } + } + + public static final 
String XML_EXT = ".apf.xml"; + public static final String ORIG_EXT = ".sgm"; + + /** + * Parses an ACE document. Works in the following steps: (a) reads both the + * XML annotations; (b) reads the tokens; (c) matches the tokens against the + * annotations (d) constructs mSentenceEntityMentions and + * mRelationEntityMentions + */ + public static AceDocument parseDocument(String prefix, boolean usePredictedBoundaries) throws java.io.IOException, + org.xml.sax.SAXException, javax.xml.parsers.ParserConfigurationException { + mLog.fine("Reading document " + prefix); + AceDocument doc = null; + + // + // read the ACE XML annotations + // + if (usePredictedBoundaries == false) { + doc = AceDomReader.parseDocument(new File(prefix + XML_EXT)); + // System.err.println("Parsed " + doc.getEntityMentions().size() + + // " entities in document " + prefix); + } + + // + // will use the predicted entity boundaries (see below) + // + else { + int lastSlash = prefix.lastIndexOf(File.separator); + assert (lastSlash > 0 && lastSlash < prefix.length() - 1); + String id = prefix.substring(lastSlash + 1); + // System.err.println(id + ": " + prefix); + doc = new AceDocument(id); + } + doc.setPrefix(prefix); + + // + // read the raw byte stream + // + String trueCasedFileName = prefix + ORIG_EXT + ".truecase"; + if((new File(trueCasedFileName).exists())){ + mLog.severe("Using truecased file: " + trueCasedFileName); + doc.readRawBytes(trueCasedFileName); + } else { + doc.readRawBytes(prefix + ORIG_EXT); + } + + // + // read the AceTokens + // + int offsetToSubtract = 0; + List> sentences = AceSentenceSegmenter.tokenizeAndSegmentSentences(prefix); + doc.setSentences(sentences); + for (List sentence : sentences) { + for (AceToken token : sentence) { + offsetToSubtract = token.adjustPhrasePositions(offsetToSubtract, token.getLiteral()); + doc.addToken(token); + } + } + + // + // match char sequences to phrases + // + doc.matchCharSeqs(prefix); + + // + // construct the mEntityMentions matrix 
+ // + Set entityKeys = doc.mEntityMentions.keySet(); + int sentence; + for (String key : entityKeys) { + AceEntityMention em = doc.mEntityMentions.get(key); + sentence = doc.mTokens.get(em.getHead().getTokenStart()).getSentence(); + + // adjust the number of rows if necessary + while (sentence >= doc.mSentenceEntityMentions.size()) { + doc.mSentenceEntityMentions.add(new ArrayList()); + doc.mSentenceRelationMentions.add(new ArrayList()); + doc.mSentenceEventMentions.add(new ArrayList()); + } + + // store the entity mentions in increasing order: + // (a) of the start position of their head + // (b) if start is the same, in increasing order of the head end + ArrayList sentEnts = doc.mSentenceEntityMentions.get(sentence); + boolean added = false; + for (int i = 0; i < sentEnts.size(); i++) { + AceEntityMention crt = sentEnts.get(i); + if ((crt.getHead().getTokenStart() > em.getHead().getTokenStart()) + || (crt.getHead().getTokenStart() == em.getHead().getTokenStart() && crt.getHead().getTokenEnd() > em + .getHead().getTokenEnd())) { + sentEnts.add(i, em); + added = true; + break; + } + } + if (!added) { + sentEnts.add(em); + } + } + + // + // construct the mRelationMentions matrix + // + Set relKeys = doc.mRelationMentions.keySet(); + for (String key : relKeys) { + AceRelationMention rm = doc.mRelationMentions.get(key); + sentence = doc.mTokens.get(rm.getArg(0).getHead().getTokenStart()).getSentence(); + + // + // no need to adjust the number of rows: was done above + // + + // store the relation mentions in increasing order + // (a) of the start position of their head, or + // (b) if start is the same, in increasing order of ends + ArrayList sentRels = doc.mSentenceRelationMentions.get(sentence); + boolean added = false; + for (int i = 0; i < sentRels.size(); i++) { + AceRelationMention crt = sentRels.get(i); + if ((crt.getMinTokenStart() > rm.getMinTokenStart()) + || (crt.getMinTokenStart() == rm.getMinTokenStart() && crt.getMaxTokenEnd() > rm.getMaxTokenEnd())) { 
+ sentRels.add(i, rm); + added = true; + break; + } + } + if (!added) { + sentRels.add(rm); + } + } + + // + // construct the mEventMentions matrix + // + Set eventKeys = doc.mEventMentions.keySet(); + for (String key : eventKeys) { + AceEventMention em = doc.mEventMentions.get(key); + sentence = doc.mTokens.get(em.getMinTokenStart()).getSentence(); + + /* + * adjust the number of rows if necessary -- if you're wondering why we do + * this here again, (after we've done it for entities) it's because we can + * have an event with no entities near the end of the document and thus + * won't have created rows in mSentence*Mentions + */ + while (sentence >= doc.mSentenceEntityMentions.size()) { + doc.mSentenceEntityMentions.add(new ArrayList()); + doc.mSentenceRelationMentions.add(new ArrayList()); + doc.mSentenceEventMentions.add(new ArrayList()); + } + + // store the event mentions in increasing order + // (a) first, event mentions with no arguments + // (b) then by the start position of their head, or + // (c) if start is the same, in increasing order of ends + ArrayList sentEvents = doc.mSentenceEventMentions.get(sentence); + boolean added = false; + for (int i = 0; i < sentEvents.size(); i++) { + AceEventMention crt = sentEvents.get(i); + if ((crt.getMinTokenStart() > em.getMinTokenStart()) + || (crt.getMinTokenStart() == em.getMinTokenStart() && crt.getMaxTokenEnd() > em.getMaxTokenEnd())) { + sentEvents.add(i, em); + added = true; + break; + } + } + if (!added) { + sentEvents.add(em); + } + } + + return doc; + } + + // + // heeyoung : skip relation, event parsing part - for ACE2004 + // + public static AceDocument parseDocument(String prefix, boolean usePredictedBoundaries, String AceVersion) throws java.io.IOException, + org.xml.sax.SAXException, javax.xml.parsers.ParserConfigurationException { + mLog.fine("Reading document " + prefix); + AceDocument doc = null; + + // + // read the ACE XML annotations + // + if (usePredictedBoundaries == false) { + doc = 
AceDomReader.parseDocument(new File(prefix + XML_EXT)); + // System.err.println("Parsed " + doc.getEntityMentions().size() + + // " entities in document " + prefix); + } + + // + // will use the predicted entity boundaries (see below) + // + else { + int lastSlash = prefix.lastIndexOf(File.separator); + assert (lastSlash > 0 && lastSlash < prefix.length() - 1); + String id = prefix.substring(lastSlash + 1); + // System.err.println(id + ": " + prefix); + doc = new AceDocument(id); + } + doc.setPrefix(prefix); + + // + // read the raw byte stream + // + String trueCasedFileName = prefix + ORIG_EXT + ".truecase"; + if((new File(trueCasedFileName).exists())){ + mLog.severe("Using truecased file: " + trueCasedFileName); + doc.readRawBytes(trueCasedFileName); + } else { + doc.readRawBytes(prefix + ORIG_EXT); + } + + // + // read the AceTokens + // + int offsetToSubtract = 0; + List> sentences = AceSentenceSegmenter.tokenizeAndSegmentSentences(prefix); + doc.setSentences(sentences); + for (List sentence : sentences) { + for (AceToken token : sentence) { + offsetToSubtract = token.adjustPhrasePositions(offsetToSubtract, token.getLiteral()); + doc.addToken(token); + } + } + + // + // match char sequences to phrases + // + doc.matchCharSeqs(prefix); + + // + // construct the mEntityMentions matrix + // + Set entityKeys = doc.mEntityMentions.keySet(); + int sentence; + for (String key : entityKeys) { + AceEntityMention em = doc.mEntityMentions.get(key); + sentence = doc.mTokens.get(em.getHead().getTokenStart()).getSentence(); + + // adjust the number of rows if necessary + while (sentence >= doc.mSentenceEntityMentions.size()) { + doc.mSentenceEntityMentions.add(new ArrayList()); + doc.mSentenceRelationMentions.add(new ArrayList()); + doc.mSentenceEventMentions.add(new ArrayList()); + } + + // store the entity mentions in increasing order: + // (a) of the start position of their head + // (b) if start is the same, in increasing order of the head end + ArrayList sentEnts = 
doc.mSentenceEntityMentions.get(sentence); + boolean added = false; + for (int i = 0; i < sentEnts.size(); i++) { + AceEntityMention crt = sentEnts.get(i); + if ((crt.getHead().getTokenStart() > em.getHead().getTokenStart()) + || (crt.getHead().getTokenStart() == em.getHead().getTokenStart() && crt.getHead().getTokenEnd() > em + .getHead().getTokenEnd())) { + sentEnts.add(i, em); + added = true; + break; + } + } + if (!added) { + sentEnts.add(em); + } + } + + return doc; + } + + + // TODO: never used? + public void constructSentenceRelationMentions() { + // + // construct the mRelationEntityMentions matrix + // + Set relKeys = mRelationMentions.keySet(); + for (String key : relKeys) { + AceRelationMention rm = mRelationMentions.get(key); + int sentence = mTokens.get(rm.getArg(0).getHead().getTokenStart()).getSentence(); + + // + // no need to adjust the number of rows: was done in parseDocument + // + + // store the relation mentions in increasing order + // (a) of the start position of their head, or + // (b) if start is the same, in increasing order of ends + ArrayList sentRels = mSentenceRelationMentions.get(sentence); + boolean added = false; + for (int i = 0; i < sentRels.size(); i++) { + AceRelationMention crt = sentRels.get(i); + if ((crt.getMinTokenStart() > rm.getMinTokenStart()) + || (crt.getMinTokenStart() == rm.getMinTokenStart() && crt.getMaxTokenEnd() > rm.getMaxTokenEnd())) { + sentRels.add(i, rm); + added = true; + break; + } + } + if (!added) { + sentRels.add(rm); + } + } + } + + /** + * Verifies if the two tokens are part of the same chunk + */ + public boolean sameChunk(int left, int right) { + for (int i = right; i > left; i--) { + String chunk = AceToken.OTHERS.get(getToken(i).getChunk()); + if (!chunk.startsWith("I-")) + return false; + String word = AceToken.WORDS.get(getToken(i).getWord()); + if (word.equals(",") || word.equals("(") || word.equals("-")) + return false; + } + String leftChunk = AceToken.OTHERS.get(getToken(left).getChunk()); 
+ if (leftChunk.equals("O")) + return false; + return true; + } + + public boolean isChunkHead(int pos) { + String next = AceToken.OTHERS.get(getToken(pos + 1).getChunk()); + if (next.startsWith("I-")) + return false; + return true; + } + + public int findChunkEnd(int pos) { + String crt = AceToken.OTHERS.get(getToken(pos).getChunk()); + if (crt.equals("O")) + return pos; + + for (pos = pos + 1; pos < getTokenCount(); pos++) { + crt = AceToken.OTHERS.get(getToken(pos).getChunk()); + if (!crt.startsWith("I-")) + break; + } + + return pos - 1; + } + + public int findChunkStart(int pos) { + String crt = AceToken.OTHERS.get(getToken(pos).getChunk()); + if (crt.equals("O") || crt.startsWith("B-")) + return pos; + + for (pos = pos - 1; pos >= 0; pos--) { + crt = AceToken.OTHERS.get(getToken(pos).getChunk()); + if (crt.startsWith("B-")) + break; + } + + return pos; + } + + public boolean isApposition(int left, int right) { + int leftEnd = findChunkEnd(left); + int rightStart = findChunkStart(right); + + if (rightStart == leftEnd + 1) + return true; + + if (rightStart == leftEnd + 2) { + String comma = AceToken.WORDS.get(getToken(leftEnd + 1).getWord()); + if (comma.equals(",") || comma.equals("-") || comma.equals("_")) { + return true; + } + } + + return false; + } + + public int countVerbs(int start, int end) { + int count = 0; + for (int i = start; i < end; i++) { + String crt = AceToken.OTHERS.get(getToken(i).getPos()); + if (crt.startsWith("VB")) + count++; + } + return count; + } + + public int countCommas(int start, int end) { + int count = 0; + for (int i = start; i < end; i++) { + String crt = AceToken.WORDS.get(getToken(i).getWord()); + if (crt.equals(",")) + count++; + } + return count; + } + + private void readRawBytes(String fileName) throws IOException { + BufferedReader in = new BufferedReader(new FileReader(fileName)); + StringBuffer buf = new StringBuffer(); + int c; + while ((c = in.read()) >= 0) + buf.append((char) c); + mRawBuffer = buf.toString(); + // 
System.out.println(mRawBuffer); + in.close(); + } + + @SuppressWarnings("unused") + private void readPredictedEntityBoundaries(BufferedReader is) throws java.io.IOException { + // System.out.println("Reading boundaries from file: " + mPrefix); + + // + // read Massi's B-ENT, I-ENT, or O labels + // + ArrayList labels = new ArrayList(); + String line; + while ((line = is.readLine()) != null) { + ArrayList tokens = SimpleTokenize.tokenize(line); + if (tokens.isEmpty() == false) + labels.add(tokens.get(0)); + } + assert (labels.size() == mTokens.size()); + + int entityId = 1; + + // + // traverse the label array and create entities as needed + // + for (int i = 0; i < labels.size(); i++) { + // System.out.println(labels.get(i)); + if (labels.get(i).startsWith("B-") || labels.get(i).startsWith("I-")) { // Massi's + // ents + // may + // start + // with + // I-ENT + int startToken = i; + int endToken = i + 1; + while (endToken < labels.size() && labels.get(endToken).startsWith("I-")) + endToken++; + + // + // Set the type/subtype to whatever Massi predicted + // This is not directly used in this system. It is needed only + // to generate the APF files with Massi info, which are needed + // by Edgar. Otherwise type/subtype could be safely set to "none". 
+ // + String label = labels.get(startToken); + int dash = label.indexOf("-", 2); + if (dash <= 2 || dash >= label.length()) { + throw new RuntimeException(label); + } + assert (dash > 2 && dash < label.length() - 1); + String type = label.substring(2, dash); + String subtype = label.substring(dash + 1); + /* + * String type = "none"; String subtype = "none"; + */ + + // create a new entity between [startToken, endToken) + makeEntity(startToken, endToken, entityId, type, subtype); + + // skip over this entity + i = endToken - 1; + entityId++; + } else { + assert (labels.get(i).equals("O")); + } + } + } + + public AceCharSeq makeCharSeq(int startToken, int endToken) { + /* + * StringBuffer buf = new StringBuffer(); for(int i = startToken; i < + * endToken; i ++){ if(i > startToken) buf.append(" "); + * buf.append(mTokens.get(i).getLiteral()); } + */ + startToken = Math.max(0, startToken); + while (mTokens.get(startToken).getByteStart() < 0) + // SGML token + startToken++; + endToken = Math.min(endToken, mTokens.size()); + while (mTokens.get(endToken - 1).getByteStart() < 0) + // SGML token + endToken--; + assert (endToken > startToken); + + String text = mRawBuffer.substring(mTokens.get(startToken).getRawByteStart(), mTokens.get(endToken - 1) + .getRawByteEnd()); + + /* + * if(mTokens.get(startToken).getByteStart() > mTokens.get(endToken - + * 1).getByteEnd() - 1){ for(int i = startToken; i < endToken; i ++){ + * System.out.println("Token: " + mTokens.get(i).display()); } } + */ + return new AceCharSeq(text, // buf.toString(), + mTokens.get(startToken).getByteStart(), mTokens.get(endToken - 1).getByteEnd() - 1); + } + + /** Makes an ACE entity from the span [startToken, endToken) */ + private void makeEntity(int startToken, int endToken, int id, String type, String subtype) { + String eid = mId + "-E" + id; + AceEntity ent = new AceEntity(eid, type, subtype, "SPC"); + addEntity(ent); + + AceCharSeq cseq = makeCharSeq(startToken, endToken); + String emid = mId + "-E" 
+ id + "-1"; + AceEntityMention entm = new AceEntityMention(emid, "NOM", "NOM", cseq, cseq); + addEntityMention(entm); + ent.addMention(entm); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceDomReader.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceDomReader.java new file mode 100644 index 0000000..136d944 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceDomReader.java @@ -0,0 +1,243 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +import edu.stanford.nlp.ie.machinereading.common.DomReader; + +/** + * DOM reader for an ACE specification. 
+ * + * @author David McClosky + */ +public class AceDomReader extends DomReader { + + private static AceCharSeq parseCharSeq(Node node) { + Node child = getChildByName(node, "charseq"); + String start = getAttributeValue(child, "START"); + String end = getAttributeValue(child, "END"); + String text = child.getFirstChild().getNodeValue(); + return new AceCharSeq(text, + Integer.parseInt(start), + Integer.parseInt(end)); + } + + /** + * Extracts one entity mention + */ + private static AceEntityMention parseEntityMention(Node node) { + String id = getAttributeValue(node, "ID"); + String type = getAttributeValue(node, "TYPE"); + String ldctype = getAttributeValue(node, "LDCTYPE"); + AceCharSeq extent = parseCharSeq(getChildByName(node, "extent")); + AceCharSeq head = parseCharSeq(getChildByName(node, "head")); + return (new AceEntityMention(id, type, ldctype, extent, head)); + } + + /** + * Extracts info about one relation mention + */ + private static AceRelationMention parseRelationMention(Node node, + AceDocument doc) { + String id = getAttributeValue(node, "ID"); + AceCharSeq extent = parseCharSeq(getChildByName(node, "extent")); + String lc = getAttributeValue(node, "LEXICALCONDITION"); + + // create the mention + AceRelationMention mention = new AceRelationMention(id, extent, lc); + + // find the mention args + List args = getChildrenByName(node, "relation_mention_argument"); + for(Node arg: args){ + String role = getAttributeValue(arg, "ROLE"); + String refid = getAttributeValue(arg, "REFID"); + AceEntityMention am = doc.getEntityMention(refid); + + if(am != null){ + am.addRelationMention(mention); + if(role.equalsIgnoreCase("arg-1")){ + mention.getArgs()[0] = new AceRelationMentionArgument(role, am); + } else if(role.equalsIgnoreCase("arg-2")){ + mention.getArgs()[1] = new AceRelationMentionArgument(role, am); + } else { + throw new RuntimeException("Invalid relation mention argument role: " + role); + } + } + } + + return mention; + } + + /** + * Extracts 
info about one relation mention + */ + private static AceEventMention parseEventMention(Node node, + AceDocument doc) { + String id = getAttributeValue(node, "ID"); + AceCharSeq extent = parseCharSeq(getChildByName(node, "extent")); + AceCharSeq anchor = parseCharSeq(getChildByName(node, "anchor")); + + // create the mention + AceEventMention mention = new AceEventMention(id, extent, anchor); + + // find the mention args + List args = getChildrenByName(node, "event_mention_argument"); + for (Node arg : args) { + String role = getAttributeValue(arg, "ROLE"); + String refid = getAttributeValue(arg, "REFID"); + AceEntityMention am = doc.getEntityMention(refid); + + if(am != null){ + am.addEventMention(mention); + mention.addArg(am, role); + } + } + + return mention; + } + + /** + * Parses one ACE specification + * @return Simply displays the events to stdout + */ + public static AceDocument parseDocument(File f) + throws IOException, SAXException, ParserConfigurationException { + + // parse the Dom document + Document document = readDocument(f); + + // + // create the ACE document object + // + Node docElement = document.getElementsByTagName("document").item(0); + AceDocument aceDoc = + new AceDocument(getAttributeValue(docElement, "DOCID")); + + // + // read all entities + // + NodeList entities = document.getElementsByTagName("entity"); + int entityCount = 0; + for(int i = 0; i < entities.getLength(); i ++){ + Node node = entities.item(i); + + // + // the entity type and subtype + // + String id = getAttributeValue(node, "ID"); + String type = getAttributeValue(node, "TYPE"); + String subtype = getAttributeValue(node, "SUBTYPE"); + String cls = getAttributeValue(node, "CLASS"); + + // create the entity + AceEntity entity = new AceEntity(id, type, subtype, cls); + aceDoc.addEntity(entity); + + // fetch all mentions of this event + List mentions = getChildrenByName(node, "entity_mention"); + + // parse all its mentions + for (Node mention1 : mentions) { + 
AceEntityMention mention = parseEntityMention(mention1); + entity.addMention(mention); + aceDoc.addEntityMention(mention); + } + + entityCount++; + } + //System.err.println("Parsed " + entityCount + " XML entities."); + + // + // read all relations + // + NodeList relations = document.getElementsByTagName("relation"); + for(int i = 0; i < relations.getLength(); i ++){ + Node node = relations.item(i); + + // + // the relation type, subtype, tense, and modality + // + String id = getAttributeValue(node, "ID"); + String type = getAttributeValue(node, "TYPE"); + String subtype = getAttributeValue(node, "SUBTYPE"); + String modality = getAttributeValue(node, "MODALITY"); + String tense = getAttributeValue(node, "TENSE"); + + // create the relation + AceRelation relation = new AceRelation(id, type, subtype, + modality, tense); + aceDoc.addRelation(relation); + + // XXX: fetch relation_arguments here! + + // fetch all mentions of this relation + List mentions = getChildrenByName(node, "relation_mention"); + + // traverse all mentions + for (Node mention1 : mentions) { + AceRelationMention mention = parseRelationMention(mention1, aceDoc); + relation.addMention(mention); + aceDoc.addRelationMention(mention); + } + } + + // + // read all events + // + NodeList events = document.getElementsByTagName("event"); + for(int i = 0; i < events.getLength(); i ++){ + Node node = events.item(i); + + // + // the event type, subtype, tense, and modality + // + String id = getAttributeValue(node, "ID"); + String type = getAttributeValue(node, "TYPE"); + String subtype = getAttributeValue(node, "SUBTYPE"); + String modality = getAttributeValue(node, "MODALITY"); + String polarity = getAttributeValue(node, "POLARITY"); + String genericity = getAttributeValue(node, "GENERICITY"); + String tense = getAttributeValue(node, "TENSE"); + + // create the event + AceEvent event = new AceEvent(id, type, subtype, + modality, polarity, genericity, tense); + aceDoc.addEvent(event); + + // fetch all 
mentions of this relation + List mentions = getChildrenByName(node, "event_mention"); + + // traverse all mentions + for (Node mention1 : mentions) { + AceEventMention mention = parseEventMention(mention1, aceDoc); + event.addMention(mention); + aceDoc.addEventMention(mention); + } + } + + return aceDoc; + } + + public static void main(String [] argv) throws Exception { + if (argv.length != 1) { + System.err.println("Usage: java AceDomReader "); + System.exit(1); + } + + File f = new File(argv[0]); + AceDocument doc = parseDocument(f); + System.out.println("Processed ACE document:\n" + doc); + ArrayList> r = doc.getAllRelationMentions(); + System.out.println("size: " + r.size()); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceElement.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceElement.java new file mode 100644 index 0000000..8832f90 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceElement.java @@ -0,0 +1,23 @@ +/** + * Base class for all ACE annotation elements + */ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +public class AceElement { + /** Unique identifier for this element */ + protected String mId; + + public AceElement(String id) { + mId = id; + } + + public String getId() { return mId; } + + public static void appendOffset(StringBuffer buffer, + int offset) { + for(int i = 0; i < offset; i ++){ + buffer.append(" "); + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEntity.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEntity.java new file mode 100644 index 0000000..d6181dd --- /dev/null +++ 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEntity.java @@ -0,0 +1,61 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.util.List; +import java.util.ArrayList; + +/** + * Implements the ACE construct + */ +public class AceEntity extends AceElement { + private String mType; + + private String mSubtype; + + private String mClass; + + private List mMentions; + + public AceEntity(String id, + String type, + String subtype, + String cls) { + super(id); + mType = type; + mSubtype = subtype; + mClass = cls; + mMentions = new ArrayList(); + } + + public void addMention(AceEntityMention m) { + mMentions.add(m); + m.setParent(this); + } + public List getMentions() { return mMentions; } + + public String getType() { return mType; } + public void setType(String s) { mType = s; } + public String getSubtype() { return mSubtype; } + public void setSubtype(String s) { mSubtype = s; } + public void setClass(String s) { mClass = s; } + public String getClasss() { return mClass; } + + public String toXml(int offset) { + StringBuffer buffer = new StringBuffer(); + appendOffset(buffer, offset); + buffer.append("\n"); + + for(AceEntityMention m: mMentions){ + buffer.append(m.toXml(offset + 2)); + buffer.append("\n"); + } + + appendOffset(buffer, offset); + buffer.append(""); + return buffer.toString(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEntityMention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEntityMention.java new file mode 100644 index 0000000..53fd58e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEntityMention.java @@ -0,0 +1,158 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.util.ArrayList; +import 
java.util.List; + +/** + * Implements the ACE construct + */ +public class AceEntityMention extends AceMention { + @Override + public String toString() { + return "AceEntityMention [mHead=" + mHead + ", mLdctype=" + mLdctype + + ", mType=" + mType + "]"; + } + + private String mType; + + private String mLdctype; + + private AceCharSeq mHead; + + /** Position of the head word of this mention */ + private int mHeadTokenPosition; + + /** The parent entity */ + private AceEntity mParent; + + /** The set of relation mentions that contain this entity mention */ + private List mRelationMentions; + + /** The set of event mentions that contain this entity mention */ + private List mEventMentions; + + public AceEntityMention(String id, + String type, + String ldctype, + AceCharSeq extent, + AceCharSeq head) { + super(id, extent); + mType = type; + mLdctype = ldctype; + mHead = head; + mExtent = extent; + mHeadTokenPosition = -1; + mParent = null; + mRelationMentions = new ArrayList(); + mEventMentions = new ArrayList(); + } + + public String getMention() { return mType; } + + public void setParent(AceEntity e) { mParent = e; } + public AceEntity getParent() { return mParent; } + + public AceCharSeq getHead() { return mHead; } + public AceCharSeq getExtent() { return mExtent; } + public int getHeadTokenPosition() { return mHeadTokenPosition; } + + public void setType(String s) { mType = s; } + public String getType() { return mType; } + public void setLdctype(String s) { mLdctype = s; } + public String getLdctype() { return mLdctype; } + + public void addRelationMention(AceRelationMention rm) { + mRelationMentions.add(rm); + } + public List getRelationMentions() { + return mRelationMentions; + } + + public void addEventMention(AceEventMention rm) { + mEventMentions.add(rm); + } + public List getEventMentions() { + return mEventMentions; + } + + public String toXml(int offset) { + StringBuffer buffer = new StringBuffer(); + String mentionType = mType; + + appendOffset(buffer, 
offset); + buffer.append("\n"); + + buffer.append(mExtent.toXml("extent", offset + 2)); + buffer.append("\n"); + buffer.append(mHead.toXml("head", offset + 2)); + buffer.append("\n"); + + appendOffset(buffer, offset); + buffer.append(""); + + if(mentionType.equals("NAM")){ + // XXX: should be in Entity.toXml() + buffer.append("\n"); + appendOffset(buffer, offset); + buffer.append("\n"); + + appendOffset(buffer, offset + 2); + buffer.append("\n"); + buffer.append(mHead.toXml(offset + 4) + "\n"); + appendOffset(buffer, offset + 2); + buffer.append("\n"); + + appendOffset(buffer, offset); + buffer.append(""); + } + + return buffer.toString(); + } + + private static boolean contains(ArrayList set, + int elem) { + for(int i = 0; i < set.size(); i ++){ + if(elem == set.get(i)) return true; + } + return false; + } + + /** + * Detects the head word of this mention + * Heuristic: + * (a) the last token in mHead, if there are no prepositions + * (b) the last word before the first preposition + * Note: the mHead must be already matched against tokens! 
+ */ + public void detectHeadToken(AceDocument doc) { + ArrayList preps = new ArrayList(); + preps.add(AceToken.OTHERS.get("IN")); + + for(int i = mHead.getTokenStart(); i <= mHead.getTokenEnd(); i ++){ + // found a prep + if(contains(preps, doc.getToken(i).getPos()) && + i > mHead.getTokenStart()){ + mHeadTokenPosition = i - 1; + return; + } + } + + // set as the last word in mHead + mHeadTokenPosition = mHead.getTokenEnd(); + } + + /** Verifies if this mention appears before the parameter in textual order */ + public boolean before(AceEntityMention em) { + if(mHead.getByteEnd() < em.mHead.getByteStart()) return true; + return false; + } + + /** Verifies if this mention appears after the parameter in textual order */ + public boolean after(AceEntityMention em) { + if(mHead.getByteStart() > em.mHead.getByteEnd()) return true; + return false; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEvent.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEvent.java new file mode 100644 index 0000000..ef4f4fe --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEvent.java @@ -0,0 +1,69 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.util.List; +import java.util.ArrayList; + +/** + * Stores one ACE event + */ +public class AceEvent extends AceElement { + private String mType; + + private String mSubtype; + + private String mModality; + + private String mPolarity; + + private String mGenericity; + + private String mTense; + + /** The list of mentions for this event */ + private List mMentions; + + public static final String NIL_LABEL = "nil"; + + public AceEvent(String id, + String type, + String subtype, + String modality, + String polarity, + String genericity, + String tense) { + super(id); + mType = type; + mSubtype = 
subtype; + mModality = modality; + mPolarity = polarity; + mGenericity = genericity; + mTense = tense; + mMentions = new ArrayList(); + } + + public void addMention(AceEventMention m) { + mMentions.add(m); + m.setParent(this); + } + + public AceEventMention getMention(int which) { + return mMentions.get(which); + } + public int getMentionCount() { return mMentions.size(); } + + public String getType() { return mType; } + public void setType(String s) { mType = s; } + public String getSubtype() { return mSubtype; } + public void setSubtype(String s) { mSubtype = s; } + public String getModality() { return mModality; } + public void setModality(String modality) { this.mModality = modality; } + public String getmPolarity() { return mPolarity; } + public void setmPolarity(String mPolarity) { this.mPolarity = mPolarity; } + public String getGenericity() { return mGenericity; } + public void setGenericity(String genericity) { this.mGenericity = genericity; } + public String getTense() { return mTense; } + public void setTense(String tense) { this.mTense = tense; } + + // TODO: didn't implement toXml +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEventMention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEventMention.java new file mode 100644 index 0000000..b347915 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEventMention.java @@ -0,0 +1,120 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.util.Collection; +import java.util.Map; +import java.util.Set; + +import edu.stanford.nlp.util.Generics; + +/** + * Stores one ACE event mention + */ +public class AceEventMention extends AceMention { + + /** Maps argument roles to argument mentions */ + private Map mRolesToArguments; + + /** 
the parent event */ + private AceEvent mParent; + + /** anchor text for this event */ + private AceCharSeq mAnchor; + + public AceEventMention(String id, AceCharSeq extent, AceCharSeq anchor) { + super(id, extent); + mRolesToArguments = Generics.newHashMap(); + this.mAnchor = anchor; + } + + @Override + public String toString() { + return "AceEventMention [mAnchor=" + mAnchor + ", mParent=" + mParent + + ", mRolesToArguments=" + mRolesToArguments + ", mExtent=" + mExtent + + ", mId=" + mId + "]"; + } + + public Collection getArgs() { + return mRolesToArguments.values(); + } + + public Set getRoles() { + return mRolesToArguments.keySet(); + } + + public AceEntityMention getArg(String role) { + return mRolesToArguments.get(role).getContent(); + } + + public void addArg(AceEntityMention em, String role) { + mRolesToArguments.put(role, new AceEventMentionArgument(role, em)); + } + + public void setParent(AceEvent e) { + mParent = e; + } + + public AceEvent getParent() { + return mParent; + } + + public void setAnchor(AceCharSeq anchor) { + mAnchor = anchor; + } + + public AceCharSeq getAnchor() { + return mAnchor; + } + + /** Fetches the id of the sentence that contains this mention */ + // TODO disabled until we tie in sentence boundaries + // public int getSentence(AceDocument doc) { + // return doc.getToken(getArg(0).getHead().getTokenStart()).getSentence(); + // } + + /** + * Returns the smallest start of all argument heads (or the beginning of the + * mention's extent if there are no arguments) + */ + public int getMinTokenStart() { + Collection args = getArgs(); + int earliestTokenStart = -1; + for (AceEventMentionArgument arg : args) { + int tokenStart = arg.getContent().getHead().getTokenStart(); + if (earliestTokenStart == -1) + earliestTokenStart = tokenStart; + else + earliestTokenStart = Math.min(earliestTokenStart, tokenStart); + } + + // this will happen when we have no arguments + if (earliestTokenStart == -1) + return mExtent.getTokenStart(); + + return 
earliestTokenStart; + } + + /** + * Returns the largest start of all argument heads (or the beginning of the + * mention's extent if there are no arguments) + */ + public int getMaxTokenEnd() { + Collection args = getArgs(); + int latestTokenStart = -1; + for (AceEventMentionArgument arg : args) { + int tokenStart = arg.getContent().getHead().getTokenStart(); + if (latestTokenStart == -1) + latestTokenStart = tokenStart; + else + latestTokenStart = Math.max(latestTokenStart, tokenStart); + } + + // this will happen when we have no arguments + if (latestTokenStart == -1) + return mExtent.getTokenStart(); + + return latestTokenStart; + } + + // TODO: toXml method +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEventMentionArgument.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEventMentionArgument.java new file mode 100644 index 0000000..1ed7ffc --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceEventMentionArgument.java @@ -0,0 +1,9 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +public class AceEventMentionArgument extends AceMentionArgument { + public AceEventMentionArgument(String role, + AceEntityMention content) { + super(role, content, "event"); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceMention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceMention.java new file mode 100644 index 0000000..583c382 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceMention.java @@ -0,0 +1,19 @@ +/** + * Superclass for all ACE mentions (events, entities, values, etc) + */ + +package 
edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +public class AceMention extends AceElement { + protected AceCharSeq mExtent; + + protected AceMention(String id, + AceCharSeq extent) { + super(id); + mExtent = extent; + } + + public AceCharSeq getExtent() { return mExtent; } + + public String toXml(int offset) { return ""; } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceMentionArgument.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceMentionArgument.java new file mode 100644 index 0000000..7c98328 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceMentionArgument.java @@ -0,0 +1,46 @@ +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +public class AceMentionArgument { + + final protected String mRole; + final protected AceEntityMention mContent; + final private String mentionType; // in practice, event or relation + + public AceMentionArgument(String role, + AceEntityMention content, String mentionType) { + mRole = role; + mContent = content; + this.mentionType = mentionType; + } + + public AceEntityMention getContent() { return mContent; } + + public String getRole() { return mRole; } + + public String toXml(int offset) { + StringBuffer buffer = new StringBuffer(); + AceElement.appendOffset(buffer, offset); + buffer.append("<" + mentionType + "_mention_argument REFID=\"" + mContent.getId() + + "\" ROLE=\"" + mRole + "\">\n"); + + + //buffer.append(getContent().toXml(offset + 2)); + AceCharSeq ext = getContent().getExtent(); + buffer.append(ext.toXml("extent", offset + 2)); + buffer.append("\n"); + + AceElement.appendOffset(buffer, offset); + buffer.append(""); + return buffer.toString(); + } + + public String toXmlShort(int offset) { + StringBuffer buffer = new StringBuffer(); + AceElement.appendOffset(buffer, 
offset); + buffer.append("<" + mentionType + "_argument REFID=\"" + + mContent.getParent().getId() + + "\" ROLE=\"" + mRole + "\"/>"); + return buffer.toString(); + } + +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelation.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelation.java new file mode 100644 index 0000000..05c062b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelation.java @@ -0,0 +1,78 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.util.List; +import java.util.ArrayList; + +/** + * Stores one ACE relation + */ +public class AceRelation extends AceElement { + private String mType; + + private String mSubtype; + + private String mModality; + + private String mTense; + + /** The list of mentions for this event */ + private List mMentions; + + public static final String NIL_LABEL = "nil"; + + public AceRelation(String id, + String type, + String subtype, + String modality, + String tense) { + super(id); + mType = type; + mSubtype = subtype; + mModality = modality; + mTense = tense; + mMentions = new ArrayList(); + } + + public void addMention(AceRelationMention m) { + mMentions.add(m); + m.setParent(this); + } + + public AceRelationMention getMention(int which) { + return mMentions.get(which); + } + public int getMentionCount() { return mMentions.size(); } + + public String getType() { return mType; } + public void setType(String s) { mType = s; } + public String getSubtype() { return mSubtype; } + public void setSubtype(String s) { mSubtype = s; } + + public String toXml(int offset) { + StringBuffer buffer = new StringBuffer(); + appendOffset(buffer, offset); + buffer.append("\n"); + + AceRelationMentionArgument arg1 = mMentions.get(0).getArgs()[0]; + 
AceRelationMentionArgument arg2 = mMentions.get(0).getArgs()[1]; + if(arg1.getRole().equals("Arg-1")){ // left to right + buffer.append(arg1.toXmlShort(offset + 2) + "\n"); + buffer.append(arg2.toXmlShort(offset + 2) + "\n"); + } else { // right to left + buffer.append(arg2.toXmlShort(offset + 2) + "\n"); + buffer.append(arg1.toXmlShort(offset + 2) + "\n"); + } + + for(AceRelationMention m: mMentions){ + buffer.append(m.toXml(offset + 2)); + buffer.append("\n"); + } + + appendOffset(buffer, offset); + buffer.append(""); + return buffer.toString(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelationMention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelationMention.java new file mode 100644 index 0000000..ea4f62a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelationMention.java @@ -0,0 +1,101 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + + +/** + * Stores one ACE relation mention + */ +public class AceRelationMention extends AceMention { + private String mLexicalCondition; + + /** The two argument mentions */ + private AceRelationMentionArgument [] mArguments; + + /** the parent event */ + private AceRelation mParent; + + public AceRelationMention(String id, + AceCharSeq extent, + String lc) { + super(id, extent); + mLexicalCondition = lc; + mArguments = new AceRelationMentionArgument[2]; + } + + public AceRelationMentionArgument [] getArgs() { return mArguments; } + public AceEntityMention getArg(int which) { + return mArguments[which].getContent(); + } + public void setArg(int which, + AceEntityMention em, + String role) { + mArguments[which] = new AceRelationMentionArgument(role, em); + } + + /** Retrieves the argument that appears *first* in the sentence */ + public AceEntityMention getFirstArg() 
{ + if(getArg(0).getHead().getTokenStart() <= + getArg(1).getHead().getTokenStart()){ + return getArg(0); + } + return getArg(1); + } + /** Retrieves the argument that appears *last* in the sentence */ + public AceEntityMention getLastArg() { + if(getArg(0).getHead().getTokenStart() > + getArg(1).getHead().getTokenStart()){ + return getArg(0); + } + return getArg(1); + } + + public void setParent(AceRelation e) { mParent = e; } + public AceRelation getParent() { return mParent; } + + public String getLexicalCondition() { return mLexicalCondition; } + + /** Fetches the id of the sentence that contains this mention */ + public int getSentence(AceDocument doc) { + return doc.getToken(getArg(0).getHead().getTokenStart()).getSentence(); + } + + /** Returns the smallest start of the two args heads */ + public int getMinTokenStart() { + int s1 = getArg(0).getHead().getTokenStart(); + int s2 = getArg(1).getHead().getTokenStart(); + return Math.min(s1, s2); + } + + /** Returns the largest end of the two args heads */ + public int getMaxTokenEnd() { + int s1 = getArg(0).getHead().getTokenEnd(); + int s2 = getArg(1).getHead().getTokenEnd(); + return Math.max(s1, s2); + } + + public String toXml(int offset) { + StringBuffer buffer = new StringBuffer(); + appendOffset(buffer, offset); + buffer.append("\n"); + + buffer.append(mExtent.toXml("extent", offset + 2)); + buffer.append("\n"); + + AceRelationMentionArgument arg1 = getArgs()[0]; + AceRelationMentionArgument arg2 = getArgs()[1]; + if(arg1.getRole().equals("Arg-1")){ // left to right + buffer.append(arg1.toXml(offset + 2) + "\n"); + buffer.append(arg2.toXml(offset + 2) + "\n"); + } else { // right to left + buffer.append(arg2.toXml(offset + 2) + "\n"); + buffer.append(arg1.toXml(offset + 2) + "\n"); + } + + appendOffset(buffer, offset); + buffer.append(""); + return buffer.toString(); + } +} diff --git 
a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelationMentionArgument.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelationMentionArgument.java new file mode 100644 index 0000000..c61c817 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceRelationMentionArgument.java @@ -0,0 +1,9 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +public class AceRelationMentionArgument extends AceMentionArgument { + public AceRelationMentionArgument(String role, + AceEntityMention content) { + super(role, content, "relation"); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceSentenceSegmenter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceSentenceSegmenter.java new file mode 100644 index 0000000..0d5a056 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceSentenceSegmenter.java @@ -0,0 +1,111 @@ +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import edu.stanford.nlp.ie.machinereading.common.DomReader; +import edu.stanford.nlp.ie.machinereading.domains.ace.reader.RobustTokenizer.WordToken; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.util.Generics; + +public class AceSentenceSegmenter extends DomReader { + // list of tokens which mark sentence boundaries + private final static String[] sentenceFinalPunc = new String[] { ".", "!", + "?" 
}; + private static Set sentenceFinalPuncSet = Generics.newHashSet(); + + static { + // set up sentenceFinalPuncSet + for (int i = 0; i < sentenceFinalPunc.length; i++) + sentenceFinalPuncSet.add(sentenceFinalPunc[i]); + } + + /** + * @param filenamePrefix + * path to an ACE .sgm file (but not including the .sgm extension) + */ + public static List> tokenizeAndSegmentSentences(String filenamePrefix) + throws IOException, SAXException, ParserConfigurationException { + + List> sentences = new ArrayList>(); + File inputFile = new File(filenamePrefix + AceDocument.ORIG_EXT); + String input =IOUtils.slurpFile(inputFile); + + // now we can split the text into tokens + RobustTokenizer tokenizer = new RobustTokenizer(input); + List tokenList = tokenizer.tokenizeToWordTokens(); + + // and group the tokens into sentences + ArrayList currentSentence = new ArrayList(); + int quoteCount = 0; + for (int i = 0; i < tokenList.size(); i ++){ + WordToken token = tokenList.get(i); + String tokenText = token.getWord(); + AceToken convertedToken = wordTokenToAceToken(token, sentences.size()); + + // start a new sentence if we skipped 2+ lines (after datelines, etc.) 
+ // or we hit some SGML + // if (token.getNewLineCount() > 1 || AceToken.isSgml(tokenText)) { + if(AceToken.isSgml(tokenText)) { + if (currentSentence.size() > 0) sentences.add(currentSentence); + currentSentence = new ArrayList(); + quoteCount = 0; + } + + currentSentence.add(convertedToken); + if(tokenText.equals("\"")) quoteCount ++; + + // start a new sentence whenever we hit sentence-final punctuation + if (sentenceFinalPuncSet.contains(tokenText)){ + // include quotes after EOS + if(i < tokenList.size() - 1 && quoteCount % 2 == 1 && tokenList.get(i + 1).getWord().equals("\"")){ + AceToken quoteToken = wordTokenToAceToken(tokenList.get(i + 1), sentences.size()); + currentSentence.add(quoteToken); + quoteCount ++; + i ++; + } + if (currentSentence.size() > 0) sentences.add(currentSentence); + currentSentence = new ArrayList(); + quoteCount = 0; + } + + // start a new sentence when we hit an SGML tag + else if(AceToken.isSgml(tokenText)) { + if (currentSentence.size() > 0) sentences.add(currentSentence); + currentSentence = new ArrayList(); + quoteCount = 0; + } + } + + return sentences; + } + + public static AceToken wordTokenToAceToken(WordToken wordToken, int sentence) { + return new AceToken(wordToken.getWord(), "", "", "", "", Integer + .toString(wordToken.getStart()), Integer.toString(wordToken.getEnd()), + sentence); + } + + // simple testing code + public static void main(String[] args) throws IOException, SAXException, + ParserConfigurationException { + String testFilename = "/home/mcclosky/data/ACE2005/English/wl/timex2norm/AGGRESSIVEVOICEDAILY_20041101.1144"; + // testFilename = + // "/home/mcclosky/data/ACE2005/English/bc/timex2norm/CNN_CF_20030303.1900.02"; + // testFilename = + // "/home/mcclosky/data/ACE2005/English/un/timex2norm/alt.atheism_20041104.2428"; + testFilename = "/home/mcclosky/data/ACE2005/English/nw/timex2norm/AFP_ENG_20030502.0614"; + + List> sentences = tokenizeAndSegmentSentences(testFilename); + for (List sentence : sentences) + 
System.out.println("s: [" + sentence + "]"); + } +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceToken.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceToken.java new file mode 100644 index 0000000..a07bcef --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceToken.java @@ -0,0 +1,495 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.util.ArrayList; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.ie.machinereading.common.SimpleTokenize; +import edu.stanford.nlp.ie.machinereading.common.StringDictionary; +import edu.stanford.nlp.trees.Span; +import edu.stanford.nlp.util.Generics; + +public class AceToken { + /** + * The actual token bytes + * Normally we work with mWord (see below), but mLiteral is needed when + * we need to check if a sequence of tokens exists in a gazetteer + */ + private String mLiteral; + + /** The index of the literal in the WORDS hash */ + private int mWord; + + /** Case of mWord */ + private int mCase; + + /** Suffixes of mWord */ + private int[] mSuffixes; + + private int mLemma; + + private int mPos; + + private int mChunk; + + private int mNerc; + + private Span mByteOffset; + + /** Raw byte offset in the SGM doc */ + private Span mRawByteOffset; + + private int mSentence; + + /** Entity class from Massi */ + private String mMassiClass; + /** Entity label from the BBN corpus */ + private String mMassiBbn; + /** WordNet super-senses detected by Massi */ + private String mMassiWnss; + + /** Dictionary for all words in the corpus */ + public static final StringDictionary WORDS; + + /** Dictionary for all lemmas in 
the corpus */ + public static final StringDictionary LEMMAS; + + /** Dictionary for all other strings in the corpus */ + public static final StringDictionary OTHERS; + + /** Map of all proximity classes */ + public static final Map> PROX_CLASSES; + /** How many elements per proximity class */ + private static final int PROXIMITY_CLASS_SIZE = 5; + + /** The location gazetteer */ + private static Map LOC_GAZ = null; + + /** The person first name dictionary */ + private static Map FIRST_GAZ = null; + + /** The person last name dictionary */ + private static Map LAST_GAZ = null; + + /** List of trigger words */ + private static Map TRIGGER_GAZ = null; + + private final static Pattern SGML_PATTERN; + + static { + WORDS = new StringDictionary("words"); + LEMMAS = new StringDictionary("lemmas"); + OTHERS = new StringDictionary("others"); + WORDS.setMode(true); + LEMMAS.setMode(true); + OTHERS.setMode(true); + PROX_CLASSES = Generics.newHashMap(); + + SGML_PATTERN = Pattern.compile("<[^<>]+>"); + } + + public static void loadGazetteers(String dataPath) throws java.io.FileNotFoundException, java.io.IOException { + + System.err.print("Loading location gazetteer... "); + LOC_GAZ = Generics.newHashMap(); + loadDictionary(LOC_GAZ, dataPath + File.separator + "world_small.gaz.nonambiguous"); + System.err.println("done."); + + System.err.print("Loading first-name gazetteer... "); + FIRST_GAZ = Generics.newHashMap(); + loadDictionary(FIRST_GAZ, dataPath + File.separator + "per_first.gaz"); + System.err.println("done."); + + System.err.print("Loading last-name gazetteer... "); + LAST_GAZ = Generics.newHashMap(); + loadDictionary(LAST_GAZ, dataPath + File.separator + "per_last.gaz"); + System.err.println("done."); + + System.err.print("Loading trigger-word gazetteer... 
"); + TRIGGER_GAZ = Generics.newHashMap(); + loadDictionary(TRIGGER_GAZ, dataPath + File.separator + "triggers.gaz"); + System.err.println("done."); + } + + /** Loads one dictionary from disk */ + private static void loadDictionary(Map dict, String file) throws java.io.FileNotFoundException, + java.io.IOException { + + BufferedReader in = new BufferedReader(new FileReader(file)); + + String line; + while ((line = in.readLine()) != null) { + ArrayList tokens = SimpleTokenize.tokenize(line); + if (tokens.size() > 0) { + String lower = tokens.get(0).toLowerCase(); + if (tokens.size() == 1) + dict.put(lower, "true"); + else + dict.put(lower, tokens.get(1)); + } + } + } + + public static boolean isLocation(String lower) { + return exists(LOC_GAZ, lower); + } + + public static boolean isFirstName(String lower) { + return exists(FIRST_GAZ, lower); + } + + public static boolean isLastName(String lower) { + return exists(LAST_GAZ, lower); + } + + public static String isTriggerWord(String lower) { + return TRIGGER_GAZ.get(lower); + } + + /** + * Verifies if the given string exists in the given dictionary + */ + public static boolean exists(Map dict, String elem) { + if (dict.get(elem) != null) + return true; + return false; + } + + /** + * Loads all proximity classes from the hard disk The WORDS map must be + * created before! 
+ */ + public static void loadProximityClasses(String proxFileName) throws java.io.IOException { + + System.err.println("Loading proximity classes..."); + + BufferedReader in = null; + try { + in = new BufferedReader(new FileReader(proxFileName)); + } catch (java.io.IOException e) { + System.err.println("Warning: no proximity database found."); + return; + } + + String line; + while ((line = in.readLine()) != null) { + ArrayList tokens = SimpleTokenize.tokenize(line); + if (tokens.size() > 0) { + Integer key = WORDS.get(tokens.get(0)); + ArrayList value = new ArrayList(); + + for (int i = 0; i < tokens.size() && i < PROXIMITY_CLASS_SIZE; i++) { + Integer word = WORDS.get(tokens.get(i)); + value.add(word); + } + + PROX_CLASSES.put(key, value); + } + } + + in.close(); + System.err.println("Finished loading proximity classes."); + } + + public String getLiteral() { + return mLiteral; + } + + public int getWord() { + return mWord; + } + + public int getCase() { + return mCase; + } + + public int[] getSuffixes() { + return mSuffixes; + } + + public int getLemma() { + return mLemma; + } + + public int getPos() { + return mPos; + } + + public int getChunk() { + return mChunk; + } + + public int getNerc() { + return mNerc; + } + + public Span getByteOffset() { + return mByteOffset; + } + + public int getByteStart() { + return mByteOffset.start(); + } + + public int getByteEnd() { + return mByteOffset.end(); + } + + public int getSentence() { + return mSentence; + } + + public Span getRawByteOffset() { + return mRawByteOffset; + } + + public int getRawByteStart() { + return mRawByteOffset.start(); + } + + public int getRawByteEnd() { + return mRawByteOffset.end(); + } + + public void setMassiClass(String i) { + mMassiClass = i; + } + + public String getMassiClass() { + return mMassiClass; + } + + public void setMassiBbn(String i) { + mMassiBbn = i; + } + + public String getMassiBbn() { + return mMassiBbn; + } + + public void setMassiWnss(String i) { + mMassiWnss = i; + } + 
+ public String getMassiWnss() { + return mMassiWnss; + } + + public static boolean isSgml(String s) { + Matcher match = SGML_PATTERN.matcher(s); + return match.find(0); + } + + public static String removeSpaces(String s) { + if (s == null) + return s; + return s.replaceAll(" ", "_"); + } + + public static final int CASE_OTHER = 0; + public static final int CASE_ALLCAPS = 1; + public static final int CASE_ALLCAPSORDOTS = 2; + public static final int CASE_CAPINI = 3; + public static final int CASE_INCAP = 4; + public static final int CASE_ALLDIGITS = 5; + public static final int CASE_ALLDIGITSORDOTS = 6; + + private static int detectCase(String word) { + + // + // is the word all caps? (e.g. IBM) + // + boolean isAllCaps = true; + for (int i = 0; i < word.length(); i++) { + if (!Character.isUpperCase(word.charAt(i))) { + isAllCaps = false; + break; + } + } + if (isAllCaps) + return CASE_ALLCAPS; + + // + // is the word all caps or dots?(e.g. I.B.M.) + // + boolean isAllCapsOrDots = true; + if (Character.isUpperCase(word.charAt(0))) { + for (int i = 0; i < word.length(); i++) { + if (!Character.isUpperCase(word.charAt(i)) && word.charAt(i) != '.') { + isAllCapsOrDots = false; + break; + } + } + } else { + isAllCapsOrDots = false; + } + if (isAllCapsOrDots) + return CASE_ALLCAPSORDOTS; + + // + // does the word start with a cap?(e.g. Tuesday) + // + boolean isInitialCap = false; + if (Character.isUpperCase(word.charAt(0))) + isInitialCap = true; + if (isInitialCap) + return CASE_CAPINI; + + // + // does the word contain a capitalized letter? + // + boolean isInCap = false; + for (int i = 1; i < word.length(); i++) { + if (Character.isUpperCase(word.charAt(i))) { + isInCap = true; + break; + } + } + if (isInCap) + return CASE_INCAP; + + // + // is the word all digits? (e.g. 
123) + // + boolean isAllDigits = false; + for (int i = 0; i < word.length(); i++) { + if (!Character.isDigit(word.charAt(i))) { + isAllDigits = false; + break; + } + } + if (isAllDigits) + return CASE_ALLDIGITS; + + // + // is the word all digits or . or ,? (e.g. 1.3) + // + boolean isAllDigitsOrDots = true; + if (Character.isDigit(word.charAt(0))) { + for (int i = 0; i < word.length(); i++) { + if (!Character.isDigit(word.charAt(i)) && word.charAt(i) != '.' && word.charAt(i) != ',') { + isAllDigitsOrDots = false; + break; + } + } + } else { + isAllDigitsOrDots = false; + } + if (isAllDigitsOrDots) + return CASE_ALLDIGITSORDOTS; + + return CASE_OTHER; + } + + private static int[] extractSuffixes(String word) { + String lower = word.toLowerCase(); + ArrayList suffixes = new ArrayList(); + for (int i = 2; i <= 4; i++) { + if (lower.length() >= i) { + try { + String suf = lower.substring(lower.length() - i); + suffixes.add(WORDS.get(suf)); + } catch (java.lang.RuntimeException e) { + // unknown suffix + } + } else { + break; + } + } + + int[] sufs = new int[suffixes.size()]; + for (int i = 0; i < suffixes.size(); i++) { + sufs[i] = suffixes.get(i); + } + + return sufs; + } + + /** + * Constructs an AceToken from a tokenized line generated by Tokey + */ + public AceToken(String word, String lemma, String pos, String chunk, String nerc, String start, String end, + int sentence) { + mLiteral = word; + if (word == null) { + mWord = -1; + mCase = -1; + mSuffixes = null; + } else { + mWord = WORDS.get(removeSpaces(word), false); + mCase = detectCase(word); + mSuffixes = extractSuffixes(word); + } + + if (lemma == null) + mLemma = -1; + else + mLemma = LEMMAS.get(removeSpaces(lemma), false); + + if (pos == null) + mPos = -1; + else + mPos = OTHERS.get(pos, false); + + if (chunk == null) + mChunk = -1; + else + mChunk = OTHERS.get(chunk, false); + + if (nerc == null) + mNerc = -1; + else + mNerc = OTHERS.get(nerc, false); + + if (start != null && end != null) { + mByteOffset 
= new Span(Integer.parseInt(start), Integer.parseInt(end)); + mRawByteOffset = new Span(Integer.parseInt(start), Integer.parseInt(end)); + } + mSentence = sentence; + + mMassiClass = ""; + mMassiBbn = ""; + mMassiWnss = ""; + } + + /** + * Recomputes start/end phrase positions by removing SGML tag strings This is + * required because ACE annotations skip over SGML tags when computing + * positions in stream, hence annotations do not match with our preprocessing + * positions, which count everything + */ + public int adjustPhrasePositions(int offsetToSubtract, String word) { + if (isSgml(word)) { + // offsetToSubtract += word.length(); + // the token length may be different than (end - start)! + // i.e. QUOTE_PREVIOUSPOST is cleaned in Tokey! + offsetToSubtract += mByteOffset.end() - mByteOffset.start(); + mByteOffset.setStart(-1); + mByteOffset.setEnd(-1); + } else { + mByteOffset.setStart(mByteOffset.start() - offsetToSubtract); + mByteOffset.setEnd(mByteOffset.end() - offsetToSubtract); + } + + return offsetToSubtract; + } + + /** Pretty display */ + public String display() { + if (mByteOffset != null) { + return "['" + WORDS.get(mWord) + "', " + OTHERS.get(mPos) + ", " + mByteOffset.start() + ", " + + mByteOffset.end() + "]"; + } + + return "['" + WORDS.get(mWord) + "', " + OTHERS.get(mPos) + "]"; + } + + public String toString() { + return display(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/MatchException.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/MatchException.java new file mode 100644 index 0000000..978ce83 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/MatchException.java @@ -0,0 +1,8 @@ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +public class MatchException extends RuntimeException { + public static 
final long serialVersionUID = 24362462L; + + public MatchException(String m) { super(m); } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/RobustTokenizer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/RobustTokenizer.java new file mode 100644 index 0000000..9011b05 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/domains/ace/reader/RobustTokenizer.java @@ -0,0 +1,1146 @@ +/* + * RobustTokenizer.java + * Performs tokenization of natural language English text, following ACE data + * Use the method tokenize() for smart tokenization + * @author Mihai + */ + +package edu.stanford.nlp.ie.machinereading.domains.ace.reader; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.io.FileReader; +import java.io.BufferedReader; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.process.AbstractTokenizer; +import edu.stanford.nlp.util.Generics; + +public class RobustTokenizer extends AbstractTokenizer { + + /** Buffer to tokenize */ + String buffer; + + /** The set of known abbreviations */ + private AbbreviationMap mAbbreviations; + + public final static int MAX_MULTI_WORD_SIZE = 20; + + // basic tokens + public final static String DOT = block("\\."); + public final static String DOTDOT = block("\\:"); + public final static String APOSTROPHE = block("\\'"); + public final static String SLASH = block("\\/"); + public final static String UNDERSCORE = block("\\_"); + public final static String MINUS = block("\\-"); + public final static String PLUS = block("\\+"); + public final static String COMMA = block("\\,"); + public final static String DOTCOMMA = block("\\;"); + public final static String QUOTES = 
block(or("\\\"", "\\'\\'", "\\'", "\\`\\`", "\\`")); + public final static String DOUBLE_QUOTES = block(or("\\\"" , "\\'\\'")); + public final static String LRB = block("\\("); + public final static String RRB = block("\\)"); + public final static String LCB = block("\\{"); + public final static String RCB = block("\\}"); + public final static String GREATER = block("\\>"); + public final static String LOWER = block("\\<"); + public final static String AMPERSAND = block("\\&"); + public final static String AT = block("\\@"); + public final static String HTTP = block("[hH][tT][tT][pP]\\:\\/\\/"); + + // basic sequences + public final static String WHITE_SPACE = block("\\s"); + public final static String DIGIT = block("\\d"); + public final static String LETTER = block("[a-zA-Z]"); + public final static String UPPER = block("[A-Z]"); + public final static String SIGN = or(MINUS,PLUS); + + // numbers + public final static String FULLNUM = + block( + zeroOrOne(SIGN) + + oneOrMore(DIGIT) + + zeroOrMore( + zeroOrOne(or(DOT, COMMA, SLASH)) + + oneOrMore(DIGIT))); + public final static String DECNUM = block(DOT + oneOrMore(DIGIT)); + public final static String NUM = or(FULLNUM, DECNUM); + + // date and time + public final static String DATE = + block(oneOrMore(DIGIT) + SLASH + + oneOrMore(DIGIT) + SLASH + + oneOrMore(DIGIT)); + public final static String TIME = + block(oneOrMore(DIGIT) + + oneOrMore(block( + DOTDOT + + oneOrMore(DIGIT)))); + + // punctuation marks + public final static String PUNC = + or(QUOTES, + block(MINUS + oneOrMore(MINUS)), + block(DOT + oneOrMore(DOT))); + + // words + public final static String LETTERS = oneOrMore(LETTER); + public final static String BLOCK = or(NUM, LETTERS); + public final static String WORD = + block(zeroOrOne(APOSTROPHE) + + BLOCK + + zeroOrMore(block( + zeroOrOne(or(UNDERSCORE, + MINUS, + APOSTROPHE, + SLASH, + AMPERSAND)) + + BLOCK))); + + // acronyms + public final static String ACRONYM = block(oneOrMore(LETTER + DOT));// + 
zeroOrOne(LETTER)); + + // this matches acronyms AFTER abbreviation merging + public final static String LOOSE_ACRONYM = + block(oneOrMore((oneOrMore(LETTER) + DOT)) + zeroOrMore(LETTER)); + + // other possible constructs + public final static String PAREN = or(LRB, RRB, LCB, RCB); + public final static String SGML = "<[^<>]+>"; + public final static String HTMLCODE = block(AMPERSAND + UPPER + DOTCOMMA); + + public final static String ANY = block("\\S"); + + // email addresses must start with a letter, contain @, and end with a letter + public final static String EMAIL = block(LETTER + + zeroOrMore(or(LETTER, + DIGIT, + DOT, + MINUS, + UNDERSCORE)) + + AT + + zeroOrMore(or(LETTER, + DIGIT, + DOT, + MINUS, + UNDERSCORE)) + + LETTER); + + // email addresses must start with a letter, contain @, and end with . com + public final static String DOMAIN_EMAIL = block(LETTER + + zeroOrMore(or(LETTER, + DIGIT, + DOT, + MINUS, + UNDERSCORE)) + + AT + + oneOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + + zeroOrMore(WHITE_SPACE)+ DOT + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us")); + + // URLs must start with http:// or ftp://, followed by at least a letter + public final static String URL = + block(HTTP + + oneOrMore(or(LETTER, + DIGIT, + DOT, + UNDERSCORE, + SLASH, + AMPERSAND, + MINUS, + PLUS))); + + //URLs without http, but ending in org, com, net + public final static String SMALL_URL = + block(oneOrMore(oneOrMore(LETTER) + DOT) + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us")); + + // keep sequence of underscores as a single token + public final static String UNDERSCORESEQ = oneOrMore("_"); + + // list bullet, e.g., "(a)" + public final static String LIST_BULLET = block(LRB + LETTER + zeroOrOne(LETTER) + RRB); + // part of a phone number, e.g., "(214)" + public final static String PHONE_PART = block(LRB + oneOrMore(DIGIT) + RRB); + + // sequence of digits + public final static String DIGITSEQ = 
oneOrMore(DIGIT); + + // the complete pattern + public final static String RECOGNISED_PATTERN + = block(block(TIME) + "|" + + block(DOMAIN_EMAIL) + "|" + + block(EMAIL) + "|" + + block(URL) + "|" + + // block(SMALL_URL) + "|" + + block(ACRONYM) + "|" + + block(DATE) + "|" + + block(PHONE_PART) + "|" + // must be before WORD, otherwise it's broken into multiple tokens + block(WORD) + "|" + + block(PUNC) + "|" + + block(LIST_BULLET) + "|" + + block(PAREN) + "|" + + block(SGML) + "|" + + block(HTMLCODE) + "|" + + block(UNDERSCORESEQ) + "|" + + block(ANY)); + + /** The overall token pattern */ + private final static Pattern wordPattern; + + /** Pattern to recognize SGML tags */ + private final static Pattern sgmlPattern; + + /** Pattern to recognize slash-separated dates */ + private final static Pattern slashDatePattern; + + /** Pattern to recognize acronyms */ + private final static Pattern acronymPattern; + + /** Pattern to recognize URLs */ + private final static Pattern urlPattern; + + /** Pattern to recognize emails */ + private final static Pattern emailPattern; + + /** Recognized sequences of digits */ + private final static Pattern digitSeqPattern; + + static{ + wordPattern = Pattern.compile(RECOGNISED_PATTERN); + sgmlPattern = Pattern.compile(SGML); + slashDatePattern = Pattern.compile(DATE); + acronymPattern = Pattern.compile(LOOSE_ACRONYM); + urlPattern = Pattern.compile(URL); + emailPattern = Pattern.compile(EMAIL); + digitSeqPattern = Pattern.compile(DIGITSEQ); + } + + public RobustTokenizer(String buffer) { + mAbbreviations = new AbbreviationMap(true); + this.buffer = buffer; + this.cachedTokens = null; + } + + + public RobustTokenizer(boolean caseInsensitive, String buffer) { + mAbbreviations = new AbbreviationMap(caseInsensitive); + this.buffer = buffer; + this.cachedTokens = null; + } + + /** any in the set */ + public static String range(String s){ + return block("[" + s + "]"); + } + + + /** zero or one */ + public static String zeroOrOne(String s){ 
+ return block(block(s) + "?"); + } + + /** zero or more */ + public static String zeroOrMore(String s){ + return block(block(s) + "*"); + } + + /** one or more */ + public static String oneOrMore(String s){ + return block(block(s) + "+"); + } + + /** parens */ + public static String block(String s){ + return "(" + s + ")"; + } + + /** any of the two */ + public static String or(String s1, String s2){ + return block(block(s1) + "|" + block(s2)); + } + + /** any of the three */ + public static String or(String s1, String s2, String s3){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3)); + } + + /** any of the four */ + public static String or(String s1, String s2, String s3, String s4){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4)); + } + + /** any of the five */ + public static String or(String s1, String s2, String s3, String s4, String s5){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5)); + } + + /** any of the six */ + public static String or(String s1, String s2, String s3, + String s4, String s5, String s6){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5) + "|" + block(s6)); + } + + /** any of the seven */ + public static String or(String s1, String s2, String s3, + String s4, String s5, String s6, String s7){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7)); + } + + /** any of the eight */ + public static String or(String s1, String s2, String s3, String s4, + String s5, String s6, String s7, String s8){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + + block(s7) + "|" + block(s8)); + } + + /** any of the nine */ + public static String or(String s1, String s2, String s3, String s4, + String s5, String s6, String s7, String s8, String s9){ + return 
block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + + block(s7) + "|" + block(s8) + "|" + block(s9)); + } + + public static String or(String s1, String s2, String s3, String s4, + String s5, String s6, String s7, String s8, + String s9, String s10){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + + block(s7) + "|" + block(s8) + "|" + block(s9) + "|" + + block(s10)); + } + + public static String or(String s1, String s2, String s3, String s4, + String s5, String s6, String s7, String s8, + String s9, String s10, String s11){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + + block(s7) + "|" + block(s8) + "|" + block(s9) + "|" + + block(s10) + "|" + block(s11)); + } + + public static String or(String s1, String s2, String s3, String s4, + String s5, String s6, String s7, String s8, + String s9, String s10, String s11, String s12){ + return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + + block(s7) + "|" + block(s8) + "|" + block(s9) + "|" + + block(s10) + "|" + block(s11) + "|" + block(s12)); + } + + /** not */ + public static String rangeNot(String s){ + return range(block("^" + s)); + } + + private static int hasApostropheBlock(String s) { + for(int i = s.length() - 1; i > 0; i --){ + if(s.charAt(i) == '\'' && i < s.length() - 1){ + return i; + } + + if(! 
Character.isLetter(s.charAt(i))){ + return -1; + } + } + + return -1; + } + + private static String concatenate(List tokens, + int start, + int end) { + StringBuffer buffer = new StringBuffer(); + + for(; start < end; start ++){ + buffer.append(((WordToken) tokens.get(start)).getWord()); + } + return buffer.toString(); + } + + private static int countNewLines(List tokens, + int start, + int end) { + int count = 0; + for(int i = start + 1; i < end; i ++){ + count += tokens.get(i).getNewLineCount(); + } + return count; + } + + public static boolean isUrl(String s) { + Matcher match = urlPattern.matcher(s); + return match.find(0); + } + + public static boolean isEmail(String s) { + Matcher match = emailPattern.matcher(s); + return match.find(0); + } + + public static boolean isSgml(String s) { + Matcher match = sgmlPattern.matcher(s); + return match.find(0); + } + + public static boolean isSlashDate(String s) { + Matcher match = slashDatePattern.matcher(s); + return match.find(0); + } + + public static boolean isAcronym(String s) { + Matcher match = acronymPattern.matcher(s); + return match.find(0); + } + + public static boolean isDigitSeq(String s) { + Matcher match = digitSeqPattern.matcher(s); + return match.find(0); + } + + public int countNewLines(String s, int start, int end) { + int count = 0; + for(int i = start; i < end; i ++) { + if(s.charAt(i) == '\n') count ++; + } + return count; + } + + /** + * Smart tokenization storing the output in an array of CoreLabel + * Sets the following fields: + * - TextAnnotation - the text of the token + * - TokenBeginAnnotation - the byte offset of the token (start) + * - TokenEndAnnotation - the byte offset of the token (end) + */ + public Word [] tokenizeToWords() { + List toks = tokenizeToWordTokens(); + Word [] labels = new Word[toks.size()]; + for(int i = 0; i < toks.size(); i ++){ + WordToken tok = toks.get(i); + Word l = new Word(tok.getWord(), tok.getStart(), tok.getEnd()); + labels[i] = l; + } + return labels; + } + 
+ /** + * Tokenizes a natural language string + * @return List of WordTokens + */ + public List tokenizeToWordTokens() { + List result = new ArrayList(); + + // + // replace illegal characters with SPACE + // + /* + StringBuffer buffer = new StringBuffer(); + for(int i = 0; i < originalString.length(); i ++){ + int c = (int) originalString.charAt(i); + // + // regular character + // + if(c > 31 && c < 127) buffer.append((char) c); + + else{ + System.err.println("Control character at position " + i + ": " + c); + + // + // DOS new line counts as two characters + // + if(c == 10) buffer.append(" "); + + // + // other control character + // + else buffer.append(' '); + } + } + */ + + Matcher match = wordPattern.matcher(buffer); + int previousEndMatch = 0; + + // + // Straight tokenization, ignoring known abbreviations + // + while(match.find()){ + String crtMatch = match.group(); + int endMatch = match.end(); + int startMatch = endMatch - crtMatch.length(); + int i; + + // found word ending in "n't" + if (crtMatch.endsWith("n't")){ + if (crtMatch.length() > 3){ + WordToken token1 = + new WordToken( + crtMatch.substring(0, crtMatch.length() - 3), + startMatch, endMatch - 3, + countNewLines(buffer, previousEndMatch, startMatch)); + result.add(token1); + } + WordToken token2 = + new WordToken(crtMatch.substring(crtMatch.length() - 3, + crtMatch.length()), + endMatch - 3, endMatch, 0); + result.add(token2); + } + + // found word containing an appostrophe + // XXX: is this too relaxed? e.g. 
"O'Hare" + else if ((i = hasApostropheBlock(crtMatch)) != -1){ + WordToken token1 = new WordToken(crtMatch.substring(0, i), + startMatch, startMatch + i, countNewLines(buffer, previousEndMatch, startMatch)); + WordToken token2 = + new WordToken(crtMatch.substring(i, crtMatch.length()), + startMatch + i, endMatch, 0); + result.add(token1); + result.add(token2); + } + + // just a regular word + else{ + WordToken token = new WordToken(crtMatch, startMatch, endMatch, + countNewLines(buffer, previousEndMatch, startMatch)); + result.add(token); + } + + previousEndMatch = endMatch; + } + + // + // Merge known abreviations + // + List resultWithAbs = new ArrayList(); + for(int i = 0; i < result.size(); i ++){ + // where the mw ends + int end = result.size(); + if(end > i + MAX_MULTI_WORD_SIZE) end = i + MAX_MULTI_WORD_SIZE; + + boolean found = false; + + // must have at least two tokens per multiword + for(; end > i + 1; end --){ + WordToken startToken = result.get(i); + WordToken endToken = result.get(end - 1); + if(countNewLines(result, i, end) == 0){ // abbreviation tokens cannot appear on different lines + String conc = concatenate(result, i, end); + found = false; + + // found a multiword + if((mAbbreviations.contains(conc) == true)){ + found = true; + WordToken token = new WordToken(conc, + startToken.getStart(), + endToken.getEnd(), + startToken.getNewLineCount()); + resultWithAbs.add(token); + i = end - 1; + break; + } + } + } + + // no multiword starting at this position found + if(! 
found){ + resultWithAbs.add(result.get(i)); + } + } + + resultWithAbs = postprocess(resultWithAbs); + + return resultWithAbs; + } + + /** + * Redefine this method to implement additional domain-specific tokenization rules + * @param tokens + */ + protected List postprocess(List tokens) { return tokens; }; + + /** + * Tokenizes and adds blank spaces were needed between each token + */ + public String tokenizeText() throws java.io.IOException{ + List tokenList = tokenizeToWordTokens(); + StringBuffer strBuffer = new StringBuffer(); + Iterator iter = tokenList.iterator(); + if (iter.hasNext()){ + strBuffer.append(iter.next()); + } + while(iter.hasNext()){ + strBuffer.append(" "); + strBuffer.append(iter.next()); + } + return strBuffer.toString().replaceAll("\\s\\s+", " "); + } + + public static class AbbreviationMap { + + private Set mAbbrevSet; + + private static List normalizeCase(boolean caseInsensitive, List words) { + if(! caseInsensitive) return words; + List normWords = new ArrayList(); + for(String word: words) normWords.add(word.toLowerCase()); + return normWords; + } + + /** Creates a new instance of AbreviationMap with some know abbreviations */ + public AbbreviationMap(boolean caseInsensitive) { + mAbbrevSet = Generics.newHashSet(normalizeCase(caseInsensitive, Arrays.asList(new String[]{ + "1.", + "10.", + "11.", + "12.", + "13.", + "14.", + "15.", + "16.", + "17.", + "18.", + "19.", + "2.", + "20.", + "21.", + "22.", + "23.", + "24.", + "25.", + "26.", + "27.", + "28.", + "29.", + "3.", + "30.", + "31.", + "32.", + "33.", + "34.", + "35.", + "36.", + "37.", + "38.", + "39.", + "4.", + "40.", + "41.", + "42.", + "43.", + "44.", + "45.", + "46.", + "47.", + "48.", + "49.", + "5.", + "50.", + "6.", + "7.", + "8.", + "9.", + "A.", + "A.C.", + "A.D.", + "A.D.L.", + "A.F.", + "A.G.", + "A.H.", + "A.J.C.", + "A.L.", + "A.M", + "A.M.", + "A.P.", + "A.T.B.", + "AUG.", + "Act.", + "Adm.", + "Ala.", + "Ariz.", + "Ark.", + "Assn.", + "Ass'n.", + "Ass'n", + "Aug.", + 
"B.", + "B.A.T", + "B.B.", + "B.F.", + "B.J.", + "B.V.", + "Bancorp.", + "Bhd.", + "Blvd.", + "Br.", + "Brig.", + "Bros.", + "C.", + "C.B.", + "C.D.s", + "C.J.", + "C.O.", + "C.R.", + "C.W.", + "CEO.", + "CO.", + "CORP.", + "COS.", + "Cal.", + "Calif.", + "Capt.", + "Cie.", + "Cir.", + "Cmdr.", + "Co.", + "Col.", + "Colo.", + "Comdr.", + "Conn.", + "Corp.", + "Cos.", + "D.", + "D.B.", + "D.C", + "D.C.", + "D.H.", + "D.M.", + "D.N.", + "D.S.", + "D.T", + "D.T.", + "D.s", + "Dec.", + "Del.", + "Dept.", + "Dev.", + "Dr.", + "Ds.", + "E.", + "E.E.", + "E.F.", + "E.I.", + "E.M.", + "E.R.", + "E.W.", + "Etc.", + "F.", + "F.A.", + "F.A.O.", + "F.C", + "F.E.", + "F.J.", + "F.S.B.", + "F.W.", + "FEB.", + "FL.", + "Feb.", + "Fed.", + "Fla.", + "Fran.", + "French.", + "Freon.", + "Ft.", + "G.", + "G.D.", + "G.L.", + "G.O.", + "G.S.", + "G.m.b", + "G.m.b.H.", + "GP.", + "GPO.", + "Ga.", + "Gen.", + "Gov.", + "H.", + "H.F.", + "H.G.", + "H.H.", + "H.J.", + "H.L.", + "H.R.", + "Hon.", + "I.", + "I.B.M.", + "I.C.H.", + "I.E.P.", + "I.M.", + "I.V.", + "I.W.", + "II.", + "III.", + "INC.", + "Intl.", + "Int'l", + "IV.", + "IX.", + "Ill.", + "Inc.", + "Ind.", + "J.", + "J.C.", + "J.D.", + "J.E.", + "J.F.", + "J.F.K.", + "J.H.", + "J.L.", + "J.M.", + "JohnQ.Public", + "J.P.", + "J.R.", + "J.V", + "J.V.", + "J.X.", + "Jan.", + "Jansz.", + "Je.", + "Jos.", + "Jr.", + "K.", + "K.C.", + "Kan.", + "Ky.", + "L.", + "L.A.", + "L.H.", + "L.J.", + "L.L.", + "L.M.", + "L.P", + "L.P.", + "La.", + "Lt.", + "Ltd.", + "M.", + "M.A.", + "M.B.A.", + "M.D", + "M.D.", + "M.D.C.", + "M.E.", + "M.J.", + "M.R.", + "M.S.", + "M.W.", + "M8.7sp", + "Maj.", + "Mar.", + "Mass.", + "Md.", + "Med.", + "Messrs.", + "Mfg.", + "Mich.", + "Minn.", + "Mir.", + "Miss.", + "Mo.", + "Mr.", + "Mrs.", + "Ms.", + "Mt.", + "N.", + "N.A.", + "N.C", + "N.C.", + "N.D", + "N.D.", + "N.H", + "N.H.", + "N.J", + "N.J.", + "N.M", + "N.M.", + "N.V", + "N.V.", + "N.Y", + "N.Y.", + "NOV.", + "Neb.", + "Nev.", + "No.", + "no.", + 
"Nos.", + "Nov.", + "O.", + "O.P.", + "OK.", + "Oct.", + "Okla.", + "Ore.", + "P.", + "P.J.", + "P.M", + "P.M.", + "P.R.", + "Pa.", + "Penn.", + "Pfc.", + "Ph.", + "Ph.D.", + "pro-U.N.", + "Prof.", + "Prop.", + "Pty.", + "Q.", + "R.", + "R.D.", + "Ret.", + "R.H.", + "R.I", + "R.I.", + "R.L.", + "R.P.", + "R.R.", + "R.W.", + "RLV.", + "Rd.", + "Rep.", + "Reps.", + "Rev.", + "S.", + "S.A", + "S.A.", + "S.C", + "S.C.", + "S.D.", + "S.G.", + "S.I.", + "S.P.", + "S.S.", + "S.p", + "S.p.A", + "S.p.A.", + "SKr1.5", + "Sen.", + "Sens.", + "Sept.", + "Sgt.", + "Snr.", + "Spc.", + "Sr.", + "St.", + "Sys.", + "T.", + "T.D.", + "T.F.", + "T.T.", + "T.V.", + "TEL.", + "Tech.", + "Tenn.", + "Tex.", + "Tx.", + "U.", + "U.Cal-Davis", + "U.K", + "U.K.", + "U.N.", + "U.S.", + "U.S.A", + "U.S.A.", + "U.S.C.", + "U.S.C..", + "U.S.S.R", + "U.S.S.R.", + "UK.", + "US116.7", + "V.", + "V.H.", + "VI.", + "VII.", + "VIII.", + "VS.", + "Va.", + "Vs.", + "Vt.", + "W.", + "W.A.", + "W.G.", + "W.I.", + "W.J.", + "W.R.", + "W.T.", + "W.Va", + "W.Va.", + "Wash.", + "Wis.", + "Wyo.", + "X.", + "Y.", + "Y.J.", + "Z.", + "a.", + "a.d.", + "a.k.a", + "a.m", + "a.m.", + "al.", + "b.", + "c.", + "c.i.f", + "cf.", + "cnsl.", + "cnsls.", + "cont'd.", + "d.", + "deft.", + "defts.", + "e.", + "et.", + "etc.", + "etseq.", + "f.", + "f.o.b", + "ft.", + "g.", + "h.", + "i.", + "i.e.", + "j.", + "k.", + "l.", + "m.", + "mots.", + "n.", + "o.", + "p.", + "p.m", + "p.m.", + "pltf.", + "pltfs.", + "prelim.", + "r.", + "s.", + "seq.", + "supp.", + "sq.", + "t.", + "u.", + "v.", + "vs.", + "x.", + "y.", + "z.", + }))); + + } + + public boolean contains(String s){ + return mAbbrevSet.contains(s.toLowerCase()); + } + } + + public static class WordToken { + + /** Start position */ + protected int mStart; + + /** End position */ + protected int mEnd; + + /** Counts how many new lines appear between this token and the previous one in the stream */ + protected int mNewLineCount; + + /** The lexem */ + protected String 
mWord; + + public WordToken(String w, + int s, + int e) { + mWord = w; + mStart = s; + mEnd = e; + mNewLineCount = 0; + } + public WordToken(String w, int s, int e, int nl) { + mWord = w; + mStart = s; + mEnd = e; + mNewLineCount = nl; + } + + public String toString() { + StringBuffer buffer = new StringBuffer(); + buffer.append("["); + buffer.append(mWord); + buffer.append(", "); + buffer.append(mStart); + buffer.append(", "); + buffer.append(mEnd); + buffer.append("]"); + return buffer.toString(); + } + + public int getStart() { return mStart; } + public void setStart(int i) { mStart = i; } + + public int getEnd() { return mEnd; } + public void setEnd(int i) { mEnd = i; } + + public int getNewLineCount() { return mNewLineCount; } + public void setNewLineCount(int i) { mNewLineCount = i; } + + public String getWord() { return mWord; } + public void setWord(String w) { mWord = w; } + + } + + /** Cached tokens for this buffer. Used by getNext */ + Word [] cachedTokens; + /** Current position in the cachedTokens list. Used by getNext */ + int cachedPosition; + + @Override + protected Word getNext() { + if(cachedTokens == null){ + cachedTokens = tokenizeToWords(); + cachedPosition = 0; + } + + if(cachedPosition >= cachedTokens.length){ + return null; + } + + Word token = cachedTokens[cachedPosition]; + cachedPosition ++; + + return token; + } + + public static void main(String argv[]) throws Exception { + if(argv.length != 1){ + System.err.println("Usage: java edu.stanford.nlp.ie.machinereading.common.RobustTokenizer "); + System.exit(1); + } + + // tokenize this file + BufferedReader is = + new BufferedReader(new FileReader(argv[0])); + + // read the whole file in a buffer + // XXX: for sure there are more efficient ways of reading a file... 
+ int ch; + StringBuffer buffer = new StringBuffer(); + while((ch = is.read()) != -1) buffer.append((char) ch); + + // create the tokenizer object + RobustTokenizer t = new RobustTokenizer(buffer.toString()); + + List tokens = t.tokenize(); + for(int i = 0; i < tokens.size(); i ++){ + System.out.println(tokens.get(i)); + } + } +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/AnnotationUtils.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/AnnotationUtils.java new file mode 100644 index 0000000..3e8b4b6 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/AnnotationUtils.java @@ -0,0 +1,538 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import java.util.Set; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.StringUtils; + +/** + * Utilities to manipulate Annotations storing datasets or sentences with Machine Reading info + * @author Mihai + * + */ +public class AnnotationUtils { + private AnnotationUtils() {} // only static methods + + /** + * Given a list of sentences (as CoreMaps), wrap it in a new Annotation. 
+ */ + public static Annotation createDataset(List sentences) { + Annotation dataset = new Annotation(""); + addSentences(dataset,sentences); + return dataset; + } + + /** + * Randomized shuffle of all sentences int this dataset + * @param dataset + */ + public static void shuffleSentences(CoreMap dataset) { + List sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class); + + // we use a constant seed for replicability of experiments + Collections.shuffle(sentences, new Random(0)); + + dataset.set(CoreAnnotations.SentencesAnnotation.class, sentences); + } + + /** + * Converts the labels of all entity mentions in this dataset to sequences of CoreLabels + * @param dataset + * @param annotationsToSkip + * @param useSubTypes + */ + public static List> entityMentionsToCoreLabels(CoreMap dataset, Set annotationsToSkip, boolean useSubTypes, boolean useBIO) { + List> retVal = new ArrayList>(); + List sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class); + + for (CoreMap sentence : sentences) { + List labeledSentence = sentenceEntityMentionsToCoreLabels(sentence, true, annotationsToSkip, null, useSubTypes, useBIO); + assert(labeledSentence != null); + retVal.add(labeledSentence); + } + + return retVal; + } + + /** + * Converts the labels of all entity mentions in this sentence to sequences of CoreLabels + * @param sentence + * @param addAnswerAnnotation + * @param annotationsToSkip + * @param useSubTypes + */ + public static List sentenceEntityMentionsToCoreLabels( + CoreMap sentence, + boolean addAnswerAnnotation, + Set annotationsToSkip, + Set mentionTypesToUse, + boolean useSubTypes, + boolean useBIO) { + /* + Tree completeTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + if(completeTree == null){ + throw new RuntimeException("ERROR: TreeAnnotation MUST be set before calling this method!"); + } + */ + + // + // Set TextAnnotation and PartOfSpeechAnnotation (using the parser data) + // + /* + List labels = new ArrayList(); + List 
tokenList = completeTree.getLeaves(); + for (Tree tree : tokenList) { + Word word = new Word(tree.label()); + CoreLabel label = new CoreLabel(); + label.set(CoreAnnotations.TextAnnotation.class, word.value()); + if (addAnswerAnnotation) { + label.set(CoreAnnotations.AnswerAnnotation.class, + SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL); + } + label.set(CoreAnnotations.PartOfSpeechAnnotation.class, tree.parent(completeTree).label().value()); + labels.add(label); + } + */ + // use the token CoreLabels not the parser data => more robust + List labels = new ArrayList(); + for(CoreLabel l: sentence.get(CoreAnnotations.TokensAnnotation.class)){ + CoreLabel nl = new CoreLabel(l); + if (addAnswerAnnotation) { + nl.set(CoreAnnotations.AnswerAnnotation.class, SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL); + } + labels.add(nl); + } + + // Add AnswerAnnotation from the types of the entity mentions + if (addAnswerAnnotation) { + List entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + if(entities != null){ + for (EntityMention entity : entities) { + // is this a type that we should skip? + if(annotationsToSkip != null && annotationsToSkip.contains(entity.getType())) continue; + // is this a valid mention type? + if(mentionTypesToUse != null && ! 
mentionTypesToUse.contains(entity.getMentionType())) continue; + + // ignore entities without head span + if(entity.getHead() != null){ + for(int i = entity.getHeadTokenStart(); i < entity.getHeadTokenEnd(); i ++){ + String tag = entity.getType(); + if(useSubTypes && entity.getSubType() != null) tag += "-" + entity.getSubType(); + if(useBIO){ + if(i == entity.getHeadTokenStart()) tag = "B-" + tag; + else tag = "I-" + tag; + } + labels.get(i).set(CoreAnnotations.AnswerAnnotation.class, tag); + } + } + } + } + } + + /* + // Displaying the CoreLabels generated for this sentence + System.err.print("sentence to core labels:"); + for(CoreLabel l: labels){ + System.err.print(" " + l.word() + "/" + l.getString(CoreAnnotations.PartOfSpeechAnnotation.class)); + String tag = l.getString(CoreAnnotations.AnswerAnnotation.class); + if(tag != null && ! tag.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)){ + System.err.print("/" + tag); + } + } + System.err.println(); + */ + + return labels; + } + + public static CoreMap getSentence(CoreMap dataset, int i) { + return dataset.get(CoreAnnotations.SentencesAnnotation.class).get(i); + } + + public static int sentenceCount(CoreMap dataset) { + List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class); + if(sents != null) return sents.size(); + return 0; + } + + public static void addSentence(CoreMap dataset, CoreMap sentence) { + List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class); + if(sents == null){ + sents = new ArrayList(); + dataset.set(CoreAnnotations.SentencesAnnotation.class, sents); + } + sents.add(sentence); + } + + public static void addSentences(CoreMap dataset, List sentences) { + List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class); + if(sents == null){ + sents = new ArrayList(); + dataset.set(CoreAnnotations.SentencesAnnotation.class, sents); + } + for(CoreMap sentence: sentences){ + sents.add(sentence); + } + } + + /** + * Creates a deep copy of the given dataset with new 
lists for all mentions (entity, relation, event) + * @param dataset + */ + public static Annotation deepMentionCopy(CoreMap dataset) { + Annotation newDataset = new Annotation(""); + + List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class); + List newSents = new ArrayList(); + if(sents != null){ + for(CoreMap sent: sents){ + if(! (sent instanceof Annotation)){ + throw new RuntimeException("ERROR: Sentences must instantiate Annotation!"); + } + CoreMap newSent = sentenceDeepMentionCopy((Annotation) sent); + newSents.add(newSent); + } + } + + addSentences(newDataset, newSents); + return newDataset; + } + + /** + * Deep copy of the sentence: we create new entity/relation/event lists here. + * However, we do not deep copy the ExtractionObjects themselves! + * @param sentence + */ + public static Annotation sentenceDeepMentionCopy(Annotation sentence) { + Annotation newSent = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class)); + + newSent.set(CoreAnnotations.TokensAnnotation.class, sentence.get(CoreAnnotations.TokensAnnotation.class)); + newSent.set(TreeCoreAnnotations.TreeAnnotation.class, sentence.get(TreeCoreAnnotations.TreeAnnotation.class)); + newSent.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class)); + newSent.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)); + newSent.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class)); + newSent.set(CoreAnnotations.DocIDAnnotation.class, sentence.get(CoreAnnotations.DocIDAnnotation.class)); + + // deep copy of all mentions lists + List ents = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + if(ents != null) 
newSent.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList(ents)); + List rels = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); + if(rels != null) newSent.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, new ArrayList(rels)); + List evs = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class); + if(evs != null) newSent.set(MachineReadingAnnotations.EventMentionsAnnotation.class, new ArrayList(evs)); + + return newSent; + } + + /** + * Return the relation that holds between the given entities. + * Return a relation of type UNRELATED if this sentence contains no relation between the entities. + */ + public static RelationMention getRelation(RelationMentionFactory factory, CoreMap sentence, ExtractionObject ... args) { + return getRelations(factory, sentence, args).get(0); + } + + /** + * Return all the relations that holds between the given entities. + * Returns a list containing a relation of type UNRELATED if this sentence contains no relation between the entities. + */ + public static List getRelations(RelationMentionFactory factory, CoreMap sentence, ExtractionObject... args) { + List relationMentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); + List matchingRelationMentions = new ArrayList(); + if (relationMentions != null) { + for (RelationMention rel : relationMentions) { + if (rel.argsMatch(args)) { + matchingRelationMentions.add(rel); + } + } + } + if (matchingRelationMentions.size() == 0) { + matchingRelationMentions.add(RelationMention.createUnrelatedRelation(factory, args)); + } + return matchingRelationMentions; + } + + /** + * Get list of all relations and non-relations between EntityMentions in this sentence + * Use with care. 
This is an expensive call due to getAllUnrelatedRelations, which creates all non-existing relations between all entity mentions + */ + public static List getAllRelations(RelationMentionFactory factory, CoreMap sentence, boolean createUnrelatedRelations) { + List relationMentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); + List allRelations = new ArrayList(); + if(relationMentions != null) allRelations.addAll(relationMentions); + if(createUnrelatedRelations){ + allRelations.addAll(getAllUnrelatedRelations(factory, sentence, true)); + } + return allRelations; + } + + public static List getAllUnrelatedRelations(RelationMentionFactory factory, CoreMap sentence, boolean checkExisting) { + + List relationMentions = (checkExisting ? sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class) : null); + List entityMentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + List nonRelations = new ArrayList(); + + // + // scan all possible arguments + // + if(entityMentions != null){ + for(int i = 0; i < entityMentions.size(); i ++){ + for(int j = 0; j < entityMentions.size(); j ++){ + if(i == j) continue; + EntityMention arg1 = entityMentions.get(i); + EntityMention arg2 = entityMentions.get(j); + boolean match = false; + if(relationMentions != null){ + for (RelationMention rel : relationMentions) { + if (rel.argsMatch(arg1, arg2)) { + match = true; + break; + } + } + } + if (match == false) { + nonRelations.add(RelationMention.createUnrelatedRelation(factory, arg1,arg2)); + } + } + } + } + + return nonRelations; + } + + public static void addEntityMention(CoreMap sentence, EntityMention arg) { + List l = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + if(l == null){ + l = new ArrayList(); + sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, l); + } + l.add(arg); + } + + public static void addEntityMentions(CoreMap sentence, Collection args) { + List l = 
sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class); + if(l == null){ + l = new ArrayList(); + sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, l); + } + l.addAll(args); + } + + public List getEntityMentions(CoreMap sent) { + return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class)); + } + + public static void addRelationMention(CoreMap sentence, RelationMention arg) { + List l = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); + if(l == null){ + l = new ArrayList(); + sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, l); + } + l.add(arg); + } + + public static void addRelationMentions(CoreMap sentence, Collection args) { + List l = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); + if(l == null){ + l = new ArrayList(); + sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, l); + } + l.addAll(args); + } + + public List getRelationMentions(CoreMap sent) { + return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class)); + } + + public static void addEventMention(CoreMap sentence, EventMention arg) { + List l = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class); + if(l == null){ + l = new ArrayList(); + sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, l); + } + l.add(arg); + } + + public static void addEventMentions(CoreMap sentence, Collection args) { + List l = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class); + if(l == null){ + l = new ArrayList(); + sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, l); + } + l.addAll(args); + } + + public List getEventMentions(CoreMap sent) { + return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.EventMentionsAnnotation.class)); + } + + /** + * Prepare a string for printing in a spreadsheet for Mechanical Turk 
input. + * @param s String to be formatted + * @return String string enclosed in quotes with other quotes escaped, and with better formatting for readability by Turkers. + */ + public static String prettify(String s) { + if (s==null) return ""; + return s.replace( + " ,",",").replace( + " .",".").replace( + " :",":").replace( + "( ","(").replace( + "[ ","[").replace( + " )",")").replace( + " ]","]").replace( + " - ","-").replace( + " '","'").replace( + "-LRB- ","(").replace( + " -RRB-",")").replace( + "` ` ","\"").replace( + " ' '","\"").replace( + " COMMA",","); + } + + /** + * Fetches the sentence text in a given token span + * @param span + */ + public static String getTextContent(CoreMap sent, Span span) { + List tokens = sent.get(CoreAnnotations.TokensAnnotation.class); + StringBuffer buf = new StringBuffer(); + assert(span != null); + for(int i = span.start(); i < span.end(); i ++){ + if(i > span.start()) buf.append(" "); + buf.append(tokens.get(i).word()); + } + return buf.toString(); + } + + public static String sentenceToString(CoreMap sent) { + StringBuilder sb = new StringBuilder(512); + List tokens = sent.get(CoreAnnotations.TokensAnnotation.class); + sb.append("\"" + StringUtils.join(tokens, " ") + "\""); + sb.append("\n"); + + List relationMentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); + if(relationMentions != null){ + for (RelationMention rel : relationMentions) { + sb.append("\n"); + sb.append(rel); + } + } + + // TODO: add entity and event mentions + + return sb.toString(); + } + + public static String tokensAndNELabelsToString(CoreMap sentence) { + StringBuffer os = new StringBuffer(); + List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); + if(tokens != null){ + boolean first = true; + for(CoreLabel token: tokens) { + if(! first) os.append(" "); + os.append(token.word()); + if(token.ner() != null && ! 
token.ner().equals("O")){ + os.append("/" + token.ner()); + } + first = false; + } + } + return os.toString(); + } + + public static String datasetToString(CoreMap dataset){ + List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class); + StringBuffer b = new StringBuffer(); + if(sents != null){ + for(CoreMap sent: sents){ + b.append(sentenceToString(sent)); + } + } + return b.toString(); + } + + /* + public static List wordsToCoreLabels(List words) { + List labels = new ArrayList(); + for(Word word: words){ + CoreLabel l = new CoreLabel(); + l.setWord(word.word()); + l.set(CoreAnnotations.TextAnnotation.class, word.word()); + l.setBeginPosition(word.beginPosition()); + l.setEndPosition(word.endPosition()); + labels.add(l); + } + return labels; + } + */ + + public static String tokensToString(List tokens) { + StringBuffer os = new StringBuffer(); + boolean first = true; + for(CoreLabel t: tokens){ + if(! first) os.append(" "); + os.append(t.word() + "{" + t.beginPosition() + ", " + t.endPosition() + "}"); + first = false; + } + return os.toString(); + } + + /* + public static boolean sentenceContainsSpan(CoreMap sentence, Span span) { + List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); + int sentenceStart = tokens.get(0).beginPosition(); + int sentenceEnd = tokens.get(tokens.size() - 1).endPosition(); + return sentenceStart <= span.start() && sentenceEnd >= span.end(); + } + */ + + /* + * Shift the character offsets of all tokens by offset. + */ + public static void updateOffsets(List tokens, int offset) { + for(Word l: tokens) { + l.setBeginPosition(l.beginPosition() + offset); + l.setEndPosition(l.endPosition() + offset); + } + } + + /* + * Shift the character offsets of all tokens by offset. 
+ */ + public static void updateOffsetsInCoreLabels(List tokens, int offset) { + for(CoreLabel l: tokens) { + l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, l.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) + offset); + l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, l.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) + offset); + } + } + + /** + * Process string to be a cell in Excel file. + * Escape any quotes in the string and enclose the whole string with quotes. + */ + public static String excelify(String s) { + return '"'+s.replace("\"","\"\"")+'"'; + } + + public static List readSentencesFromFile(String path) throws IOException, ClassNotFoundException { + Annotation doc = (Annotation) IOUtils.readObjectFromFile(path); + return doc.get(CoreAnnotations.SentencesAnnotation.class); + + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/EntityMention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/EntityMention.java new file mode 100644 index 0000000..ba3d4f5 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/EntityMention.java @@ -0,0 +1,273 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.util.CoreMap; + +/** + * Each entity mention is described by a type (possibly subtype) and a span of text + * + * @author Andrey Gusev + * @author Mihai + */ +public class EntityMention extends ExtractionObject { + + private static final long serialVersionUID = -2745903102654191527L; + + /** Mention type, if available, e.g., nominal */ + private final 
String mentionType; + private String corefID = "-1"; + + /** + * Offsets the head span, e.g., "George Bush" in the extent "the president George Bush" + * The offsets are relative to the sentence containing this mention + */ + private Span headTokenSpan; + + /** + * Position of the syntactic head word of this mention, e.g., "Bush" for the head span "George Bush" + * The offset is relative the sentence containing this mention + * Note: use headTokenSpan when sequence tagging entity mentions not this. + * This is meant to be used only for event/relation feature extraction! + */ + private int syntacticHeadTokenPosition; + + private String normalizedName; + + public EntityMention(String objectId, + CoreMap sentence, + Span extentSpan, + Span headSpan, + String type, + String subtype, + String mentionType) { + super(objectId, sentence, extentSpan, type, subtype); + this.mentionType = (mentionType != null ? mentionType.intern() : null); + this.headTokenSpan = headSpan; + this.syntacticHeadTokenPosition = -1; + this.normalizedName = null; + } + + public String getCorefID(){ + return corefID; + } + + public void setCorefID(String id) { + this.corefID = id; + } + public String getMentionType() { return mentionType; } + + public Span getHead() { return headTokenSpan; } + + public int getHeadTokenStart() { + return headTokenSpan.start(); + } + + public int getHeadTokenEnd() { + return headTokenSpan.end(); + } + + public void setHeadTokenSpan(Span s) { + headTokenSpan = s; + } + + public void setHeadTokenPosition(int i) { + this.syntacticHeadTokenPosition = i; + } + + public int getSyntacticHeadTokenPosition() { + return this.syntacticHeadTokenPosition; + } + + public CoreLabel getSyntacticHeadToken() { + List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); + return tokens.get(syntacticHeadTokenPosition); + } + + public Tree getSyntacticHeadTree() { + Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + return 
tree.getLeaves().get(syntacticHeadTokenPosition); + } + + public String getNormalizedName() { return normalizedName; } + public void setNormalizedName(String n) { normalizedName = n; } + + /* + @Override + public boolean equals(Object other) { + if(! (other instanceof EntityMention)) return false; + ExtractionObject o = (ExtractionObject) other; + if(o.objectId.equals(objectId) && o.sentence == sentence) return true; + return false; + } + */ + + @Override + public boolean equals(Object other) { + if(! (other instanceof EntityMention)) return false; + EntityMention otherEnt = (EntityMention) other; + return equals(otherEnt, true); + } + + public boolean headIncludes(EntityMention otherEnt, boolean useSubType) { + if(otherEnt.getSyntacticHeadTokenPosition() >= getHeadTokenStart() && + otherEnt.getSyntacticHeadTokenPosition() < getHeadTokenEnd() && + ((type != null && otherEnt.type != null && type.equals(otherEnt.type)) || (type == null && otherEnt.type == null)) && + (! useSubType || ((subType != null && otherEnt.subType != null && subType.equals(otherEnt.subType)) || (subType == null && otherEnt.subType == null)))){ + return true; + } + return false; + } + + public boolean equals(EntityMention otherEnt, boolean useSubType) { + // + // two mentions are equal if they have the same head span, the same type/subtype, and the same text + // we need this for scoring NER + // + if(textEquals(otherEnt) && labelEquals(otherEnt, useSubType)){ + return true; + } + /* + if(((headTokenSpan != null && headTokenSpan.equals(otherEnt.headTokenSpan)) || + (extentTokenSpan != null && extentTokenSpan.equals(otherEnt.extentTokenSpan))) && + ((type != null && otherEnt.type != null && type.equals(otherEnt.type)) || (type == null && otherEnt.type == null)) && + (! 
useSubType || ((subType != null && otherEnt.subType != null && subType.equals(otherEnt.subType)) || (subType == null && otherEnt.subType == null))) && + AnnotationUtils.getTextContent(sentence, headTokenSpan).equals(AnnotationUtils.getTextContent(otherEnt.getSentence(), otherEnt.headTokenSpan))){ + return true; + } + */ + return false; + } + + /** + * Compares the labels of the two mentions + * @param otherEnt + * @param useSubType + */ + public boolean labelEquals(EntityMention otherEnt, boolean useSubType) { + if(((type != null && otherEnt.type != null && type.equals(otherEnt.type)) || (type == null && otherEnt.type == null)) && + (! useSubType || ((subType != null && otherEnt.subType != null && subType.equals(otherEnt.subType)) || (subType == null && otherEnt.subType == null)))){ + return true; + } + return false; + } + + /** + * Compares the text spans of the two entity mentions + * @param otherEnt + */ + public boolean textEquals(EntityMention otherEnt) { + // + // we attempt three comparisons: + // a) if syntactic heads are defined we consider two texts similar if they have the same syntactic head + // (this is necessary because in NFL we compare entities with different spans but same heads, e.g. 
"49ers" vs "San Francisco 49ers" + // b) if head spans are defined we consider two texts similar if they have the same head span + // c) if extent spans are defined we consider two texts similar if they have the same extent span + // + if(syntacticHeadTokenPosition != -1 && otherEnt.syntacticHeadTokenPosition != -1){ + if(syntacticHeadTokenPosition == otherEnt.syntacticHeadTokenPosition) return true; + return false; + } + + if(headTokenSpan != null && otherEnt.headTokenSpan != null){ + if(headTokenSpan.equals(otherEnt.headTokenSpan)) return true; + return false; + } + + if(extentTokenSpan != null && otherEnt.extentTokenSpan != null){ + if(extentTokenSpan.equals(otherEnt.extentTokenSpan)) return true; + return false; + } + + return false; + } + + /** + * Get the text value of this entity. + * The headTokenSpan MUST be set before calling this method! + */ + public String getValue() { + List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); + // int lastEnd = -1; + StringBuilder sb = new StringBuilder(); + for (int i = headTokenSpan.start(); i < headTokenSpan.end(); i ++){ + CoreLabel token = tokens.get(i); + + // we are not guaranteed to have CharacterOffsets so we can't use them... + /* + Integer start = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); + Integer end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + + if (start != null && end != null) { + if (lastEnd != -1 && !start.equals(lastEnd)) { + sb.append(StringUtils.repeat(" ", start - lastEnd)); + lastEnd = end; + } + } else { + if (lastEnd != -1) sb.append(" "); + lastEnd = 0; + } + */ + if(i > headTokenSpan.start()) sb.append(" "); + + sb.append(token.word()); + + } + + return sb.toString(); + } + + + @Override + public String toString() { + return "EntityMention [type=" + type + + (subType != null ? ", subType=" + subType : "") + + (mentionType != null ? ", mentionType=" + mentionType : "") + + (objectId != null ? 
", objectId=" + objectId : "") + + (headTokenSpan != null ? ", hstart=" + headTokenSpan.start() + ", hend=" + headTokenSpan.end() : "") + + (extentTokenSpan != null ? ", estart=" + extentTokenSpan.start() + ", eend=" + extentTokenSpan.end() : "") + + (syntacticHeadTokenPosition >= 0 ? ", headPosition=" + syntacticHeadTokenPosition : "") + + (headTokenSpan != null ? ", value=\"" + getValue() + "\"" : "") + + (normalizedName != null ? ", normalizedName=\"" + normalizedName + "\"" : "") + + ", corefID=" + corefID + + (typeProbabilities != null ? ", probs=" + probsToString() : "") + + "]"; + } + + static class CompByHead implements Comparator { + public int compare(EntityMention o1, EntityMention o2) { + if(o1.getHeadTokenStart() < o2.getHeadTokenStart()){ + return -1; + } else if(o1.getHeadTokenStart() > o2.getHeadTokenStart()){ + return 1; + } else if(o1.getHeadTokenEnd() < o2.getHeadTokenEnd()) { + return -1; + } else if(o1.getHeadTokenEnd() > o2.getHeadTokenEnd()) { + return 1; + } else { + return 0; + } + } + } + + public static void sortByHeadSpan(List mentions) { + Collections.sort(mentions, new CompByHead()); + } + + private static int MENTION_COUNTER = 0; + + /** + * Creates a new unique id for an entity mention + * @return the new id + */ + public static synchronized String makeUniqueId() { + MENTION_COUNTER ++; + return "EntityMention-" + MENTION_COUNTER; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/EventMention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/EventMention.java new file mode 100644 index 0000000..695811b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/EventMention.java @@ -0,0 +1,222 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import 
edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.IdentityHashSet; + +/** + * + * @author Andrey Gusev + * @author Mihai + * + */ +public class EventMention extends RelationMention { + + private static final long serialVersionUID = 1L; + + /** Modifier argument: used for BioNLP */ + private String eventModification; + + private final ExtractionObject anchor; + + // this is set if we're a subevent + // we might have multiple parents for the same event (at least in the reader before sanity check 4)! + private Set parents; + + public EventMention(String objectId, + CoreMap sentence, + Span span, + String type, + String subtype, + ExtractionObject anchor, + List args, + List argNames) { + super(objectId, sentence, span, type, subtype, args, argNames); + this.anchor = anchor; + this.parents = new IdentityHashSet(); + + // set ourselves as the parent of any EventMentions in our args + for (ExtractionObject arg : args) { + if (arg instanceof EventMention) { + ((EventMention) arg).addParent(this); + } + } + } + + public void resetArguments() { + args = new ArrayList(); + argNames = new ArrayList(); + } + + public void removeFromParents() { + // remove this from the arg list of all parents + for(ExtractionObject parent: parents){ + if(parent instanceof RelationMention){ + ((RelationMention) parent).removeArgument(this, false); + } + } + // reset the parent links + parents.clear(); + } + + public void removeParent(ExtractionObject p) { + parents.remove(p); + } + + public String getModification() { + return eventModification; + } + + public void setModification(String eventModification) { + this.eventModification = eventModification; + } + + public ExtractionObject getAnchor() { + return anchor; + } + + /** + * If this EventMention is a subevent, this will return the parent event. 
+ * + * @return the parent EventMention or null if this isn't a subevent. + */ + public Set getParents() { + return parents; + } + + public ExtractionObject getSingleParent(CoreMap sentence) { + if(getParents().size() > 1){ + Set parents = getParents(); + System.err.println("This event has multiple parents: " + this); + int count = 1; + for(ExtractionObject po: parents){ + System.err.println("PARENT #" + count + ": " + po); + count ++; + } + System.err.println("DOC " + sentence.get(CoreAnnotations.DocIDAnnotation.class)); + System.err.print("SENTENCE:"); + for(CoreLabel t: sentence.get(CoreAnnotations.TokensAnnotation.class)){ + System.err.print(" " + t.word()); + } + System.err.println("EVENTS IN SENTENCE:"); + count = 1; + for(EventMention e: sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class)){ + System.err.println("EVENT #" + count + ": " + e); + count ++; + } + } + + assert(getParents().size() <= 1); + for(ExtractionObject p: getParents()){ + return p; + } + return null; + } + + public void addParent(EventMention p) { + parents.add(p); + } + + @Override + public String toString() { + return "EventMention [objectId=" + getObjectId() + ", type=" + type + ", subType=" + subType + + ", start=" + getExtentTokenStart() + ", end=" + getExtentTokenEnd() + + (anchor != null ? ", anchor=" + anchor : "") + + (args != null ? ", args=" + args : "") + + (argNames != null ? 
", argNames=" + argNames : "") + "]"; + } + + public boolean contains(EventMention e) { + if(this == e) return true; + + for(ExtractionObject a: getArgs()){ + if(a instanceof EventMention){ + EventMention ea = (EventMention) a; + if(ea.contains(e)){ + return true; + } + } + } + + return false; + } + + public void addArg(ExtractionObject a, String an, boolean discardSameArgDifferentName) { + // only add if not already an argument + for(int i = 0; i < getArgs().size(); i ++){ + ExtractionObject myArg = getArg(i); + String myArgName = getArgNames().get(i); + if(myArg == a){ + if(myArgName.equals(an)){ + // safe to discard this arg: we already have it with the same name + return; + } else { + logger.info("Trying to add one argument: " + a + " with name " + an + " when this already exists with a different name: " + this + " in sentence: " + getSentence().get(CoreAnnotations.TextAnnotation.class)); + if(discardSameArgDifferentName) return; + } + } + } + + this.args.add(a); + this.argNames.add(an); + if(a instanceof EventMention){ + ((EventMention) a).addParent(this); + } + } + + @Override + public void setArgs(List args) { + this.args = args; + // set ourselves as the parent of any EventMentions in our args + for (ExtractionObject arg : args) { + if (arg instanceof EventMention) { + ((EventMention) arg).addParent(this); + } + } + } + + public void addArgs(List args, List argNames, boolean discardSameArgDifferentName){ + if(args == null) return; + assert (args.size() == argNames.size()); + for(int i = 0; i < args.size(); i ++){ + addArg(args.get(i), argNames.get(i), discardSameArgDifferentName); + } + } + + public void mergeEvent(EventMention e, boolean discardSameArgDifferentName){ + // merge types if necessary + String oldType = type; + type = ExtractionObject.concatenateTypes(type, e.getType()); + if(! 
type.equals(oldType)){ + // This is not important: we use anchor types in the parser, not event types + // This is done just for completeness of code + logger.fine("Type changed from " + oldType + " to " + type + " during check 3 merge."); + } + + // add e's arguments + for(int i = 0; i < e.getArgs().size(); i ++){ + ExtractionObject a = e.getArg(i); + String an = e.getArgNames().get(i); + // TODO: we might need more complex cycle detection than just contains()... + if(a instanceof EventMention && ((EventMention) a).contains(this)){ + logger.info("Found event cycle during merge between e1 " + this + " and e2 " + e); + } else { + // remove e from a's parents + if(a instanceof EventMention) ((EventMention) a).removeParent(e); + // add a as an arg to this + addArg(a, an, discardSameArgDifferentName); + } + } + + // remove e's arguments. they are now attached to this, so we don't want them moved around during removeEvents + e.resetArguments(); + // remove e from its parent(s) to avoid using this argument in other merges of the parent + e.removeFromParents(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/ExtractionObject.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/ExtractionObject.java new file mode 100644 index 0000000..4c2ac36 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/ExtractionObject.java @@ -0,0 +1,264 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Set; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.stats.Counters; +import edu.stanford.nlp.util.ArrayCoreMap; +import 
edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; + +/** + * Represents any object that can be extracted - entity, relation, event + * + * @author Andrey Gusev + * @author Mihai + * + */ +public class ExtractionObject implements Serializable { + + private static final long serialVersionUID = 1L; + + /** Unique identifier of the object in its document */ + protected final String objectId; + + /** + * Sentence that contains this object + * This assumes that each extraction object is intra-sentential (true in ACE, Roth, BioNLP, MR) + */ + protected CoreMap sentence; + + /** Type of this mention, e.g., GPE */ + protected String type; + + /** Subtype, if available, e.g., GPE.CITY */ + protected final String subType; + + /** + * Maximal token span relevant for this object, e.g., the largest NP for an entity mention + * The offsets are relative to the sentence that contains this object + */ + protected Span extentTokenSpan; + + /** This stores any optional attributes of ExtractionObjects */ + protected CoreMap attributeMap; + + /** + * Probabilities associated with this object + * We report probability values for each possible type for this object + */ + protected Counter typeProbabilities; + + public ExtractionObject(String objectId, + CoreMap sentence, + Span span, + String type, + String subtype) { + this.objectId = objectId; + this.sentence = sentence; + this.extentTokenSpan = span; + this.type = type.intern(); + this.subType = (subtype != null ? 
subtype.intern() : null); + this.attributeMap = null; + } + + public String getObjectId() { + return objectId; + } + + public String getDocumentId() { + return sentence.get(CoreAnnotations.DocIDAnnotation.class); + } + + public CoreMap getSentence() { + return sentence; + } + + public void setSentence(CoreMap sent) { + this.sentence = sent; + } + + public int getExtentTokenStart() { return extentTokenSpan.start(); } + + public int getExtentTokenEnd() { return extentTokenSpan.end(); } + + public Span getExtent() { return extentTokenSpan; } + + public void setExtent(Span s) { + extentTokenSpan = s; + } + + public String getExtentString() { + List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); + StringBuilder sb = new StringBuilder(); + for (int i = extentTokenSpan.start(); i < extentTokenSpan.end(); i ++){ + CoreLabel token = tokens.get(i); + if(i > extentTokenSpan.start()) sb.append(" "); + sb.append(token.word()); + } + return sb.toString(); + } + + public String getType() { return type; } + + public String getSubType() { return subType; } + + @Override + public boolean equals(Object other) { + if(! 
(other instanceof ExtractionObject)) return false; + ExtractionObject o = (ExtractionObject) other; + if(o.objectId.equals(objectId) && o.sentence == sentence) return true; + return false; + } + + static class CompByExtent implements Comparator { + public int compare(ExtractionObject o1, ExtractionObject o2) { + if(o1.getExtentTokenStart() < o2.getExtentTokenStart()){ + return -1; + } else if(o1.getExtentTokenStart() > o2.getExtentTokenStart()){ + return 1; + } else if(o1.getExtentTokenEnd() < o2.getExtentTokenEnd()) { + return -1; + } else if(o1.getExtentTokenEnd() > o2.getExtentTokenEnd()) { + return 1; + } else { + return 0; + } + } + } + + public static void sortByExtent(List objects) { + Collections.sort(objects, new CompByExtent()); + } + + /** + * Returns the smallest span that covers the extent of all these objects + * @param objs + */ + public static Span getSpan(ExtractionObject ... objs) { + int left = Integer.MAX_VALUE; + int right = Integer.MIN_VALUE; + for(int i = 0; i < objs.length; i ++){ + if(objs[i].getExtentTokenStart() < left){ + left = objs[i].getExtentTokenStart(); + } + if(objs[i].getExtentTokenEnd() > right) { + right = objs[i].getExtentTokenEnd(); + } + } + assert(left < Integer.MAX_VALUE); + assert(right > Integer.MIN_VALUE); + return new Span(left, right); + } + + /** + * Returns the text corresponding to the extent of this object + */ + public String getValue() { + return getFullValue(); + } + + /** + * Always returns the text corresponding to the extent of this object, even when + * getValue is overridden by subclass. 
+ */ + final public String getFullValue() { + List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); + StringBuilder sb = new StringBuilder(); + if(tokens != null && extentTokenSpan != null){ + for(int i = extentTokenSpan.start(); i < extentTokenSpan.end(); i ++){ + if(i > extentTokenSpan.start()) sb.append(" "); + sb.append(tokens.get(i).word()); + } + } + return sb.toString(); + } + + public void setType(String t) { + this.type = t; + } + + private static final String TYPE_SEP = "/"; + + /** + * Concatenates two types + * @param t1 + * @param t2 + */ + public static String concatenateTypes(String t1, String t2) { + String [] t1Toks = t1.split(TYPE_SEP); + String [] t2Toks = t2.split(TYPE_SEP); + Set uniqueTypes = Generics.newHashSet(); + for(String t: t1Toks) uniqueTypes.add(t); + for(String t: t2Toks) uniqueTypes.add(t); + String [] types = new String[uniqueTypes.size()]; + uniqueTypes.toArray(types); + Arrays.sort(types); + StringBuffer os = new StringBuffer(); + for(int i = 0; i < types.length; i ++){ + if(i > 0) os.append(TYPE_SEP); + os.append(types[i]); + } + return os.toString(); + } + + public CoreMap attributeMap() { + if(attributeMap == null){ + attributeMap = new ArrayCoreMap(); + } + return attributeMap; + } + + public void setTypeProbabilities(Counter probs) { + typeProbabilities = probs; + } + public Counter getTypeProbabilities() { + return typeProbabilities; + } + String probsToString() { + List> sorted = Counters.toDescendingMagnitudeSortedListWithCounts(typeProbabilities); + StringBuffer os = new StringBuffer(); + os.append("{"); + boolean first = true; + for(Pair lv: sorted) { + if(! first) os.append("; "); + os.append(lv.first + ", " + lv.second); + first = false; + } + os.append("}"); + return os.toString(); + } + + /** + * Returns true if it's worth saving/printing this object + * This happens in two cases: + * 1. The type of the object is not nilLabel + * 2. 
The type of the object is nilLabel but the second ranked label is within the given beam (0 -- 100) of the first choice + * @param beam + * @param nilLabel + */ + public boolean printableObject(double beam, String nilLabel) { + List> sorted = Counters.toDescendingMagnitudeSortedListWithCounts(typeProbabilities); + + // first choice not nil + if(sorted.size() > 0 && ! sorted.get(0).first.equals(nilLabel)){ + return true; + } + + // first choice is nil, but second is within beam + if(sorted.size() > 1 && sorted.get(0).first.equals(nilLabel) && beam > 0 && + 100.0 * (sorted.get(0).second - sorted.get(1).second) < beam){ + return true; + } + + return false; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/MachineReadingAnnotations.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/MachineReadingAnnotations.java new file mode 100644 index 0000000..84b9752 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/MachineReadingAnnotations.java @@ -0,0 +1,104 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.util.List; + +import edu.stanford.nlp.ling.CoreAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.util.ErasureUtils; + +/** + * Annotations specific to the machinereading data structures + * @author Mihai + * + */ +public class MachineReadingAnnotations { + private MachineReadingAnnotations() {} // only static members + + /** + * The CoreMap key for getting the entity mentions corresponding to a sentence. + * + * This key is typically set on sentence annotations. 
+ */ + public static class EntityMentionsAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.>>uncheckedCast(List.class); + } + } + + /** + * The CoreMap key for getting the relation mentions corresponding to a sentence. + * + * This key is typically set on sentence annotations. + */ + public static class RelationMentionsAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.>>uncheckedCast(List.class); + } + } + + /** + * The CoreMap key for getting the event mentions corresponding to a sentence. + * + * This key is typically set on sentence annotations. + */ + public static class EventMentionsAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.>>uncheckedCast(List.class); + } + } + + /** + * The CoreMap key for getting the document id of a given sentence. + * + * This key is typically set on sentence annotations. + * + * NOTE: This is a trivial subclass of CoreAnnotations.DocIDAnnotation + */ + @Deprecated + public static class DocumentIdAnnotation extends CoreAnnotations.DocIDAnnotation { + public Class getType() { + return String.class; + } + } + public static class DocumentDirectoryAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The CoreMap key for getting the syntactic dependencies of a sentence. + * Note: this is no longer used, but it appears in sentences cached during KBP 2010 + * + * This key is typically set on sentence annotations. 
+ */ + @Deprecated + public static class DependencyAnnotation implements CoreAnnotation { + public Class getType() { + return SemanticGraph.class; + } + } + + /** + * Marks trigger words for relation extraction + * @author Mihai + * + */ + public static class TriggerAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * Marks words as belonging to a list of either male or female names + * + */ + public static class GenderAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/RelationMention.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/RelationMention.java new file mode 100644 index 0000000..5d53395 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/RelationMention.java @@ -0,0 +1,309 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.logging.Logger; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.IdentityHashSet; + +/** + * Each relation has a type and set of arguments + * + * @author Andrey Gusev + * @author Mihai + * @author David McClosky + * + */ +public class RelationMention extends ExtractionObject { + + private static final long serialVersionUID = 8962289597607972827L; + public static final Logger logger = Logger.getLogger(RelationMention.class.getName()); + + // index of the next unique id + private static int MENTION_COUNTER = 0; + + public static final String UNRELATED = "_NR"; + + /** + * List of argument names in this relation + */ + protected List argNames; + + /** + * List of arguments in this relation + * If 
unnamed, arguments MUST be stored in semantic order, e.g., ARG0 must be a person in a employed-by relation + */ + protected List args; + + /** + * A signature for a given relation mention, e.g., a concatenation of type and argument strings + * This is used in KBP, where we merge all RelationMentions corresponding to the same abstract relation + */ + protected String signature; + + public RelationMention(String objectId, + CoreMap sentence, + Span span, + String type, + String subtype, + List args) { + super(objectId, sentence, span, type, subtype); + this.args = args; + this.argNames = null; + this.signature = null; + } + + public RelationMention(String objectId, + CoreMap sentence, + Span span, + String type, + String subtype, + List args, + List argNames) { + super(objectId, sentence, span, type, subtype); + this.args = args; + this.argNames = argNames; + this.signature = null; + } + + public RelationMention(String objectId, + CoreMap sentence, + Span span, + String type, + String subtype, + ExtractionObject... args) { + this(objectId, sentence, span, type, subtype, Arrays.asList(args)); + } + + public boolean argsMatch(RelationMention rel) { + return argsMatch(rel.getArgs()); + } + + public boolean argsMatch(ExtractionObject... inputArgs) { + return argsMatch(Arrays.asList(inputArgs)); + } + + /** + * Verifies if the two sets of arguments match + * @param inputArgs List of arguments + */ + public boolean argsMatch(List inputArgs) { + if (inputArgs.size() != this.args.size()) { + return false; + } + + for (int ind = 0; ind < this.args.size(); ind++) { + ExtractionObject a1 = this.args.get(ind); + ExtractionObject a2 = inputArgs.get(ind); + if(! 
a1.equals(a2)) return false; + } + + return true; + } + + public List getArgs() { + return Collections.unmodifiableList(this.args); + } + public void setArgs(List args) { + this.args = args; + } + + /** + * Fetches the arguments of this relation that are entity mentions + * @return List of entity-mention args sorted in semantic order + */ + public List getEntityMentionArgs() { + List ents = new ArrayList(); + for(ExtractionObject o: args) { + if(o instanceof EntityMention){ + ents.add((EntityMention) o); + } + } + return ents; + } + + public ExtractionObject getArg(int argpos) { + return this.args.get(argpos); + } + + public List getArgNames() { + return argNames; + } + + public void setArgNames(List argNames) { + this.argNames = argNames; + } + + public void addArg(ExtractionObject a) { + this.args.add(a); + } + + public boolean isNegativeRelation() { + return isUnrelatedLabel(getType()); + } + + /** + * Find the left-most position of an argument's syntactic head + */ + public int getFirstSyntacticHeadPosition() { + int pos = Integer.MAX_VALUE; + for (ExtractionObject obj : args) { + if(obj instanceof EntityMention){ + EntityMention em = (EntityMention) obj; + if(em.getSyntacticHeadTokenPosition() < pos) { + pos = em.getSyntacticHeadTokenPosition(); + } + } + } + if(pos != Integer.MAX_VALUE) return pos; + return -1; + } + + /** + * Find the right-most position of an argument's syntactic head + */ + public int getLastSyntacticHeadPosition() { + int pos = Integer.MIN_VALUE; + for (ExtractionObject obj : args) { + if(obj instanceof EntityMention){ + EntityMention em = (EntityMention) obj; + if(em.getSyntacticHeadTokenPosition() > pos) { + pos = em.getSyntacticHeadTokenPosition(); + } + } + } + if(pos != Integer.MIN_VALUE) return pos; + return -1; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("RelationMention [type=" + type + + (subType != null ? 
", subType=" + subType : "") + + ", start=" + getExtentTokenStart() + ", end=" + getExtentTokenEnd()); + if(typeProbabilities != null){ + sb.append(", " + probsToString()); + } + + if(args != null){ + for(int i = 0; i < args.size(); i ++){ + sb.append("\n\t"); + if(argNames != null) sb.append(argNames.get(i) + " "); + sb.append(args.get(i)); + } + } + sb.append("\n]"); + return sb.toString(); + } + + /** + * Replaces the arguments of this relations with equivalent mentions from the predictedMentions list + * This works only for arguments that are EntityMention! + * @param predictedMentions + */ + public boolean replaceGoldArgsWithPredicted(List predictedMentions) { + List newArgs = new ArrayList(); + for(ExtractionObject arg: args){ + if(! (arg instanceof EntityMention)){ + continue; + } + EntityMention goldEnt = (EntityMention) arg; + EntityMention newArg = null; + for(EntityMention pred: predictedMentions){ + if(goldEnt.textEquals(pred)){ + newArg = pred; + break; + } + } + if(newArg != null){ + newArgs.add(newArg); + logger.info("Replacing relation argument: [" + goldEnt + "] with predicted mention [" + newArg + "]"); + } else { + /* + logger.info("Failed to match relation argument: " + goldEnt); + return false; + */ + newArgs.add(goldEnt); + predictedMentions.add(goldEnt); + logger.info("Failed to match relation argument, so keeping gold: " + goldEnt); + } + } + this.args = newArgs; + return true; + } + + public void removeArgument(ExtractionObject argToRemove, boolean removeParent) { + Set thisEvent = new IdentityHashSet(); + thisEvent.add(argToRemove); + removeArguments(thisEvent, removeParent); + } + + public void removeArguments(Set argsToRemove, boolean removeParent) { + List newArgs = new ArrayList(); + List newArgNames = new ArrayList(); + for(int i = 0; i < args.size(); i ++){ + ExtractionObject a = args.get(i); + String n = argNames.get(i); + if(! 
argsToRemove.contains(a)){ + newArgs.add(a); + newArgNames.add(n); + } else { + if(a instanceof EventMention && removeParent){ + ((EventMention) a).removeParent(this); + } + } + } + args = newArgs; + argNames = newArgNames; + } + + public boolean printableObject(double beam) { + return printableObject(beam, RelationMention.UNRELATED); + } + + public void setSignature(String s) { signature = s; } + public String getSignature() { return signature; } + + /* + * Static utility functions + */ + + public static Collection filterUnrelatedRelations(Collection relationMentions) { + Collection filtered = new ArrayList(); + for (RelationMention relation : relationMentions) { + if (!relation.getType().equals(UNRELATED)) { + filtered.add(relation); + } + } + return filtered; + } + + /** + * Creates a new unique id for a relation mention + * @return the new id + */ + public static synchronized String makeUniqueId() { + MENTION_COUNTER++; + return "RelationMention-" + MENTION_COUNTER; + } + + public static RelationMention createUnrelatedRelation(RelationMentionFactory factory, ExtractionObject ... args) { + return createUnrelatedRelation(factory, "",args); + } + + private static RelationMention createUnrelatedRelation(RelationMentionFactory factory, String type, ExtractionObject ... 
args) { + return factory.constructRelationMention( + RelationMention.makeUniqueId(), args[0].getSentence(), ExtractionObject.getSpan(args), + RelationMention.UNRELATED + type, null, Arrays.asList(args), null); + } + + public static boolean isUnrelatedLabel(String label) { + return label.startsWith(UNRELATED); + } +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/RelationMentionFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/RelationMentionFactory.java new file mode 100644 index 0000000..455eed9 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/RelationMentionFactory.java @@ -0,0 +1,41 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.io.Serializable; +import java.util.List; + +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.CoreMap; + +public class RelationMentionFactory implements Serializable { + private static final long serialVersionUID = -662846276208839290L; + + /** + * Always use this method to construct RelationMentions + * Other factories that inherit from this (e.g., NFLRelationFactory) may override this + * @param objectId + * @param sentence + * @param span + * @param type + * @param subtype + * @param args + * @param probs + */ + public RelationMention constructRelationMention( + String objectId, + CoreMap sentence, + Span span, + String type, + String subtype, + List args, + Counter probs) { + RelationMention relation = new RelationMention( + objectId, + sentence, + span, + type, + subtype, + args); + relation.setTypeProbabilities(probs); + return relation; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/Span.java 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/Span.java new file mode 100644 index 0000000..c9403b1 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/machinereading/structure/Span.java @@ -0,0 +1,121 @@ +package edu.stanford.nlp.ie.machinereading.structure; + +import java.io.Serializable; + +import edu.stanford.nlp.util.Pair; + +/** + * Stores the offsets for a span of text + * Offsets may indicate either token or byte positions + * Start is inclusive, end is exclusive + * @author Mihai + */ +public class Span implements Serializable { + private static final long serialVersionUID = -3861451490217976693L; + + private int start; + private int end; + + /** + * This assumes that s <= e. Use fromValues if you can't guarantee this. + */ + public Span(int s, int e) { + start = s; + end = e; + } + + /** + * Creates a span that encloses all spans in the argument list. Behavior is undefined if given no arguments. + */ + public Span(Span... spans) { + this(Integer.MAX_VALUE, Integer.MIN_VALUE); + + for (Span span : spans) { + expandToInclude(span); + } + } + + /** + * Safe way to construct Spans if you're not sure which value is higher. + */ + public static Span fromValues(int val1, int val2) { + if (val1 <= val2) { + return new Span(val1, val2); + } else { + return new Span(val2, val1); + } + } + + public int start() { return start; } + public int end() { return end; } + + public void setStart(int s) { start = s; } + public void setEnd(int e) { end = e; } + + @Override + public boolean equals(Object other) { + if(! 
(other instanceof Span)) return false; + Span otherSpan = (Span) other; + if(start == otherSpan.start && end == otherSpan.end){ + return true; + } + return false; + } + + @Override + public int hashCode() { + return (new Pair(start,end)).hashCode(); + } + + @Override + public String toString() { + return "[" + start + "," + end + ")"; + } + + public void expandToInclude(Span otherSpan) { + if (otherSpan.start() < start) { + setStart(otherSpan.start()); + } + if (otherSpan.end() > end) { + setEnd(otherSpan.end()); + } + } + + /** + * Returns true if this span contains otherSpan. Endpoints on spans may match. + */ + public boolean contains(Span otherSpan) { + return this.start <= otherSpan.start && otherSpan.end <= this.end; + } + + /** + * Returns true if i is inside this span. Note that the start is inclusive and the end is exclusive. + */ + public boolean contains(int i) { + return this.start <= i && i < this.end; + } + + /** + * Returns true if this span ends before the otherSpan starts. + * + * @throws IllegalArgumentException if either span contains the other span + */ + public boolean isBefore(Span otherSpan) { + if (this.contains(otherSpan) || otherSpan.contains(this)) { + throw new IllegalArgumentException("Span " + toString() + " contains otherSpan " + otherSpan + " (or vice versa)"); + } + return this.end <= otherSpan.start; + } + + /** + * Returns true if this span starts after the otherSpan's end. 
+ * + * @throws IllegalArgumentException if either span contains the other span + */ + public boolean isAfter(Span otherSpan) { + if (this.contains(otherSpan) || otherSpan.contains(this)) { + throw new IllegalArgumentException("Span " + toString() + " contains otherSpan " + otherSpan + " (or vice versa)"); + } + return this.start >= otherSpan.end; + } +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/ner/CMMClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/ner/CMMClassifier.java new file mode 100644 index 0000000..b1e39d5 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/ner/CMMClassifier.java @@ -0,0 +1,1611 @@ +// CMMClassifier -- a probabilistic (CMM) Named Entity Recognizer +// Copyright (c) 2002-2006 The Board of Trustees of +// The Leland Stanford Junior University. All Rights Reserved. +// +// This program has been made available for research purposes only. +// Please do not further distribute it. +// Commercial development of the software is not to be undertaken without +// prior agreement from Stanford University. +// This program is not open source nor is it in the public domain. 
+// +// For information contact: +// Christopher Manning +// Dept of Computer Science, Gates 1A +// Stanford CA 94305-9010 +// USA +// manning@cs.stanford.edu + +package edu.stanford.nlp.ie.ner; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.regex.Pattern; + +import edu.stanford.nlp.classify.Dataset; +import edu.stanford.nlp.classify.LinearClassifier; +import edu.stanford.nlp.classify.LinearClassifierFactory; +import edu.stanford.nlp.classify.LogPrior; +import edu.stanford.nlp.classify.NBLinearClassifierFactory; +import edu.stanford.nlp.classify.ProbabilisticClassifier; +import edu.stanford.nlp.classify.SVMLightClassifierFactory; +import edu.stanford.nlp.ie.AbstractSequenceClassifier; +import edu.stanford.nlp.ie.NERFeatureFactory; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.BasicDatum; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.Datum; +import edu.stanford.nlp.ling.Document; +import edu.stanford.nlp.ling.HasTag; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.WordTag; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.math.ArrayMath; +import edu.stanford.nlp.math.SloppyMath; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.sequences.BeamBestSequenceFinder; +import edu.stanford.nlp.sequences.Clique; +import edu.stanford.nlp.sequences.DocumentReaderAndWriter; +import edu.stanford.nlp.sequences.ExactBestSequenceFinder; +import edu.stanford.nlp.sequences.FeatureFactory; +import 
edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.sequences.SequenceModel; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.HashIndex; +import edu.stanford.nlp.util.Index; +import edu.stanford.nlp.util.PaddedList; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; + + +/** + * Does Sequence Classification using a Conditional Markov Model. + * It could be used for other purposes, but the provided features + * are aimed at doing Named Entity Recognition. + * The code has functionality for different document encodings, but when + * using the standard ColumnDocumentReader, + * input files are expected to + * be one word per line with the columns indicating things like the word, + * POS, chunk, and class. + *

    + * Typical usage + *

    For running a trained model with a provided serialized classifier:

    + * + * java -server -mx1000m edu.stanford.nlp.ie.ner.CMMClassifier -loadClassifier + * conll.ner.gz -textFile samplesentences.txt + *

    + * When specifying all parameters in a properties file (train, test, or + * runtime):

    + * + * java -mx1000m edu.stanford.nlp.ie.ner.CMMClassifier -prop propFile + *

    + * To train and test a model from the command line:

    + * java -mx1000m edu.stanford.nlp.ie.ner.CMMClassifier + * -trainFile trainFile -testFile testFile -goodCoNLL > output + *

    + * Features are defined by a {@link FeatureFactory}; the + * {@link FeatureFactory} which is used by default is + * {@link NERFeatureFactory}, and you should look there for feature templates. + * Features are specified either by a Properties file (which is the + * recommended method) or on the command line. The features are read into + * a {@link SeqClassifierFlags} object, which the + * user need not know much about, unless one wishes to add new features. + *

    + * CMMClassifier may also be used programmatically. When creating a new instance, you + * must specify a properties file. The other way to get a CMMClassifier is to + * deserialize one via {@link CMMClassifier#getClassifier(String)}, which returns a + * deserialized classifier. You may then tag sentences using either the assorted + * test or testSentence methods. + * + * @author Dan Klein + * @author Jenny Finkel + * @author Christopher Manning + * @author Shipra Dingare + * @author Huy Nguyen + * @author Sarah Spikes (sdspikes@cs.stanford.edu) - cleanup and filling in types + */ + +public class CMMClassifier extends AbstractSequenceClassifier { + + private ProbabilisticClassifier classifier; + + /** The set of empirically legal label sequences (of length (order) at most + * flags.maxLeft). Used to filter valid class sequences if + * useObservedSequencesOnly is set. + */ + Set> answerArrays; + + /** Default place to look in Jar file for classifier. */ + public static final String DEFAULT_CLASSIFIER = "/classifiers/ner-eng-ie.cmm-3-all2006.ser.gz"; + + protected CMMClassifier() { + super(new SeqClassifierFlags()); + } + + public CMMClassifier(Properties props) { + super(props); + } + + + /** + * Returns the Set of entities recognized by this Classifier. + * + * @return The Set of entities recognized by this Classifier. + */ + public Set getTags() { + Set tags = Generics.newHashSet(classIndex.objectsList()); + tags.remove(flags.backgroundSymbol); + return tags; + } + + /** + * Classify a {@link List} of {@link CoreLabel}s. + * + * @param document A {@link List} of {@link CoreLabel}s + * to be classified. + */ + @Override + public List classify(List document) { + if (flags.useSequences) { + classifySeq(document); + } else { + classifyNoSeq(document); + } + return document; + } + + /** + * Classify a List of {@link CoreLabel}s without using sequence information + * (i.e. no Viterbi algorithm, just distribution over next class).
+ * + * @param document a List of {@link CoreLabel}s to be classified + */ + private void classifyNoSeq(List document) { + if (flags.useReverse) { + Collections.reverse(document); + } + + if (flags.lowerNewgeneThreshold) { + // Used to raise recall for task 1B + System.err.println("Using NEWGENE threshold: " + flags.newgeneThreshold); + for (int i = 0, docSize = document.size(); i < docSize; i++) { + CoreLabel wordInfo = document.get(i); + Datum d = makeDatum(document, i, featureFactory); + Counter scores = classifier.scoresOf(d); + //String answer = BACKGROUND; + String answer = flags.backgroundSymbol; + // HN: The evaluation of scoresOf seems to result in some + // kind of side effect. Specifically, the symptom is that + // if scoresOf is not evaluated at every position, the + // answers are different + if ("NEWGENE".equals(wordInfo.get(CoreAnnotations.GazAnnotation.class))) { + for (String label : scores.keySet()) { + if ("G".equals(label)) { + System.err.println(wordInfo.word() + ':' + scores.getCount(label)); + if (scores.getCount(label) > flags.newgeneThreshold) { + answer = label; + } + } + } + } + wordInfo.set(CoreAnnotations.AnswerAnnotation.class, answer); + } + } else { + for (int i = 0, listSize = document.size(); i < listSize; i++) { + String answer = classOf(document, i); + CoreLabel wordInfo = document.get(i); + //System.err.println("XXX answer for " + + // wordInfo.word() + " is " + answer); + wordInfo.set(CoreAnnotations.AnswerAnnotation.class, answer); + } + if (flags.justify && (classifier instanceof LinearClassifier)) { + LinearClassifier lc = (LinearClassifier) classifier; + for (int i = 0, lsize = document.size(); i < lsize; i++) { + CoreLabel lineInfo = document.get(i); + System.err.print("@@ Position " + i + ": "); + System.err.println(lineInfo.word() + " chose " + lineInfo.get(CoreAnnotations.AnswerAnnotation.class)); + lc.justificationOf(makeDatum(document, i, featureFactory)); + } + } + } + if (flags.useReverse) { + 
Collections.reverse(document); + } + } + + /** + * Returns the most likely class for the word at the given position. + */ + protected String classOf(List lineInfos, int pos) { + Datum d = makeDatum(lineInfos, pos, featureFactory); + return classifier.classOf(d); + } + + /** + * Returns the log conditional likelihood of the given dataset. + * + * @return The log conditional likelihood of the given dataset. + */ + public double loglikelihood(List lineInfos) { + double cll = 0.0; + + for (int i = 0; i < lineInfos.size(); i++) { + Datum d = makeDatum(lineInfos, i, featureFactory); + Counter c = classifier.logProbabilityOf(d); + + double total = Double.NEGATIVE_INFINITY; + for (String s : c.keySet()) { + total = SloppyMath.logAdd(total, c.getCount(s)); + } + cll -= c.getCount(d.label()) - total; + } + // quadratic prior + // HN: TODO: add other priors + + if (classifier instanceof LinearClassifier) { + double sigmaSq = flags.sigma * flags.sigma; + LinearClassifier lc = (LinearClassifier)classifier; + for (String feature: lc.features()) { + for (String classLabel: classIndex) { + double w = lc.weight(feature, classLabel); + cll += w * w / 2.0 / sigmaSq; + } + } + } + return cll; + } + + @Override + public SequenceModel getSequenceModel(List document) { + //System.err.println(flags.useReverse); + + if (flags.useReverse) { + Collections.reverse(document); + } + + // cdm Aug 2005: why is this next line needed? Seems really ugly!!! [2006: it broke things! removed] + // document.add(0, new CoreLabel()); + + SequenceModel ts = new Scorer(document, + classIndex, + this, + (!flags.useTaggySequences ? (flags.usePrevSequences ? 1 : 0) : flags.maxLeft), + (flags.useNextSequences ? 1 : 0), + answerArrays); + + return ts; + } + + /** + * Classify a List of {@link CoreLabel}s using sequence information + * (i.e. Viterbi or Beam Search). 
+ * + * @param document A List of {@link CoreLabel}s to be classified + */ + private void classifySeq(List document) { + + if (document.isEmpty()) { + return; + } + + SequenceModel ts = getSequenceModel(document); + + // TagScorer ts = new PrevOnlyScorer(document, tagIndex, this, (!flags.useTaggySequences ? (flags.usePrevSequences ? 1 : 0) : flags.maxLeft), 0, answerArrays); + + int[] tags; + //System.err.println("***begin test***"); + if (flags.useViterbi) { + ExactBestSequenceFinder ti = new ExactBestSequenceFinder(); + tags = ti.bestSequence(ts); + } else { + BeamBestSequenceFinder ti = new BeamBestSequenceFinder(flags.beamSize, true, true); + tags = ti.bestSequence(ts, document.size()); + } + //System.err.println("***end test***"); + + // used to improve recall in task 1b + if (flags.lowerNewgeneThreshold) { + System.err.println("Using NEWGENE threshold: " + flags.newgeneThreshold); + + int[] copy = new int[tags.length]; + System.arraycopy(tags, 0, copy, 0, tags.length); + + // for each sequence marked as NEWGENE in the gazette + // tag the entire sequence as NEWGENE and sum the score + // if the score is greater than newgeneThreshold, accept + int ngTag = classIndex.indexOf("G"); + //int bgTag = classIndex.indexOf(BACKGROUND); + int bgTag = classIndex.indexOf(flags.backgroundSymbol); + + for (int i = 0, dSize = document.size(); i < dSize; i++) { + CoreLabel wordInfo =document.get(i); + + if ("NEWGENE".equals(wordInfo.get(CoreAnnotations.GazAnnotation.class))) { + int start = i; + int j; + for (j = i; j < document.size(); j++) { + wordInfo = document.get(j); + if (!"NEWGENE".equals(wordInfo.get(CoreAnnotations.GazAnnotation.class))) { + break; + } + } + int end = j; + //int end = i + 1; + + int winStart = Math.max(0, start - 4); + int winEnd = Math.min(tags.length, end + 4); + // clear a window around the sequences + for (j = winStart; j < winEnd; j++) { + copy[j] = bgTag; + } + + // score as nongene + double bgScore = 0.0; + for (j = start; j < end; j++) { + 
double[] scores = ts.scoresOf(copy, j); + scores = Scorer.recenter(scores); + bgScore += scores[bgTag]; + } + + // first pass, compute all of the scores + ClassicCounter> prevScores = new ClassicCounter>(); + for (j = start; j < end; j++) { + // clear the sequence + for (int k = start; k < end; k++) { + copy[k] = bgTag; + } + + // grow the sequence from j until the end + for (int k = j; k < end; k++) { + copy[k] = ngTag; + // score the sequence + double ngScore = 0.0; + for (int m = start; m < end; m++) { + double[] scores = ts.scoresOf(copy, m); + scores = Scorer.recenter(scores); + ngScore += scores[tags[m]]; + } + prevScores.incrementCount(new Pair(Integer.valueOf(j), Integer.valueOf(k)), ngScore - bgScore); + } + } + for (j = start; j < end; j++) { + // grow the sequence from j until the end + for (int k = j; k < end; k++) { + double score = prevScores.getCount(new Pair(Integer.valueOf(j), Integer.valueOf(k))); + Pair al = new Pair(Integer.valueOf(j - 1), Integer.valueOf(k)); // adding a word to the left + Pair ar = new Pair(Integer.valueOf(j), Integer.valueOf(k + 1)); // adding a word to the right + Pair sl = new Pair(Integer.valueOf(j + 1), Integer.valueOf(k)); // subtracting word from left + Pair sr = new Pair(Integer.valueOf(j), Integer.valueOf(k - 1)); // subtracting word from right + + // make sure the score is greater than all its neighbors (one add or subtract) + if (score >= flags.newgeneThreshold && (!prevScores.containsKey(al) || score > prevScores.getCount(al)) && (!prevScores.containsKey(ar) || score > prevScores.getCount(ar)) && (!prevScores.containsKey(sl) || score > prevScores.getCount(sl)) && (!prevScores.containsKey(sr) || score > prevScores.getCount(sr))) { + StringBuilder sb = new StringBuilder(); + wordInfo = document.get(j); + String docId = wordInfo.get(CoreAnnotations.IDAnnotation.class); + String startIndex = wordInfo.get(CoreAnnotations.PositionAnnotation.class); + wordInfo = document.get(k); + String endIndex = 
wordInfo.get(CoreAnnotations.PositionAnnotation.class); + for (int m = j; m <= k; m++) { + wordInfo = document.get(m); + sb.append(wordInfo.word()); + sb.append(' '); + } + /*System.err.println(sb.toString()+"score:"+score+ + " al:"+prevScores.getCount(al)+ + " ar:"+prevScores.getCount(ar)+ + " sl:"+prevScores.getCount(sl)+" sr:"+ prevScores.getCount(sr));*/ + System.out.println(docId + '|' + startIndex + ' ' + endIndex + '|' + sb.toString().trim()); + } + } + } + + // restore the original tags + for (j = winStart; j < winEnd; j++) { + copy[j] = tags[j]; + } + i = end; + } + } + } + + for (int i = 0, docSize = document.size(); i < docSize; i++) { + CoreLabel lineInfo = document.get(i); + String answer = classIndex.get(tags[i]); + lineInfo.set(CoreAnnotations.AnswerAnnotation.class, answer); + } + + if (flags.justify && classifier instanceof LinearClassifier) { + LinearClassifier lc = (LinearClassifier) classifier; + if (flags.dump) { + lc.dump(); + } + for (int i = 0, docSize = document.size(); i < docSize; i++) { + CoreLabel lineInfo = document.get(i); + System.err.print("@@ Position is: " + i + ": "); + System.err.println(lineInfo.word() + ' ' + lineInfo.get(CoreAnnotations.AnswerAnnotation.class)); + lc.justificationOf(makeDatum(document, i, featureFactory)); + } + } + +// document.remove(0); + + if (flags.useReverse) { + Collections.reverse(document); + + } + } // end testSeq + + + /** + * @param filename adaptation file + * @param trainDataset original dataset (used in training) + */ + public void adapt(String filename, Dataset trainDataset, + DocumentReaderAndWriter readerWriter) { + flags.ocrTrain = false; // ?? Do we need this? 
(Pi-Chuan Sat Nov 5 15:42:49 2005) + ObjectBank> docs = + makeObjectBankFromFile(filename, readerWriter); + adapt(docs, trainDataset); + } + + /** + * @param featureLabels adaptation docs + * @param trainDataset original dataset (used in training) + */ + public void adapt(ObjectBank> featureLabels, Dataset trainDataset) { + Dataset adapt = getDataset(featureLabels, trainDataset); + adapt(adapt); + } + + /** + * @param featureLabels retrain docs + * @param featureIndex featureIndex of original dataset (used in training) + * @param labelIndex labelIndex of original dataset (used in training) + */ + public void retrain(ObjectBank> featureLabels, Index featureIndex, Index labelIndex) { + int fs = featureIndex.size(); // old dim + int ls = labelIndex.size(); // old dim + + Dataset adapt = getDataset(featureLabels, featureIndex, labelIndex); + + int prior = LogPrior.LogPriorType.QUADRATIC.ordinal(); + LinearClassifier lc = (LinearClassifier) classifier; + LinearClassifierFactory lcf = new LinearClassifierFactory(flags.tolerance, flags.useSum, prior, flags.sigma, flags.epsilon, flags.QNsize); + + double[][] weights = lc.weights(); // old dim + Index newF = adapt.featureIndex; + Index newL = adapt.labelIndex; + int newFS = newF.size(); + int newLS = newL.size(); + double[] x = new double[newFS*newLS]; // new dim + //System.err.println("old ["+fs+"]"+"["+ls+"]"); + //System.err.println("new ["+newFS+"]"+"["+newLS+"]"); + //System.err.println("new ["+newFS*newLS+"]"); + for (int i = 0; i < fs; i++) { + for (int j = 0; j < ls; j++) { + String f = featureIndex.get(i); + String l = labelIndex.get(j); + int newi = newF.indexOf(f)*newLS+newL.indexOf(l); + x[newi] = weights[i][j]; + //if (newi == 144745*2) { + //System.err.println("What??"+i+"\t"+j); + //} + } + } + //System.err.println("x[144745*2]"+x[144745*2]); + weights = lcf.trainWeights(adapt, x); + //System.err.println("x[144745*2]"+x[144745*2]); + //System.err.println("weights[144745]"+"[0]="+weights[144745][0]); + + 
lc.setWeights(weights); + /* + int delme = 0; + if (true) { + for (double[] dd : weights) { + delme++; + for (double d : dd) { + } + } + } + System.err.println(weights[delme-1][0]); + System.err.println("size of weights: "+delme); + */ + } + + + public void retrain(ObjectBank> doc) { + if (classifier == null) { + System.err.println("Cannot retrain before you train!"); + System.exit(-1); + } + Index findex = ((LinearClassifier)classifier).featureIndex(); + Index lindex = ((LinearClassifier)classifier).labelIndex(); + System.err.println("Starting retrain:\t# of original features"+findex.size()+", # of original labels"+lindex.size()); + retrain(doc, findex, lindex); + } + + + @Override + public void train(Collection> wordInfos, + DocumentReaderAndWriter readerAndWriter) { + Dataset train = getDataset(wordInfos); + //train.summaryStatistics(); + //train.printSVMLightFormat(); + // wordInfos = null; // cdm: I think this does no good as ptr exists in caller (could empty the list or better refactor so conversion done earlier?) 
+ train(train); + + for (int i = 0; i < flags.numTimesPruneFeatures; i++) { + + Index featuresAboveThreshhold = getFeaturesAboveThreshhold(train, flags.featureDiffThresh); + System.err.println("Removing features with weight below " + flags.featureDiffThresh + " and retraining..."); + train = getDataset(train, featuresAboveThreshhold); + + int tmp = flags.QNsize; + flags.QNsize = flags.QNsize2; + train(train); + flags.QNsize = tmp; + } + + if (flags.doAdaptation && flags.adaptFile != null) { + adapt(flags.adaptFile,train,readerAndWriter); + } + + System.err.print("Built this classifier: "); + if (classifier instanceof LinearClassifier) { + String classString = ((LinearClassifier)classifier).toString(flags.printClassifier, flags.printClassifierParam); + System.err.println(classString); + } else { + String classString = classifier.toString(); + System.err.println(classString); + } + } + + public Index getFeaturesAboveThreshhold(Dataset dataset, double thresh) { + if (!(classifier instanceof LinearClassifier)) { + throw new RuntimeException("Attempting to remove features based on weight from a non-linear classifier"); + } + Index featureIndex = dataset.featureIndex; + Index labelIndex = dataset.labelIndex; + + Index features = new HashIndex(); + Iterator featureIt = featureIndex.iterator(); + LinearClassifier lc = (LinearClassifier)classifier; + LOOP: + while (featureIt.hasNext()) { + String f = featureIt.next(); + Iterator labelIt = labelIndex.iterator(); + double smallest = Double.POSITIVE_INFINITY; + double biggest = Double.NEGATIVE_INFINITY; + while (labelIt.hasNext()) { + String l = labelIt.next(); + double weight = lc.weight(f, l); + if (weight < smallest) { + smallest = weight; + } + if (weight > biggest) { + biggest = weight; + } + if (biggest - smallest > thresh) { + features.add(f); + continue LOOP; + } + } + } + return features; + } + + /** + * Build a Dataset from some data. Used for training a classifier. 
+ * + * @param data This variable is a list of lists of CoreLabel. That is, + * it is a collection of documents, each of which is represented + * as a sequence of CoreLabel objects. + * @return The Dataset which is an efficient encoding of the information + * in a List of Datums + */ + public Dataset getDataset(Collection> data) { + return getDataset(data, null, null); + } + + /** + * Build a Dataset from some data. Used for training a classifier. + * + * By passing in extra featureIndex and classIndex, you can get a Dataset based on featureIndex and + * classIndex + * + * @param data This variable is a list of lists of CoreLabel. That is, + * it is a collection of documents, each of which is represented + * as a sequence of CoreLabel objects. + * @param classIndex if you want to get a Dataset based on featureIndex and + * classIndex in an existing origDataset + * @return The Dataset which is an efficient encoding of the information + * in a List of Datums + */ + public Dataset getDataset(Collection> data, Index featureIndex, Index classIndex) { + makeAnswerArraysAndTagIndex(data); + + int size = 0; + for (List doc : data) { + size += doc.size(); + } + + System.err.println("Making Dataset..."); + Dataset train; + if (featureIndex != null && classIndex != null) { + System.err.println("Using feature/class Index from existing Dataset..."); + System.err.println("(This is used when getting Dataset from adaptation set. We want to make the index consistent.)"); //pichuan + train = new Dataset(size, featureIndex, classIndex); + } else { + train = new Dataset(size); + } + + for (List doc : data) { + if (flags.useReverse) { + Collections.reverse(doc); + } + + for (int i = 0, dsize = doc.size(); i < dsize; i++) { + Datum d = makeDatum(doc, i, featureFactory); + + //CoreLabel fl = doc.get(i); + + train.add(d); + } + + if (flags.useReverse) { + Collections.reverse(doc); + } + } + + System.err.println("done."); + // reset printing before test data + // what is this???? 
-JRF +// if (featureFactory instanceof FeatureFactory) { +// ((FeatureFactory) featureFactory).resetPrintFeatures(); +// } + + if (flags.featThreshFile != null) { + System.err.println("applying thresholds..."); + List> thresh = getThresholds(flags.featThreshFile); + train.applyFeatureCountThreshold(thresh); + } else if (flags.featureThreshold > 1) { + System.err.println("Removing Features with counts < " + flags.featureThreshold); + train.applyFeatureCountThreshold(flags.featureThreshold); + } + train.summaryStatistics(); + return train; + } + + public Dataset getBiasedDataset(ObjectBank> data, Index featureIndex, Index classIndex) { + makeAnswerArraysAndTagIndex(data); + + Index origFeatIndex = new HashIndex(featureIndex.objectsList()); // mg2009: TODO: check + + int size = 0; + for (List doc : data) { + size += doc.size(); + } + + System.err.println("Making Dataset..."); + Dataset train = new Dataset(size, featureIndex, classIndex); + + for (List doc : data) { + if (flags.useReverse) { + Collections.reverse(doc); + } + + for (int i = 0, dsize = doc.size(); i < dsize; i++) { + Datum d = makeDatum(doc, i, featureFactory); + Collection newFeats = new ArrayList(); + for (String f : d.asFeatures()) { + if ( ! origFeatIndex.contains(f)) { + newFeats.add(f); + } + } +// System.err.println(d.label()+"\t"+d.asFeatures()+"\n\t"+newFeats); +// d = new BasicDatum(newFeats, d.label()); + train.add(d); + } + + if (flags.useReverse) { + Collections.reverse(doc); + } + } + + System.err.println("done."); + // reset printing before test data + // what is this???? 
-JRF +// if (featureFactory instanceof FeatureFactory) { +// ((FeatureFactory) featureFactory).resetPrintFeatures(); +// } + + if (flags.featThreshFile != null) { + System.err.println("applying thresholds..."); + List> thresh = getThresholds(flags.featThreshFile); + train.applyFeatureCountThreshold(thresh); + } else if (flags.featureThreshold > 1) { + System.err.println("Removing Features with counts < " + flags.featureThreshold); + train.applyFeatureCountThreshold(flags.featureThreshold); + } + train.summaryStatistics(); + return train; + } + + + + + /** + * Build a Dataset from some data. Used for training a classifier. + * + * By passing in an extra origDataset, you can get a Dataset based on featureIndex and + * classIndex in an existing origDataset. + * + * @param data This variable is a list of lists of CoreLabel. That is, + * it is a collection of documents, each of which is represented + * as a sequence of CoreLabel objects. + * @param origDataset if you want to get a Dataset based on featureIndex and + * classIndex in an existing origDataset + * @return The Dataset which is an efficient encoding of the information + * in a List of Datums + */ + public Dataset getDataset(ObjectBank> data, Dataset origDataset) { + if(origDataset == null) { + return getDataset(data); + } + return getDataset(data, origDataset.featureIndex, origDataset.labelIndex); + } + + + /** + * Build a Dataset from some data. + * + * @param oldData This {@link Dataset} represents data for which we wish to remove + * some features, specifically those features not in the {@link edu.stanford.nlp.util.Index} + * goodFeatures. + * @param goodFeatures An {@link edu.stanford.nlp.util.Index} of features we wish to retain. + * @return A new {@link Dataset} where each datapoint contains only features + * which were in goodFeatures.
+ */ + + public Dataset getDataset(Dataset oldData, Index goodFeatures) { + //public Dataset getDataset(List data, Collection goodFeatures) { + //makeAnswerArraysAndTagIndex(data); + + int[][] oldDataArray = oldData.getDataArray(); + int[] oldLabelArray = oldData.getLabelsArray(); + Index oldFeatureIndex = oldData.featureIndex; + + int[] oldToNewFeatureMap = new int[oldFeatureIndex.size()]; + + int[][] newDataArray = new int[oldDataArray.length][]; + + System.err.print("Building reduced dataset..."); + + int size = oldFeatureIndex.size(); + int max = 0; + for (int i = 0; i < size; i++) { + oldToNewFeatureMap[i] = goodFeatures.indexOf(oldFeatureIndex.get(i)); + if (oldToNewFeatureMap[i] > max) { + max = oldToNewFeatureMap[i]; + } + } + + for (int i = 0; i < oldDataArray.length; i++) { + int[] data = oldDataArray[i]; + size = 0; + for (int j = 0; j < data.length; j++) { + if (oldToNewFeatureMap[data[j]] > 0) { + size++; + } + } + int[] newData = new int[size]; + int index = 0; + for (int j = 0; j < data.length; j++) { + int f = oldToNewFeatureMap[data[j]]; + if (f > 0) { + newData[index++] = f; + } + } + newDataArray[i] = newData; + } + + Dataset train = new Dataset(oldData.labelIndex, oldLabelArray, goodFeatures, newDataArray, newDataArray.length); + + System.err.println("done."); + if (flags.featThreshFile != null) { + System.err.println("applying thresholds..."); + List> thresh = getThresholds(flags.featThreshFile); + train.applyFeatureCountThreshold(thresh); + } else if (flags.featureThreshold > 1) { + System.err.println("Removing Features with counts < " + flags.featureThreshold); + train.applyFeatureCountThreshold(flags.featureThreshold); + } + train.summaryStatistics(); + return train; + } + + private void adapt(Dataset adapt) { + if (flags.classifierType.equalsIgnoreCase("SVM")) { + throw new UnsupportedOperationException(); + } + adaptMaxEnt(adapt); + } + + private void adaptMaxEnt(Dataset adapt) { + if (classifier instanceof LinearClassifier) { + // So far 
the adaptation is only done on Gaussian Prior. Haven't checked how it'll work on other kinds of priors. -pichuan + int prior = LogPrior.LogPriorType.QUADRATIC.ordinal(); + if (flags.useHuber) { + throw new UnsupportedOperationException(); + } else if (flags.useQuartic) { + throw new UnsupportedOperationException(); + } + + LinearClassifierFactory lcf = new LinearClassifierFactory(flags.tolerance, flags.useSum, prior, flags.adaptSigma, flags.epsilon, flags.QNsize); + ((LinearClassifier)classifier).adaptWeights(adapt,lcf); + } else { + throw new UnsupportedOperationException(); + } + } + + private void train(Dataset train) { + if (flags.classifierType.equalsIgnoreCase("SVM")) { + trainSVM(train); + } else { + trainMaxEnt(train); + } + } + + private void trainSVM(Dataset train) { + SVMLightClassifierFactory fact = new SVMLightClassifierFactory(); + classifier = fact.trainClassifier(train); + + } + + private void trainMaxEnt(Dataset train) { + int prior = LogPrior.LogPriorType.QUADRATIC.ordinal(); + if (flags.useHuber) { + prior = LogPrior.LogPriorType.HUBER.ordinal(); + } else if (flags.useQuartic) { + prior = LogPrior.LogPriorType.QUARTIC.ordinal(); + } + + LinearClassifier lc; + if (flags.useNB) { + lc = new NBLinearClassifierFactory(flags.sigma).trainClassifier(train); + } else { + LinearClassifierFactory lcf = new LinearClassifierFactory(flags.tolerance, flags.useSum, prior, flags.sigma, flags.epsilon, flags.QNsize); + if (flags.useQN) { + lcf.useQuasiNewton(flags.useRobustQN); + } else if(flags.useStochasticQN) { + lcf.useStochasticQN(flags.initialGain,flags.stochasticBatchSize); + } else if(flags.useSMD) { + lcf.useStochasticMetaDescent(flags.initialGain, flags.stochasticBatchSize,flags.stochasticMethod,flags.SGDPasses); + } else if(flags.useSGD) { + lcf.useStochasticGradientDescent(flags.gainSGD,flags.stochasticBatchSize); + } else if(flags.useSGDtoQN) { + lcf.useStochasticGradientDescentToQuasiNewton(flags.initialGain, flags.stochasticBatchSize, + 
flags.SGDPasses, flags.QNPasses, flags.SGD2QNhessSamples, + flags.QNsize, flags.outputIterationsToFile); + } else if(flags.useHybrid) { + lcf.useHybridMinimizer(flags.initialGain, flags.stochasticBatchSize ,flags.stochasticMethod ,flags.hybridCutoffIteration ); + } else { + lcf.useConjugateGradientAscent(); + } + lc = lcf.trainClassifier(train); + } + this.classifier = lc; + } + + private void trainSemiSup(Dataset data, Dataset biasedData, double[][] confusionMatrix) { + int prior = LogPrior.LogPriorType.QUADRATIC.ordinal(); + if (flags.useHuber) { + prior = LogPrior.LogPriorType.HUBER.ordinal(); + } else if (flags.useQuartic) { + prior = LogPrior.LogPriorType.QUARTIC.ordinal(); + } + + LinearClassifierFactory lcf; + lcf = new LinearClassifierFactory(flags.tolerance, flags.useSum, prior, flags.sigma, flags.epsilon, flags.QNsize); + if (flags.useQN) { + lcf.useQuasiNewton(); + } else{ + lcf.useConjugateGradientAscent(); + } + + this.classifier = (LinearClassifier) lcf.trainClassifierSemiSup(data, biasedData, confusionMatrix, null); + } + + +// public void crossValidateTrainAndTest() throws Exception { +// crossValidateTrainAndTest(flags.trainFile); +// } + +// public void crossValidateTrainAndTest(String filename) throws Exception { +// // wordshapes + +// for (int fold = flags.startFold; fold <= flags.endFold; fold++) { +// System.err.println("fold " + fold + " of " + flags.endFold); +// // train + +// List = makeObjectBank(filename); +// List folds = split(data, flags.numFolds); +// data = null; + +// List train = new ArrayList(); + +// for (int i = 0; i < flags.numFolds; i++) { +// List docs = (List) folds.get(i); +// if (i != fold) { +// train.addAll(docs); +// } +// } +// folds = null; +// train(train); +// train = null; + +// List test = new ArrayList(); +// data = makeObjectBank(filename); +// folds = split(data, flags.numFolds); +// data = null; + +// for (int i = 0; i < flags.numFolds; i++) { +// List docs = (List) folds.get(i); +// if (i == fold) { +// 
test.addAll(docs); +// } +// } +// folds = null; +// // test +// test(test); +// writeAnswers(test); +// } +// } + +// /** +// * Splits the given train corpus into a train and a test corpus based on the fold number. +// * 1 / numFolds documents are held out for test, with the offset determined by the fold number. +// * +// * @param data The original data +// * @param numFolds The number of folds to split the data into +// * @return A list of folds giving the new training set +// */ +// private List split(List data, int numFolds) { +// List folds = new ArrayList(); +// int foldSize = data.size() / numFolds; +// int r = data.size() - (numFolds * foldSize); + +// int index = 0; +// for (int i = 0; i < numFolds; i++) { +// List fold = new ArrayList(); +// int end = (i < r ? foldSize + 1 : foldSize); +// for (int j = 0; j < end; j++) { +// fold.add(data.get(index++)); +// } +// folds.add(fold); +// } + +// return folds; +// } + + @Override + public void serializeClassifier(String serializePath) { + + System.err.print("Serializing classifier to " + serializePath + "..."); + + try { + ObjectOutputStream oos = IOUtils.writeStreamFromString(serializePath); + + oos.writeObject(classifier); + oos.writeObject(flags); + oos.writeObject(featureFactory); + oos.writeObject(classIndex); + oos.writeObject(answerArrays); + //oos.writeObject(WordShapeClassifier.getKnownLowerCaseWords()); + + oos.writeObject(knownLCWords); + + oos.close(); + System.err.println("Done."); + + } catch (Exception e) { + System.err.println("Error serializing to " + serializePath); + e.printStackTrace(); + // dont actually exit in case they're testing too + //System.exit(1); + } + } + + + /** + * Used to load the default supplied classifier. **THIS FUNCTION + * WILL ONLY WORK IF RUN INSIDE A JAR FILE** + */ + public void loadDefaultClassifier() { + loadJarClassifier(DEFAULT_CLASSIFIER, null); + } + + /** + * Used to obtain the default classifier which is + * stored inside a jar file. 
THIS FUNCTION + * WILL ONLY WORK IF RUN INSIDE A JAR FILE. + * + * @return A Default CMMClassifier from a jar file + */ + public static CMMClassifier getDefaultClassifier() { + + CMMClassifier cmm = new CMMClassifier(); + cmm.loadDefaultClassifier(); + return cmm; + + } + + /** Load a classifier from the given Stream. + * Implementation note: This method does not close the + * Stream that it reads from. + * + * @param ois The ObjectInputStream to load the serialized classifier from + * + * @throws IOException If there are problems accessing the input stream + * @throws ClassCastException If there are problems interpreting the serialized data + * @throws ClassNotFoundException If there are problems interpreting the serialized data + + * */ + @SuppressWarnings("unchecked") + @Override + public void loadClassifier(ObjectInputStream ois, Properties props) throws ClassCastException, IOException, ClassNotFoundException { + classifier = (LinearClassifier) ois.readObject(); + flags = (SeqClassifierFlags) ois.readObject(); + featureFactory = (FeatureFactory) ois.readObject(); + + if (props != null) { + flags.setProperties(props); + } + reinit(); + + classIndex = (Index) ois.readObject(); + answerArrays = (Set>) ois.readObject(); + + knownLCWords = (Set) ois.readObject(); + } + + + public static CMMClassifier getClassifierNoExceptions(File file) { + CMMClassifier cmm = new CMMClassifier(); + cmm.loadClassifierNoExceptions(file); + return cmm; + + } + + public static CMMClassifier getClassifier(File file) throws IOException, ClassCastException, ClassNotFoundException { + + CMMClassifier cmm = new CMMClassifier(); + cmm.loadClassifier(file); + return cmm; + } + + public static CMMClassifier getClassifierNoExceptions(String loadPath) { + CMMClassifier cmm = new CMMClassifier(); + cmm.loadClassifierNoExceptions(loadPath); + return cmm; + + } + + public static CMMClassifier getClassifier(String loadPath) throws IOException, ClassCastException, ClassNotFoundException { + + 
CMMClassifier cmm = new CMMClassifier(); + cmm.loadClassifier(loadPath); + return cmm; + } + + public static CMMClassifier getClassifierNoExceptions(InputStream in) { + CMMClassifier cmm = new CMMClassifier(); + cmm.loadClassifierNoExceptions(new BufferedInputStream(in), null); + return cmm; + } + + public static CMMClassifier getClassifier(InputStream in) throws IOException, ClassCastException, ClassNotFoundException { + CMMClassifier cmm = new CMMClassifier(); + cmm.loadClassifier(new BufferedInputStream(in)); + return cmm; + } + + /** This routine builds the answerArrays which give the + * empirically legal label sequences (of length (order) at most + * flags.maxLeft) and the classIndex, + * which indexes known answer classes. + * + * @param docs The training data: A List of List of CoreLabel + */ + private void makeAnswerArraysAndTagIndex(Collection> docs) { + if (answerArrays == null) { + answerArrays = Generics.newHashSet(); + } + if (classIndex == null) { + classIndex = new HashIndex(); + } + + for (List doc : docs) { + if (flags.useReverse) { + Collections.reverse(doc); + } + + int leng = doc.size(); + for (int start = 0; start < leng; start++) { + for (int diff = 1; diff <= flags.maxLeft && start + diff <= leng; diff++) { + String[] seq = new String[diff]; + for (int i = start; i < start + diff; i++) { + seq[i - start] = doc.get(i).get(CoreAnnotations.AnswerAnnotation.class); + } + answerArrays.add(Arrays.asList(seq)); + } + } + for (int i = 0; i < leng; i++) { + CoreLabel wordInfo = doc.get(i); + classIndex.add(wordInfo.get(CoreAnnotations.AnswerAnnotation.class)); + } + + if (flags.useReverse) { + Collections.reverse(doc); + } + } + } + + /** Make an individual Datum out of the data list info, focused at position + * loc. 
+ * @param info A List of WordInfo objects + * @param loc The position in the info list to focus feature creation on + * @param featureFactory The factory that constructs features out of the item + * @return A Datum (BasicDatum) representing this data instance + */ + public Datum makeDatum(List info, int loc, FeatureFactory featureFactory) { + PaddedList pInfo = new PaddedList(info, pad); + + Collection features = new ArrayList(); + List cliques = featureFactory.getCliques(); + for (Clique c : cliques) { + Collection feats = featureFactory.getCliqueFeatures(pInfo, loc, c); + feats = addOtherClasses(feats, pInfo, loc, c); + features.addAll(feats); + } + + printFeatures(pInfo.get(loc), features); + CoreLabel c = info.get(loc); + return new BasicDatum(features, c.get(CoreAnnotations.AnswerAnnotation.class)); + } + + + /** This adds to the feature name the name of classes that are other than + * the current class that are involved in the clique. In the CMM, these + * other classes become part of the conditioning feature, and only the + * class of the current position is being predicted. + * + * @return A collection of features with extra class information put + * into the feature name. + */ + private static Collection addOtherClasses(Collection feats, List info, + int loc, Clique c) { + String addend = null; + String pAnswer = info.get(loc - 1).get(CoreAnnotations.AnswerAnnotation.class); + String p2Answer = info.get(loc - 2).get(CoreAnnotations.AnswerAnnotation.class); + String p3Answer = info.get(loc - 3).get(CoreAnnotations.AnswerAnnotation.class); + String p4Answer = info.get(loc - 4).get(CoreAnnotations.AnswerAnnotation.class); + String p5Answer = info.get(loc - 5).get(CoreAnnotations.AnswerAnnotation.class); + String nAnswer = info.get(loc + 1).get(CoreAnnotations.AnswerAnnotation.class); + // cdm 2009: Is this really right? Do we not need to differentiate names that would collide??? 
+ if (c == FeatureFactory.cliqueCpC) { + addend = '|' + pAnswer; + } else if (c == FeatureFactory.cliqueCp2C) { + addend = '|' + p2Answer; + } else if (c == FeatureFactory.cliqueCp3C) { + addend = '|' + p3Answer; + } else if (c == FeatureFactory.cliqueCp4C) { + addend = '|' + p4Answer; + } else if (c == FeatureFactory.cliqueCp5C) { + addend = '|' + p5Answer; + } else if (c == FeatureFactory.cliqueCpCp2C) { + addend = '|' + pAnswer + '-' + p2Answer; + } else if (c == FeatureFactory.cliqueCpCp2Cp3C) { + addend = '|' + pAnswer + '-' + p2Answer + '-' + p3Answer; + } else if (c == FeatureFactory.cliqueCpCp2Cp3Cp4C) { + addend = '|' + pAnswer + '-' + p2Answer + '-' + p3Answer + '-' + p4Answer; + } else if (c == FeatureFactory.cliqueCpCp2Cp3Cp4Cp5C) { + addend = '|' + pAnswer + '-' + p2Answer + '-' + p3Answer + '-' + p4Answer + '-' + p5Answer; + } else if (c == FeatureFactory.cliqueCnC) { + addend = '|' + nAnswer; + } else if (c == FeatureFactory.cliqueCpCnC) { + addend = '|' + pAnswer + '-' + nAnswer; + } + if (addend == null) { + return feats; + } + Collection newFeats = Generics.newHashSet(); + for (String feat : feats) { + String newFeat = feat + addend; + newFeats.add(newFeat); + } + return newFeats; + } + + + private static List> getThresholds(String filename) { + try { + BufferedReader in = new BufferedReader(new FileReader(filename)); + List> thresholds = new ArrayList>(); + String line; + while ((line = in.readLine()) != null) { + int i = line.lastIndexOf(' '); + Pattern p = Pattern.compile(line.substring(0, i)); + //System.err.println(":"+line.substring(0,i)+":"); + Integer t = Integer.valueOf(line.substring(i + 1)); + Pair pair = new Pair(p, t); + thresholds.add(pair); + } + in.close(); + return thresholds; + } catch (Exception e) { + throw new RuntimeException("Error reading threshold file", e); + } + } + + public void trainSemiSup() { + DocumentReaderAndWriter readerAndWriter = makeReaderAndWriter(); + + String filename = flags.trainFile; + String 
biasedFilename = flags.biasedTrainFile; + + ObjectBank> data = + makeObjectBankFromFile(filename, readerAndWriter); + ObjectBank> biasedData = + makeObjectBankFromFile(biasedFilename, readerAndWriter); + + Index featureIndex = new HashIndex(); + Index classIndex = new HashIndex(); + + Dataset dataset = getDataset(data, featureIndex, classIndex); + Dataset biasedDataset = getBiasedDataset(biasedData, featureIndex, classIndex); + + double[][] confusionMatrix = new double[classIndex.size()][classIndex.size()]; + + for (int i = 0; i < confusionMatrix.length; i++) { + Arrays.fill(confusionMatrix[i], 0.0); + confusionMatrix[i][i] = 1.0; + } + + String cm = flags.confusionMatrix; + String[] bits = cm.split(":"); + for (String bit : bits) { + String[] bits1 = bit.split("\\|"); + int i1 = classIndex.indexOf(bits1[0]); + int i2 = classIndex.indexOf(bits1[1]); + double d = Double.parseDouble(bits1[2]); + confusionMatrix[i2][i1] = d; + } + + for (int i = 0; i < confusionMatrix.length; i++) { + ArrayMath.normalize(confusionMatrix[i]); + } + + for (int i = 0; i < confusionMatrix.length; i++) { + for (int j = 0; j < i; j++) { + double d = confusionMatrix[i][j]; + confusionMatrix[i][j] = confusionMatrix[j][i]; + confusionMatrix[j][i] = d; + } + } + + for (int i = 0; i < confusionMatrix.length; i++) { + for (int j = 0; j < confusionMatrix.length; j++) { + System.err.println("P("+classIndex.get(j)+ '|' +classIndex.get(i)+") = "+confusionMatrix[j][i]); + } + } + + trainSemiSup(dataset, biasedDataset, confusionMatrix); + } + + static class Scorer implements SequenceModel { + private CMMClassifier classifier = null; + + private int[] tagArray = null; + private int[] backgroundTags = null; + private Index tagIndex = null; + private List lineInfos = null; + private int pre = 0; + private int post = 0; + private Set> legalTags = null; + + private static final boolean VERBOSE = false; + + void buildTagArray() { + int sz = tagIndex.size(); + tagArray = new int[sz]; + for (int i = 0; i < sz; 
i++) { + tagArray[i] = i; + } + } + + public int length() { + return lineInfos.size() - pre - post; + } + + public int leftWindow() { + return pre; + } + + public int rightWindow() { + return post; + } + + public int[] getPossibleValues(int position) { + // if (position == 0 || position == lineInfos.size() - 1) { + // int[] a = new int[1]; + // a[0] = tagIndex.indexOf(BACKGROUND); + // return a; + // } + if (tagArray == null) { + buildTagArray(); + } + if (position < pre) { + return backgroundTags; + } + return tagArray; + } + + public double scoreOf(int[] sequence) { + throw new UnsupportedOperationException(); + } + + private double[] scoreCache = null; + private int[] lastWindow = null; + //private int lastPos = -1; + + public double scoreOf(int[] tags, int pos) { + if (false) { + return scoresOf(tags, pos)[tags[pos]]; + } + if (lastWindow == null) { + lastWindow = new int[leftWindow() + rightWindow() + 1]; + Arrays.fill(lastWindow, -1); + } + boolean match = (pos == lastPos); + for (int i = pos - leftWindow(); i <= pos + rightWindow(); i++) { + if (i == pos || i < 0) { + continue; + } + /*System.err.println("p:"+pos); + System.err.println("lw:"+leftWindow()); + System.err.println("i:"+i);*/ + match &= tags[i] == lastWindow[i - pos + leftWindow()]; + } + if (!match) { + scoreCache = scoresOf(tags, pos); + for (int i = pos - leftWindow(); i <= pos + rightWindow(); i++) { + if (i < 0) { + continue; + } + lastWindow[i - pos + leftWindow()] = tags[i]; + } + lastPos = pos; + } + return scoreCache[tags[pos]]; + } + + private int percent = -1; + private int num = 0; + private long secs = System.currentTimeMillis(); + private long hit = 0; + private long tot = 0; + + public double[] scoresOf(int[] tags, int pos) { + if (VERBOSE) { + int p = (100 * pos) / length(); + if (p > percent) { + long secs2 = System.currentTimeMillis(); + System.err.println(StringUtils.padLeft(p, 3) + "% " + ((secs2 - secs == 0) ? 
0 : (num * 1000 / (secs2 - secs))) + " hits per sec, position=" + pos + ", legal=" + ((tot == 0) ? 100 : ((100 * hit) / tot))); + // + "% [hit=" + hit + ", tot=" + tot + "]"); + percent = p; + num = 0; + secs = secs2; + } + tot++; + } + String[] answers = new String[1 + leftWindow() + rightWindow()]; + String[] pre = new String[leftWindow()]; + for (int i = 0; i < 1 + leftWindow() + rightWindow(); i++) { + int absPos = pos - leftWindow() + i; + if (absPos < 0) { + continue; + } + answers[i] = tagIndex.get(tags[absPos]); + CoreLabel li = lineInfos.get(absPos); + li.set(CoreAnnotations.AnswerAnnotation.class, answers[i]); + if (i < leftWindow()) { + pre[i] = answers[i]; + } + } + double[] scores = new double[tagIndex.size()]; + //System.out.println("Considering: "+Arrays.asList(pre)); + if (!legalTags.contains(Arrays.asList(pre)) && classifier.flags.useObservedSequencesOnly) { + // System.out.println("Rejecting: " + Arrays.asList(pre)); + // System.out.println(legalTags); + Arrays.fill(scores, -1000);// Double.NEGATIVE_INFINITY; + return scores; + } + num++; + hit++; + Counter c = classifier.scoresOf(lineInfos, pos); + //System.out.println("Pos "+pos+" hist "+Arrays.asList(pre)+" result "+c); + //System.out.println(c); + //if (false && flags.justify) { + // System.out.println("Considering position " + pos + ", word is " + ((CoreLabel) lineInfos.get(pos)).word()); + // //System.out.println("Datum is "+d.asFeatures()); + // System.out.println("History: " + Arrays.asList(pre)); + //} + for (String s : c.keySet()) { + int t = tagIndex.indexOf(s); + if (t > -1) { + int[] tA = getPossibleValues(pos); + for (int j = 0; j < tA.length; j++) { + if (tA[j] == t) { + scores[j] = c.getCount(s); + //if (false && flags.justify) { + // System.out.println("Label " + s + " got score " + scores[j]); + //} + } + } + } + } + // normalize? 
+ if (classifier.normalize()) { + ArrayMath.logNormalize(scores); + } + return scores; + } + + static double[] recenter(double[] x) { + double[] r = new double[x.length]; + // double logTotal = Double.NEGATIVE_INFINITY; + // for (int i = 0; i < x.length; i++) + // logTotal = SloppyMath.logAdd(logTotal, x[i]); + double logTotal = ArrayMath.logSum(x); + for (int i = 0; i < x.length; i++) { + r[i] = x[i] - logTotal; + } + return r; + } + + /** + * Build a Scorer. + * + * @param lineInfos List of WordInfo data items to classify + * @param classifier The trained Classifier + * @param pre Number of previous tags that condition current tag + * @param post Number of following tags that condition previous tag + * (if pre and post are both nonzero, then you have a + * dependency network tagger) + */ + Scorer(List lineInfos, Index tagIndex, CMMClassifier classifier, int pre, int post, Set> legalTags) { + if (VERBOSE) { + System.err.println("Built Scorer for " + lineInfos.size() + " words, clique pre=" + pre + " post=" + post); + } + this.pre = pre; + this.post = post; + this.lineInfos = lineInfos; + this.tagIndex = tagIndex; + this.classifier = classifier; + this.legalTags = legalTags; + backgroundTags = new int[]{tagIndex.indexOf(classifier.flags.backgroundSymbol)}; + } + + } // end class Scorer + + private boolean normalize() { + return flags.normalize; + } + + static int lastPos = -1; + + public Counter scoresOf(List lineInfos, int pos) { +// if (pos != lastPos) { +// System.err.print(pos+"."); +// lastPos = pos; +// } +// System.err.print("!"); + Datum d = makeDatum(lineInfos, pos, featureFactory); + return classifier.logProbabilityOf(d); + } + + + /** + * Takes a {@link List} of {@link CoreLabel}s and prints the likelihood + * of each possible label at each point. + * TODO: Finish or delete this method! + * + * @param document A {@link List} of {@link CoreLabel}s. 
+ */ + @Override + public void printProbsDocument(List document) { + + //ClassicCounter c = scoresOf(document, 0); + + } + + /** Command-line version of the classifier. See the class + * comments for examples of use, and SeqClassifierFlags + * for more information on supported flags. + */ + public static void main(String[] args) throws Exception { + StringUtils.printErrInvocationString("CMMClassifier", args); + + Properties props = StringUtils.argsToProperties(args); + CMMClassifier cmm = new CMMClassifier(props); + String testFile = cmm.flags.testFile; + String textFile = cmm.flags.textFile; + String loadPath = cmm.flags.loadClassifier; + String serializeTo = cmm.flags.serializeTo; + + // cmm.crossValidateTrainAndTest(trainFile); + if (loadPath != null) { + cmm.loadClassifierNoExceptions(loadPath, props); + } else if (cmm.flags.loadJarClassifier != null) { + cmm.loadJarClassifier(cmm.flags.loadJarClassifier, props); + } else if (cmm.flags.trainFile != null) { + if (cmm.flags.biasedTrainFile != null) { + cmm.trainSemiSup(); + } else { + cmm.train(); + } + } else { + cmm.loadDefaultClassifier(); + } + + if (serializeTo != null) { + cmm.serializeClassifier(serializeTo); + } + + if (testFile != null) { + cmm.classifyAndWriteAnswers(testFile, cmm.makeReaderAndWriter()); + } else if (cmm.flags.testFiles != null) { + cmm.classifyAndWriteAnswers(cmm.flags.baseTestDir, cmm.flags.testFiles, + cmm.makeReaderAndWriter()); + } + + if (textFile != null) { + DocumentReaderAndWriter readerAndWriter = + new PlainTextDocumentReaderAndWriter(); + cmm.classifyAndWriteAnswers(textFile, readerAndWriter); + } + } // end main + + + public double weight(String feature, String label) { + return ((LinearClassifier)classifier).weight(feature, label); + } + + public double[][] weights() { + return ((LinearClassifier)classifier).weights(); + } + + @Override + public List classifyWithGlobalInformation(List tokenSeq, final CoreMap doc, final CoreMap sent) { + return classify(tokenSeq); + } + +} 
// end class CMMClassifier diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/AcronymModel.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/AcronymModel.java new file mode 100644 index 0000000..83c0471 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/AcronymModel.java @@ -0,0 +1,669 @@ +package edu.stanford.nlp.ie.pascal; + +import java.util.*; +import java.io.*; + +import edu.stanford.nlp.util.Generics; + + /** + * Scores Pascal challenge workshop information templates. + * This score reflects which fields are present/absent, how well acronyms + * agree with the names and URLs they correspond to. + * + * @author Jamie Nicolson + */ +public class AcronymModel implements RelationalModel { + + + private static final double HIGH_PROB = 1.0; + private static final double LOW_PROB = 0.0; + private static boolean DEBUG= false; + + private static final String acronymStatistics = + "workshopname workshopacronym workshophomepage conferencename conferenceacronym conferencehomepage\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00549450549450549\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + 
"0.00274725274725275\n" + + "0.0521978021978022\n" + + "0.00274725274725275\n" + + "0.0357142857142857\n" + + "0.00549450549450549\n" + + "0.021978021978022\n" + + "0.010989010989011\n" + + "0.0357142857142857\n" + + "0.0302197802197802\n" + + "0.0824175824175824\n" + + "0.00549450549450549\n" + + "0.043956043956044\n" + + "0.010989010989011\n" + + "0.021978021978022\n" + + "0.00549450549450549\n" + + "0.0521978021978022\n" + + "0.0412087912087912\n" + + "0.0467032967032967\n" + + "0.00274725274725275\n" + + "0.010989010989011\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.00274725274725275\n" + + "0.0137362637362637\n" + + "0.00824175824175824\n" + + "0.167582417582418\n" + + "0.00549450549450549\n" + + "0.0494505494505494\n" + + "0.00824175824175824\n" + + "0.0164835164835165\n" + + "0.00549450549450549\n" + + "0.0604395604395604\n" + + "0.0467032967032967\n"; + + private Prior priors; + + /** + * Scores the partial template containing only the fields relevant to the score. + * @param temp the {@link InfoTemplate} to be scored. + * @return the model's score + */ + public double computeProb(InfoTemplate temp){ + return computeProb(temp.wname,temp.wacronym,temp.cname,temp.cacronym, + temp.whomepage, temp.chomepage); + } +/** + * Scores the {@link PascalTemplate} using the fields it contains which are relevant to the score. + * (Ignores location and date fields.) 
+ * @param temp the full {@link PascalTemplate} to be scored + * @return the model's score + */ + + public double computeProb(PascalTemplate temp) { + double prob = 1.0; + + String wsname = temp.getValue("workshopname"); + String confname = temp.getValue("conferencename"); + String wsacronym = temp.getValue("workshopacronym"); + String confacronym = temp.getValue("conferenceacronym"); + String wsurl = temp.getValue("workshophomepage"); + String confurl = temp.getValue("conferencehomepage"); + return computeProb(wsname, wsacronym,confname,confacronym, wsurl, confurl); + } + + /** + * @throws IOException if the acronym statistics/weights can't be read from file. + */ + public AcronymModel() throws IOException { + priors = new Prior(new BufferedReader(new StringReader(acronymStatistics))); + features = new Feature[]{new AcronymModel.LettersAligned(), new AcronymModel.BegWord(), new AcronymModel.EndWord(), new AcronymModel.AfterAligned(), new AcronymModel.AlignedPerWord(), new AcronymModel.WordsSkipped(), new AcronymModel.SyllableBoundary()}; + weights = new double[]{// here's weights from a bunch of training examples + //-4.1004, 18.4127, 0.1789, 16.3189, 0.8818, -0.0725, -0.6550 + //-12.4082, 18.3893, 2.1826, 18.8487, 0.5042, -0.1231, 1.8876 + -11.8880, 14.4534, -2.6316, 24.1838, -2.2320, -0.2508, 4.3501 + + }; + //intercept = -14.1449; + //intercept = -7.4882; + intercept = -2.2062; + } + + private double computeProb(String wsname, String wsacronym, String confname, + String confacronym, String wsurl, String confurl){ + + Set presentFields = Generics.newHashSet(); + if( wsname != null && !wsname.equals("null") && !wsname.equals("") ) + presentFields.add("workshopname"); + if( wsacronym != null && !wsacronym.equals("null") && !wsacronym.equals("")) + presentFields.add("workshopacronym"); + if( confname != null && !confname.equals("null") + && !confname.equals("")) + presentFields.add("conferencename"); + if( confacronym != null && !confacronym.equals("null") + && 
!confacronym.equals("")) + presentFields.add("conferenceacronym"); + if( wsurl != null && !wsurl.equals("null") && !wsurl.equals("")) + presentFields.add("workshophomepage"); + if( confurl != null && !confurl.equals("null") && !confurl.equals("")) + presentFields.add("conferencehomepage"); + + //if the workshop and conference have the same acronym we return 0. + if(presentFields.contains("conferenceacronym") && + presentFields.contains("workshopacronym") && + confacronym.equals(wsacronym)){ + + return 0.0; + } + + double prob = priors.get(presentFields); + //System.out.println("Setting prior to " + prob + " based on the following "+ + // "fields being present: " + presentFields.toString()); + if( wsname != null && wsacronym != null ) { + if(DEBUG)System.err.println("computing similarity for workshop"); + prob *= similarity(wsname, wsacronym); + } else { + if(DEBUG)System.err.println("NOT computing similarity for workshop"); + } + + if( confname != null && confacronym != null ) { + if(DEBUG)System.err.println("computing similarity for conference"); + prob *= similarity(confname, confacronym); + } else { + if(DEBUG)System.err.println("NOT computing similarity for conference"); + } + + if( confacronym != null && confurl != null ) { + if( acronymMatchesURL(confacronym, confurl) ) { + prob *= probMatchFromAcronymAndURLMatch; + } else { + prob *= probMatchFromAcronymAndURLNoMatch; + } + } + + if( wsacronym != null && wsurl != null ) { + if( acronymMatchesURL(wsacronym, wsurl) ) { + prob *= probMatchFromAcronymAndURLMatch; + } else { + prob *= probMatchFromAcronymAndURLNoMatch; + } + } + return prob; + } + + private static boolean acronymMatchesURL(String ac, String url) { + String lowerURL = url.toLowerCase(); + String strippedAc = (new String(AcronymModel.stripAcronym(ac))).toLowerCase(); + + return lowerURL.indexOf(strippedAc) != -1; + } + + private static final double probMatchFromAcronymAndURLMatch = .23934426; + private static final double 
probMatchFromAcronymAndURLNoMatch = .052516411378; + + /** + * Finds longest subsequent string of digits. Returns empty string + * if there aren't any digits. + */ + private static String acronymNumber(String acronym) { + return ""; + } + + public static double URLSimilarity(String URL, String acronym) { + String strippedAc = new String(stripAcronym(acronym)); + String acNumber = acronymNumber(acronym); + return 0.0; + } + + /** + * @return the "rich similarity" score + */ + public double similarity(String name, String acronym) { + return RichSimilarity(name, acronym); + } + /** + * + * @return the "naive similarity" score + */ + public double naiveSimilarity(String name, String acronym) { + double similarity = LOW_PROB; + String[] nameWords = splitOnWhitespace(name); + String[] acronymWords = splitOnWhitespace(acronym); + + // first put together the letters in the acronym + char[] acLetters = allLetters(acronymWords); + + // first let's try pulling the first letters from the name, and combining them to get the acronym + char[] nameFirstLetters = firstLetters(nameWords); + + if (firstLetterInOrderMatch(nameFirstLetters, acLetters)) { + // the letters in acronym can be constructed from the first letters in the name, in order + similarity = HIGH_PROB; + } + + if (DEBUG) { + System.err.println("Similarity between (" + name + ") and (" + acronym + ") is " + similarity); + } + return similarity; + } + + /** + * + * @return the Hearst similarity score + */ + public double HearstSimilarity(String name, String acronym) { + char[] namechars = name.toLowerCase().toCharArray(); + char[] acrochars = acronym.toLowerCase().toCharArray(); + + int nindex = namechars.length - 1; + for (int aindex = acrochars.length - 1; aindex >= 0; --aindex) { + if (!Character.isLetter(acrochars[aindex])) { + continue; + } + while ((nindex >= 0 && namechars[nindex] != acrochars[aindex]) || (aindex == 0 && nindex > 0 && Character.isLetterOrDigit(namechars[nindex - 1]))) { + nindex--; + } + if 
(nindex < 0) { + // System.err.println("\"" + name + "\" does NOT match \"" + + // acronym + "\"\n"); + return 0; + } + + nindex--; + } + + //System.err.println("\"" + name + "\" matches \"" + acronym + "\"\n"); + return 1.0; + } + + public static interface Feature { + public double value(Alignment alignment); + + public String toString(); + } + + public static class LettersAligned implements Feature { + public String toString() { + return "LettersAligned"; + }; + public double value(Alignment alignment) { + int numAligned = 0; + for (int i = 0; i < alignment.pointers.length; ++i) { + if (alignment.pointers[i] != -1) { + numAligned++; + } + } + double pct = (double) numAligned / (double) alignment.pointers.length; + if (DEBUG) + System.out.println("LettersAligned=" + pct); + return pct; + } + } + + public static class BegWord implements Feature { + public String toString() { return "BegWord"; }; + public double value(Alignment alignment) { + int begAligned = 0; + for( int s = 0; s < alignment.pointers.length; ++s) { + int idx = alignment.pointers[s]; + if( idx == 0 ) { + begAligned++; + } else if( idx > 0) { + char cur = alignment.longForm[idx]; + char prev = alignment.longForm[idx-1]; + if( !Character.isLetterOrDigit(prev) && + Character.isLetterOrDigit(cur) ) + { + begAligned++; + } + } + } + return (double)begAligned / (double)alignment.shortForm.length; + } + } + + public static class EndWord implements Feature { + public String toString() { return "EndWord"; }; + public double value(Alignment alignment) { + int endAligned = 0; + for( int s = 0; s < alignment.pointers.length; ++s) { + int idx = alignment.pointers[s]; + if( idx == alignment.longForm.length-1 ) { + endAligned++; + } else if( idx >= 0) { + char cur = alignment.longForm[idx]; + char next = alignment.longForm[idx+1]; + if( !Character.isLetterOrDigit(next) && + Character.isLetterOrDigit(cur) ) + { + endAligned++; + } + } + } + return (double)endAligned / (double)alignment.shortForm.length; + } + } + 
+ /** + * Percent of letters aligned immediately after another aligned letter. + */ + public static class AfterAligned implements Feature { + public String toString() { return "AfterAligned"; } + + public double value(Alignment alignment) { + int numAfter = 0; + for( int i = 1; i < alignment.pointers.length; ++i) { + if( alignment.pointers[i] == alignment.pointers[i-1] + 1 ) { + numAfter++; + } + } + return (double)numAfter / (double)alignment.shortForm.length; + } + } + + private static class RunningAverage { + double average; + int numSamples; + + public RunningAverage() { + average = 0.0; + numSamples = 0; + } + + public void addSample(double sample) { + average = (numSamples * average) + sample; + numSamples++; + average /= numSamples; + } + + public double getAverage() { + return average; + } + + public double getNumSammples() { + return numSamples; + } + } + + /** + * Average number of aligned letters per word. + */ + public static class AlignedPerWord implements Feature { + public String toString() { return "AlignedPerWord"; } + + public double value(Alignment alignment) { +/* + RunningAverage alignedPerWord = new RunningAverage(); + boolean inWord = false; + int alignCount = 0; + int sidx = 0; + for(int lidx = 0; lidx < alignment.longForm.length; ++lidx ) { + char cur = alignment.longForm[lidx]; + if( Character.isLetterOrDigit(cur) && !inWord ) { + // beginning of word + inWord = true; + } else if( inWord && !Character.isLetterOrDigit(cur) ) { + // end of word + alignedPerWord.addSample(alignCount); + alignCount = 0; + inWord = false; + } + + while( sidx < alignment.pointers.length && + alignment.pointers[sidx] < lidx ) + sidx++; + + if( sidx < alignment.pointers.length && + alignment.pointers[sidx] == lidx && inWord) + { + alignCount++; + } + } + if( inWord ) { + // end of last word + alignedPerWord.addSample(alignCount); + } + + return alignedPerWord.getAverage(); +*/ + boolean inWord = false; + int wordCount = 0; + for(int lidx = 0; lidx < 
alignment.longForm.length; ++lidx ) { + char cur = alignment.longForm[lidx]; + if( Character.isLetterOrDigit(cur) && !inWord ) { + // beginning of word + ++wordCount; + inWord = true; + } else if( inWord && !Character.isLetterOrDigit(cur) ) { + // end of word + inWord = false; + } + } + int alignCount = 0; + for( int sidx = 0; sidx < alignment.pointers.length; ++sidx) { + if( alignment.pointers[sidx] != -1 ) { + ++alignCount; + } + } + if( wordCount == 0 ) { + return 0; + } else { + return (double)alignCount / (double)wordCount; + } + } + } + + public static class WordsSkipped implements Feature { + public String toString() { return "WordsSkipped"; }; + public double value(Alignment alignment) { + int wordsSkipped = 0; + int wordsAligned = 0; + boolean inWord = false; + boolean gotAlignedChar = false; + boolean []isAligned = new boolean[alignment.longForm.length]; + for( int s = 0; s < alignment.pointers.length; ++s ) { + if( alignment.pointers[s] != -1 ) { + isAligned[alignment.pointers[s]] = true; + } + } + for( int l = 0; l < alignment.longForm.length; ++l ) { + char cur = alignment.longForm[l]; + if( inWord ) { + if( !Character.isLetterOrDigit(cur)) { + // just finished a word + if( gotAlignedChar ) { + wordsAligned++; + } else { + wordsSkipped++; + } + inWord = false; + } + } else { + if( Character.isLetterOrDigit(cur)) { + inWord = true; + gotAlignedChar = false; + } + } + if( isAligned[l] ) gotAlignedChar = true; + } + if( inWord ) { + if( gotAlignedChar ) { + wordsAligned++; + } else { + wordsSkipped++; + } + } + if(DEBUG)System.out.println("Words skipped: " + wordsSkipped + "/" + + (wordsSkipped + wordsAligned) ); + return wordsSkipped; + } + } + + public static class SyllableBoundary implements Feature { + public String toString() { return "SyllableBoundary"; }; + TeXHyphenator teXHyphenator = new TeXHyphenator(); + public SyllableBoundary() throws IOException { + teXHyphenator.loadDefault(); + } + public double value(Alignment alignment) { + char [] 
lcLongForm = + (new String(alignment.longForm)).toLowerCase().toCharArray(); + boolean [] breakPoints = teXHyphenator.findBreakPoints(lcLongForm); + int numSylAligned = 0; + for( int i = 0; i < alignment.pointers.length; ++i ) { + if( alignment.pointers[i] != -1 && + breakPoints[alignment.pointers[i]] ) + { + numSylAligned++; + } + } + return (double)numSylAligned / (double)alignment.pointers.length; + } + } + + private final Feature[] features; + + private final double[] weights; + private final double intercept; + + public static char[] stripAcronym(String acronym) { + char [] raw = acronym.toCharArray(); + char [] firstTry = new char[raw.length]; + int outIdx = 0; + for( int inIdx = 0; inIdx < raw.length; ++inIdx) { + if( Character.isLetter(raw[inIdx]) ) { + firstTry[outIdx++] = raw[inIdx]; + } + } + if( outIdx == firstTry.length ) { + if(DEBUG) System.out.println("Converted \"" + acronym + "\" to \"" + + (new String(firstTry)) + "\"\n"); + return firstTry; + } else { + char [] polished = new char[outIdx]; + System.arraycopy(firstTry, 0, polished, 0, outIdx); + if(DEBUG) System.out.println("Converted \"" + acronym + "\" to \"" + + (new String(polished)) + "\"\n"); + return polished; + } + } + + + public double RichSimilarity(String name, String acronym) { + AlignmentFactory fact = new AlignmentFactory( + name.toCharArray(), stripAcronym(acronym) ); + + double maxprob = 0.0; + Iterator iter = fact.getAlignments(); + while(iter.hasNext()) { + Alignment align = (Alignment) iter.next(); + + double [] featureVals = new double[features.length]; + for( int f = 0; f < features.length; ++f) { + featureVals[f] = features[f].value(align); + } + + // compute dotproduct and sigmoid + double dotprod = dotproduct(weights, featureVals) + intercept; + double exp = Math.exp(dotprod); + double prob = exp / (1 + exp); + + // align.print(); + //System.out.println("Prob: " + prob + "\n-----------\n"); + + if( prob > maxprob ){ + maxprob = prob; + } + } + + return maxprob; + } + + 
private static double dotproduct(double[] one, double[]two) { + double sum = 0.0; + for( int i = 0; i < one.length; ++i) { + double product = one[i] * two[i]; + if(DEBUG)System.out.println("product: " + product); + sum += product; + } + if(DEBUG)System.out.println("sum: " + sum); + return sum; + } + + private static final String[] stringArrayType = new String[0]; + + private static String[] splitOnWhitespace(String words) { + String[] firstCut = words.split("\\s+"); + + ArrayList wordList = new ArrayList(firstCut.length); + for( int i = 0; i < firstCut.length; ++i ) { + if( firstCut[i].length() > 0 ) { + wordList.add(firstCut[i]); + } + } + return wordList.toArray(stringArrayType); + } + + private static boolean firstLetterInOrderMatch(char[] nameFirstLetters, char[] acLetters) { + int nameIdx = 0; + int acIdx = 0; + + for( ; acIdx < acLetters.length; ++acIdx) { + while( nameIdx < nameFirstLetters.length && nameFirstLetters[nameIdx] != acLetters[acIdx] ) { + ++nameIdx; + } + if( nameIdx == nameFirstLetters.length ) { + return false; + } + } + return true; + } + + private static char[] allLetters(String[] acronym) { + StringBuffer sb = new StringBuffer(); + for( int s = 0; s < acronym.length; ++s ) { + String acr = acronym[s]; + for(int c = 0; c < acr.length(); ++c ) { + char ch = acr.charAt(c); + if( Character.isLetter( ch ) ) { + sb.append(ch); + } + } + } + return sbToChars(sb); + } + + private static char[] firstLetters(String[] name) { + StringBuffer sb = new StringBuffer(name.length); + for( int s = 0; s < name.length; ++s) { + char c = name[s].charAt(0); + if( Character.isLetter(c) ) { + sb.append(c); + } + } + return sbToChars(sb); + } + + private static char[] sbToChars(StringBuffer sb) { + char[] letters = new char[sb.length()]; + sb.getChars(0, sb.length(), letters, 0); + return letters; + } + + public static void main(String[] args) throws Exception { + + AcronymModel am = new AcronymModel(); + String s1 = args[0]; + String s2 = args[1]; + 
System.out.println("Hearst: "+am.HearstSimilarity(s1, s2)); + System.out.println("naive: "+am.naiveSimilarity(s1, s2)); + System.out.println("Rich: "+am.RichSimilarity(s1, s2)); + System.out.println("default: "+am.similarity(s1, s2)); + + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/Alignment.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/Alignment.java new file mode 100644 index 0000000..7d26608 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/Alignment.java @@ -0,0 +1,113 @@ +package edu.stanford.nlp.ie.pascal; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Arrays; + +/** + * Container class for aligning acronyms. + * + * @author Jamie Nicolson + */ + +public class Alignment { + public char[] longForm; + public char[] shortForm; + public int[] pointers; + + public Alignment(char[] longForm, char[] shortForm, int[] pointers) { + this.longForm = longForm; + this.shortForm = shortForm; + this.pointers = pointers; + } + + public void serialize(PrintWriter writer) { + writer.println(new String(longForm)); + writer.println(new String(shortForm)); + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < pointers.length; ++i) { + sb.append(pointers[i] + " "); + } + writer.println(sb.toString()); + } + + public Alignment(BufferedReader reader) throws IOException { + String line; + line = reader.readLine(); + if (line == null) { + throw new IOException(); + } + longForm = line.toCharArray(); + line = reader.readLine(); + if (line == null) { + throw new IOException(); + } + shortForm = line.toCharArray(); + line = reader.readLine(); + if (line == null) { + throw new IOException(); + } + String[] pstrings = line.split("\\s+"); + if (pstrings.length != shortForm.length) { + throw new IOException("Number of pointers != size of short form"); + } + pointers = new 
int[pstrings.length]; + for (int i = 0; i < pointers.length; ++i) { + pointers[i] = Integer.parseInt(pstrings[i]); + } + } + + public void print() { + System.out.println(toString()); + } + + @Override + public String toString() { + return toString(""); + } + + private static final char[] spaces = " ".toCharArray(); + + public String toString(String prefix) { + StringBuffer buf = new StringBuffer(); + buf.append(prefix); + buf.append(longForm); + buf.append("\n"); + buf.append(spaces, 0, prefix.length()); + int l = 0; + for (int s = 0; s < shortForm.length; ++s) { + if (pointers[s] == -1) { + continue; + } + for (; l < longForm.length && pointers[s] != l; ++l) { + buf.append(" "); + } + if (l < longForm.length) { + buf.append(shortForm[s]); + ++l; + } + } + return buf.toString(); + } + + @Override + public boolean equals(Object o) { + if (o == null || !(o instanceof Alignment)) { + return false; + } + Alignment cmp = (Alignment) o; + + return Arrays.equals(longForm, cmp.longForm) && Arrays.equals(shortForm, cmp.shortForm) && Arrays.equals(pointers, cmp.pointers); + } + + @Override + public int hashCode() { + int code = 0; + for (int i = 0; i < pointers.length; ++i) { + code += pointers[i]; + code *= 31; + } + return code; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/AlignmentFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/AlignmentFactory.java new file mode 100644 index 0000000..a198681 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/AlignmentFactory.java @@ -0,0 +1,218 @@ +package edu.stanford.nlp.ie.pascal; + +import java.util.*; +import edu.stanford.nlp.util.Generics; + +/** + * Generates {@link Alignment} objects for acronym alignment. 
+ * + * @author Jamie Nicolson + */ +public class AlignmentFactory { + + public static final byte SHIFT_LONG = 1; + public static final byte SHIFT_SHORT = 2; + public static final byte SHIFT_BOTH = 4; + + private char[] longForm; + private char[] lcLongForm; + private char[] shortForm; + private char[] lcShortForm; + private int [][]alignMatrix; + private byte [][]backMatrix; + private Set alignments; + + public AlignmentFactory(String longForm, String shortForm) { + this(longForm.toCharArray(), shortForm.toCharArray()); + } + + public static char[] toLower(char []in) { + char[] out = new char[in.length]; + for(int i = 0; i < in.length; ++i) { + out[i] = Character.toLowerCase(in[i]); + } + return out; + } + + public AlignmentFactory(char[] longForm, char[] shortForm) { + this.longForm = longForm; + this.lcLongForm = toLower(longForm); + this.shortForm = shortForm; + this.lcShortForm = toLower(shortForm); + + alignMatrix = new int[lcLongForm.length][lcShortForm.length]; + backMatrix = new byte[lcLongForm.length][lcShortForm.length]; + for( int l = 0; l < lcLongForm.length; ++l) { + for( int s = 0; s < lcShortForm.length; ++s) { + int match = (lcLongForm[l] == lcShortForm[s]) ? 1 : 0; + int froml = (l == 0) ? 0 : alignMatrix[l-1][s]; + int froms = (s == 0) ? 0 : alignMatrix[l][s-1]; + int frommatch = + ((l==0 || s==0) ? 
0 : alignMatrix[l-1][s-1]) + match; + int max = Math.max(froml, Math.max(froms, frommatch)); + byte backp = 0; + if( froml == max ) backp |= SHIFT_LONG; + if( froms == max ) backp |= SHIFT_SHORT; + if( match == 1 && frommatch == max ) backp |= SHIFT_BOTH; + backMatrix[l][s] = backp; + alignMatrix[l][s] = max; + } + } + + alignments = Generics.newHashSet(); + int[] pointers = new int[lcShortForm.length]; + Arrays.fill(pointers, -1); + + if( lcLongForm.length > 0 && lcShortForm.length > 0 ) { + addCount = 0; + //initListMatrix(); + findAlignments(pointers, lcLongForm.length-1, lcShortForm.length-1); + //listMatrix = null; + + } + } + + public Iterator getAlignments() { + return alignments.iterator(); + } + + public ArrayList getAlignmentsList() { + return new ArrayList(alignments); + } + + public static String dumpIntArray(int []a) { + StringBuilder buf = new StringBuilder(); + buf.append('['); + for (int anA : a) { + buf.append(anA).append(' '); + } + buf.append(']'); + return buf.toString(); + } + + int addCount; + +/* + LinkedList [] [] listMatrix; + + private void initListMatrix() { + listMatrix = new LinkedList[][lcLongForm.length]; + for( int i = 0; i < lcLongForm.length; i++) { + listMatrix[i] = new LinkedList[lcShortForm.length]; + } + } +*/ + +/* + private void findAlignments(int l, int s) { + if( listMatrix[l][s] != null ) + return; + + byte backp = backMatrix[l][s]; + + listMatrix[l][s] = new LinkedList(); + + if( alignMatrix[l][s] == 0 ) { + listMatrix[l][s].add( new int[shortForm.length] ); + return; + } + + if( (backp & SHIFT_BOTH) != 0 ) { + assert( lcLongForm[l] == lcShortForm[s] ); + findAlignments(l-1,s-1); + LinkedList from = listMatrix[l-1][s-1]; + Iterator iter = from.iterator(); + while(iter.hasNext()) { + int[] ref = (int[]) iter.next(); + int[] cpy = ref.clone(); + cpy[s] = l; + listMatrix[l][s].add(cpy); + } + } + + if( (backp & SHIFT_LONG) != 0 ) { + if( l != 0 ) { + findAlignments(l-1, s); + Iterator iter = listMatrix[l-1][s]; + 
while(iter.hasNext()) { + listMatrix[l][s].add( iter.next() ); + } + } else { + listMatrix[l][s].add( new int[shortForm.length] ); + } + } + + if( (backp & SHIFT_SHORT) != 0 ) { + backp &= ~SHIFT_SHORT; + int[] ptrcpy = (int[]) ((backp == 0) ? pointers : pointers.clone()); + if( s == 0 ) { + ++addCount; + alignments.add( new Alignment(longForm, shortForm, ptrcpy) ); + } else { + findAlignments(ptrcpy, l, s-1); + } + } + + if( lcLongForm[l] == lcShortForm[s] ) + assert( (backMatrix[l][s] & SHIFT_BOTH) != 0); +*/ + + private void findAlignments(int[]pointers, int lg, int s) + { + byte backp = backMatrix[lg][s]; + + if( alignMatrix[lg][s] == 0 ) { + ++addCount; + alignments.add( new Alignment(longForm, shortForm, pointers) ); + return; + } + + if( (backp & SHIFT_LONG)!= 0 ) { + backp &= ~SHIFT_LONG; + int[] ptrcpy = ((backp == 0) ? pointers : pointers.clone()); + if( lg == 0 ) { + ++addCount; + alignments.add( new Alignment(longForm, shortForm, ptrcpy) ); + } else { + findAlignments(ptrcpy, lg-1, s); + } + } + + if( (backp & SHIFT_SHORT) != 0 ) { + backp &= ~SHIFT_SHORT; + int[] ptrcpy = ((backp == 0) ? 
pointers : pointers.clone()); + if( s == 0 ) { + ++addCount; + alignments.add( new Alignment(longForm, shortForm, ptrcpy) ); + } else { + findAlignments(ptrcpy, lg, s-1); + } + } + + if( lcLongForm[lg] == lcShortForm[s] ) + assert( (backMatrix[lg][s] & SHIFT_BOTH) != 0); + + if( (backp & SHIFT_BOTH) != 0 ) { + assert( lcLongForm[lg] == lcShortForm[s] ); + pointers[s] = lg; + if( lg == 0 || s == 0 ) { + ++addCount; + alignments.add( new Alignment(longForm, shortForm, pointers) ); + } else { + findAlignments(pointers, lg-1, s-1); + } + } + } + + public static void main(String[] args) throws Exception { + AlignmentFactory fact = new AlignmentFactory(args[0].toCharArray(), + AcronymModel.stripAcronym(args[1])); + + Iterator iter = fact.getAlignments(); + while( iter.hasNext() ) { + Alignment a = iter.next(); + a.print(); + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/CliqueTemplates.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/CliqueTemplates.java new file mode 100644 index 0000000..100f093 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/CliqueTemplates.java @@ -0,0 +1,31 @@ +package edu.stanford.nlp.ie.pascal; + +import edu.stanford.nlp.stats.ClassicCounter; + +import java.net.URL; +import java.util.HashMap; +import java.util.ArrayList; + +/** + * Template information and counters corresponding to sampling on one document. + * + * As an alternative to reading a document labelling into a full {@link PascalTemplate} + * we can read it into partial templates which contain only strictly related information, + * (See {@link DateTemplate} and {@link InfoTemplate}). 
+ * + * @author Chris Cox + */ + +public class CliqueTemplates { + + public HashMap stemmedAcronymIndex = new HashMap(); + public HashMap inverseAcronymMap = new HashMap(); + + public ArrayList urls = null; + + public ClassicCounter dateCliqueCounter = new ClassicCounter(); + public ClassicCounter locationCliqueCounter = new ClassicCounter(); + public ClassicCounter workshopInfoCliqueCounter = new ClassicCounter(); + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/DateTemplate.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/DateTemplate.java new file mode 100644 index 0000000..6e3a25e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/DateTemplate.java @@ -0,0 +1,49 @@ +package edu.stanford.nlp.ie.pascal; + +/** + * A partial {@link PascalTemplate}. Holds date fields only. + * + * @author Chris Cox + */ +public class DateTemplate{ + + public String subdate="1/1/1000"; + public String noadate="1/1/1000"; + public String crcdate="1/1/1000"; + public String workdate="1/1/1000"; + + public DateTemplate(String subdate,String noadate,String crcdate,String workdate) { + if(subdate!=null)this.subdate=subdate; + if(noadate!=null)this.noadate=noadate; + if(crcdate!=null)this.crcdate=crcdate; + if(workdate!=null)this.workdate=workdate; + } + + @Override + public int hashCode() { + int tally = 31; + int n = 3; + tally = tally+n*subdate.hashCode()+n*n*noadate.hashCode()+ + n*n*n*crcdate.hashCode()+n*workdate.hashCode(); + return tally; + } + + @Override + public boolean equals(Object obj) { + if(obj==null)return false; + if(! 
(obj instanceof DateTemplate)) return false; + + DateTemplate d = (DateTemplate)obj; + return (subdate.equals(d.subdate) && + noadate.equals(d.noadate) && + crcdate.equals(d.crcdate) && + workdate.equals(d.workdate)); + } + + @Override + public String toString() { + return (" Sub:" + subdate + " Noa:" + noadate + " Crc:" + crcdate + " Wrk:" + workdate); + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/DefaultTeXHyphenData.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/DefaultTeXHyphenData.java new file mode 100644 index 0000000..029912f --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/DefaultTeXHyphenData.java @@ -0,0 +1,4461 @@ +package edu.stanford.nlp.ie.pascal; + +/** + * Default TeX hyphenation data, as borrowed from the TeX distribution. + * + * THIS FILE IS AUTOMATICALLY GENERATED from DefaultTeXHyphenData.java.template + * and buildDefaultTeXHyphenData.pl. + * + * DO NOT EDIT DefaultTeXHyphenData.java; YOUR CHANGES WILL BE DISCARDED. 
+ */ +public class DefaultTeXHyphenData { + public static final String hyphenData = + ".ach4\n" + + ".ad4der\n" + + ".af1t\n" + + ".al3t\n" + + ".am5at\n" + + ".an5c\n" + + ".ang4\n" + + ".ani5m\n" + + ".ant4\n" + + ".an3te\n" + + ".anti5s\n" + + ".ar5s\n" + + ".ar4tie\n" + + ".ar4ty\n" + + ".as3c\n" + + ".as1p\n" + + ".as1s\n" + + ".aster5\n" + + ".atom5\n" + + ".au1d\n" + + ".av4i\n" + + ".awn4\n" + + ".ba4g\n" + + ".ba5na\n" + + ".bas4e\n" + + ".ber4\n" + + ".be5ra\n" + + ".be3sm\n" + + ".be5sto\n" + + ".bri2\n" + + ".but4ti\n" + + ".cam4pe\n" + + ".can5c\n" + + ".capa5b\n" + + ".car5ol\n" + + ".ca4t\n" + + ".ce4la\n" + + ".ch4\n" + + ".chill5i\n" + + ".ci2\n" + + ".cit5r\n" + + ".co3e\n" + + ".co4r\n" + + ".cor5ner\n" + + ".de4moi\n" + + ".de3o\n" + + ".de3ra\n" + + ".de3ri\n" + + ".des4c\n" + + ".dictio5\n" + + ".do4t\n" + + ".du4c\n" + + ".dumb5\n" + + ".earth5\n" + + ".eas3i\n" + + ".eb4\n" + + ".eer4\n" + + ".eg2\n" + + ".el5d\n" + + ".el3em\n" + + ".enam3\n" + + ".en3g\n" + + ".en3s\n" + + ".eq5ui5t\n" + + ".er4ri\n" + + ".es3\n" + + ".eu3\n" + + ".eye5\n" + + ".fes3\n" + + ".for5mer\n" + + ".ga2\n" + + ".ge2\n" + + ".gen3t4\n" + + ".ge5og\n" + + ".gi5a\n" + + ".gi4b\n" + + ".go4r\n" + + ".hand5i\n" + + ".han5k\n" + + ".he2\n" + + ".hero5i\n" + + ".hes3\n" + + ".het3\n" + + ".hi3b\n" + + ".hi3er\n" + + ".hon5ey\n" + + ".hon3o\n" + + ".hov5\n" + + ".id4l\n" + + ".idol3\n" + + ".im3m\n" + + ".im5pin\n" + + ".in1\n" + + ".in3ci\n" + + ".ine2\n" + + ".in2k\n" + + ".in3s\n" + + ".ir5r\n" + + ".is4i\n" + + ".ju3r\n" + + ".la4cy\n" + + ".la4m\n" + + ".lat5er\n" + + ".lath5\n" + + ".le2\n" + + ".leg5e\n" + + ".len4\n" + + ".lep5\n" + + ".lev1\n" + + ".li4g\n" + + ".lig5a\n" + + ".li2n\n" + + ".li3o\n" + + ".li4t\n" + + ".mag5a5\n" + + ".mal5o\n" + + ".man5a\n" + + ".mar5ti\n" + + ".me2\n" + + ".mer3c\n" + + ".me5ter\n" + + ".mis1\n" + + ".mist5i\n" + + ".mon3e\n" + + ".mo3ro\n" + + ".mu5ta\n" + + ".muta5b\n" + + ".ni4c\n" + + ".od2\n" + + ".odd5\n" + + ".of5te\n" 
+ + ".or5ato\n" + + ".or3c\n" + + ".or1d\n" + + ".or3t\n" + + ".os3\n" + + ".os4tl\n" + + ".oth3\n" + + ".out3\n" + + ".ped5al\n" + + ".pe5te\n" + + ".pe5tit\n" + + ".pi4e\n" + + ".pio5n\n" + + ".pi2t\n" + + ".pre3m\n" + + ".ra4c\n" + + ".ran4t\n" + + ".ratio5na\n" + + ".ree2\n" + + ".re5mit\n" + + ".res2\n" + + ".re5stat\n" + + ".ri4g\n" + + ".rit5u\n" + + ".ro4q\n" + + ".ros5t\n" + + ".row5d\n" + + ".ru4d\n" + + ".sci3e\n" + + ".self5\n" + + ".sell5\n" + + ".se2n\n" + + ".se5rie\n" + + ".sh2\n" + + ".si2\n" + + ".sing4\n" + + ".st4\n" + + ".sta5bl\n" + + ".sy2\n" + + ".ta4\n" + + ".te4\n" + + ".ten5an\n" + + ".th2\n" + + ".ti2\n" + + ".til4\n" + + ".tim5o5\n" + + ".ting4\n" + + ".tin5k\n" + + ".ton4a\n" + + ".to4p\n" + + ".top5i\n" + + ".tou5s\n" + + ".trib5ut\n" + + ".un1a\n" + + ".un3ce\n" + + ".under5\n" + + ".un1e\n" + + ".un5k\n" + + ".un5o\n" + + ".un3u\n" + + ".up3\n" + + ".ure3\n" + + ".us5a\n" + + ".ven4de\n" + + ".ve5ra\n" + + ".wil5i\n" + + ".ye4\n" + + "4ab.\n" + + "a5bal\n" + + "a5ban\n" + + "abe2\n" + + "ab5erd\n" + + "abi5a\n" + + "ab5it5ab\n" + + "ab5lat\n" + + "ab5o5liz\n" + + "4abr\n" + + "ab5rog\n" + + "ab3ul\n" + + "a4car\n" + + "ac5ard\n" + + "ac5aro\n" + + "a5ceou\n" + + "ac1er\n" + + "a5chet\n" + + "4a2ci\n" + + "a3cie\n" + + "ac1in\n" + + "a3cio\n" + + "ac5rob\n" + + "act5if\n" + + "ac3ul\n" + + "ac4um\n" + + "a2d\n" + + "ad4din\n" + + "ad5er.\n" + + "2adi\n" + + "a3dia\n" + + "ad3ica\n" + + "adi4er\n" + + "a3dio\n" + + "a3dit\n" + + "a5diu\n" + + "ad4le\n" + + "ad3ow\n" + + "ad5ran\n" + + "ad4su\n" + + "4adu\n" + + "a3duc\n" + + "ad5um\n" + + "ae4r\n" + + "aeri4e\n" + + "a2f\n" + + "aff4\n" + + "a4gab\n" + + "aga4n\n" + + "ag5ell\n" + + "age4o\n" + + "4ageu\n" + + "ag1i\n" + + "4ag4l\n" + + "ag1n\n" + + "a2go\n" + + "3agog\n" + + "ag3oni\n" + + "a5guer\n" + + "ag5ul\n" + + "a4gy\n" + + "a3ha\n" + + "a3he\n" + + "ah4l\n" + + "a3ho\n" + + "ai2\n" + + "a5ia\n" + + "a3ic.\n" + + "ai5ly\n" + + "a4i4n\n" + + "ain5in\n" + + "ain5o\n" + + 
"ait5en\n" + + "a1j\n" + + "ak1en\n" + + "al5ab\n" + + "al3ad\n" + + "a4lar\n" + + "4aldi\n" + + "2ale\n" + + "al3end\n" + + "a4lenti\n" + + "a5le5o\n" + + "al1i\n" + + "al4ia.\n" + + "ali4e\n" + + "al5lev\n" + + "4allic\n" + + "4alm\n" + + "a5log.\n" + + "a4ly.\n" + + "4alys\n" + + "5a5lyst\n" + + "5alyt\n" + + "3alyz\n" + + "4ama\n" + + "am5ab\n" + + "am3ag\n" + + "ama5ra\n" + + "am5asc\n" + + "a4matis\n" + + "a4m5ato\n" + + "am5era\n" + + "am3ic\n" + + "am5if\n" + + "am5ily\n" + + "am1in\n" + + "ami4no\n" + + "a2mo\n" + + "a5mon\n" + + "amor5i\n" + + "amp5en\n" + + "a2n\n" + + "an3age\n" + + "3analy\n" + + "a3nar\n" + + "an3arc\n" + + "anar4i\n" + + "a3nati\n" + + "4and\n" + + "ande4s\n" + + "an3dis\n" + + "an1dl\n" + + "an4dow\n" + + "a5nee\n" + + "a3nen\n" + + "an5est.\n" + + "a3neu\n" + + "2ang\n" + + "ang5ie\n" + + "an1gl\n" + + "a4n1ic\n" + + "a3nies\n" + + "an3i3f\n" + + "an4ime\n" + + "a5nimi\n" + + "a5nine\n" + + "an3io\n" + + "a3nip\n" + + "an3ish\n" + + "an3it\n" + + "a3niu\n" + + "an4kli\n" + + "5anniz\n" + + "ano4\n" + + "an5ot\n" + + "anoth5\n" + + "an2sa\n" + + "an4sco\n" + + "an4sn\n" + + "an2sp\n" + + "ans3po\n" + + "an4st\n" + + "an4sur\n" + + "antal4\n" + + "an4tie\n" + + "4anto\n" + + "an2tr\n" + + "an4tw\n" + + "an3ua\n" + + "an3ul\n" + + "a5nur\n" + + "4ao\n" + + "apar4\n" + + "ap5at\n" + + "ap5ero\n" + + "a3pher\n" + + "4aphi\n" + + "a4pilla\n" + + "ap5illar\n" + + "ap3in\n" + + "ap3ita\n" + + "a3pitu\n" + + "a2pl\n" + + "apoc5\n" + + "ap5ola\n" + + "apor5i\n" + + "apos3t\n" + + "aps5es\n" + + "a3pu\n" + + "aque5\n" + + "2a2r\n" + + "ar3act\n" + + "a5rade\n" + + "ar5adis\n" + + "ar3al\n" + + "a5ramete\n" + + "aran4g\n" + + "ara3p\n" + + "ar4at\n" + + "a5ratio\n" + + "ar5ativ\n" + + "a5rau\n" + + "ar5av4\n" + + "araw4\n" + + "arbal4\n" + + "ar4chan\n" + + "ar5dine\n" + + "ar4dr\n" + + "ar5eas\n" + + "a3ree\n" + + "ar3ent\n" + + "a5ress\n" + + "ar4fi\n" + + "ar4fl\n" + + "ar1i\n" + + "ar5ial\n" + + "ar3ian\n" + + "a3riet\n" + + "ar4im\n" + + 
"ar5inat\n" + + "ar3io\n" + + "ar2iz\n" + + "ar2mi\n" + + "ar5o5d\n" + + "a5roni\n" + + "a3roo\n" + + "ar2p\n" + + "ar3q\n" + + "arre4\n" + + "ar4sa\n" + + "ar2sh\n" + + "4as.\n" + + "as4ab\n" + + "as3ant\n" + + "ashi4\n" + + "a5sia.\n" + + "a3sib\n" + + "a3sic\n" + + "5a5si4t\n" + + "ask3i\n" + + "as4l\n" + + "a4soc\n" + + "as5ph\n" + + "as4sh\n" + + "as3ten\n" + + "as1tr\n" + + "asur5a\n" + + "a2ta\n" + + "at3abl\n" + + "at5ac\n" + + "at3alo\n" + + "at5ap\n" + + "ate5c\n" + + "at5ech\n" + + "at3ego\n" + + "at3en.\n" + + "at3era\n" + + "ater5n\n" + + "a5terna\n" + + "at3est\n" + + "at5ev\n" + + "4ath\n" + + "ath5em\n" + + "a5then\n" + + "at4ho\n" + + "ath5om\n" + + "4ati.\n" + + "a5tia\n" + + "at5i5b\n" + + "at1ic\n" + + "at3if\n" + + "ation5ar\n" + + "at3itu\n" + + "a4tog\n" + + "a2tom\n" + + "at5omiz\n" + + "a4top\n" + + "a4tos\n" + + "a1tr\n" + + "at5rop\n" + + "at4sk\n" + + "at4tag\n" + + "at5te\n" + + "at4th\n" + + "a2tu\n" + + "at5ua\n" + + "at5ue\n" + + "at3ul\n" + + "at3ura\n" + + "a2ty\n" + + "au4b\n" + + "augh3\n" + + "au3gu\n" + + "au4l2\n" + + "aun5d\n" + + "au3r\n" + + "au5sib\n" + + "aut5en\n" + + "au1th\n" + + "a2va\n" + + "av3ag\n" + + "a5van\n" + + "ave4no\n" + + "av3era\n" + + "av5ern\n" + + "av5ery\n" + + "av1i\n" + + "avi4er\n" + + "av3ig\n" + + "av5oc\n" + + "a1vor\n" + + "3away\n" + + "aw3i\n" + + "aw4ly\n" + + "aws4\n" + + "ax4ic\n" + + "ax4id\n" + + "ay5al\n" + + "aye4\n" + + "ays4\n" + + "azi4er\n" + + "azz5i\n" + + "5ba.\n" + + "bad5ger\n" + + "ba4ge\n" + + "bal1a\n" + + "ban5dag\n" + + "ban4e\n" + + "ban3i\n" + + "barbi5\n" + + "bari4a\n" + + "bas4si\n" + + "1bat\n" + + "ba4z\n" + + "2b1b\n" + + "b2be\n" + + "b3ber\n" + + "bbi4na\n" + + "4b1d\n" + + "4be.\n" + + "beak4\n" + + "beat3\n" + + "4be2d\n" + + "be3da\n" + + "be3de\n" + + "be3di\n" + + "be3gi\n" + + "be5gu\n" + + "1bel\n" + + "be1li\n" + + "be3lo\n" + + "4be5m\n" + + "be5nig\n" + + "be5nu\n" + + "4bes4\n" + + "be3sp\n" + + "be5str\n" + + "3bet\n" + + "bet5iz\n" + + "be5tr\n" + + 
"be3tw\n" + + "be3w\n" + + "be5yo\n" + + "2bf\n" + + "4b3h\n" + + "bi2b\n" + + "bi4d\n" + + "3bie\n" + + "bi5en\n" + + "bi4er\n" + + "2b3if\n" + + "1bil\n" + + "bi3liz\n" + + "bina5r4\n" + + "bin4d\n" + + "bi5net\n" + + "bi3ogr\n" + + "bi5ou\n" + + "bi2t\n" + + "3bi3tio\n" + + "bi3tr\n" + + "3bit5ua\n" + + "b5itz\n" + + "b1j\n" + + "bk4\n" + + "b2l2\n" + + "blath5\n" + + "b4le.\n" + + "blen4\n" + + "5blesp\n" + + "b3lis\n" + + "b4lo\n" + + "blun4t\n" + + "4b1m\n" + + "4b3n\n" + + "bne5g\n" + + "3bod\n" + + "bod3i\n" + + "bo4e\n" + + "bol3ic\n" + + "bom4bi\n" + + "bon4a\n" + + "bon5at\n" + + "3boo\n" + + "5bor.\n" + + "4b1ora\n" + + "bor5d\n" + + "5bore\n" + + "5bori\n" + + "5bos4\n" + + "b5ota\n" + + "both5\n" + + "bo4to\n" + + "bound3\n" + + "4bp\n" + + "4brit\n" + + "broth3\n" + + "2b5s2\n" + + "bsor4\n" + + "2bt\n" + + "bt4l\n" + + "b4to\n" + + "b3tr\n" + + "buf4fer\n" + + "bu4ga\n" + + "bu3li\n" + + "bumi4\n" + + "bu4n\n" + + "bunt4i\n" + + "bu3re\n" + + "bus5ie\n" + + "buss4e\n" + + "5bust\n" + + "4buta\n" + + "3butio\n" + + "b5uto\n" + + "b1v\n" + + "4b5w\n" + + "5by.\n" + + "bys4\n" + + "1ca\n" + + "cab3in\n" + + "ca1bl\n" + + "cach4\n" + + "ca5den\n" + + "4cag4\n" + + "2c5ah\n" + + "ca3lat\n" + + "cal4la\n" + + "call5in\n" + + "4calo\n" + + "can5d\n" + + "can4e\n" + + "can4ic\n" + + "can5is\n" + + "can3iz\n" + + "can4ty\n" + + "cany4\n" + + "ca5per\n" + + "car5om\n" + + "cast5er\n" + + "cas5tig\n" + + "4casy\n" + + "ca4th\n" + + "4cativ\n" + + "cav5al\n" + + "c3c\n" + + "ccha5\n" + + "cci4a\n" + + "ccompa5\n" + + "ccon4\n" + + "ccou3t\n" + + "2ce.\n" + + "4ced.\n" + + "4ceden\n" + + "3cei\n" + + "5cel.\n" + + "3cell\n" + + "1cen\n" + + "3cenc\n" + + "2cen4e\n" + + "4ceni\n" + + "3cent\n" + + "3cep\n" + + "ce5ram\n" + + "4cesa\n" + + "3cessi\n" + + "ces5si5b\n" + + "ces5t\n" + + "cet4\n" + + "c5e4ta\n" + + "cew4\n" + + "2ch\n" + + "4ch.\n" + + "4ch3ab\n" + + "5chanic\n" + + "ch5a5nis\n" + + "che2\n" + + "cheap3\n" + + "4ched\n" + + "che5lo\n" + + "3chemi\n" 
+ + "ch5ene\n" + + "ch3er.\n" + + "ch3ers\n" + + "4ch1in\n" + + "5chine.\n" + + "ch5iness\n" + + "5chini\n" + + "5chio\n" + + "3chit\n" + + "chi2z\n" + + "3cho2\n" + + "ch4ti\n" + + "1ci\n" + + "3cia\n" + + "ci2a5b\n" + + "cia5r\n" + + "ci5c\n" + + "4cier\n" + + "5cific.\n" + + "4cii\n" + + "ci4la\n" + + "3cili\n" + + "2cim\n" + + "2cin\n" + + "c4ina\n" + + "3cinat\n" + + "cin3em\n" + + "c1ing\n" + + "c5ing.\n" + + "5cino\n" + + "cion4\n" + + "4cipe\n" + + "ci3ph\n" + + "4cipic\n" + + "4cista\n" + + "4cisti\n" + + "2c1it\n" + + "cit3iz\n" + + "5ciz\n" + + "ck1\n" + + "ck3i\n" + + "1c4l4\n" + + "4clar\n" + + "c5laratio\n" + + "5clare\n" + + "cle4m\n" + + "4clic\n" + + "clim4\n" + + "cly4\n" + + "c5n\n" + + "1co\n" + + "co5ag\n" + + "coe2\n" + + "2cog\n" + + "co4gr\n" + + "coi4\n" + + "co3inc\n" + + "col5i\n" + + "5colo\n" + + "col3or\n" + + "com5er\n" + + "con4a\n" + + "c4one\n" + + "con3g\n" + + "con5t\n" + + "co3pa\n" + + "cop3ic\n" + + "co4pl\n" + + "4corb\n" + + "coro3n\n" + + "cos4e\n" + + "cov1\n" + + "cove4\n" + + "cow5a\n" + + "coz5e\n" + + "co5zi\n" + + "c1q\n" + + "cras5t\n" + + "5crat.\n" + + "5cratic\n" + + "cre3at\n" + + "5cred\n" + + "4c3reta\n" + + "cre4v\n" + + "cri2\n" + + "cri5f\n" + + "c4rin\n" + + "cris4\n" + + "5criti\n" + + "cro4pl\n" + + "crop5o\n" + + "cros4e\n" + + "cru4d\n" + + "4c3s2\n" + + "2c1t\n" + + "cta4b\n" + + "ct5ang\n" + + "c5tant\n" + + "c2te\n" + + "c3ter\n" + + "c4ticu\n" + + "ctim3i\n" + + "ctu4r\n" + + "c4tw\n" + + "cud5\n" + + "c4uf\n" + + "c4ui\n" + + "cu5ity\n" + + "5culi\n" + + "cul4tis\n" + + "3cultu\n" + + "cu2ma\n" + + "c3ume\n" + + "cu4mi\n" + + "3cun\n" + + "cu3pi\n" + + "cu5py\n" + + "cur5a4b\n" + + "cu5ria\n" + + "1cus\n" + + "cuss4i\n" + + "3c4ut\n" + + "cu4tie\n" + + "4c5utiv\n" + + "4cutr\n" + + "1cy\n" + + "cze4\n" + + "1d2a\n" + + "5da.\n" + + "2d3a4b\n" + + "dach4\n" + + "4daf\n" + + "2dag\n" + + "da2m2\n" + + "dan3g\n" + + "dard5\n" + + "dark5\n" + + "4dary\n" + + "3dat\n" + + "4dativ\n" + + "4dato\n" + + 
"5dav4\n" + + "dav5e\n" + + "5day\n" + + "d1b\n" + + "d5c\n" + + "d1d4\n" + + "2de.\n" + + "deaf5\n" + + "deb5it\n" + + "de4bon\n" + + "decan4\n" + + "de4cil\n" + + "de5com\n" + + "2d1ed\n" + + "4dee.\n" + + "de5if\n" + + "deli4e\n" + + "del5i5q\n" + + "de5lo\n" + + "d4em\n" + + "5dem.\n" + + "3demic\n" + + "dem5ic.\n" + + "de5mil\n" + + "de4mons\n" + + "demor5\n" + + "1den\n" + + "de4nar\n" + + "de3no\n" + + "denti5f\n" + + "de3nu\n" + + "de1p\n" + + "de3pa\n" + + "depi4\n" + + "de2pu\n" + + "d3eq\n" + + "d4erh\n" + + "5derm\n" + + "dern5iz\n" + + "der5s\n" + + "des2\n" + + "d2es.\n" + + "de1sc\n" + + "de2s5o\n" + + "des3ti\n" + + "de3str\n" + + "de4su\n" + + "de1t\n" + + "de2to\n" + + "de1v\n" + + "dev3il\n" + + "4dey\n" + + "4d1f\n" + + "d4ga\n" + + "d3ge4t\n" + + "dg1i\n" + + "d2gy\n" + + "d1h2\n" + + "5di.\n" + + "1d4i3a\n" + + "dia5b\n" + + "di4cam\n" + + "d4ice\n" + + "3dict\n" + + "3did\n" + + "5di3en\n" + + "d1if\n" + + "di3ge\n" + + "di4lato\n" + + "d1in\n" + + "1dina\n" + + "3dine.\n" + + "5dini\n" + + "di5niz\n" + + "1dio\n" + + "dio5g\n" + + "di4pl\n" + + "dir2\n" + + "di1re\n" + + "dirt5i\n" + + "dis1\n" + + "5disi\n" + + "d4is3t\n" + + "d2iti\n" + + "1di1v\n" + + "d1j\n" + + "d5k2\n" + + "4d5la\n" + + "3dle.\n" + + "3dled\n" + + "3dles.\n" + + "4dless\n" + + "2d3lo\n" + + "4d5lu\n" + + "2dly\n" + + "d1m\n" + + "4d1n4\n" + + "1do\n" + + "3do.\n" + + "do5de\n" + + "5doe\n" + + "2d5of\n" + + "d4og\n" + + "do4la\n" + + "doli4\n" + + "do5lor\n" + + "dom5iz\n" + + "do3nat\n" + + "doni4\n" + + "doo3d\n" + + "dop4p\n" + + "d4or\n" + + "3dos\n" + + "4d5out\n" + + "do4v\n" + + "3dox\n" + + "d1p\n" + + "1dr\n" + + "drag5on\n" + + "4drai\n" + + "dre4\n" + + "drea5r\n" + + "5dren\n" + + "dri4b\n" + + "dril4\n" + + "dro4p\n" + + "4drow\n" + + "5drupli\n" + + "4dry\n" + + "2d1s2\n" + + "ds4p\n" + + "d4sw\n" + + "d4sy\n" + + "d2th\n" + + "1du\n" + + "d1u1a\n" + + "du2c\n" + + "d1uca\n" + + "duc5er\n" + + "4duct.\n" + + "4ducts\n" + + "du5el\n" + + "du4g\n" + + 
"d3ule\n" + + "dum4be\n" + + "du4n\n" + + "4dup\n" + + "du4pe\n" + + "d1v\n" + + "d1w\n" + + "d2y\n" + + "5dyn\n" + + "dy4se\n" + + "dys5p\n" + + "e1a4b\n" + + "e3act\n" + + "ead1\n" + + "ead5ie\n" + + "ea4ge\n" + + "ea5ger\n" + + "ea4l\n" + + "eal5er\n" + + "eal3ou\n" + + "eam3er\n" + + "e5and\n" + + "ear3a\n" + + "ear4c\n" + + "ear5es\n" + + "ear4ic\n" + + "ear4il\n" + + "ear5k\n" + + "ear2t\n" + + "eart3e\n" + + "ea5sp\n" + + "e3ass\n" + + "east3\n" + + "ea2t\n" + + "eat5en\n" + + "eath3i\n" + + "e5atif\n" + + "e4a3tu\n" + + "ea2v\n" + + "eav3en\n" + + "eav5i\n" + + "eav5o\n" + + "2e1b\n" + + "e4bel.\n" + + "e4bels\n" + + "e4ben\n" + + "e4bit\n" + + "e3br\n" + + "e4cad\n" + + "ecan5c\n" + + "ecca5\n" + + "e1ce\n" + + "ec5essa\n" + + "ec2i\n" + + "e4cib\n" + + "ec5ificat\n" + + "ec5ifie\n" + + "ec5ify\n" + + "ec3im\n" + + "eci4t\n" + + "e5cite\n" + + "e4clam\n" + + "e4clus\n" + + "e2col\n" + + "e4comm\n" + + "e4compe\n" + + "e4conc\n" + + "e2cor\n" + + "ec3ora\n" + + "eco5ro\n" + + "e1cr\n" + + "e4crem\n" + + "ec4tan\n" + + "ec4te\n" + + "e1cu\n" + + "e4cul\n" + + "ec3ula\n" + + "2e2da\n" + + "4ed3d\n" + + "e4d1er\n" + + "ede4s\n" + + "4edi\n" + + "e3dia\n" + + "ed3ib\n" + + "ed3ica\n" + + "ed3im\n" + + "ed1it\n" + + "edi5z\n" + + "4edo\n" + + "e4dol\n" + + "edon2\n" + + "e4dri\n" + + "e4dul\n" + + "ed5ulo\n" + + "ee2c\n" + + "eed3i\n" + + "ee2f\n" + + "eel3i\n" + + "ee4ly\n" + + "ee2m\n" + + "ee4na\n" + + "ee4p1\n" + + "ee2s4\n" + + "eest4\n" + + "ee4ty\n" + + "e5ex\n" + + "e1f\n" + + "e4f3ere\n" + + "1eff\n" + + "e4fic\n" + + "5efici\n" + + "efil4\n" + + "e3fine\n" + + "ef5i5nite\n" + + "3efit\n" + + "efor5es\n" + + "e4fuse.\n" + + "4egal\n" + + "eger4\n" + + "eg5ib\n" + + "eg4ic\n" + + "eg5ing\n" + + "e5git5\n" + + "eg5n\n" + + "e4go.\n" + + "e4gos\n" + + "eg1ul\n" + + "e5gur\n" + + "5egy\n" + + "e1h4\n" + + "eher4\n" + + "ei2\n" + + "e5ic\n" + + "ei5d\n" + + "eig2\n" + + "ei5gl\n" + + "e3imb\n" + + "e3inf\n" + + "e1ing\n" + + "e5inst\n" + + "eir4d\n" + + 
"eit3e\n" + + "ei3th\n" + + "e5ity\n" + + "e1j\n" + + "e4jud\n" + + "ej5udi\n" + + "eki4n\n" + + "ek4la\n" + + "e1la\n" + + "e4la.\n" + + "e4lac\n" + + "elan4d\n" + + "el5ativ\n" + + "e4law\n" + + "elaxa4\n" + + "e3lea\n" + + "el5ebra\n" + + "5elec\n" + + "e4led\n" + + "el3ega\n" + + "e5len\n" + + "e4l1er\n" + + "e1les\n" + + "el2f\n" + + "el2i\n" + + "e3libe\n" + + "e4l5ic.\n" + + "el3ica\n" + + "e3lier\n" + + "el5igib\n" + + "e5lim\n" + + "e4l3ing\n" + + "e3lio\n" + + "e2lis\n" + + "el5ish\n" + + "e3liv3\n" + + "4ella\n" + + "el4lab\n" + + "ello4\n" + + "e5loc\n" + + "el5og\n" + + "el3op.\n" + + "el2sh\n" + + "el4ta\n" + + "e5lud\n" + + "el5ug\n" + + "e4mac\n" + + "e4mag\n" + + "e5man\n" + + "em5ana\n" + + "em5b\n" + + "e1me\n" + + "e2mel\n" + + "e4met\n" + + "em3ica\n" + + "emi4e\n" + + "em5igra\n" + + "em1in2\n" + + "em5ine\n" + + "em3i3ni\n" + + "e4mis\n" + + "em5ish\n" + + "e5miss\n" + + "em3iz\n" + + "5emniz\n" + + "emo4g\n" + + "emoni5o\n" + + "em3pi\n" + + "e4mul\n" + + "em5ula\n" + + "emu3n\n" + + "e3my\n" + + "en5amo\n" + + "e4nant\n" + + "ench4er\n" + + "en3dic\n" + + "e5nea\n" + + "e5nee\n" + + "en3em\n" + + "en5ero\n" + + "en5esi\n" + + "en5est\n" + + "en3etr\n" + + "e3new\n" + + "en5ics\n" + + "e5nie\n" + + "e5nil\n" + + "e3nio\n" + + "en3ish\n" + + "en3it\n" + + "e5niu\n" + + "5eniz\n" + + "4enn\n" + + "4eno\n" + + "eno4g\n" + + "e4nos\n" + + "en3ov\n" + + "en4sw\n" + + "ent5age\n" + + "4enthes\n" + + "en3ua\n" + + "en5uf\n" + + "e3ny.\n" + + "4en3z\n" + + "e5of\n" + + "eo2g\n" + + "e4oi4\n" + + "e3ol\n" + + "eop3ar\n" + + "e1or\n" + + "eo3re\n" + + "eo5rol\n" + + "eos4\n" + + "e4ot\n" + + "eo4to\n" + + "e5out\n" + + "e5ow\n" + + "e2pa\n" + + "e3pai\n" + + "ep5anc\n" + + "e5pel\n" + + "e3pent\n" + + "ep5etitio\n" + + "ephe4\n" + + "e4pli\n" + + "e1po\n" + + "e4prec\n" + + "ep5reca\n" + + "e4pred\n" + + "ep3reh\n" + + "e3pro\n" + + "e4prob\n" + + "ep4sh\n" + + "ep5ti5b\n" + + "e4put\n" + + "ep5uta\n" + + "e1q\n" + + "equi3l\n" + + "e4q3ui3s\n" + + 
"er1a\n" + + "era4b\n" + + "4erand\n" + + "er3ar\n" + + "4erati.\n" + + "2erb\n" + + "er4bl\n" + + "er3ch\n" + + "er4che\n" + + "2ere.\n" + + "e3real\n" + + "ere5co\n" + + "ere3in\n" + + "er5el.\n" + + "er3emo\n" + + "er5ena\n" + + "er5ence\n" + + "4erene\n" + + "er3ent\n" + + "ere4q\n" + + "er5ess\n" + + "er3est\n" + + "eret4\n" + + "er1h\n" + + "er1i\n" + + "e1ria4\n" + + "5erick\n" + + "e3rien\n" + + "eri4er\n" + + "er3ine\n" + + "e1rio\n" + + "4erit\n" + + "er4iu\n" + + "eri4v\n" + + "e4riva\n" + + "er3m4\n" + + "er4nis\n" + + "4ernit\n" + + "5erniz\n" + + "er3no\n" + + "2ero\n" + + "er5ob\n" + + "e5roc\n" + + "ero4r\n" + + "er1ou\n" + + "er1s\n" + + "er3set\n" + + "ert3er\n" + + "4ertl\n" + + "er3tw\n" + + "4eru\n" + + "eru4t\n" + + "5erwau\n" + + "e1s4a\n" + + "e4sage.\n" + + "e4sages\n" + + "es2c\n" + + "e2sca\n" + + "es5can\n" + + "e3scr\n" + + "es5cu\n" + + "e1s2e\n" + + "e2sec\n" + + "es5ecr\n" + + "es5enc\n" + + "e4sert.\n" + + "e4serts\n" + + "e4serva\n" + + "4esh\n" + + "e3sha\n" + + "esh5en\n" + + "e1si\n" + + "e2sic\n" + + "e2sid\n" + + "es5iden\n" + + "es5igna\n" + + "e2s5im\n" + + "es4i4n\n" + + "esis4te\n" + + "esi4u\n" + + "e5skin\n" + + "es4mi\n" + + "e2sol\n" + + "es3olu\n" + + "e2son\n" + + "es5ona\n" + + "e1sp\n" + + "es3per\n" + + "es5pira\n" + + "es4pre\n" + + "2ess\n" + + "es4si4b\n" + + "estan4\n" + + "es3tig\n" + + "es5tim\n" + + "4es2to\n" + + "e3ston\n" + + "2estr\n" + + "e5stro\n" + + "estruc5\n" + + "e2sur\n" + + "es5urr\n" + + "es4w\n" + + "eta4b\n" + + "eten4d\n" + + "e3teo\n" + + "ethod3\n" + + "et1ic\n" + + "e5tide\n" + + "etin4\n" + + "eti4no\n" + + "e5tir\n" + + "e5titio\n" + + "et5itiv\n" + + "4etn\n" + + "et5ona\n" + + "e3tra\n" + + "e3tre\n" + + "et3ric\n" + + "et5rif\n" + + "et3rog\n" + + "et5ros\n" + + "et3ua\n" + + "et5ym\n" + + "et5z\n" + + "4eu\n" + + "e5un\n" + + "e3up\n" + + "eu3ro\n" + + "eus4\n" + + "eute4\n" + + "euti5l\n" + + "eu5tr\n" + + "eva2p5\n" + + "e2vas\n" + + "ev5ast\n" + + "e5vea\n" + + "ev3ell\n" + + 
"evel3o\n" + + "e5veng\n" + + "even4i\n" + + "ev1er\n" + + "e5verb\n" + + "e1vi\n" + + "ev3id\n" + + "evi4l\n" + + "e4vin\n" + + "evi4v\n" + + "e5voc\n" + + "e5vu\n" + + "e1wa\n" + + "e4wag\n" + + "e5wee\n" + + "e3wh\n" + + "ewil5\n" + + "ew3ing\n" + + "e3wit\n" + + "1exp\n" + + "5eyc\n" + + "5eye.\n" + + "eys4\n" + + "1fa\n" + + "fa3bl\n" + + "fab3r\n" + + "fa4ce\n" + + "4fag\n" + + "fain4\n" + + "fall5e\n" + + "4fa4ma\n" + + "fam5is\n" + + "5far\n" + + "far5th\n" + + "fa3ta\n" + + "fa3the\n" + + "4fato\n" + + "fault5\n" + + "4f5b\n" + + "4fd\n" + + "4fe.\n" + + "feas4\n" + + "feath3\n" + + "fe4b\n" + + "4feca\n" + + "5fect\n" + + "2fed\n" + + "fe3li\n" + + "fe4mo\n" + + "fen2d\n" + + "fend5e\n" + + "fer1\n" + + "5ferr\n" + + "fev4\n" + + "4f1f\n" + + "f4fes\n" + + "f4fie\n" + + "f5fin.\n" + + "f2f5is\n" + + "f4fly\n" + + "f2fy\n" + + "4fh\n" + + "1fi\n" + + "fi3a\n" + + "2f3ic.\n" + + "4f3ical\n" + + "f3ican\n" + + "4ficate\n" + + "f3icen\n" + + "fi3cer\n" + + "fic4i\n" + + "5ficia\n" + + "5ficie\n" + + "4fics\n" + + "fi3cu\n" + + "fi5del\n" + + "fight5\n" + + "fil5i\n" + + "fill5in\n" + + "4fily\n" + + "2fin\n" + + "5fina\n" + + "fin2d5\n" + + "fi2ne\n" + + "f1in3g\n" + + "fin4n\n" + + "fis4ti\n" + + "f4l2\n" + + "f5less\n" + + "flin4\n" + + "flo3re\n" + + "f2ly5\n" + + "4fm\n" + + "4fn\n" + + "1fo\n" + + "5fon\n" + + "fon4de\n" + + "fon4t\n" + + "fo2r\n" + + "fo5rat\n" + + "for5ay\n" + + "fore5t\n" + + "for4i\n" + + "fort5a\n" + + "fos5\n" + + "4f5p\n" + + "fra4t\n" + + "f5rea\n" + + "fres5c\n" + + "fri2\n" + + "fril4\n" + + "frol5\n" + + "2f3s\n" + + "2ft\n" + + "f4to\n" + + "f2ty\n" + + "3fu\n" + + "fu5el\n" + + "4fug\n" + + "fu4min\n" + + "fu5ne\n" + + "fu3ri\n" + + "fusi4\n" + + "fus4s\n" + + "4futa\n" + + "1fy\n" + + "1ga\n" + + "gaf4\n" + + "5gal.\n" + + "3gali\n" + + "ga3lo\n" + + "2gam\n" + + "ga5met\n" + + "g5amo\n" + + "gan5is\n" + + "ga3niz\n" + + "gani5za\n" + + "4gano\n" + + "gar5n4\n" + + "gass4\n" + + "gath3\n" + + "4gativ\n" + + "4gaz\n" + + 
"g3b\n" + + "gd4\n" + + "2ge.\n" + + "2ged\n" + + "geez4\n" + + "gel4in\n" + + "ge5lis\n" + + "ge5liz\n" + + "4gely\n" + + "1gen\n" + + "ge4nat\n" + + "ge5niz\n" + + "4geno\n" + + "4geny\n" + + "1geo\n" + + "ge3om\n" + + "g4ery\n" + + "5gesi\n" + + "geth5\n" + + "4geto\n" + + "ge4ty\n" + + "ge4v\n" + + "4g1g2\n" + + "g2ge\n" + + "g3ger\n" + + "gglu5\n" + + "ggo4\n" + + "gh3in\n" + + "gh5out\n" + + "gh4to\n" + + "5gi.\n" + + "1gi4a\n" + + "gia5r\n" + + "g1ic\n" + + "5gicia\n" + + "g4ico\n" + + "gien5\n" + + "5gies.\n" + + "gil4\n" + + "g3imen\n" + + "3g4in.\n" + + "gin5ge\n" + + "5g4ins\n" + + "5gio\n" + + "3gir\n" + + "gir4l\n" + + "g3isl\n" + + "gi4u\n" + + "5giv\n" + + "3giz\n" + + "gl2\n" + + "gla4\n" + + "glad5i\n" + + "5glas\n" + + "1gle\n" + + "gli4b\n" + + "g3lig\n" + + "3glo\n" + + "glo3r\n" + + "g1m\n" + + "g4my\n" + + "gn4a\n" + + "g4na.\n" + + "gnet4t\n" + + "g1ni\n" + + "g2nin\n" + + "g4nio\n" + + "g1no\n" + + "g4non\n" + + "1go\n" + + "3go.\n" + + "gob5\n" + + "5goe\n" + + "3g4o4g\n" + + "go3is\n" + + "gon2\n" + + "4g3o3na\n" + + "gondo5\n" + + "go3ni\n" + + "5goo\n" + + "go5riz\n" + + "gor5ou\n" + + "5gos.\n" + + "gov1\n" + + "g3p\n" + + "1gr\n" + + "4grada\n" + + "g4rai\n" + + "gran2\n" + + "5graph.\n" + + "g5rapher\n" + + "5graphic\n" + + "4graphy\n" + + "4gray\n" + + "gre4n\n" + + "4gress.\n" + + "4grit\n" + + "g4ro\n" + + "gruf4\n" + + "gs2\n" + + "g5ste\n" + + "gth3\n" + + "gu4a\n" + + "3guard\n" + + "2gue\n" + + "5gui5t\n" + + "3gun\n" + + "3gus\n" + + "4gu4t\n" + + "g3w\n" + + "1gy\n" + + "2g5y3n\n" + + "gy5ra\n" + + "h3ab4l\n" + + "hach4\n" + + "hae4m\n" + + "hae4t\n" + + "h5agu\n" + + "ha3la\n" + + "hala3m\n" + + "ha4m\n" + + "han4ci\n" + + "han4cy\n" + + "5hand.\n" + + "han4g\n" + + "hang5er\n" + + "hang5o\n" + + "h5a5niz\n" + + "han4k\n" + + "han4te\n" + + "hap3l\n" + + "hap5t\n" + + "ha3ran\n" + + "ha5ras\n" + + "har2d\n" + + "hard3e\n" + + "har4le\n" + + "harp5en\n" + + "har5ter\n" + + "has5s\n" + + "haun4\n" + + "5haz\n" + + "haz3a\n" + 
+ "h1b\n" + + "1head\n" + + "3hear\n" + + "he4can\n" + + "h5ecat\n" + + "h4ed\n" + + "he5do5\n" + + "he3l4i\n" + + "hel4lis\n" + + "hel4ly\n" + + "h5elo\n" + + "hem4p\n" + + "he2n\n" + + "hena4\n" + + "hen5at\n" + + "heo5r\n" + + "hep5\n" + + "h4era\n" + + "hera3p\n" + + "her4ba\n" + + "here5a\n" + + "h3ern\n" + + "h5erou\n" + + "h3ery\n" + + "h1es\n" + + "he2s5p\n" + + "he4t\n" + + "het4ed\n" + + "heu4\n" + + "h1f\n" + + "h1h\n" + + "hi5an\n" + + "hi4co\n" + + "high5\n" + + "h4il2\n" + + "himer4\n" + + "h4ina\n" + + "hion4e\n" + + "hi4p\n" + + "hir4l\n" + + "hi3ro\n" + + "hir4p\n" + + "hir4r\n" + + "his3el\n" + + "his4s\n" + + "hith5er\n" + + "hi2v\n" + + "4hk\n" + + "4h1l4\n" + + "hlan4\n" + + "h2lo\n" + + "hlo3ri\n" + + "4h1m\n" + + "hmet4\n" + + "2h1n\n" + + "h5odiz\n" + + "h5ods\n" + + "ho4g\n" + + "hoge4\n" + + "hol5ar\n" + + "3hol4e\n" + + "ho4ma\n" + + "home3\n" + + "hon4a\n" + + "ho5ny\n" + + "3hood\n" + + "hoon4\n" + + "hor5at\n" + + "ho5ris\n" + + "hort3e\n" + + "ho5ru\n" + + "hos4e\n" + + "ho5sen\n" + + "hos1p\n" + + "1hous\n" + + "house3\n" + + "hov5el\n" + + "4h5p\n" + + "4hr4\n" + + "hree5\n" + + "hro5niz\n" + + "hro3po\n" + + "4h1s2\n" + + "h4sh\n" + + "h4tar\n" + + "ht1en\n" + + "ht5es\n" + + "h4ty\n" + + "hu4g\n" + + "hu4min\n" + + "hun5ke\n" + + "hun4t\n" + + "hus3t4\n" + + "hu4t\n" + + "h1w\n" + + "h4wart\n" + + "hy3pe\n" + + "hy3ph\n" + + "hy2s\n" + + "2i1a\n" + + "i2al\n" + + "iam4\n" + + "iam5ete\n" + + "i2an\n" + + "4ianc\n" + + "ian3i\n" + + "4ian4t\n" + + "ia5pe\n" + + "iass4\n" + + "i4ativ\n" + + "ia4tric\n" + + "i4atu\n" + + "ibe4\n" + + "ib3era\n" + + "ib5ert\n" + + "ib5ia\n" + + "ib3in\n" + + "ib5it.\n" + + "ib5ite\n" + + "i1bl\n" + + "ib3li\n" + + "i5bo\n" + + "i1br\n" + + "i2b5ri\n" + + "i5bun\n" + + "4icam\n" + + "5icap\n" + + "4icar\n" + + "i4car.\n" + + "i4cara\n" + + "icas5\n" + + "i4cay\n" + + "iccu4\n" + + "4iceo\n" + + "4ich\n" + + "2ici\n" + + "i5cid\n" + + "ic5ina\n" + + "i2cip\n" + + "ic3ipa\n" + + "i4cly\n" + + "i2c5oc\n" 
+ + "4i1cr\n" + + "5icra\n" + + "i4cry\n" + + "ic4te\n" + + "ictu2\n" + + "ic4t3ua\n" + + "ic3ula\n" + + "ic4um\n" + + "ic5uo\n" + + "i3cur\n" + + "2id\n" + + "i4dai\n" + + "id5anc\n" + + "id5d\n" + + "ide3al\n" + + "ide4s\n" + + "i2di\n" + + "id5ian\n" + + "idi4ar\n" + + "i5die\n" + + "id3io\n" + + "idi5ou\n" + + "id1it\n" + + "id5iu\n" + + "i3dle\n" + + "i4dom\n" + + "id3ow\n" + + "i4dr\n" + + "i2du\n" + + "id5uo\n" + + "2ie4\n" + + "ied4e\n" + + "5ie5ga\n" + + "ield3\n" + + "ien5a4\n" + + "ien4e\n" + + "i5enn\n" + + "i3enti\n" + + "i1er.\n" + + "i3esc\n" + + "i1est\n" + + "i3et\n" + + "4if.\n" + + "if5ero\n" + + "iff5en\n" + + "if4fr\n" + + "4ific.\n" + + "i3fie\n" + + "i3fl\n" + + "4ift\n" + + "2ig\n" + + "iga5b\n" + + "ig3era\n" + + "ight3i\n" + + "4igi\n" + + "i3gib\n" + + "ig3il\n" + + "ig3in\n" + + "ig3it\n" + + "i4g4l\n" + + "i2go\n" + + "ig3or\n" + + "ig5ot\n" + + "i5gre\n" + + "igu5i\n" + + "ig1ur\n" + + "i3h\n" + + "4i5i4\n" + + "i3j\n" + + "4ik\n" + + "i1la\n" + + "il3a4b\n" + + "i4lade\n" + + "i2l5am\n" + + "ila5ra\n" + + "i3leg\n" + + "il1er\n" + + "ilev4\n" + + "il5f\n" + + "il1i\n" + + "il3ia\n" + + "il2ib\n" + + "il3io\n" + + "il4ist\n" + + "2ilit\n" + + "il2iz\n" + + "ill5ab\n" + + "4iln\n" + + "il3oq\n" + + "il4ty\n" + + "il5ur\n" + + "il3v\n" + + "i4mag\n" + + "im3age\n" + + "ima5ry\n" + + "imenta5r\n" + + "4imet\n" + + "im1i\n" + + "im5ida\n" + + "imi5le\n" + + "i5mini\n" + + "4imit\n" + + "im4ni\n" + + "i3mon\n" + + "i2mu\n" + + "im3ula\n" + + "2in.\n" + + "i4n3au\n" + + "4inav\n" + + "incel4\n" + + "in3cer\n" + + "4ind\n" + + "in5dling\n" + + "2ine\n" + + "i3nee\n" + + "iner4ar\n" + + "i5ness\n" + + "4inga\n" + + "4inge\n" + + "in5gen\n" + + "4ingi\n" + + "in5gling\n" + + "4ingo\n" + + "4ingu\n" + + "2ini\n" + + "i5ni.\n" + + "i4nia\n" + + "in3io\n" + + "in1is\n" + + "i5nite.\n" + + "5initio\n" + + "in3ity\n" + + "4ink\n" + + "4inl\n" + + "2inn\n" + + "2i1no\n" + + "i4no4c\n" + + "ino4s\n" + + "i4not\n" + + "2ins\n" + + "in3se\n" + + 
"insur5a\n" + + "2int.\n" + + "2in4th\n" + + "in1u\n" + + "i5nus\n" + + "4iny\n" + + "2io\n" + + "4io.\n" + + "ioge4\n" + + "io2gr\n" + + "i1ol\n" + + "io4m\n" + + "ion3at\n" + + "ion4ery\n" + + "ion3i\n" + + "io5ph\n" + + "ior3i\n" + + "i4os\n" + + "io5th\n" + + "i5oti\n" + + "io4to\n" + + "i4our\n" + + "2ip\n" + + "ipe4\n" + + "iphras4\n" + + "ip3i\n" + + "ip4ic\n" + + "ip4re4\n" + + "ip3ul\n" + + "i3qua\n" + + "iq5uef\n" + + "iq3uid\n" + + "iq3ui3t\n" + + "4ir\n" + + "i1ra\n" + + "ira4b\n" + + "i4rac\n" + + "ird5e\n" + + "ire4de\n" + + "i4ref\n" + + "i4rel4\n" + + "i4res\n" + + "ir5gi\n" + + "ir1i\n" + + "iri5de\n" + + "ir4is\n" + + "iri3tu\n" + + "5i5r2iz\n" + + "ir4min\n" + + "iro4g\n" + + "5iron.\n" + + "ir5ul\n" + + "2is.\n" + + "is5ag\n" + + "is3ar\n" + + "isas5\n" + + "2is1c\n" + + "is3ch\n" + + "4ise\n" + + "is3er\n" + + "3isf\n" + + "is5han\n" + + "is3hon\n" + + "ish5op\n" + + "is3ib\n" + + "isi4d\n" + + "i5sis\n" + + "is5itiv\n" + + "4is4k\n" + + "islan4\n" + + "4isms\n" + + "i2so\n" + + "iso5mer\n" + + "is1p\n" + + "is2pi\n" + + "is4py\n" + + "4is1s\n" + + "is4sal\n" + + "issen4\n" + + "is4ses\n" + + "is4ta.\n" + + "is1te\n" + + "is1ti\n" + + "ist4ly\n" + + "4istral\n" + + "i2su\n" + + "is5us\n" + + "4ita.\n" + + "ita4bi\n" + + "i4tag\n" + + "4ita5m\n" + + "i3tan\n" + + "i3tat\n" + + "2ite\n" + + "it3era\n" + + "i5teri\n" + + "it4es\n" + + "2ith\n" + + "i1ti\n" + + "4itia\n" + + "4i2tic\n" + + "it3ica\n" + + "5i5tick\n" + + "it3ig\n" + + "it5ill\n" + + "i2tim\n" + + "2itio\n" + + "4itis\n" + + "i4tism\n" + + "i2t5o5m\n" + + "4iton\n" + + "i4tram\n" + + "it5ry\n" + + "4itt\n" + + "it3uat\n" + + "i5tud\n" + + "it3ul\n" + + "4itz.\n" + + "i1u\n" + + "2iv\n" + + "iv3ell\n" + + "iv3en.\n" + + "i4v3er.\n" + + "i4vers.\n" + + "iv5il.\n" + + "iv5io\n" + + "iv1it\n" + + "i5vore\n" + + "iv3o3ro\n" + + "i4v3ot\n" + + "4i5w\n" + + "ix4o\n" + + "4iy\n" + + "4izar\n" + + "izi4\n" + + "5izont\n" + + "5ja\n" + + "jac4q\n" + + "ja4p\n" + + "1je\n" + + "jer5s\n" + + 
"4jestie\n" + + "4jesty\n" + + "jew3\n" + + "jo4p\n" + + "5judg\n" + + "3ka.\n" + + "k3ab\n" + + "k5ag\n" + + "kais4\n" + + "kal4\n" + + "k1b\n" + + "k2ed\n" + + "1kee\n" + + "ke4g\n" + + "ke5li\n" + + "k3en4d\n" + + "k1er\n" + + "kes4\n" + + "k3est.\n" + + "ke4ty\n" + + "k3f\n" + + "kh4\n" + + "k1i\n" + + "5ki.\n" + + "5k2ic\n" + + "k4ill\n" + + "kilo5\n" + + "k4im\n" + + "k4in.\n" + + "kin4de\n" + + "k5iness\n" + + "kin4g\n" + + "ki4p\n" + + "kis4\n" + + "k5ish\n" + + "kk4\n" + + "k1l\n" + + "4kley\n" + + "4kly\n" + + "k1m\n" + + "k5nes\n" + + "1k2no\n" + + "ko5r\n" + + "kosh4\n" + + "k3ou\n" + + "kro5n\n" + + "4k1s2\n" + + "k4sc\n" + + "ks4l\n" + + "k4sy\n" + + "k5t\n" + + "k1w\n" + + "lab3ic\n" + + "l4abo\n" + + "laci4\n" + + "l4ade\n" + + "la3dy\n" + + "lag4n\n" + + "lam3o\n" + + "3land\n" + + "lan4dl\n" + + "lan5et\n" + + "lan4te\n" + + "lar4g\n" + + "lar3i\n" + + "las4e\n" + + "la5tan\n" + + "4lateli\n" + + "4lativ\n" + + "4lav\n" + + "la4v4a\n" + + "2l1b\n" + + "lbin4\n" + + "4l1c2\n" + + "lce4\n" + + "l3ci\n" + + "2ld\n" + + "l2de\n" + + "ld4ere\n" + + "ld4eri\n" + + "ldi4\n" + + "ld5is\n" + + "l3dr\n" + + "l4dri\n" + + "le2a\n" + + "le4bi\n" + + "left5\n" + + "5leg.\n" + + "5legg\n" + + "le4mat\n" + + "lem5atic\n" + + "4len.\n" + + "3lenc\n" + + "5lene.\n" + + "1lent\n" + + "le3ph\n" + + "le4pr\n" + + "lera5b\n" + + "ler4e\n" + + "3lerg\n" + + "3l4eri\n" + + "l4ero\n" + + "les2\n" + + "le5sco\n" + + "5lesq\n" + + "3less\n" + + "5less.\n" + + "l3eva\n" + + "lev4er.\n" + + "lev4era\n" + + "lev4ers\n" + + "3ley\n" + + "4leye\n" + + "2lf\n" + + "l5fr\n" + + "4l1g4\n" + + "l5ga\n" + + "lgar3\n" + + "l4ges\n" + + "lgo3\n" + + "2l3h\n" + + "li4ag\n" + + "li2am\n" + + "liar5iz\n" + + "li4as\n" + + "li4ato\n" + + "li5bi\n" + + "5licio\n" + + "li4cor\n" + + "4lics\n" + + "4lict.\n" + + "l4icu\n" + + "l3icy\n" + + "l3ida\n" + + "lid5er\n" + + "3lidi\n" + + "lif3er\n" + + "l4iff\n" + + "li4fl\n" + + "5ligate\n" + + "3ligh\n" + + "li4gra\n" + + "3lik\n" + + "4l4i4l\n" 
+ + "lim4bl\n" + + "lim3i\n" + + "li4mo\n" + + "l4im4p\n" + + "l4ina\n" + + "1l4ine\n" + + "lin3ea\n" + + "lin3i\n" + + "link5er\n" + + "li5og\n" + + "4l4iq\n" + + "lis4p\n" + + "l1it\n" + + "l2it.\n" + + "5litica\n" + + "l5i5tics\n" + + "liv3er\n" + + "l1iz\n" + + "4lj\n" + + "lka3\n" + + "l3kal\n" + + "lka4t\n" + + "l1l\n" + + "l4law\n" + + "l2le\n" + + "l5lea\n" + + "l3lec\n" + + "l3leg\n" + + "l3lel\n" + + "l3le4n\n" + + "l3le4t\n" + + "ll2i\n" + + "l2lin4\n" + + "l5lina\n" + + "ll4o\n" + + "lloqui5\n" + + "ll5out\n" + + "l5low\n" + + "2lm\n" + + "l5met\n" + + "lm3ing\n" + + "l4mod\n" + + "lmon4\n" + + "2l1n2\n" + + "3lo.\n" + + "lob5al\n" + + "lo4ci\n" + + "4lof\n" + + "3logic\n" + + "l5ogo\n" + + "3logu\n" + + "lom3er\n" + + "5long\n" + + "lon4i\n" + + "l3o3niz\n" + + "lood5\n" + + "5lope.\n" + + "lop3i\n" + + "l3opm\n" + + "lora4\n" + + "lo4rato\n" + + "lo5rie\n" + + "lor5ou\n" + + "5los.\n" + + "los5et\n" + + "5losophiz\n" + + "5losophy\n" + + "los4t\n" + + "lo4ta\n" + + "loun5d\n" + + "2lout\n" + + "4lov\n" + + "2lp\n" + + "lpa5b\n" + + "l3pha\n" + + "l5phi\n" + + "lp5ing\n" + + "l3pit\n" + + "l4pl\n" + + "l5pr\n" + + "4l1r\n" + + "2l1s2\n" + + "l4sc\n" + + "l2se\n" + + "l4sie\n" + + "4lt\n" + + "lt5ag\n" + + "ltane5\n" + + "l1te\n" + + "lten4\n" + + "ltera4\n" + + "lth3i\n" + + "l5ties.\n" + + "ltis4\n" + + "l1tr\n" + + "ltu2\n" + + "ltur3a\n" + + "lu5a\n" + + "lu3br\n" + + "luch4\n" + + "lu3ci\n" + + "lu3en\n" + + "luf4\n" + + "lu5id\n" + + "lu4ma\n" + + "5lumi\n" + + "l5umn.\n" + + "5lumnia\n" + + "lu3o\n" + + "luo3r\n" + + "4lup\n" + + "luss4\n" + + "lus3te\n" + + "1lut\n" + + "l5ven\n" + + "l5vet4\n" + + "2l1w\n" + + "1ly\n" + + "4lya\n" + + "4lyb\n" + + "ly5me\n" + + "ly3no\n" + + "2lys4\n" + + "l5yse\n" + + "1ma\n" + + "2mab\n" + + "ma2ca\n" + + "ma5chine\n" + + "ma4cl\n" + + "mag5in\n" + + "5magn\n" + + "2mah\n" + + "maid5\n" + + "4mald\n" + + "ma3lig\n" + + "ma5lin\n" + + "mal4li\n" + + "mal4ty\n" + + "5mania\n" + + "man5is\n" + + "man3iz\n" + + 
"4map\n" + + "ma5rine.\n" + + "ma5riz\n" + + "mar4ly\n" + + "mar3v\n" + + "ma5sce\n" + + "mas4e\n" + + "mas1t\n" + + "5mate\n" + + "math3\n" + + "ma3tis\n" + + "4matiza\n" + + "4m1b\n" + + "mba4t5\n" + + "m5bil\n" + + "m4b3ing\n" + + "mbi4v\n" + + "4m5c\n" + + "4me.\n" + + "2med\n" + + "4med.\n" + + "5media\n" + + "me3die\n" + + "m5e5dy\n" + + "me2g\n" + + "mel5on\n" + + "mel4t\n" + + "me2m\n" + + "mem1o3\n" + + "1men\n" + + "men4a\n" + + "men5ac\n" + + "men4de\n" + + "4mene\n" + + "men4i\n" + + "mens4\n" + + "mensu5\n" + + "3ment\n" + + "men4te\n" + + "me5on\n" + + "m5ersa\n" + + "2mes\n" + + "3mesti\n" + + "me4ta\n" + + "met3al\n" + + "me1te\n" + + "me5thi\n" + + "m4etr\n" + + "5metric\n" + + "me5trie\n" + + "me3try\n" + + "me4v\n" + + "4m1f\n" + + "2mh\n" + + "5mi.\n" + + "mi3a\n" + + "mid4a\n" + + "mid4g\n" + + "mig4\n" + + "3milia\n" + + "m5i5lie\n" + + "m4ill\n" + + "min4a\n" + + "3mind\n" + + "m5inee\n" + + "m4ingl\n" + + "min5gli\n" + + "m5ingly\n" + + "min4t\n" + + "m4inu\n" + + "miot4\n" + + "m2is\n" + + "mis4er.\n" + + "mis5l\n" + + "mis4ti\n" + + "m5istry\n" + + "4mith\n" + + "m2iz\n" + + "4mk\n" + + "4m1l\n" + + "m1m\n" + + "mma5ry\n" + + "4m1n\n" + + "mn4a\n" + + "m4nin\n" + + "mn4o\n" + + "1mo\n" + + "4mocr\n" + + "5mocratiz\n" + + "mo2d1\n" + + "mo4go\n" + + "mois2\n" + + "moi5se\n" + + "4mok\n" + + "mo5lest\n" + + "mo3me\n" + + "mon5et\n" + + "mon5ge\n" + + "moni3a\n" + + "mon4ism\n" + + "mon4ist\n" + + "mo3niz\n" + + "monol4\n" + + "mo3ny.\n" + + "mo2r\n" + + "4mora.\n" + + "mos2\n" + + "mo5sey\n" + + "mo3sp\n" + + "moth3\n" + + "m5ouf\n" + + "3mous\n" + + "mo2v\n" + + "4m1p\n" + + "mpara5\n" + + "mpa5rab\n" + + "mpar5i\n" + + "m3pet\n" + + "mphas4\n" + + "m2pi\n" + + "mpi4a\n" + + "mp5ies\n" + + "m4p1in\n" + + "m5pir\n" + + "mp5is\n" + + "mpo3ri\n" + + "mpos5ite\n" + + "m4pous\n" + + "mpov5\n" + + "mp4tr\n" + + "m2py\n" + + "4m3r\n" + + "4m1s2\n" + + "m4sh\n" + + "m5si\n" + + "4mt\n" + + "1mu\n" + + "mula5r4\n" + + "5mult\n" + + "multi3\n" + + 
"3mum\n" + + "mun2\n" + + "4mup\n" + + "mu4u\n" + + "4mw\n" + + "1na\n" + + "2n1a2b\n" + + "n4abu\n" + + "4nac.\n" + + "na4ca\n" + + "n5act\n" + + "nag5er.\n" + + "nak4\n" + + "na4li\n" + + "na5lia\n" + + "4nalt\n" + + "na5mit\n" + + "n2an\n" + + "nanci4\n" + + "nan4it\n" + + "nank4\n" + + "nar3c\n" + + "4nare\n" + + "nar3i\n" + + "nar4l\n" + + "n5arm\n" + + "n4as\n" + + "nas4c\n" + + "nas5ti\n" + + "n2at\n" + + "na3tal\n" + + "nato5miz\n" + + "n2au\n" + + "nau3se\n" + + "3naut\n" + + "nav4e\n" + + "4n1b4\n" + + "ncar5\n" + + "n4ces.\n" + + "n3cha\n" + + "n5cheo\n" + + "n5chil\n" + + "n3chis\n" + + "nc1in\n" + + "nc4it\n" + + "ncour5a\n" + + "n1cr\n" + + "n1cu\n" + + "n4dai\n" + + "n5dan\n" + + "n1de\n" + + "nd5est.\n" + + "ndi4b\n" + + "n5d2if\n" + + "n1dit\n" + + "n3diz\n" + + "n5duc\n" + + "ndu4r\n" + + "nd2we\n" + + "2ne.\n" + + "n3ear\n" + + "ne2b\n" + + "neb3u\n" + + "ne2c\n" + + "5neck\n" + + "2ned\n" + + "ne4gat\n" + + "neg5ativ\n" + + "5nege\n" + + "ne4la\n" + + "nel5iz\n" + + "ne5mi\n" + + "ne4mo\n" + + "1nen\n" + + "4nene\n" + + "3neo\n" + + "ne4po\n" + + "ne2q\n" + + "n1er\n" + + "nera5b\n" + + "n4erar\n" + + "n2ere\n" + + "n4er5i\n" + + "ner4r\n" + + "1nes\n" + + "2nes.\n" + + "4nesp\n" + + "2nest\n" + + "4nesw\n" + + "3netic\n" + + "ne4v\n" + + "n5eve\n" + + "ne4w\n" + + "n3f\n" + + "n4gab\n" + + "n3gel\n" + + "nge4n4e\n" + + "n5gere\n" + + "n3geri\n" + + "ng5ha\n" + + "n3gib\n" + + "ng1in\n" + + "n5git\n" + + "n4gla\n" + + "ngov4\n" + + "ng5sh\n" + + "n1gu\n" + + "n4gum\n" + + "n2gy\n" + + "4n1h4\n" + + "nha4\n" + + "nhab3\n" + + "nhe4\n" + + "3n4ia\n" + + "ni3an\n" + + "ni4ap\n" + + "ni3ba\n" + + "ni4bl\n" + + "ni4d\n" + + "ni5di\n" + + "ni4er\n" + + "ni2fi\n" + + "ni5ficat\n" + + "n5igr\n" + + "nik4\n" + + "n1im\n" + + "ni3miz\n" + + "n1in\n" + + "5nine.\n" + + "nin4g\n" + + "ni4o\n" + + "5nis.\n" + + "nis4ta\n" + + "n2it\n" + + "n4ith\n" + + "3nitio\n" + + "n3itor\n" + + "ni3tr\n" + + "n1j\n" + + "4nk2\n" + + "n5kero\n" + + "n3ket\n" + + "nk3in\n" 
+ + "n1kl\n" + + "4n1l\n" + + "n5m\n" + + "nme4\n" + + "nmet4\n" + + "4n1n2\n" + + "nne4\n" + + "nni3al\n" + + "nni4v\n" + + "nob4l\n" + + "no3ble\n" + + "n5ocl\n" + + "4n3o2d\n" + + "3noe\n" + + "4nog\n" + + "noge4\n" + + "nois5i\n" + + "no5l4i\n" + + "5nologis\n" + + "3nomic\n" + + "n5o5miz\n" + + "no4mo\n" + + "no3my\n" + + "no4n\n" + + "non4ag\n" + + "non5i\n" + + "n5oniz\n" + + "4nop\n" + + "5nop5o5li\n" + + "nor5ab\n" + + "no4rary\n" + + "4nosc\n" + + "nos4e\n" + + "nos5t\n" + + "no5ta\n" + + "1nou\n" + + "3noun\n" + + "nov3el3\n" + + "nowl3\n" + + "n1p4\n" + + "npi4\n" + + "npre4c\n" + + "n1q\n" + + "n1r\n" + + "nru4\n" + + "2n1s2\n" + + "ns5ab\n" + + "nsati4\n" + + "ns4c\n" + + "n2se\n" + + "n4s3es\n" + + "nsid1\n" + + "nsig4\n" + + "n2sl\n" + + "ns3m\n" + + "n4soc\n" + + "ns4pe\n" + + "n5spi\n" + + "nsta5bl\n" + + "n1t\n" + + "nta4b\n" + + "nter3s\n" + + "nt2i\n" + + "n5tib\n" + + "nti4er\n" + + "nti2f\n" + + "n3tine\n" + + "n4t3ing\n" + + "nti4p\n" + + "ntrol5li\n" + + "nt4s\n" + + "ntu3me\n" + + "nu1a\n" + + "nu4d\n" + + "nu5en\n" + + "nuf4fe\n" + + "n3uin\n" + + "3nu3it\n" + + "n4um\n" + + "nu1me\n" + + "n5umi\n" + + "3nu4n\n" + + "n3uo\n" + + "nu3tr\n" + + "n1v2\n" + + "n1w4\n" + + "nym4\n" + + "nyp4\n" + + "4nz\n" + + "n3za\n" + + "4oa\n" + + "oad3\n" + + "o5a5les\n" + + "oard3\n" + + "oas4e\n" + + "oast5e\n" + + "oat5i\n" + + "ob3a3b\n" + + "o5bar\n" + + "obe4l\n" + + "o1bi\n" + + "o2bin\n" + + "ob5ing\n" + + "o3br\n" + + "ob3ul\n" + + "o1ce\n" + + "och4\n" + + "o3chet\n" + + "ocif3\n" + + "o4cil\n" + + "o4clam\n" + + "o4cod\n" + + "oc3rac\n" + + "oc5ratiz\n" + + "ocre3\n" + + "5ocrit\n" + + "octor5a\n" + + "oc3ula\n" + + "o5cure\n" + + "od5ded\n" + + "od3ic\n" + + "odi3o\n" + + "o2do4\n" + + "odor3\n" + + "od5uct.\n" + + "od5ucts\n" + + "o4el\n" + + "o5eng\n" + + "o3er\n" + + "oe4ta\n" + + "o3ev\n" + + "o2fi\n" + + "of5ite\n" + + "ofit4t\n" + + "o2g5a5r\n" + + "og5ativ\n" + + "o4gato\n" + + "o1ge\n" + + "o5gene\n" + + "o5geo\n" + + "o4ger\n" + + 
"o3gie\n" + + "1o1gis\n" + + "og3it\n" + + "o4gl\n" + + "o5g2ly\n" + + "3ogniz\n" + + "o4gro\n" + + "ogu5i\n" + + "1ogy\n" + + "2ogyn\n" + + "o1h2\n" + + "ohab5\n" + + "oi2\n" + + "oic3es\n" + + "oi3der\n" + + "oiff4\n" + + "oig4\n" + + "oi5let\n" + + "o3ing\n" + + "oint5er\n" + + "o5ism\n" + + "oi5son\n" + + "oist5en\n" + + "oi3ter\n" + + "o5j\n" + + "2ok\n" + + "o3ken\n" + + "ok5ie\n" + + "o1la\n" + + "o4lan\n" + + "olass4\n" + + "ol2d\n" + + "old1e\n" + + "ol3er\n" + + "o3lesc\n" + + "o3let\n" + + "ol4fi\n" + + "ol2i\n" + + "o3lia\n" + + "o3lice\n" + + "ol5id.\n" + + "o3li4f\n" + + "o5lil\n" + + "ol3ing\n" + + "o5lio\n" + + "o5lis.\n" + + "ol3ish\n" + + "o5lite\n" + + "o5litio\n" + + "o5liv\n" + + "olli4e\n" + + "ol5ogiz\n" + + "olo4r\n" + + "ol5pl\n" + + "ol2t\n" + + "ol3ub\n" + + "ol3ume\n" + + "ol3un\n" + + "o5lus\n" + + "ol2v\n" + + "o2ly\n" + + "om5ah\n" + + "oma5l\n" + + "om5atiz\n" + + "om2be\n" + + "om4bl\n" + + "o2me\n" + + "om3ena\n" + + "om5erse\n" + + "o4met\n" + + "om5etry\n" + + "o3mia\n" + + "om3ic.\n" + + "om3ica\n" + + "o5mid\n" + + "om1in\n" + + "o5mini\n" + + "5ommend\n" + + "omo4ge\n" + + "o4mon\n" + + "om3pi\n" + + "ompro5\n" + + "o2n\n" + + "on1a\n" + + "on4ac\n" + + "o3nan\n" + + "on1c\n" + + "3oncil\n" + + "2ond\n" + + "on5do\n" + + "o3nen\n" + + "on5est\n" + + "on4gu\n" + + "on1ic\n" + + "o3nio\n" + + "on1is\n" + + "o5niu\n" + + "on3key\n" + + "on4odi\n" + + "on3omy\n" + + "on3s\n" + + "onspi4\n" + + "onspir5a\n" + + "onsu4\n" + + "onten4\n" + + "on3t4i\n" + + "ontif5\n" + + "on5um\n" + + "onva5\n" + + "oo2\n" + + "ood5e\n" + + "ood5i\n" + + "oo4k\n" + + "oop3i\n" + + "o3ord\n" + + "oost5\n" + + "o2pa\n" + + "ope5d\n" + + "op1er\n" + + "3opera\n" + + "4operag\n" + + "2oph\n" + + "o5phan\n" + + "o5pher\n" + + "op3ing\n" + + "o3pit\n" + + "o5pon\n" + + "o4posi\n" + + "o1pr\n" + + "op1u\n" + + "opy5\n" + + "o1q\n" + + "o1ra\n" + + "o5ra.\n" + + "o4r3ag\n" + + "or5aliz\n" + + "or5ange\n" + + "ore5a\n" + + "o5real\n" + + "or3ei\n" + + 
"ore5sh\n" + + "or5est.\n" + + "orew4\n" + + "or4gu\n" + + "4o5ria\n" + + "or3ica\n" + + "o5ril\n" + + "or1in\n" + + "o1rio\n" + + "or3ity\n" + + "o3riu\n" + + "or2mi\n" + + "orn2e\n" + + "o5rof\n" + + "or3oug\n" + + "or5pe\n" + + "3orrh\n" + + "or4se\n" + + "ors5en\n" + + "orst4\n" + + "or3thi\n" + + "or3thy\n" + + "or4ty\n" + + "o5rum\n" + + "o1ry\n" + + "os3al\n" + + "os2c\n" + + "os4ce\n" + + "o3scop\n" + + "4oscopi\n" + + "o5scr\n" + + "os4i4e\n" + + "os5itiv\n" + + "os3ito\n" + + "os3ity\n" + + "osi4u\n" + + "os4l\n" + + "o2so\n" + + "os4pa\n" + + "os4po\n" + + "os2ta\n" + + "o5stati\n" + + "os5til\n" + + "os5tit\n" + + "o4tan\n" + + "otele4g\n" + + "ot3er.\n" + + "ot5ers\n" + + "o4tes\n" + + "4oth\n" + + "oth5esi\n" + + "oth3i4\n" + + "ot3ic.\n" + + "ot5ica\n" + + "o3tice\n" + + "o3tif\n" + + "o3tis\n" + + "oto5s\n" + + "ou2\n" + + "ou3bl\n" + + "ouch5i\n" + + "ou5et\n" + + "ou4l\n" + + "ounc5er\n" + + "oun2d\n" + + "ou5v\n" + + "ov4en\n" + + "over4ne\n" + + "over3s\n" + + "ov4ert\n" + + "o3vis\n" + + "oviti4\n" + + "o5v4ol\n" + + "ow3der\n" + + "ow3el\n" + + "ow5est\n" + + "ow1i\n" + + "own5i\n" + + "o4wo\n" + + "oy1a\n" + + "1pa\n" + + "pa4ca\n" + + "pa4ce\n" + + "pac4t\n" + + "p4ad\n" + + "5pagan\n" + + "p3agat\n" + + "p4ai\n" + + "pain4\n" + + "p4al\n" + + "pan4a\n" + + "pan3el\n" + + "pan4ty\n" + + "pa3ny\n" + + "pa1p\n" + + "pa4pu\n" + + "para5bl\n" + + "par5age\n" + + "par5di\n" + + "3pare\n" + + "par5el\n" + + "p4a4ri\n" + + "par4is\n" + + "pa2te\n" + + "pa5ter\n" + + "5pathic\n" + + "pa5thy\n" + + "pa4tric\n" + + "pav4\n" + + "3pay\n" + + "4p1b\n" + + "pd4\n" + + "4pe.\n" + + "3pe4a\n" + + "pear4l\n" + + "pe2c\n" + + "2p2ed\n" + + "3pede\n" + + "3pedi\n" + + "pedia4\n" + + "ped4ic\n" + + "p4ee\n" + + "pee4d\n" + + "pek4\n" + + "pe4la\n" + + "peli4e\n" + + "pe4nan\n" + + "p4enc\n" + + "pen4th\n" + + "pe5on\n" + + "p4era.\n" + + "pera5bl\n" + + "p4erag\n" + + "p4eri\n" + + "peri5st\n" + + "per4mal\n" + + "perme5\n" + + "p4ern\n" + + "per3o\n" + + 
"per3ti\n" + + "pe5ru\n" + + "per1v\n" + + "pe2t\n" + + "pe5ten\n" + + "pe5tiz\n" + + "4pf\n" + + "4pg\n" + + "4ph.\n" + + "phar5i\n" + + "phe3no\n" + + "ph4er\n" + + "ph4es.\n" + + "ph1ic\n" + + "5phie\n" + + "ph5ing\n" + + "5phisti\n" + + "3phiz\n" + + "ph2l\n" + + "3phob\n" + + "3phone\n" + + "5phoni\n" + + "pho4r\n" + + "4phs\n" + + "ph3t\n" + + "5phu\n" + + "1phy\n" + + "pi3a\n" + + "pian4\n" + + "pi4cie\n" + + "pi4cy\n" + + "p4id\n" + + "p5ida\n" + + "pi3de\n" + + "5pidi\n" + + "3piec\n" + + "pi3en\n" + + "pi4grap\n" + + "pi3lo\n" + + "pi2n\n" + + "p4in.\n" + + "pind4\n" + + "p4ino\n" + + "3pi1o\n" + + "pion4\n" + + "p3ith\n" + + "pi5tha\n" + + "pi2tu\n" + + "2p3k2\n" + + "1p2l2\n" + + "3plan\n" + + "plas5t\n" + + "pli3a\n" + + "pli5er\n" + + "4plig\n" + + "pli4n\n" + + "ploi4\n" + + "plu4m\n" + + "plum4b\n" + + "4p1m\n" + + "2p3n\n" + + "po4c\n" + + "5pod.\n" + + "po5em\n" + + "po3et5\n" + + "5po4g\n" + + "poin2\n" + + "5point\n" + + "poly5t\n" + + "po4ni\n" + + "po4p\n" + + "1p4or\n" + + "po4ry\n" + + "1pos\n" + + "pos1s\n" + + "p4ot\n" + + "po4ta\n" + + "5poun\n" + + "4p1p\n" + + "ppa5ra\n" + + "p2pe\n" + + "p4ped\n" + + "p5pel\n" + + "p3pen\n" + + "p3per\n" + + "p3pet\n" + + "ppo5site\n" + + "pr2\n" + + "pray4e\n" + + "5preci\n" + + "pre5co\n" + + "pre3em\n" + + "pref5ac\n" + + "pre4la\n" + + "pre3r\n" + + "p3rese\n" + + "3press\n" + + "pre5ten\n" + + "pre3v\n" + + "5pri4e\n" + + "prin4t3\n" + + "pri4s\n" + + "pris3o\n" + + "p3roca\n" + + "prof5it\n" + + "pro3l\n" + + "pros3e\n" + + "pro1t\n" + + "2p1s2\n" + + "p2se\n" + + "ps4h\n" + + "p4sib\n" + + "2p1t\n" + + "pt5a4b\n" + + "p2te\n" + + "p2th\n" + + "pti3m\n" + + "ptu4r\n" + + "p4tw\n" + + "pub3\n" + + "pue4\n" + + "puf4\n" + + "pul3c\n" + + "pu4m\n" + + "pu2n\n" + + "pur4r\n" + + "5pus\n" + + "pu2t\n" + + "5pute\n" + + "put3er\n" + + "pu3tr\n" + + "put4ted\n" + + "put4tin\n" + + "p3w\n" + + "qu2\n" + + "qua5v\n" + + "2que.\n" + + "3quer\n" + + "3quet\n" + + "2rab\n" + + "ra3bi\n" + + "rach4e\n" + + 
"r5acl\n" + + "raf5fi\n" + + "raf4t\n" + + "r2ai\n" + + "ra4lo\n" + + "ram3et\n" + + "r2ami\n" + + "rane5o\n" + + "ran4ge\n" + + "r4ani\n" + + "ra5no\n" + + "rap3er\n" + + "3raphy\n" + + "rar5c\n" + + "rare4\n" + + "rar5ef\n" + + "4raril\n" + + "r2as\n" + + "ration4\n" + + "rau4t\n" + + "ra5vai\n" + + "rav3el\n" + + "ra5zie\n" + + "r1b\n" + + "r4bab\n" + + "r4bag\n" + + "rbi2\n" + + "rbi4f\n" + + "r2bin\n" + + "r5bine\n" + + "rb5ing.\n" + + "rb4o\n" + + "r1c\n" + + "r2ce\n" + + "rcen4\n" + + "r3cha\n" + + "rch4er\n" + + "r4ci4b\n" + + "rc4it\n" + + "rcum3\n" + + "r4dal\n" + + "rd2i\n" + + "rdi4a\n" + + "rdi4er\n" + + "rdin4\n" + + "rd3ing\n" + + "2re.\n" + + "re1al\n" + + "re3an\n" + + "re5arr\n" + + "5reav\n" + + "re4aw\n" + + "r5ebrat\n" + + "rec5oll\n" + + "rec5ompe\n" + + "re4cre\n" + + "2r2ed\n" + + "re1de\n" + + "re3dis\n" + + "red5it\n" + + "re4fac\n" + + "re2fe\n" + + "re5fer.\n" + + "re3fi\n" + + "re4fy\n" + + "reg3is\n" + + "re5it\n" + + "re1li\n" + + "re5lu\n" + + "r4en4ta\n" + + "ren4te\n" + + "re1o\n" + + "re5pin\n" + + "re4posi\n" + + "re1pu\n" + + "r1er4\n" + + "r4eri\n" + + "rero4\n" + + "re5ru\n" + + "r4es.\n" + + "re4spi\n" + + "ress5ib\n" + + "res2t\n" + + "re5stal\n" + + "re3str\n" + + "re4ter\n" + + "re4ti4z\n" + + "re3tri\n" + + "reu2\n" + + "re5uti\n" + + "rev2\n" + + "re4val\n" + + "rev3el\n" + + "r5ev5er.\n" + + "re5vers\n" + + "re5vert\n" + + "re5vil\n" + + "rev5olu\n" + + "re4wh\n" + + "r1f\n" + + "rfu4\n" + + "r4fy\n" + + "rg2\n" + + "rg3er\n" + + "r3get\n" + + "r3gic\n" + + "rgi4n\n" + + "rg3ing\n" + + "r5gis\n" + + "r5git\n" + + "r1gl\n" + + "rgo4n\n" + + "r3gu\n" + + "rh4\n" + + "4rh.\n" + + "4rhal\n" + + "ri3a\n" + + "ria4b\n" + + "ri4ag\n" + + "r4ib\n" + + "rib3a\n" + + "ric5as\n" + + "r4ice\n" + + "4rici\n" + + "5ricid\n" + + "ri4cie\n" + + "r4ico\n" + + "rid5er\n" + + "ri3enc\n" + + "ri3ent\n" + + "ri1er\n" + + "ri5et\n" + + "rig5an\n" + + "5rigi\n" + + "ril3iz\n" + + "5riman\n" + + "rim5i\n" + + "3rimo\n" + + "rim4pe\n" + + 
"r2ina\n" + + "5rina.\n" + + "rin4d\n" + + "rin4e\n" + + "rin4g\n" + + "ri1o\n" + + "5riph\n" + + "riph5e\n" + + "ri2pl\n" + + "rip5lic\n" + + "r4iq\n" + + "r2is\n" + + "r4is.\n" + + "ris4c\n" + + "r3ish\n" + + "ris4p\n" + + "ri3ta3b\n" + + "r5ited.\n" + + "rit5er.\n" + + "rit5ers\n" + + "rit3ic\n" + + "ri2tu\n" + + "rit5ur\n" + + "riv5el\n" + + "riv3et\n" + + "riv3i\n" + + "r3j\n" + + "r3ket\n" + + "rk4le\n" + + "rk4lin\n" + + "r1l\n" + + "rle4\n" + + "r2led\n" + + "r4lig\n" + + "r4lis\n" + + "rl5ish\n" + + "r3lo4\n" + + "r1m\n" + + "rma5c\n" + + "r2me\n" + + "r3men\n" + + "rm5ers\n" + + "rm3ing\n" + + "r4ming.\n" + + "r4mio\n" + + "r3mit\n" + + "r4my\n" + + "r4nar\n" + + "r3nel\n" + + "r4ner\n" + + "r5net\n" + + "r3ney\n" + + "r5nic\n" + + "r1nis4\n" + + "r3nit\n" + + "r3niv\n" + + "rno4\n" + + "r4nou\n" + + "r3nu\n" + + "rob3l\n" + + "r2oc\n" + + "ro3cr\n" + + "ro4e\n" + + "ro1fe\n" + + "ro5fil\n" + + "rok2\n" + + "ro5ker\n" + + "5role.\n" + + "rom5ete\n" + + "rom4i\n" + + "rom4p\n" + + "ron4al\n" + + "ron4e\n" + + "ro5n4is\n" + + "ron4ta\n" + + "1room\n" + + "5root\n" + + "ro3pel\n" + + "rop3ic\n" + + "ror3i\n" + + "ro5ro\n" + + "ros5per\n" + + "ros4s\n" + + "ro4the\n" + + "ro4ty\n" + + "ro4va\n" + + "rov5el\n" + + "rox5\n" + + "r1p\n" + + "r4pea\n" + + "r5pent\n" + + "rp5er.\n" + + "r3pet\n" + + "rp4h4\n" + + "rp3ing\n" + + "r3po\n" + + "r1r4\n" + + "rre4c\n" + + "rre4f\n" + + "r4reo\n" + + "rre4st\n" + + "rri4o\n" + + "rri4v\n" + + "rron4\n" + + "rros4\n" + + "rrys4\n" + + "4rs2\n" + + "r1sa\n" + + "rsa5ti\n" + + "rs4c\n" + + "r2se\n" + + "r3sec\n" + + "rse4cr\n" + + "rs5er.\n" + + "rs3es\n" + + "rse5v2\n" + + "r1sh\n" + + "r5sha\n" + + "r1si\n" + + "r4si4b\n" + + "rson3\n" + + "r1sp\n" + + "r5sw\n" + + "rtach4\n" + + "r4tag\n" + + "r3teb\n" + + "rten4d\n" + + "rte5o\n" + + "r1ti\n" + + "rt5ib\n" + + "rti4d\n" + + "r4tier\n" + + "r3tig\n" + + "rtil3i\n" + + "rtil4l\n" + + "r4tily\n" + + "r4tist\n" + + "r4tiv\n" + + "r3tri\n" + + "rtroph4\n" + + "rt4sh\n" + + 
"ru3a\n" + + "ru3e4l\n" + + "ru3en\n" + + "ru4gl\n" + + "ru3in\n" + + "rum3pl\n" + + "ru2n\n" + + "runk5\n" + + "run4ty\n" + + "r5usc\n" + + "ruti5n\n" + + "rv4e\n" + + "rvel4i\n" + + "r3ven\n" + + "rv5er.\n" + + "r5vest\n" + + "r3vey\n" + + "r3vic\n" + + "rvi4v\n" + + "r3vo\n" + + "r1w\n" + + "ry4c\n" + + "5rynge\n" + + "ry3t\n" + + "sa2\n" + + "2s1ab\n" + + "5sack\n" + + "sac3ri\n" + + "s3act\n" + + "5sai\n" + + "salar4\n" + + "sal4m\n" + + "sa5lo\n" + + "sal4t\n" + + "3sanc\n" + + "san4de\n" + + "s1ap\n" + + "sa5ta\n" + + "5sa3tio\n" + + "sat3u\n" + + "sau4\n" + + "sa5vor\n" + + "5saw\n" + + "4s5b\n" + + "scan4t5\n" + + "sca4p\n" + + "scav5\n" + + "s4ced\n" + + "4scei\n" + + "s4ces\n" + + "sch2\n" + + "s4cho\n" + + "3s4cie\n" + + "5scin4d\n" + + "scle5\n" + + "s4cli\n" + + "scof4\n" + + "4scopy\n" + + "scour5a\n" + + "s1cu\n" + + "4s5d\n" + + "4se.\n" + + "se4a\n" + + "seas4\n" + + "sea5w\n" + + "se2c3o\n" + + "3sect\n" + + "4s4ed\n" + + "se4d4e\n" + + "s5edl\n" + + "se2g\n" + + "seg3r\n" + + "5sei\n" + + "se1le\n" + + "5self\n" + + "5selv\n" + + "4seme\n" + + "se4mol\n" + + "sen5at\n" + + "4senc\n" + + "sen4d\n" + + "s5ened\n" + + "sen5g\n" + + "s5enin\n" + + "4sentd\n" + + "4sentl\n" + + "sep3a3\n" + + "4s1er.\n" + + "s4erl\n" + + "ser4o\n" + + "4servo\n" + + "s1e4s\n" + + "se5sh\n" + + "ses5t\n" + + "5se5um\n" + + "5sev\n" + + "sev3en\n" + + "sew4i\n" + + "5sex\n" + + "4s3f\n" + + "2s3g\n" + + "s2h\n" + + "2sh.\n" + + "sh1er\n" + + "5shev\n" + + "sh1in\n" + + "sh3io\n" + + "3ship\n" + + "shiv5\n" + + "sho4\n" + + "sh5old\n" + + "shon3\n" + + "shor4\n" + + "short5\n" + + "4shw\n" + + "si1b\n" + + "s5icc\n" + + "3side.\n" + + "5sides\n" + + "5sidi\n" + + "si5diz\n" + + "4signa\n" + + "sil4e\n" + + "4sily\n" + + "2s1in\n" + + "s2ina\n" + + "5sine.\n" + + "s3ing\n" + + "1sio\n" + + "5sion\n" + + "sion5a\n" + + "si2r\n" + + "sir5a\n" + + "1sis\n" + + "3sitio\n" + + "5siu\n" + + "1siv\n" + + "5siz\n" + + "sk2\n" + + "4ske\n" + + "s3ket\n" + + "sk5ine\n" + + 
"sk5ing\n" + + "s1l2\n" + + "s3lat\n" + + "s2le\n" + + "slith5\n" + + "2s1m\n" + + "s3ma\n" + + "small3\n" + + "sman3\n" + + "smel4\n" + + "s5men\n" + + "5smith\n" + + "smol5d4\n" + + "s1n4\n" + + "1so\n" + + "so4ce\n" + + "soft3\n" + + "so4lab\n" + + "sol3d2\n" + + "so3lic\n" + + "5solv\n" + + "3som\n" + + "3s4on.\n" + + "sona4\n" + + "son4g\n" + + "s4op\n" + + "5sophic\n" + + "s5ophiz\n" + + "s5ophy\n" + + "sor5c\n" + + "sor5d\n" + + "4sov\n" + + "so5vi\n" + + "2spa\n" + + "5spai\n" + + "spa4n\n" + + "spen4d\n" + + "2s5peo\n" + + "2sper\n" + + "s2phe\n" + + "3spher\n" + + "spho5\n" + + "spil4\n" + + "sp5ing\n" + + "4spio\n" + + "s4ply\n" + + "s4pon\n" + + "spor4\n" + + "4spot\n" + + "squal4l\n" + + "s1r\n" + + "2ss\n" + + "s1sa\n" + + "ssas3\n" + + "s2s5c\n" + + "s3sel\n" + + "s5seng\n" + + "s4ses.\n" + + "s5set\n" + + "s1si\n" + + "s4sie\n" + + "ssi4er\n" + + "ss5ily\n" + + "s4sl\n" + + "ss4li\n" + + "s4sn\n" + + "sspend4\n" + + "ss2t\n" + + "ssur5a\n" + + "ss5w\n" + + "2st.\n" + + "s2tag\n" + + "s2tal\n" + + "stam4i\n" + + "5stand\n" + + "s4ta4p\n" + + "5stat.\n" + + "s4ted\n" + + "stern5i\n" + + "s5tero\n" + + "ste2w\n" + + "stew5a\n" + + "s3the\n" + + "st2i\n" + + "s4ti.\n" + + "s5tia\n" + + "s1tic\n" + + "5stick\n" + + "s4tie\n" + + "s3tif\n" + + "st3ing\n" + + "5stir\n" + + "s1tle\n" + + "5stock\n" + + "stom3a\n" + + "5stone\n" + + "s4top\n" + + "3store\n" + + "st4r\n" + + "s4trad\n" + + "5stratu\n" + + "s4tray\n" + + "s4trid\n" + + "4stry\n" + + "4st3w\n" + + "s2ty\n" + + "1su\n" + + "su1al\n" + + "su4b3\n" + + "su2g3\n" + + "su5is\n" + + "suit3\n" + + "s4ul\n" + + "su2m\n" + + "sum3i\n" + + "su2n\n" + + "su2r\n" + + "4sv\n" + + "sw2\n" + + "4swo\n" + + "s4y\n" + + "4syc\n" + + "3syl\n" + + "syn5o\n" + + "sy5rin\n" + + "1ta\n" + + "3ta.\n" + + "2tab\n" + + "ta5bles\n" + + "5taboliz\n" + + "4taci\n" + + "ta5do\n" + + "4taf4\n" + + "tai5lo\n" + + "ta2l\n" + + "ta5la\n" + + "tal5en\n" + + "tal3i\n" + + "4talk\n" + + "tal4lis\n" + + "ta5log\n" + + "ta5mo\n" + 
+ "tan4de\n" + + "tanta3\n" + + "ta5per\n" + + "ta5pl\n" + + "tar4a\n" + + "4tarc\n" + + "4tare\n" + + "ta3riz\n" + + "tas4e\n" + + "ta5sy\n" + + "4tatic\n" + + "ta4tur\n" + + "taun4\n" + + "tav4\n" + + "2taw\n" + + "tax4is\n" + + "2t1b\n" + + "4tc\n" + + "t4ch\n" + + "tch5et\n" + + "4t1d\n" + + "4te.\n" + + "tead4i\n" + + "4teat\n" + + "tece4\n" + + "5tect\n" + + "2t1ed\n" + + "te5di\n" + + "1tee\n" + + "teg4\n" + + "te5ger\n" + + "te5gi\n" + + "3tel.\n" + + "teli4\n" + + "5tels\n" + + "te2ma2\n" + + "tem3at\n" + + "3tenan\n" + + "3tenc\n" + + "3tend\n" + + "4tenes\n" + + "1tent\n" + + "ten4tag\n" + + "1teo\n" + + "te4p\n" + + "te5pe\n" + + "ter3c\n" + + "5ter3d\n" + + "1teri\n" + + "ter5ies\n" + + "ter3is\n" + + "teri5za\n" + + "5ternit\n" + + "ter5v\n" + + "4tes.\n" + + "4tess\n" + + "t3ess.\n" + + "teth5e\n" + + "3teu\n" + + "3tex\n" + + "4tey\n" + + "2t1f\n" + + "4t1g\n" + + "2th.\n" + + "than4\n" + + "th2e\n" + + "4thea\n" + + "th3eas\n" + + "the5at\n" + + "the3is\n" + + "3thet\n" + + "th5ic.\n" + + "th5ica\n" + + "4thil\n" + + "5think\n" + + "4thl\n" + + "th5ode\n" + + "5thodic\n" + + "4thoo\n" + + "thor5it\n" + + "tho5riz\n" + + "2ths\n" + + "1tia\n" + + "ti4ab\n" + + "ti4ato\n" + + "2ti2b\n" + + "4tick\n" + + "t4ico\n" + + "t4ic1u\n" + + "5tidi\n" + + "3tien\n" + + "tif2\n" + + "ti5fy\n" + + "2tig\n" + + "5tigu\n" + + "till5in\n" + + "1tim\n" + + "4timp\n" + + "tim5ul\n" + + "2t1in\n" + + "t2ina\n" + + "3tine.\n" + + "3tini\n" + + "1tio\n" + + "ti5oc\n" + + "tion5ee\n" + + "5tiq\n" + + "ti3sa\n" + + "3tise\n" + + "tis4m\n" + + "ti5so\n" + + "tis4p\n" + + "5tistica\n" + + "ti3tl\n" + + "ti4u\n" + + "1tiv\n" + + "tiv4a\n" + + "1tiz\n" + + "ti3za\n" + + "ti3zen\n" + + "2tl\n" + + "t5la\n" + + "tlan4\n" + + "3tle.\n" + + "3tled\n" + + "3tles.\n" + + "t5let.\n" + + "t5lo\n" + + "4t1m\n" + + "tme4\n" + + "2t1n2\n" + + "1to\n" + + "to3b\n" + + "to5crat\n" + + "4todo\n" + + "2tof\n" + + "to2gr\n" + + "to5ic\n" + + "to2ma\n" + + "tom4b\n" + + "to3my\n" + + 
"ton4ali\n" + + "to3nat\n" + + "4tono\n" + + "4tony\n" + + "to2ra\n" + + "to3rie\n" + + "tor5iz\n" + + "tos2\n" + + "5tour\n" + + "4tout\n" + + "to3war\n" + + "4t1p\n" + + "1tra\n" + + "tra3b\n" + + "tra5ch\n" + + "traci4\n" + + "trac4it\n" + + "trac4te\n" + + "tras4\n" + + "tra5ven\n" + + "trav5es5\n" + + "tre5f\n" + + "tre4m\n" + + "trem5i\n" + + "5tria\n" + + "tri5ces\n" + + "5tricia\n" + + "4trics\n" + + "2trim\n" + + "tri4v\n" + + "tro5mi\n" + + "tron5i\n" + + "4trony\n" + + "tro5phe\n" + + "tro3sp\n" + + "tro3v\n" + + "tru5i\n" + + "trus4\n" + + "4t1s2\n" + + "t4sc\n" + + "tsh4\n" + + "t4sw\n" + + "4t3t2\n" + + "t4tes\n" + + "t5to\n" + + "ttu4\n" + + "1tu\n" + + "tu1a\n" + + "tu3ar\n" + + "tu4bi\n" + + "tud2\n" + + "4tue\n" + + "4tuf4\n" + + "5tu3i\n" + + "3tum\n" + + "tu4nis\n" + + "2t3up.\n" + + "3ture\n" + + "5turi\n" + + "tur3is\n" + + "tur5o\n" + + "tu5ry\n" + + "3tus\n" + + "4tv\n" + + "tw4\n" + + "4t1wa\n" + + "twis4\n" + + "4two\n" + + "1ty\n" + + "4tya\n" + + "2tyl\n" + + "type3\n" + + "ty5ph\n" + + "4tz\n" + + "tz4e\n" + + "4uab\n" + + "uac4\n" + + "ua5na\n" + + "uan4i\n" + + "uar5ant\n" + + "uar2d\n" + + "uar3i\n" + + "uar3t\n" + + "u1at\n" + + "uav4\n" + + "ub4e\n" + + "u4bel\n" + + "u3ber\n" + + "u4bero\n" + + "u1b4i\n" + + "u4b5ing\n" + + "u3ble.\n" + + "u3ca\n" + + "uci4b\n" + + "uc4it\n" + + "ucle3\n" + + "u3cr\n" + + "u3cu\n" + + "u4cy\n" + + "ud5d\n" + + "ud3er\n" + + "ud5est\n" + + "udev4\n" + + "u1dic\n" + + "ud3ied\n" + + "ud3ies\n" + + "ud5is\n" + + "u5dit\n" + + "u4don\n" + + "ud4si\n" + + "u4du\n" + + "u4ene\n" + + "uens4\n" + + "uen4te\n" + + "uer4il\n" + + "3ufa\n" + + "u3fl\n" + + "ugh3en\n" + + "ug5in\n" + + "2ui2\n" + + "uil5iz\n" + + "ui4n\n" + + "u1ing\n" + + "uir4m\n" + + "uita4\n" + + "uiv3\n" + + "uiv4er.\n" + + "u5j\n" + + "4uk\n" + + "u1la\n" + + "ula5b\n" + + "u5lati\n" + + "ulch4\n" + + "5ulche\n" + + "ul3der\n" + + "ul4e\n" + + "u1len\n" + + "ul4gi\n" + + "ul2i\n" + + "u5lia\n" + + "ul3ing\n" + + "ul5ish\n" + + 
"ul4lar\n" + + "ul4li4b\n" + + "ul4lis\n" + + "4ul3m\n" + + "u1l4o\n" + + "4uls\n" + + "uls5es\n" + + "ul1ti\n" + + "ultra3\n" + + "4ultu\n" + + "u3lu\n" + + "ul5ul\n" + + "ul5v\n" + + "um5ab\n" + + "um4bi\n" + + "um4bly\n" + + "u1mi\n" + + "u4m3ing\n" + + "umor5o\n" + + "um2p\n" + + "unat4\n" + + "u2ne\n" + + "un4er\n" + + "u1ni\n" + + "un4im\n" + + "u2nin\n" + + "un5ish\n" + + "uni3v\n" + + "un3s4\n" + + "un4sw\n" + + "unt3ab\n" + + "un4ter.\n" + + "un4tes\n" + + "unu4\n" + + "un5y\n" + + "un5z\n" + + "u4ors\n" + + "u5os\n" + + "u1ou\n" + + "u1pe\n" + + "uper5s\n" + + "u5pia\n" + + "up3ing\n" + + "u3pl\n" + + "up3p\n" + + "upport5\n" + + "upt5ib\n" + + "uptu4\n" + + "u1ra\n" + + "4ura.\n" + + "u4rag\n" + + "u4ras\n" + + "ur4be\n" + + "urc4\n" + + "ur1d\n" + + "ure5at\n" + + "ur4fer\n" + + "ur4fr\n" + + "u3rif\n" + + "uri4fic\n" + + "ur1in\n" + + "u3rio\n" + + "u1rit\n" + + "ur3iz\n" + + "ur2l\n" + + "url5ing.\n" + + "ur4no\n" + + "uros4\n" + + "ur4pe\n" + + "ur4pi\n" + + "urs5er\n" + + "ur5tes\n" + + "ur3the\n" + + "urti4\n" + + "ur4tie\n" + + "u3ru\n" + + "2us\n" + + "u5sad\n" + + "u5san\n" + + "us4ap\n" + + "usc2\n" + + "us3ci\n" + + "use5a\n" + + "u5sia\n" + + "u3sic\n" + + "us4lin\n" + + "us1p\n" + + "us5sl\n" + + "us5tere\n" + + "us1tr\n" + + "u2su\n" + + "usur4\n" + + "uta4b\n" + + "u3tat\n" + + "4ute.\n" + + "4utel\n" + + "4uten\n" + + "uten4i\n" + + "4u1t2i\n" + + "uti5liz\n" + + "u3tine\n" + + "ut3ing\n" + + "ution5a\n" + + "u4tis\n" + + "5u5tiz\n" + + "u4t1l\n" + + "ut5of\n" + + "uto5g\n" + + "uto5matic\n" + + "u5ton\n" + + "u4tou\n" + + "uts4\n" + + "u3u\n" + + "uu4m\n" + + "u1v2\n" + + "uxu3\n" + + "uz4e\n" + + "1va\n" + + "5va.\n" + + "2v1a4b\n" + + "vac5il\n" + + "vac3u\n" + + "vag4\n" + + "va4ge\n" + + "va5lie\n" + + "val5o\n" + + "val1u\n" + + "va5mo\n" + + "va5niz\n" + + "va5pi\n" + + "var5ied\n" + + "3vat\n" + + "4ve.\n" + + "4ved\n" + + "veg3\n" + + "v3el.\n" + + "vel3li\n" + + "ve4lo\n" + + "v4ely\n" + + "ven3om\n" + + "v5enue\n" + + "v4erd\n" 
+ + "5vere.\n" + + "v4erel\n" + + "v3eren\n" + + "ver5enc\n" + + "v4eres\n" + + "ver3ie\n" + + "vermi4n\n" + + "3verse\n" + + "ver3th\n" + + "v4e2s\n" + + "4ves.\n" + + "ves4te\n" + + "ve4te\n" + + "vet3er\n" + + "ve4ty\n" + + "vi5ali\n" + + "5vian\n" + + "5vide.\n" + + "5vided\n" + + "4v3iden\n" + + "5vides\n" + + "5vidi\n" + + "v3if\n" + + "vi5gn\n" + + "vik4\n" + + "2vil\n" + + "5vilit\n" + + "v3i3liz\n" + + "v1in\n" + + "4vi4na\n" + + "v2inc\n" + + "vin5d\n" + + "4ving\n" + + "vio3l\n" + + "v3io4r\n" + + "vi1ou\n" + + "vi4p\n" + + "vi5ro\n" + + "vis3it\n" + + "vi3so\n" + + "vi3su\n" + + "4viti\n" + + "vit3r\n" + + "4vity\n" + + "3viv\n" + + "5vo.\n" + + "voi4\n" + + "3vok\n" + + "vo4la\n" + + "v5ole\n" + + "5volt\n" + + "3volv\n" + + "vom5i\n" + + "vor5ab\n" + + "vori4\n" + + "vo4ry\n" + + "vo4ta\n" + + "4votee\n" + + "4vv4\n" + + "v4y\n" + + "w5abl\n" + + "2wac\n" + + "wa5ger\n" + + "wag5o\n" + + "wait5\n" + + "w5al.\n" + + "wam4\n" + + "war4t\n" + + "was4t\n" + + "wa1te\n" + + "wa5ver\n" + + "w1b\n" + + "wea5rie\n" + + "weath3\n" + + "wed4n\n" + + "weet3\n" + + "wee5v\n" + + "wel4l\n" + + "w1er\n" + + "west3\n" + + "w3ev\n" + + "whi4\n" + + "wi2\n" + + "wil2\n" + + "will5in\n" + + "win4de\n" + + "win4g\n" + + "wir4\n" + + "3wise\n" + + "with3\n" + + "wiz5\n" + + "w4k\n" + + "wl4es\n" + + "wl3in\n" + + "w4no\n" + + "1wo2\n" + + "wom1\n" + + "wo5ven\n" + + "w5p\n" + + "wra4\n" + + "wri4\n" + + "writa4\n" + + "w3sh\n" + + "ws4l\n" + + "ws4pe\n" + + "w5s4t\n" + + "4wt\n" + + "wy4\n" + + "x1a\n" + + "xac5e\n" + + "x4ago\n" + + "xam3\n" + + "x4ap\n" + + "xas5\n" + + "x3c2\n" + + "x1e\n" + + "xe4cuto\n" + + "x2ed\n" + + "xer4i\n" + + "xe5ro\n" + + "x1h\n" + + "xhi2\n" + + "xhil5\n" + + "xhu4\n" + + "x3i\n" + + "xi5a\n" + + "xi5c\n" + + "xi5di\n" + + "x4ime\n" + + "xi5miz\n" + + "x3o\n" + + "x4ob\n" + + "x3p\n" + + "xpan4d\n" + + "xpecto5\n" + + "xpe3d\n" + + "x1t2\n" + + "x3ti\n" + + "x1u\n" + + "xu3a\n" + + "xx4\n" + + "y5ac\n" + + "3yar4\n" + + "y5at\n" + + 
"y1b\n" + + "y1c\n" + + "y2ce\n" + + "yc5er\n" + + "y3ch\n" + + "ych4e\n" + + "ycom4\n" + + "ycot4\n" + + "y1d\n" + + "y5ee\n" + + "y1er\n" + + "y4erf\n" + + "yes4\n" + + "ye4t\n" + + "y5gi\n" + + "4y3h\n" + + "y1i\n" + + "y3la\n" + + "ylla5bl\n" + + "y3lo\n" + + "y5lu\n" + + "ymbol5\n" + + "yme4\n" + + "ympa3\n" + + "yn3chr\n" + + "yn5d\n" + + "yn5g\n" + + "yn5ic\n" + + "5ynx\n" + + "y1o4\n" + + "yo5d\n" + + "y4o5g\n" + + "yom4\n" + + "yo5net\n" + + "y4ons\n" + + "y4os\n" + + "y4ped\n" + + "yper5\n" + + "yp3i\n" + + "y3po\n" + + "y4poc\n" + + "yp2ta\n" + + "y5pu\n" + + "yra5m\n" + + "yr5ia\n" + + "y3ro\n" + + "yr4r\n" + + "ys4c\n" + + "y3s2e\n" + + "ys3ica\n" + + "ys3io\n" + + "3ysis\n" + + "y4so\n" + + "yss4\n" + + "ys1t\n" + + "ys3ta\n" + + "ysur4\n" + + "y3thin\n" + + "yt3ic\n" + + "y1w\n" + + "za1\n" + + "z5a2b\n" + + "zar2\n" + + "4zb\n" + + "2ze\n" + + "ze4n\n" + + "ze4p\n" + + "z1er\n" + + "ze3ro\n" + + "zet4\n" + + "2z1i\n" + + "z4il\n" + + "z4is\n" + + "5zl\n" + + "4zm\n" + + "1zo\n" + + "zo4m\n" + + "zo5ol\n" + + "zte4\n" + + "4z1z2\n" + + "z4zy\n" + ; +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/ISODateInstance.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/ISODateInstance.java new file mode 100644 index 0000000..65e6479 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/ISODateInstance.java @@ -0,0 +1,1164 @@ +package edu.stanford.nlp.ie.pascal; + +import edu.stanford.nlp.ie.QuantifiableEntityNormalizer; +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; + +import java.io.BufferedReader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Map; +import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + 
+/** + * Represents dates and times according to ISO8601 standard while also allowing for + * wild cards - e.g., can represent "21 June" without a year + * (Standard ISO8601 only allows removing less precise annotations (e.g., + * 200706 rather than 20070621 but not a way to represent 0621 without a year) + *

    + * Format stores date and time separately since the majority of current use + * cases involve only one of these items. Standard ISO 8601 instead + * requires <date>T<time>. + *

    + * Ranges are specified within the strings via forward slash. For example + * 6 June - 8 June is represented ****0606/****0608. 6 June onward is + * ****0606/ and until 8 June is /****0608. + * + * @author Anna Rafferty + * TODO: add time support - currently just dates are supported + */ +public class ISODateInstance { + + private static final boolean DEBUG = false; + private ArrayList tokens = new ArrayList();//each token contains some piece of the date, from our input. + + public static final String OPEN_RANGE_AFTER = "A"; + public static final String OPEN_RANGE_BEFORE = "B"; + public static final String BOUNDED_RANGE = "C"; + public static final String NO_RANGE = ""; + public static final int DAY_OF_HALF_MONTH = 15; + public static final int LAST_DAY_OF_MONTH = 31;//close enough for our purposes + public static final String MONTH_OF_HALF_YEAR = "07"; + public static final String LAST_MONTH_OF_YEAR = "12"; + /** + * String of the format <year><month><day>. Representations + * by week are also allowed. If a more general field (such as year) + * is not specified when a less general one (such as month) is, the characters + * normally filled by the more general field are replaced by asterisks. For example, + * 21 June would be \"****0621\". Less general fields are simply truncated; + * for example, June 2007 would be \"200706\". + */ + private String isoDate = ""; + + //Variable for marking if we were unable to parse the string associated with this isoDate + private boolean unparseable = false; + + //private String isoTime = ""; + + + /** + * Creates an empty date instance; you probably + * don't want this in most cases. + */ + public ISODateInstance() { + + } + + /** + * Takes a string that represents a date, and attempts to + * normalize it into ISO 8601-compatible format. 
+ * + */ + public ISODateInstance(String date) { + extractFields(date); + } + + public ISODateInstance(String date, String openRangeMarker) { + extractFields(date); + //now process the range marker; if a range was found independently, we ignore the marker + if ( ! ISODateInstance.NO_RANGE.equals(openRangeMarker) && ! isoDate.contains("/")) { + if (ISODateInstance.OPEN_RANGE_AFTER.equals(openRangeMarker)) { + isoDate = isoDate + '/'; + } else if (ISODateInstance.OPEN_RANGE_BEFORE.equals(openRangeMarker)) { + isoDate = '/' + isoDate; + } + } + } + + /** + * Constructor for a range of dates, beginning at date start and finishing at date end + * + */ + public ISODateInstance(ISODateInstance start, ISODateInstance end) { + String startString = start.getDateString(); + if (start.isRange()) { + startString = start.getStartDate(); + } + String endString = end.getDateString(); + if (end.isRange()) { + endString = end.getEndDate(); + } + + isoDate = startString + '/' + endString; + unparseable = (start.isUnparseable() || end.isUnparseable()); + } + + /** + * Construct a new ISODate based on its relation to a referenceDate. + * relativeDate should be something like "today" or "tomorrow" or "last year" + * and the resulting ISODate will be the same as the referenceDate, a day later, + * or a year earlier, respectively. 
+ * + */ + public ISODateInstance(ISODateInstance referenceDate, String relativeDate) { + Pair relation = relativeDateMap.get(relativeDate.toLowerCase()); + if (relation != null) { + switch (relation.first()) { + case DAY: + incrementDay(referenceDate, relation); + break; + case MONTH: + incrementMonth(referenceDate, relation); + break; + case YEAR: + incrementYear(referenceDate, relation); + break; + } + } + } + + + private void incrementYear(ISODateInstance referenceDate, Pair relation) { + String origDateString = referenceDate.getStartDate(); + String yearString = origDateString.substring(0, 4); + if (yearString.contains("*")) { + isoDate = origDateString; + return; + } + isoDate = makeStringYearChange(origDateString, Integer.parseInt(yearString) + relation.second()); + } + + private void incrementMonth(ISODateInstance referenceDate, Pair relation) { + String origDateString = referenceDate.getStartDate(); + String monthString = origDateString.substring(4, 6); + if (monthString.contains("*")) { + isoDate = origDateString; + return; + } + //Month is not a variable + Integer monthNum = Integer.parseInt(monthString); + //Check if we're an edge case + if (((monthNum + relation.second()) > 12) || ((monthNum + relation.second) < 1)) { + boolean decreasing = ((monthNum + relation.second) < 1); + int newMonthNum = (monthNum + relation.second()) % 12; + if (newMonthNum < 0) { + newMonthNum *= -1; + } + //Set the month appropriately + isoDate = makeStringMonthChange(origDateString, newMonthNum); + //Increment the year if possible + String yearString = origDateString.substring(0, 4); + if (!yearString.contains("*")) { + //How much we increment depends on above mod + int numYearsToIncrement = (int) Math.ceil(relation.second() / 12.0); + if (decreasing) { + isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - numYearsToIncrement); + } else { + isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + numYearsToIncrement); + } + } + } else { + 
isoDate = makeStringMonthChange(origDateString, (monthNum + relation.second())); + } + } + + + private void incrementDay(ISODateInstance referenceDate, Pair relation) { + String origDateString = referenceDate.getStartDate(); + String dayString = origDateString.substring(origDateString.length() - 2, origDateString.length()); + if (dayString.contains("*")) { + isoDate = origDateString; + return; + } + //Date is not a variable + Integer dayNum = Integer.parseInt(dayString); + String monthString = origDateString.substring(origDateString.length() - 4, origDateString.length() - 2); + int numDaysInMonth = 30;//default - assume this if month is a variable + int monthNum = -1;//ie, we don't know the month yet - this remains -1 if the month is a variable + if (!monthString.contains("*")) { + //Set appropriate numDaysInMonth and monthNum + monthNum = Integer.parseInt(monthString); + numDaysInMonth = daysPerMonth.get(monthNum); + } + + //Now, find out if we're an edge case (potential to increment month) + if (dayNum + relation.second() <= numDaysInMonth && dayNum + relation.second() >= 1) { + //Not an edge case - just increment the day, create a new string, and return + dayNum += relation.second(); + isoDate = makeStringDayChange(origDateString, dayNum); + return; + } + + //Since we're an edge case, the month can't be a variable - if it is a variable, just set this to the reference string + if (monthNum == -1) { + isoDate = origDateString; + return; + } + //At this point, neither our day nor our month is a variable + isoDate = origDateString; + boolean decreasing = (dayNum + relation.second() < 1); + //Need to increment the month, set the date appropriately - we need the new month num to set the day appropriately, so do month first + int newMonthNum; + //Now, check if we're an edge case for month + if ((monthNum + 1 > 12 && !decreasing) || (monthNum - 1 < 1 && decreasing)) { + //First, change the month + if (decreasing) { + newMonthNum = 12; + } else { + newMonthNum = 1; + } + 
//If we can, increment the year + //TODO: fix this to work more nicely with variables and thus handle more cases + String yearString = origDateString.substring(0, 4); + if (!yearString.contains("*")) { + if (decreasing) { + isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - 1); + } else { + isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + 1); + } + } + } else { + //We're not an edge case for month - just increment + if (decreasing) { + newMonthNum = monthNum - 1; + } else { + newMonthNum = monthNum + 1; + } + } + //do the increment + isoDate = makeStringMonthChange(isoDate, newMonthNum); + int newDateNum; + if (decreasing) { + newDateNum = -relation.second() + daysPerMonth.get(newMonthNum) - dayNum; + } else { + newDateNum = relation.second() - dayNum + daysPerMonth.get(monthNum); + } + //Now, change the day in our original string to be appropriate + isoDate = makeStringDayChange(isoDate, newDateNum); + + + } + + /** + * Changes the day portion of the origDate String to be the String + * value of newDay in two character format. (e.g., 9 -> "09") + * + */ + private static String makeStringDayChange(String origDate, int newDay) { + String newDayString = (newDay < 10 ? ("0" + newDay) : String.valueOf(newDay)); + return origDate.substring(0, origDate.length() - 2) + newDayString; + } + + /** + * Changes the month portion of the origDate String to be the String + * value of newDay in two character format. (e.g., 9 -> "09") + * + */ + private static String makeStringMonthChange(String origDate, int newMonth) { + String newMonthString = (newMonth < 10 ? ("0" + newMonth) : String.valueOf(newMonth)); + return origDate.substring(0, 4) + newMonthString + origDate.substring(6, 8); + } + + /** + * Changes the year portion of the origDate String to be the String + * value of newDay in two character format. 
(e.g., 9 -> "09") + * + */ + private static String makeStringYearChange(String origDate, int newYear) { + String newYearString = String.valueOf(newYear); + while (newYearString.length() < 4) { + newYearString = '0' + newYearString;//we're compatible with year 1! + } + return newYearString + origDate.substring(4, origDate.length()); + } + + + /** + * Enum for the fields * + */ + public static enum DateField { + DAY, MONTH, YEAR + } + + + /** + * Map for mapping a relativeDate String to a pair with the field that should be modified and the amount to modify it * + */ + public static final Map> relativeDateMap = Generics.newHashMap(); + + static { + //Add entries to the relative datemap + relativeDateMap.put("today", new Pair(DateField.DAY, 0)); + relativeDateMap.put("tomorrow", new Pair(DateField.DAY, 1)); + relativeDateMap.put("yesterday", new Pair(DateField.DAY, -1)); + + + } + + public static final Map daysPerMonth = Generics.newHashMap(); + + static { + //Add month entries + daysPerMonth.put(1, 31); + daysPerMonth.put(2, 28); + daysPerMonth.put(3, 31); + daysPerMonth.put(4, 30); + daysPerMonth.put(5, 31); + daysPerMonth.put(6, 30); + daysPerMonth.put(7, 31); + daysPerMonth.put(8, 31); + daysPerMonth.put(9, 30); + daysPerMonth.put(10, 31); + daysPerMonth.put(11, 30); + daysPerMonth.put(12, 31); + } + + /** + * Takes a string already formatted in ISODateInstance format + * (such as one previously written out using toString) and creates + * a new date instance from it + * + */ + public static ISODateInstance fromDateString(String date) { + ISODateInstance d = new ISODateInstance(); + d.isoDate = date; + return d; + } + + public String toString() { + return isoDate; + } + + /** + * Provided for backwards compatibility with DateInstance; + * returns the same thing as toString() + * + */ + public String getDateString() { + return this.toString(); + } + + /** + * Uses regexp matching to match month, day, and year fields + * TODO: Find a way to mark what;s already been 
handled in the string + */ + public boolean extractFields(String inputDate) { + + if (tokens.size() < 2) { + tokenizeDate(inputDate); + } + if (DEBUG) { + System.err.println("Extracting date: " + inputDate); + } + //first we see if it's a hyphen and two parseable dates - if not, we treat it as one date + Pair dateEndpoints = getRangeDates(inputDate); + if (dateEndpoints != null) { + ISODateInstance date1 = new ISODateInstance(dateEndpoints.first()); + if (dateEndpoints.first().contains(" ") && !dateEndpoints.second().contains(" ")) { + //consider whether it's a leading modifier; e.g., "June 8-10" will be split into June 8, and 10 when really we'd like June 8 and June 10 + String date = dateEndpoints.first().substring(0, dateEndpoints.first().indexOf(' ')) + ' ' + dateEndpoints.second(); + ISODateInstance date2 = new ISODateInstance(date); + if (!date1.isUnparseable() && !date2.isUnparseable()) { + isoDate = (new ISODateInstance(date1, date2)).getDateString(); + return true; + } + } + + ISODateInstance date2 = new ISODateInstance(dateEndpoints.second()); + if (!date1.isUnparseable() && !date2.isUnparseable()) { + isoDate = (new ISODateInstance(date1, date2)).getDateString(); + return true; + } + } + + if (extractYYYYMMDD(inputDate)) { + return true; + } + if (extractMMDDYY(inputDate)) { + return true; + } + boolean passed = false; + passed = extractYear(inputDate) || passed; + passed = extractMonth(inputDate) || passed; + passed = extractDay(inputDate) || passed; + + //slightly hacky, but check for some common modifiers that get grouped into the date + passed = addExtraRanges(inputDate) || passed; + + if (!passed) {//couldn't parse + //try one more trick + unparseable = true; + boolean weekday = extractWeekday(inputDate); + if (!weekday) { + isoDate = inputDate; + } + } + return passed; + } + + private static String[] rangeIndicators = {"--", "-"}; + + /** + * Attempts to find the two sides of a range in the given string. 
+ * Uses rangeIndicators to find possible matches. + * + */ + private static Pair getRangeDates(String inputDate) { + for (String curIndicator : rangeIndicators) { + String[] dates = inputDate.split(curIndicator); + if (dates.length == 2) { + return new Pair(dates[0], dates[1]); + } + } + return null; + } + + private boolean addExtraRanges(String inputDate) { + if (isRange()) { + return false; + } + inputDate = inputDate.toLowerCase(); + if (inputDate.contains("half")) { + if (inputDate.contains("first") && isoDate.length() <= 6) { + String firstDate = isoDate + "01"; + String secondDate; + if (isoDate.length() == 4) {//year + secondDate = isoDate + MONTH_OF_HALF_YEAR; + } else {//month + secondDate = isoDate + DAY_OF_HALF_MONTH; + } + isoDate = firstDate + '/' + secondDate; + return true; + } else if (inputDate.contains("second") && isoDate.length() <= 6) { + String firstDate; + String secondDate; + if (isoDate.length() == 4) {//year + firstDate = isoDate + MONTH_OF_HALF_YEAR; + secondDate = isoDate + LAST_MONTH_OF_YEAR; + isoDate = firstDate + '/' + secondDate; + } else {//month + firstDate = isoDate + DAY_OF_HALF_MONTH; + secondDate = isoDate + LAST_DAY_OF_MONTH; + } + isoDate = firstDate + '/' + secondDate; + return true; + } + } + + return false; + } + + /** + * Returns true iff this date represents a range + * The range must have at least a start or end + * date, but is not guaranteed to have both + * + * @return Whether this date represents a range + */ + public boolean isRange() { + if (unparseable) { + return false; + } + return isoDate.matches("/"); + } + + /** + * Returns true iff we were unable to parse the input + * String associated with this date; in that case, + * we just store the input string and shortcircuit + * all of the comparison methods + * + */ + public boolean isUnparseable() { + return unparseable; + } + + + /** + * Returns this date or if it is a range, + * the date the range starts. 
If the date + * is of the form /<date>, "" is returned + * + * @return Start date of range + */ + public String getStartDate() { + if (!isRange()) { + return isoDate; + } + if (isoDate.startsWith("/")) { + return ""; + } + return isoDate.split("/")[0]; + } + + /** + * Returns this date or if it is a range, + * the date the range ends. If the date + * is of the form <date>/, "" is returned + * + * @return End date of range + */ + public String getEndDate() { + if (!isRange()) { + return isoDate; + } + if (isoDate.endsWith("/")) { + return ""; + } + String[] split = isoDate.split("/"); + return split[split.length - 1]; + } + + /* -------------------------- Static Comparison Methods -------------------------- */ + /** + * Returns true if date1 is after date2 + *

    + * Several tricky cases exist, and implementation tries to + * go with the common sense interpretation: + * When a year and a month are given for one, but only a month + * for the other, it is assumed that both have the same year + * e.g: + * ****12 is after 200211 + *

    + * When a year and a month are given for one but only a year + * for the other, it is assumed that one of these is after the + * other only if the years differ, e.g.: + * 2003 is after 200211 + * 2002 is not after 200211 + * 200211 is not after 2002 + * + * @return Whether date2 is after date1 + */ + static boolean isAfter(String date1, String date2) { + if (!isDateFormat(date1) || !isDateFormat(date2)) { + return false; + } + boolean after = true; + //first check years + String year = date1.substring(0, 4); + String yearOther = date2.substring(0, 4); + if (year.contains("*") || yearOther.contains("*")) { + after = after && checkWildcardCompatibility(year, yearOther); + } else if (Integer.valueOf(year) > Integer.valueOf(yearOther)) { + return true; + } else if (Integer.valueOf(year) < Integer.valueOf(yearOther)) { + return false; + } + + if (date1.length() < 6 || date2.length() < 6) { + if (year.contains("*") || yearOther.contains("*")) { + return after; + } else { + return after && (!Integer.valueOf(year).equals(Integer.valueOf(yearOther))); + } + } + //then check months + String month = date1.substring(4, 6); + String monthOther = date2.substring(4, 6); + if (month.contains("*") || monthOther.contains("*")) { + after = after && checkWildcardCompatibility(month, monthOther); + } else if (Integer.valueOf(month) > Integer.valueOf(monthOther)) { + return true; + } else if (Integer.valueOf(month) < Integer.valueOf(monthOther)) { + return false; + } + + if (date1.length() < 8 || date2.length() < 8) { + if (month.contains("*") || monthOther.contains("*")) { + return after; + } else { + return after && (!Integer.valueOf(month).equals(Integer.valueOf(monthOther))); + } + } + + //then check days + String day = date1.substring(6, 8); + String dayOther = date2.substring(6, 8); + if (day.contains("*") || dayOther.contains("*")) { + after = after && checkWildcardCompatibility(day, dayOther); + } else if (Integer.valueOf(day) > Integer.valueOf(dayOther)) { + return true; + 
} else if (Integer.valueOf(day) <= Integer.valueOf(dayOther)) { + return false; + } + + return after; + } + + /** + * Right now, we say they're compatible iff one of them is all + * wildcards or they are equivalent + * + */ + @SuppressWarnings("unused") + private static boolean checkWildcardAfterCompatibility(String txt1, String txt2) { + if (txt1.length() != txt2.length()) { + return false; + } + + for (int i = 0; i < txt1.length(); i++) { + Character t1 = txt1.charAt(i); + Character t2 = txt2.charAt(i); + if (!(t1.equals('*') || t2.equals('*') || t1.equals(t2))) { + return false; + } + } + return true; + } + + /** + * Returns true if the given txt contains only digits and "*" characters; + * false otherwise + * + */ + private static boolean isDateFormat(String txt) { + String numberValue = txt.replace("*", "");//remove wildcards + try { + Integer.parseInt(numberValue); + return true; + } catch (Exception e) { + return false; + } + } + + /** + * Returns true iff date1 could represent the same value as date2 + * e.g. + * ****07 is compatible with 200207 (and 200207 is compatible with ****07) + * 200207 is compatible with 20020714 (?maybe need a better idea of use case here...) + * + */ + public static boolean isCompatible(String date1, String date2) { + boolean compatible = true; + //first check years + compatible = compatible && isYearCompatible(date1, date2); + + //then check months + compatible = compatible && isMonthCompatible(date1, date2); + + //then check days + compatible = compatible && isDayCompatible(date1, date2); + + return compatible; + + } + + /** + * Checks if the years represented by the two dates are compatible + * If either lacks a year, we return true. 
+ * + */ + private static boolean isYearCompatible(String date1, String date2) { + boolean compatible = true; + if (date1.length() < 4 || date2.length() < 4) { + return compatible; + } + //first check years + String year = date1.substring(0, 4); + String yearOther = date2.substring(0, 4); + if (year.contains("*") || yearOther.contains("*")) { + compatible = compatible && checkWildcardCompatibility(year, yearOther); + } else if (!year.equals(yearOther)) { + return false; + } + return compatible; + } + + /** + * Checks if the months represented by the two dates are compatible + * If either lacks a month, we return true. + * + */ + private static boolean isMonthCompatible(String date1, String date2) { + boolean compatible = true; + if (date1.length() < 6 || date2.length() < 6) { + return compatible; + } + //then check months + String month = date1.substring(4, 6); + String monthOther = date2.substring(4, 6); + if (month.contains("*") || monthOther.contains("*")) { + compatible = (compatible && checkWildcardCompatibility(month, monthOther)); + } else if (!month.equals(monthOther)) { + return false; + } + return compatible; + } + + /** + * Checks if the days represented by the two dates are compatible + * If either lacks a day, we return true. 
+ * + */ + private static boolean isDayCompatible(String date1, String date2) { + boolean compatible = true; + if (date1.length() < 8 || date2.length() < 8) { + return compatible; + } + //then check days + String day = date1.substring(6, 8); + String dayOther = date2.substring(6, 8); + if (day.contains("*") || dayOther.contains("*")) { + compatible = compatible && checkWildcardCompatibility(day, dayOther); + } else if (!day.equals(dayOther)) { + return false; + } + return compatible; + } + + + /** + */ + private static boolean checkWildcardCompatibility(String txt1, String txt2) { + if (txt1.length() != txt2.length()) { + return false; + } + for (int i = 0; i < txt1.length(); i++) { + Character t1 = txt1.charAt(i); + Character t2 = txt2.charAt(i); + if (!(t1.equals('*') || t2.equals('*') || t1.equals(t2))) { + return false; + } + } + return true; + } + + + /* -------------------------- Instance Comparison Methods -------------------------- */ + /** + * Returns true iff this date + * contains the date represented by other. + * A range contains a date if it + * is equal to or after the start date and equal to or + * before the end date. For open ranges, contains + * is also inclusive of the one end point. 
+ * + */ + public boolean contains(ISODateInstance other) { + if (this.isUnparseable() || other.isUnparseable()) { + return this.isoDate.equals(other.isoDate); + } + String start = this.getStartDate(); + if (!start.equals("")) {//we have a start date, need to make sure other is after it + String startOther = other.getStartDate(); + if (startOther.equals("")) { + return false;//incompatible + } else { + if (!isAfter(startOther, start)) { + return false; + } + } + } + //now we've found out that the start date is appropriate, check the end date + String end = this.getEndDate(); + if (!end.equals("")) { + String endOther = other.getEndDate(); + if (endOther.equals("")) { + return false; + } else { + if (!isAfter(end, endOther)) { + return false; + } + } + } + return true;//passes both start and end + } + + + /** + * Returns true if this date instance is after + * the given dateString. If this date instance + * is a range, then returns true only if both + * start and end dates are after dateString. + *

    + * Several tricky cases exist, and implementation tries to + * go with the commonsense interpretation: + * When a year and a month are given for one, but only a month + * for the other, it is assumed that both have the same year + * e.g: + * ****12 is after 200211 + *

    + * When a year and a month are given for one but only a year + * for the other, it is assumed that one of these is after the + * other only if the years differ, e.g.: + * 2003 is after 200211 + * 2002 is not after 200211 + * 200211 is not after 2002 + * + */ + public boolean isAfter(String dateString) { + if (this.isUnparseable()) { + return false; + } + if (!isDateFormat(dateString)) { + return false; + } + return isAfter(this.getEndDate(), dateString); + } + + public boolean isCompatibleDate(ISODateInstance other) { + if (this.isUnparseable() || other.isUnparseable()) { + return this.isoDate.equals(other.isoDate); + } + + //first see if either is a range + if (this.isRange()) { + return this.contains(other); + } else if (other.isRange()) { + return false;//not compatible if other is range and this isn't + } else { + return isCompatible(isoDate, other.getDateString()); + } + } + + /** + * Looks if the years for the two dates are compatible. + * This method does not consider ranges and uses only the + * start date. + * + */ + public boolean isYearCompatible(ISODateInstance other) { + if (this.isUnparseable() || other.isUnparseable()) { + return this.isoDate.equals(other.isoDate); + } + + return isYearCompatible(isoDate, other.getDateString()); + } + + /** + * Looks if the months for the two dates are compatible. + * This method does not consider ranges and uses only the + * start date. + * + */ + public boolean isMonthCompatible(ISODateInstance other) { + if (this.isUnparseable() || other.isUnparseable()) { + return this.isoDate.equals(other.isoDate); + } + + return isMonthCompatible(isoDate, other.getDateString()); + } + + /** + * Looks if the days for the two dates are compatible. + * This method does not consider ranges and uses only the + * start date. 
+ * + */ + public boolean isDayCompatible(ISODateInstance other) { + if (this.isUnparseable() || other.isUnparseable()) { + return this.isoDate.equals(other.isoDate); + } + + return isDayCompatible(isoDate, other.getDateString()); + } + + + /* -------------------------- Tokenization and Field Extraction -------------------------- */ + //These methods are taken directly from or modified slightly from {@link DateInstance} + + private void tokenizeDate(String inputDate) { + tokens = new ArrayList(); + Pattern pat = Pattern.compile("[-]"); + if (inputDate == null) { + System.out.println("Null input date"); + } + Matcher m = pat.matcher(inputDate); + String str = m.replaceAll(" - "); + str = str.replaceAll(",", " "); + PTBTokenizer tokenizer = PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str))); + while (tokenizer.hasNext()) { + Word nextToken = tokenizer.next(); + tokens.add(nextToken.toString()); + } + if(DEBUG) { + System.out.println("tokens:" + tokens); + } + } + + + /** + * This method does YYYY-MM-DD style ISO date formats + * + * @return whether it worked. + */ + private boolean extractYYYYMMDD(String inputDate) { + Pattern pat = Pattern.compile("([12][0-9]{3})[ /-]?([01]?[0-9])[ /-]([0-3]?[0-9])[ \t\r\n\f]*"); + Matcher m = pat.matcher(inputDate); + if (m.matches()) { + if (DEBUG) { + System.err.println("YYYYMMDD succeeded"); + } + String monthValue = m.group(2); + if (monthValue.length() < 2)//we always use two digit months + { + monthValue = '0' + monthValue; + } + String dayValue = m.group(3); + if (dayValue.length() < 2) { + dayValue = '0' + dayValue; + } + String yearString = m.group(1); + isoDate = yearString + monthValue + dayValue; + return true; + } + return false; + } + + /** + * This method copied from {@link DateInstance}; not sure how we tell that it + * is MMDD versus DDMM (sometimes it will be ambiguous). 
+ * + */ + private boolean extractMMDDYY(String inputDate) { + Pattern pat = Pattern.compile("([0-1]??[0-9])[ \t\n\r\f]*[/-][ \t\n\r\f]*([0-3]??[0-9])[ \t\r\n\f]*[/-][ \t\r\n\f]*([0-2]??[0-9]??[0-9][0-9])[ \t\r\n\f]*"); + Matcher m = pat.matcher(inputDate); + if (m.matches()) { + if (DEBUG) { + System.err.println("MMDDYY succeeded"); + } + String monthValue = m.group(1); + if (monthValue.length() < 2)//we always use two digit months + { + monthValue = '0' + monthValue; + } + String dayValue = m.group(2); + if (dayValue.length() < 2) { + dayValue = '0' + dayValue; + } + String yearString; // always initialized below + if (m.group(3).length() == 2) { + int yearInt = Integer.parseInt(m.group(3)); + //Now we add "20" or "19" to the front of the two digit year depending on its value.... + if (yearInt < 50) { + yearString = "20" + m.group(3); + } else { + yearString = "19" + m.group(3); + } + + } else { + yearString = m.group(3); + } + //lastYearSet = new Integer(yearString).intValue(); + isoDate = yearString + monthValue + dayValue; + return true; + } + return false; + } + + private Pattern re1 = Pattern.compile("[1-2][0-9]{3}|'[0-9]{2}"); + private Pattern re2 = Pattern.compile("[0-9][^0-9].*([0-9]{2})\\s*$"); + + public boolean extractYear(String inputDate) { + if (DEBUG) { + System.err.println("Extracting year from: |" + inputDate + '|'); + } + String extract; + Matcher m1 = re1.matcher(inputDate); + Matcher m2 = re2.matcher(inputDate); + if (m1.find()) { + extract = m1.group(0); + } else if (m2.find()) { + extract = m2.group(1); + } else { + extract = foundMiscYearPattern(inputDate); + if (extract == null || extract.equals("")) { + isoDate = "****"; + return false; + } + } + + if ( ! 
"".equals(extract)) { + if (extract.charAt(0) == '\'') { + extract = extract.substring(1); + } + extract = extract.trim(); + if (extract.length() == 2) { + if (extract.charAt(0) < '5') { + extract = "20" + extract; + } else { + extract = "19" + extract; + } + } + if (inputDate.charAt(inputDate.length() - 1) == 's') {//decade or century marker + if (extract.charAt(2) == '0') {//e.g., 1900s -> 1900/1999 + String endDate = Integer.toString((Integer.valueOf(extract) + 99)); + extract = extract + '/' + endDate; + } else {//e.g., 1920s -> 1920/1929 + String endDate = Integer.toString((Integer.valueOf(extract) + 9)); + extract = extract + '/' + endDate; + } + } + isoDate = extract; + if (DEBUG) { + System.err.println("year extracted:" + extract); + } + return true; + } + isoDate = "****"; + return false; + } + + /** + * Tries to find a year pattern in the input string that may be somewhat + * odd/non-standard. + * + */ + private static String foundMiscYearPattern(String inputDate) { + String year = ""; + if (inputDate.toLowerCase().contains("century")) { + if (inputDate.endsWith("A.D. 
")) { + inputDate = inputDate.substring(0, inputDate.length()-5); + if(DEBUG) { + System.out.println("inputDate: |" + inputDate + "|"); + } + } + if (inputDate.startsWith("late")) { + inputDate = inputDate.substring(5, inputDate.length()); + if(DEBUG) { + System.out.println("inputDate: |" + inputDate + "|"); + } + } + if (inputDate.startsWith("early")) { + inputDate = inputDate.substring(6, inputDate.length()); + if(DEBUG) { + System.out.println("inputDate: |" + inputDate + "|"); + } + } + if (Character.isDigit(inputDate.charAt(0))) { + // just parse number part, assuming last two letters are st/nd/rd + year = QuantifiableEntityNormalizer.normalizedNumberStringQuiet(inputDate.substring(0, inputDate.length() - 2), 1, "", null); + if (year.contains(".")) {//number format issue + year = year.substring(0, year.indexOf('.')); + } + while (year.length() < 4) { + year = year + '*'; + } + } else if (QuantifiableEntityNormalizer.ordinalsToValues.containsKey(inputDate)) { + year = Double.toString(QuantifiableEntityNormalizer.ordinalsToValues.getCount(inputDate)); + while (year.length() < 4) { + year = year + '*'; + } + } else { + if (DEBUG) { + System.out.println("ISODateInstance: Couldn't parse probable century: " + inputDate); + } + year = ""; + } + } + return year; + } + + private static final Pattern[] extractorArray = {Pattern.compile("[Jj]anuary|JANUARY|[Jj]an\\.?|JAN\\.?"), Pattern.compile("[Ff]ebruary|FEBRUARY|[Ff]eb\\.?|FEB\\.?"), Pattern.compile("[Mm]arch|MARCH|[Mm]ar\\.?|MAR\\.?"), Pattern.compile("[Aa]pril|APRIL|[Aa]pr\\.?|APR\\.?"), Pattern.compile("[Mm]ay|MAY"), Pattern.compile("[Jj]une|JUNE|[Jj]un\\.?|JUN\\.?"), Pattern.compile("[Jj]uly|JULY|[Jj]ul\\.?|JUL\\.?"), Pattern.compile("[Aa]ugust|AUGUST|[Aa]ug\\.?|AUG\\.?"), Pattern.compile("[Ss]eptember|SEPTEMBER|[Ss]ept?\\.?|SEPT?\\.?"), Pattern.compile("[Oo]ctober|OCTOBER|[Oo]ct\\.?|OCT\\.?"), Pattern.compile("[Nn]ovember|NOVEMBER|[Nn]ov\\.?|NOV\\.?"), 
Pattern.compile("[Dd]ecember|DECEMBER|[Dd]ec(?:\\.|[^aeiou]|$)|DEC(?:\\.|[^aeiou]|$)")}; // avoid matching "decades"! + + public boolean extractMonth(String inputDate) { + boolean foundMonth = false; + + for (int i = 0; i < 12; i++) { + String extract = ""; + Matcher m = extractorArray[i].matcher(inputDate); + if (m.find()) { + extract = m.group(0); + } + if ( ! "".equals(extract)) { + if (!foundMonth) { + if (DEBUG) { + System.err.println("month extracted: " + extract); + } + int monthNum = i + 1; + if (isoDate.length() != 4) { + isoDate = "****"; + } + String month = (monthNum < 10) ? "0" + monthNum : String.valueOf(monthNum); + isoDate += month; + foundMonth = true; + } + } + } + return foundMonth; + } + + public boolean extractDay(String inputDate) { + for (int a = 0; a < tokens.size(); a++) { + String extract = tokens.get(a); + if (QuantifiableEntityNormalizer.wordsToValues.containsKey(extract)) { + extract = Integer.toString(Double.valueOf(QuantifiableEntityNormalizer.wordsToValues.getCount(extract)).intValue()); + } else if (QuantifiableEntityNormalizer.ordinalsToValues.containsKey(extract)) { + extract = Integer.toString(Double.valueOf(QuantifiableEntityNormalizer.ordinalsToValues.getCount(extract)).intValue()); + } + extract = extract.replaceAll("[^0-9]", ""); + if (!extract.equals("")) { + try { + Integer i = Integer.valueOf(extract); + if (i.intValue() < 32 && i.intValue() > 0) { + if (isoDate.length() < 6) {//should already have year and month + if (isoDate.length() != 4)//throw new RuntimeException("Error extracting dates; should have had month and year but didn't"); + { + isoDate = isoDate + "******"; + } else { + isoDate = isoDate + "**"; + } + } + String day = (i < 10) ? 
"0" + i : String.valueOf(i); + isoDate = isoDate + day; + return true; + } + } catch (NumberFormatException e) { + System.err.println("Exception in extract Day."); + System.err.println("tokens size :" + tokens.size()); + e.printStackTrace(); + } + } + } + return false; + } + + private static Pattern[] weekdayArray = {Pattern.compile("[Ss]unday"), Pattern.compile("[Mm]onday"), Pattern.compile("[Tt]uesday"), Pattern.compile("[Ww]ednesday"), Pattern.compile("[Tt]hursday"), Pattern.compile("[Ff]riday"), Pattern.compile("[Ss]aturday")}; + + /** + * This is a backup method if everything else fails. It searches for named + * days of the week and if it finds one, it sets that as the date in lowercase form + * + */ + public boolean extractWeekday(String inputDate) { + for (Pattern p : weekdayArray) { + Matcher m = p.matcher(inputDate); + if (m.find()) { + String extract = m.group(0); + isoDate = extract.toLowerCase(); + return true; + } + } + return false; + } + + /** + * For testing only + * + */ + public static void main(String[] args) { + Properties props = StringUtils.argsToProperties(args); + String dateProperty = props.getProperty("date"); + if (dateProperty != null) { + ISODateInstance d = new ISODateInstance(dateProperty); + System.out.println(dateProperty + " processed as " + d.toString()); + } + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/InfoTemplate.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/InfoTemplate.java new file mode 100644 index 0000000..3dc0f01 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/InfoTemplate.java @@ -0,0 +1,58 @@ +package edu.stanford.nlp.ie.pascal; + +/** + * A partial {@link PascalTemplate}. + * Holds URL, acronym, and name template fields. 
+ * + * @author Chris Cox + */ +public class InfoTemplate{ + String whomepage="null"; + String wacronym="null"; + String wname="null"; + String chomepage="null"; + String cacronym="null"; + String cname="null"; + + public InfoTemplate(String whomepage, String wacronym, String wname, + String chomepage, String cacronym, String cname, + CliqueTemplates ct) { + + if(whomepage!=null)this.whomepage=whomepage; + if(wacronym!=null)this.wacronym=PascalTemplate.stemAcronym(wacronym,ct); + if(wname!=null)this.wname=wname; + if(chomepage!=null)this.chomepage=chomepage; + if(cacronym!=null)this.cacronym=PascalTemplate.stemAcronym(cacronym,ct); + if(cname!=null)this.cname=cname; + } + + @Override + public int hashCode() { + int tally=31; + int n=7; + tally = whomepage.hashCode()+n*wacronym.hashCode()+n*n*wname.hashCode(); + tally += (chomepage.hashCode() + + n*cacronym.hashCode()+ n*n*cname.hashCode()); + return tally; + } + + @Override + public boolean equals(Object obj){ + if(obj==null)return false; + if(!( obj instanceof InfoTemplate)) return false; + InfoTemplate i = (InfoTemplate)obj; + + return(whomepage.equals(i.whomepage)&& + wacronym.equals(i.wacronym) && + wname.equals(i.wname) && + chomepage.equals(i.chomepage)&& + cacronym.equals(i.cacronym) && + cname.equals(i.cname)); + } + + @Override + public String toString(){ + return ("W_URL: "+whomepage+" W_ACRO: "+wacronym+" W_NAME: "+wname+ + "\nC_URL: "+chomepage+" C_ACRO: "+cacronym+" C_NAME: "+cname); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/PascalTemplate.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/PascalTemplate.java new file mode 100644 index 0000000..0f869e2 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/PascalTemplate.java @@ -0,0 +1,279 @@ +package edu.stanford.nlp.ie.pascal; + +import edu.stanford.nlp.stats.Counter; +import edu.stanford.nlp.util.Index; +import 
edu.stanford.nlp.util.HashIndex; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Maps non-background Pascal fields to strings. + * + * @author Chris Cox + */ + + +public class PascalTemplate { + + public static final String[] fields = { + //dates + "workshoppapersubmissiondate", + "workshopnotificationofacceptancedate", + "workshopcamerareadycopydate", + "workshopdate", + //location + "workshoplocation", + //workshop info + "workshopacronym", + "workshophomepage", + "workshopname", + //conference info + "conferenceacronym", + "conferencehomepage", + "conferencename", + //background symbol + "0" + }; + + public static final String BACKGROUND_SYMBOL = "0"; + + private static final Index fieldIndices; + + static { + fieldIndices = new HashIndex(); + for (String field : fields) { + fieldIndices.add(field); + } + } + + private final String[] values; + + + public PascalTemplate() { + values = new String[fields.length]; + for (int i = 0; i < values.length; i++) { + values[i] = null; + } + } + + //copy constructor + public PascalTemplate(PascalTemplate pt) { + this.values = new String[fields.length]; + for (int i = 0; i < values.length; i++) { + if (pt.values[i] == null) { + this.values[i] = null; + } else { + this.values[i] = pt.values[i]; + } + } + } + + /* + * Acronym stemming and matching fields + */ + private static Pattern acronymPattern = Pattern.compile("([ \r-/a-zA-Z]+?)(?:[ -'*\t\r\n\f0-9]*)", Pattern.DOTALL); + + /** + * + */ + public static boolean acronymMatch(String s1, String s2, HashMap stemmedAcronymIndex) { + System.err.println("Testing match:" + s1 + " : " + s2); + String stem1 = (String) stemmedAcronymIndex.get(s1); + String stem2 = (String) stemmedAcronymIndex.get(s2); + System.err.println("Got stems:" + s1 + " : " + s2); + return stem1.equals(stem2); + } + /** + * + */ + public static String stemAcronym(String s, CliqueTemplates ct) { + if 
(ct.stemmedAcronymIndex.containsKey(s)) { + return (String) ct.stemmedAcronymIndex.get(s); + } + Matcher matcher = acronymPattern.matcher(s); + if (!matcher.matches() || s.equalsIgnoreCase("www")) { + System.err.println("Not a valid acronym: " + s); + return "null"; + } + + String stemmed = matcher.group(1).toLowerCase(); + if (stemmed.endsWith("-")) { + stemmed = stemmed.substring(0, stemmed.length() - 1); + } + + ct.stemmedAcronymIndex.put(s, stemmed); + System.err.println("Stemmed: " + s + " to: " + stemmed); + if (ct.inverseAcronymMap.containsKey(stemmed)) { + HashSet set = (HashSet) ct.inverseAcronymMap.get(stemmed); + set.add(s); + } else { + HashSet set = new HashSet(); + set.add(s); + ct.inverseAcronymMap.put(stemmed, set); + } + return stemmed; + } + +/** + * Merges partial (clique) templates into a full one. + * + * @param dt date template + * @param location location + * @param wi workshop/conference info template + * @return the {@link PascalTemplate} resulting from this merge. + */ + + public static PascalTemplate mergeCliqueTemplates(DateTemplate dt, String location, InfoTemplate wi) { + PascalTemplate pt = new PascalTemplate(); + pt.setValue("workshopnotificationofacceptancedate", dt.noadate); + pt.setValue("workshopcamerareadycopydate", dt.crcdate); + pt.setValue("workshopdate", dt.workdate); + pt.setValue("workshoppapersubmissiondate", dt.subdate); + pt.setValue("workshoplocation", location); + pt.setValue("workshopacronym", wi.wacronym); + pt.setValue("workshophomepage", wi.whomepage); + pt.setValue("workshopname", wi.wname); + pt.setValue("conferenceacronym", wi.cacronym); + pt.setValue("conferencehomepage", wi.chomepage); + pt.setValue("conferencename", wi.cname); + return pt; + } + +/** + * Sets template values. + * @param fieldName (i.e. 
workshopname, workshopdate) + */ + public void setValue(String fieldName, String value) { + int index = getFieldIndex(fieldName); + assert(index != -1); + values[index] = value; + } + + public void setValue(int index, String value) { + if (index != values.length - 1) { + values[index] = value; + } + } + + public String getValue(String fieldName) { + int i = getFieldIndex(fieldName); + if (i == -1 || i == values.length - 1) { + return null; + } else { + return values[i]; + } + } + + @Override + public boolean equals(Object obj) { + + if (obj == null) { + return false; + } + if (!(obj instanceof PascalTemplate)) { + return false; + } + + PascalTemplate pt = (PascalTemplate) obj; + String[] values2 = pt.values; + + if (values.length != values2.length) { + return false; + } + + for (int i = 0; i < values.length - 1; i++) { + if (values[i] == null) { + if (values2[i] != null) { + return false; + } + } else { + if (values2[i] == null) { + return false; + } + if (!values2[i].equals(values[i])) { + return false; + } + } + } + return true; + } + + @Override + public int hashCode() { + int tally = 37; + for (int i = 0; i < values.length - 1; i++) { + int n; + if (values[i] == null) { + n = 11; + } else { + n = values[i].hashCode(); + } + tally = 17 * tally + n; + } + return tally; + } + + /** + * + * @param tag field name (i.e. workshopdate, workshoplocation) + * @return the reference of that field in the underlying {@link edu.stanford.nlp.util.Index} + */ + public static int getFieldIndex(String tag) { + return (fieldIndices.indexOf(tag)); + } + + /** + * Should be passed a Counter[], each entry of which + * keeps scores for possibilities in that template slot. The counter + * for each template value is incremented by the corresponding score of + * this PascalTemplate. + * + * @param fieldValueCounter an array of counters, each of which holds label possibilities for one field + * @param score increment counts by this much. 
+ */ + + public void writeToFieldValueCounter(Counter[] fieldValueCounter, double score) { + for (int i = 0; i < fields.length; i++) { + if ((values[i] != null) && !values[i].equals("NULL")) { + fieldValueCounter[i].incrementCount(values[i], score); + } + } + } +/** + * Divides this template into partial templates, and updates the counts of these + * partial templates in the {@link CliqueTemplates} object. + * + * @param ct the partial templates counter object + * @param score increment counts by this much + */ + public void unpackToCliqueTemplates(CliqueTemplates ct, double score) { + + ct.dateCliqueCounter.incrementCount(new DateTemplate(values[0], values[1], values[2], values[3]), score); + if (values[4] != null) { + ct.locationCliqueCounter.incrementCount(values[4], score); + } + + ct.workshopInfoCliqueCounter.incrementCount(new InfoTemplate(values[6], values[5], values[7], values[9], values[8], values[10], ct), score); + } + + public void print() { + System.err.println("PascalTemplate: "); + System.err.println(this.toString()); + } + + @Override + public String toString() { + String str = "\n====================\n"; + for (int i = 0; i < values.length; i++) { + if (values[i] != null) { + if (!(values[i].equalsIgnoreCase("NULL"))) { + str = str.concat(fields[i] + " : " + values[i] + "\n"); + } + } + } + return str; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/Prior.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/Prior.java new file mode 100644 index 0000000..0a1b96b --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/Prior.java @@ -0,0 +1,77 @@ +package edu.stanford.nlp.ie.pascal; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * @author Jamie Nicolson + */ +public class 
Prior { + // Map maps field names to indexes in the matrix + private Map fieldIndices; + private String[] indexFields; + + // n-dimensional boolean matrix. There will be 2^n entries in the matrix. + private double[] matrix; + + public Prior(BufferedReader reader) throws IOException { + String line; + line = reader.readLine(); + if (line == null) { + throw new IOException(); + } + indexFields = line.split("\\s+"); + fieldIndices = new HashMap(); + for (int i = 0; i < indexFields.length; ++i) { + fieldIndices.put(indexFields[i], Integer.valueOf(i)); + } + if (indexFields.length < 1 || indexFields.length > 31) { + throw new IOException("Invalid number of fields, should be >=1 and <= 31"); + } + int matrixSize = 1 << indexFields.length; + matrix = new double[matrixSize]; + int matrixIdx = 0; + while (matrixIdx < matrix.length && (line = reader.readLine()) != null) { + String[] tokens = line.split("\\s+"); + for (int t = 0; matrixIdx < matrix.length && t < tokens.length; ++t) { + matrix[matrixIdx++] = Double.parseDouble(tokens[t]); + } + } + } + + /** + * Map + */ + public double get(Set presentFields) { + int index = 0; + for (int f = 0; f < indexFields.length; ++f) { + String field = indexFields[f]; + index *= 2; + if (presentFields.contains(field)) { + ++index; + } + } + return matrix[index]; + } + + public static void main(String args[]) throws Exception { + + BufferedReader br = new BufferedReader(new FileReader("/tmp/acstats")); + + Prior p = new Prior(br); + + HashSet hs = new HashSet(); + hs.add("workshopname"); + //hs.add("workshopacronym"); + + double d = p.get(hs); + System.out.println("d is " + d); + + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/RelationalModel.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/RelationalModel.java new file mode 100644 index 0000000..5bcfb77 --- /dev/null +++ 
b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/RelationalModel.java @@ -0,0 +1,18 @@ +package edu.stanford.nlp.ie.pascal; + + +/** + * An interface for the relational models in phase 2 of the pascal system. + * + * @author Jamie Nicolson + */ +public interface RelationalModel { + /** + * + * @param temp template to be scored + * @return its score + */ + public double computeProb(PascalTemplate temp); + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/TeXHyphenator.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/TeXHyphenator.java new file mode 100644 index 0000000..1f8b9cb --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/pascal/TeXHyphenator.java @@ -0,0 +1,192 @@ +package edu.stanford.nlp.ie.pascal; + +import java.io.*; +import java.util.*; +import edu.stanford.nlp.util.StringUtils; + +/** + * Hyphenates words according to the TeX algorithm. + * @author Jamie Nicolson (nicolson@cs.stanford.edu) + */ +public class TeXHyphenator { + + private static class Node { + HashMap children = new HashMap(); + + int [] pattern = null; + }; + + /** + * Loads the default hyphenation rules in DefaultTeXHyphenator. + */ + public void loadDefault() { + try { + load( new BufferedReader(new StringReader( + DefaultTeXHyphenData.hyphenData) ) ); + } catch(IOException e) { + // shouldn't happen + throw new RuntimeException(e); + } + } + + /** + * Loads custom hyphenation rules. You probably want to use + * loadDefault() instead. 
+ * + */ + public void load(BufferedReader input) throws IOException { + String line; + while( (line=input.readLine()) != null ) { + if( StringUtils.matches(line, "\\s*(%.*)?") ) { + // comment or blank line + System.err.println("Skipping: " + line); + continue; + } + char [] linechars = line.toCharArray(); + int [] pattern = new int[linechars.length]; + char [] chars = new char[linechars.length]; + int c = 0; + for( int i = 0; i < linechars.length; ++i) { + if( Character.isDigit(linechars[i]) ) { + pattern[c] = Character.digit(linechars[i], 10); + } else { + chars[c++] = linechars[i]; + } + } + char[] shortchars = new char[c]; + int [] shortpattern = new int[c+1]; + System.arraycopy(chars, 0, shortchars, 0, c); + System.arraycopy(pattern, 0, shortpattern, 0, c+1); + insertHyphPattern(shortchars, shortpattern); + } + } + + private Node head = new Node(); + + public static String toString(int[]i) { + StringBuffer sb = new StringBuffer(); + for(int j = 0; j < i.length; ++j) { + sb.append(i[j]); + } + return sb.toString(); + } + + private void insertHyphPattern(char [] chars, int [] pattern) { + // find target node, building as we go + Node cur = head; + for( int c = 0; c < chars.length; ++c) { + Character curchar = new Character(chars[c]); + Node next = (Node) cur.children.get(curchar); + if( next == null ) { + next = new Node(); + cur.children.put( curchar, next ); + } + cur = next; + } + assert( cur.pattern == null ); + cur.pattern = pattern; + } + + private List getMatchingPatterns( char[] chars, int startingIdx ) { + Node cur = head; + LinkedList matchingPatterns = new LinkedList(); + if( cur.pattern != null ) { + matchingPatterns.add(cur.pattern); + } + for(int c = startingIdx; cur != null && c < chars.length; ++c ) { + Character curchar = new Character(chars[c]); + Node next = (Node) cur.children.get(curchar); + cur = next; + if( cur != null && cur.pattern != null ) { + matchingPatterns.add(cur.pattern); + } + } + return matchingPatterns; + } + + + private void 
labelWordBreakPoints( char [] phrase, int start, int end, + boolean[] breakPoints) + { + + char [] word = new char[end-start+2]; + System.arraycopy(phrase, start, word, 1, end-start); + word[0] = '.'; + word[word.length-1] = '.'; + + // breakScore[i] is the score for breaking before word[i] + int [] breakScore = new int [word.length + 1]; + + for( int c = 0; c < word.length; ++c ) { + List patterns = getMatchingPatterns(word, c); + Iterator iter = patterns.iterator(); + while(iter.hasNext()) { + int [] pattern = (int[]) iter.next(); + for( int i = 0; i < pattern.length; ++i ) { + if( breakScore[c+i] < pattern[i] ) { + breakScore[c+i] = pattern[i]; + } + } + } + } + + breakPoints[start] = true; + for( int i = start+1; i < end; i++) { + // remember that breakPoints is offset by one because we introduced + // the leading "." + breakPoints[i-1] |= (breakScore[i-start] % 2 == 1 ); + } + } + + /** + * @param lcphrase Some English text in lowercase. + * @return An array of booleans, one per character of the input, + * indicating whether it would be OK to insert a hyphen before that + * character. 
+ */ + public boolean[] findBreakPoints(char [] lcphrase) { + + boolean [] breakPoints = new boolean[lcphrase.length]; + + boolean inWord = false; + int wordStart = 0; + int c = 0; + for(; c < lcphrase.length; ++c) { + if( !inWord && Character.isLetter(lcphrase[c]) ) { + wordStart = c; + inWord = true; + } else if( inWord && !Character.isLetter(lcphrase[c]) ) { + inWord = false; + labelWordBreakPoints(lcphrase, wordStart, c, breakPoints); + } + } + if( inWord ) { + labelWordBreakPoints(lcphrase, wordStart, c, breakPoints); + } + + return breakPoints; + } + + public static void main(String[] args) throws Exception { + + TeXHyphenator hyphenator = new TeXHyphenator(); + hyphenator.loadDefault(); + + for( int a = 0; a < args.length; ++a) { + char[] chars = args[a].toLowerCase().toCharArray(); + boolean [] breakPoints = hyphenator.findBreakPoints(chars); + System.out.println(args[a]); + StringBuffer sb = new StringBuffer(); + for(int i = 0; i < breakPoints.length; ++i) { + if( breakPoints[i] ) { + sb.append("^"); + } else { + sb.append("-"); + } + } + System.out.println(sb.toString()); + } + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java new file mode 100644 index 0000000..2314f7e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java @@ -0,0 +1,846 @@ +package edu.stanford.nlp.ie.regexp; + +import edu.stanford.nlp.ie.AbstractSequenceClassifier; +import edu.stanford.nlp.time.TimeAnnotations; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.sequences.DocumentReaderAndWriter; +import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter; +import 
edu.stanford.nlp.time.TimeExpressionExtractor; +import edu.stanford.nlp.time.TimeExpressionExtractorFactory; +import edu.stanford.nlp.time.Timex; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.PaddedList; +import edu.stanford.nlp.util.StringUtils; + +import java.io.ObjectInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Properties; +import java.util.regex.Pattern; + +/** + * A set of deterministic rules for marking certain entities, to add + * categories and to correct for failures of statistical NER taggers. + * This is an extremely simple and ungeneralized implementation of + * AbstractSequenceClassifier that was written for PASCAL RTE. + * It could profitably be extended and generalized. + * It marks a NUMBER category based on part-of-speech tags in a + * deterministic manner. + * It marks an ORDINAL category based on word form in a deterministic manner. + * It tags as MONEY currency signs and things tagged CD after a currency sign. + * It marks a number before a month name as a DATE. + * It marks as a DATE a word of the form xx/xx/xxxx + * (where x is a digit from a suitable range). + * It marks as a TIME a word of the form x(x):xx (where x is a digit). + * It marks everything else tagged "CD" as a NUMBER, and instances + * of "and" appearing between CD tags in contexts suggestive of a number. + * It requires text to be POS-tagged (have the getString(TagAnnotation.class) attribute). + * Effectively these rules assume that + * this classifier will be used as a secondary classifier by + * code such as ClassifierCombiner: it will mark most CD as NUMBER, and it + * is assumed that something else with higher priority is marking ones that + * are PERCENT, ADDRESS, etc. 
+ * + * @author Christopher Manning + * @author Mihai (integrated with NumberNormalizer, SUTime) + */ +public class NumberSequenceClassifier extends AbstractSequenceClassifier { + + private static final boolean DEBUG = false; + + private final boolean useSUTime; + + public static final boolean USE_SUTIME_DEFAULT = TimeExpressionExtractorFactory.DEFAULT_EXTRACTOR_PRESENT; + public static final String USE_SUTIME_PROPERTY = "ner.useSUTime"; + + private final TimeExpressionExtractor timexExtractor; + + public NumberSequenceClassifier() { + this(new Properties(), USE_SUTIME_DEFAULT, new Properties()); + if (! CURRENCY_WORD_PATTERN.matcher("pounds").matches()) { + System.err.println("NumberSequence: Currency pattern broken"); + } + } + + public NumberSequenceClassifier(boolean useSUTime) { + this(new Properties(), useSUTime, new Properties()); + } + + public NumberSequenceClassifier(Properties props, + boolean useSUTime, Properties sutimeProps) { + super(props); + this.useSUTime = useSUTime; + if(this.useSUTime) { + this.timexExtractor = TimeExpressionExtractorFactory.createExtractor(); + this.timexExtractor.init("sutime", sutimeProps); + } else { + this.timexExtractor = null; + } + } + + /** + * Classify a {@link List} of {@link CoreLabel}s. + * + * @param document A {@link List} of {@link CoreLabel}s. + * @return the same {@link List}, but with the elements annotated + * with their answers. + */ + @Override + public List classify(List document) { + return classifyWithGlobalInformation(document, null, null); + } + + @Override + public List classifyWithGlobalInformation(List tokens, final CoreMap document, final CoreMap sentence) { + if(useSUTime) return classifyWithSUTime(tokens, document, sentence); + return classifyOld(tokens); + } + + /** + * Modular classification using NumberNormalizer for numbers, SUTime for date/time. 
+ * Note: this is slower than classifyOld because it runs multiple passes + * over the tokens (one for numbers and dates, and others for money and ordinals). + * However, the slowdown is not substantial since the passes are fast. Plus, + * the code is much cleaner than before... + * @param tokenSequence + */ + private List classifyWithSUTime(List tokenSequence, final CoreMap document, final CoreMap sentence) { + // + // set everything to "O" by default + // + for(CoreLabel token: tokenSequence) { + if(token.get(CoreAnnotations.AnswerAnnotation.class) == null) + token.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol); + } + + // + // run SUTime + // note: SUTime requires TextAnnotation to be set at document/sent level and + // that the Character*Offset annotations be aligned with the token words. + // This is guaranteed because here we work on a copy generated by copyTokens() + // + CoreMap timeSentence = (sentence != null ? + alignSentence(sentence) : + buildSentenceFromTokens(tokenSequence)); + List timeExpressions = runSUTime(timeSentence, document); + List numbers = timeSentence.get(CoreAnnotations.NumerizedTokensAnnotation.class); + + // + // store DATE and TIME + // + if(timeExpressions != null){ + for(CoreMap timeExpression: timeExpressions) { + int start = timeExpression.get(CoreAnnotations.TokenBeginAnnotation.class); + int end = timeExpression.get(CoreAnnotations.TokenEndAnnotation.class); + int offset = 0; + if(sentence != null && sentence.containsKey(CoreAnnotations.TokenBeginAnnotation.class)) { + offset = sentence.get(CoreAnnotations.TokenBeginAnnotation.class); + } + Timex timex = timeExpression.get(TimeAnnotations.TimexAnnotation.class); + if(timex != null){ + if(DEBUG){ + System.err.println("FOUND DATE/TIME \"" + timeExpression + + "\" with offsets " + start + " " + end + + " and value " + timex); + System.err.println("The above CoreMap has the following fields:"); + // for(Class key: timeExpression.keySet()) 
System.err.println("\t" + key + ": " + timeExpression.get(key)); + } + String label = timex.timexType(); + for(int i = start; i < end; i ++){ + CoreLabel token = tokenSequence.get(i - offset); + if(token.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol)){ + token.set(CoreAnnotations.AnswerAnnotation.class, label); + token.set(TimeAnnotations.TimexAnnotation.class, timex); + } + } + } + } + } + + // + // store the numbers found by SUTime as NUMBER if they are not part of anything else + // + if(numbers != null){ + for(CoreMap number: numbers) { + if(number.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class)){ + int start = number.get(CoreAnnotations.TokenBeginAnnotation.class); + int end = number.get(CoreAnnotations.TokenEndAnnotation.class); + int offset = 0; + if(sentence != null && sentence.containsKey(CoreAnnotations.TokenBeginAnnotation.class)) { + offset = sentence.get(CoreAnnotations.TokenBeginAnnotation.class); + } + String type = number.get(CoreAnnotations.NumericCompositeTypeAnnotation.class); + Number value = number.get(CoreAnnotations.NumericCompositeValueAnnotation.class); + if(type != null){ + if(DEBUG) System.err.println("FOUND NUMBER \"" + number + "\" with offsets " + start + " " + end + " and value " + value + " and type " + type); + for(int i = start; i < end; i ++){ + CoreLabel token = tokenSequence.get(i - offset); + if(token.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol)){ + token.set(CoreAnnotations.AnswerAnnotation.class, type); + if(value != null){ + token.set(CoreAnnotations.NumericCompositeValueAnnotation.class, value); + } + } + } + } + } + } + } + // everything tagged as CD is also a number + // NumberNormalizer probably catches these but let's be safe + for(CoreLabel token: tokenSequence) { + if(token.tag().equals("CD") && + token.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol)){ + token.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER"); + 
} + } + + // extract money and percents + moneyAndPercentRecognizer(tokenSequence); + + // ordinals + // NumberNormalizer probably catches these but let's be safe + ordinalRecognizer(tokenSequence); + + return tokenSequence; + } + + /** + * Copies one sentence replicating only information necessary for SUTime + * @param sentence + */ + public static CoreMap alignSentence(CoreMap sentence) { + + String text = sentence.get(CoreAnnotations.TextAnnotation.class); + if(text != null){ + // original text is preserved; no need to align anything + return sentence; + } + + CoreMap newSentence = buildSentenceFromTokens( + sentence.get(CoreAnnotations.TokensAnnotation.class), + sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), + sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); + + newSentence.set(CoreAnnotations.TokenBeginAnnotation.class, + sentence.get(CoreAnnotations.TokenBeginAnnotation.class)); + newSentence.set(CoreAnnotations.TokenEndAnnotation.class, + sentence.get(CoreAnnotations.TokenEndAnnotation.class)); + + return newSentence; + } + + private static CoreMap buildSentenceFromTokens(List tokens) { + return buildSentenceFromTokens(tokens, null, null); + } + + private static CoreMap buildSentenceFromTokens( + List tokens, + Integer characterOffsetStart, + Integer characterOffsetEnd) { + + // + // Recover the sentence text: + // a) try to get it from TextAnnotation + // b) if not present, build it from the OriginalTextAnnotation of each token + // c) if not present, build it from the TextAnnotation of each token + // + boolean adjustCharacterOffsets = false; + // try to recover the text from the original tokens + String text = buildText(tokens, CoreAnnotations.OriginalTextAnnotation.class); + if(text == null){ + text = buildText(tokens, CoreAnnotations.TextAnnotation.class); + // character offset will point to the original tokens + // so we need to align them to the text built from normalized tokens + adjustCharacterOffsets = true; + 
if(text == null){ + throw new RuntimeException("ERROR: to use SUTime, sentences must have TextAnnotation set, or the individual tokens must have OriginalTextAnnotation or TextAnnotation set!"); + } + } + + // make sure token character offsets are aligned with text + List tokenSequence = copyTokens(tokens, adjustCharacterOffsets, false); + + Annotation newSentence = new Annotation(text); + newSentence.set(CoreAnnotations.TokensAnnotation.class, tokenSequence); + if (! adjustCharacterOffsets && + characterOffsetStart != null && + characterOffsetEnd != null){ + newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, characterOffsetStart); + newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, characterOffsetEnd); + } else { + int tokenCharStart = tokenSequence.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); + int tokenCharEnd = tokenSequence.get(tokenSequence.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokenCharStart); + newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokenCharEnd); + } + + // some default token offsets + newSentence.set(CoreAnnotations.TokenBeginAnnotation.class, 0); + newSentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenSequence.size()); + + return newSentence; + } + + @SuppressWarnings("unchecked") + private static String buildText(List tokens, Class textAnnotation) { + StringBuilder os = new StringBuilder(); + for (int i = 0, sz = tokens.size(); i < sz; i ++) { + CoreLabel crt = tokens.get(i); + // System.out.println("\t" + crt.word() + "\t" + crt.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) + "\t" + crt.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); + if (i > 0) { + CoreLabel prev = tokens.get(i - 1); + int spaces = 1; + if (crt.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) { + spaces = 
crt.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) - + prev.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + } + while (spaces > 0) { + os.append(' '); + spaces--; + } + } + String word = (String) crt.get(textAnnotation); + if (word == null) { + // this annotation does not exist; bail out + return null; + } + os.append(word); + } + return os.toString(); + } + + /** + * Runs SUTime and converts its output into NamedEntityTagAnnotations + * @param sentence + * @param document Contains document-level annotations such as DocDateAnnotation + */ + private List runSUTime(CoreMap sentence, final CoreMap document) { + // docDate can be null. In such situations we do not disambiguate relative dates + String docDate = (document != null ? document.get(CoreAnnotations.DocDateAnnotation.class) : null); + + /* + System.err.println("PARSING SENTENCE: " + sentence.get(CoreAnnotations.TextAnnotation.class)); + for(CoreLabel t: sentence.get(CoreAnnotations.TokensAnnotation.class)){ + System.err.println("TOKEN: \"" + t.word() + "\" \"" + t.get(CoreAnnotations.OriginalTextAnnotation.class) + "\" " + t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) + " " + t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); + } + */ + + List timeExpressions = timexExtractor.extractTimeExpressionCoreMaps(sentence, docDate); + if(timeExpressions != null){ + if(DEBUG) System.out.println("FOUND TEMPORALS: " + timeExpressions); + } + + return timeExpressions; + } + + /** + * Recognizes money and percents + * This accepts currency symbols (e.g., $) both before and after numbers; but it accepts units (e.g., "dollar") only after + * @param tokenSequence + */ + private void moneyAndPercentRecognizer(List tokenSequence) { + for(int i = 0; i < tokenSequence.size(); i ++){ + CoreLabel crt = tokenSequence.get(i); + CoreLabel next = (i < tokenSequence.size() - 1 ? tokenSequence.get(i + 1) : null); + CoreLabel prev = (i > 0 ? 
tokenSequence.get(i - 1) : null); + + // $5 + if(CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches() && next != null && + (next.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || next.tag().equals("CD"))){ + crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); + i = changeLeftToRight(tokenSequence, i + 1, + next.get(CoreAnnotations.AnswerAnnotation.class), + next.tag(), "MONEY") - 1; + } + + // 5$, 5 dollars + else if((CURRENCY_WORD_PATTERN.matcher(crt.word()).matches() || + CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches()) && + prev != null && + (prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || + prev.tag().equals("CD")) && + ! leftScanFindsWeightWord(tokenSequence, i)){ + crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); + changeRightToLeft(tokenSequence, i - 1, + prev.get(CoreAnnotations.AnswerAnnotation.class), + prev.tag(), "MONEY"); + } + + // 5%, 5 percent + else if((PERCENT_WORD_PATTERN.matcher(crt.word()).matches() || + PERCENT_SYMBOL_PATTERN.matcher(crt.word()).matches()) && + prev != null && + (prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || + prev.tag().equals("CD"))){ + crt.set(CoreAnnotations.AnswerAnnotation.class, "PERCENT"); + changeRightToLeft(tokenSequence, i - 1, + prev.get(CoreAnnotations.AnswerAnnotation.class), + prev.tag(), "PERCENT"); + } + } + } + + /** + * Recognizes ordinal numbers + * @param tokenSequence + */ + private void ordinalRecognizer(List tokenSequence) { + for (CoreLabel crt : tokenSequence) { + if ((crt.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol) || + crt.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER")) && + ORDINAL_PATTERN.matcher(crt.word()).matches()) { + crt.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL"); + } + } + } + + private int changeLeftToRight(List tokens, + int start, + String oldTag, + String posTag, + String newTag) { + while(start < tokens.size()) { + CoreLabel crt = tokens.get(start); + 
// we are scanning for a NER tag and found something different + if(! oldTag.equals(flags.backgroundSymbol) && ! crt.get(CoreAnnotations.AnswerAnnotation.class).equals(oldTag)) { + break; + } + // the NER tag is not set, so we scan for similar POS tags + if(oldTag.equals(flags.backgroundSymbol) && ! crt.tag().equals(posTag)) { + break; + } + + crt.set(CoreAnnotations.AnswerAnnotation.class, newTag); + start ++; + } + return start; + } + + private int changeRightToLeft(List tokens, + int start, + String oldTag, + String posTag, + String newTag) { + while(start >= 0) { + CoreLabel crt = tokens.get(start); + // we are scanning for a NER tag and found something different + if(! oldTag.equals(flags.backgroundSymbol) && ! crt.get(CoreAnnotations.AnswerAnnotation.class).equals(oldTag)) { + break; + } + // the NER tag is not set, so we scan for similar POS tags + if(oldTag.equals(flags.backgroundSymbol) && ! crt.tag().equals(posTag)) { + break; + } + + crt.set(CoreAnnotations.AnswerAnnotation.class, newTag); + start --; + } + return start; + } + + /** + * Aligns the character offsets of these tokens with the actual text stored in each token + * Note that this copies the list ONLY when we need to adjust the character offsets. Otherwise, it keeps the original list. + * Note that this looks first at OriginalTextAnnotation and only when null at TextAnnotation. + * @param srcList + * @param adjustCharacterOffsets If true, it adjust the character offsets to match exactly with the token lengths + */ + private static List copyTokens(List srcList, + boolean adjustCharacterOffsets, + boolean forceCopy) { + // no need to adjust anything; use the original list + if(! adjustCharacterOffsets && ! forceCopy) return srcList; + + List dstList = new ArrayList(); + int adjustment = 0; + int offset = 0; // for when offsets are not available + for(CoreLabel src: srcList) { + if(adjustCharacterOffsets) { + int wordLength = (src.containsKey(CoreAnnotations.OriginalTextAnnotation.class))? 
+ src.get(CoreAnnotations.OriginalTextAnnotation.class).length():src.word().length(); + + // We try to preserve the old character offsets but they just don't work well for normalized token text + // Also, in some cases, these offsets are not set + if(src.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class) && + src.containsKey(CoreAnnotations.CharacterOffsetEndAnnotation.class)){ + int start = src.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); + int end = src.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + int origLength = end - start; + start += adjustment; + end = start + wordLength; + dstList.add(copyCoreLabel(src, start, end)); + adjustment += wordLength - origLength; + } else { + int start = offset; + int end = start + wordLength; + offset = end + 1; // allow for one space character + dstList.add(copyCoreLabel(src, start, end)); + } + } else { + dstList.add(copyCoreLabel(src, null, null)); + } + } + + return dstList; + } + + /** + * Transfer from src to dst all annotations generated bu SUTime and NumberNormalizer + * @param src + * @param dst + */ + public static void transferAnnotations(CoreLabel src, CoreLabel dst) { + // + // annotations potentially set by NumberNormalizer + // + if(src.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class)){ + dst.set(CoreAnnotations.NumericCompositeValueAnnotation.class, + src.get(CoreAnnotations.NumericCompositeValueAnnotation.class)); + } + + if(src.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class)) + dst.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, + src.get(CoreAnnotations.NumericCompositeTypeAnnotation.class)); + + // + // annotations set by SUTime + // + if(src.containsKey(TimeAnnotations.TimexAnnotation.class)) + dst.set(TimeAnnotations.TimexAnnotation.class, + src.get(TimeAnnotations.TimexAnnotation.class)); + } + + /** + * Create a copy of srcTokens, detecting on the fly if character offsets need adjusting + * @param srcTokens + * @param 
srcSentence + */ + public static List copyTokens(List srcTokens, CoreMap srcSentence) { + boolean adjustCharacterOffsets = false; + if(srcSentence == null || + srcSentence.get(CoreAnnotations.TextAnnotation.class) == null || + srcTokens.size() == 0 || + srcTokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class) == null){ + adjustCharacterOffsets = true; + } + + return copyTokens(srcTokens, adjustCharacterOffsets, true); + } + + /** + * Copies only the fields required for numeric entity extraction in the new CoreLabel + * @param src + */ + private static CoreLabel copyCoreLabel(CoreLabel src, Integer startOffset, Integer endOffset) { + CoreLabel dst = new CoreLabel(); + dst.setWord(src.word()); + dst.setTag(src.tag()); + if (src.containsKey(CoreAnnotations.OriginalTextAnnotation.class)) { + dst.set(CoreAnnotations.OriginalTextAnnotation.class, src.get(CoreAnnotations.OriginalTextAnnotation.class)); + } + if(startOffset == null){ + dst.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, src.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); + } else { + dst.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, startOffset); + } + if(endOffset == null){ + dst.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, src.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); + } else { + dst.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, endOffset); + } + + transferAnnotations(src, dst); + + return dst; + } + + public static final Pattern MONTH_PATTERN = Pattern.compile("January|Jan\\.?|February|Feb\\.?|March|Mar\\.?|April|Apr\\.?|May|June|Jun\\.?|July|Jul\\.?|August|Aug\\.?|September|Sept?\\.?|October|Oct\\.?|November|Nov\\.?|December|Dec\\."); + + public static final Pattern YEAR_PATTERN = Pattern.compile("[1-3][0-9]{3}|'?[0-9]{2}"); + + public static final Pattern DAY_PATTERN = Pattern.compile("(?:[1-9]|[12][0-9]|3[01])(?:st|nd|rd)?"); + + public static final Pattern DATE_PATTERN = 
Pattern.compile("(?:[1-9]|[0-3][0-9])\\\\?/(?:[1-9]|[0-3][0-9])\\\\?/[1-3][0-9]{3}"); + + public static final Pattern DATE_PATTERN2 = Pattern.compile("[12][0-9]{3}[-/](?:0?[1-9]|1[0-2])[-/][0-3][0-9]"); + + public static final Pattern TIME_PATTERN = Pattern.compile("[0-2]?[0-9]:[0-5][0-9]"); + + public static final Pattern TIME_PATTERN2 = Pattern.compile("[0-2][0-9]:[0-5][0-9]:[0-5][0-9]"); + + public static final Pattern AM_PM = Pattern.compile("(a\\.?m\\.?)|(p\\.?m\\.?)", Pattern.CASE_INSENSITIVE); + + public static final Pattern CURRENCY_WORD_PATTERN = Pattern.compile("(?:dollar|cent|euro|pound)s?|penny|pence|yen|yuan|won", Pattern.CASE_INSENSITIVE); + public static final Pattern CURRENCY_SYMBOL_PATTERN = Pattern.compile("\\$|£|\u00A3|\u00A5|#|\u20AC|US\\$|HK\\$|A\\$", Pattern.CASE_INSENSITIVE); + + public static final Pattern ORDINAL_PATTERN = Pattern.compile("(?i)[2-9]?1st|[2-9]?2nd|[2-9]?3rd|1[0-9]th|[2-9]?[04-9]th|100+th|zeroth|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth|hundredth|thousandth|millionth"); + + public static final Pattern ARMY_TIME_MORNING = Pattern.compile("0([0-9])([0-9]){2}"); + + public static final Pattern GENERIC_TIME_WORDS = Pattern.compile("(morning|evening|night|noon|midnight|teatime|lunchtime|dinnertime|suppertime|afternoon|midday|dusk|dawn|sunup|sundown|daybreak|day)"); + + public static final Pattern PERCENT_WORD_PATTERN = Pattern.compile("percent", Pattern.CASE_INSENSITIVE); + public static final Pattern PERCENT_SYMBOL_PATTERN = Pattern.compile("%"); + + private List classifyOld(List document) { + // if (DEBUG) { System.err.println("NumberSequenceClassifier tagging"); } + PaddedList pl = new 
PaddedList(document, pad); + for (int i = 0, sz = pl.size(); i < sz; i++) { + CoreLabel me = pl.get(i); + CoreLabel prev = pl.get(i - 1); + CoreLabel next = pl.get(i + 1); + CoreLabel next2 = pl.get(i + 2); + //if (DEBUG) { System.err.println("Tagging:" + me.word()); } + me.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol); + if (CURRENCY_SYMBOL_PATTERN.matcher(me.word()).matches() && + (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") || + next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD"))) { + // dollar, pound, pound, yen, + // Penn Treebank ancient # as pound, euro, + if (DEBUG) { + System.err.println("Found currency sign:" + me.word()); + } + me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); + } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) { + if (DEBUG) { + System.err.println("Tagging CD:" + me.word()); + } + + if (TIME_PATTERN.matcher(me.word()).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "TIME"); + } else if (TIME_PATTERN2.matcher(me.word()).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "TIME"); + } else if (DATE_PATTERN.matcher(me.word()).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } else if (DATE_PATTERN2.matcher(me.word()).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + + } else if (next.get(CoreAnnotations.TextAnnotation.class) != null && + me.get(CoreAnnotations.TextAnnotation.class) != null && + DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches() && + MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) { + // deterministically make DATE for British-style number before month + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } else if (prev.get(CoreAnnotations.TextAnnotation.class) != null && + MONTH_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() && + 
me.get(CoreAnnotations.TextAnnotation.class) != null && + DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches()) { + // deterministically make DATE for number after month + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } else if (rightScanFindsMoneyWord(pl, i) && ! leftScanFindsWeightWord(pl, i)) { + me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); + } else if(ARMY_TIME_MORNING.matcher(me.word()).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "TIME"); + } else + if (YEAR_PATTERN.matcher(me.word()).matches() && + prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") && + (MONTH_PATTERN.matcher(prev.word()).matches() || + pl.get(i - 2).get(CoreAnnotations.AnswerAnnotation.class).equals("DATE"))) + { + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } else { + if (DEBUG) { + System.err.println("Found number:" + me.word()); + } + if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) { + me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); + } else { + me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER"); + } + } + } else if(AM_PM.matcher(me.word()).matches() && + prev.get(CoreAnnotations.AnswerAnnotation.class).equals("TIME")){ + me.set(CoreAnnotations.AnswerAnnotation.class, "TIME"); + } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null && + me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",") && + prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") && + next.word() != null && YEAR_PATTERN.matcher(next.word()).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP") && + MONTH_PATTERN.matcher(me.word()).matches()) { + if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") || + next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) { + 
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } + } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null && + me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC")) { + if (prev.tag() != null && prev.tag().equals("CD") && + next.tag() != null && next.tag().equals("CD") && + me.get(CoreAnnotations.TextAnnotation.class) != null && + me.get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("and")) { + if (DEBUG) { + System.err.println("Found number and:" + me.word()); + } + String wd = prev.word(); + if (wd.equalsIgnoreCase("hundred") || + wd.equalsIgnoreCase("thousand") || + wd.equalsIgnoreCase("million") || + wd.equalsIgnoreCase("billion") || + wd.equalsIgnoreCase("trillion")) + { + me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER"); + } + } + } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null && + (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NN") || + me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS"))) { + if (CURRENCY_WORD_PATTERN.matcher(me.word()).matches()) { + if (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") && + prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) { + me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); + } + } else if (me.word().equals("m") || me.word().equals("b")) { + // could be metres, but it's probably million or billion in our + // applications + if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) { + me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); + } else { + me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER"); + } + } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) { + if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) || + (next.word() != null && next.word().equalsIgnoreCase("of") && + next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches())) { + 
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } + } else if(GENERIC_TIME_WORDS.matcher(me.word()).matches()){ + me.set(CoreAnnotations.AnswerAnnotation.class, "TIME"); + } + } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ")) { + if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) || + next.word() != null && next.word().equalsIgnoreCase("of") && + next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) { + // don't do other tags: don't want 'second' as noun, or 'first' as adverb + // introducing reasons + me.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL"); + } + } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("IN") && + me.word().equalsIgnoreCase("of")) { + if (prev.get(CoreAnnotations.TextAnnotation.class) != null && + ORDINAL_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() && + next.get(CoreAnnotations.TextAnnotation.class) != null && + MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) { + me.set(CoreAnnotations.AnswerAnnotation.class, "DATE"); + } + } + } + return document; + } + + + /** + * Look for a distance of up to 3 for something that indicates weight not + * money. 
+ * + * @param pl The list of CoreLabel + * @param i The position to scan right from + * @return whether a weight word is found + */ + private static boolean leftScanFindsWeightWord(List pl, int i) { + if (DEBUG) { + System.err.println("leftScan from: " + pl.get(i).word()); + } + for (int j = i - 1; j >= 0 && j >= i - 3; j--) { + CoreLabel fl = pl.get(j); + if (fl.word().startsWith("weigh")) { + if (DEBUG) { + System.err.println("leftScan found weight: " + fl.word()); + } + return true; + } + } + return false; + } + + + /** + * Look along CD words and see if next thing is a money word + * like cents or pounds. + * + * @param pl The list of CoreLabel + * @param i The position to scan right from + * @return Whether a money word is found + */ + private static boolean rightScanFindsMoneyWord(List pl, int i) { + int j = i; + if (DEBUG) { + System.err.println("rightScan from: " + pl.get(j).word()); + } + int sz = pl.size(); + while (j < sz && pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) { + j++; + } + if (j >= sz) { + return false; + } + String tag = pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class); + String word = pl.get(j).word(); + if (DEBUG) { + System.err.println("rightScan testing: " + word + '/' + tag + "; answer is: " + Boolean.toString((tag.equals("NN") || tag.equals("NNS")) && CURRENCY_WORD_PATTERN.matcher(word).matches())); + } + return (tag.equals("NN") || tag.equals("NNS")) && CURRENCY_WORD_PATTERN.matcher(word).matches(); + } + + // Implement other methods of AbstractSequenceClassifier interface + + @SuppressWarnings("unchecked") + @Override + public void train(Collection> docs, + DocumentReaderAndWriter readerAndWriter) { + } + + @Override + public void printProbsDocument(List document) { + } + + @Override + public void serializeClassifier(String serializePath) { + System.err.print("Serializing classifier to " + serializePath + "..."); + System.err.println("done."); + } + + @Override + public void 
loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException { + } + + @SuppressWarnings("unchecked") + public static void main(String[] args) throws Exception { + Properties props = StringUtils.argsToProperties(args); + NumberSequenceClassifier nsc = + new NumberSequenceClassifier(props, true, props); + String trainFile = nsc.flags.trainFile; + String testFile = nsc.flags.testFile; + String textFile = nsc.flags.textFile; + String loadPath = nsc.flags.loadClassifier; + String serializeTo = nsc.flags.serializeTo; + + if (loadPath != null) { + nsc.loadClassifierNoExceptions(loadPath); + nsc.flags.setProperties(props); + } else if (trainFile != null) { + nsc.train(trainFile); + } + + if (serializeTo != null) { + nsc.serializeClassifier(serializeTo); + } + + if (testFile != null) { + nsc.classifyAndWriteAnswers(testFile, nsc.makeReaderAndWriter()); + } + + if (textFile != null) { + DocumentReaderAndWriter readerAndWriter = + new PlainTextDocumentReaderAndWriter(); + nsc.classifyAndWriteAnswers(textFile, readerAndWriter); + } + } // end main + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/regexp/RegexNERSequenceClassifier.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/regexp/RegexNERSequenceClassifier.java new file mode 100644 index 0000000..cec839c --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ie/regexp/RegexNERSequenceClassifier.java @@ -0,0 +1,267 @@ +package edu.stanford.nlp.ie.regexp; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.ObjectInputStream; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.Properties; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import 
edu.stanford.nlp.ie.AbstractSequenceClassifier; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.sequences.DocumentReaderAndWriter; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; + +/** + * A sequence classifier that labels tokens with types based on a simple manual mapping from + * regular expressions to the types of the entities they are meant to describe. + * The user provides a file formatted as follows: + * regex1 TYPE overwritableType1,Type2... priority + * regex2 TYPE overwritableType1,Type2... priority + * ... + * where each argument is tab-separated, and the last two arguments are optional. Several regexes can be + * associated with a single type. In the case where multiple regexes match a phrase, the priority ranking + * is used to choose between the possible types. This classifier is designed to be used as part of a full NER + * system to label entities that don't fall into the usual NER categories. It only records the label + * if the token has not already been NER-annotated, or it has been annotated but the NER-type has been + * designated overwritable (the third argument). + * + * NOTE: Following Java regex conventions, some characters in the file need to be escaped. Only a single + * backslash should be used though, as they are not String literals. Spaces should only be used to + * separate regular expression tokens; within tokens \\s should be used instead. Genitives and commas + * at the end of words should be tokenized in the input file. 
+ * + * @author jtibs + * @author Mihai + * + */ +public class RegexNERSequenceClassifier extends AbstractSequenceClassifier { + private List entries; + + /** + * If true, it overwrites NE labels generated through this regex NER + * This is necessary because sometimes the RegexNERSequenceClassifier is run successively over the same text (e.g., to overwrite some older annotations) + */ + private boolean overwriteMyLabels; + + private Set myLabels; + + private boolean ignoreCase; + + public RegexNERSequenceClassifier(String mapping, boolean ignoreCase, boolean overwriteMyLabels) { + this(mapping, ignoreCase, overwriteMyLabels, DEFAULT_VALID_POS); + } + + /** + * Make a new instance of this classifier. The ignoreCase option allows case-insensitive + * regular expression matching, provided with the idea that the provided file might just + * be a manual list of the possible entities for each type. + * @param mapping + * @param ignoreCase + */ + public RegexNERSequenceClassifier(String mapping, boolean ignoreCase, boolean overwriteMyLabels, String validPosRegex) { + super(new Properties()); + if (validPosRegex != null && !validPosRegex.equals("")) { + validPosPattern = Pattern.compile(validPosRegex); + } else { + validPosPattern = null; + } + entries = readEntries(mapping, ignoreCase); + this.ignoreCase = ignoreCase; + this.overwriteMyLabels = overwriteMyLabels; + myLabels = Generics.newHashSet(); + if(this.overwriteMyLabels) { + for(Entry entry: entries) myLabels.add(entry.type); + } + //System.err.println("RegexNERSequenceClassifier using labels: " + + // myLabels); + } + + private static class Entry implements Comparable { + public List regex; // the regex, tokenized by splitting on white space + public String type; // the associated type + public Set overwritableTypes; + public double priority; + + public Entry(List regex, String type, Set overwritableTypes, double priority) { + this.regex = regex; + this.type = type.intern(); + this.overwritableTypes = 
overwritableTypes; + this.priority = priority; + } + + // if the given priorities are equal, an entry whose regex has more tokens is assigned + // a higher priority + public int compareTo(Entry other) { + if (this.priority > other.priority) + return -1; + if (this.priority < other.priority) + return 1; + return other.regex.size() - this.regex.size(); + } + } + + // TODO: make this a property? + // ms: but really this should be rewritten from scratch + // we should have a language to specify regexes over *tokens*, where each token could be a regular Java regex (over words, POSs, etc.) + private final Pattern validPosPattern; + public static final String DEFAULT_VALID_POS = "^(NN|JJ)"; + + private boolean containsValidPos(List tokens, int start, int end) { + if (validPosPattern == null) { + return true; + } + // System.err.println("CHECKING " + start + " " + end); + for(int i = start; i < end; i ++){ + // System.err.println("TAG = " + tokens.get(i).tag()); + if (tokens.get(i).tag() == null) { + throw new IllegalArgumentException("The regex ner was asked to check for valid tags on an untagged sequence. 
Either tag the sequence, perhaps with the pos annotator, or create the regex ner with an empty pos tag, perhaps with the flag regexner.validpospattern="); + } + Matcher m = validPosPattern.matcher(tokens.get(i).tag()); + if(m.find()) return true; + } + return false; + } + + @Override + public List classify(List document) { + for (Entry entry : entries) { + int start = 0; // the index of the token from which we begin our search each iteration + + while (true) { + // only search the part of the document that we haven't yet considered + // System.err.println("REGEX FIND MATCH FOR " + entry.regex.toString()); + start = findStartIndex(entry, document, start, myLabels); + if (start == -1) break; // no match found + + // make sure we annotate only valid POS tags + if (containsValidPos(document, start, start + entry.regex.size())) { + // annotate each matching token + for (int i = start; i < start + entry.regex.size(); i++) { + CoreLabel token = document.get(i); + token.set(CoreAnnotations.AnswerAnnotation.class, entry.type); + } + } + start++; + } + } + return document; + } + + public void train(Collection> docs, + DocumentReaderAndWriter readerAndWriter) {} + + public void printProbsDocument(List document) {} + + public void serializeClassifier(String serializePath) {} + + public void loadClassifier(ObjectInputStream in, Properties props) + throws IOException, ClassCastException, ClassNotFoundException {} + + /** + * Creates a combined list of Entries using the provided mapping file, and sorts them by + * first by priority, then the number of tokens in the regex. 
+ * + * @param mapping The path to a file of mappings + * @return a sorted list of Entries + */ + private List readEntries(String mapping, boolean ignoreCase) { + List entries = new ArrayList(); + + try { + BufferedReader rd = IOUtils.readerFromString(mapping); + + int lineCount = 0; + for (String line; (line = rd.readLine()) != null; ) { + lineCount ++; + String[] split = line.split("\t"); + if (split.length < 2 || split.length > 4) + throw new RuntimeException("Provided mapping file is in wrong format"); + + String[] regexes = split[0].trim().split("\\s+"); + String type = split[1].trim(); + Set overwritableTypes = Generics.newHashSet(); + overwritableTypes.add(flags.backgroundSymbol); + overwritableTypes.add(null); + double priority = 0; + List tokens = new ArrayList(); + + try { + if (split.length >= 3) + overwritableTypes.addAll(Arrays.asList(split[2].trim().split(","))); + if (split.length == 4) + priority = Double.parseDouble(split[3].trim()); + + for (String str : regexes) { + if(ignoreCase) tokens.add(Pattern.compile(str, Pattern.CASE_INSENSITIVE)); + else tokens.add(Pattern.compile(str)); + } + } catch(NumberFormatException e) { + System.err.println("ERROR: Invalid line " + lineCount + " in regexner file " + mapping + ": \"" + line + "\"!"); + throw e; + } + + entries.add(new Entry(tokens, type, overwritableTypes, priority)); + } + rd.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + Collections.sort(entries); + return entries; + } + + /** + * Checks if the entry's regex sequence is contained in the tokenized document, starting the search + * from index searchStart. Also requires that each token's current NER-type be overwritable, + * and that each token has not yet been Answer-annotated. 
+ * @param entry + * @param document + * @return on success, the index of the first token in the matching sequence, otherwise -1 + */ + private static int findStartIndex(Entry entry, List document, int searchStart, Set myLabels) { + List regex = entry.regex; + for (int start = searchStart; start <= document.size() - regex.size(); start++) { + boolean failed = false; + for (int i = 0; i < regex.size(); i++) { + Pattern pattern = regex.get(i); + CoreLabel token = document.get(start + i); + String NERType = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); + String currentType = token.get(CoreAnnotations.AnswerAnnotation.class); + + if (! pattern.matcher(token.word()).matches() || + currentType != null || + ! (entry.overwritableTypes.contains(NERType) || + myLabels.contains(NERType) || + NERType.equals("O"))) { + failed = true; + break; + } + } + if(! failed) { + //System.err.print("MATCHED REGEX:"); + //for(int i = start; i < start + regex.size(); i ++) System.err.print(" " + document.get(i).word()); + //System.err.println(); + return start; + } + } + return -1; + } + + @Override + public List classifyWithGlobalInformation(List tokenSeq, final CoreMap doc, final CoreMap sent) { + return classify(tokenSeq); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/Languages.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/Languages.java new file mode 100644 index 0000000..840f2cb --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/Languages.java @@ -0,0 +1,69 @@ +package edu.stanford.nlp.international; + +import edu.stanford.nlp.parser.lexparser.ArabicTreebankParserParams; +import edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams; +import edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams; +import edu.stanford.nlp.parser.lexparser.FrenchTreebankParserParams; +import 
edu.stanford.nlp.parser.lexparser.HebrewTreebankParserParams; +import edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams; +import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams; + +/** + * Constants and parameters for multilingual parsing. + * + * @author Spence Green + * + */ +public class Languages { + + private Languages() {} + + public static enum Language {Arabic,Chinese,English,German,French,Hebrew} + + private static String langList; + static { + StringBuilder sb = new StringBuilder(); + for(Language lang : Language.values()) { + sb.append(lang.toString()); + sb.append(" "); + } + langList = sb.toString().trim(); + } + + public static String listOfLanguages() { + return langList; + } + + public static TreebankLangParserParams getLanguageParams(String lang) { + return getLanguageParams(Language.valueOf(lang)); + } + + public static TreebankLangParserParams getLanguageParams(Language lang) { + TreebankLangParserParams tlpp; // initialized below + switch(lang) { + case Arabic: + tlpp = new ArabicTreebankParserParams(); + break; + + case Chinese: + tlpp = new ChineseTreebankParserParams(); + break; + + case German: + tlpp = new NegraPennTreebankParserParams(); + break; + + case French: + tlpp = new FrenchTreebankParserParams(); + break; + + case Hebrew: + tlpp = new HebrewTreebankParserParams(); + break; + + default: + tlpp = new EnglishTreebankParserParams(); + } + return tlpp; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/ArabicMorphoFeatureSpecification.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/ArabicMorphoFeatureSpecification.java new file mode 100644 index 0000000..68ea48c --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/ArabicMorphoFeatureSpecification.java @@ -0,0 +1,299 @@ +package edu.stanford.nlp.international.arabic; + +import java.io.*; +import 
java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.international.morph.MorphoFeatureSpecification; +import edu.stanford.nlp.international.morph.MorphoFeatures; + +/** + * Extracts morphosyntactic features from BAMA/SAMA analyses. Compatible with both the + * long tags in the ATB and the output of MADA. + * + * @author Spence Green + * + */ +public class ArabicMorphoFeatureSpecification extends MorphoFeatureSpecification { + + private static final long serialVersionUID = 4448045447200922076L; + + private static final String[] defVals = {"I", "D"}; + private static final String[] caseVals = {"NOM","ACC","GEN"}; + private static final String[] genVals = {"M","F"}; + private static final String[] numVals = {"SG","DU","PL"}; + private static final String[] perVals = {"1","2","3"}; + private static final String[] possVals = {"POSS"}; + private static final String[] voiceVals = {"ACT","PASS"}; + private static final String[] moodVals = {"I","S","J"}; + private static final String[] tenseVals = {"PAST","PRES","IMP"}; + + // Standard feature tuple (e.g., "3MS", "1P", etc.) 
+ private static final Pattern pFeatureTuple = Pattern.compile("(\\d\\p{Upper}\\p{Upper}?)"); + + // Demonstrative pronouns do not have number + private static final Pattern pDemPronounFeatures = Pattern.compile("DEM_PRON(.+)"); + + //Verbal patterns + private static final Pattern pVerbMood = Pattern.compile("MOOD|SUBJ"); + private static final Pattern pMood = Pattern.compile("_MOOD:([ISJ])"); + private static final Pattern pVerbTenseMarker = Pattern.compile("IV|PV|CV"); + private static final Pattern pNounNoMorph = Pattern.compile("PROP|QUANT"); + + @Override + public List getValues(MorphoFeatureType feat) { + if(feat == MorphoFeatureType.DEF) + return Arrays.asList(defVals); + else if(feat == MorphoFeatureType.CASE) { + throw new RuntimeException(this.getClass().getName() + ": Case is presently unsupported!"); +// return Arrays.asList(caseVals); + } else if(feat == MorphoFeatureType.GEN) + return Arrays.asList(genVals); + else if(feat == MorphoFeatureType.NUM) + return Arrays.asList(numVals); + else if(feat == MorphoFeatureType.PER) + return Arrays.asList(perVals); + else if(feat == MorphoFeatureType.POSS) + return Arrays.asList(possVals); + else if(feat == MorphoFeatureType.VOICE) + return Arrays.asList(voiceVals); + else if(feat == MorphoFeatureType.MOOD) + return Arrays.asList(moodVals); + else if(feat == MorphoFeatureType.TENSE) + return Arrays.asList(tenseVals); + else + throw new IllegalArgumentException("Arabic does not support feature type: " + feat.toString()); + } + + /** + * Hand-written rules to convert SAMA analyses to feature structures. + */ + @Override + public MorphoFeatures strToFeatures(String spec) { + MorphoFeatures features = new ArabicMorphoFeatures(); + + // Check for the boundary symbol + if(spec == null || spec.equals("")) { + return features; + } + //Possessiveness + if(isActive(MorphoFeatureType.POSS) && spec.contains("POSS")) { + features.addFeature(MorphoFeatureType.POSS,possVals[0]); + } + + //Nominals and pronominals. 
Mona ignores Pronominals in ERTS, but they seem to help... + // NSUFF -- declinable nominals + // VSUFF -- enclitic pronominals + // PRON -- ordinary pronominals + if(spec.contains("NSUFF") || spec.contains("NOUN") || spec.contains("ADJ")) { + // Nominal phi feature indicators are different than the indicators + // that we process with processInflectionalFeatures() + if(isActive(MorphoFeatureType.NGEN)) { + if(spec.contains("FEM")) { + features.addFeature(MorphoFeatureType.NGEN, genVals[1]); + } else if(spec.contains("MASC") || !pNounNoMorph.matcher(spec).find()) { + features.addFeature(MorphoFeatureType.NGEN, genVals[0]); + } + } + + // WSGDEBUG -- Number for nominals only + if(isActive(MorphoFeatureType.NNUM)) { + if(spec.contains("DU")) { + features.addFeature(MorphoFeatureType.NNUM, numVals[1]); + } else if(spec.contains("PL")) { + features.addFeature(MorphoFeatureType.NNUM, numVals[2]); + } else if (!pNounNoMorph.matcher(spec).find()){ // (spec.contains("SG")) + features.addFeature(MorphoFeatureType.NNUM, numVals[0]); + } + } + + //Definiteness + if(isActive(MorphoFeatureType.DEF)) { + if (spec.contains("DET")) { + features.addFeature(MorphoFeatureType.DEF, defVals[1]); + } else if (!pNounNoMorph.matcher(spec).find()){ + features.addFeature(MorphoFeatureType.DEF, defVals[0]); + } + } + + // Proper nouns (probably a stupid feature) + if (isActive(MorphoFeatureType.PROP)) { + if (spec.contains("PROP")) { + features.addFeature(MorphoFeatureType.PROP,""); + } + } + + } else if(spec.contains("PRON") || (spec.contains("VSUFF_DO") && !pVerbMood.matcher(spec).find())) { + if(spec.contains("DEM_PRON")) { + features.addFeature(MorphoFeatureType.DEF, defVals[0]); + Matcher m = pDemPronounFeatures.matcher(spec); + if (m.find()) { + spec = m.group(1); + processInflectionalFeaturesHelper(features, spec); + } + + } else { + processInflectionalFeatures(features, spec); + } + + // Verbs (marked for tense) + } else if(pVerbTenseMarker.matcher(spec).find()) { + + // Tense 
feature + if(isActive(MorphoFeatureType.TENSE)) { + if(spec.contains("PV")) + features.addFeature(MorphoFeatureType.TENSE, tenseVals[0]); + else if(spec.contains("IV")) + features.addFeature(MorphoFeatureType.TENSE, tenseVals[1]); + else if(spec.contains("CV")) + features.addFeature(MorphoFeatureType.TENSE, tenseVals[2]); + } + + // Inflectional features + processInflectionalFeatures(features, spec); + + if(isActive(MorphoFeatureType.MOOD)) { + Matcher moodMatcher = pMood.matcher(spec); + if(moodMatcher.find()) { + String moodStr = moodMatcher.group(1); + if(moodStr.equals("I")) + features.addFeature(MorphoFeatureType.MOOD, moodVals[0]); + else if(moodStr.equals("S")) + features.addFeature(MorphoFeatureType.MOOD, moodVals[1]); + else if(moodStr.equals("J")) + features.addFeature(MorphoFeatureType.MOOD, moodVals[2]); + } + } + + if(isActive(MorphoFeatureType.VOICE)) { + if(spec.contains("PASS")) { + features.addFeature(MorphoFeatureType.VOICE, voiceVals[1]); + } else { + features.addFeature(MorphoFeatureType.VOICE, voiceVals[0]); + } + } + } + return features; + } + + /** + * Extract features from a standard phi feature specification. 
+ * + * @param feats + * @param spec + */ + private void processInflectionalFeatures(MorphoFeatures feats, String spec) { + // Extract the feature tuple + Matcher m = pFeatureTuple.matcher(spec); + if (m.find()) { + spec = m.group(1); + processInflectionalFeaturesHelper(feats, spec); + } + } + + private void processInflectionalFeaturesHelper(MorphoFeatures feats, String spec) { + if(isActive(MorphoFeatureType.GEN)) { + if(spec.contains("M")) + feats.addFeature(MorphoFeatureType.GEN, genVals[0]); + else if(spec.contains("F")) + feats.addFeature(MorphoFeatureType.GEN, genVals[1]); + } + + if(isActive(MorphoFeatureType.NUM)) { + if(spec.endsWith("S")) + feats.addFeature(MorphoFeatureType.NUM, numVals[0]); + else if(spec.endsWith("D")) + feats.addFeature(MorphoFeatureType.NUM, numVals[1]); + else if(spec.endsWith("P")) + feats.addFeature(MorphoFeatureType.NUM, numVals[2]); + } + + if(isActive(MorphoFeatureType.PER)) { + if(spec.contains("1")) + feats.addFeature(MorphoFeatureType.PER, perVals[0]); + else if(spec.contains("2")) + feats.addFeature(MorphoFeatureType.PER, perVals[1]); + else if(spec.contains("3")) + feats.addFeature(MorphoFeatureType.PER, perVals[2]); + } + } + + /** + * Converts features specifications to labels for tagging + * + * @author Spence Green + * + */ + public static class ArabicMorphoFeatures extends MorphoFeatures { + + private static final long serialVersionUID = -4611776415583633186L; + + @Override + public MorphoFeatures fromTagString(String str) { + String[] feats = str.split("\\-"); + MorphoFeatures mFeats = new ArabicMorphoFeatures(); + // First element is the base POS +// String baseTag = feats[0]; + for(int i = 1; i < feats.length; i++) { + String[] keyValue = feats[i].split(KEY_VAL_DELIM); + if(keyValue.length != 2) continue; + MorphoFeatureType fName = MorphoFeatureType.valueOf(keyValue[0].trim()); + mFeats.addFeature(fName, keyValue[1].trim()); + } + return mFeats; + } + + @Override + public String getTag(String basePartOfSpeech) { + 
StringBuilder sb = new StringBuilder(basePartOfSpeech); + // Iterate over feature list so that features are added in the same order + // for every feature spec. + for (MorphoFeatureType feat : MorphoFeatureType.values()) { + if (hasFeature(feat)) { + sb.append(String.format("-%s:%s",feat,fSpec.get(feat))); + } + } + return sb.toString(); + } + } + + /** + * For debugging. Converts a set of long tags (BAMA analyses as in the ATB) to their morpho + * feature specification. The input file should have one long tag per line. + * + * @param args + */ + public static void main(String[] args) { + if(args.length != 2) { + System.err.printf("Usage: java %s filename feats%n", ArabicMorphoFeatureSpecification.class.getName()); + System.exit(-1); + } + + MorphoFeatureSpecification fSpec = new ArabicMorphoFeatureSpecification(); + String[] feats = args[1].split(","); + for(String feat : feats) { + MorphoFeatureType fType = MorphoFeatureType.valueOf(feat); + fSpec.activate(fType); + } + + File fName = new File(args[0]); + try { + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fName))); + + int nLine = 0; + for(String line;(line = br.readLine()) != null; nLine++) { + MorphoFeatures mFeats = fSpec.strToFeatures(line.trim()); + System.out.printf("%s\t%s%n", line.trim(), mFeats.toString()); + } + br.close(); + System.out.printf("%nRead %d lines%n",nLine); + + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/Buckwalter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/Buckwalter.java new file mode 100644 index 0000000..47d39d6 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/Buckwalter.java @@ -0,0 +1,291 @@ +package edu.stanford.nlp.international.arabic; + +import 
java.util.Map; +import java.util.StringTokenizer; +import java.util.regex.Pattern; +import java.io.*; + +import edu.stanford.nlp.io.EncodingPrintWriter; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.StringUtils; +import edu.stanford.nlp.process.SerializableFunction; + +/** This class can convert between Unicode and Buckwalter encodings of + * Arabic. + *

    + * Sources + *

    + * "MORPHOLOGICAL ANALYSIS & POS ANNOTATION," v3.8. LDC. 08 June 2009. + * + * http://www.ldc.upenn.edu/myl/morph/buckwalter.html + * http://www.qamus.org/transliteration.htm (Tim Buckwalter's site) + * http://www.livingflowers.com/Arabic_transliteration (many but hard to use) + * http://www.cis.upenn.edu/~cis639/arabic/info/romanization.html + * http://www.nongnu.org/aramorph/english/index.html (Java AraMorph) + * BBN's MBuckWalter2Unicode.tab + * see also my GALE-NOTES.txt file for other mappings ROSETTA people do. + * Normalization of decomposed characters to composed: + * ARABIC LETTER ALEF (\u0627), ARABIC MADDAH ABOVE (\u0653) -> + * ARABIC LETTER ALEF WITH MADDA ABOVE + * ARABIC LETTER ALEF (\u0627), ARABIC HAMZA ABOVE (\u0654) -> + * ARABIC LETTER ALEF WITH HAMZA ABOVE (\u0623) + * ARABIC LETTER WAW, ARABIC HAMZA ABOVE -> + * ARABIC LETTER WAW WITH HAMZA ABOVE + * ARABIC LETTER ALEF, ARABIC HAMZA BELOW (\u0655) -> + * ARABIC LETTER ALEF WITH HAMZA BELOW + * ARABIC LETTER YEH, ARABIC HAMZA ABOVE -> + * ARABIC LETTER YEH WITH HAMZA ABOVE + * + * @author Christopher Manning + * @author Spence Green + */ +public class Buckwalter implements SerializableFunction { + + private static final long serialVersionUID = 4351710914246859336L; + + /** + * If true (include flag "-o"), outputs space separated + * unicode values (e.g., "\u0621" rather than the character version of those values. + * Only applicable for Buckwalter to Arabic conversion. 
+ */ + boolean outputUnicodeValues = false; + + private final char[] arabicChars = { + '\u0621', '\u0622', '\u0623', '\u0624', '\u0625', '\u0626', '\u0627', + '\u0628', '\u0629', '\u062A', '\u062B', + '\u062C', '\u062D', '\u062E', '\u062F', + '\u0630', '\u0631', '\u0632', '\u0633', + '\u0634', '\u0635', '\u0636', '\u0637', '\u0638', '\u0639', '\u063A', + '\u0640', '\u0641', '\u0642', '\u0643', + '\u0644', '\u0645', '\u0646', '\u0647', + '\u0648', '\u0649', '\u064A', '\u064B', + '\u064C', '\u064D', '\u064E', '\u064F', + '\u0650', '\u0651', '\u0652', + '\u0670', '\u0671', + '\u067E', '\u0686', '\u0698', '\u06A4', '\u06AF', + '\u0625', '\u0623', '\u0624', // add Tim's "XML-friendly" just in case + '\u060C', '\u061B', '\u061F', // from BBN script; status unknown + '\u066A', '\u066B', // from IBM script + '\u06F0','\u06F1','\u06F2','\u06F3','\u06F4', //Farsi/Urdu cardinals + '\u06F5','\u06F6','\u06F7','\u06F8','\u06F9', + '\u0660', '\u0661', '\u0662', '\u0663', '\u0664', + '\u0665', '\u0666', '\u0667', '\u0668', '\u0669', + '\u00AB', '\u00BB' // French quotes used in e.g. Gulf newswire + }; + + private final char[] buckChars = { + '\'', '|', '>', '&', '<', '}', 'A', + 'b', 'p', 't', 'v', + 'j', 'H', 'x', 'd', // end 062x + '*', 'r', 'z', 's', + '$', 'S', 'D', 'T', 'Z', 'E', 'g', // end 063x + '_', 'f', 'q', 'k', + 'l', 'm', 'n', 'h', + 'w', 'Y', 'y', 'F', + 'N', 'K', 'a', 'u', // end 0064x + 'i', '~', 'o', + '`', '{', + 'P', 'J', 'R', 'V', 'G', // U+0698 is Farsi Jeh: R to ATB POS guidelines + 'I', 'O', 'W', // add Tim's "XML-friendly" versions just in case + ',', ';', '?', // from BBN script; status unknown + '%', '.', // from IBM script + '0', '1', '2', '3', '4', + '5', '6', '7', '8', '9', + '0', '1', '2', '3', '4', + '5', '6', '7', '8', '9', + '"', '"' // French quotes used in e.g. 
Gulf newswire + }; + + /* BBN also maps to @: 0x007B 0x066C 0x066D 0x0660 0x0661 0x0662 0x0663 + 0x0664 0x0665 0x0666 0x0667 0x0668 0x0669 0x066A + 0x0686 0x06AF 0x066D 0x06AF 0x06AA 0x06AB 0x06B1 + 0x06F0 0x06EC 0x06DF 0x06DF 0x06F4 0x002A 0x274A + 0x00E9 0x00C9 0x00AB 0x00BB 0x00A0 0x00A4 + */ + /* BBNWalter dispreferring punct chars: + '\u0624', '\u0625', '\u0626', -> 'L', 'M', 'Q', + '\u0630', -> 'C', '\u0640', -> '@', '\u0651', -> 'B', + */ + /* IBM also deletes: 654 655 670 */ + + private boolean unicode2Buckwalter = false; + private final Map u2bMap; + private final Map b2uMap; + private ClassicCounter unmappable; + + private static boolean DEBUG = false; + private static final boolean PASS_ASCII_IN_UNICODE = true; + private static boolean SUPPRESS_DIGIT_MAPPING_IN_B2A = true; + private static boolean SUPPRESS_PUNC_MAPPING_IN_B2A = true; + + //wsg: I have included _ in this list, which actually maps to tatweel. + //In practice we strip tatweel as part of orthographic normalization, + //so any instances of _ in the Buckwalter should actually be treated as + //punctuation. 
+ private static final Pattern latinPunc = Pattern.compile("[\"\\?%,-;\\._]+"); + + public Buckwalter() { + if (arabicChars.length != buckChars.length) + throw new RuntimeException(this.getClass().getName() + ": Inconsistent u2b/b2u arrays."); + + u2bMap = Generics.newHashMap(arabicChars.length); + b2uMap = Generics.newHashMap(buckChars.length); + for (int i = 0; i < arabicChars.length; i++) { + Character charU = Character.valueOf(arabicChars[i]); + Character charB = Character.valueOf(buckChars[i]); + u2bMap.put(charU, charB); + b2uMap.put(charB, charU); + } + + if (DEBUG) unmappable = new ClassicCounter(); + } + + public Buckwalter(boolean unicodeToBuckwalter) { + this(); + unicode2Buckwalter = unicodeToBuckwalter; + } + + public void suppressBuckDigitConversion(boolean b) { SUPPRESS_DIGIT_MAPPING_IN_B2A = b; } + + public void suppressBuckPunctConversion(boolean b) { SUPPRESS_PUNC_MAPPING_IN_B2A = b; } + + public String apply(String in) { return convert(in, unicode2Buckwalter); } + + public String buckwalterToUnicode(String in) { return convert(in, false); } + + public String unicodeToBuckwalter(String in) { return convert(in, true); } + + private String convert(String in, boolean unicodeToBuckwalter) { + final StringTokenizer st = new StringTokenizer(in); + final StringBuilder result = new StringBuilder(in.length()); + + while(st.hasMoreTokens()) { + final String token = st.nextToken(); + for (int i = 0; i < token.length(); i++) { + if(ATBTreeUtils.reservedWords.contains(token)) { + result.append(token); + break; + } + + final Character inCh = Character.valueOf(token.charAt(i)); + Character outCh = null; + + if (unicodeToBuckwalter) { + outCh = (PASS_ASCII_IN_UNICODE && inCh.charValue() < 127) ? 
inCh : u2bMap.get(inCh); + + } else if((SUPPRESS_DIGIT_MAPPING_IN_B2A && Character.isDigit(inCh)) || + (SUPPRESS_PUNC_MAPPING_IN_B2A && latinPunc.matcher(inCh.toString()).matches())) { + outCh = inCh; + + } else { + outCh = b2uMap.get(inCh); + } + + if (outCh == null) { + if (DEBUG) { + String key = inCh + "[U+" + + StringUtils.padLeft(Integer.toString(inCh, 16).toUpperCase(), 4, '0') + ']'; + unmappable.incrementCount(key); + } + result.append(inCh); // pass through char + + } else if(outputUnicodeValues) { + result.append("\\u").append(StringUtils.padLeft(Integer.toString(inCh, 16).toUpperCase(), 4, '0')); + + } else { + result.append(outCh); + } + } + result.append(" "); + } + + return result.toString().trim(); + } + + + private static final StringBuilder usage = new StringBuilder(); + static { + usage.append("Usage: java Buckwalter [OPTS] file (or < file)\n"); + usage.append("Options:\n"); + usage.append(" -u2b : Unicode -> Buckwalter (default is Buckwalter -> Unicode).\n"); + usage.append(" -d : Debug mode.\n"); + usage.append(" -o : Output unicode values.\n"); + } + + /** + * + * @param args + */ + public static void main(String[] args) { + + boolean unicodeToBuck = false; + boolean outputUnicodeValues = false; + File inputFile = null; + for(int i = 0; i < args.length; i++) { + if(args[i].startsWith("-")) { + if(args[i].equals("-u2b")) + unicodeToBuck = true; + else if(args[i].equals("-o")) + outputUnicodeValues = false; + else if(args[i].equals("-d")) + DEBUG = true; + else { + System.out.println(usage.toString()); + return; + } + + } else if(i != args.length) { + inputFile = new File(args[i]); + break; + } + } + + final Buckwalter b = new Buckwalter(unicodeToBuck); + b.outputUnicodeValues = outputUnicodeValues; + + int j = (b.outputUnicodeValues ? 
2 : Integer.MAX_VALUE); + if (j < args.length) { + for (; j < args.length; j++) + EncodingPrintWriter.out.println(args[j] + " -> " + b.apply(args[j]), "utf-8"); + + } else { + int numLines = 0; + try { + final BufferedReader br = (inputFile == null) ? new BufferedReader(new InputStreamReader(System.in, "utf-8")) : + new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), "utf-8")); + + System.err.printf("Reading input..."); + String line; + while ((line = br.readLine()) != null) { + EncodingPrintWriter.out.println(b.apply(line), "utf-8"); + numLines++; + } + br.close(); + + System.err.printf("done.\nConverted %d lines from %s.\n",numLines, + (unicodeToBuck ? "UTF-8 to Buckwalter" : "Buckwalter to UTF-8")); + + } catch (UnsupportedEncodingException e) { + System.err.println("ERROR: File system does not support UTF-8 encoding."); + + } catch (FileNotFoundException e) { + System.err.println("ERROR: File does not exist: " + inputFile.getPath()); + + } catch (IOException e) { + System.err.printf("ERROR: IO exception while reading file (line %d).\n",numLines); + } + } + + if (DEBUG) { + if ( ! 
b.unmappable.keySet().isEmpty()) { + EncodingPrintWriter.err.println("Characters that could not be converted [passed through!]:", "utf-8"); + EncodingPrintWriter.err.println(b.unmappable.toString(), "utf-8"); + } else { + EncodingPrintWriter.err.println("All characters successfully converted!", "utf-8"); + } + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/pipeline/DefaultLexicalMapper.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/pipeline/DefaultLexicalMapper.java new file mode 100644 index 0000000..502ead5 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/pipeline/DefaultLexicalMapper.java @@ -0,0 +1,225 @@ +package edu.stanford.nlp.international.arabic.pipeline; + +import java.io.File; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collections; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.international.arabic.Buckwalter; +import edu.stanford.nlp.trees.treebank.Mapper; +import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils; +import edu.stanford.nlp.util.Generics; + +/** + * Applies a default set of lexical transformations that have been empirically validated + * in various Arabic tasks. This class automatically detects the input encoding and applies + * the appropriate set of transformations. 
+ * + * @author Spence Green + * + */ +public class DefaultLexicalMapper implements Mapper, Serializable { + + private static final long serialVersionUID = -3798804368296999785L; + + private final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]"); + + //Buckwalter patterns + private final String bwAlefChar = "A"; //U+0627 + private final Pattern bwDiacritics = Pattern.compile("F|N|K|a|u|i|\\~|o"); + private final Pattern bwTatweel = Pattern.compile("_"); + private final Pattern bwAlef = Pattern.compile("\\{|\\||>|<"); + private final Pattern bwQuran = Pattern.compile("`"); + private final Pattern bwNullAnaphoraMarker = Pattern.compile("\\[nll\\]"); + + public final Pattern latinPunc = Pattern.compile("([\u0021-\u002F\u003A-\u0040\\u005B\u005C\\u005D\u005E-\u0060\u007B-\u007E\u00A1-\u00BF\u2010-\u2027\u2030-\u205E\u20A0-\u20B5])+"); + public final Pattern arabicPunc = Pattern.compile("([\u00AB\u00BB\u0609-\u060D\u061B-\u061F\u066A\u066C-\u066D\u06D4])+"); + + public final Pattern arabicDigit = Pattern.compile("([\u06F0-\u06F9\u0660-\u0669])+"); + + //TODO Extend coverage to entire Arabic code chart + //Obviously Buckwalter is a lossful conversion, but no assumptions should be made about + //UTF-8 input from "the wild" + private final Pattern utf8Diacritics = Pattern.compile("َ|ً|ُ|ٌ|ِ|ٍ|ّ|ْ|\u0670"); + private final Pattern utf8Tatweel = Pattern.compile("ـ"); + private final Pattern utf8Alef = Pattern.compile("ا|إ|أ|آ|\u0671"); + private final Pattern utf8Quran = Pattern.compile("[\u0615-\u061A\u06D6-\u06E5]"); + private final Pattern utf8ProDrop = Pattern.compile("\\[نلل\\]"); + + //Patterns to fix segmentation issues observed in the ATB + public final Pattern segmentationMarker = Pattern.compile("^-+|-+$"); + private final Pattern morphemeBoundary = Pattern.compile("\\+"); + + private final Pattern hasDigit = Pattern.compile("\\d+"); + + // Process the vocalized section for parsing + private boolean useATBVocalizedSectionMapping = false; + + // Strip 
morpheme boundary markers in the vocalized section + private boolean stripMorphemeMarkersInUTF8 = false; + + // Strip all morpheme and segmentation markers in UTF-8 Arabic + private boolean stripSegmentationMarkersInUTF8 = false; + + //wsg: "LATIN" does not appear in the Bies tagset, so be sure to pass + //in the extended POS tags during normalization + private final String parentTagString = "PUNC LATIN -NONE-"; + private final Set parentTagsToEscape; + + private final String utf8CliticString = "ل ف و ما ه ها هم هن نا كم تن تم ى ي هما ك ب م"; +// private final Set utf8Clitics; + private final Set bwClitics; + + public DefaultLexicalMapper() { + parentTagsToEscape = + Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(parentTagString.split("\\s+")))); + +// utf8Clitics = +// Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(utf8CliticString.split("\\s+")))); + + Buckwalter bw = new Buckwalter(true); + String bwString = bw.apply(utf8CliticString); + bwClitics = + Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(bwString.split("\\s+")))); + } + + private String mapUtf8(String element) { + Matcher latinPuncOnly = latinPunc.matcher(element); + Matcher arbPuncOnly = arabicPunc.matcher(element); + if(latinPuncOnly.matches() || arbPuncOnly.matches()) return element; + + //Remove diacritics + Matcher rmDiacritics = utf8Diacritics.matcher(element); + element = rmDiacritics.replaceAll(""); + + if(element.length() > 1) { + Matcher rmTatweel = utf8Tatweel.matcher(element); + element = rmTatweel.replaceAll(""); + } + + //Normalize alef + Matcher normAlef = utf8Alef.matcher(element); + element = normAlef.replaceAll("ا"); + + //Remove characters that only appear in the Qur'an + Matcher rmQuran = utf8Quran.matcher(element); + element = rmQuran.replaceAll(""); + + Matcher rmProDrop = utf8ProDrop.matcher(element); + element = rmProDrop.replaceAll(""); + + if (stripMorphemeMarkersInUTF8) { + Matcher rmMorphemeBoundary = 
morphemeBoundary.matcher(element); + String strippedElem = rmMorphemeBoundary.replaceAll(""); + if(strippedElem.length() > 0) + element = strippedElem; + } + if (stripSegmentationMarkersInUTF8) { + String strippedElem = segmentationMarker.matcher(element).replaceAll(""); + if(strippedElem.length() > 0) + element = strippedElem; + } + + return element; + } + + private String mapBuckwalter(String element) { + Matcher puncOnly = latinPunc.matcher(element); + if(puncOnly.matches()) return element; + + //Remove diacritics + Matcher rmDiacritics = bwDiacritics.matcher(element); + element = rmDiacritics.replaceAll(""); + + //Remove tatweel + if(element.length() > 1) { + Matcher rmTatweel = bwTatweel.matcher(element); + element = rmTatweel.replaceAll(""); + } + + //Normalize alef + Matcher normAlef = bwAlef.matcher(element); + element = normAlef.replaceAll(bwAlefChar); + + //Remove characters that only appear in the Qur'an + Matcher rmQuran = bwQuran.matcher(element); + element = rmQuran.replaceAll(""); + + Matcher rmProDrop = bwNullAnaphoraMarker.matcher(element); + element = rmProDrop.replaceAll(""); + + // This conditional is used for normalizing raw ATB trees + // Morpheme boundaries are removed, and segmentation markers are retained on + // segmented morphemes (not the tokens to which the morphemes were attached) + if (useATBVocalizedSectionMapping && element.length() > 1) { + Matcher rmMorphemeBoundary = morphemeBoundary.matcher(element); + element = rmMorphemeBoundary.replaceAll(""); + + //wsg: This is hairy due to tokens like this in the vocalized section: + // layos-+-a + Matcher cliticMarker = segmentationMarker.matcher(element); + if(cliticMarker.find() && !hasDigit.matcher(element).find()) { + String strippedElem = cliticMarker.replaceAll(""); + if(strippedElem.length() > 0) + element = bwClitics.contains(strippedElem) ? 
element : strippedElem; + } + + } else if (element.length() > 1 && !ATBTreeUtils.reservedWords.contains(element)) { + Matcher rmCliticMarker = segmentationMarker.matcher(element); + element = rmCliticMarker.replaceAll(""); + } + + return element; + } + + public String map(String parent, String element) { + String elem = element.trim(); + + if(parent != null && parentTagsToEscape.contains(parent)) + return elem; + + Matcher utf8Encoding = utf8ArabicChart.matcher(elem); + return (utf8Encoding.find()) ? mapUtf8(elem) : mapBuckwalter(elem); + } + + public void setup(File path, String... options) { + if(options == null) return; + + for(int i = 0; i < options.length; i++) { + final String opt = options[i]; + if (opt.equals("ATBVocalizedSection")) { + useATBVocalizedSectionMapping = true; + } else if (opt.equals("StripSegMarkersInUTF8")) { + stripSegmentationMarkersInUTF8 = true; + } else if (opt.equals("StripMorphMarkersInUTF8")) { + stripMorphemeMarkersInUTF8 = true; + } + } + } + + //Whether or not the encoding of this word can be converted to another encoding + //from its current encoding (Buckwalter or UTF-8) + public boolean canChangeEncoding(String parent, String element) { + parent = parent.trim(); + element = element.trim(); + + //Hack for LDC2008E22 idiosyncrasy + //This is NUMERIC_COMMA in the raw trees. 
We allow conversion of this + //token to UTF-8 since it would appear in this encoding in arbitrary + //UTF-8 text input + if(parent.contains("NUMERIC_COMMA") || (parent.contains("PUNC") && element.equals("r"))) //Numeric comma + return true; + + Matcher numMatcher = hasDigit.matcher(element); + return !(numMatcher.find() || parentTagsToEscape.contains(parent)); + } + + public static void main(String[] args) { + Mapper m = new DefaultLexicalMapper(); + + System.out.printf("< :-> %s\n",m.map(null, "FNKqq")); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/process/ArabicLexer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/process/ArabicLexer.java new file mode 100644 index 0000000..e38bac8 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/process/ArabicLexer.java @@ -0,0 +1,1187 @@ +/* The following code was generated by JFlex 1.4.3 on 3/20/13 11:23 AM */ + +package edu.stanford.nlp.international.arabic.process; + +import java.io.Reader; +import java.util.Map; +import java.util.Properties; +import java.util.regex.Pattern; + +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.process.LexedTokenFactory; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.PropertiesUtils; + +/** + * Tokenizer for UTF-8 Arabic. Supports raw text and both sections + * (vocalized and unvocalized) of the ATB. 
+ * + * @author Spence Green + */ + + +class ArabicLexer { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\11\0\1\34\1\2\1\3\1\4\1\1\22\0\1\34\1\36\1\35"+ + "\1\7\1\42\2\7\1\42\2\35\1\7\1\22\1\37\1\14\1\6"+ + "\1\33\12\43\1\32\1\42\1\35\1\7\1\35\1\36\1\56\2\41"+ + "\1\44\1\53\1\50\1\41\1\52\1\26\4\41\1\46\1\47\1\45"+ + "\1\30\1\41\1\51\1\31\1\27\1\54\1\41\1\40\3\41\1\15"+ + "\1\11\1\21\1\7\1\13\1\42\2\41\1\44\1\53\1\50\1\41"+ + "\1\52\1\26\4\41\1\46\1\47\1\45\1\30\1\41\1\51\1\31"+ + "\1\27\1\54\1\41\1\40\3\41\1\36\1\35\1\36\1\7\6\0"+ + "\1\3\32\0\1\55\11\7\1\24\2\7\1\24\7\7\1\24\4\7"+ + "\1\24\5\7\27\23\1\0\37\23\1\0\u013f\23\31\23\162\23\4\23"+ + "\14\23\16\23\5\23\11\23\1\23\213\23\1\23\3\23\6\0\2\23"+ + "\1\23\1\0\3\23\1\0\1\23\1\0\24\23\1\0\54\23\1\23"+ + "\46\23\1\23\5\23\4\23\202\23\1\0\5\23\2\0\105\23\1\23"+ + "\46\23\2\23\2\23\6\23\20\23\26\23\13\0\46\23\2\0\1\23"+ + "\6\23\1\0\47\23\11\0\55\23\1\0\1\23\1\0\2\23\1\0"+ + "\2\23\1\0\1\23\10\0\33\23\5\0\3\23\15\0\14\7\2\11"+ + "\15\12\5\7\1\0\32\25\5\12\4\25\1\20\1\25\1\16\4\25"+ + "\4\12\1\17\17\12\1\0\12\10\1\7\2\11\1\7\2\25\1\12"+ + "\143\25\1\7\1\25\17\12\2\25\7\12\2\25\12\10\3\25\2\12"+ + "\1\25\17\0\1\23\1\23\1\23\36\23\35\23\3\23\60\23\46\23"+ + "\13\23\1\23\30\0\54\23\4\0\1\23\u0105\0\4\23\66\23\2\0"+ + "\1\23\1\23\21\23\1\0\1\23\5\23\2\0\12\23\2\23\2\0"+ + 
"\12\10\21\0\3\23\1\0\10\23\2\0\2\23\2\0\26\23\1\0"+ + "\7\23\1\0\1\23\3\0\4\23\2\0\1\23\1\23\7\23\2\0"+ + "\2\23\2\0\3\23\11\0\1\23\4\0\2\23\1\0\3\23\2\23"+ + "\2\0\12\10\2\23\17\0\3\23\1\0\6\23\4\0\2\23\2\0"+ + "\26\23\1\0\7\23\1\0\2\23\1\0\2\23\1\0\2\23\2\0"+ + "\1\23\1\0\22\23\11\0\4\23\1\0\1\23\7\0\12\10\2\0"+ + "\3\23\14\0\3\23\1\0\11\23\1\0\3\23\1\0\26\23\1\0"+ + "\7\23\1\0\2\23\1\0\5\23\2\0\1\23\1\23\22\23\1\23"+ + "\17\0\2\23\4\0\12\10\25\0\10\23\2\0\2\23\2\0\26\23"+ + "\1\0\7\23\1\0\2\23\1\0\5\23\3\0\1\23\36\0\2\23"+ + "\1\0\3\23\4\0\12\10\1\0\1\23\20\0\1\23\1\23\1\0"+ + "\6\23\3\0\3\23\1\0\4\23\3\0\2\23\1\0\1\23\1\0"+ + "\2\23\3\0\2\23\3\0\3\23\3\0\10\23\1\0\3\23\4\0"+ + "\5\23\3\0\3\23\1\0\4\23\31\0\11\10\21\0\3\23\1\0"+ + "\10\23\1\0\3\23\1\0\27\23\1\0\12\23\1\0\5\23\4\0"+ + "\31\23\11\0\2\23\4\0\12\10\25\0\10\23\1\0\3\23\1\0"+ + "\27\23\1\0\12\23\1\0\5\23\3\0\1\23\40\0\1\23\1\0"+ + "\2\23\4\0\12\10\25\0\10\23\1\0\3\23\1\0\27\23\1\0"+ + "\20\23\4\0\7\23\1\0\3\23\27\0\2\23\4\0\12\10\25\0"+ + "\22\23\3\0\30\23\1\0\11\23\1\0\1\23\2\0\7\23\72\0"+ + "\57\23\1\23\1\23\2\23\7\23\5\0\7\23\10\23\1\0\12\10"+ + "\47\0\2\23\1\0\1\23\2\0\2\23\1\0\1\23\2\0\1\23"+ + "\6\0\4\23\1\0\7\23\1\0\3\23\1\0\1\23\1\0\1\23"+ + "\2\0\2\23\1\0\4\23\1\23\2\23\11\23\1\23\2\0\5\23"+ + "\1\0\1\23\1\0\6\23\2\0\12\10\2\0\2\23\42\0\1\23"+ + "\37\0\12\10\26\0\10\23\1\0\42\23\35\0\4\23\164\0\42\23"+ + "\1\0\5\23\1\0\2\23\25\0\12\10\6\0\6\23\112\0\46\23"+ + "\12\0\51\23\7\0\132\23\5\0\104\23\5\0\122\23\6\0\7\23"+ + "\1\0\77\23\1\0\1\23\1\0\4\23\2\0\7\23\1\0\1\23"+ + "\1\0\4\23\2\0\47\23\1\0\1\23\1\0\4\23\2\0\37\23"+ + "\1\0\1\23\1\0\4\23\2\0\7\23\1\0\1\23\1\0\4\23"+ + "\2\0\7\23\1\0\7\23\1\0\27\23\1\0\37\23\1\0\1\23"+ + "\1\0\4\23\2\0\7\23\1\0\47\23\1\0\23\23\16\0\11\10"+ + "\56\0\125\23\14\0\u026c\23\2\0\10\23\12\0\32\23\5\0\113\23"+ + "\25\0\15\23\1\0\4\23\16\0\22\23\16\0\22\23\16\0\15\23"+ + "\1\0\3\23\17\0\64\23\43\0\1\23\4\0\1\23\3\0\12\10"+ + 
"\46\0\12\10\6\0\130\23\10\0\51\23\127\0\35\23\51\0\12\10"+ + "\36\23\2\0\5\23\u038b\0\154\23\224\0\234\23\4\0\132\23\6\0"+ + "\26\23\2\0\6\23\2\0\46\23\2\0\6\23\2\0\10\23\1\0"+ + "\1\23\1\0\1\23\1\0\1\23\1\0\37\23\2\0\65\23\1\0"+ + "\7\23\1\0\1\23\3\0\3\23\1\0\7\23\3\0\4\23\2\0"+ + "\6\23\4\0\15\23\5\0\3\23\1\0\7\23\3\0\13\5\5\0"+ + "\30\7\1\3\1\3\5\0\1\5\57\7\22\0\1\23\15\0\1\23"+ + "\40\0\26\7\114\0\1\23\4\0\1\23\2\0\12\23\1\0\1\23"+ + "\3\0\5\23\6\0\1\23\1\0\1\23\1\0\1\23\1\0\4\23"+ + "\1\0\3\23\1\0\7\23\3\0\3\23\5\0\5\23\u0ce4\0\1\7"+ + "\u01d1\0\1\5\4\0\2\23\52\0\5\23\5\0\2\23\4\0\126\23"+ + "\6\0\3\23\1\0\132\23\1\0\4\23\5\0\50\23\4\0\136\23"+ + "\21\0\30\23\70\0\20\23\u0200\0\u19b6\23\112\0\u51a6\23\132\0\u048d\23"+ + "\u0773\0\u2ba4\23\u215c\0\u012e\23\2\0\73\23\225\0\7\23\14\0\5\23"+ + "\5\0\1\23\1\0\12\23\1\0\15\23\1\0\5\23\1\0\1\23"+ + "\1\0\2\23\1\0\2\23\1\0\154\23\41\0\u016b\23\22\0\100\23"+ + "\2\0\66\23\50\0\14\23\164\0\5\23\1\0\207\23\23\0\12\10"+ + "\7\0\32\23\6\0\32\23\13\0\131\23\3\0\6\23\2\0\6\23"+ + "\2\0\6\23\2\0\3\23\43\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\1\1\3\2\1\3\3\4\1\5\3\4\2\5"+ + "\1\4\2\5\1\4\1\3\1\5\1\4\2\0\5\4"+ + "\1\5\2\4\1\5\2\4\4\0\1\5\1\0\1\5"+ + "\5\0\1\6\6\0\1\5\2\0\1\5\12\0\1\5"+ + "\3\0\1\4\2\0\1\4\1\5\20\0\1\7\1\0"+ + "\6\4\10\0\1\7\3\4\1\0\1\4\2\0\5\4"+ + "\1\0\6\4"; + + private static int [] zzUnpackAction() { + int [] result = new int[129]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\57\0\136\0\215\0\57\0\274\0\353\0\u011a"+ + "\0\u0149\0\u0178\0\u01a7\0\u01d6\0\u0205\0\u0234\0\u0263\0\u0292"+ + "\0\u02c1\0\u02f0\0\u031f\0\u034e\0\u037d\0\u03ac\0\57\0\u03db"+ + "\0\u040a\0\u0439\0\u0468\0\u0497\0\u04c6\0\u04f5\0\u0524\0\u0553"+ + "\0\u0582\0\u05b1\0\u05e0\0\u060f\0\u063e\0\u066d\0\u069c\0\u06cb"+ + "\0\u06fa\0\u0729\0\u0758\0\u0787\0\u07b6\0\u07e5\0\u0814\0\u040a"+ + "\0\u0843\0\u0872\0\u08a1\0\u08d0\0\u08ff\0\u092e\0\u095d\0\u098c"+ + "\0\u09bb\0\u09ea\0\u0a19\0\u0a48\0\u0a77\0\u0aa6\0\u0ad5\0\u0b04"+ + "\0\u0b33\0\u0b62\0\u0b91\0\u0bc0\0\u0bef\0\u0c1e\0\u0c4d\0\u0c7c"+ + "\0\u0cab\0\u0cda\0\u0d09\0\u0d38\0\u0d67\0\u0d96\0\u0dc5\0\u0df4"+ + "\0\u0e23\0\u0e52\0\u0e81\0\u0eb0\0\u0edf\0\u0f0e\0\u0f3d\0\u0f6c"+ + "\0\u0f9b\0\u0fca\0\u0ff9\0\u1028\0\u1057\0\u1086\0\u10b5\0\u10e4"+ + "\0\u1113\0\u1142\0\u1171\0\u11a0\0\u11cf\0\u11fe\0\u122d\0\u125c"+ + 
"\0\u128b\0\u12ba\0\u12e9\0\u1318\0\u1347\0\u1376\0\u13a5\0\u13d4"+ + "\0\u1403\0\u1432\0\u1461\0\u1490\0\u14bf\0\u1347\0\u098c\0\u09bb"+ + "\0\u14ee\0\u151d\0\u154c\0\u157b\0\u15aa\0\u15d9\0\u1608\0\u1637"+ + "\0\u1666"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[129]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\4\1\5\1\4\1\6\1\7\1\10"+ + "\1\11\1\10\1\12\1\13\1\14\1\15\1\16\1\12"+ + "\1\16\2\10\1\17\1\20\1\16\1\21\3\22\2\23"+ + "\1\24\3\23\1\25\1\22\1\23\1\26\11\22\1\6"+ + "\1\10\1\27\2\0\1\27\1\0\1\27\1\30\5\27"+ + "\1\0\15\27\6\0\2\27\2\0\13\27\2\0\1\4"+ + "\133\0\1\27\2\0\1\27\1\0\1\6\1\30\5\27"+ + "\1\0\15\27\2\0\1\24\3\0\2\27\2\0\11\27"+ + "\1\6\1\27\6\0\1\31\1\23\1\0\1\23\1\0"+ + "\3\23\3\0\2\23\1\0\1\23\5\0\2\23\1\0"+ + "\3\23\2\0\1\23\13\0\1\23\1\27\2\0\1\27"+ + "\1\0\1\27\1\32\1\10\1\27\1\10\1\27\1\10"+ + "\1\23\1\10\3\27\2\10\1\27\1\10\5\27\2\23"+ + "\1\0\3\23\2\27\1\23\1\0\12\27\1\10\1\27"+ + "\2\0\1\27\1\0\1\27\1\33\1\27\2\11\1\27"+ + "\1\11\1\34\5\27\1\11\7\27\1\0\1\34\3\0"+ + "\1\34\2\27\1\0\1\34\14\27\2\0\1\27\1\0"+ + "\1\27\1\30\1\27\1\35\1\27\2\12\1\36\1\27"+ + "\3\12\1\27\1\12\2\27\1\12\4\27\6\0\2\27"+ + "\1\0\1\37\14\27\2\0\1\27\1\0\1\27\1\32"+ + "\1\10\1\35\1\10\1\12\1\13\1\40\1\10\3\12"+ + "\1\10\1\13\1\27\1\10\1\12\4\27\2\23\1\0"+ + "\3\23\2\27\1\23\1\37\12\27\1\10\6\0\2\23"+ + "\1\0\1\23\1\41\1\42\1\14\1\43\3\41\2\23"+ + "\1\0\1\23\1\41\4\0\2\23\1\0\3\23\2\0"+ + 
"\1\23\13\0\1\23\1\27\2\0\1\27\1\0\1\27"+ + "\1\32\1\10\1\27\1\10\1\27\1\10\1\23\1\10"+ + "\1\44\2\27\2\10\1\27\1\10\5\27\2\23\1\0"+ + "\3\23\2\27\1\23\1\0\12\27\1\10\1\27\2\0"+ + "\1\27\1\0\1\27\1\30\1\27\1\35\1\27\2\12"+ + "\1\36\1\27\1\16\1\12\1\16\1\27\1\12\2\17"+ + "\1\16\4\17\6\0\2\17\1\0\1\37\11\17\3\27"+ + "\2\0\1\27\1\0\1\27\1\30\5\27\1\0\1\27"+ + "\1\17\1\27\1\17\2\27\7\17\6\0\2\17\2\0"+ + "\11\17\3\27\2\0\1\27\1\0\1\27\1\32\1\10"+ + "\1\27\1\10\1\27\1\10\1\23\1\10\1\17\1\27"+ + "\1\17\2\10\1\17\1\20\5\17\2\23\1\0\3\23"+ + "\2\17\1\23\1\0\11\17\1\27\1\10\1\45\2\0"+ + "\1\45\1\0\1\45\1\46\5\45\1\47\1\45\1\22"+ + "\1\45\1\22\2\45\4\22\1\50\2\22\2\47\2\0"+ + "\2\47\2\22\2\47\11\22\1\27\1\51\1\45\2\0"+ + "\1\45\1\0\1\45\1\46\5\45\1\47\1\45\1\22"+ + "\1\45\1\22\2\45\7\22\2\47\2\0\2\47\2\22"+ + "\2\47\11\22\1\27\1\51\6\0\2\23\1\0\1\23"+ + "\1\0\3\23\3\0\2\23\1\0\1\23\5\0\2\23"+ + "\1\0\3\23\2\0\1\23\13\0\1\23\5\0\1\24"+ + "\26\0\1\24\20\0\1\24\1\0\1\45\2\0\1\45"+ + "\1\0\1\45\1\46\5\45\1\47\1\45\1\22\1\45"+ + "\1\22\2\45\7\22\2\47\2\0\2\47\1\52\1\22"+ + "\2\47\11\22\1\27\1\51\1\47\2\0\1\47\1\0"+ + "\1\47\1\26\1\47\2\26\1\47\2\26\5\47\1\26"+ + "\10\47\1\26\2\0\1\47\1\26\3\47\1\26\11\47"+ + "\1\0\1\53\1\27\2\0\1\27\1\0\1\27\1\0"+ + "\5\27\1\0\15\27\6\0\2\27\2\0\1\54\1\55"+ + "\1\27\1\56\1\57\6\27\6\0\1\60\1\23\1\0"+ + "\1\23\1\0\3\23\3\0\2\23\1\0\1\23\5\0"+ + "\2\23\1\0\3\23\2\0\1\23\13\0\1\23\1\27"+ + "\2\0\1\27\1\0\1\27\1\23\1\10\1\27\1\10"+ + "\1\27\1\10\1\23\1\10\3\27\2\10\1\27\1\10"+ + "\5\27\2\23\1\0\3\23\2\27\1\23\1\0\1\54"+ + "\1\55\1\27\1\56\1\57\5\27\1\10\1\27\2\0"+ + "\1\27\1\0\1\27\1\34\1\27\2\11\1\27\1\11"+ + "\1\34\5\27\1\11\7\27\1\0\1\34\3\0\1\34"+ + "\2\27\1\0\1\34\1\54\1\55\1\27\1\56\1\57"+ + "\6\27\6\0\1\34\1\0\2\34\1\0\2\34\5\0"+ + "\1\34\10\0\1\34\3\0\1\34\3\0\1\34\13\0"+ + "\1\27\2\0\1\27\1\0\1\27\1\30\1\27\1\35"+ + "\3\27\1\0\15\27\6\0\2\27\1\0\1\37\13\27"+ + "\10\0\1\37\3\0\1\36\26\0\1\37\23\0\1\37"+ + 
"\32\0\1\37\21\0\2\23\1\37\1\23\1\0\1\23"+ + "\1\40\1\23\3\0\2\23\1\0\1\23\5\0\2\23"+ + "\1\0\3\23\2\0\1\23\1\37\12\0\1\23\10\0"+ + "\1\37\1\0\2\41\1\36\1\0\3\41\1\0\1\41"+ + "\2\0\1\41\15\0\1\37\21\0\2\23\1\37\1\23"+ + "\1\41\1\42\1\40\1\23\3\41\1\23\1\42\1\0"+ + "\1\23\1\41\4\0\2\23\1\0\3\23\2\0\1\23"+ + "\1\37\12\0\1\23\6\0\2\23\1\0\1\23\1\0"+ + "\3\23\1\61\2\0\2\23\1\0\1\23\5\0\2\23"+ + "\1\0\3\23\2\0\1\23\13\0\1\23\1\27\2\0"+ + "\1\27\1\0\1\27\1\30\5\27\1\0\2\27\1\62"+ + "\12\27\6\0\2\27\2\0\13\27\1\45\2\0\1\45"+ + "\1\0\1\45\1\46\5\45\1\47\15\45\2\47\2\0"+ + "\2\47\2\45\2\47\11\45\1\27\1\51\1\45\2\0"+ + "\1\45\1\0\1\45\1\47\5\45\1\47\15\45\2\47"+ + "\2\0\2\47\2\45\2\47\1\63\1\64\1\45\1\65"+ + "\1\66\4\45\1\27\1\51\1\47\2\0\1\47\1\0"+ + "\27\47\2\0\17\47\1\0\1\53\1\45\2\0\1\45"+ + "\1\0\1\45\1\46\5\45\1\47\1\45\1\22\1\45"+ + "\1\22\2\45\4\22\1\67\2\22\2\47\2\0\2\47"+ + "\2\22\2\47\11\22\1\27\1\51\1\70\2\0\1\70"+ + "\1\0\1\70\1\46\5\70\1\71\15\70\2\71\2\0"+ + "\2\71\2\70\2\71\11\70\1\27\1\70\1\45\2\0"+ + "\1\45\1\0\1\45\1\46\5\45\1\47\1\45\1\22"+ + "\1\45\1\22\2\45\7\22\2\47\2\0\2\47\1\72"+ + "\1\22\2\47\11\22\1\27\1\51\1\71\2\0\1\71"+ + "\1\0\1\71\1\47\25\71\2\0\17\71\1\0\1\71"+ + "\1\27\2\0\1\27\1\0\1\27\1\30\5\27\1\0"+ + "\15\27\6\0\2\27\2\0\1\27\1\73\12\27\2\0"+ + "\1\27\1\0\1\27\1\30\5\27\1\0\15\27\6\0"+ + "\2\27\2\0\5\27\1\74\6\27\2\0\1\27\1\0"+ + "\1\27\1\30\5\27\1\0\15\27\6\0\2\27\2\0"+ + "\4\27\1\75\7\27\2\0\1\27\1\0\1\27\1\30"+ + "\5\27\1\0\15\27\6\0\2\27\2\0\7\27\1\76"+ + "\3\27\17\0\1\77\37\0\1\27\2\0\1\27\1\0"+ + "\1\27\1\30\5\27\1\0\3\27\1\100\11\27\6\0"+ + "\2\27\2\0\13\27\1\45\2\0\1\45\1\0\1\45"+ + "\1\46\5\45\1\47\15\45\2\47\2\0\2\47\2\45"+ + "\2\47\1\45\1\101\7\45\1\27\1\51\1\45\2\0"+ + "\1\45\1\0\1\45\1\46\5\45\1\47\15\45\2\47"+ + "\2\0\2\47\2\45\2\47\5\45\1\102\3\45\1\27"+ + "\1\51\1\45\2\0\1\45\1\0\1\45\1\46\5\45"+ + "\1\47\15\45\2\47\2\0\2\47\2\45\2\47\4\45"+ + "\1\103\4\45\1\27\1\51\1\45\2\0\1\45\1\0"+ + 
"\1\45\1\46\5\45\1\47\15\45\2\47\2\0\2\47"+ + "\2\45\2\47\7\45\1\104\1\45\1\27\1\51\1\45"+ + "\2\0\1\45\1\0\1\45\1\46\5\45\1\47\1\45"+ + "\1\22\1\45\1\22\2\45\5\22\1\105\1\22\2\47"+ + "\2\0\2\47\2\22\2\47\11\22\1\27\1\51\1\70"+ + "\2\0\1\70\1\0\1\70\1\106\5\70\1\71\15\70"+ + "\2\71\2\0\2\71\2\70\2\71\11\70\1\27\1\70"+ + "\1\71\2\0\1\71\1\0\1\71\1\107\25\71\2\0"+ + "\17\71\1\0\1\71\1\45\2\0\1\45\1\0\1\45"+ + "\1\110\5\45\1\47\1\45\1\22\1\45\1\22\2\45"+ + "\7\22\2\47\2\0\2\47\2\22\2\47\11\22\1\27"+ + "\1\51\1\27\2\0\1\27\1\0\1\27\1\30\5\27"+ + "\1\0\15\27\6\0\2\27\2\0\2\27\1\111\11\27"+ + "\2\0\1\27\1\0\1\27\1\30\5\27\1\0\15\27"+ + "\6\0\2\27\2\0\6\27\1\111\5\27\2\0\1\27"+ + "\1\0\1\27\1\30\5\27\1\0\12\27\1\111\2\27"+ + "\6\0\2\27\2\0\14\27\2\0\1\27\1\0\1\27"+ + "\1\30\5\27\1\0\15\27\6\0\2\27\2\0\10\27"+ + "\1\111\2\27\20\0\1\112\36\0\1\27\2\0\1\27"+ + "\1\0\1\27\1\30\5\27\1\0\3\27\1\113\11\27"+ + "\6\0\2\27\2\0\13\27\1\45\2\0\1\45\1\0"+ + "\1\45\1\46\5\45\1\47\15\45\2\47\2\0\2\47"+ + "\2\45\2\47\2\45\1\114\6\45\1\27\1\51\1\45"+ + "\2\0\1\45\1\0\1\45\1\46\5\45\1\47\15\45"+ + "\2\47\2\0\2\47\2\45\2\47\6\45\1\114\2\45"+ + "\1\27\1\51\1\45\2\0\1\45\1\0\1\45\1\46"+ + "\5\45\1\47\12\45\1\114\2\45\2\47\2\0\2\47"+ + "\2\45\2\47\11\45\1\27\1\51\1\45\2\0\1\45"+ + "\1\0\1\45\1\46\5\45\1\47\15\45\2\47\2\0"+ + "\2\47\2\45\2\47\10\45\1\114\1\27\1\51\1\45"+ + "\2\0\1\45\1\0\1\45\1\46\5\45\1\47\1\45"+ + "\1\22\1\45\1\22\2\45\6\22\1\115\1\116\1\47"+ + "\2\0\2\47\2\22\2\47\11\22\1\27\1\51\1\70"+ + "\2\0\1\70\1\0\1\70\1\47\5\70\1\71\11\70"+ + "\4\117\2\71\2\0\2\71\2\117\2\71\1\120\1\121"+ + "\1\117\1\122\1\123\4\117\1\27\1\70\1\71\2\0"+ + "\1\71\1\0\1\71\1\47\17\71\4\124\2\71\2\0"+ + "\2\71\2\124\2\71\11\124\1\0\1\71\1\125\2\0"+ + "\1\125\1\0\1\125\1\47\25\125\2\0\2\47\4\125"+ + "\1\126\1\127\1\125\1\130\1\131\4\125\1\132\1\133"+ + "\1\27\2\0\1\27\1\0\1\27\1\30\5\27\1\0"+ + "\15\27\1\0\1\134\4\0\2\27\2\0\13\27\20\0"+ + "\1\135\36\0\1\27\2\0\1\27\1\0\1\27\1\30"+ + 
"\5\27\1\0\4\27\1\136\10\27\6\0\2\27\2\0"+ + "\13\27\1\45\2\0\1\45\1\0\1\45\1\46\5\45"+ + "\1\47\15\45\1\47\1\134\2\0\2\47\2\45\2\47"+ + "\11\45\1\27\1\51\1\45\2\0\1\45\1\0\1\45"+ + "\1\46\5\45\1\47\1\45\1\22\1\45\1\22\2\45"+ + "\7\22\1\116\1\47\2\0\2\47\2\22\2\47\11\22"+ + "\1\27\1\51\1\47\2\0\1\47\1\0\26\47\1\137"+ + "\2\0\17\47\1\0\1\53\1\70\2\0\1\70\1\0"+ + "\1\70\1\106\5\70\1\71\11\70\4\140\2\71\2\0"+ + "\2\71\2\140\2\71\11\140\1\27\2\70\2\0\1\70"+ + "\1\0\1\70\1\106\5\70\1\71\11\70\4\140\2\71"+ + "\2\0\2\71\2\140\2\71\1\140\1\141\7\140\1\27"+ + "\2\70\2\0\1\70\1\0\1\70\1\106\5\70\1\71"+ + "\11\70\4\140\2\71\2\0\2\71\2\140\2\71\5\140"+ + "\1\142\3\140\1\27\2\70\2\0\1\70\1\0\1\70"+ + "\1\106\5\70\1\71\11\70\4\140\2\71\2\0\2\71"+ + "\2\140\2\71\4\140\1\143\4\140\1\27\2\70\2\0"+ + "\1\70\1\0\1\70\1\106\5\70\1\71\11\70\4\140"+ + "\2\71\2\0\2\71\2\140\2\71\7\140\1\144\1\140"+ + "\1\27\1\70\1\71\2\0\1\71\1\0\1\71\1\107"+ + "\17\71\4\145\2\71\2\0\2\71\2\145\2\71\11\145"+ + "\1\0\1\71\1\125\2\0\1\125\1\0\1\125\1\146"+ + "\25\125\2\0\2\47\15\125\1\132\1\133\1\125\2\0"+ + "\1\125\1\0\1\125\1\146\25\125\2\0\2\47\5\125"+ + "\1\147\7\125\1\132\1\133\1\125\2\0\1\125\1\0"+ + "\1\125\1\146\25\125\2\0\2\47\11\125\1\150\3\125"+ + "\1\132\1\133\1\125\2\0\1\125\1\0\1\125\1\146"+ + "\25\125\2\0\2\47\10\125\1\151\4\125\1\132\1\133"+ + "\1\125\2\0\1\125\1\0\1\125\1\146\25\125\2\0"+ + "\2\47\13\125\1\152\1\125\1\132\1\133\1\132\2\0"+ + "\1\132\1\0\1\132\1\153\25\132\4\0\17\132\1\154"+ + "\2\0\1\154\1\0\1\154\1\146\25\154\2\0\2\71"+ + "\15\154\1\132\1\154\1\155\2\0\1\155\1\0\27\155"+ + "\2\0\21\155\21\0\1\156\35\0\1\27\2\0\1\27"+ + "\1\0\1\27\1\30\5\27\1\156\15\27\6\0\2\27"+ + "\2\0\13\27\1\47\2\0\1\47\1\0\26\47\1\134"+ + "\2\0\17\47\1\0\1\53\1\70\2\0\1\70\1\0"+ + "\1\70\1\106\5\70\1\71\11\70\4\157\2\71\2\0"+ + "\2\71\2\157\2\71\11\157\1\27\2\70\2\0\1\70"+ + "\1\0\1\70\1\106\5\70\1\71\11\70\4\157\2\71"+ + "\2\0\2\71\2\157\2\71\2\157\1\160\6\157\1\27"+ + 
"\2\70\2\0\1\70\1\0\1\70\1\106\5\70\1\71"+ + "\11\70\4\157\2\71\2\0\2\71\2\157\2\71\6\157"+ + "\1\160\2\157\1\27\2\70\2\0\1\70\1\0\1\70"+ + "\1\106\5\70\1\71\11\70\1\157\1\160\2\157\2\71"+ + "\2\0\2\71\2\157\2\71\11\157\1\27\2\70\2\0"+ + "\1\70\1\0\1\70\1\106\5\70\1\71\11\70\4\157"+ + "\2\71\2\0\2\71\2\157\2\71\10\157\1\160\1\27"+ + "\1\70\1\71\2\0\1\71\1\0\1\71\1\107\17\71"+ + "\4\161\2\71\2\0\2\71\2\161\2\71\11\161\1\0"+ + "\1\71\1\125\2\0\1\125\1\0\1\125\1\47\17\125"+ + "\4\162\2\125\2\0\2\47\2\162\2\125\11\162\1\132"+ + "\1\133\1\125\2\0\1\125\1\0\1\125\1\146\25\125"+ + "\2\0\2\47\6\125\1\163\6\125\1\132\1\133\1\125"+ + "\2\0\1\125\1\0\1\125\1\146\25\125\2\0\2\47"+ + "\12\125\1\163\2\125\1\132\1\133\1\125\2\0\1\125"+ + "\1\0\1\125\1\146\20\125\1\163\4\125\2\0\2\47"+ + "\15\125\1\132\1\133\1\125\2\0\1\125\1\0\1\125"+ + "\1\146\25\125\2\0\2\47\14\125\1\163\1\132\1\133"+ + "\1\132\2\0\1\132\1\0\1\132\1\0\17\132\4\164"+ + "\2\132\4\0\2\164\2\132\11\164\2\132\1\154\2\0"+ + "\1\154\1\0\1\154\1\165\25\154\2\0\2\71\15\154"+ + "\1\132\1\154\1\166\2\0\1\166\1\0\1\166\1\155"+ + "\5\166\1\155\17\166\2\0\2\155\17\166\14\0\1\156"+ + "\42\0\1\70\2\0\1\70\1\0\1\70\1\106\5\70"+ + "\1\71\11\70\4\167\2\71\2\0\2\71\2\167\2\71"+ + "\11\167\1\27\2\70\2\0\1\70\1\0\1\70\1\106"+ + "\5\70\1\71\11\70\4\167\1\71\1\134\2\0\2\71"+ + "\2\167\2\71\11\167\1\27\1\70\1\71\2\0\1\71"+ + "\1\0\1\71\1\107\17\71\4\170\2\71\2\0\2\71"+ + "\2\170\2\71\11\170\1\0\1\71\1\125\2\0\1\125"+ + "\1\0\1\125\1\146\17\125\4\171\2\125\2\0\2\47"+ + "\2\171\2\125\11\171\1\132\1\133\1\125\2\0\1\125"+ + "\1\0\1\125\1\146\24\125\1\134\2\0\2\47\15\125"+ + "\1\132\1\133\1\132\2\0\1\132\1\0\1\132\1\153"+ + "\17\132\4\172\2\132\4\0\2\172\2\132\11\172\2\132"+ + "\1\154\2\0\1\154\1\0\1\154\1\47\17\154\4\173"+ + "\2\154\2\0\2\71\2\173\2\154\11\173\1\132\1\154"+ + "\1\125\2\0\1\125\1\0\1\125\1\146\17\125\4\174"+ + "\1\125\1\134\2\0\2\47\2\174\2\125\11\174\1\132"+ + "\1\133\1\132\2\0\1\132\1\0\1\132\1\153\17\132"+ + 
"\4\175\1\132\1\134\4\0\2\175\2\132\11\175\2\132"+ + "\1\154\2\0\1\154\1\0\1\154\1\165\17\154\4\176"+ + "\2\154\2\0\2\71\2\176\2\154\11\176\1\132\1\154"+ + "\1\125\2\0\1\125\1\0\1\125\1\146\17\125\4\163"+ + "\1\125\1\134\2\0\2\47\2\163\2\125\11\163\1\132"+ + "\1\133\1\132\2\0\1\132\1\0\1\132\1\153\17\132"+ + "\4\177\1\132\1\134\4\0\2\177\2\132\11\177\2\132"+ + "\1\154\2\0\1\154\1\0\1\154\1\165\17\154\4\200"+ + "\1\154\1\134\2\0\2\71\2\200\2\154\11\200\1\132"+ + "\1\154\1\132\2\0\1\132\1\0\1\132\1\153\24\132"+ + "\1\134\4\0\17\132\1\154\2\0\1\154\1\0\1\154"+ + "\1\165\17\154\4\201\1\154\1\134\2\0\2\71\2\201"+ + "\2\154\11\201\1\132\2\154\2\0\1\154\1\0\1\154"+ + "\1\165\24\154\1\134\2\0\2\71\15\154\1\132\1\154"; + + private static int [] zzUnpackTrans() { + int [] result = new int[5781]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\2\1\1\11\22\1\2\0\13\1\4\0\1\1"+ + "\1\0\1\1\5\0\1\1\6\0\1\1\2\0\1\1"+ + "\12\0\1\1\3\0\1\1\2\0\2\1\20\0\1\1"+ + "\1\0\6\1\10\0\4\1\1\0\1\1\2\0\5\1"+ + 
"\1\0\6\1"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[129]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private LexedTokenFactory tokenFactory; + private 
boolean invertible; + + // Convert Arabic digits to ASCII digits + private boolean normArDigits; + + // Convert Arabic punctuation to ASCII equivalents + private boolean normArPunc; + + // Substitute newlines with newlineChar. + // Otherwise, treat them like whitespace + private boolean tokenizeNL; + private String newlineChar; + + // Use \u2026 for ellipses + private boolean useUTF8Ellipsis; + + // Arabic-specific orthographic normalization rules + private boolean normAlif; + private boolean normYa; + private boolean removeDiacritics; + private boolean removeTatweel; + private boolean removeQuranChars; + + // Penn ATB vocalized section normalizations + private boolean removeProMarker; + private boolean removeSegMarker; + private boolean removeMorphMarker; + + private final Pattern segmentationMarker = Pattern.compile("^-+|-+$"); + + // Escape parens for ATB parsing + private boolean atbEscaping; + + // Normalize newlines to this token + public static final String NEWLINE_TOKEN = "*NL*"; + + private Map normMap; + + public ArabicLexer(Reader r, LexedTokenFactory tf, Properties props) { + this(r); + this.tokenFactory = tf; + + tokenizeNL = PropertiesUtils.getBool(props, "tokenizeNLs", false); + useUTF8Ellipsis = PropertiesUtils.getBool(props, "useUTF8Ellipsis", false); + invertible = PropertiesUtils.getBool(props, "invertible", false); + normArDigits = PropertiesUtils.getBool(props, "normArDigits", false); + normArPunc = PropertiesUtils.getBool(props, "normArPunc", false); + normAlif = PropertiesUtils.getBool(props, "normAlif", false); + normYa = PropertiesUtils.getBool(props, "normYa", false); + removeDiacritics = PropertiesUtils.getBool(props, "removeDiacritics", false); + removeTatweel = PropertiesUtils.getBool(props, "removeTatweel", false); + removeQuranChars = PropertiesUtils.getBool(props, "removeQuranChars", false); + removeProMarker = PropertiesUtils.getBool(props, "removeProMarker", false); + removeSegMarker = PropertiesUtils.getBool(props, 
"removeSegMarker", false); + removeMorphMarker = PropertiesUtils.getBool(props, "removeMorphMarker", false); + atbEscaping = PropertiesUtils.getBool(props, "atbEscaping", false); + + setupNormalizationMap(); + } + + private void setupNormalizationMap() { + normMap = Generics.newHashMap(200); + + // Junk characters that we always remove + normMap.put("\u0600","#"); + normMap.put("\u0601",""); + normMap.put("\u0602",""); + normMap.put("\u0603",""); + normMap.put("\u0606","\u221B"); + normMap.put("\u0607","\u221C"); + normMap.put("\u0608",""); + normMap.put("\u0609","%"); + normMap.put("\u060A","%"); + normMap.put("\u060B",""); + normMap.put("\u060E",""); + normMap.put("\u060F",""); + normMap.put("\u066E","\u0628"); + normMap.put("\u066F","\u0642"); + normMap.put("\u06CC","\u0649"); + normMap.put("\u06D6",""); + normMap.put("\u06D7",""); + normMap.put("\u06D8",""); + normMap.put("\u06D9",""); + normMap.put("\u06DA",""); + normMap.put("\u06DB",""); + normMap.put("\u06DC",""); + normMap.put("\u06DD",""); + normMap.put("\u06DE",""); + normMap.put("\u06DF",""); + normMap.put("\u06E0",""); + normMap.put("\u06E1",""); + normMap.put("\u06E2",""); + normMap.put("\u06E3",""); + normMap.put("\u06E4",""); + normMap.put("\u06E5",""); + normMap.put("\u06E6",""); + normMap.put("\u06E7",""); + normMap.put("\u06E8",""); + normMap.put("\u06E9",""); + normMap.put("\u06EA",""); + normMap.put("\u06EB",""); + normMap.put("\u06EC",""); + normMap.put("\u06ED",""); + + if (normArDigits) { + normMap.put("\u0660","0"); + normMap.put("\u0661","1"); + normMap.put("\u0662","2"); + normMap.put("\u0663","3"); + normMap.put("\u0664","4"); + normMap.put("\u0665","5"); + normMap.put("\u0666","6"); + normMap.put("\u0667","7"); + normMap.put("\u0668","8"); + normMap.put("\u0669","9"); + normMap.put("\u06F0","0"); + normMap.put("\u06F1","1"); + normMap.put("\u06F2","2"); + normMap.put("\u06F3","3"); + normMap.put("\u06F4","4"); + normMap.put("\u06F5","5"); + normMap.put("\u06F6","6"); + 
normMap.put("\u06F7","7"); + normMap.put("\u06F8","8"); + normMap.put("\u06F9","9"); + } + if (normArPunc) { + normMap.put("\u00BB","\""); + normMap.put("\u00AB","\""); + normMap.put("\u060C",","); + normMap.put("\u060D",","); + normMap.put("\u061B",";"); + normMap.put("\u061E","."); + normMap.put("\u061F","?"); + normMap.put("\u066A","%"); + normMap.put("\u066B",","); + normMap.put("\u066C","\u0027"); + normMap.put("\u066F","*"); + normMap.put("\u06DF","."); + } + if (normAlif) { + normMap.put("\u0622","\u0627"); + normMap.put("\u0623","\u0627"); + normMap.put("\u0625","\u0627"); + normMap.put("\u0671","\u0627"); + normMap.put("\u0672","\u0627"); + normMap.put("\u0673","\u0627"); + } + if (normYa) { + normMap.put("\u064A","\u0649"); + } + if (removeDiacritics) { + normMap.put("\u064B",""); + normMap.put("\u064C",""); + normMap.put("\u064D",""); + normMap.put("\u064E",""); + normMap.put("\u064F",""); + normMap.put("\u0650",""); + normMap.put("\u0651",""); + normMap.put("\u0652",""); + normMap.put("\u0653",""); + normMap.put("\u0654",""); + normMap.put("\u0655",""); + normMap.put("\u0656",""); + normMap.put("\u0657",""); + normMap.put("\u0658",""); + normMap.put("\u0659",""); + normMap.put("\u065A",""); + normMap.put("\u065B",""); + normMap.put("\u065C",""); + normMap.put("\u065D",""); + normMap.put("\u065E",""); + normMap.put("\u0670",""); + } + if (removeTatweel) { + normMap.put("\u0640",""); + normMap.put("_",""); + } + if (removeQuranChars) { + // Arabic honorifics + normMap.put("\u0610",""); + normMap.put("\u0611",""); + normMap.put("\u0612",""); + normMap.put("\u0613",""); + normMap.put("\u0614",""); + normMap.put("\u0615",""); + normMap.put("\u0616",""); + normMap.put("\u0617",""); + normMap.put("\u0618",""); + normMap.put("\u0619",""); + normMap.put("\u061A",""); + } + if (atbEscaping) { + normMap.put("(","-LRB-"); + normMap.put(")","-RRB-"); + } + } + + private String normalizeToken(String text, boolean isWord) { + // Remove segmentation markers from the 
ATB + if (isWord && removeSegMarker) { + text = segmentationMarker.matcher(text).replaceAll(""); + } + int len = text.length(); + StringBuilder sb = new StringBuilder(len); + for (int i = 0; i < len; ++i) { + String thisChar = String.valueOf(text.charAt(i)); + // Remove morpheme markers from the ATB vocalized section + if (isWord && removeMorphMarker && thisChar.equals("+")) { + continue; + } + if (normMap.containsKey(thisChar)) { + thisChar = normMap.get(thisChar); + } + if (thisChar.length() > 0) { + sb.append(thisChar); + } + } + return sb.toString(); + } + + + /** Make the next token. + * + * @param txt What the token should be + * @param originalText The original String that got transformed into txt + */ + private Object getNext(String txt, String originalText) { + if (tokenFactory == null) { + throw new RuntimeException(this.getClass().getName() + ": Token factory is null."); + } + if (invertible) { + //String str = prevWordAfter.toString(); + //prevWordAfter.setLength(0); + CoreLabel word = (CoreLabel) tokenFactory.makeToken(txt, yychar, yylength()); + word.set(CoreAnnotations.OriginalTextAnnotation.class, originalText); + //word.set(CoreAnnotations.BeforeAnnotation.class, str); + //prevWord.set(CoreAnnotations.AfterAnnotation.class, str); + //prevWord = word; + return word; + } else { + return tokenFactory.makeToken(txt, yychar, yylength()); + } + } + + private Object getNext(boolean isWord) { + String text = yytext(); + String normText = normalizeToken(text, isWord); + return getNext(normText, text); + } + + private Object getNewline() { + String nlString = tokenizeNL ? NEWLINE_TOKEN : System.getProperty("line.separator"); + return getNext(nlString, yytext()); + } + + private Object getEllipsis() { + String ellipsisString = useUTF8Ellipsis ? "\u2026" : "..."; + return getNext(ellipsisString, yytext()); + } + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. 
+ * + * @param in the java.io.Reader to read input from. + */ + ArabicLexer(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + ArabicLexer(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 1530) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public Object next() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 6: + { return getEllipsis(); + } + case 8: break; + case 4: + { return getNext(false); + } + case 9: break; + case 7: + { if ( ! 
removeProMarker) { + return getNext(false); + } + } + case 10: break; + case 2: + { return getNewline(); + } + case 11: break; + case 5: + { return getNext(true); + } + case 12: break; + case 3: + { + } + case 13: break; + case 1: + { System.err.printf("Untokenizable: %s%n", yytext()); + return getNext(true); + } + case 14: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + { + return null; + } + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/process/ArabicTokenizer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/process/ArabicTokenizer.java new file mode 100644 index 0000000..99dbcda --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/arabic/process/ArabicTokenizer.java @@ -0,0 +1,208 @@ +package edu.stanford.nlp.international.arabic.process; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.Serializable; +import java.io.UnsupportedEncodingException; +import java.util.Iterator; +import java.util.Properties; + +import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.process.TokenizerFactory; +import edu.stanford.nlp.process.*; +import edu.stanford.nlp.util.StringUtils; + +/** + * Tokenizer for UTF-8 Arabic. Buckwalter encoding is *not* supported. + * + *

    + * A single instance of an Arabic Tokenizer is not thread safe, as it + * uses a non-threadsafe jflex object to do the processing. Multiple + * instances can be created safely, though. A single instance of a + * ArabicTokenizerFactory is also not thread safe, as it keeps its + * options in a local variable. + *

    + * + *

    + * TODO(spenceg): Merge in rules from ibm tokenizer (v5). + * TODO(spenceg): Add XML escaping + * TODO(spenceg): When running from the command line, the tokenizer does not + * produce the correct number of newline-delimited lines for the ATB data + * sets. + *

    + * @author Spence Green + */ +public class ArabicTokenizer extends AbstractTokenizer { + + // The underlying JFlex lexer + private final ArabicLexer lexer; + + // Produces the normalization for parsing used in Green and Manning (2010) + private static final Properties atbOptions = new Properties(); + static { + String optionsStr = "normArDigits,normArPunc,normAlif,removeDiacritics,removeTatweel,removeQuranChars"; + String[] optionToks = optionsStr.split(","); + for (String option : optionToks) { + atbOptions.put(option, "true"); + } + } + + public static ArabicTokenizer newArabicTokenizer(Reader r, Properties lexerProperties) { + return new ArabicTokenizer(r, new CoreLabelTokenFactory(), lexerProperties); + } + + public ArabicTokenizer(Reader r, LexedTokenFactory tf, Properties lexerProperties) { + lexer = new ArabicLexer(r, tf, lexerProperties); + } + + @Override + @SuppressWarnings("unchecked") + protected T getNext() { + try { + T nextToken = null; + // Depending on the orthographic normalization options, + // some tokens can be obliterated. In this case, keep iterating + // until we see a non-zero length token. 
+ do { + nextToken = (T) lexer.next(); + } while (nextToken != null && nextToken.word().length() == 0); + + return nextToken; + + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + public static class ArabicTokenizerFactory implements TokenizerFactory, Serializable { + + private static final long serialVersionUID = 946818805507187330L; + + protected final LexedTokenFactory factory; + + protected Properties lexerProperties = new Properties(); + + public static TokenizerFactory newTokenizerFactory() { + return new ArabicTokenizerFactory(new CoreLabelTokenFactory()); + } + + private ArabicTokenizerFactory(LexedTokenFactory factory) { + this.factory = factory; + } + + public Iterator getIterator(Reader r) { + return getTokenizer(r); + } + + public Tokenizer getTokenizer(Reader r) { + return new ArabicTokenizer(r, factory, lexerProperties); + } + + /** + * options: A comma-separated list of options + */ + public void setOptions(String options) { + String[] optionList = options.split(","); + for (String option : optionList) { + lexerProperties.put(option, "true"); + } + } + + public Tokenizer getTokenizer(Reader r, String extraOptions) { + setOptions(extraOptions); + return getTokenizer(r); + } + } + + public static TokenizerFactory factory() { + return ArabicTokenizerFactory.newTokenizerFactory(); + } + + public static TokenizerFactory atbFactory() { + TokenizerFactory tf = ArabicTokenizerFactory.newTokenizerFactory(); + for (String option : atbOptions.stringPropertyNames()) { + tf.setOptions(option); + } + return tf; + } + + /** + * A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding). + * Performs punctuation splitting and light tokenization by default. + * Orthographic normalization options are available, and can be enabled with + * command line options. + *

    + * Currently, this tokenizer does not do line splitting. It normalizes non-printing + * line separators across platforms and prints the system default line splitter + * to the output. + *

    + *

    + * The following normalization options are provided: + *

      + *
    • useUTF8Ellipsis : Replaces sequences of three or more full stops with \u2026
    • + *
    • normArDigits : Convert Arabic digits to ASCII equivalents
    • + *
    • normArPunc : Convert Arabic punctuation to ASCII equivalents
    • + *
    • normAlif : Change all alif forms to bare alif
    • + *
    • normYa : Map ya to alif maqsura
    • + *
    • removeDiacritics : Strip all diacritics
    • + *
    • removeTatweel : Strip tatweel elongation character
    • + *
    • removeQuranChars : Remove diacritics that appear in the Quran
    • + *
    • removeProMarker : Remove the ATB null pronoun marker
    • + *
    • removeSegMarker : Remove the ATB clitic segmentation marker
    • + *
    • removeMorphMarker : Remove the ATB morpheme boundary markers
    • + *
    • atbEscaping : Replace left/right parentheses with ATB escape characters
    • + *
    + *

    + * + * @param args + */ + public static void main(String[] args) { + if (args.length > 0 && args[0].contains("help")) { + System.err.printf("Usage: java %s [OPTIONS] < file%n", ArabicTokenizer.class.getName()); + System.err.printf("%nOptions:%n"); + System.err.println(" -help : Print this message. See javadocs for all normalization options."); + System.err.println(" -atb : Tokenization for the parsing experiments in Green and Manning (2010)"); + System.exit(-1); + } + + // Process normalization options + final Properties tokenizerOptions = StringUtils.argsToProperties(args); + final TokenizerFactory tf = tokenizerOptions.containsKey("atb") ? + ArabicTokenizer.atbFactory() : ArabicTokenizer.factory(); + for (String option : tokenizerOptions.stringPropertyNames()) { + tf.setOptions(option); + } + + // Replace line separators with a token so that we can + // count lines + tf.setOptions("tokenizeNLs"); + + // Read the file + int nLines = 0; + int nTokens = 0; + final String encoding = "UTF-8"; + try { + Tokenizer tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding)); + boolean printSpace = false; + while (tokenizer.hasNext()) { + ++nTokens; + String word = tokenizer.next().word(); + if (word.equals(ArabicLexer.NEWLINE_TOKEN)) { + ++nLines; + printSpace = false; + System.out.println(); + } else { + if (printSpace) System.out.print(" "); + System.out.print(word); + printSpace = true; + } + } + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } + System.err.printf("Done! 
Tokenized %d lines (%d tokens)%n", nLines, nTokens); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/french/FrenchMorphoFeatureSpecification.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/french/FrenchMorphoFeatureSpecification.java new file mode 100644 index 0000000..9f028b1 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/french/FrenchMorphoFeatureSpecification.java @@ -0,0 +1,270 @@ +package edu.stanford.nlp.international.french; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import edu.stanford.nlp.international.morph.MorphoFeatureSpecification; +import edu.stanford.nlp.international.morph.MorphoFeatures; + +/** + * If MorphoFeatureType.OTHER is active, then the "CC tagset" is produced (see Tbl.2 + * of (Crabbe and Candito, 2008). Additional support exists for GEN, NUM, and PER, which + * are (mostly) marked in the FTB annotation. + *

    + * The actual CC tag is placed in the altTag field of the MorphoFeatures object. + * + * @author Spence Green + * + */ +public class FrenchMorphoFeatureSpecification extends MorphoFeatureSpecification { + + private static final long serialVersionUID = -58379347760106784L; + + public static final String[] genVals = {"M","F"}; + public static final String[] numVals = {"SG","PL"}; + public static final String[] perVals = {"1","2","3"}; + + + @Override + public List getValues(MorphoFeatureType feat) { + if(feat == MorphoFeatureType.GEN) + return Arrays.asList(genVals); + else if(feat == MorphoFeatureType.NUM) + return Arrays.asList(numVals); + else if(feat == MorphoFeatureType.PER) + return Arrays.asList(perVals); + else + throw new IllegalArgumentException("French does not support feature type: " + feat.toString()); + } + + @Override + public MorphoFeatures strToFeatures(String spec) { + MorphoFeatures feats = new MorphoFeatures(); + + //Usually this is the boundary symbol + if(spec == null || spec.equals("")) + return feats; + + boolean isOtherActive = isActive(MorphoFeatureType.OTHER); + + if(spec.startsWith("ADV")) { + feats.setAltTag("ADV"); + if(spec.contains("int")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "advint"); + } + feats.setAltTag("ADVWH"); + } + + } else if(spec.startsWith("A")) { + feats.setAltTag("ADJ"); + if(spec.contains("int")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "adjint"); + } + feats.setAltTag("ADJWH"); + } + + addPhiFeatures(feats,spec); + + } else if(spec.equals("CC") || spec.equals("C-C")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Cc"); + } + feats.setAltTag("CC"); + + } else if(spec.equals("CS") || spec.equals("C-S")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Cs"); + } + feats.setAltTag("CS"); + + } else if(spec.startsWith("CL")) { + feats.setAltTag("CL"); + if(spec.contains("suj") || spec.equals("CL-S-3fp")) {//"CL-S-3fp" 
is equivalent to suj + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER,"Sbj"); + } + feats.setAltTag("CLS"); + + } else if(spec.contains("obj")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Obj"); + } + feats.setAltTag("CLO"); + + } else if(spec.contains("refl")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Rfl"); + } + feats.setAltTag("CLR"); + } + + addPhiFeatures(feats,spec); + + } else if(spec.startsWith("D")) { + feats.setAltTag("DET"); + if(spec.contains("int")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "dint"); + } + feats.setAltTag("DETWH"); + } + + addPhiFeatures(feats,spec); + + } else if(spec.startsWith("N")) { + feats.setAltTag("N");//TODO These are usually N-card...make these CD? + if(spec.contains("P")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Np"); + } + feats.setAltTag("NPP"); + + } else if(spec.contains("C")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Nc"); + } + feats.setAltTag("NC"); + } + + addPhiFeatures(feats,spec); + + } else if(spec.startsWith("PRO")) { + feats.setAltTag("PRO"); + if(spec.contains("int")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER,"Ni"); + } + feats.setAltTag("PROWH"); + + } else if(spec.contains("rel")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Nr"); + } + feats.setAltTag("PROREL"); + } + + addPhiFeatures(feats,spec); + + } else if(spec.startsWith("V")) { + feats.setAltTag("V"); + if(spec.contains("Y")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER,"Vp"); + } + feats.setAltTag("VIMP"); + + } else if(spec.contains("W")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Vf"); + } + feats.setAltTag("VINF"); + + } else if(spec.contains("S") || spec.contains("T")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Vs"); + } + feats.setAltTag("VS"); + + } else 
if(spec.contains("K")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Vp"); + } + feats.setAltTag("VPP"); + + } else if(spec.contains("G")) { + if (isOtherActive) { + feats.addFeature(MorphoFeatureType.OTHER, "Vr"); + } + feats.setAltTag("VPR"); + } + + addPhiFeatures(feats,spec); + + } else if(spec.equals("P") || spec.equals("I")) { + feats.setAltTag(spec); + + } +// else { +// System.err.println("Could not map spec: " + spec); +// } + + return feats; + } + + private void addPhiFeatures(MorphoFeatures feats, String spec) { + String[] toks = spec.split("\\-+"); + + String morphStr; + if(toks.length == 3 && toks[0].equals("PRO") && toks[2].equals("neg")) + morphStr = toks[1]; + else + morphStr = toks[toks.length-1]; + + //wsg2011: The analyses have mixed casing.... + morphStr = morphStr.toLowerCase(); + + if(isActive(MorphoFeatureType.GEN)) { + if(morphStr.contains("m")) + feats.addFeature(MorphoFeatureType.GEN, genVals[0]); + else if(morphStr.contains("f")) + feats.addFeature(MorphoFeatureType.GEN, genVals[1]); + } + + if(isActive(MorphoFeatureType.PER)) { + if(morphStr.contains("1")) + feats.addFeature(MorphoFeatureType.PER, perVals[0]); + else if(morphStr.contains("2")) + feats.addFeature(MorphoFeatureType.PER, perVals[1]); + else if(morphStr.contains("3")) + feats.addFeature(MorphoFeatureType.PER, perVals[2]); + } + + if(isActive(MorphoFeatureType.NUM)) { + if(morphStr.contains("s")) + feats.addFeature(MorphoFeatureType.NUM, numVals[0]); + else if(morphStr.contains("p")) + feats.addFeature(MorphoFeatureType.NUM, numVals[1]); + } + } + + + /** + * For debugging + * + * @param args + */ + public static void main(String[] args) { + if(args.length != 1) { + System.err.printf("Usage: java %s file%n", FrenchMorphoFeatureSpecification.class.getName()); + System.exit(-1); + } + + try { + BufferedReader br = new BufferedReader(new FileReader(args[0])); + MorphoFeatureSpecification mfs = new FrenchMorphoFeatureSpecification(); + + //Activate all 
features for debugging + mfs.activate(MorphoFeatureType.GEN); + mfs.activate(MorphoFeatureType.NUM); + mfs.activate(MorphoFeatureType.PER); + + for(String line; (line = br.readLine()) != null;) { + MorphoFeatures feats = mfs.strToFeatures(line); + System.out.printf("%s\t%s%n", line.trim(),feats.toString()); + } + + br.close(); + + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/french/FrenchUnknownWordSignatures.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/french/FrenchUnknownWordSignatures.java new file mode 100644 index 0000000..14b2520 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/french/FrenchUnknownWordSignatures.java @@ -0,0 +1,88 @@ +package edu.stanford.nlp.international.french; + +import java.util.regex.Pattern; + +/** + * Contains patterns for matching certain word types in French, such + * as common suffices for nouns, verbs, adjectives and adverbs. 
+ */ +public class FrenchUnknownWordSignatures { + private static final Pattern pNounSuffix = Pattern.compile("(ier|ière|ité|ion|ison|isme|ysme|iste|esse|eur|euse|ence|eau|erie|ng|ette|age|ade|ance|ude|ogue|aphe|ate|duc|anthe|archie|coque|érèse|ergie|ogie|lithe|mètre|métrie|odie|pathie|phie|phone|phore|onyme|thèque|scope|some|pole|ôme|chromie|pie)s?$"); + private static final Pattern pAdjSuffix = Pattern.compile("(iste|ième|uple|issime|aire|esque|atoire|ale|al|able|ible|atif|ique|if|ive|eux|aise|ent|ois|oise|ante|el|elle|ente|oire|ain|aine)s?$"); + private static final Pattern pHasDigit = Pattern.compile("\\d+"); + private static final Pattern pIsDigit = Pattern.compile("^\\d+$"); + private static final Pattern pPosPlural = Pattern.compile("(s|ux)$"); + private static final Pattern pVerbSuffix = Pattern.compile("(ir|er|re|ez|ont|ent|ant|ais|ait|ra|era|eras|é|és|ées|isse|it)$"); + private static final Pattern pAdvSuffix = Pattern.compile("(iment|ement|emment|amment)$"); + private static final Pattern pHasPunc = Pattern.compile("([\u0021-\u002F\u003A-\u0040\\u005B\u005C\\u005D\u005E-\u0060\u007B-\u007E\u00A1-\u00BF\u2010-\u2027\u2030-\u205E\u20A0-\u20B5])+"); + private static final Pattern pIsPunc = Pattern.compile("([\u0021-\u002F\u003A-\u0040\\u005B\u005C\\u005D\u005E-\u0060\u007B-\u007E\u00A1-\u00BF\u2010-\u2027\u2030-\u205E\u20A0-\u20B5])+$"); + private static final Pattern pAllCaps = Pattern.compile("^[A-Z\\u00C0-\\u00DD]+$"); + + public static boolean hasNounSuffix(String s) { + return pNounSuffix.matcher(s).find(); + } + + public static String nounSuffix(String s) { + return hasNounSuffix(s) ? "-noun" : ""; + } + + public static boolean hasAdjSuffix(String s) { + return pAdjSuffix.matcher(s).find(); + } + + public static String adjSuffix(String s) { + return hasAdjSuffix(s) ? "-adj" : ""; + } + + public static String hasDigit(String s) { + return pHasDigit.matcher(s).find() ? 
"-num" : ""; + } + + public static String isDigit(String s) { + return pIsDigit.matcher(s).find() ? "-isNum" : ""; + } + + public static boolean hasVerbSuffix(String s) { + return pVerbSuffix.matcher(s).find(); + } + + public static String verbSuffix(String s) { + return hasVerbSuffix(s) ? "-verb" : ""; + } + + public static boolean hasPossiblePlural(String s) { + return pPosPlural.matcher(s).find(); + } + + public static String possiblePlural(String s) { + return hasPossiblePlural(s) ? "-plural" : ""; + } + + public static boolean hasAdvSuffix(String s) { + return pAdvSuffix.matcher(s).find(); + } + + public static String advSuffix(String s) { + return hasAdvSuffix(s) ? "-adv" : ""; + } + + public static String hasPunc(String s) { + return pHasPunc.matcher(s).find() ? "-hpunc" : ""; + } + + public static String isPunc(String s) { + return pIsPunc.matcher(s).matches() ? "-ipunc" : ""; + } + + public static String isAllCaps(String s) { + return pAllCaps.matcher(s).matches() ? "-allcap" : ""; + } + + public static String isCapitalized(String s) { + if(s.length() > 0) { + Character ch = s.charAt(0); + return Character.isUpperCase(ch) ? "-upper" : ""; + } + return ""; + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/morph/MorphoFeatureSpecification.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/morph/MorphoFeatureSpecification.java new file mode 100644 index 0000000..0b1e88a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/morph/MorphoFeatureSpecification.java @@ -0,0 +1,70 @@ +package edu.stanford.nlp.international.morph; + +import java.io.Serializable; + +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.Pair; + +/** + * Morphological feature specification for surface forms in a given language. 
+ * Currently supported feature names are the values of MorphFeatureType. + * + * @author Spence Green + * + */ +public abstract class MorphoFeatureSpecification implements Serializable { + + private static final long serialVersionUID = -5720683653931585664L; + + //Delimiter for associating a surface form with a morphological analysis, e.g., + // + // his~#PRP_3ms + // + public static final String MORPHO_MARK = "~#"; + + public static final String LEMMA_MARK = "|||"; + + public static final String NO_ANALYSIS = "XXX"; + + // WSGDEBUG -- + // Added NNUM and NGEN for nominals in Arabic + public static enum MorphoFeatureType {TENSE,DEF,ASP,MOOD,NNUM,NUM, NGEN, GEN,CASE,PER,POSS,VOICE,OTHER,PROP}; + + protected final Set activeFeatures; + + public MorphoFeatureSpecification() { + activeFeatures = Generics.newHashSet(); + } + + public void activate(MorphoFeatureType feat) { + activeFeatures.add(feat); + } + + public boolean isActive(MorphoFeatureType feat) { return activeFeatures.contains(feat); } + + public abstract List getValues(MorphoFeatureType feat); + + public abstract MorphoFeatures strToFeatures(String spec); + + /** + * Returns the lemma as pair.first() and the morph analysis as pair.second(). 
+ */ + public static Pair splitMorphString(String word, String morphStr) { + if (morphStr == null || morphStr.trim().equals("")) { + return new Pair(word, NO_ANALYSIS); + } + String[] toks = morphStr.split(Pattern.quote(LEMMA_MARK)); + if (toks.length != 2) { + throw new RuntimeException("Invalid morphology string: " + morphStr); + } + return new Pair(toks[0], toks[1]); + } + + + @Override + public String toString() { return activeFeatures.toString(); } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/morph/MorphoFeatures.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/morph/MorphoFeatures.java new file mode 100644 index 0000000..9cf3ef2 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/international/morph/MorphoFeatures.java @@ -0,0 +1,124 @@ +package edu.stanford.nlp.international.morph; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType; +import edu.stanford.nlp.util.Generics; + +/** + * Holds a set of morphosyntactic features for a given surface form. 
+ * + * @author Spence Green + * + */ +public class MorphoFeatures implements Serializable { + + private static final long serialVersionUID = -3893316324305154940L; + + public static final String KEY_VAL_DELIM = ":"; + + protected final Map fSpec; + protected String altTag; + + public MorphoFeatures() { + fSpec = Generics.newHashMap(); + } + + public MorphoFeatures(MorphoFeatures other) { + this(); + for(Map.Entry entry : other.fSpec.entrySet()) + this.fSpec.put(entry.getKey(), entry.getValue()); + this.altTag = other.altTag; + } + + public void addFeature(MorphoFeatureType feat, String val) { + fSpec.put(feat, val); + } + + public boolean hasFeature(MorphoFeatureType feat) { + return fSpec.containsKey(feat); + } + + public String getValue(MorphoFeatureType feat) { + return hasFeature(feat) ? fSpec.get(feat) : ""; + } + + public int numFeatureMatches(MorphoFeatures other) { + int nMatches = 0; + for(Map.Entry fPair : fSpec.entrySet()) { + if(other.hasFeature(fPair.getKey()) && other.getValue(fPair.getKey()).equals(fPair.getValue())) + nMatches++; + } + + return nMatches; + } + + public int numActiveFeatures() { return fSpec.keySet().size(); } + + /** + * Build a POS tag consisting of a base category plus inflectional features. + * + * @param baseTag + * @return the tag + */ + public String getTag(String baseTag) { + return baseTag + toString(); + } + + public void setAltTag(String tag) { altTag = tag; } + + + /** + * An alternate tag form than the one produced by getTag(). Subclasses + * may want to use this form to implement someone else's tagset (e.g., CC, ERTS, etc.) + * + * @return the tag + */ + public String getAltTag() { + return altTag; + } + + /** + * Assumes that the tag string has been formed using a call to getTag(). As such, + * it removes the basic category from the feature string. + *

    + * Note that this method returns a new MorphoFeatures object. As a result, it + * behaves like a static method, but is non-static so that subclasses can override + * this method. + * + * @param str + */ + public MorphoFeatures fromTagString(String str) { + List feats = Arrays.asList(str.split("\\-")); + MorphoFeatures mFeats = new MorphoFeatures(); + for(String fPair : feats) { + String[] keyValue = fPair.split(KEY_VAL_DELIM); + if(keyValue.length != 2)//Manual state split annotations + continue; + MorphoFeatureType fName = MorphoFeatureType.valueOf(keyValue[0].trim()); + mFeats.addFeature(fName, keyValue[1].trim()); + } + + return mFeats; + } + + /** + * values() returns the values in the order in which they are declared. Thus we will not have + * the case where two feature types can yield two strings: + * -feat1:A-feat2:B + * -feat2:B-feat1:A + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for(MorphoFeatureType feat : MorphoFeatureType.values()) { + if(fSpec.containsKey(feat)) { + sb.append(String.format("-%s%s%s",feat.toString(),KEY_VAL_DELIM,fSpec.get(feat))); + } + } + return sb.toString(); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/BZip2PipedOutputStream.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/BZip2PipedOutputStream.java new file mode 100644 index 0000000..bae738e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/BZip2PipedOutputStream.java @@ -0,0 +1,64 @@ +package edu.stanford.nlp.io; + +import edu.stanford.nlp.util.ByteStreamGobbler; +import edu.stanford.nlp.util.StreamGobbler; + +import java.io.*; + +/** +* Opens a outputstream for writing into a bzip2 file by piping into the bzip2 command. +* Output from bzip2 command is written into the specified file. 
+* +* @author Angel Chang +*/ +public class BZip2PipedOutputStream extends OutputStream +{ + private String filename; + private Process process; + private ByteStreamGobbler outGobbler; + private StreamGobbler errGobbler; + private PrintWriter errWriter; + + public BZip2PipedOutputStream(String filename) throws IOException { + this(filename, System.err); + } + + public BZip2PipedOutputStream(String filename, OutputStream err) throws IOException { + String bzip2 = System.getProperty("bzip2", "bzip2"); + String cmd = bzip2; // + " > " + filename; + //System.err.println("getBZip2PipedOutputStream: Running command: "+cmd); + ProcessBuilder pb = new ProcessBuilder(); + pb.command(cmd); + this.process = pb.start(); + this.filename = filename; + OutputStream outStream = new FileOutputStream(filename); + errWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(err))); + outGobbler = new ByteStreamGobbler("Output stream gobbler: " + cmd + " " + filename, + process.getInputStream(), outStream); + errGobbler = new StreamGobbler(process.getErrorStream(), errWriter); + outGobbler.start(); + errGobbler.start(); + } + + public void flush() throws IOException + { + process.getOutputStream().flush(); + } + + public void write(int b) throws IOException + { + process.getOutputStream().write(b); + } + + public void close() throws IOException + { + process.getOutputStream().close(); + try { + outGobbler.join(); + errGobbler.join(); + outGobbler.getOutputStream().close(); + process.waitFor(); + } catch (InterruptedException ex) {} + //System.err.println("getBZip2PipedOutputStream: Closed. 
"); + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/EncodingFileReader.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/EncodingFileReader.java new file mode 100644 index 0000000..eb8d1d0 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/EncodingFileReader.java @@ -0,0 +1,92 @@ +package edu.stanford.nlp.io; + +import java.io.*; + +/** + * This is a convenience class which works almost exactly like + * FileReader + * but allows for the specification of input encoding. + * + * @author Alex Kleeman + */ + +public class EncodingFileReader extends InputStreamReader { + + private static final String DEFAULT_ENCODING = "UTF-8"; + /** + * Creates a new EncodingFileReader, given the name of the + * file to read from. + * + * @param fileName the name of the file to read from + * @exception java.io.FileNotFoundException if the named file does not + * exist, is a directory rather than a regular file, + * or for some other reason cannot be opened for + * reading. + * @exception java.io.UnsupportedEncodingException if the encoding does not exist. + * + */ + public EncodingFileReader(String fileName) throws UnsupportedEncodingException, FileNotFoundException { + super(new FileInputStream(fileName), DEFAULT_ENCODING); + } + + /** + * Creates a new EncodingFileReader, given the name of the + * file to read from and an encoding + * + * @param fileName the name of the file to read from + * @param encoding String specifying the encoding to be used + * @exception java.io.UnsupportedEncodingException if the encoding does not exist. + * @exception java.io.FileNotFoundException if the named file does not exist, + * is a directory rather than a regular file, + * or for some other reason cannot be opened for + * reading. 
+ * + */ + public EncodingFileReader(String fileName, String encoding) throws UnsupportedEncodingException, FileNotFoundException { + super(new FileInputStream(fileName), + encoding == null ? DEFAULT_ENCODING: encoding); + } + + /** + * Creates a new EncodingFileReader, given the File + * to read from, and using default of utf-8. + * + * @param file the File to read from + * @exception FileNotFoundException if the file does not exist, + * is a directory rather than a regular file, + * or for some other reason cannot be opened for + * reading. + * @exception java.io.UnsupportedEncodingException if the encoding does not exist. + */ + public EncodingFileReader(File file) throws UnsupportedEncodingException, FileNotFoundException { + super(new FileInputStream(file), DEFAULT_ENCODING); + } + + /** + * Creates a new FileReader, given the File + * to read from and encoding. + * + * @param file the File to read from + * @param encoding String specifying the encoding to be used + * @exception FileNotFoundException if the file does not exist, + * is a directory rather than a regular file, + * or for some other reason cannot be opened for + * reading. + * @exception java.io.UnsupportedEncodingException if the encoding does not exist. + */ + public EncodingFileReader(File file,String encoding) throws UnsupportedEncodingException, FileNotFoundException { + super(new FileInputStream(file), + encoding == null ? DEFAULT_ENCODING: encoding); + } + + /** + * Creates a new FileReader, given the + * FileDescriptor to read from. 
+ * + * @param fd the FileDescriptor to read from + */ + public EncodingFileReader(FileDescriptor fd) { + super(new FileInputStream(fd)); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/EncodingPrintWriter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/EncodingPrintWriter.java new file mode 100644 index 0000000..a088789 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/EncodingPrintWriter.java @@ -0,0 +1,125 @@ +package edu.stanford.nlp.io; + +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; + +/** + * A convenience IO class with print and println statements to + * standard output and standard error allowing encoding in an + * arbitrary character set. It also provides methods which use UTF-8 + * always, overriding the system default charset. + * + * @author Roger Levy + * @author Christopher Manning + */ + +public class EncodingPrintWriter { + + private static final String DEFAULT_ENCODING = "UTF-8"; + + private static PrintWriter cachedErrWriter; + private static String cachedErrEncoding = ""; + + private static PrintWriter cachedOutWriter; + private static String cachedOutEncoding = ""; + + // uninstantiable + private EncodingPrintWriter() {} + + /** + * Print methods wrapped around System.err + */ + public static class err { + + private err() {} // uninstantiable + + private static void setupErrWriter(String encoding) { + if (encoding == null) { + encoding = DEFAULT_ENCODING; + } + if (cachedErrWriter == null || ! 
cachedErrEncoding.equals(encoding)) { + try { + cachedErrWriter = new PrintWriter(new OutputStreamWriter(System.err, encoding), true); + cachedErrEncoding = encoding; + } catch (UnsupportedEncodingException e) { + System.err.println("Error " + e + "Printing as default encoding."); + cachedErrWriter = new PrintWriter(new OutputStreamWriter(System.err), true); + cachedErrEncoding = ""; + } + } + } + + public static void println(String o, String encoding) { + setupErrWriter(encoding); + cachedErrWriter.println(o); + } + + public static void print(String o, String encoding) { + setupErrWriter(encoding); + cachedErrWriter.print(o); + cachedErrWriter.flush(); + } + + public static void println(String o) { + println(o, null); + } + + public static void print(String o) { + print(o, null); + } + + } // end static class err + + + /** + * Print methods wrapped around System.out + */ + public static class out { + + private out() {} // uninstantiable + + private static void setupOutWriter(String encoding) { + if (encoding == null) { + encoding = DEFAULT_ENCODING; + } + if (cachedOutWriter == null || ! cachedOutEncoding.equals(encoding)) { + try { + cachedOutWriter = new PrintWriter(new OutputStreamWriter(System.out, encoding), true); + cachedOutEncoding = encoding; + } catch (UnsupportedEncodingException e) { + System.err.println("Error " + e + "Printing as default encoding."); + cachedOutWriter = new PrintWriter(new OutputStreamWriter(System.out), true); + cachedOutEncoding = ""; + } + } + } + + public static void println(String o, String encoding) { + setupOutWriter(encoding); + cachedOutWriter.println(o); + + } + + public static void print(String o, String encoding) { + setupOutWriter(encoding); + cachedOutWriter.print(o); + cachedOutWriter.flush(); + } + + /** Print the argument plus a NEWLINE in UTF-8, regardless of + * the platform default. 
+ * + * @param o String to print + */ + public static void println(String o) { + println(o, null); + } + + public static void print(String o) { + print(o, null); + } + + } // end static class out + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/ExtensionFileFilter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/ExtensionFileFilter.java new file mode 100644 index 0000000..4196b51 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/ExtensionFileFilter.java @@ -0,0 +1,75 @@ +package edu.stanford.nlp.io; + +import java.io.File; + +/** + * Implements a file filter that uses file extensions to filter files. + * + * @author cmanning 2000/01/24 + */ +public class ExtensionFileFilter extends javax.swing.filechooser.FileFilter implements java.io.FileFilter { + + private String extension; // = null + private boolean recursively; + + /** + * Sets up Extension file filter by specifying an extension + * to accept (currently only 1) and whether to also display + * folders for recursive search. + * The passed extension may be null, in which case the filter + * will pass all files (passing an empty String does not have the same + * effect -- this would look for file names ending in a period). + * + * @param ext File extension (not including period) or null for any + * @param recurse go into folders + */ + public ExtensionFileFilter(String ext, boolean recurse) { + if (ext != null) { + if (ext.startsWith(".")) { + extension = ext; + } else { + extension = '.' + ext; + } + } + recursively = recurse; + } + + /** + * Sets up an extension file filter that will recurse into sub directories. + * @param ext The extension to accept (with or without a leading period). + */ + public ExtensionFileFilter(String ext) { + this(ext, true); + } + + /** + * Checks whether a file satisfies the selection filter. 
+ * + * @param file The file + * @return true if the file is acceptable + */ + @Override + public boolean accept(File file) { + if (file.isDirectory()) { + return recursively; + } else if (extension == null) { + return true; + } else { + return file.getName().endsWith(extension); + } + } + + /** + * Returns a description of what extension is being used (for file choosers). + * For example, if the suffix is "xml", the description will be + * "XML Files (*.xml)". + * + * @return description of this file filter + */ + @Override + public String getDescription() { + String ucExt = extension.substring(1).toUpperCase(); + return (ucExt + " Files (*" + extension + ')'); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/FileSequentialCollection.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/FileSequentialCollection.java new file mode 100644 index 0000000..e4799cb --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/FileSequentialCollection.java @@ -0,0 +1,432 @@ +package edu.stanford.nlp.io; + +import java.io.File; +import java.io.FileFilter; +import java.util.*; + +/** + * A FileSequentialCollection maintains a read-only + * collection of Files. (It's a list, but we don't + * make it a List or else one needs an iterator that can go backwards.) + * It is built from a Collection of paths, or just from a single path. + * Optionally one can also provide a FileFilter which is + * applied over the files in a recursive traversal, or else + * an extension and whether to do recursive traversal, which are used to + * construct a filter. + * Note that the Collection argument constructor will behave 'normally' + * iff none of the Collection elements are directories. If they are + * directories they will be recursed and files in them added. 
To get the + * behavior of putting just directories in the collection one needs to + * use the constructor + * FileSequentialCollection(c, failFilt, true), + * where failFilt + * is a user-supplied FileFilter that accepts no files. + * The FileSequentialCollection builds from these + * constructor arguments a collection of Files, which can be + * iterated over, etc. This class does runtime expansion of paths. + * That is, it is optimized for iteration and not for random access. + * It is also an unmodifiable Collection. + *

    + * The class provides some additional constructors beyond the two recommended + * by the Collections package, to allow specifying a FileFilter + * and similar options. Nevertheless, so as to avoid overburdening the + * API, not every possibly useful constructor has been provided where + * these can be easily synthesized using standard Collections package + * facilities. Useful idioms to know are: + *

      + *
    • To make a FileSequentialCollection from an array of + * Files or Strings arr:
      + * FileSequentialCollection fcollect = new FileSequentialCollection(Arrays.asList(arr)); + *
    • + *
    • To make a FileSequentialCollection from a single + * File or String fi:
      + * FileSequentialCollection fcollect = + * new FileSequentialCollection(Collections.singletonList(fi));
    • + *
    + * This class will throw an IllegalArgumentException if there + * are things that are not existing Files or String paths to existing files + * in the input collection (from the Iterator). + * + * @author Christopher Manning + * @version 1.0, August 2002 + * @see FileArrayList + */ +public class FileSequentialCollection extends AbstractCollection { + + /** + * Stores the input collection over which we work. This is + * commonly a brief summary of a full set of files. + */ + private Collection coll; + + /** + * A filter for files to match. + */ + private FileFilter filt; + + private boolean includeDirs; + + + /** + * Creates an empty FileSequentialCollection, with no Files + * in it. Since a FileSequentialCollection is not + * modifiable, this is + * largely useless (except if you want an empty one). + */ + public FileSequentialCollection() { + this(null); + } + + + /** + * Creates a FileSequentialCollection from the passed in + * Collection. The constructor iterates through the + * collection. For each element, if it is a File or + * String, then this file path is traversed for addition + * to the collection. If the argument is of some other type, an + * IllegalArgumentException is thrown. + * For each File or String, if they + * do not correspond to directories, then they are added to the + * collection; if they do, they are recursively explored and all + * non-directories within them are added to the collection. + * + * @param c The collection to build the + * FileSequentialCollection from + */ + public FileSequentialCollection(Collection c) { + this(c, null); + } + + + /** + * Creates a FileSequentialCollection from the passed in + * File path. If the File + * does not correspond to a directory, then it is added to the + * collection; if it does, it is explored. Files + * that match the extension, and files in subfolders that match, if + * appropriate, are added to the collection. + * This is an additional convenience constructor. 
+ * + * @param path file or directory to load from + * @param suffix suffix (normally "File extension") of files to load + * @param recursively true means descend into subdirectories as well + */ + public FileSequentialCollection(File path, String suffix, boolean recursively) { + this(Collections.singletonList(path), suffix, recursively); + } + + + /** + * Creates a FileSequentialCollection from the passed in + * Collection. The constructor iterates through the + * collection. For each element, if it is a File, then the + * File is added to the collection, if it is a + * String, then a File corresponding to this + * String as a file path is added to the collection, and + * if the argument is of some other type, an + * IllegalArgumentException is thrown. For the files + * thus specified, they are included in the collection only if they + * match an extension filter as specified by the other arguments. + * + * @param c Collection of files or directories as Files or Strings + * @param suffix suffix (normally "File extension") of files to load + * @param recursively true means descend into subdirectories as well + */ + public FileSequentialCollection(Collection c, String suffix, boolean recursively) { + this(c, new ExtensionFileFilter(suffix, recursively), false); + } + + + /** + * Creates a FileSequentialCollection from the passed in + * Collection. The constructor iterates through the + * collection. For each element, if it is a File or + * String then these file paths are processed as + * explained below. + * If the argument is of some other type, an + * IllegalArgumentException is thrown. For the files + * specified, if they are not directories, they are included in the + * collection. If they are directories, files inside them are + * included iff they match the FileFilter. This will + * include recursive directory descent iff the FileFilter + * accepts directories. 
+ * If the path is a directory then only + * files within the directory (perhaps recursively) that satisfy the + * filter are processed. If the path is a file, then + * that file is processed regardless of whether it satisfies the + * filter. (This semantics was adopted, since otherwise there was no + * easy way to go through all the files in a directory without + * descending recursively via the specification of a + * FileFilter.) + * + * @param c The collection of file or directory to load from + * @param filt A FileFilter of files to load. This may be + * null, in which case all files are accepted. + */ + public FileSequentialCollection(Collection c, FileFilter filt) { + this(c, filt, false); + } + + public FileSequentialCollection(String filename, FileFilter filt) { + this(Collections.singletonList(filename), filt); + } + + + /** + * Creates a FileSequentialCollection from the passed in + * Collection. The constructor iterates through the + * collection. For each element, if it is a File or + * String then these file paths are processed as + * explained below. + * If the argument is of some other type, an + * IllegalArgumentException is thrown. For the files + * specified, if they are not directories, they are included in the + * collection. If they are directories, files inside them are + * included iff they match the FileFilter. This will + * include recursive directory descent iff the FileFilter + * accepts directories. + * If the path is a directory then only + * files within the directory (perhaps recursively) that satisfy the + * filter are processed. If the path is a file, then + * that file is processed regardless of whether it satisfies the + * filter. (This semantics was adopted, since otherwise there was no + * easy way to go through all the files in a directory without + * descending recursively via the specification of a + * FileFilter.) + * + * @param c The collection of file or directory to load from.
An + * argument of null is interpreted like an + * empty collection. + * @param filt A FileFilter of files to load. This may be + * null, in which case all files are accepted + * @param includeDirs Whether to include directory names in the file list + */ + public FileSequentialCollection(Collection c, FileFilter filt, boolean includeDirs) { + super(); + // store the arguments. They are expanded by the iterator + if (c == null) { + coll = new ArrayList(); + } else { + coll = c; + } + this.filt = filt; + this.includeDirs = includeDirs; + } + + + /** + * Returns the size of the FileSequentialCollection. + * + * @return size How many files are in the collection + */ + @SuppressWarnings({"UnusedDeclaration","unused"}) + @Override + public int size() { + int counter = 0; + for (File f : this) { + counter++; + } + return counter; + } + + + /** + * Return an Iterator over files in the collection. + * This version lazily works its way down directories. + */ + @Override + public Iterator iterator() { + return new FileSequentialCollectionIterator(); + } + + + /** + * This is the iterator that gets returned + */ + private final class FileSequentialCollectionIterator implements Iterator { + + // current state is a rootsIterator, a position in a recursion + // under a directory listing, and a pointer in the current + // directory. 
+ + private Object[] roots; // these may be of type File or String + private int rootsIndex; + // these next two simulate a list of pairs, but I was too lazy to + // make an extra class + private Stack fileArrayStack; + private Stack fileArrayStackIndices; + private File next; + + public FileSequentialCollectionIterator() { + // System.err.println("Coll is " + coll); + roots = coll.toArray(); + rootsIndex = 0; + fileArrayStack = new Stack(); + fileArrayStackIndices = new Stack(); + if (roots.length > 0) { + fileArrayStack.add(roots[rootsIndex]); + fileArrayStackIndices.push(Integer.valueOf(0)); + } + next = primeNextFile(); + } + + public boolean hasNext() { + return next != null; + } + + /** + * Returns the next element in the iteration. + */ + public File next() { + if (next == null) { + throw new NoSuchElementException("FileSequentialCollection exhausted"); + } + File ret = next; + next = primeNextFile(); + return ret; + } + + /** + * Not supported + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * Returns the next file to be accessed, or null if + * there are none left. This is all quite hairy to write as an + * iterator.... 
+ * + * @return The next file + */ + private File primeNextFile() { + while (rootsIndex < roots.length) { + while (!fileArrayStack.empty()) { + // System.err.println("fileArrayStack: " + fileArrayStack); + Object obj = fileArrayStack.peek(); + if (obj instanceof File[]) { + // System.err.println("Got a File[]"); + File[] files = (File[]) obj; + Integer index = fileArrayStackIndices.pop(); + int ind = index.intValue(); + if (ind < files.length) { + index = Integer.valueOf(ind + 1); + fileArrayStackIndices.push(index); + fileArrayStack.push(files[ind]); + // loop around to process this new file + } else { + // this directory is finished and we pop up + fileArrayStack.pop(); + } + } else { + // take it off the stack: tail recursion optimization + fileArrayStack.pop(); + if (obj instanceof String) { + obj = new File((String) obj); + } + if (!(obj instanceof File)) { + throw new IllegalArgumentException("Collection elements must be Files or Strings"); + } + File path = (File) obj; + if (path.isDirectory()) { + // System.err.println("Got directory " + path); + // if path is a directory, look into it + File[] directoryListing = path.listFiles(filt); + if (directoryListing == null) { + throw new IllegalArgumentException("Directory access problem for: " + path); + } + // System.err.println(" with " + + // directoryListing.length + " files in it."); + if (includeDirs) { + // System.err.println("Include dir as answer"); + if (directoryListing.length > 0) { + fileArrayStack.push(directoryListing); + fileArrayStackIndices.push(Integer.valueOf(0)); + } + return path; + } else { + // we don't include the dir, so we'll push + // the directory and loop around again ... 
+ if (directoryListing.length > 0) { + fileArrayStack.push(directoryListing); + fileArrayStackIndices.push(Integer.valueOf(0)); + } + // otherwise there was nothing in the + // directory; we will pop back up + } + } else { + // it's just a fixed file + // System.err.println("Got a plain file " + path); + if (!path.exists()) { + throw new IllegalArgumentException("File doesn't exist: " + path); + } + return path; + } + } + // go through loop again. we've pushed or popped as needed + } + // finished this root entry; go on to the next + rootsIndex++; + if (rootsIndex < roots.length) { + fileArrayStack.add(roots[rootsIndex]); + fileArrayStackIndices.push(Integer.valueOf(0)); + } + } + // finished everything + return null; + } + + } + + + /** + * This is simply a debugging aid that tests the functionality of + * the class. The supplied arguments are put in a + * Collection, and passed to the + * FileSequentialCollection constructor. + * An iterator is then used to print the names of all the files + * (but not directories) in the collection. 
+ * + * @param args A list of file paths + */ + public static void main(String[] args) { + FileSequentialCollection fcollect = new FileSequentialCollection(Arrays.asList(args)); + for (File fi: fcollect) { + System.out.println(fi); + } + + // test the other constructors + System.out.println("Above was Collection constructor"); + System.out.println("Empty constructor"); + FileSequentialCollection fcollect2 = new FileSequentialCollection(); + for (File fi : fcollect2) { + System.out.println(fi); + } + + System.out.println("File String(mrg) boolean(true) constructor"); + FileSequentialCollection fcollect3 = new FileSequentialCollection(new File(args[0]), "mrg", true); + for (File fi : fcollect3) { + System.out.println(fi); + } + + System.out.println("Collection String(mrg) boolean constructor"); + FileSequentialCollection fcollect4 = new FileSequentialCollection(Arrays.asList(args), "mrg", true); + for (File fi: fcollect4) { + System.out.println(fi); + } + + System.out.println("Testing number range file filter"); + FileSequentialCollection fcollect5 = new FileSequentialCollection(Arrays.asList(args), new NumberRangeFileFilter(320, 410, true)); + for (File fi: fcollect5) { + System.out.println(fi); + } + + System.out.println("Testing null filter but include dirs"); + FileSequentialCollection fcollect6 = new FileSequentialCollection(Arrays.asList(args), (FileFilter) null, true); + for (File fi : fcollect6) { + System.out.println(fi); + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/IOUtils.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/IOUtils.java new file mode 100644 index 0000000..93099f2 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/IOUtils.java @@ -0,0 +1,1434 @@ +package edu.stanford.nlp.io; + +import edu.stanford.nlp.util.*; + +import java.io.*; +import java.lang.reflect.InvocationTargetException; +import java.net.InetAddress; +import 
java.net.SocketTimeoutException; +import java.net.URL; +import java.net.URLConnection; +import java.util.*; +import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +/** + * Helper Class for various I/O related things. + * + * @author Kayur Patel, Teg Grenager + */ + +public class IOUtils { + + private static final int SLURPBUFFSIZE = 16000; + + public static final String eolChar = System.getProperty("line.separator"); + public static final String defaultEncoding = "utf-8"; + + // A class of static methods + private IOUtils() { } + + /** + * Write object to a file with the specified name. + * + * @param o Object to be written to file + * @param filename Name of the temp file + * @throws IOException If can't write file. + * @return File containing the object + */ + public static File writeObjectToFile(Object o, String filename) + throws IOException { + return writeObjectToFile(o, new File(filename)); + } + + /** + * Write an object to a specified File. + * + * @param o Object to be written to file + * @param file The temp File + * @throws IOException If File cannot be written + * @return File containing the object + */ + public static File writeObjectToFile(Object o, File file) throws IOException { + return writeObjectToFile(o, file, false); + } + + /** + * Write an object to a specified File. 
+ * + * @param o Object to be written to file + * @param file The temp File + * @param append If true, append to this file instead of overwriting it + * @throws IOException If File cannot be written + * @return File containing the object + */ + public static File writeObjectToFile(Object o, File file, boolean append) throws IOException { + // file.createNewFile(); // cdm may 2005: does nothing needed + ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream( + new GZIPOutputStream(new FileOutputStream(file, append)))); + oos.writeObject(o); + oos.close(); + return file; + } + + /** + * Write object to a file with the specified name. + * + * @param o Object to be written to file + * @param filename Name of the temp file + * @return File containing the object, or null if an exception was caught + */ + public static File writeObjectToFileNoExceptions(Object o, String filename) { + File file = null; + ObjectOutputStream oos = null; + try { + file = new File(filename); + // file.createNewFile(); // cdm may 2005: does nothing needed + oos = new ObjectOutputStream(new BufferedOutputStream( + new GZIPOutputStream(new FileOutputStream(file)))); + oos.writeObject(o); + oos.close(); + } catch (Exception e) { + e.printStackTrace(); + } finally { + closeIgnoringExceptions(oos); + } + return file; + } + + /** + * Write object to temp file which is destroyed when the program exits. + * + * @param o Object to be written to file + * @param filename Name of the temp file + * @throws IOException If file cannot be written + * @return File containing the object + */ + public static File writeObjectToTempFile(Object o, String filename) + throws IOException { + File file = File.createTempFile(filename, ".tmp"); + file.deleteOnExit(); + writeObjectToFile(o, file); + return file; + } + + /** + * Write object to a temp file and ignore exceptions. 
+ * + * @param o Object to be written to file + * @param filename Name of the temp file + * @return File containing the object + */ + public static File writeObjectToTempFileNoExceptions(Object o, String filename) { + try { + return writeObjectToTempFile(o, filename); + } catch (Exception e) { + System.err.println("Error writing object to file " + filename); + e.printStackTrace(); + return null; + } + } + + //++ todo [cdm, Aug 2012]: None of the methods below in this block are used. Delete them all? + //++ They're also kind of weird in unnecessarily bypassing using a Writer. + + /** + * Writes a string to a file. + * + * @param contents The string to write + * @param path The file path + * @param encoding The encoding to encode in + * @throws IOException In case of failure + */ + public static void writeStringToFile(String contents, String path, String encoding) throws IOException { + OutputStream writer; + if (path.endsWith(".gz")) { + writer = new GZIPOutputStream(new FileOutputStream(path)); + } else { + writer = new BufferedOutputStream(new FileOutputStream(path)); + } + writer.write(contents.getBytes(encoding)); + writer.close(); + } + + /** + * Writes a string to a file, as UTF-8. 
+ * + * @param contents The string to write + * @param path The file path + * @throws IOException In case of failure + */ + + /** + * Writes a string to a file, squashing exceptions + * + * @param contents The string to write + * @param path The file path + * @param encoding The encoding to encode in + * */ + public static void writeStringToFileNoExceptions(String contents, String path, String encoding) { + OutputStream writer = null; + try{ + if (path.endsWith(".gz")) { + writer = new GZIPOutputStream(new FileOutputStream(path)); + } else { + writer = new BufferedOutputStream(new FileOutputStream(path)); + } + writer.write(contents.getBytes(encoding)); + } catch (Exception e) { + e.printStackTrace(); + } finally { + if(writer != null){ closeIgnoringExceptions(writer); } + } + } + + /** + * Writes a string to a temporary file + * + * @param contents The string to write + * @param path The file path + * @param encoding The encoding to encode in + * @throws IOException In case of failure + * @return The File written to + */ + public static File writeStringToTempFile(String contents, String path, String encoding) throws IOException { + OutputStream writer; + File tmp = File.createTempFile(path,".tmp"); + if (path.endsWith(".gz")) { + writer = new GZIPOutputStream(new FileOutputStream(tmp)); + } else { + writer = new BufferedOutputStream(new FileOutputStream(tmp)); + } + writer.write(contents.getBytes(encoding)); + return tmp; + } + + /** + * Writes a string to a temporary file, as UTF-8 + * + * @param contents The string to write + * @param path The file path + * @throws IOException In case of failure + */ + public static void writeStringToTempFile(String contents, String path) throws IOException { + writeStringToTempFile(contents, path, "UTF-8"); + } + + /** + * Writes a string to a temporary file, squashing exceptions + * + * @param contents The string to write + * @param path The file path + * @param encoding The encoding to encode in + * @return The File that was 
written to + */ + public static File writeStringToTempFileNoExceptions(String contents, String path, String encoding) { + OutputStream writer = null; + File tmp = null; + try { + tmp = File.createTempFile(path,".tmp"); + if (path.endsWith(".gz")) { + writer = new GZIPOutputStream(new FileOutputStream(tmp)); + } else { + writer = new BufferedOutputStream(new FileOutputStream(tmp)); + } + writer.write(contents.getBytes(encoding)); + } catch (Exception e) { + e.printStackTrace(); + } finally { + closeIgnoringExceptions(writer); + } + return tmp; + } + + /** + * Writes a string to a temporary file with UTF-8 encoding, squashing exceptions + * + * @param contents The string to write + * @param path The file path + */ + public static void writeStringToTempFileNoExceptions(String contents, String path) { + writeStringToTempFileNoExceptions(contents, path, "UTF-8"); + } + + //-- todo [cdm, Aug 2012]: None of the methods above in the block are used. Delete them all? + + + /** + * Read an object from a stored file. + * + * @param file The file pointing to the object to be retrieved + * @throws IOException If file cannot be read + * @throws ClassNotFoundException If reading serialized object fails + * @return The object read from the file. + */ + public static T readObjectFromFile(File file) throws IOException, + ClassNotFoundException { + ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream( + new GZIPInputStream(new FileInputStream(file)))); + Object o = ois.readObject(); + ois.close(); + return ErasureUtils.uncheckedCast(o); + } + + /** + * Read an object from a stored file. The file can be anything obtained + * via a URL, the filesystem, or the classpath (eg in a jar file). + * + * @param filename The file pointing to the object to be retrieved + * @throws IOException If file cannot be read + * @throws ClassNotFoundException If reading serialized object fails + * @return The object read from the file. 
+ */ + public static T readObjectFromURLOrClasspathOrFileSystem(String filename) throws IOException, ClassNotFoundException { + ObjectInputStream ois = new ObjectInputStream(getInputStreamFromURLOrClasspathOrFileSystem(filename)); + Object o = ois.readObject(); + ois.close(); + return ErasureUtils.uncheckedCast(o); + } + + public static T readObjectFromObjectStream(ObjectInputStream ois) throws IOException, + ClassNotFoundException { + Object o = ois.readObject(); + return ErasureUtils.uncheckedCast(o); + } + + /** + * Read an object from a stored file. + * + * @param filename The filename of the object to be retrieved + * @throws IOException If file cannot be read + * @throws ClassNotFoundException If reading serialized object fails + * @return The object read from the file. + */ + public static T readObjectFromFile(String filename) throws IOException, + ClassNotFoundException { + return ErasureUtils.uncheckedCast(readObjectFromFile(new File(filename))); + } + + /** + * Read an object from a stored file without throwing exceptions. + * + * @param file The file pointing to the object to be retrieved + * @return The object read from the file, or null if an exception occurred. 
+ */ + public static T readObjectFromFileNoExceptions(File file) { + Object o = null; + try { + ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream( + new GZIPInputStream(new FileInputStream(file)))); + o = ois.readObject(); + ois.close(); + } catch (IOException e) { + e.printStackTrace(); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + return ErasureUtils.uncheckedCast(o); + } + + public static int lineCount(File textFile) throws IOException { + BufferedReader r = new BufferedReader(new FileReader(textFile)); + int numLines = 0; + while (r.readLine() != null) { + numLines++; + } + return numLines; + } + + public static ObjectOutputStream writeStreamFromString(String serializePath) + throws IOException { + ObjectOutputStream oos; + if (serializePath.endsWith(".gz")) { + oos = new ObjectOutputStream(new BufferedOutputStream( + new GZIPOutputStream(new FileOutputStream(serializePath)))); + } else { + oos = new ObjectOutputStream(new BufferedOutputStream( + new FileOutputStream(serializePath))); + } + + return oos; + } + + public static ObjectInputStream readStreamFromString(String filenameOrUrl) + throws IOException { + InputStream is = getInputStreamFromURLOrClasspathOrFileSystem(filenameOrUrl); + return new ObjectInputStream(is); + } + + /** + * Locates this file either in the CLASSPATH or in the file system. The CLASSPATH takes priority. + * @param name The file or resource name + * @throws FileNotFoundException If the file does not exist + * @return The InputStream of name, or null if not found + */ + private static InputStream findStreamInClasspathOrFileSystem(String name) throws FileNotFoundException { + // ms 10-04-2010: + // - even though this may look like a regular file, it may be a path inside a jar in the CLASSPATH + // - check for this first. This takes precedence over the file system. 
+ InputStream is = IOUtils.class.getClassLoader().getResourceAsStream(name); + // windows File.separator is \, but getting resources only works with / + if (is == null) { + is = IOUtils.class.getClassLoader().getResourceAsStream(name.replaceAll("\\\\", "/")); + } + // if not found in the CLASSPATH, load from the file system + if (is == null) is = new FileInputStream(name); + return is; + } + + /** + * Locates this file either using the given URL, or in the CLASSPATH, or in the file system + * The CLASSPATH takes priority over the file system! + * This stream is buffered and gunzipped (if necessary). + * + * @param textFileOrUrl + * @return An InputStream for loading a resource + * @throws IOException + */ + public static InputStream getInputStreamFromURLOrClasspathOrFileSystem(String textFileOrUrl) + throws IOException + { + InputStream in; + if (textFileOrUrl.matches("https?://.*")) { + URL u = new URL(textFileOrUrl); + URLConnection uc = u.openConnection(); + in = uc.getInputStream(); + } else { + try { + in = findStreamInClasspathOrFileSystem(textFileOrUrl); + } catch (FileNotFoundException e) { + try { + // Maybe this happens to be some other format of URL? + URL u = new URL(textFileOrUrl); + URLConnection uc = u.openConnection(); + in = uc.getInputStream(); + } catch (IOException e2) { + // Don't make the original exception a cause, since it is almost certainly bogus + throw new IOException("Unable to resolve \"" + + textFileOrUrl + "\" as either " + + "class path, filename or URL"); // , e2); + } + } + } + + // buffer this stream + in = new BufferedInputStream(in); + + // gzip it if necessary + if (textFileOrUrl.endsWith(".gz")) + in = new GZIPInputStream(in); + + return in; + } + + /** + * Open a BufferedReader to a file or URL specified by a String name. 
If the + * String starts with https?://, then it is first tried as a URL, otherwise it + * is next tried as a resource on the CLASSPATH, and then finally it is tried + * as a local file or other network-available file . If the String ends in .gz, it + * is interpreted as a gzipped file (and uncompressed). The file is then + * interpreted as a utf-8 text file. + * + * @param textFileOrUrl What to read from + * @return The BufferedReader + * @throws IOException If there is an I/O problem + */ + public static BufferedReader readerFromString(String textFileOrUrl) + throws IOException { + return new BufferedReader(new InputStreamReader( + getInputStreamFromURLOrClasspathOrFileSystem(textFileOrUrl), "UTF-8")); + } + + /** + * Open a BufferedReader to a file or URL specified by a String name. If the + * String starts with https?://, then it is first tried as a URL, otherwise it + * is next tried as a resource on the CLASSPATH, and then finally it is tried + * as a local file or other network-available file . If the String ends in .gz, it + * is interpreted as a gzipped file (and uncompressed), else it is interpreted as + * a regular text file in the given encoding. + * + * @param textFileOrUrl What to read from + * @param encoding CharSet encoding. Maybe be null, in which case the + * platform default encoding is used + * @return The BufferedReader + * @throws IOException If there is an I/O problem + */ + public static BufferedReader readerFromString(String textFileOrUrl, + String encoding) throws IOException { + InputStream is = getInputStreamFromURLOrClasspathOrFileSystem(textFileOrUrl); + if (encoding == null) { + return new BufferedReader(new InputStreamReader(is)); + } + return new BufferedReader(new InputStreamReader(is, encoding)); + } + + /** + * Returns an Iterable of the lines in the file. + * + * The file reader will be closed when the iterator is exhausted. 
IO errors + * will throw an (unchecked) RuntimeIOException + * + * @param path The file whose lines are to be read. + * @return An Iterable containing the lines from the file. + */ + public static Iterable readLines(String path) { + return readLines(new File(path)); + } + + /** + * Returns an Iterable of the lines in the file. + * + * The file reader will be closed when the iterator is exhausted. IO errors + * will throw an (unchecked) RuntimeIOException + * + * @param path The file whose lines are to be read. + * @param encoding The encoding to use when reading lines. + * @return An Iterable containing the lines from the file. + */ + public static Iterable readLines(String path, String encoding) { + return readLines(new File(path), null, encoding); + } + + /** + * Returns an Iterable of the lines in the file. + * + * The file reader will be closed when the iterator is exhausted. + * + * @param file The file whose lines are to be read. + * @return An Iterable containing the lines from the file. + */ + public static Iterable readLines(final File file) { + return readLines(file, null, null); + } + + /** + * Returns an Iterable of the lines in the file. + * + * The file reader will be closed when the iterator is exhausted. + * + * @param file The file whose lines are to be read. + * @param fileInputStreamWrapper + * The class to wrap the InputStream with, e.g. GZIPInputStream. Note + * that the class must have a constructor that accepts an + * InputStream. + * @return An Iterable containing the lines from the file. + */ + public static Iterable readLines(final File file, + final Class fileInputStreamWrapper) { + return readLines(file, fileInputStreamWrapper, null); + } + + /** + * Returns an Iterable of the lines in the file, wrapping the generated + * FileInputStream with an instance of the supplied class. IO errors will + * throw an (unchecked) RuntimeIOException + * + * @param file The file whose lines are to be read. 
+ * @param fileInputStreamWrapper + * The class to wrap the InputStream with, e.g. GZIPInputStream. Note + * that the class must have a constructor that accepts an + * InputStream. + * @param encoding The encoding to use when reading lines. + * @return An Iterable containing the lines from the file. + */ + public static Iterable readLines(final File file, + final Class fileInputStreamWrapper, + final String encoding) { + + return new Iterable() { + public Iterator iterator() { + return new Iterator() { + + protected BufferedReader reader = this.getReader(); + protected String line = this.getLine(); + + public boolean hasNext() { + return this.line != null; + } + + public String next() { + String nextLine = this.line; + if (nextLine == null) { + throw new NoSuchElementException(); + } + line = getLine(); + return nextLine; + } + + protected String getLine() { + try { + String result = this.reader.readLine(); + if (result == null) { + this.reader.close(); + } + return result; + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + protected BufferedReader getReader() { + try { + InputStream stream = new FileInputStream(file); + if (fileInputStreamWrapper != null) { + stream = fileInputStreamWrapper.getConstructor( + InputStream.class).newInstance(stream); + } + if (encoding == null) { + return new BufferedReader(new InputStreamReader(stream)); + } else { + return new BufferedReader(new InputStreamReader(stream, encoding)); + } + } catch (Exception e) { + throw new RuntimeIOException(e); + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } + + /** + * Quietly opens a File. If the file ends with a ".gz" extension, + * automatically opens a GZIPInputStream to wrap the constructed + * FileInputStream. 
+ */ + public static InputStream openFile(File file) throws RuntimeIOException { + try { + InputStream is = new BufferedInputStream(new FileInputStream(file)); + if (file.getName().endsWith(".gz")) { + is = new GZIPInputStream(is); + } + return is; + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * Provides an implementation of closing a file for use in a finally block so + * you can correctly close a file without even more exception handling stuff. + * From a suggestion in a talk by Josh Bloch. + * + * @param c The IO resource to close (e.g., a Stream/Reader) + */ + public static void closeIgnoringExceptions(Closeable c) { + if (c != null) { + try { + c.close(); + } catch (IOException ioe) { + // ignore + } + } + } + + /** + * Iterate over all the files in the directory, recursively. + * + * @param dir + * The root directory. + * @return All files within the directory. + */ + public static Iterable iterFilesRecursive(final File dir) { + return iterFilesRecursive(dir, (Pattern) null); + } + + /** + * Iterate over all the files in the directory, recursively. + * + * @param dir + * The root directory. + * @param ext + * A string that must be at the end of all files (e.g. ".txt") + * @return All files within the directory ending in the given extension. + */ + public static Iterable iterFilesRecursive(final File dir, + final String ext) { + return iterFilesRecursive(dir, Pattern.compile(Pattern.quote(ext) + "$")); + } + + /** + * Iterate over all the files in the directory, recursively. + * + * @param dir + * The root directory. + * @param pattern + * A regular expression that the file path must match. This uses + * Matcher.find(), so use ^ and $ to specify endpoints. + * @return All files within the directory. 
+ */ + public static Iterable iterFilesRecursive(final File dir, + final Pattern pattern) { + return new Iterable() { + public Iterator iterator() { + return new AbstractIterator() { + private final Queue files = new LinkedList(Collections + .singleton(dir)); + private File file = this.findNext(); + + @Override + public boolean hasNext() { + return this.file != null; + } + + @Override + public File next() { + File result = this.file; + if (result == null) { + throw new NoSuchElementException(); + } + this.file = this.findNext(); + return result; + } + + private File findNext() { + File next = null; + while (!this.files.isEmpty() && next == null) { + next = this.files.remove(); + if (next.isDirectory()) { + files.addAll(Arrays.asList(next.listFiles())); + next = null; + } else if (pattern != null) { + if (!pattern.matcher(next.getPath()).find()) { + next = null; + } + } + } + return next; + } + }; + } + }; + } + + /** + * Returns all the text in the given File. + */ + public static String slurpFile(File file) throws IOException { + Reader r = new FileReader(file); + return IOUtils.slurpReader(r); + } + + /** + * Returns all the text in the given File. + * + * @param file The file to read from + * @param encoding The character encoding to assume. This may be null, and + * the platform default character encoding is used. + */ + public static String slurpFile(File file, String encoding) throws IOException { + return IOUtils.slurpReader(IOUtils.encodedInputStreamReader( + new FileInputStream(file), encoding)); + } + + /** + * Returns all the text in the given File. + */ + public static String slurpGZippedFile(String filename) throws IOException { + Reader r = new InputStreamReader(new GZIPInputStream(new FileInputStream( + filename))); + return IOUtils.slurpReader(r); + } + + /** + * Returns all the text in the given File. 
+ */ + public static String slurpGZippedFile(File file) throws IOException { + Reader r = new InputStreamReader(new GZIPInputStream(new FileInputStream( + file))); + return IOUtils.slurpReader(r); + } + + public static String slurpGBFileNoExceptions(String filename) { + return IOUtils.slurpFileNoExceptions(filename, "GB18030"); + } + + /** + * Returns all the text in the given file with the given encoding. + */ + public static String slurpFile(String filename, String encoding) + throws IOException { + Reader r = new InputStreamReader(new FileInputStream(filename), encoding); + return IOUtils.slurpReader(r); + } + + /** + * Returns all the text in the given file with the given encoding. If the file + * cannot be read (non-existent, etc.), then and only then the method returns + * null. + */ + public static String slurpFileNoExceptions(String filename, String encoding) { + try { + return slurpFile(filename, encoding); + } catch (IOException e) { + throw new RuntimeIOException("slurpFile IO problem", e); + } + } + + public static String slurpGBFile(String filename) throws IOException { + return slurpFile(filename, "GB18030"); + } + + /** + * Returns all the text in the given file + * + * @return The text in the file. + */ + public static String slurpFile(String filename) throws IOException { + return IOUtils.slurpReader(new FileReader(filename)); + } + + /** + * Returns all the text at the given URL. + */ + public static String slurpGBURL(URL u) throws IOException { + return IOUtils.slurpURL(u, "GB18030"); + } + + /** + * Returns all the text at the given URL. + */ + public static String slurpGBURLNoExceptions(URL u) { + try { + return slurpGBURL(u); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + /** + * Returns all the text at the given URL. 
+ */ + public static String slurpURLNoExceptions(URL u, String encoding) { + try { + return IOUtils.slurpURL(u, encoding); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + /** + * Returns all the text at the given URL. + */ + public static String slurpURL(URL u, String encoding) throws IOException { + String lineSeparator = System.getProperty("line.separator"); + URLConnection uc = u.openConnection(); + uc.setReadTimeout(30000); + InputStream is; + try { + is = uc.getInputStream(); + } catch (SocketTimeoutException e) { + // e.printStackTrace(); + System.err.println("Time out. Return empty string"); + return ""; + } + BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding)); + String temp; + StringBuilder buff = new StringBuilder(16000); // make biggish + while ((temp = br.readLine()) != null) { + buff.append(temp); + buff.append(lineSeparator); + } + br.close(); + return buff.toString(); + } + + /** + * Returns all the text at the given URL. + */ + public static String slurpURL(URL u) throws IOException { + String lineSeparator = System.getProperty("line.separator"); + URLConnection uc = u.openConnection(); + InputStream is = uc.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String temp; + StringBuilder buff = new StringBuilder(16000); // make biggish + while ((temp = br.readLine()) != null) { + buff.append(temp); + buff.append(lineSeparator); + } + br.close(); + return buff.toString(); + } + + /** + * Returns all the text at the given URL. + */ + public static String slurpURLNoExceptions(URL u) { + try { + return slurpURL(u); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + /** + * Returns all the text at the given URL. + */ + public static String slurpURL(String path) throws Exception { + return slurpURL(new URL(path)); + } + + /** + * Returns all the text at the given URL. 
If the file cannot be read + * (non-existent, etc.), then and only then the method returns + * null. + */ + public static String slurpURLNoExceptions(String path) { + try { + return slurpURL(path); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + /** + * Returns all the text in the given File. + * + * @return The text in the file. May be an empty string if the file is empty. + * If the file cannot be read (non-existent, etc.), then and only then + * the method returns null. + */ + public static String slurpFileNoExceptions(File file) { + try { + return IOUtils.slurpReader(new FileReader(file)); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + /** + * Returns all the text in the given File. + * + * @return The text in the file. May be an empty string if the file is empty. + * If the file cannot be read (non-existent, etc.), then and only then + * the method returns null. + */ + public static String slurpFileNoExceptions(String filename) { + try { + return slurpFile(filename); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + /** + * Returns all the text from the given Reader. + * Closes the Reader when done. + * + * @return The text in the file. + */ + public static String slurpReader(Reader reader) { + BufferedReader r = new BufferedReader(reader); + StringBuilder buff = new StringBuilder(); + try { + char[] chars = new char[SLURPBUFFSIZE]; + while (true) { + int amountRead = r.read(chars, 0, SLURPBUFFSIZE); + if (amountRead < 0) { + break; + } + buff.append(chars, 0, amountRead); + } + r.close(); + } catch (Exception e) { + throw new RuntimeIOException("slurpReader IO problem", e); + } + return buff.toString(); + } + + /** + * Send all bytes from the input stream to the output stream. + * + * @param input + * The input bytes. + * @param output + * Where the bytes should be written. 
+ */ + public static void writeStreamToStream(InputStream input, OutputStream output) + throws IOException { + byte[] buffer = new byte[4096]; + while (true) { + int len = input.read(buffer); + if (len == -1) { + break; + } + output.write(buffer, 0, len); + } + } + + /** + * Read in a CSV formatted file with a header row + * @param path - path to CSV file + * @param quoteChar - character for enclosing strings, defaults to " + * @param escapeChar - character for escaping quotes appearing in quoted strings; defaults to " (i.e. "" is used for " inside quotes, consistent with Excel) + * @return a list of maps representing the rows of the csv. The maps' keys are the header strings and their values are the row contents + * @throws IOException + */ + public static List> readCSVWithHeader(String path, char quoteChar, char escapeChar) throws IOException { + String[] labels = null; + List> rows = Generics.newArrayList(); + for (String line : IOUtils.readLines(path)) { + System.out.println("Splitting "+line); + if (labels == null) { + labels = StringUtils.splitOnCharWithQuoting(line,',','"',escapeChar); + } else { + String[] cells = StringUtils.splitOnCharWithQuoting(line,',',quoteChar,escapeChar); + assert(cells.length == labels.length); + Map cellMap = Generics.newHashMap(); + for (int i=0; i> readCSVWithHeader(String path) throws IOException { + return readCSVWithHeader(path, '"', '"'); + } + + /** + * Read a CSV file character by character. 
Allows for multi-line CSV files (in quotes), but + * is less flexible and likely slower than readCSVWithHeader() + * @param csvContents The char[] array corresponding to the contents of the file + * @param numColumns The number of columns in the file (for verification, primarily) + * @return A list of lines in the file + */ + public static LinkedList readCSVStrictly(char[] csvContents, int numColumns){ + //--Variables + StringBuilder[] buffer = new StringBuilder[numColumns]; + buffer[0] = new StringBuilder(); + LinkedList lines = new LinkedList(); + //--State + boolean inQuotes = false; + boolean nextIsEscaped = false; + int columnI = 0; + //--Read + for(int offset=0; offset= numColumns){ + throw new IllegalArgumentException("Too many columns: "+columnI+"/"+numColumns+" (offset: " + offset + ")"); + } + buffer[columnI] = new StringBuilder(); + } + break; + case '\n': + //(case: newline) + if(inQuotes){ + buffer[columnI].append('\n'); + } else { + //((error checks)) + if(columnI != numColumns-1){ + throw new IllegalArgumentException("Too few columns: "+columnI+"/"+numColumns+" (offset: " + offset + ")"); + } + //((create line)) + String[] rtn = new String[buffer.length]; + for(int i=0; i readCSVStrictly(String filename, int numColumns) throws IOException { + return readCSVStrictly(slurpFile(filename).toCharArray(), numColumns); + } + + /** + * Get a input file stream (automatically gunzip/bunzip2 depending on file extension) + * @param filename Name of file to open + * @return Input stream that can be used to read from the file + * @throws IOException if there are exceptions opening the file + */ + public static InputStream getFileInputStream(String filename) throws IOException { + InputStream in = new FileInputStream(filename); + if (filename.endsWith(".gz")) { + in = new GZIPInputStream(in); + } else if (filename.endsWith(".bz2")) { + //in = new CBZip2InputStream(in); + in = getBZip2PipedInputStream(filename); + } + return in; + } + + /** + * Get a output file 
stream (automatically gzip/bzip2 depending on file extension) + * @param filename Name of file to open + * @return Output stream that can be used to write to the file + * @throws IOException if there are exceptions opening the file + */ + public static OutputStream getFileOutputStream(String filename) throws IOException { + OutputStream out = new FileOutputStream(filename); + if (filename.endsWith(".gz")) { + out = new GZIPOutputStream(out); + } else if (filename.endsWith(".bz2")) { + //out = new CBZip2OutputStream(out); + out = getBZip2PipedOutputStream(filename); + } + return out; + } + + public static BufferedReader getBufferedFileReader(String filename) throws IOException { + return getBufferedFileReader(filename, defaultEncoding); + } + + public static BufferedReader getBufferedFileReader(String filename, String encoding) throws IOException { + InputStream in = getFileInputStream(filename); + return new BufferedReader(new InputStreamReader(in, encoding)); + } + + public static BufferedReader getBufferedReaderFromClasspathOrFileSystem(String filename) throws IOException { + return getBufferedReaderFromClasspathOrFileSystem(filename, defaultEncoding); + } + + public static BufferedReader getBufferedReaderFromClasspathOrFileSystem(String filename, String encoding) throws IOException { + InputStream in = findStreamInClasspathOrFileSystem(filename); + return new BufferedReader(new InputStreamReader(in, encoding)); + } + + public static PrintWriter getPrintWriter(File textFile) throws IOException { + File f = textFile.getAbsoluteFile(); + return new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f))), true); + } + + public static PrintWriter getPrintWriter(String filename) throws IOException { + return getPrintWriter(filename, defaultEncoding); + } + + public static PrintWriter getPrintWriterIgnoringExceptions(String filename) { + try { + return getPrintWriter(filename, defaultEncoding); + } catch (IOException ioe) { + return null; + } + 
} + + public static PrintWriter getPrintWriterOrDie(String filename) { + try { + return getPrintWriter(filename, defaultEncoding); + } catch (IOException ioe) { + throw new RuntimeIOException(ioe); + } + } + + public static PrintWriter getPrintWriter(String filename, String encoding) throws IOException { + OutputStream out = getFileOutputStream(filename); + if (encoding == null) { + encoding = defaultEncoding; + } + return new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, encoding)), true); + } + + public static InputStream getBZip2PipedInputStream(String filename) throws IOException + { + String bzcat = System.getProperty("bzcat", "bzcat"); + Runtime rt = Runtime.getRuntime(); + String cmd = bzcat + " " + filename; + //System.err.println("getBZip2PipedInputStream: Running command: "+cmd); + Process p = rt.exec(cmd); + Writer errWriter = new BufferedWriter(new OutputStreamWriter(System.err)); + StreamGobbler errGobler = new StreamGobbler(p.getErrorStream(), errWriter); + errGobler.start(); + return p.getInputStream(); + } + + public static OutputStream getBZip2PipedOutputStream(String filename) throws IOException + { + return new BZip2PipedOutputStream(filename); + } + + private static final Pattern tab = Pattern.compile("\t"); + /** + * Read column as set + * @param infile - filename + * @param field index of field to read + * @return a set of the entries in column field + * @throws IOException + */ + public static Set readColumnSet(String infile, int field) throws IOException + { + BufferedReader br = IOUtils.getBufferedFileReader(infile); + String line; + Set set = Generics.newHashSet(); + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() > 0) { + if (field < 0) { + set.add(line); + } else { + String[] fields = tab.split(line); + if (field < fields.length) { + set.add(fields[field]); + } + } + } + } + br.close(); + return set; + } + + public static List readObjectFromColumns(Class objClass, String filename, String[] 
fieldNames, String delimiter) + throws IOException, InstantiationException, IllegalAccessException, + NoSuchFieldException, NoSuchMethodException, InvocationTargetException + { + Pattern delimiterPattern = Pattern.compile(delimiter); + List list = new ArrayList(); + BufferedReader br = IOUtils.getBufferedFileReader(filename); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() > 0) { + C item = StringUtils.columnStringToObject(objClass, line, delimiterPattern, fieldNames); + list.add(item); + } + } + br.close(); + return list; + } + + public static Map readMap(String filename) throws IOException + { + Map map = Generics.newHashMap(); + try { + BufferedReader br = IOUtils.getBufferedFileReader(filename); + String line; + while ((line = br.readLine()) != null) { + String[] fields = tab.split(line,2); + map.put(fields[0], fields[1]); + } + br.close(); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + return map; + } + + + /** + * Returns the contents of a file as a single string. The string may be + * empty, if the file is empty. If there is an IOException, it is caught + * and null is returned. + */ + public static String stringFromFile(String filename) { + return stringFromFile(filename, defaultEncoding); + } + + /** + * Returns the contents of a file as a single string. The string may be + * empty, if the file is empty. If there is an IOException, it is caught + * and null is returned. Encoding can also be specified. 
+ */ + public static String stringFromFile(String filename, String encoding) { + try { + StringBuilder sb = new StringBuilder(); + BufferedReader in = new BufferedReader(new EncodingFileReader(filename,encoding)); + String line; + while ((line = in.readLine()) != null) { + sb.append(line); + sb.append(eolChar); + } + in.close(); + return sb.toString(); + } + catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + + /** + * Returns the contents of a file as a list of strings. The list may be + * empty, if the file is empty. If there is an IOException, it is caught + * and null is returned. + */ + public static List linesFromFile(String filename) { + return linesFromFile(filename, defaultEncoding); + } + + /** + * Returns the contents of a file as a list of strings. The list may be + * empty, if the file is empty. If there is an IOException, it is caught + * and null is returned. Encoding can also be specified + */ + public static List linesFromFile(String filename,String encoding) { + try { + List lines = new ArrayList(); + BufferedReader in = new BufferedReader(new EncodingFileReader(filename,encoding)); + String line; + while ((line = in.readLine()) != null) { + lines.add(line); + } + in.close(); + return lines; + } + catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + public static String backupName(String filename) { + return backupFile(new File(filename)).toString(); + } + + public static File backupFile(File file) { + int max = 1000; + String filename = file.toString(); + File backup = new File(filename + "~"); + if (!backup.exists()) { return backup; } + for (int i = 1; i <= max; i++) { + backup = new File(filename + ".~" + i + ".~"); + if (!backup.exists()) { return backup; } + } + return null; + } + + public static boolean renameToBackupName(File file) { + return file.renameTo(backupFile(file)); + } + + + /** + * A JavaNLP specific convenience routine for obtaining the current + * scratch directory for the machine 
you're currently running on. + */ + public static File getJNLPLocalScratch() { + try { + String machineName = InetAddress.getLocalHost().getHostName().split("\\.")[0]; + String username = System.getProperty("user.name"); + return new File("/"+machineName+"/scr1/"+username); + } catch (Exception e) { + return new File("./scr/"); // default scratch + } + } + + /** + * Given a filepath, makes sure a directory exists there. If not, creates and returns it. + * Same as ENSURE-DIRECTORY in CL. + * + * @param tgtDir The directory that you wish to ensure exists + * @throws IOException If directory can't be created, is an existing file, or for other reasons + */ + public static File ensureDir(File tgtDir) throws IOException { + if (tgtDir.exists()) { + if (tgtDir.isDirectory()) { + return tgtDir; + } else { + throw new IOException("Could not create directory "+tgtDir.getAbsolutePath()+", as a file already exists at that path."); + } + } else { + if ( ! tgtDir.mkdirs()) { + throw new IOException("Could not create directory "+tgtDir.getAbsolutePath()); + } + return tgtDir; + } + } + + public static void main(String[] args) { + System.out.println(backupName(args[0])); + } + + public static String getExtension(String fileName) { + if(!fileName.contains(".")) + return null; + int idx = fileName.lastIndexOf('.'); + return fileName.substring(idx+1); + } + + + /** Create a Reader with an explicit encoding around an InputStream. + * This static method will treat null as meaning to use the platform default, + * unlike the Java library methods that disallow a null encoding. 
+ * + * @param stream An InputStream + * @param encoding A charset encoding + * @return A Reader + * @throws IOException If any IO problem + */ + public static Reader encodedInputStreamReader(InputStream stream, String encoding) throws IOException { + // InputStreamReader doesn't allow encoding to be null; + if (encoding == null) { + return new InputStreamReader(stream); + } else { + return new InputStreamReader(stream, encoding); + } + } + + + /** Create a Reader with an explicit encoding around an InputStream. + * This static method will treat null as meaning to use the platform default, + * unlike the Java library methods that disallow a null encoding. + * + * @param stream An InputStream + * @param encoding A charset encoding + * @return A Reader + * @throws IOException If any IO problem + */ + public static Writer encodedOutputStreamWriter(OutputStream stream, String encoding) throws IOException { + // OutputStreamWriter doesn't allow encoding to be null; + if (encoding == null) { + return new OutputStreamWriter(stream); + } else { + return new OutputStreamWriter(stream, encoding); + } + } + + + /** Create a Reader with an explicit encoding around an InputStream. + * This static method will treat null as meaning to use the platform default, + * unlike the Java library methods that disallow a null encoding. 
+ * + * @param stream An InputStream + * @param encoding A charset encoding + * @param autoFlush Whether to make an autoflushing Writer + * @return A Reader + * @throws IOException If any IO problem + */ + public static PrintWriter encodedOutputStreamPrintWriter(OutputStream stream, + String encoding, boolean autoFlush) throws IOException { + // PrintWriter doesn't allow encoding to be null; or to have charset and flush + if (encoding == null) { + return new PrintWriter(stream, autoFlush); + } else { + return new PrintWriter(new OutputStreamWriter(stream, encoding), autoFlush); + } + } + + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/InDataStreamFile.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/InDataStreamFile.java new file mode 100644 index 0000000..cd5ced2 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/InDataStreamFile.java @@ -0,0 +1,30 @@ +/** + * Title: StanfordMaxEnt
+ * Description: A Maximum Entropy Toolkit
+ * Copyright: Copyright (c) Bruce Eckel
+ * Company: Stanford University
    + * @author Bruce Eckel + * @version 1.0 + */ + + +package edu.stanford.nlp.io; + +import java.io.*; +import java.net.URL; + +public class InDataStreamFile extends DataInputStream { + + public InDataStreamFile(String filename) throws FileNotFoundException { + this(new File(filename)); + } + + public InDataStreamFile(File file) throws FileNotFoundException { + super(new BufferedInputStream(new FileInputStream(file))); + } + + public InDataStreamFile(URL url) throws IOException { + super(new BufferedInputStream(url.openStream())); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/Lexer.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/Lexer.java new file mode 100644 index 0000000..30ae650 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/Lexer.java @@ -0,0 +1,70 @@ +package edu.stanford.nlp.io; + +import java.io.IOException; +import java.io.Reader; + + +/** + * A Lexer interface to be used with {@link edu.stanford.nlp.process.LexerTokenizer}. You can put a {@link Reader} inside + * a Lexer with the {@link #yyreset} method. An easy way to build classes implementing this + * interface is with JFlex (http://jflex.de). Just make sure to include the following in the + * JFlex source file + *

    + *

    In the Options and Macros section of the source file, include + *

    + * %class JFlexDummyLexer
    + * %standalone
    + * %unicode
    + * %int
    + *


    + * %implements edu.stanford.nlp.io.Lexer
    + *
    + * %{
    + * public void pushBack(int n) {
    + * yypushback(n);
    + * }
    + *
    + * public int getYYEOF() {
    + * return YYEOF;
    + * }
    + * %}
    + *

    + * Alternatively, you can customize your own lexer and get lots of + * flexibility out. + * + * @author Roger Levy + */ + +public interface Lexer { + + public int ACCEPT = 1; + public int IGNORE = 0; + + /** + * Gets the next token from input and returns an integer value + * signalling what to do with the token. + */ + public int yylex() throws IOException; + + /** + * returns the matched input text region + */ + public String yytext(); + + /** + * Pushes back length character positions in the + * lexer. Conventionally used to push back exactly one token. + */ + public void pushBack(int length); + + /** + * returns value for YYEOF + */ + public int getYYEOF(); + + /** + * put a {@link Reader} inside the Lexer. + */ + public void yyreset(Reader r) throws IOException; + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/NumberRangeFileFilter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/NumberRangeFileFilter.java new file mode 100644 index 0000000..95091b1 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/NumberRangeFileFilter.java @@ -0,0 +1,77 @@ +package edu.stanford.nlp.io; + +import java.io.File; +import java.io.FileFilter; + +/** + * Implements a file filter that examines a number in a filename to + * determine acceptance. This is useful for wanting to process ranges + * of numbered files in collections where each file has some name, part + * of which is alphabetic and constant, and part of which is numeric. + * The test is evaluated based on the rightmost natural number found in + * the filename string. (It only looks in the final filename, not in other + * components of the path.) Number ranges are inclusive. 
+ * + * @author Christopher Manning + * @version 2000/12/29 + */ +public class NumberRangeFileFilter implements FileFilter { + + private int minimum; + private int maximum; + private boolean recursively; + + /** + * Sets up a NumberRangeFileFilter by specifying the range of numbers + * to accept, and whether to also traverse + * folders for recursive search. + * + * @param min The minimum number file to accept (checks >= this one) + * @param max The maximum number file to accept (checks <= this one) + * @param recurse go into folders + */ + public NumberRangeFileFilter(int min, int max, boolean recurse) { + minimum = min; + maximum = max; + recursively = recurse; + } + + /** + * Checks whether a file satisfies the number range selection filter. + * + * @param file The file + * @return true if the file is within the range filtered for + */ + public boolean accept(File file) { + if (file.isDirectory()) { + return recursively; + } else { + String filename = file.getName(); + int k = filename.length() - 1; + char c = filename.charAt(k); + while (k >= 0 && (c < '0' || c > '9')) { + k--; + if (k >= 0) { + c = filename.charAt(k); + } + } + if (k < 0) { + return false; + } + int j = k; + c = filename.charAt(j); + while (j >= 0 && (c >= '0' && c <= '9')) { + j--; + if (j >= 0) { + c = filename.charAt(j); + } + } + j++; + k++; + String theNumber = filename.substring(j, k); + int number = Integer.parseInt(theNumber); + return (number >= minimum) && (number <= maximum); + } + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/NumberRangesFileFilter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/NumberRangesFileFilter.java new file mode 100644 index 0000000..e84414c --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/NumberRangesFileFilter.java @@ -0,0 +1,169 @@ +package edu.stanford.nlp.io; + +import edu.stanford.nlp.util.Pair; + +import java.io.File; +import 
java.io.FileFilter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Implements a file filter that examines a number in a filename to + * determine acceptance. This is useful for wanting to process ranges + * of numbered files in collections where each file has some name, part + * of which is alphabetic and constant, and part of which is numeric. + * The test is evaluated based on the rightmost natural number found in + * the filename string. (It only looks in the final filename, not in other + * components of the path.) Number ranges are inclusive. + *

    + * This filter can select multiple discontinuous ranges based on a format + * similar to page selection ranges in various formatting software, such as + * "34,52-65,67,93-95". The constructor takes a String of this sort and + * deconstructs it into a list of ranges. The accepted syntax is:

    + * ranges = range
    + * ranges = range "," ranges
    + * range = integer
    + * range = integer "-" integer

    + * Whitespace will be ignored. If the filter constructor is passed anything + * that is not a list of numeric ranges of this sort, including being passed + * an empty String, then an IllegalArgumentException will be + * thrown. + * + * @author Christopher Manning + * @version 2003/03/31 + */ +public class NumberRangesFileFilter implements FileFilter { + + private List> ranges = new ArrayList>(); + private boolean recursively; + + + /** + * Sets up a NumberRangesFileFilter by specifying the ranges of numbers + * to accept, and whether to also traverse + * folders for recursive search. + * + * @param ranges The ranges of numbers to accept (see class documentation) + * @param recurse Whether to go into subfolders + * @throws IllegalArgumentException If the String ranges does not + * contain a suitable ranges format + */ + public NumberRangesFileFilter(String ranges, boolean recurse) { + recursively = recurse; + try { + String[] ra = ranges.split(","); + for (String range : ra) { + String[] one = range.split("-"); + if (one.length > 2) { + throw new IllegalArgumentException("Constructor argument not valid list of number ranges (too many hyphens): "); + } else { + int low = Integer.parseInt(one[0].trim()); + int high; + if (one.length == 2) { + high = Integer.parseInt(one[1].trim()); + } else { + high = low; + } + Pair p = new Pair(Integer.valueOf(low), Integer.valueOf(high)); + this.ranges.add(p); + } + } + } catch (Exception e) { + IllegalArgumentException iae = new IllegalArgumentException("Constructor argument not valid list of number ranges: " + ranges); + iae.initCause(e); + throw iae; + } + } + + + /** + * Checks whether a file satisfies the number range selection filter. + * The test is evaluated based on the rightmost natural number found in + * the filename string (proper, not including directories in a path). 
+ * + * @param file The file + * @return true If the file is within the ranges filtered for + */ + public boolean accept(File file) { + if (file.isDirectory()) { + return recursively; + } else { + String filename = file.getName(); + return accept(filename); + } + } + + + /** + * Checks whether a String satisfies the number range selection filter. + * The test is evaluated based on the rightmost natural number found in + * the String. Note that this is just evaluated on the String as given. + * It is not trying to interpret it as a filename and to decide whether + * the file exists, is a directory or anything like that. + * + * @param str The String to check for a number in + * @return true If the String is within the ranges filtered for + */ + public boolean accept(String str) { + int k = str.length() - 1; + char c = str.charAt(k); + while (k >= 0 && !Character.isDigit(c)) { + k--; + if (k >= 0) { + c = str.charAt(k); + } + } + if (k < 0) { + return false; + } + int j = k; + c = str.charAt(j); + while (j >= 0 && Character.isDigit(c)) { + j--; + if (j >= 0) { + c = str.charAt(j); + } + } + j++; + k++; + String theNumber = str.substring(j, k); + int number = Integer.parseInt(theNumber); + for (Pair p : ranges) { + int low = p.first().intValue(); + int high = p.second().intValue(); + if (number >= low && number <= high) { + return true; + } + } + return false; + } + + + @Override + public String toString() { + StringBuilder sb; + if (recursively) { + sb = new StringBuilder("recursively "); + } else { + sb = new StringBuilder(); + } + for (Iterator> it = ranges.iterator(); it.hasNext(); ) { + Pair p = it.next(); + int low = p.first().intValue(); + int high = p.second().intValue(); + if (low == high) { + sb.append(low); + } else { + sb.append(low); + sb.append('-'); + sb.append(high); + } + if (it.hasNext()) { + sb.append(','); + } + } + return sb.toString(); + } + +} diff --git 
a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/OutDataStreamFile.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/OutDataStreamFile.java new file mode 100644 index 0000000..ad39f8a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/OutDataStreamFile.java @@ -0,0 +1,31 @@ +/** + * Title: StanfordMaxEnt

    + * Description: A Maximum Entropy Toolkit

    + * Copyright: Copyright (c) Kristina Toutanova

    + * Company: Stanford University

    + * @author Kristina Toutanova + * @version 1.0 + */ + +package edu.stanford.nlp.io; + +import java.io.BufferedOutputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +//: com:bruceeckel:tools:OutFile.java +// Shorthand class for opening an output file +// for data storage. + + +public class OutDataStreamFile extends DataOutputStream { + public OutDataStreamFile(String filename) throws IOException { + this(new File(filename)); + } + + public OutDataStreamFile(File file) throws IOException { + super(new BufferedOutputStream(new FileOutputStream(file))); + } +} ///:~ diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/PrintFile.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/PrintFile.java new file mode 100644 index 0000000..2a3ab23 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/PrintFile.java @@ -0,0 +1,27 @@ +/** + * Title: StanfordMaxEnt

    + * Description: A Maximum Entropy Toolkit

    + * Copyright: Copyright (c) Bruce Eckel

    + * Company: Stanford University

    + * @author Bruce Eckel + * @version 1.0 + */ +package edu.stanford.nlp.io; + +import java.io.*; + +/** + * Shorthand class for opening an output file for human-readable output. + * com:bruceeckel:tools:PrintFile.java + */ +public class PrintFile extends PrintStream { + + public PrintFile(String filename) throws IOException { + super(new BufferedOutputStream(new FileOutputStream(filename))); + } + + public PrintFile(File file) throws IOException { + this(file.getPath()); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/ReaderInputStream.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/ReaderInputStream.java new file mode 100644 index 0000000..163c301 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/ReaderInputStream.java @@ -0,0 +1,205 @@ +package edu.stanford.nlp.io; + +/* + * Copyright 2004-2005 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; + +/** + * Adapts a Reader as an InputStream. + * Adapted from StringInputStream. 
+ * + */ +public class ReaderInputStream extends InputStream { + + /** Source Reader */ + private Reader in; + + private String encoding = System.getProperty("file.encoding"); + + private byte[] slack; + + private int begin; + + /** + * Construct a ReaderInputStream + * for the specified Reader. + * + * @param reader Reader. Must not be null. + */ + public ReaderInputStream(Reader reader) { + in = reader; + } + + /** + * Construct a ReaderInputStream + * for the specified Reader, + * with the specified encoding. + * + * @param reader non-null Reader. + * @param encoding non-null String encoding. + */ + public ReaderInputStream(Reader reader, String encoding) { + this(reader); + if (encoding == null) { + throw new IllegalArgumentException("encoding must not be null"); + } else { + this.encoding = encoding; + } + } + + /** + * Reads from the Reader, returning the same value. + * + * @return the value of the next character in the Reader. + * + * @exception IOException if the original Reader fails to be read + */ + public synchronized int read() throws IOException { + if (in == null) { + throw new IOException("Stream Closed"); + } + + byte result; + if (slack != null && begin < slack.length) { + result = slack[begin]; + if (++begin == slack.length) { + slack = null; + } + } else { + byte[] buf = new byte[1]; + if (read(buf, 0, 1) <= 0) { + result = -1; + } + result = buf[0]; + } + + if (result < -1) { + result += 256; + } + + return result; + } + + /** + * Reads from the Reader into a byte array + * + * @param b the byte array to read into + * @param off the offset in the byte array + * @param len the length in the byte array to fill + * @return the actual number read into the byte array, -1 at + * the end of the stream + * @exception IOException if an error occurs + */ + public synchronized int read(byte[] b, int off, int len) + throws IOException { + if (in == null) { + throw new IOException("Stream Closed"); + } + + while (slack == null) { + char[] buf = new 
char[len]; // might read too much + int n = in.read(buf); + if (n == -1) { + return -1; + } + if (n > 0) { + slack = new String(buf, 0, n).getBytes(encoding); + begin = 0; + } + } + + if (len > slack.length - begin) { + len = slack.length - begin; + } + + System.arraycopy(slack, begin, b, off, len); + + if ((begin += len) >= slack.length) { + slack = null; + } + + return len; + } + + /** + * Marks the read limit of the StringReader. + * + * @param limit the maximum limit of bytes that can be read before the + * mark position becomes invalid + */ + public synchronized void mark(final int limit) { + try { + in.mark(limit); + } catch (IOException ioe) { + throw new RuntimeException(ioe.getMessage()); + } + } + + + /** + * @return the current number of bytes ready for reading + * @exception IOException if an error occurs + */ + public synchronized int available() throws IOException { + if (in == null) { + throw new IOException("Stream Closed"); + } + if (slack != null) { + return slack.length - begin; + } + if (in.ready()) { + return 1; + } else { + return 0; + } + } + + /** + * @return false - mark is not supported + */ + public boolean markSupported () { + return false; // would be imprecise + } + + /** + * Resets the StringReader. + * + * @exception IOException if the StringReader fails to be reset + */ + public synchronized void reset() throws IOException { + if (in == null) { + throw new IOException("Stream Closed"); + } + slack = null; + in.reset(); + } + + /** + * Closes the Stringreader. 
+ * + * @exception IOException if the original StringReader fails to be closed + */ + public synchronized void close() throws IOException { + if (in != null) { + in.close(); + slack = null; + in = null; + } + } +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/RegExFileFilter.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/RegExFileFilter.java new file mode 100644 index 0000000..bc6b4cf --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/RegExFileFilter.java @@ -0,0 +1,36 @@ +package edu.stanford.nlp.io; + +import java.io.*; +import java.util.regex.*; +/** + * Implements a file filter that filters based on a passed in {@link java.util.regex.Pattern}. + * Preciesly, it will accept exactly those {@link java.io.File}s for which + * the matches() method of the Pattern returns true on the output of the getName() + * method of the File. + * + * @author Jenny Finkel + */ +public class RegExFileFilter implements FileFilter { + + private Pattern pattern = null; + + /** + * Sets up a RegExFileFilter which checks if the file name (not the + * entire path) matches the passed in {@link java.util.regex.Pattern}. + */ + public RegExFileFilter(Pattern pattern) { + this.pattern = pattern; + } + + /** + * Checks whether a file satisfies the selection filter. 
+ * + * @param file The file + * @return true if the file is acceptable + */ + public boolean accept(File file) { + Matcher m = pattern.matcher(file.getName()); + return m.matches(); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/RuntimeIOException.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/RuntimeIOException.java new file mode 100644 index 0000000..505779c --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/RuntimeIOException.java @@ -0,0 +1,51 @@ +package edu.stanford.nlp.io; + + +/** + * An unchecked version of {@link java.io.IOException}. Thrown by + * {@link edu.stanford.nlp.process.Tokenizer} implementing classes, + * among other things. + * + * @author Roger Levy + * @author Christopher Manning + */ +public class RuntimeIOException extends RuntimeException { + + private static final long serialVersionUID = -8572218999165094626L; + + /** + * Creates a new exception. + */ + public RuntimeIOException() { + } + + + /** + * Creates a new exception with a message. + * + * @param message the message for the exception + */ + public RuntimeIOException(String message) { + super(message); + } + + /** + * Creates a new exception with an embedded cause. + * + * @param cause The cause for the exception + */ + public RuntimeIOException(Throwable cause) { + super(cause); + } + + /** + * Creates a new exception with a message and an embedded cause. 
+ * + * @param message the message for the exception + * @param cause The cause for the exception + */ + public RuntimeIOException(String message, Throwable cause) { + super(message, cause); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/StringOutputStream.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/StringOutputStream.java new file mode 100644 index 0000000..82a48c7 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/StringOutputStream.java @@ -0,0 +1,30 @@ +package edu.stanford.nlp.io; + +import java.io.*; + +/** + * An OutputStream that can be turned into a String. + * + * @author Bill MacCartney + */ +public class StringOutputStream extends OutputStream { + + StringBuilder sb = new StringBuilder(); + + public StringOutputStream() {} + + synchronized public void clear() { + sb.setLength(0); + } + + @Override + synchronized public void write(int i) { + sb.append((char) i); + } + + @Override + synchronized public String toString() { + return sb.toString(); + } + +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/TeeStream.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/TeeStream.java new file mode 100644 index 0000000..dba0fd1 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/io/TeeStream.java @@ -0,0 +1,59 @@ +package edu.stanford.nlp.io; + +import java.io.Closeable; +import java.io.Flushable; +import java.io.IOException; +import java.io.OutputStream; + +/** + * This class splits the calls to an OutputStream into two different + * streams. 
+ * + * @author John Bauer + */ +public class TeeStream extends OutputStream + implements Closeable, Flushable +{ + public TeeStream(OutputStream s1, OutputStream s2) { + this.s1 = s1; + this.s2 = s2; + } + + OutputStream s1, s2; + + public void close() + throws IOException + { + s1.close(); + s2.close(); + } + + public void flush() + throws IOException + { + s1.flush(); + s2.flush(); + } + + public void write(byte[] b) + throws IOException + { + s1.write(b); + s2.write(b); + } + + public void write(byte[] b, int off, int len) + throws IOException + { + s1.write(b, off, len); + s2.write(b, off, len); + } + + public void write(int b) + throws IOException + { + s1.write(b); + s2.write(b); + } +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/AnnotationLookup.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/AnnotationLookup.java new file mode 100644 index 0000000..28ec8eb --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/AnnotationLookup.java @@ -0,0 +1,184 @@ +package edu.stanford.nlp.ling; + +import java.util.Map; + +import edu.stanford.nlp.ling.CoreLabel.GenericAnnotation; +import edu.stanford.nlp.util.ErasureUtils; +import edu.stanford.nlp.util.Generics; + +/** @author Anna Rafferty */ +public class AnnotationLookup { + + private AnnotationLookup() {} + + public enum KeyLookup { + VALUE_KEY(CoreAnnotations.ValueAnnotation.class, OldFeatureLabelKeys.VALUE_KEY), + TAG_KEY(CoreAnnotations.PartOfSpeechAnnotation.class, OldFeatureLabelKeys.TAG_KEY), + WORD_KEY(CoreAnnotations.TextAnnotation.class, OldFeatureLabelKeys.WORD_KEY), + LEMMA_KEY(CoreAnnotations.LemmaAnnotation.class, OldFeatureLabelKeys.LEMMA_KEY), + CATEGORY_KEY(CoreAnnotations.CategoryAnnotation.class, OldFeatureLabelKeys.CATEGORY_KEY), + PROJ_CAT_KEY(CoreAnnotations.ProjectedCategoryAnnotation.class, OldFeatureLabelKeys.PROJ_CAT_KEY), + 
HEAD_WORD_KEY("edu.stanford.nlp.ling.TreeCoreAnnotations.HeadWordAnnotation", OldFeatureLabelKeys.HEAD_WORD_KEY), + HEAD_TAG_KEY("edu.stanford.nlp.ling.TreeCoreAnnotations.HeadTagAnnotation", OldFeatureLabelKeys.HEAD_TAG_KEY), + INDEX_KEY(CoreAnnotations.IndexAnnotation.class, OldFeatureLabelKeys.INDEX_KEY), + ARG_KEY(CoreAnnotations.ArgumentAnnotation.class, OldFeatureLabelKeys.ARG_KEY), + MARKING_KEY(CoreAnnotations.MarkingAnnotation.class, OldFeatureLabelKeys.MARKING_KEY), + SEMANTIC_HEAD_WORD_KEY(CoreAnnotations.SemanticHeadWordAnnotation.class, OldFeatureLabelKeys.SEMANTIC_HEAD_WORD_KEY), + SEMANTIC_HEAD_POS_KEY(CoreAnnotations.SemanticHeadTagAnnotation.class, OldFeatureLabelKeys.SEMANTIC_HEAD_POS_KEY), + VERB_SENSE_KEY(CoreAnnotations.VerbSenseAnnotation.class, OldFeatureLabelKeys.VERB_SENSE_KEY), + CATEGORY_FUNCTIONAL_TAG_KEY(CoreAnnotations.CategoryFunctionalTagAnnotation.class, OldFeatureLabelKeys.CATEGORY_FUNCTIONAL_TAG_KEY), + NER_KEY(CoreAnnotations.NamedEntityTagAnnotation.class, OldFeatureLabelKeys.NER_KEY), + SHAPE_KEY(CoreAnnotations.ShapeAnnotation.class, OldFeatureLabelKeys.SHAPE_KEY), + LEFT_TERM_KEY(CoreAnnotations.LeftTermAnnotation.class, OldFeatureLabelKeys.LEFT_TERM_KEY), + PARENT_KEY(CoreAnnotations.ParentAnnotation.class, OldFeatureLabelKeys.PARENT_KEY), + SPAN_KEY(CoreAnnotations.SpanAnnotation.class, OldFeatureLabelKeys.SPAN_KEY), + BEFORE_KEY(CoreAnnotations.BeforeAnnotation.class, OldFeatureLabelKeys.BEFORE_KEY), + AFTER_KEY(CoreAnnotations.AfterAnnotation.class, OldFeatureLabelKeys.AFTER_KEY), + CURRENT_KEY(CoreAnnotations.OriginalTextAnnotation.class, OldFeatureLabelKeys.CURRENT_KEY), + ANSWER_KEY(CoreAnnotations.AnswerAnnotation.class, OldFeatureLabelKeys.ANSWER_KEY), + GOLDANSWER_Key(CoreAnnotations.GoldAnswerAnnotation.class, OldFeatureLabelKeys.GOLDANSWER_KEY), + FEATURES_KEY(CoreAnnotations.FeaturesAnnotation.class, OldFeatureLabelKeys.FEATURES_KEY), + INTERPRETATION_KEY(CoreAnnotations.InterpretationAnnotation.class, 
OldFeatureLabelKeys.INTERPRETATION_KEY), + ROLE_KEY(CoreAnnotations.RoleAnnotation.class, OldFeatureLabelKeys.ROLE_KEY), + GAZETTEER_KEY(CoreAnnotations.GazetteerAnnotation.class, OldFeatureLabelKeys.GAZETTEER_KEY), + STEM_KEY(CoreAnnotations.StemAnnotation.class, OldFeatureLabelKeys.STEM_KEY), + POLARITY_KEY(CoreAnnotations.PolarityAnnotation.class, OldFeatureLabelKeys.POLARITY_KEY), + CH_CHAR_KEY(CoreAnnotations.ChineseCharAnnotation.class, OldFeatureLabelKeys.CH_CHAR_KEY), + CH_ORIG_SEG_KEY(CoreAnnotations.ChineseOrigSegAnnotation.class, OldFeatureLabelKeys.CH_ORIG_SEG_KEY), + CH_SEG_KEY(CoreAnnotations.ChineseSegAnnotation.class, OldFeatureLabelKeys.CH_SEG_KEY), + BEGIN_POSITION_KEY(CoreAnnotations.CharacterOffsetBeginAnnotation.class, OldFeatureLabelKeys.BEGIN_POSITION_KEY), + END_POSITION_KEY(CoreAnnotations.CharacterOffsetEndAnnotation.class, OldFeatureLabelKeys.END_POSITION_KEY), + DOCID_KEY(CoreAnnotations.DocIDAnnotation.class, OldFeatureLabelKeys.DOCID_KEY), + SENTINDEX_KEY(CoreAnnotations.SentenceIndexAnnotation.class, OldFeatureLabelKeys.SENTINDEX_KEY), + IDF_KEY(CoreAnnotations.IDFAnnotation.class, "idf"), + END_POSITION_KEY2(CoreAnnotations.CharacterOffsetEndAnnotation.class, "endPosition"), + CHUNK_KEY(CoreAnnotations.ChunkAnnotation.class, "chunk"), + NORMALIZED_NER_KEY(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, "normalized"), + MORPHO_NUM_KEY(CoreAnnotations.MorphoNumAnnotation.class,"num"), + MORPHO_PERS_KEY(CoreAnnotations.MorphoPersAnnotation.class,"pers"), + MORPHO_GEN_KEY(CoreAnnotations.MorphoGenAnnotation.class,"gen"), + MORPHO_CASE_KEY(CoreAnnotations.MorphoCaseAnnotation.class,"case"), + WORDNET_SYN_KEY(CoreAnnotations.WordnetSynAnnotation.class,"wordnetsyn"), + PROTO_SYN_KEY(CoreAnnotations.ProtoAnnotation.class,"proto"); + + public final Class coreKey; + public final String oldKey; + + private KeyLookup(Class> coreKey, String oldKey) { + this.coreKey = coreKey; + this.oldKey = oldKey; + } + + /** + * This constructor 
allows us to use reflection for loading old class keys. + * This is useful because we can then create distributions that do not have + * all of the classes required for all the old keys (such as trees package classes). + */ + private KeyLookup(String className, String oldKey) { + Class keyClass; + try { + keyClass = Class.forName(className); + } catch(ClassNotFoundException e) { + GenericAnnotation newKey = new GenericAnnotation() { + public Class getType() { return Object.class;} }; + keyClass = newKey.getClass(); + } + this.coreKey = ErasureUtils.uncheckedCast(keyClass); + this.oldKey = oldKey; + } + + + } + + /** + * Returns a CoreAnnotation class key for the given old-style FeatureLabel + * key if one exists; null otherwise. + */ + public static KeyLookup getCoreKey(String oldKey) { + for (KeyLookup lookup : KeyLookup.values()) { + if (lookup.oldKey.equals(oldKey)) { + return lookup; + } + } + return null; + } + + private static Map>,Class> valueCache = Generics.newHashMap(); + + /** + * Returns the runtime value type associated with the given key. Caches + * results. + */ + @SuppressWarnings("unchecked") + public static Class getValueType(Class key) { + Class type = valueCache.get(key); + if (type == null) { + try { + type = key.newInstance().getType(); + } catch (Exception e) { + throw new RuntimeException("Unexpected failure to instantiate - is your key class fancy?", e); + } + valueCache.put((Class)key, type); + } + return type; + } + + /** + * Lookup table for mapping between old-style *Label keys and classes + * the provide comparable backings in the core. 
+ */ +//OLD keys kept around b/c we're kill IndexedFeatureLabel and these keys used to live there + private static class OldFeatureLabelKeys { + + public static final String DOCID_KEY = "docID"; + public static final String SENTINDEX_KEY = "sentIndex"; + public static final Object WORD_FORMAT = "WORD_FORMAT"; + public static final Object WORD_TAG_FORMAT = "WORD_TAG_FORMAT"; + public static final Object WORD_TAG_INDEX_FORMAT = "WORD_TAG_INDEX_FORMAT"; + public static final Object VALUE_FORMAT = "VALUE_FORMAT"; + public static final Object COMPLETE_FORMAT = "COMPLETE_FORMAT"; + public static final String VALUE_KEY = "value"; + public static final String TAG_KEY = "tag"; + public static final String WORD_KEY = "word"; + public static final String LEMMA_KEY = "lemma"; + public static final String CATEGORY_KEY = "cat"; + public static final String PROJ_CAT_KEY = "pcat"; + public static final String HEAD_WORD_KEY = "hw"; + public static final String HEAD_TAG_KEY = "ht"; + public static final String INDEX_KEY = "idx"; + public static final String ARG_KEY = "arg"; + public static final String MARKING_KEY = "mark"; + public static final String SEMANTIC_HEAD_WORD_KEY = "shw"; + public static final String SEMANTIC_HEAD_POS_KEY = "shp"; + public static final String VERB_SENSE_KEY = "vs"; + public static final String CATEGORY_FUNCTIONAL_TAG_KEY = "cft"; + public static final String NER_KEY = "ner"; + public static final String SHAPE_KEY = "shape"; + public static final String LEFT_TERM_KEY = "LEFT_TERM"; + public static final String PARENT_KEY = "PARENT"; + public static final String SPAN_KEY = "SPAN"; + public static final String BEFORE_KEY = "before"; + public static final String AFTER_KEY = "after"; + public static final String CURRENT_KEY = "current"; + public static final String ANSWER_KEY = "answer"; + public static final String GOLDANSWER_KEY = "goldAnswer"; + public static final String FEATURES_KEY = "features"; + public static final String INTERPRETATION_KEY = 
"interpretation"; + public static final String ROLE_KEY = "srl"; + public static final String GAZETTEER_KEY = "gazetteer"; + public static final String STEM_KEY = "stem"; + public static final String POLARITY_KEY = "polarity"; + public static final String CH_CHAR_KEY = "char"; + public static final String CH_ORIG_SEG_KEY = "orig_seg"; // the segmentation info existing in the original text + public static final String CH_SEG_KEY = "seg"; // the segmentation information from the segmenter + public static final String BEGIN_POSITION_KEY = "BEGIN_POS"; + public static final String END_POSITION_KEY = "END_POS"; + + + private OldFeatureLabelKeys() { + } + + } // end static class OldFeatureLabelKeys + +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/BasicDatum.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/BasicDatum.java new file mode 100644 index 0000000..5da9a3a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/BasicDatum.java @@ -0,0 +1,146 @@ +package edu.stanford.nlp.ling; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Basic implementation of Datum interface that can be constructed with a + * Collection of features and one more more labels. The features must be + * specified + * at construction, but the labels can be set and/or changed later. + * + * @author Joseph Smarr (jsmarr@stanford.edu) + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * + * @param The type of the labels in the Dataset + * @param The type of the features in the Dataset + */ +public class BasicDatum implements Datum { + + /** + * features for this Datum + */ + @SuppressWarnings({"NonSerializableFieldInSerializableClass"}) + private final Collection features; + + /** + * labels for this Datum. 
Invariant: always non-null + */ + @SuppressWarnings({"NonSerializableFieldInSerializableClass"}) + private final List labels = new ArrayList(); + + /** + * Constructs a new BasicDatum with the given features and labels. + */ + public BasicDatum(Collection features, Collection labels) { + this(features); + setLabels(labels); + } + + /** + * Constructs a new BasicDatum with the given features and label. + */ + public BasicDatum(Collection features, LabelType label) { + this(features); + setLabel(label); + } + + /** + * Constructs a new BasicDatum with the given features and no labels. + */ + public BasicDatum(Collection features) { + this.features = features; + } + + /** + * Constructs a new BasicDatum with no features or labels. + */ + public BasicDatum() { + this(null); + } + + /** + * Returns the collection that this BasicDatum was constructed with. + */ + public Collection asFeatures() { + return (features); + } + + /** + * Returns the first label for this Datum, or null if none have been set. + */ + public LabelType label() { + return ((labels.size() > 0) ? labels.get(0) : null); + } + + /** + * Returns the complete List of labels for this Datum, which may be empty. + */ + public Collection labels() { + return labels; + } + + /** + * Removes all currently assigned Labels for this Datum then adds the + * given Label. + * Calling setLabel(null) effectively clears all labels. + */ + public void setLabel(LabelType label) { + labels.clear(); + addLabel(label); + } + + /** + * Removes all currently assigned labels for this Datum then adds all + * of the given Labels. + */ + public void setLabels(Collection labels) { + this.labels.clear(); + if (labels != null) { + this.labels.addAll(labels); + } + } + + /** + * Adds the given Label to the List of labels for this Datum if it is not + * null. 
+ */ + public void addLabel(LabelType label) { + if (label != null) { + labels.add(label); + } + } + + /** + * Returns a String representation of this BasicDatum (lists features and labels). + */ + @Override + public String toString() { + return ("BasicDatum[features=" + asFeatures() + ",labels=" + labels() + "]"); + } + + + /** + * Returns whether the given Datum contains the same features as this Datum. + * Doesn't check the labels, should we change this? + */ + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + if (!(o instanceof Datum)) { + return (false); + } + + Datum d = (Datum) o; + return features.equals(d.asFeatures()); + } + + public int hashCode() { + return features.hashCode(); + } + + private static final long serialVersionUID = -4857004070061779966L; + +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/BasicDocument.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/BasicDocument.java new file mode 100644 index 0000000..028fc61 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/BasicDocument.java @@ -0,0 +1,471 @@ +package edu.stanford.nlp.ling; + +import edu.stanford.nlp.process.TokenizerFactory; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.process.Tokenizer; +import edu.stanford.nlp.util.ErasureUtils; + +import java.io.*; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + + +/** + * Basic implementation of Document that should be suitable for most needs. + * BasicDocument is an ArrayList for storing words and performs tokenization + * during construction. Override {@link #parse(String)} to provide support + * for custom + * document formats or to do a custom job of tokenization. BasicDocument should + * only be used for documents that are small enough to store in memory. + *

    + * The easiest way to use BasicDocuments is to construct them and call an init + * method in the same line (we use init methods instead of constructors because + * they're inherited and allow subclasses to have other more specific constructors). + * For example, to read in a file file and tokenize it, you can call + *

    Document doc=new BasicDocument().init(file);
    . + * + * @author Joseph Smarr (jsmarr@stanford.edu) + * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization) + * + * @param The type of the labels + */ +public class BasicDocument extends ArrayList implements Document { + + /** + * title of this document (never null). + */ + protected String title = ""; + + /** + * original text of this document (may be null). + */ + protected String originalText; + + /** + * Label(s) for this document. + */ + protected final List labels = new ArrayList(); + + /** + * TokenizerFactory used to convert the text into words inside + * {@link #parse(String)}. + */ + protected TokenizerFactory tokenizerFactory; + + /** + * Constructs a new (empty) BasicDocument using a {@link PTBTokenizer}. + * Call one of the init * methods to populate the document + * from a desired source. + */ + public BasicDocument() { + this(PTBTokenizer.factory()); + } + + /** + * Constructs a new (empty) BasicDocument using the given tokenizer. + * Call one of the init * methods to populate the document + * from a desired source. + */ + public BasicDocument(TokenizerFactory tokenizerFactory) { + setTokenizerFactory(tokenizerFactory); + } + + public BasicDocument(Document d) { + this((Collection) d); + } + + public BasicDocument(Collection d) { + this(); + addAll(d); + } + + /** + * Inits a new BasicDocument with the given text contents and title. + * The text is tokenized using {@link #parse(String)} to populate the list of words + * ("" is used if text is null). If specified, a reference to the + * original text is also maintained so that the text() method returns the + * text given to this constructor. Returns a reference to this + * BasicDocument + * for convenience (so it's more like a constructor, but inherited). 
+ */ + public static BasicDocument init(String text, String title, boolean keepOriginalText) { + BasicDocument basicDocument = new BasicDocument(); + // initializes the List of labels and sets the title + basicDocument.setTitle(title); + + // stores the original text as specified + if (keepOriginalText) { + basicDocument.originalText = text; + } else { + basicDocument.originalText = null; + } + + // populates the words by parsing the text + basicDocument.parse(text == null ? "" : text); + + return basicDocument; + } + + /** + * Calls init(text,title,true) + */ + public static BasicDocument init(String text, String title) { + return init(text, title, true); + } + + /** + * Calls init(text,null,keepOriginalText) + */ + public static BasicDocument init(String text, boolean keepOriginalText) { + return init(text, null, keepOriginalText); + } + + /** + * Calls init(text,null,true) + */ + public static BasicDocument init(String text) { + return init(text, null, true); + } + + /** + * Calls init((String)null,null,true) + */ + public static BasicDocument init() { + return init((String) null, null, true); + } + + /** + * Inits a new BasicDocument by reading in the text from the given Reader. 
+ * + * @see #init(String,String,boolean) + */ + public static BasicDocument init(Reader textReader, String title, boolean keepOriginalText) throws IOException { + return init(DocumentReader.readText(textReader), title, keepOriginalText); + } + + /** + * Calls init(textReader,title,true) + */ + public BasicDocument init(Reader textReader, String title) throws IOException { + return init(textReader, title, true); + } + + /** + * Calls init(textReader,null,keepOriginalText) + */ + public BasicDocument init(Reader textReader, boolean keepOriginalText) throws IOException { + return init(textReader, null, keepOriginalText); + } + + /** + * Calls init(textReader,null,true) + */ + public BasicDocument init(Reader textReader) throws IOException { + return init(textReader, null, true); + } + + /** + * Inits a new BasicDocument by reading in the text from the given File. + * + * @see #init(String,String,boolean) + */ + public BasicDocument init(File textFile, String title, boolean keepOriginalText) throws FileNotFoundException, IOException { + Reader in = DocumentReader.getReader(textFile); + BasicDocument bd = init(in, title, keepOriginalText); + in.close(); + return bd; + } + + /** + * Calls init(textFile,title,true) + */ + public BasicDocument init(File textFile, String title) throws FileNotFoundException, IOException { + return init(textFile, title, true); + } + + /** + * Calls init(textFile,textFile.getCanonicalPath(),keepOriginalText) + */ + public BasicDocument init(File textFile, boolean keepOriginalText) throws FileNotFoundException, IOException { + return init(textFile, textFile.getCanonicalPath(), keepOriginalText); + } + + /** + * Calls init(textFile,textFile.getCanonicalPath(),true) + */ + public BasicDocument init(File textFile) throws FileNotFoundException, IOException { + return init(textFile, textFile.getCanonicalPath(), true); + } + + /** + * Constructs a new BasicDocument by reading in the text from the given URL. 
+ * + * @see #init(String,String,boolean) + */ + public BasicDocument init(URL textURL, String title, boolean keepOriginalText) throws IOException { + return init(DocumentReader.getReader(textURL), title, keepOriginalText); + } + + /** + * Calls init(textURL,title,true) + */ + public BasicDocument init(URL textURL, String title) throws FileNotFoundException, IOException { + return init(textURL, title, true); + } + + /** + * Calls init(textURL,textFile.toExternalForm(),keepOriginalText) + */ + public BasicDocument init(URL textURL, boolean keepOriginalText) throws FileNotFoundException, IOException { + return init(textURL, textURL.toExternalForm(), keepOriginalText); + } + + /** + * Calls init(textURL,textURL.toExternalForm(),true) + */ + public BasicDocument init(URL textURL) throws FileNotFoundException, IOException { + return init(textURL, textURL.toExternalForm(), true); + } + + /** + * Inits a new BasicDocument with the given list of words and title. + */ + public BasicDocument init(List words, String title) { + // initializes the List of labels and sets the title + setTitle(title); + // no original text + originalText = null; + // adds all of the given words to the list maintained by this document + addAll(words); + return (this); + } + + /** + * Calls init(words,null) + */ + public BasicDocument init(List words) { + return init(words, null); + } + + /** + * Tokenizes the given text to populate the list of words this Document + * represents. The default implementation uses the current tokenizer and tokenizes + * the entirety of the text into words. Subclasses should override this method + * to parse documents in non-standard formats, and/or to pull the title of the + * document from the text. The given text may be empty ("") but will never + * be null. Subclasses may want to do additional processing and then just + * call super.parse. 
+ * + * @see #setTokenizerFactory + */ + protected void parse(String text) { + Tokenizer toke = tokenizerFactory.getTokenizer(new StringReader(text)); + addAll(toke.tokenize()); + } + + /** + * Returns this (the features are the list of words). + */ + public Collection asFeatures() { + return this; + } + + /** + * Returns the first label for this Document, or null if none have been + * set. + */ + public L label() { + return (labels.size() > 0) ? labels.get(0) : null; + } + + /** + * Returns the complete List of labels for this Document. + * This is an empty collection if none have been set. + */ + public Collection labels() { + return labels; + } + + /** + * Removes all currently assigned labels for this Document then adds + * the given label. + * Calling setLabel(null) effectively clears all labels. + */ + public void setLabel(L label) { + labels.clear(); + addLabel(label); + } + + /** + * Removes all currently assigned labels for this Document then adds all + * of the given labels. + */ + public void setLabels(Collection labels) { + this.labels.clear(); + if (labels != null) { + this.labels.addAll(labels); + } + } + + /** + * Adds the given label to the List of labels for this Document if it is not null. + */ + public void addLabel(L label) { + if (label != null) { + labels.add(label); + } + } + + /** + * Returns the title of this document. The title may be empty ("") but will + * never be null. + */ + public String title() { + return (title); + } + + /** + * Sets the title of this Document to the given title. If the given title + * is null, sets the title to "". + */ + public void setTitle(String title) { + if (title == null) { + this.title = ""; + } else { + this.title = title; + } + } + + /** + * Returns the current TokenizerFactory used by {@link #parse(String)}. + */ + public TokenizerFactory tokenizerFactory() { + return (tokenizerFactory); + } + + + /** + * Sets the tokenizerFactory to be used by {@link #parse(String)}. 
+ * Set this tokenizer before calling one of the init methods + * because + * it will probably call parse. Note that the tokenizer can equivalently be + * passed in to the constructor. + * + * @see #BasicDocument(TokenizerFactory) + */ + public void setTokenizerFactory(TokenizerFactory tokenizerFactory) { + this.tokenizerFactory = tokenizerFactory; + } + + /** + * Returns a new empty BasicDocument with the same title, labels, and + * tokenizer as this Document. This is useful when you want to make a + * new Document that's like the old document but + * can be filled with new text (e.g. if you're transforming + * the contents non-destructively). + *

    + *

    Subclasses that want to preserve extra state should + * override this method and add the extra state to the new document before + * returning it. The new BasicDocument is created by calling + * getClass().newInstance() so it should be of the correct subclass, + * and thus you should be able to cast it down and add extra meta data directly. + * Note however that in the event an Exception is thrown on instantiation + * (e.g. if your subclass doesn't have a public empty constructor--it should btw!) + * then a new BasicDocument is used instead. Thus if you want to be paranoid + * (or some would say "correct") you should check that your instance is of + * the correct sub-type as follows (this example assumes the subclass is called + * NumberedDocument and it has the additional numberproperty): + *

    Document blankDocument=super.blankDocument();
    +   * if(blankDocument instanceof NumberedDocument) {
    +   *     ((NumberedDocument)blankDocument).setNumber(getNumber());
    + */ + public Document blankDocument() { + BasicDocument bd; + + // tries to instantiate by reflection, settles for direct instantiation + try { + bd = ErasureUtils.>uncheckedCast(getClass().newInstance()); + } catch (Exception e) { + bd = new BasicDocument(); + } + + // copies over basic meta-data + bd.setTitle(title()); + bd.setLabels(labels()); + bd.setTokenizerFactory(tokenizerFactory); + + // cast to the new output type + return ErasureUtils.>uncheckedCast(bd); + } + + /** + * Returns the text originally used to construct this document, or null if + * there was no original text. + */ + public String originalText() { + return (originalText); + } + + /** + * Returns a "pretty" version of the words in this Document suitable for + * display. The default implementation returns each of the words in + * this Document separated + * by spaces. Specifically, each element that implements {@link HasWord} + * has its + * {@link HasWord#word} printed, and other elements are skipped. + *

    + *

    Subclasses that maintain additional information may which to + * override this method.

    + */ + public String presentableText() { + StringBuilder sb = new StringBuilder(); + for (Word cur : this) { + if (sb.length() > 0) { + sb.append(' '); + } + sb.append(cur.word()); + } + return (sb.toString()); + } + + /** + * For internal debugging purposes only. Creates and tests various instances + * of BasicDocument. + */ + public static void main(String[] args) { + try { + printState(BasicDocument.init("this is the text", "this is the title [String]", true)); + printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true)); + + File f = File.createTempFile("BasicDocumentTestFile", null); + f.deleteOnExit(); + PrintWriter out = new PrintWriter(new FileWriter(f)); + out.print("this is the text"); + out.flush(); + out.close(); + printState(new BasicDocument().init(f, "this is the title [File]", true)); + printState(new BasicDocument().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true)); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * For internal debugging purposes only. + * Prints the state of the given BasicDocument to stderr. 
+ */ + public static void printState(BasicDocument bd) throws Exception { + System.err.println("BasicDocument:"); + System.err.println("\tTitle: " + bd.title()); + System.err.println("\tLabels: " + bd.labels()); + System.err.println("\tOriginalText: " + bd.originalText()); + System.err.println("\tWords: " + bd); + System.err.println(); + } + + private static final long serialVersionUID = -24171720584352262L; + +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CategoryWordTag.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CategoryWordTag.java new file mode 100644 index 0000000..18f6eaa --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CategoryWordTag.java @@ -0,0 +1,182 @@ +package edu.stanford.nlp.ling; + + +/** + * A CategoryWordTag object acts as a complex Label + * which contains a category, a head word, and a tag. + * The category label is the primary value + * + * @author Christopher Manning + */ +public class CategoryWordTag extends StringLabel implements HasCategory, HasWord, HasTag { + + private static final long serialVersionUID = -745085381666943254L; + + protected String word; + protected String tag; + + /** + * If this is false, the tag and word are never printed in toString() + * calls. + */ + public static boolean printWordTag = true; + + /** + * If set to true, when a terminal or preterminal has as its category + * something that is also the word or tag value, the latter are + * suppressed. + */ + public static boolean suppressTerminalDetails; // = false; + + + public CategoryWordTag() { + super(); + } + + /** + * This one argument constructor sets just the value. 
+ * + * @param label the string that will become the category/value + */ + public CategoryWordTag(String label) { + super(label); + } + + public CategoryWordTag(String category, String word, String tag) { + super(category); + this.word = word; + this.tag = tag; + } + + /** + * Creates a new CategoryWordTag label from an existing label. + * The oldLabel value() -- i.e., category -- is used for the new label. + * The tag and word + * are initialized iff the current label implements HasTag and HasWord + * respectively. + * + * @param oldLabel The label to use as a basis of this Label + */ + public CategoryWordTag(Label oldLabel) { + super(oldLabel); + if (oldLabel instanceof HasTag) { + this.tag = ((HasTag) oldLabel).tag(); + } + if (oldLabel instanceof HasWord) { + this.word = ((HasWord) oldLabel).word(); + } + } + + public String category() { + return value(); + } + + public void setCategory(String category) { + setValue(category); + } + + public String word() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public String tag() { + return tag; + } + + public void setTag(String tag) { + this.tag = tag; + } + + public void setCategoryWordTag(String category, String word, String tag) { + setCategory(category); + setWord(word); + setTag(tag); + } + + + /** + * Returns a String representation of the label. + * This attempts to be somewhat clever in choosing to print or + * suppress null components and the details of words or categories + * depending on the setting of printWordTag and + * suppressTerminalDetails. 
+ * + * @return The label as a string + */ + @Override + public String toString() { + if (category() != null) { + if ((word() == null || tag() == null) || !printWordTag || (suppressTerminalDetails && (word().equals(category()) || tag().equals(category())))) { + return category(); + } else { + return category() + "[" + word() + "/" + tag() + "]"; + } + } else { + if (tag() == null) { + return word(); + } else { + return word() + "/" + tag(); + } + } + } + + + /** + * Returns a String representation of the label. + * If the argument String is "full" then all components of the label + * are returned, and otherwise the normal toString() is returned. + * + * @return The label as a string + */ + public String toString(String mode) { + if ("full".equals(mode)) { + return category() + "[" + word() + "/" + tag() + "]"; + } + return toString(); + } + + + /** + * Set everything by reversing a toString operation. + * This should be added at some point. + */ + @Override + public void setFromString(String labelStr) { + throw new UnsupportedOperationException(); + } + + + // extra class guarantees correct lazy loading (Bloch p.194) + private static class LabelFactoryHolder { + private LabelFactoryHolder() {} + private static final LabelFactory lf = new CategoryWordTagFactory(); + } + + /** + * Return a factory for this kind of label + * (i.e., CategoryWordTag). + * The factory returned is always the same one (a singleton). 
+ * + * @return The label factory + */ + @Override + public LabelFactory labelFactory() { + return LabelFactoryHolder.lf; + } + + + /** + * Return a factory for this kind of label + * + * @return The label factory + */ + public static LabelFactory factory() { + return LabelFactoryHolder.lf; + } + +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CategoryWordTagFactory.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CategoryWordTagFactory.java new file mode 100644 index 0000000..c11da31 --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CategoryWordTagFactory.java @@ -0,0 +1,75 @@ +package edu.stanford.nlp.ling; + + +/** + * A CategoryWordTagFactory is a factory that makes + * a Label which is a CategoryWordTag triplet. + * + * @author Christopher Manning + */ +public class CategoryWordTagFactory implements LabelFactory { + + /** + * Make a new label with this String as the "name". + * + * @param labelStr The string to use as a label + * @return The newly created Label + */ + public Label newLabel(String labelStr) { + return new CategoryWordTag(labelStr); + } + + /** + * Make a new label with this String as the value. + * This implementation ignores the options + * + * @param labelStr The String that will be used for balue + * @param options This argument is ignored + * @return The newly created Label + */ + public Label newLabel(String labelStr, int options) { + return new CategoryWordTag(labelStr); + } + + /** + * Make a new label with this String as the "name". + * + * @param labelStr The string to use as a label + * @return The newly created Label + */ + public Label newLabelFromString(String labelStr) { + CategoryWordTag cwt = new CategoryWordTag(); + cwt.setFromString(labelStr); + return cwt; + } + + /** + * Create a new CategoryWordTag label, where the label is formed from + * the various String objects passed in. 
+ * + * @param word The word part of the label + * @param tag The tag part of the label + * @param category The category part of the label + * @return The newly created Label + */ + public Label newLabel(String word, String tag, String category) { + // System.out.println("Making new CWT label: " + category + " | " + + // word + " | " + tag); + return new CategoryWordTag(category, word, tag); + } + + /** + * Create a new CategoryWordTag Label, where the label is + * formed from + * the Label object passed in. Depending on what fields + * each label has, other things will be null. + * + * @param oldLabel The Label that the new label is being created from + * @return a new label of a particular type + */ + public Label newLabel(Label oldLabel) { + return new CategoryWordTag(oldLabel); + } + +} + diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/ChineseCoreAnnotations.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/ChineseCoreAnnotations.java new file mode 100644 index 0000000..85f7ddb --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/ChineseCoreAnnotations.java @@ -0,0 +1,17 @@ +package edu.stanford.nlp.ling; + +import java.util.List; +import edu.stanford.nlp.util.ErasureUtils; + +public class ChineseCoreAnnotations { + + private ChineseCoreAnnotations() { } // only static members + + public static class CharactersAnnotation + implements CoreAnnotation> + { + public Class> getType() { + return ErasureUtils.>> uncheckedCast(List.class); + } + } +} \ No newline at end of file diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CoreAnnotation.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CoreAnnotation.java new file mode 100644 index 0000000..9e5632a --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CoreAnnotation.java @@ -0,0 +1,25 @@ 
+package edu.stanford.nlp.ling; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.TypesafeMap; + +/** + * The base class for any annotation that can be marked on a {@link CoreMap}, + * parameterized by the type of the value associated with the annotation. + * Subclasses of this class are the keys in the {@link CoreMap}, so they are + * instantiated only by utility methods in {@link CoreAnnotations}. + * + * @author dramage + * @author rafferty + */ +public interface CoreAnnotation + extends TypesafeMap.Key { + + /** + * Returns the type associated with this annotation. This method must + * return the same class type as its value type parameter. It feels like + * one should be able to get away without this method, but because Java + * erases the generic type signature, that info disappears at runtime. + */ + public Class getType(); +} diff --git a/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CoreAnnotations.java b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CoreAnnotations.java new file mode 100644 index 0000000..c496d4e --- /dev/null +++ b/corenlp-wrapper/stanford-corenlp-full-2013-04-04/src/edu/stanford/nlp/ling/CoreAnnotations.java @@ -0,0 +1,1478 @@ +package edu.stanford.nlp.ling; + +import java.util.Calendar; +import java.util.List; +import java.util.Map; +import java.util.SortedSet; + +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.ErasureUtils; +import edu.stanford.nlp.util.IntPair; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Triple; + +/** + *

    + * Set of common annotations for {@link CoreMap}s. The classes + * defined here are typesafe keys for getting and setting annotation + * values. These classes need not be instantiated outside of this + * class. e.g {@link TextAnnotation}.class serves as the key and a + * String serves as the value containing the + * corresponding word. + *

    + * + *

    + * New types of {@link CoreAnnotation} can be defined anywhere that is + * convenient in the source tree - they are just classes. This file exists to + * hold widely used "core" annotations and others inherited from the + * {@link Label} family. In general, most keys should be placed in this file as + * they may often be reused throughout the code. This architecture allows for + * flexibility, but in many ways it should be considered as equivalent to an + * enum in which everything should be defined + *

    + * + *

    + * The getType method required by CoreAnnotation must return the same class type + * as its value type parameter. It feels like one should be able to get away + * without that method, but because Java erases the generic type signature, that + * info disappears at runtime. See {@link ValueAnnotation} for an example. + *

    + * + * @author dramage + * @author rafferty + * @author bethard + */ +public class CoreAnnotations { + + private CoreAnnotations() { + } // only static members + + /** + * The CoreMap key identifying the annotation's text. + * + * Note that this key is intended to be used with many different kinds of + * annotations - documents, sentences and tokens all have their own text. + */ + public static class TextAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + + /** + * The CoreMap key for getting the lemma (morphological stem) of a token. + * + * This key is typically set on token annotations. + * + * TODO: merge with StemAnnotation? + */ + public static class LemmaAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The CoreMap key for getting the Penn part of speech of a token. + * + * This key is typically set on token annotations. + */ + public static class PartOfSpeechAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The CoreMap key for getting the token-level named entity tag (e.g., DATE, + * PERSON, etc.) + * + * This key is typically set on token annotations. + */ + public static class NamedEntityTagAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The CoreMap key for getting the token-level named entity tag (e.g., DATE, + * PERSON, etc.) from a previous NER tagger. NERFeatureFactory is sensitive to + * this tag and will turn the annotations from the previous NER tagger into + * new features. This is currently used to implement one level of stacking -- + * we may later change it to take a list as needed. + * + * This key is typically set on token annotations. 
+ */ + public static class StackedNamedEntityTagAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The CoreMap key for getting the token-level true case annotation (e.g., + * INIT_UPPER) + * + * This key is typically set on token annotations. + */ + public static class TrueCaseAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The CoreMap key identifying the annotation's true-cased text. + * + * Note that this key is intended to be used with many different kinds of + * annotations - documents, sentences and tokens all have their own text. + */ + public static class TrueCaseTextAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The CoreMap key for getting the tokens contained by an annotation. + * + * This key should be set for any annotation that contains tokens. It can be + * done without much memory overhead using List.subList. + */ + public static class TokensAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.>> uncheckedCast(List.class); + } + } + + /** + * The CoreMap key for getting the tokens (can be words, phrases or anything that are of type CoreMap) contained by an annotation. + * + * This key should be set for any annotation that contains tokens (words, phrases etc). It can be + * done without much memory overhead using List.subList. + */ + public static class GenericTokensAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.>> uncheckedCast(List.class); + } + } + + /** + * The CoreMap key for getting the sentences contained by an annotation. + * + * This key is typically set only on document annotations. 
+ */ + public static class SentencesAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.uncheckedCast(List.class); + } + } + + /** + * The CoreMap key for getting the paragraphs contained by an annotation. + * + * This key is typically set only on document annotations. + */ + public static class ParagraphsAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.uncheckedCast(List.class); + } + } + + /** + * The CoreMap key identifying the first token included in an annotation. The + * token with index 0 is the first token in the document. + * + * This key should be set for any annotation that contains tokens. + */ + public static class TokenBeginAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * The CoreMap key identifying the last token after the end of an annotation. + * The token with index 0 is the first token in the document. + * + * This key should be set for any annotation that contains tokens. + */ + public static class TokenEndAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * The CoreMap key identifying the date and time associated with an + * annotation. + * + * This key is typically set on document annotations. + */ + public static class CalendarAnnotation implements CoreAnnotation { + public Class getType() { + return Calendar.class; + } + } + + /** + * These are the keys hashed on by IndexedWord + */ + /** + * This refers to the unique identifier for a "document", where document may + * vary based on your application. + */ + public static class DocIDAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * This indexes a token number inside a sentence. Standardly, tokens are + * indexed within a sentence starting at 1 (not 0: we follow common parlance + * whereby we speak of the first word of a sentence). 
+ * This is generally an individual word or feature index - it is local, and + * may not be uniquely identifying without other identifiers such as sentence + * and doc. However, if these are the same, the index annotation should be a + * unique identifier for differentiating objects. + */ + public static class IndexAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * This indexes the beginning of a span of words, e.g., a constituent in a + * tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}. + * This annotation counts tokens. + * It standardly indexes from 1 (like IndexAnnotation). The reasons for + * this are: (i) Talking about the first word of a sentence is kind of + * natural, and (ii) We use index 0 to refer to an imaginary root in + * dependency output. + */ + public static class BeginIndexAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * This indexes the end of a span of words, e.g., a constituent in a + * tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}. This annotation + * counts tokens. It standardly indexes from 1 (like IndexAnnotation). + * The end index is not a fencepost: its value is equal to the + * IndexAnnotation of the last word in the span. + */ + public static class EndIndexAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * This indicates the sentence should end at this token. Used to + * force the ssplit annotator (eg the WordToSentenceProcessor) to + * start a new sentence at the next token. + */ + public static class ForcedSentenceEndAnnotation + implements CoreAnnotation { + public Class getType() { + return Boolean.class; + } + } + + /** + * Unique identifier within a document for a given sentence. 
+ */ + public static class SentenceIndexAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * Contains the "value" - an ill-defined string used widely in MapLabel. + */ + public static class ValueAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class CategoryAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The exact original surface form of a token. This is created in the + * invertible PTBTokenizer. The tokenizer may normalize the token form to + * match what appears in the PTB, but this key will hold the original characters. + */ + public static class OriginalTextAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * Annotation for the whitespace characters appearing before this word. This + * can be filled in by the tokenizer so that the original text string can be + * reconstructed. + */ + public static class BeforeAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * Annotation for the whitespace characters appear after this word. This can + * be filled in by the tokenizer so that the original text string can be + * reconstructed. + */ + public static class AfterAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * CoNLL dep parsing - coarser POS tags. 
+ */ + public static class CoarseTagAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * CoNLL dep parsing - the dependency type + */ + public static class CoNLLDepAnnotation implements CoreAnnotation { + public Class getType() { + return CoreMap.class; + } + } + + /** + * CoNLL SRL/dep parsing - whether the word is a predicate + */ + public static class CoNLLPredicateAnnotation implements CoreAnnotation { + public Class getType() { + return Boolean.class; + } + } + + /** + * CoNLL SRL/dep parsing - map which, for the current word, specifies its + * specific role for each predicate + */ + public static class CoNLLSRLAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.>> uncheckedCast(Map.class); + } + } + + /** + * CoNLL dep parsing - the dependency type + */ + public static class CoNLLDepTypeAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * CoNLL dep parsing - the index of the word which is the parent of this word + * in the dependency tree + */ + public static class CoNLLDepParentIndexAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * Inverse document frequency of the word this label represents + */ + public static class IDFAnnotation implements CoreAnnotation { + public Class getType() { + return Double.class; + } + } + + /** + * Keys from AbstractMapLabel (descriptions taken from that class) + */ + /** + * The standard key for storing a projected category in the map, as a String. + * For any word (leaf node), the projected category is the syntactic category + * of the maximal constituent headed by the word. Used in SemanticGraph. 
+ */ + public static class ProjectedCategoryAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for a propbank label which is of type Argument + */ + public static class ArgumentAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * Another key used for propbank - to signify core arg nodes or predicate + * nodes + */ + public static class MarkingAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for Semantic Head Word which is a String + */ + public static class SemanticHeadWordAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for Semantic Head Word POS which is a String + */ + public static class SemanticHeadTagAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * Probank key for the Verb sense given in the Propbank Annotation, should + * only be in the verbnode + */ + public static class VerbSenseAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for storing category with functional tags. + */ + public static class CategoryFunctionalTagAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * This is an NER ID annotation (in case the all caps parsing didn't work out + * for you...) + */ + public static class NERIDAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The key for the normalized value of numeric named entities. 
+ */ + public static class NormalizedNamedEntityTagAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public enum SRL_ID { + ARG, NO, ALL_NO, REL + } + + /** + * The key for semantic role labels (Note: please add to this description if + * you use this key) + */ + public static class SRLIDAnnotation implements CoreAnnotation { + public Class getType() { + return SRL_ID.class; + } + } + + /** + * The standard key for the "shape" of a word: a String representing the type + * of characters in a word, such as "Xx" for a capitalized word. See + * {@link edu.stanford.nlp.process.WordShapeClassifier} for functions for + * making shape strings. + */ + public static class ShapeAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The Standard key for storing the left terminal number relative to the root + * of the tree of the leftmost terminal dominated by the current node + */ + public static class LeftTermAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * The standard key for the parent which is a String + */ + public static class ParentAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class INAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for span which is an IntPair + */ + public static class SpanAnnotation implements CoreAnnotation { + public Class getType() { + return IntPair.class; + } + } + + /** + * The standard key for the answer which is a String + */ + public static class AnswerAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for gold answer which is a String + */ + public static class GoldAnswerAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + 
} + } + + /** + * The standard key for the features which is a Collection + */ + public static class FeaturesAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for the semantic interpretation + */ + public static class InterpretationAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for the semantic role label of a phrase. + */ + public static class RoleAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * The standard key for the gazetteer information + */ + public static class GazetteerAnnotation implements CoreAnnotation> { + public Class> getType() { + return ErasureUtils.uncheckedCast(List.class); + } + } + + /** + * Morphological stem of the word this label represents + */ + public static class StemAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class PolarityAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class MorphoNumAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class MorphoPersAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class MorphoGenAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class MorphoCaseAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * for Chinese: character level information, segmentation + */ + public static class ChineseCharAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class ChineseOrigSegAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } 
+ } + + public static class ChineseSegAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + /** + * Not sure exactly what this is, but it is different from + * ChineseSegAnnotation and seems to indicate if the text is segmented + */ + public static class ChineseIsSegmentedAnnotation implements CoreAnnotation { + public Class getType() { + return Boolean.class; + } + } + + /** + * The CoreMap key identifying the offset of the first character of an + * annotation. The character with index 0 is the first character in the + * document. + * + * This key should be set for any annotation that represents a span of text. + */ + public static class CharacterOffsetBeginAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * The CoreMap key identifying the offset of the last character after the end + * of an annotation. The character with index 0 is the first character in the + * document. + * + * This key should be set for any annotation that represents a span of text. + */ + public static class CharacterOffsetEndAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * Key for relative value of a word - used in RTE + */ + public static class CostMagnificationAnnotation implements CoreAnnotation { + public Class getType() { + return Double.class; + } + } + + public static class WordSenseAnnotation implements CoreAnnotation { + public Class getType() { + return String.class; + } + } + + public static class SRLInstancesAnnotation implements CoreAnnotation>>> { + public Class>>> getType() { + return ErasureUtils.uncheckedCast(List.class); + } + } + + /** + * Used by RTE to track number of text sentences, to determine when hyp + * sentences begin. 
+ */ + public static class NumTxtSentencesAnnotation implements CoreAnnotation { + public Class getType() { + return Integer.class; + } + } + + /** + * Used in Trees + */ + public static class TagLabelAnnotation implements CoreAnnotation