Merge pull request #3 from jcccf/parse_fix · imclab/stanford-corenlp-python@2e90032 · GitHub
[go: up one dir, main page]

Skip to content
Commit 2e90032

Browse files
committed
Merge pull request dasmith#3 from jcccf/parse_fix
Updated for Latest CoreNLP (2012-04-09)
2 parents 14b6306 + 221513e commit 2e90032

File tree

1 file changed

+58
-38
lines changed

1 file changed

+58
-38
lines changed

corenlp.py

Lines changed: 58 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import os
1515
import time
1616
import re
17+
from unidecode import unidecode
1718

1819
import pexpect
1920

@@ -25,6 +26,27 @@ def remove_id(word):
2526
"""Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word' """
2627
return word.count("-") == 0 and word or word[0:word.rindex("-")]
2728

29+
def parse_bracketed(s):
30+
'''Parse word features [abc=... def = ...]
31+
Also manages to parse out features that have XML within them
32+
'''
33+
word = None
34+
attrs = {}
35+
temp = {}
36+
# Substitute XML tags, to replace them later
37+
for i, tag in enumerate(re.findall(r"(<[^<>]+>.*<\/[^<>]+>)", s)):
38+
temp["^^^%d^^^" % i] = tag
39+
s = s.replace(tag, "^^^%d^^^" % i)
40+
# Load key-value pairs, substituting as necessary
41+
for attr, val in re.findall(r"([^=\s]*)=([^=\s]*)", s):
42+
if val in temp:
43+
val = temp[val]
44+
if attr == 'Text':
45+
word = val
46+
else:
47+
attrs[attr] = val
48+
return (word, attrs)
49+
2850
def parse_parser_results(text):
2951
""" This is the nasty bit of code to interact with the command-line
3052
interface of the CoreNLP tools. Takes a string of the parser results
@@ -33,12 +55,14 @@ def parse_parser_results(text):
3355
"""
3456
state = 0
3557
tmp = {}
36-
results = []
58+
coref_set = []
59+
results = { "sentences": [] }
60+
text = unidecode(text) # Force output conversion to ASCII to avoid RPC error
3761
for line in text.split("\n"):
3862
if line.startswith("Sentence #"):
3963
state = 1
4064
if len(tmp.keys()) != 0:
41-
results.append(tmp)
65+
results["sentences"].append(tmp) # Put results in "sentences" key so "corefs" can exist outside
4266
tmp = {}
4367
elif state == 1:
4468
tmp['text'] = line.strip()
@@ -51,28 +75,17 @@ def parse_parser_results(text):
5175
exp = re.compile('\[([^\]]+)\]')
5276
matches = exp.findall(line)
5377
for s in matches:
54-
print s
55-
# split into attribute-value list
56-
av = re.split("=| ", s)
57-
# make [ignore,ignore,a,b,c,d] into [[a,b],[c,d]]
58-
# and save as attr-value dict, convert numbers into ints
59-
#tmp['words'].append((av[1], dict(zip(*[av[2:][x::2] for x in (0, 1)]))))
60-
# tried to convert digits to ints instead of strings, but
61-
# it seems the results of this can't be serialized into JSON?
62-
word = av[1]
63-
attributes = {}
64-
for a,v in zip(*[av[2:][x::2] for x in (0, 1)]):
65-
if v.isdigit():
66-
attributes[a] = int(v)
67-
else:
68-
attributes[a] = v
69-
tmp['words'].append((word, attributes))
78+
tmp['words'].append(parse_bracketed(s))
7079
state = 3
80+
tmp['parsetree'] = []
7181
elif state == 3:
72-
# skip over parse tree
82+
# Output parse tree as well (useful especially if you want to pull this into NLTK)
7383
if not (line.startswith(" ") or line.startswith("(ROOT")):
7484
state = 4
75-
tmp['tuples'] = []
85+
tmp['parsetree'] = " ".join(tmp['parsetree'])
86+
tmp['tuples'] = []
87+
else:
88+
tmp['parsetree'].append(line.strip())
7689
if state == 4:
7790
# dependency parse
7891
line = line.rstrip()
@@ -81,27 +94,34 @@ def parse_parser_results(text):
8194
if len(split_entry) == 3:
8295
rel, left, right = map(lambda x: remove_id(x), split_entry)
8396
tmp['tuples'].append(tuple([rel,left,right]))
84-
elif "Coreference links" in line:
97+
elif "Coreference set" in line:
8598
state = 5
99+
coref_set = []
86100
elif state == 5:
87-
crexp = re.compile('\s(\d*)\s(\d*)\s\-\>\s(\d*)\s(\d*), that is')
101+
if "Coreference set" in line: # Create new coreference set if needed
102+
if len(coref_set) > 0:
103+
if results.has_key('coref'):
104+
results['coref'].append(coref_set)
105+
else:
106+
results['coref'] = [coref_set]
107+
coref_set = []
108+
else:
109+
# Updated for new coreference format
110+
crexp = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
88111
matches = crexp.findall(line)
89-
for src_i, src_pos, sink_i, sink_pos in matches:
90-
# TODO: src_i and sink_i correspond to the sentences.
91-
# this was built for single sentences, and thus ignores
92-
# the sentence number. Should be fixed, but would require
93-
# restructuring the entire output.
94-
print "COREF MATCH", src_i, sink_i
95-
src = tmp['words'][int(src_pos)-1][0]
96-
sink = tmp['words'][int(sink_pos)-1][0]
97-
if tmp.has_key('coref'):
98-
tmp['coref'].append((src, sink))
99-
else:
100-
tmp['coref'] = [(src, sink)]
101-
112+
for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in matches:
113+
src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
114+
sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
115+
print "COREF MATCH", src_i, sink_i
116+
coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
102117
print "CR", line
103118
if len(tmp.keys()) != 0:
104-
results.append(tmp)
119+
results["sentences"].append(tmp)
120+
if len(coref_set) > 0: # Add final coreference set if needed
121+
if results.has_key('coref'):
122+
results['coref'].append(coref_set)
123+
else:
124+
results['coref'] = [coref_set]
105125
return results
106126

107127
class StanfordCoreNLP(object):
@@ -116,8 +136,8 @@ def __init__(self):
116136
Spawns the server as a process.
117137
"""
118138

119-
jars = ["stanford-corenlp-2011-09-16.jar",
120-
"stanford-corenlp-2011-09-14-models.jar",
139+
jars = ["stanford-corenlp-2012-04-09.jar",
140+
"stanford-corenlp-2012-04-09-models.jar",
121141
"joda-time.jar",
122142
"xom.jar"]
123143

0 commit comments

Comments (0)