Coreferences are now parsed. Offsets are adjusted to account for dum… · TPNguyen/stanford-corenlp-python@4d3132d · GitHub
[go: up one dir, main page]

Skip to content

Commit 4d3132d

Browse files
committed
Coreferences are now parsed. Offsets are adjusted to account for dummy pronouns in parse_imperative
1 parent 29e0c0d commit 4d3132d

File tree

1 file changed

+41
-7
lines changed

1 file changed

+41
-7
lines changed

corenlp.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,17 @@ def parse_parser_results(text):
5555
av = re.split("=| ", s)
5656
# make [ignore,ignore,a,b,c,d] into [[a,b],[c,d]]
5757
# and save as attr-value dict, convert numbers into ints
58-
tmp['words'].append((av[1], dict(zip(*[av[2:][x::2] for x in (0, 1)]))))
58+
#tmp['words'].append((av[1], dict(zip(*[av[2:][x::2] for x in (0, 1)]))))
5959
# tried to convert digits to ints instead of strings, but
6060
# it seems the results of this can't be serialized into JSON?
61-
# av = zip(*[av[2:][x::2] for x in (0, 1)])
62-
# tmp['words'][av[1]] = dict(map(lambda x: (x[0], x[1].isdigit() and int(x[1]) or x[1]), av))
61+
word = av[1]
62+
attributes = {}
63+
for a,v in zip(*[av[2:][x::2] for x in (0, 1)]):
64+
if v.isdigit():
65+
attributes[a] = int(v)
66+
else:
67+
attributes[a] = v
68+
tmp['words'].append((word, attributes))
6369
state = 3
6470
elif state == 3:
6571
# skip over parse tree
@@ -72,12 +78,22 @@ def parse_parser_results(text):
7278
if not line.startswith(" ") and line.endswith(")"):
7379
split_entry = re.split("\(|, ", line[:-1])
7480
if len(split_entry) == 3:
75-
rel, left, right = map(lambda x: remove_id(x), split_entry)
81+
rel, left, right = map(lambda x: x, split_entry)
7682
tmp['tuples'].append(tuple([rel,left,right]))
7783
elif "Coreference links" in line:
7884
state = 5
7985
elif state == 5:
80-
# coreference links. Not yet implemented
86+
crexp = re.compile('\s(\d*)\s(\d*)\s\-\>\s(\d*)\s(\d*), that is')
87+
matches = crexp.findall(line)
88+
for src_i, src_pos, sink_i, sink_pos in matches:
89+
print "COREF MATCH", src_i, sink_i
90+
src = tmp['words'][int(src_pos)-1][0]
91+
sink = tmp['words'][int(sink_pos)-1][0]
92+
if tmp.has_key('coref'):
93+
tmp['coref'].append((src, sink))
94+
else:
95+
tmp['coref'] = [(src, sink)]
96+
8197
print "CR", line
8298
if len(tmp.keys()) != 0:
8399
results.append(tmp)
@@ -191,8 +207,9 @@ def _parse(self, text, verbose=True):
191207
def _debug_parse(self, text, verbose=True):
192208
print "DEBUG PARSE -- "
193209
rf = open("test.out", 'r')
194-
results = rf.readlines()
210+
incoming = ''.join(rf.readlines())
195211
rf.close()
212+
results = parse_parser_results(incoming)
196213
return results
197214

198215
def parse(self, text, verbose=True):
@@ -220,7 +237,12 @@ def parse_imperative(self, text, verbose=True):
220237
used_pronoun = None
221238
pronouns = ["you","he", "she","i"]
222239
for p in pronouns:
240+
if text.startswith(p+" "):
241+
# it's already an imperative!
242+
used_pronoun = None
243+
break
223244
if p not in text:
245+
# found one not in there already
224246
used_pronoun = p
225247
break
226248
# if you can't find one, regress to original parse
@@ -229,19 +251,31 @@ def parse_imperative(self, text, verbose=True):
229251

230252
# create text with pronoun and parse it
231253
new_text = used_pronoun+" "+text.lstrip()
232-
result = self._parse(new_text, verbose)
254+
result = self._debug_parse(new_text, verbose)
255+
256+
if len(result) != 1:
257+
print "Non-imperative sentence? Multiple sentences found."
233258

234259
# remove the dummy pronoun
260+
used_pronoun_offset = len(used_pronoun)+1
235261
if result[0].has_key('text'):
236262
result[0]['text'] = text
237263
result[0]['tuples'] = filter(lambda x: not (x[1] == used_pronoun or x[2]
238264
== used_pronoun), result[0]['tuples'])
239265
result[0]['words'] = result[0]['words'][1:]
266+
# account for offset
267+
ct = 0
268+
for word, av in result[0]['words']:
269+
for a,v in av.items():
270+
if a.startswith("CharacterOffset"):
271+
result[0]['words'][ct][1][a] = v-used_pronoun_offset
272+
ct += 1
240273
return dumps(result)
241274
else:
242275
# if there's a timeout error, just return it.
243276
return dumps(result)
244277

278+
245279
if __name__ == '__main__':
246280
parser = optparse.OptionParser(usage="%prog [OPTIONS]")
247281
parser.add_option(

0 commit comments

Comments
 (0)
0