-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathlink2cluster.py
More file actions
137 lines (130 loc) · 7.59 KB
/
link2cluster.py
File metadata and controls
137 lines (130 loc) · 7.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import logging
from udapi.core.block import Block
class Link2Cluster(Block):
    """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.

    Params:
    id_attr: name of the attribute in MISC that stores the original-format IDs of nodes.
        Default='proiel-id'.
    ante_attr: name of the attribute in MISC that stores the ID of the antecedent
        of the current node (in the same format as `id_attr`). Default='antecedent-proiel-id'.
    delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion?
        (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr,
        bridge_attr, bridge_relation_attr if these are used). Default=True.
    infstat_attr: name of the attribute in MISC that stores the information status of a given mention
        Will be stored in `mention.other['infstat']`. Use None for ignoring this.
        Default='information-status'.
    coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention
        Will be stored in `mention.other['coreftype']`. Use None for ignoring this.
        Default='coreftype'.
    bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent
        of the current node/mention (in the same format as `id_attr`).
        Default=None, i.e. ignore this parameter.
    bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type
        (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter.
    eid_counter: use a global counter of entity.eid and start with a given number. Default=1.
        The main goal of this parameter is to make eid unique across multiple documents.
        If you use eid_counter=0, this feature will be turned off,
        so entities will be created using `root.document.create_coref_entity()`,
        with no eid parameter, so that the eid will start from "e1" in each document processed by this block.
    """

    def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True,
                 infstat_attr='information-status', coreftype_attr='coreftype',
                 bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs):
        """Store the conversion parameters (see the class docstring); other kwargs go to the parent class."""
        super().__init__(**kwargs)
        self.id_attr = id_attr
        self.ante_attr = ante_attr
        self.delete_orig_attrs = delete_orig_attrs
        self.infstat_attr = infstat_attr
        self.coreftype_attr = coreftype_attr
        self.bridge_attr = bridge_attr
        self.bridge_relation_attr = bridge_relation_attr
        # int() so that a value passed as a string (e.g. "0") is interpreted numerically.
        self.eid_counter = int(eid_counter)

    def _new_entity(self, doc):
        """Create a new coreference entity in `doc` and return it.

        With a non-zero `self.eid_counter`, the entity gets the eid "e<counter>" and the counter
        is incremented; with a zero counter, the eid is left up to `doc.create_coref_entity()`.
        """
        if not self.eid_counter:
            return doc.create_coref_entity()
        entity = doc.create_coref_entity(eid=f"e{self.eid_counter}")
        self.eid_counter += 1
        return entity

    def _new_mention(self, entity, node):
        """Create a single-word mention of `entity` consisting of `node` (which is also its head).

        Non-empty values of the `infstat_attr` and `coreftype_attr` MISC attributes of `node`
        are copied to `mention.other` (and deleted from MISC if `delete_orig_attrs` is set).
        Returns the new mention.
        """
        mention = entity.create_mention(head=node, words=[node])
        for attr, other_key in ((self.infstat_attr, 'infstat'), (self.coreftype_attr, 'coreftype')):
            if attr and node.misc[attr]:
                mention.other[other_key] = node.misc[attr]
                if self.delete_orig_attrs:
                    del node.misc[attr]
        return mention

    def process_document(self, doc):
        """Convert the link-based annotation stored in MISC of the nodes of `doc` into entities and mentions.

        The conversion has three phases:
        1. collect the coreference links and bridging links from all nodes that have `id_attr`,
        2. build entities from the coreference links (merging entities when needed),
        3. attach the bridging links to the mentions.
        Inconsistent input (self-references, unknown antecedent IDs, redundant links)
        is reported via `logging.warning` and skipped.
        """
        id2node = {}
        links = []
        bridges = []
        for node in doc.nodes_and_empty:
            this_id = node.misc[self.id_attr]
            if this_id == '':
                continue
            id2node[this_id] = node
            ante_id = node.misc[self.ante_attr]
            if ante_id != '':
                if ante_id == this_id:
                    logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}")
                else:
                    links.append([ante_id, this_id])
            if self.delete_orig_attrs:
                for attr in (self.id_attr, self.ante_attr):
                    del node.misc[attr]
            if self.bridge_attr:
                bridge_id = node.misc[self.bridge_attr]
                if bridge_id != '':
                    if bridge_id == this_id:
                        logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}")
                    else:
                        bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]])
                if self.delete_orig_attrs:
                    for attr in (self.bridge_attr, self.bridge_relation_attr):
                        del node.misc[attr]

        # It seems faster&simpler to process the links in any order and implement entity merging,
        # rather than trying to sort the links so that no entity merging is needed.
        for ante_id, this_id in links:
            if ante_id not in id2node:
                logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}")
                continue
            ante_node, this_node = id2node[ante_id], id2node[this_id]
            if not this_node.coref_mentions and not ante_node.coref_mentions:
                # None of the nodes is part of any mention/entity. Let's create them.
                entity = self._new_entity(this_node.root.document)
                self._new_mention(entity, ante_node)
                self._new_mention(entity, this_node)
            elif this_node.coref_mentions and ante_node.coref_mentions:
                # Both of the nodes are already part of mentions.
                # Let's merge the two entities: all mentions of the entity of `this_node`
                # are "stolen" by the entity of `ante_node`, so the antecedent's entity survives.
                # While the official API supports "stealing" a single mention (m.entity = another_entity),
                # the implementation below using _mentions and _entity is a bit faster.
                absorbed, kept = this_node.coref_entities[0], ante_node.coref_entities[0]
                if absorbed == kept:
                    # Cyclic or duplicated links lead here. Merging an entity with itself
                    # would duplicate and then clear its own mentions, so skip the link instead.
                    logging.warning(f"{this_node} and {ante_node} are already in the same entity, "
                                    f"ignoring the redundant link {self.ante_attr}={ante_id}")
                    continue
                for mention in absorbed.mentions:
                    mention._entity = kept
                kept._mentions.extend(absorbed.mentions)
                kept._mentions.sort()
                absorbed._mentions.clear()
            elif ante_node.coref_mentions:
                # Only the antecedent is part of an entity. Let's add this node to that entity.
                self._new_mention(ante_node.coref_entities[0], this_node)
            else:
                # Only this node is part of an entity. Let's add the antecedent to that entity.
                self._new_mention(this_node.coref_entities[0], ante_node)

        # Bridging
        for ante_id, this_id, relation in bridges:
            if ante_id not in id2node:
                logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}")
                continue
            ante_node, this_node = id2node[ante_id], id2node[this_id]
            # The bridging target is the entity of the mention headed by the antecedent node;
            # a new single-mention entity is created if the antecedent is not in any mention yet.
            if ante_node.coref_mentions:
                e_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node).entity
            else:
                e_ante = self._new_entity(ante_node.root.document)
                self._new_mention(e_ante, ante_node)
            # The bridging source is the mention headed by this node (created if missing).
            if this_node.coref_mentions:
                m_this = next(m for m in this_node.coref_mentions if m.head is this_node)
            else:
                m_this = self._new_mention(self._new_entity(this_node.root.document), this_node)
            m_this.bridging.append((e_ante, relation))