22from udapi .core .block import Block
33
44class Link2Cluster (Block ):
5- """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format."""
5+ """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.
66
7- def __init__ (self , id_attr = 'proiel-id' , ante_attr = 'antecedent-proiel-id' , delete_orig_attrs = True , ** kwargs ):
7+ Params:
8+ id_attr: name of the attribute in MISC that stores the original-format IDs of nodes
9+ ante_attr: name of the attribute in MISC that stores the ID of the antecedent
10+ of the current node (in the same format as `id_attr`).
11+ delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion?
12+ (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr,
13+ bridge_attr, bridge_relation_attr if these are used). Default=True.
14+ infstat_attr: name of the attribute in MISC that stores the information status of a given mention
15+ Will be stored in `mention.other['infstat']`. Use None for ignoring this.
16+ coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention
17+ Will be stored in `mention.other['coreftype']`. Use None for ignoring this.
18+ bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent
19+ of the current node/mention (in the same format as `id_attr`).
20+ Default=None, i.e. ignore this parameter.
21+ bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type
22+ (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter.
23+ eid_counter: use a global counter of entity.eid and start with a given number. Default=1.
24+ The main goal of this parameter is to make eid unique across multiple documents.
25+ If you use eid_counter=0, this feature will be turned off,
26+ so entities will be created using `root.document.create_coref_entity()`,
27+ with no eid parameter, so that the eid will start from "e1" in each document processed by this block.
28+ """
def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True,
             infstat_attr='information-status', coreftype_attr='coreftype',
             bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs):
    """Store the conversion settings; remaining keyword arguments go to the parent constructor.

    All ``*_attr`` arguments are names of MISC attributes and are kept as given.
    ``eid_counter`` is converted with ``int()``, so a numeric string is accepted too.
    """
    super().__init__(**kwargs)
    # Attributes describing the coreference links themselves.
    self.id_attr, self.ante_attr = id_attr, ante_attr
    self.delete_orig_attrs = delete_orig_attrs
    # Optional per-mention attributes.
    self.infstat_attr, self.coreftype_attr = infstat_attr, coreftype_attr
    # Optional bridging attributes.
    self.bridge_attr, self.bridge_relation_attr = bridge_attr, bridge_relation_attr
    # Converted last, after all the plain attributes are stored.
    self.eid_counter = int(eid_counter)
41+
def _new_entity(self, doc):
    """Create a new coreference entity in `doc` and return it.

    If `self.eid_counter` is falsy (e.g. 0), the entity is created by calling
    `doc.create_coref_entity()` without an explicit eid.
    Otherwise the entity gets the eid "e<N>", where N is the current value
    of `self.eid_counter`, and the counter is then incremented by one.
    """
    if not self.eid_counter:
        return doc.create_coref_entity()
    # The eid must be exactly "e<N>": no whitespace may leak into the identifier.
    entity = doc.create_coref_entity(eid=f"e{self.eid_counter}")
    self.eid_counter += 1
    return entity
48+
def _new_mention(self, entity, node):
    """Create a one-word mention headed by `node` in `entity` and return it.

    For each configured MISC attribute (`self.infstat_attr`, `self.coreftype_attr`)
    that has a non-empty value on the node, the value is copied to
    `mention.other['infstat']` or `mention.other['coreftype']` respectively.
    If `self.delete_orig_attrs` is set, the copied attribute is removed from `node.misc`.
    """
    new_mention = entity.create_mention(head=node, words=[node])
    copied = (('infstat', self.infstat_attr), ('coreftype', self.coreftype_attr))
    for other_key, misc_attr in copied:
        # A falsy attribute name (e.g. None) means "ignore this piece of information".
        if not misc_attr:
            continue
        value = node.misc[misc_attr]
        if not value:
            continue
        new_mention.other[other_key] = value
        if self.delete_orig_attrs:
            del node.misc[misc_attr]
    return new_mention
1260
1361 def process_document (self , doc ):
1462 id2node = {}
1563 links = []
64+ bridges = []
1665 for node in doc .nodes_and_empty :
1766 this_id = node .misc [self .id_attr ]
1867 if this_id != '' :
@@ -26,6 +75,16 @@ def process_document(self, doc):
2675 if self .delete_orig_attrs :
2776 for attr in (self .id_attr , self .ante_attr ):
2877 del node .misc [attr ]
78+ if self .bridge_attr :
79+ bridge_id = node .misc [self .bridge_attr ]
80+ if bridge_id != '' :
81+ if bridge_id == this_id :
82+ logging .warning (f"{ node } has a self-reference bridging { self .bridge_attr } ={ bridge_id } " )
83+ else :
84+ bridges .append ([bridge_id , this_id , node .misc [self .bridge_relation_attr ]])
85+ if self .delete_orig_attrs :
86+ for attr in (self .bridge_attr , self .bridge_relation_attr ):
87+ del node .misc [attr ]
2988
3089 # It seems faster&simpler to process the links in any order and implement entity merging,
3190 # rather than trying to sort the links so that no entity merging is needed.
@@ -36,14 +95,9 @@ def process_document(self, doc):
3695 ante_node , this_node = id2node [ante_id ], id2node [this_id ]
3796 if not this_node .coref_mentions and not ante_node .coref_mentions :
3897 # None of the nodes is part of any mention/entity. Let's create them.
39- entity = this_node .root .document .create_coref_entity ()
40- m_ante = entity .create_mention (head = ante_node , words = [ante_node ])
41- m_this = entity .create_mention (head = this_node , words = [this_node ])
42- for node , mention in ((ante_node , m_ante ), (this_node , m_this )):
43- if node .misc ['information-status' ]:
44- mention .other ['infstat' ] = node .misc ['information-status' ]
45- if self .delete_orig_attrs :
46- del node .misc ['information-status' ]
98+ entity = self ._new_entity (this_node .root .document )
99+ self ._new_mention (entity , ante_node )
100+ self ._new_mention (entity , this_node )
47101 elif this_node .coref_mentions and ante_node .coref_mentions :
48102 # Both of the nodes are part of mentions in different entities.
49103 # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity).
@@ -59,6 +113,25 @@ def process_document(self, doc):
59113 else :
60114 # Only one of the nodes is part of an entity. Let's add the second one to this entity.
61115 if ante_node .coref_mentions :
62- ante_node .coref_entities [0 ]. create_mention ( head = this_node , words = [ this_node ] )
116+ self . _new_mention ( ante_node .coref_entities [0 ], this_node )
63117 else :
64- this_node .coref_entities [0 ].create_mention (head = ante_node , words = [ante_node ])
118+ self ._new_mention (this_node .coref_entities [0 ], ante_node )
119+
120+ # Bridging
121+ for ante_id , this_id , relation in bridges :
122+ if ante_id not in id2node :
123+ logging .warning (f"{ ante_id } is referenced in { self .bridge_attr } , but not in { self .id_attr } " )
124+ else :
125+ ante_node , this_node = id2node [ante_id ], id2node [this_id ]
126+ if ante_node .coref_mentions :
127+ m_ante = next (m for m in ante_node .coref_mentions if m .head is ante_node )
128+ e_ante = m_ante .entity
129+ else :
130+ e_ante = self ._new_entity (ante_node .root .document )
131+ m_ante = self ._new_mention (e_ante , ante_node )
132+ if this_node .coref_mentions :
133+ m_this = next (m for m in this_node .coref_mentions if m .head is this_node )
134+ else :
135+ e_this = self ._new_entity (this_node .root .document )
136+ m_this = self ._new_mention (e_this , this_node )
137+ m_this .bridging .append ((e_ante , relation ))
0 commit comments