@@ -13,45 +13,52 @@ def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete
def process_document(self, doc):
    """Convert link-based coreference annotation in *doc* to entities and mentions.

    Every node in ``doc.nodes_and_empty`` whose MISC has a non-empty
    ``self.id_attr`` is registered under that ID.  If such a node also has a
    non-empty ``self.ante_attr``, the pair (antecedent ID, this ID) is
    recorded as a link; a node that names itself as its antecedent is
    reported with a warning and produces no link.  When
    ``self.delete_orig_attrs`` is true, both attributes are deleted from the
    MISC of every node that has an ID.

    The links are then processed in the order they were collected:

    * neither node has a mention yet -> a new entity is created with a
      single-word mention for each node;
    * exactly one node has a mention -> the other node is added to that
      node's first entity;
    * both nodes have mentions in different entities -> the entities are
      merged;
    * both nodes already share an entity (e.g. a cycle of antecedent
      links) -> nothing is left to do and the link is skipped.

    A link whose antecedent ID was never seen in ``self.id_attr`` is
    reported with a warning and ignored.  Each mention created here gets
    ``other['infstat']`` from the node's ``information-status`` MISC
    attribute when that attribute is non-empty.
    """
    id2node = {}
    links = []
    for node in doc.nodes_and_empty:
        this_id = node.misc[self.id_attr]
        if this_id == '':
            continue
        id2node[this_id] = node
        ante_id = node.misc[self.ante_attr]
        if ante_id == this_id:
            logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}")
        elif ante_id != '':
            links.append((ante_id, this_id))
        if self.delete_orig_attrs:
            for attr in (self.id_attr, self.ante_attr):
                del node.misc[attr]

    def add_mention(entity, node):
        # Create a single-word mention headed by `node` and carry over its
        # information status, so the result does not depend on which link
        # happened to introduce the node.
        mention = entity.create_mention(head=node, words=[node])
        infstat = node.misc['information-status']
        if infstat:
            mention.other['infstat'] = infstat
            if self.delete_orig_attrs:
                del node.misc['information-status']
        return mention

    # It seems faster and simpler to process the links in any order and
    # implement entity merging, rather than trying to sort the links so that
    # no entity merging is needed.
    for ante_id, this_id in links:
        ante_node = id2node.get(ante_id)
        if ante_node is None:
            logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}")
            continue
        this_node = id2node[this_id]
        ante_has_mention = bool(ante_node.coref_mentions)
        this_has_mention = bool(this_node.coref_mentions)

        if not ante_has_mention and not this_has_mention:
            # None of the nodes is part of any mention/entity. Let's create them.
            entity = this_node.root.document.create_coref_entity()
            add_mention(entity, ante_node)
            add_mention(entity, this_node)
        elif ante_has_mention and this_has_mention:
            kept = ante_node.coref_entities[0]
            absorbed = this_node.coref_entities[0]
            if kept is absorbed:
                # Both nodes are already coreferent (e.g. the links form a
                # cycle). Merging an entity with itself would duplicate and
                # then wipe its mentions, so skip the redundant link.
                continue
            # Merge the two entities by moving all mentions of this node's
            # entity into the antecedent's entity. While the official API
            # supports "stealing" a single mention (m.entity = another_entity),
            # the implementation below using _mentions and _entity is a bit faster.
            moved = list(absorbed.mentions)
            for mention in moved:
                mention._entity = kept
            kept._mentions.extend(moved)
            kept._mentions.sort()
            # NOTE(review): `absorbed` is left with no mentions; confirm that
            # empty entities are dropped or harmless downstream.
            absorbed._mentions.clear()
        elif ante_has_mention:
            # Only the antecedent is part of an entity. Add this node to it.
            add_mention(ante_node.coref_entities[0], this_node)
        else:
            # Only this node is part of an entity. Add the antecedent to it.
            add_mention(this_node.coref_entities[0], ante_node)
0 commit comments