File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ from udapi .core .block import Block
2+ import udapi .core .coref
3+ import logging
4+
class FixEntityAcrossNewdoc(Block):
    """
    Fix the error reported by validate.py --coref:
    "[L6 Coref entity-across-newdoc] Same entity id should not occur in multiple documents"
    by making the entity IDs (eid) unique in each newdoc document.

    This block uses Udapi's support for loading GUM-like GRP document-wide IDs
    (so the implementation is simple, although unnecessarily slow).
    After applying this block, IDs of all entities are prefixed with document numbers,
    e.g. "e45" in the 12th document changes to "d12.e45".
    If you prefer simple eid, use corefud.IndexClusters afterwards.
    """

    def process_document(self, doc):
        """Rewrite entity IDs of `doc` so that they are unique per newdoc document.

        The coreference annotation is first serialized via
        `udapi.core.coref.store_coref_to_misc`, then the first attribute of the
        `global.Entity` header is temporarily switched from "eid" to "GRP" and the
        annotation is loaded again via `udapi.core.coref.load_coref_from_misc`.
        Finally the header is switched back to "eid".

        Args:
            doc: the Udapi document to fix (modified in place).

        Raises:
            AssertionError: if the `global.Entity` header does not start with "eid".
        """
        if not doc.eid_to_entity:
            # Only a warning: processing intentionally continues, as before.
            logging.warning(f"No entities in document {doc.meta}")
        udapi.core.coref.store_coref_to_misc(doc)

        # Read the header only after storing, exactly as the original code did.
        global_entity = doc.meta["global.Entity"]
        # Explicit raise instead of `assert`, so the check is not stripped under
        # `python -O`; otherwise an unexpected header would be silently corrupted
        # by the prefix replacement below. AssertionError is kept for compatibility.
        if not global_entity.startswith("eid"):
            raise AssertionError(
                f'global.Entity must start with "eid", got {global_entity!r}'
            )

        # Pretend that the IDs are GUM-like document-wide GRP IDs while re-loading.
        doc.meta["global.Entity"] = "GRP" + global_entity[3:]
        udapi.core.coref.load_coref_from_misc(doc)
        # Restore the "eid" name of the first attribute (re-reading the header,
        # in case the loader has touched it).
        doc.meta["global.Entity"] = "eid" + doc.meta["global.Entity"][3:]
You can’t perform that action at this time.
0 commit comments