Skip to content

Commit d9c43c2

Browse files
committed
New block to join two neighboring tokens without a space between them.
1 parent 4df8bdf commit d9c43c2

4 files changed

Lines changed: 98 additions & 1 deletion

File tree

tutorial/udapi-tutorial-dz.odt

15.8 KB
Binary file not shown.

tutorial/udapi-tutorial-dz.pdf

95.4 KB
Binary file not shown.

udapi/block/ud/jointoken.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""
2+
Block ud.JoinToken will join a given token with the preceding one.
3+
"""
4+
from udapi.core.block import Block
5+
import logging
6+
7+
8+
class JoinToken(Block):
    """
    Merge two tokens into one. A MISC attribute is used to mark the tokens that
    should join the preceding token. (The attribute may have been set by an
    annotator or by a previous block that tests the specific conditions under
    which joining is desired.) Joining cannot be done across sentence
    boundaries; if necessary, apply util.JoinSentence first. Multiword tokens
    are currently not supported: None of the nodes to be merged can belong to
    a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.)
    Merging is simple if there is no space between the tokens (see SpaceAfter=No
    at the first token). If there is a space, there are three options in theory:

    1. Keep the tokens as two nodes but apply the UD goeswith relation
       (see https://universaldependencies.org/u/overview/typos.html) and
       the related annotation rules.
    2. Join them into one token that contains a space. Such "words with
       spaces" can be exceptionally allowed in UD if they are registered
       in the given language.
    3. Remove the space without any trace. Not recommended in UD unless the
       underlying text was created directly for UD and can be thus considered
       part of the annotation.

    At present, this block does not support merging with spaces at all, but
    in the future one or more of the options may be added.

    Enhanced dependencies are not supported either: if one of the two nodes
    to be merged has a non-empty ``deps``, processing is aborted with
    a RuntimeError before anything is modified.
    """

    def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs):
        """
        Args:
        misc_name: name of the MISC attribute that can trigger the joining
            default: JoinToken
        misc_value: value of the MISC attribute to trigger the joining;
            if not specified, then simple occurrence of the attribute with any value will cause the joining
        MISC attributes that have triggered token joining will be removed
        together with the node that carried them.
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name
        self.misc_value = misc_value

    def process_node(self, node):
        """
        The JoinToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be merged with the previous node and the
        attribute will disappear together with the removed node, or a warning
        will be issued that the merging cannot be done, the attribute will
        stay in MISC and a Bug attribute will be added to MISC. Note that
        multiword token lines and empty nodes are not even scanned for the
        attribute, so if it is there, it will stay there but no warning will
        be printed.

        Args:
        node: the node whose MISC is inspected; if triggered, it is merged
            into node.prev_node and removed from the tree.

        Raises:
        RuntimeError: if the node or its predecessor has enhanced
            dependencies (non-empty deps); nothing has been modified yet
            when the exception is raised.
        """
        # An empty string is what this block treats as "attribute not set".
        trigger = node.misc[self.misc_name]
        if trigger == '':
            return
        # A specific value is required only if the user configured one.
        if self.misc_value and trigger != self.misc_value:
            return
        prevnode = node.prev_node
        if not prevnode:
            logging.warning("MISC %s cannot be used at the first token of a sentence.", self.misc_name)
            node.misc['Bug'] = 'JoiningTokenNotSupportedHere'
            return
        if node.multiword_token or prevnode.multiword_token:
            logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token.", self.misc_name)
            node.misc['Bug'] = 'JoiningTokenNotSupportedHere'
            return
        if prevnode.misc['SpaceAfter'] != 'No':
            logging.warning("MISC %s cannot be used if there is space between the tokens.", self.misc_name)
            node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported'
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        if prevnode.deps or node.deps:
            # Logging at the critical level does not stop execution on its
            # own, so raise explicitly; otherwise the merge below would go
            # ahead without adjusting the enhanced structure.
            message = 'At present this block cannot be applied to data with enhanced dependencies.'
            logging.critical(message)
            raise RuntimeError(message)
        # If the first token depends on the second token, re-attach it to the
        # second token's parent to prevent cycles.
        if prevnode in node.descendants:
            prevnode.parent = node.parent
            prevnode.deprel = node.deprel
        # Re-attach all children of the second token to the first token.
        # Iterate over a snapshot because re-attaching a child changes the
        # set of children of the second token.
        for child in list(node.children):
            child.parent = prevnode
        # Concatenate the word forms of the two tokens. Assume that morphological
        # annotation, including the lemma, is already updated accordingly (we
        # cannot guess it anyway).
        prevnode.form += node.form
        # The first token takes over the SpaceAfter value of the second token:
        # SpaceAfter=No survives only if the second token has it, too (meaning
        # that there is no space between the second token and whatever comes
        # next). NOTE(review): this relies on an empty value removing the
        # attribute from MISC -- confirm against the udapi MISC implementation.
        prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter']
        # Remove the current node. The joining instruction was in its MISC, so
        # it will disappear together with the node.
        node.remove()

udapi/block/util/joinsentence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Block util.SplitSentence will split a given sentence at a given token.
2+
Block util.JoinSentence will join a given sentence with the preceding one.
33
"""
44
import logging
55
from udapi.core.block import Block

0 commit comments

Comments
 (0)