|
| 1 | +""" |
| 2 | +Block ud.JoinToken will join a given token with the preceding one. |
| 3 | +""" |
| 4 | +from udapi.core.block import Block |
| 5 | +import logging |
| 6 | + |
| 7 | + |
class JoinToken(Block):
    """
    Merge two tokens into one. A MISC attribute is used to mark the tokens that
    should join the preceding token. (The attribute may have been set by an
    annotator or by a previous block that tests the specific conditions under
    which joining is desired.) Joining cannot be done across sentence
    boundaries; if necessary, apply util.JoinSentence first. Multiword tokens
    are currently not supported: None of the nodes to be merged can belong to
    a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.)
    Merging is simple if there is no space between the tokens (see SpaceAfter=No
    at the first token). If there is a space, there are three options in theory:

    1. Keep the tokens as two nodes but apply the UD goeswith relation
       (see https://universaldependencies.org/u/overview/typos.html) and
       the related annotation rules.
    2. Join them into one token that contains a space. Such "words with
       spaces" can be exceptionally allowed in UD if they are registered
       in the given language.
    3. Remove the space without any trace. Not recommended in UD unless the
       underlying text was created directly for UD and can be thus considered
       part of the annotation.

    At present, this block does not support merging with spaces at all, but
    in the future one or more of the options may be added.

    Data with enhanced dependencies is not supported either: the block
    refuses to merge nodes that have any enhanced dependencies (see
    process_node).
    """

    def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs):
        """
        Args:
        misc_name: name of the MISC attribute that can trigger the joining
            default: JoinToken
        misc_value: value of the MISC attribute to trigger the joining;
            if not specified, then simple occurrence of the attribute with
            any value will cause the joining

        MISC attributes that have triggered token joining disappear together
        with the node that carried them (that node is removed by the merge).
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name
        self.misc_value = misc_value

    def process_node(self, node):
        """
        The JoinToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be merged with the previous node and the
        attribute will be removed from MISC, or a warning will be issued that
        the merging cannot be done and the attribute will stay in MISC (in
        that case a Bug=... attribute is added to MISC of the current node).
        Note that multiword token lines and empty nodes are not even scanned
        for the attribute, so if it is there, it will stay there but no
        warning will be printed.

        Args:
        node: the node to examine; if it carries the trigger attribute, it is
            merged into its preceding node and removed from the tree.

        Raises:
        ValueError: if either of the two nodes has enhanced dependencies.
            The check happens before anything is modified.
        """
        trigger = node.misc[self.misc_name]
        # A missing attribute reads as an empty string: nothing to do.
        if trigger == '':
            return
        # When a specific value is required, any other value is ignored.
        if self.misc_value and trigger != self.misc_value:
            return
        prevnode = node.prev_node
        if not prevnode:
            logging.warning("MISC %s cannot be used at the first token of a sentence.", self.misc_name)
            node.misc['Bug'] = 'JoiningTokenNotSupportedHere'
            return
        if node.multiword_token or prevnode.multiword_token:
            logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token.", self.misc_name)
            node.misc['Bug'] = 'JoiningTokenNotSupportedHere'
            return
        if prevnode.misc['SpaceAfter'] != 'No':
            logging.warning("MISC %s cannot be used if there is space between the tokens.", self.misc_name)
            node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported'
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        if prevnode.deps or node.deps:
            message = 'At present this block cannot be applied to data with enhanced dependencies.'
            logging.critical(message)
            # Logging at the CRITICAL level does not stop the program by
            # itself. Abort explicitly; otherwise the merge below would go
            # ahead and leave the enhanced graph in an inconsistent state.
            raise ValueError(message)
        # If the first token depends on the second token, re-attach it to the
        # second token's parent to prevent cycles.
        if prevnode in node.descendants:
            prevnode.parent = node.parent
            prevnode.deprel = node.deprel
        # Re-attach all children of the second token to the first token.
        for child in node.children:
            child.parent = prevnode
        # Concatenate the word forms of the two tokens. Assume that morphological
        # annotation, including the lemma, is already updated accordingly (we
        # cannot guess it anyway).
        prevnode.form += node.form
        # Remove SpaceAfter=No from the first token unless the second token has
        # this attribute, too (meaning that there is no space between the second
        # token and whatever comes next).
        prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter']
        # Remove the current node. The joining instruction was in its MISC, so
        # it will disappear together with the node.
        node.remove()
0 commit comments