Skip to content

Corefud.MergeSameSpan does not work after certain blocks #101

@dan-zeman

Description

@dan-zeman

The block corefud.MergeSameSpan should be relatively independent of what the other blocks do. It collects the mentions in a sentence (asking for node.coref_mentions at every node), then extracts the set of words of each mention m (set(m.words)), and looks for pairs of mentions that span the same set of words. Nevertheless, there is a mysterious bug that is triggered when this block is combined with the conversion from the old CorefUD format. Consider this sentence, and especially token 26, nuestro:

# sent_id = CESS-CAST-P-20020501-120-s5
# text = Y Charly, cómo no, se mojó: "Estamos, más o menos, un 75% a favor de ellos y un 25% nuestro" .
1	Y	y	CCONJ	cc	_	8	advmod	8:advmod	_
2	Charly	Charly	PROPN	np00000	_	8	nsubj	8:nsubj	ClusterId=CESS-CAST-P-20020501-120-c1|ClusterType=Spec.person|MentionSpan=2|MentionMisc=CorefType:ident|SpaceAfter=No
3	,	,	PUNCT	fc	PunctType=Comm	4	punct	4:punct	_
4	cómo	cómo	ADV	_	_	8	dep	8:dep	_
5	no	no	PART	_	_	4	dep	4:dep	SpaceAfter=No
6	,	,	PUNCT	fc	PunctType=Comm	4	punct	4:punct	_
7	se	él	PRON	p0300000	Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes	8	expl:pv	8:expl:pv	_
8	mojó	mojar	VERB	vmis3s0	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	0	root	0:root	SpaceAfter=No
9	:	:	PUNCT	fd	PunctType=Colo	18	punct	18:punct	_
10	"	"	PUNCT	_	PunctType=Quot	18	punct	18:punct	SpaceAfter=No
10.1	_	_	PRON	p	_	_	_	11:nsubj	ClusterId=CESS-CAST-P-20020501-120-c3|ClusterType=Spec.organization|MentionSpan=10.1|MentionMisc=ClusterTypeMismatch:Spec:Spec.organization,CorefType:ident
11	Estamos	estar	AUX	vmip1p0	Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin	18	cop	18:cop	SpaceAfter=No
12	,	,	PUNCT	fc	PunctType=Comm	13	punct	13:punct	_
13	más	más	ADV	rg	Degree=Cmp	18	advmod	18:advmod	_
14	o	o	CCONJ	cc	_	15	cc	15:cc	_
15	menos	menos	ADV	rg	Degree=Cmp	13	conj	13:conj	SpaceAfter=No
16	,	,	PUNCT	fc	PunctType=Comm	13	punct	13:punct	_
17	un	uno	DET	di0ms0	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	18	det	18:det	_
18	75%	75	SYM	_	NumForm=Digit	8	advcl	8:advcl	ClusterId=CESS-CAST-P-20020501-120-s5.sn.34|ClusterType=Spec.number|MentionSpan=17-22
19	a	a	ADP	sps00	_	22	case	22:case	_
20	favor	favor	NOUN	_	_	19	fixed	19:fixed	_
21	de	de	ADP	_	_	19	fixed	19:fixed	_
22	ellos	él	PRON	pp3mp000	Case=Acc,Nom|Gender=Masc|Number=Plur|Person=3|PronType=Prs	18	nmod	18:nmod	ClusterId=CESS-CAST-P-20020501-120-c4|ClusterType=Gen|MentionSpan=22|MentionMisc=CorefType:ident
23	y	y	CCONJ	cc	_	25	cc	25:cc	_
24	un	uno	DET	di0ms0	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	25	det	25:det	_
25	25%	25	SYM	_	NumForm=Digit	18	conj	18:conj	ClusterId=CESS-CAST-P-20020501-120-s5.sn.47|ClusterType=Spec.number|MentionSpan=24-26
26	nuestro	nuestro	PRON	px1ms0p0	Gender=Masc|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs	25	appos	25:appos	ClusterId[1]=CESS-CAST-P-20020501-120-c3|ClusterType[1]=Spec.organization|MentionSpan[1]=26|MentionMisc[1]=ClusterTypeMismatch:Spec:Spec.organization,CorefType:ident|ClusterId[2]=CESS-CAST-P-20020501-120-s5.p.54|ClusterType[2]=Gen|MentionSpan[2]=26|SpaceAfter=No
27	"	"	PUNCT	fe	PunctType=Quot	18	punct	18:punct	_
28	.	.	PUNCT	fp	PunctType=Peri	8	punct	8:punct	_

There are two single-word mention annotations on nuestro (token 26), both coming from the original data. One of them (c3) is coreferential with the empty subject 10.1 and also with 6 mentions in other sentences. The other (CESS-CAST-P-20020501-120-s5.p.54) is a singleton; it probably appeared there because of some named entity annotation at another (higher or lower) constituent.

Now when I run this scenario:

udapy -s read.OldCorefUD corefud.FixCorefUD02 corefud.MergeSameSpan < bug0.conllu > bug1.conllu

the two mentions on token 26 are not merged (both are still present in its Entity attribute):

# newdoc
# global.Entity = eid-etype-head-other
# sent_id = CESS-CAST-P-20020501-120-s5
# text = Y Charly, cómo no, se mojó: "Estamos, más o menos, un 75% a favor de ellos y un 25% nuestro" .
1	Y	y	CCONJ	cc	_	8	advmod	8:advmod	_
2	Charly	Charly	PROPN	np00000	_	8	nsubj	8:nsubj	Entity=(CESSCASTP20020501120c1-person-1-CorefType:ident,gstype:spec)|SpaceAfter=No
3	,	,	PUNCT	fc	PunctType=Comm	4	punct	4:punct	_
4	cómo	cómo	ADV	_	_	8	dep	8:dep	_
5	no	no	PART	_	_	4	dep	4:dep	SpaceAfter=No
6	,	,	PUNCT	fc	PunctType=Comm	4	punct	4:punct	_
7	se	él	PRON	p0300000	Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes	8	expl:pv	8:expl:pv	_
8	mojó	mojar	VERB	vmis3s0	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	0	root	0:root	SpaceAfter=No
9	:	:	PUNCT	fd	PunctType=Colo	18	punct	18:punct	_
10	"	"	PUNCT	_	PunctType=Quot	18	punct	18:punct	SpaceAfter=No
10.1	_	_	PRON	p	_	_	_	11:nsubj	Entity=(CESSCASTP20020501120c3-organization-1-ClusterTypeMismatch:Spec:Spec.organization,CorefType:ident,gstype:spec)
11	Estamos	estar	AUX	vmip1p0	Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin	18	cop	18:cop	SpaceAfter=No
12	,	,	PUNCT	fc	PunctType=Comm	13	punct	13:punct	_
13	más	más	ADV	rg	Degree=Cmp	18	advmod	18:advmod	_
14	o	o	CCONJ	cc	_	15	cc	15:cc	_
15	menos	menos	ADV	rg	Degree=Cmp	13	conj	13:conj	SpaceAfter=No
16	,	,	PUNCT	fc	PunctType=Comm	13	punct	13:punct	_
17	un	uno	DET	di0ms0	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	18	det	18:det	Entity=(CESSCASTP20020501120s5.sn.34-number-2-gstype:spec
18	75%	75	SYM	_	NumForm=Digit	8	advcl	8:advcl	_
19	a	a	ADP	sps00	_	22	case	22:case	_
20	favor	favor	NOUN	_	_	19	fixed	19:fixed	_
21	de	de	ADP	_	_	19	fixed	19:fixed	_
22	ellos	él	PRON	pp3mp000	Case=Acc,Nom|Gender=Masc|Number=Plur|Person=3|PronType=Prs	18	nmod	18:nmod	Entity=(CESSCASTP20020501120c4--1-CorefType:ident,gstype:gen)CESSCASTP20020501120s5.sn.34)
23	y	y	CCONJ	cc	_	25	cc	25:cc	_
24	un	uno	DET	di0ms0	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	25	det	25:det	Entity=(CESSCASTP20020501120s5.sn.47-number-2-gstype:spec
25	25%	25	SYM	_	NumForm=Digit	18	conj	18:conj	_
26	nuestro	nuestro	PRON	px1ms0p0	Gender=Masc|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs	25	appos	25:appos	Entity=(CESSCASTP20020501120s5.p.54--1-gstype:gen)(CESSCASTP20020501120c3-organization-1-ClusterTypeMismatch:Spec:Spec.organization,CorefType:ident,gstype:spec)CESSCASTP20020501120s5.sn.47)|SpaceAfter=No
27	"	"	PUNCT	fe	PunctType=Quot	18	punct	18:punct	_
28	.	.	PUNCT	fp	PunctType=Peri	8	punct	8:punct	_

However, when I save the file and re-read it in another Udapi process, the block does succeed in merging the two mentions (unfortunately, it picks the second mention, the singleton s5.p.54, as the survivor and thus breaks the coreference with 10.1 and with the antecedents in other sentences, but that's another issue):

udapy -s read.OldCorefUD corefud.FixCorefUD02 corefud.MergeSameSpan < bug0.conllu | udapy -s corefud.MergeSameSpan > bug1.conllu
# newdoc
# global.Entity = eid-etype-head-other
# sent_id = CESS-CAST-P-20020501-120-s5
# text = Y Charly, cómo no, se mojó: "Estamos, más o menos, un 75% a favor de ellos y un 25% nuestro" .
1	Y	y	CCONJ	cc	_	8	advmod	8:advmod	_
2	Charly	Charly	PROPN	np00000	_	8	nsubj	8:nsubj	Entity=(CESSCASTP20020501120c1-person-1-CorefType:ident,gstype:spec)|SpaceAfter=No
3	,	,	PUNCT	fc	PunctType=Comm	4	punct	4:punct	_
4	cómo	cómo	ADV	_	_	8	dep	8:dep	_
5	no	no	PART	_	_	4	dep	4:dep	SpaceAfter=No
6	,	,	PUNCT	fc	PunctType=Comm	4	punct	4:punct	_
7	se	él	PRON	p0300000	Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes	8	expl:pv	8:expl:pv	_
8	mojó	mojar	VERB	vmis3s0	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	0	root	0:root	SpaceAfter=No
9	:	:	PUNCT	fd	PunctType=Colo	18	punct	18:punct	_
10	"	"	PUNCT	_	PunctType=Quot	18	punct	18:punct	SpaceAfter=No
10.1	_	_	PRON	p	_	_	_	11:nsubj	Entity=(CESSCASTP20020501120c3-organization-1-ClusterTypeMismatch:Spec:Spec.organization,CorefType:ident,gstype:spec)
11	Estamos	estar	AUX	vmip1p0	Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin	18	cop	18:cop	SpaceAfter=No
12	,	,	PUNCT	fc	PunctType=Comm	13	punct	13:punct	_
13	más	más	ADV	rg	Degree=Cmp	18	advmod	18:advmod	_
14	o	o	CCONJ	cc	_	15	cc	15:cc	_
15	menos	menos	ADV	rg	Degree=Cmp	13	conj	13:conj	SpaceAfter=No
16	,	,	PUNCT	fc	PunctType=Comm	13	punct	13:punct	_
17	un	uno	DET	di0ms0	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	18	det	18:det	Entity=(CESSCASTP20020501120s5.sn.34-number-2-gstype:spec
18	75%	75	SYM	_	NumForm=Digit	8	advcl	8:advcl	_
19	a	a	ADP	sps00	_	22	case	22:case	_
20	favor	favor	NOUN	_	_	19	fixed	19:fixed	_
21	de	de	ADP	_	_	19	fixed	19:fixed	_
22	ellos	él	PRON	pp3mp000	Case=Acc,Nom|Gender=Masc|Number=Plur|Person=3|PronType=Prs	18	nmod	18:nmod	Entity=(CESSCASTP20020501120c4--1-CorefType:ident,gstype:gen)CESSCASTP20020501120s5.sn.34)
23	y	y	CCONJ	cc	_	25	cc	25:cc	_
24	un	uno	DET	di0ms0	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	25	det	25:det	Entity=(CESSCASTP20020501120s5.sn.47-number-2-gstype:spec
25	25%	25	SYM	_	NumForm=Digit	18	conj	18:conj	_
26	nuestro	nuestro	PRON	px1ms0p0	Gender=Masc|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs	25	appos	25:appos	Entity=(CESSCASTP20020501120s5.p.54--1-gstype:gen)CESSCASTP20020501120s5.sn.47)|SpaceAfter=No
27	"	"	PUNCT	fe	PunctType=Quot	18	punct	18:punct	_
28	.	.	PUNCT	fp	PunctType=Peri	8	punct	8:punct	_

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions