Skip to content

Commit 678d34e

Browse files
committed
unicode/norm: preserve QC Maybe bit in packed forminfo
The bit packing for runes with decompositions assumed that QC Maybe cannot happen and only stored the yes/no bit, not the maybe bit. Squeeze the length from 6 to 5 bits to make room for saving the maybe bit too. This has no effect in Unicode 15 but will be needed for Unicode 16. For golang/go#77266. Change-Id: I377a0fc7af218ef5a481139b788fbb8b1e3980fb Reviewed-on: https://go-review.googlesource.com/c/text/+/737400 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Roland Shoemaker <roland@golang.org>
1 parent 536231a commit 678d34e

3 files changed

Lines changed: 1471 additions & 1451 deletions

File tree

unicode/norm/forminfo.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,18 @@ import "encoding/binary"
1313
// a rune to a uint16. The values take two forms. For v >= 0x8000:
1414
// bits
1515
// 15: 1 (inverse of NFD_QC bit of qcInfo)
16-
// 13..7: qcInfo (see below). isYesD is always true (no decomposition).
16+
// 12..7: qcInfo (see below). isYesD is always true (no decomposition).
1717
// 6..0: ccc (compressed CCC value).
1818
// For v < 0x8000, the respective rune has a decomposition and v is an index
1919
// into a byte array of UTF-8 decomposition sequences and additional info and
2020
// has the form:
2121
// <header> <decomp_byte>* [<tccc> [<lccc>]]
2222
// The header contains the number of bytes in the decomposition (excluding this
23-
// length byte). The two most significant bits of this length byte correspond
24-
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
23+
// length byte), with 33 mapped to 31 to fit in 5 bits.
24+
// (If any 31- or 32-byte decompositions come along, we could switch to using
25+
// use a general lookup table as long as there are at most 32 distinct lengths.)
26+
// The three most significant bits of this length byte correspond
27+
// to bit 5, 4, and 3 of qcInfo (see below). The byte sequence itself starts at v+1.
2528
// The byte sequence is followed by a trailing and leading CCC if the values
2629
// for these are not zero. The value of v determines which ccc are appended
2730
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
@@ -32,8 +35,8 @@ import "encoding/binary"
3235

3336
const (
3437
qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo
35-
headerLenMask = 0x3F // extract the length value from the header byte
36-
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
38+
headerLenMask = 0x1F // extract the length value from the header byte (31 => 33)
39+
headerFlagsMask = 0xE0 // extract the qcInfo bits from the header byte
3740
)
3841

3942
// Properties provides access to normalization properties of a rune.
@@ -109,14 +112,14 @@ func (p Properties) BoundaryAfter() bool {
109112
return p.isInert()
110113
}
111114

112-
// We pack quick check data in 4 bits:
115+
// We pack quick check data in 6 bits:
113116
//
114117
// 5: Combines forward (0 == false, 1 == true)
115118
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
116119
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
117120
// 1..0: Number of trailing non-starters.
118121
//
119-
// When all 4 bits are zero, the character is inert, meaning it is never
122+
// When all 6 bits are zero, the character is inert, meaning it is never
120123
// influenced by normalization.
121124
type qcInfo uint8
122125

@@ -152,6 +155,9 @@ func (p Properties) Decomposition() []byte {
152155
}
153156
i := p.index
154157
n := decomps[i] & headerLenMask
158+
if n == 31 {
159+
n = 33
160+
}
155161
i++
156162
return decomps[i : i+uint16(n)]
157163
}
@@ -260,7 +266,11 @@ func compInfo(v uint16, sz int) Properties {
260266
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
261267
p := Properties{size: uint8(sz), flags: f, index: v}
262268
if v >= firstCCC {
263-
v += uint16(h&headerLenMask) + 1
269+
n := uint16(h & headerLenMask)
270+
if n == 31 {
271+
n = 33
272+
}
273+
v += n + 1
264274
c := decomps[v]
265275
p.tccc = c >> 2
266276
p.flags |= qcInfo(c & 0x3)

unicode/norm/maketables.go

Lines changed: 44 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ type FormInfo struct {
159159
combinesBackward bool // May combine with rune on the left
160160
isOneWay bool // Never appears in result
161161
inDecomp bool // Some decompositions result in this char.
162+
suffixDecomp bool // Appears after first rune of a decomposition
162163
decomp Decomposition
163164
expandedDecomp Decomposition
164165
}
@@ -397,8 +398,11 @@ func completeCharFields(form int) {
397398
f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
398399
}
399400

400-
for _, r := range f.decomp {
401+
for i, r := range f.decomp {
401402
chars[r].forms[form].inDecomp = true
403+
if i > 0 {
404+
chars[r].forms[form].suffixDecomp = true
405+
}
402406
}
403407
}
404408

@@ -420,6 +424,35 @@ func completeCharFields(form int) {
420424
if isHangulWithoutJamoT(rune(i)) {
421425
f.combinesForward = true
422426
}
427+
if (i & 0xffff00) == JamoLBase {
428+
if JamoLBase <= i && i < JamoLEnd {
429+
f.combinesForward = true
430+
}
431+
if JamoVBase <= i && i < JamoVEnd {
432+
f.combinesBackward = true
433+
f.combinesForward = true
434+
}
435+
if JamoTBase <= i && i < JamoTEnd {
436+
f.combinesBackward = true
437+
}
438+
}
439+
}
440+
441+
// Phase 2½: backward combining propagation.
442+
for i := range chars {
443+
c := &chars[i]
444+
f := &c.forms[form]
445+
446+
// If the first rune of f's decomposition combines backward,
447+
// then f itself must be considered to combine backward.
448+
// This handles the "MaybeNo" runes introduced in Unicode 16.
449+
// https://www.unicode.org/reports/tr15/tr15-56.html#Contexts_Care
450+
if !f.isOneWay && len(f.decomp) > 0 {
451+
f0 := &chars[f.decomp[0]].forms[form]
452+
if f0.combinesBackward {
453+
f.combinesBackward = true
454+
}
455+
}
423456
}
424457

425458
// Phase 3: quick check values.
@@ -438,20 +471,6 @@ func completeCharFields(form int) {
438471
switch {
439472
case f.isOneWay:
440473
f.quickCheck[MComposed] = QCNo
441-
case (i & 0xffff00) == JamoLBase:
442-
f.quickCheck[MComposed] = QCYes
443-
if JamoLBase <= i && i < JamoLEnd {
444-
f.combinesForward = true
445-
}
446-
if JamoVBase <= i && i < JamoVEnd {
447-
f.quickCheck[MComposed] = QCMaybe
448-
f.combinesBackward = true
449-
f.combinesForward = true
450-
}
451-
if JamoTBase <= i && i < JamoTEnd {
452-
f.quickCheck[MComposed] = QCMaybe
453-
f.combinesBackward = true
454-
}
455474
case !f.combinesBackward:
456475
f.quickCheck[MComposed] = QCYes
457476
default:
@@ -574,20 +593,17 @@ func (m *decompSet) insert(key int, s string) {
574593
}
575594

576595
func printCharInfoTables(w io.Writer) int {
577-
mkstr := func(r rune, f *FormInfo) (int, string) {
596+
mkstr := func(r rune, f *FormInfo, c *Char) (int, string) {
578597
d := f.expandedDecomp
579598
s := string([]rune(d))
580-
if max := 1 << 6; len(s) >= max {
581-
const msg = "%U: too many bytes in decomposition: %d >= %d"
582-
log.Fatalf(msg, r, len(s), max)
583-
}
584-
head := uint8(len(s))
585-
if f.quickCheck[MComposed] != QCYes {
586-
head |= 0x40
599+
slen := len(s)
600+
if slen == 31 || slen == 32 || slen > 33 {
601+
log.Fatalf("%U: too many bytes in decomposition: %d", slen)
587602
}
588-
if f.combinesForward {
589-
head |= 0x80
603+
if slen == 33 {
604+
slen = 31
590605
}
606+
head := uint8(slen) | uint8(makeEntry(f, c)>>3<<5)
591607
s = string([]byte{head}) + s
592608

593609
lccc := ccc(d[0])
@@ -609,7 +625,7 @@ func printCharInfoTables(w io.Writer) int {
609625
s += string([]byte{tccc})
610626
index = endMulti
611627
for _, r := range d[1:] {
612-
if ccc(r) == 0 {
628+
if ccc(r) == 0 && !chars[r].forms[FCanonical].combinesBackward {
613629
index = firstCCC
614630
}
615631
}
@@ -643,10 +659,7 @@ func printCharInfoTables(w io.Writer) int {
643659
if len(f.expandedDecomp) == 0 {
644660
continue
645661
}
646-
if f.combinesBackward {
647-
log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
648-
}
649-
index, s := mkstr(c.codePoint, &f)
662+
index, s := mkstr(c.codePoint, &f, &c)
650663
decompSet.insert(index, s)
651664
}
652665
}
@@ -685,7 +698,7 @@ func printCharInfoTables(w io.Writer) int {
685698
f := c.forms[i]
686699
d := f.expandedDecomp
687700
if len(d) != 0 {
688-
_, key := mkstr(c.codePoint, &f)
701+
_, key := mkstr(c.codePoint, &f, &c)
689702
trie.Insert(rune(r), uint64(positionMap[key]))
690703
if c.ccc != ccc(d[0]) {
691704
// We assume the lead ccc of a decomposition !=0 in this case.
@@ -834,9 +847,6 @@ func verifyComputed() {
834847
if f.combinesBackward != isMaybe {
835848
log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
836849
}
837-
if len(f.decomp) > 0 && f.combinesForward && isMaybe {
838-
log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
839-
}
840850

841851
if len(f.expandedDecomp) != 0 {
842852
continue

0 commit comments

Comments
 (0)