Skip to content
This repository was archived by the owner on Aug 31, 2021. It is now read-only.

Commit 6dabce1

Browse files
author
livecodeali
committed
[[ Grapheme Handling ]] Implement grapheme cluster break rules
1 parent 645caba commit 6dabce1

File tree

2 files changed

+96
-0
lines changed

2 files changed

+96
-0
lines changed

libfoundation/include/foundation-unicode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,8 @@ inline uinteger_t MCUnicodeMapFromNative_ISO8859_1(char_t p_native)
716716

717717
////////////////////////////////////////////////////////////////////////////////
718718

719+
// Returns true if a grapheme cluster break is permitted between the two
// adjacent codepoints p_left and p_right, judged from their
// Grapheme_Cluster_Break property values (break rules GB 4 - GB 10).
// Note: CR followed by LF is reported as a boundary, contrary to rule GB 3.
bool MCUnicodeIsGraphemeClusterBoundary(codepoint_t p_left, codepoint_t p_right);
719720

721+
////////////////////////////////////////////////////////////////////////////////
720722

721723
#endif /* ifndef __MC_FOUNDATION_UNICODE__ */

libfoundation/src/foundation-unicode.cpp

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2407,3 +2407,97 @@ bool MCUnicodeWildcardMatch(const void *source_chars, uindex_t source_length, bo
24072407

24082408
return true;
24092409
}
2410+
2411+
////////////////////////////////////////////////////////////////////////////////
2412+
2413+
// Implement rules GB 6 - 8 based on Hangul syllable type
2414+
static bool __MCUnicodeIsHangulClusterBoundary(int32_t p_left, int32_t p_right)
2415+
{
2416+
switch (p_left)
2417+
{
2418+
case U_GCB_L:
2419+
return p_right == U_GCB_T;
2420+
case U_GCB_LV:
2421+
case U_GCB_V:
2422+
return p_right != U_GCB_V && p_right != U_GCB_T;
2423+
case U_GCB_LVT:
2424+
case U_GCB_T:
2425+
return p_right != U_GCB_T;
2426+
default:
2427+
MCUnreachable();
2428+
}
2429+
}
2430+
2431+
static bool __MCUnicodeIsControl(int32_t p_gcb)
2432+
{
2433+
return p_gcb == U_GCB_CR || p_gcb == U_GCB_LF || p_gcb == U_GCB_CONTROL;
2434+
}
2435+
2436+
static bool __MCUnicodeIsHangulSyllable(int32_t p_gcb)
2437+
{
2438+
switch (p_gcb)
2439+
{
2440+
case U_GCB_L:
2441+
case U_GCB_LV:
2442+
case U_GCB_LVT:
2443+
case U_GCB_T:
2444+
case U_GCB_V:
2445+
return true;
2446+
default:
2447+
break;
2448+
}
2449+
2450+
return false;
2451+
}
2452+
2453+
// Decides whether a grapheme cluster break is permitted between two adjacent
// codepoints, using only the Grapheme_Cluster_Break property of each.
//
// p_left  - the codepoint before the candidate break position
// p_right - the codepoint after the candidate break position
//
// Returns true if the position is a cluster boundary, false if both
// codepoints belong to the same cluster.
bool MCUnicodeIsGraphemeClusterBoundary(codepoint_t p_left, codepoint_t p_right)
{
    const int32_t t_before = MCUnicodeGetIntegerProperty(p_left, kMCUnicodePropertyGraphemeClusterBreak);
    const int32_t t_after = MCUnicodeGetIntegerProperty(p_right, kMCUnicodePropertyGraphemeClusterBreak);

    // GB 3 (no break between CR and LF) is intentionally not applied: CR LF
    // is treated as two graphemes, so the pair falls through to GB 4 below.

    // GB 4 / GB 5: Always break after and before controls.
    if (__MCUnicodeIsControl(t_before) || __MCUnicodeIsControl(t_after))
        return true;

    // GB 6 - 8: Keep Hangul syllable sequences together.
    if (__MCUnicodeIsHangulSyllable(t_before)
        && __MCUnicodeIsHangulSyllable(t_after)
        && !__MCUnicodeIsHangulClusterBoundary(t_before, t_after))
        return false;

    // GB 8a: No break between regional indicator symbols.
    const bool t_regional_pair =
        t_before == U_GCB_REGIONAL_INDICATOR && t_after == U_GCB_REGIONAL_INDICATOR;

    // GB 9 / GB 9a: No break before extending characters or SpacingMarks.
    const bool t_attaches_to_left =
        t_after == U_GCB_EXTEND || t_after == U_GCB_SPACING_MARK;

    // GB 9b: No break after Prepend characters.
    const bool t_attaches_to_right = (t_before == U_GCB_PREPEND);

    if (t_regional_pair || t_attaches_to_left || t_attaches_to_right)
        return false;

    // GB 10: Everything else is a boundary.
    return true;
}
2501+
2502+
////////////////////////////////////////////////////////////////////////////////
2503+

0 commit comments

Comments
 (0)