-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Extract shared rustpython-unicode crate and route core Unicode semantics through it
#7561
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
d3af1c5
67485b5
e968d83
5cf1bd6
0a340de
2934897
4efa5da
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -332,135 +332,61 @@ const fn utf8_is_cont_byte(byte: u8) -> bool { | |
| /// Mask of the value bits of a continuation byte. | ||
| const CONT_MASK: u8 = 0b0011_1111; | ||
|
|
||
| const fn is_py_ascii_whitespace(b: u8) -> bool { | ||
| matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') | ||
| } | ||
|
|
||
| #[inline] | ||
| pub(crate) fn is_word(ch: u32) -> bool { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. importing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Switched the SRE engine over to a direct |
||
| ch == '_' as u32 | ||
| || u8::try_from(ch) | ||
| .map(|x| x.is_ascii_alphanumeric()) | ||
| .unwrap_or(false) | ||
| rustpython_unicode::regex::is_word(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_space(ch: u32) -> bool { | ||
| u8::try_from(ch) | ||
| .map(is_py_ascii_whitespace) | ||
| .unwrap_or(false) | ||
| rustpython_unicode::regex::is_space(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_digit(ch: u32) -> bool { | ||
| u8::try_from(ch) | ||
| .map(|x| x.is_ascii_digit()) | ||
| .unwrap_or(false) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_loc_alnum(ch: u32) -> bool { | ||
| // FIXME: Ignore the locales | ||
| u8::try_from(ch) | ||
| .map(|x| x.is_ascii_alphanumeric()) | ||
| .unwrap_or(false) | ||
| rustpython_unicode::regex::is_digit(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_loc_word(ch: u32) -> bool { | ||
| ch == '_' as u32 || is_loc_alnum(ch) | ||
| rustpython_unicode::regex::is_locale_word(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) const fn is_linebreak(ch: u32) -> bool { | ||
| ch == '\n' as u32 | ||
| rustpython_unicode::regex::is_linebreak(ch) | ||
| } | ||
| #[inline] | ||
| pub fn lower_ascii(ch: u32) -> u32 { | ||
| u8::try_from(ch) | ||
| .map(|x| x.to_ascii_lowercase() as u32) | ||
| .unwrap_or(ch) | ||
| rustpython_unicode::regex::lower_ascii(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn lower_locate(ch: u32) -> u32 { | ||
| // FIXME: Ignore the locales | ||
| lower_ascii(ch) | ||
| rustpython_unicode::regex::lower_locale(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn upper_locate(ch: u32) -> u32 { | ||
| // FIXME: Ignore the locales | ||
| u8::try_from(ch) | ||
| .map(|x| x.to_ascii_uppercase() as u32) | ||
| .unwrap_or(ch) | ||
| rustpython_unicode::regex::upper_locale(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_uni_digit(ch: u32) -> bool { | ||
| // TODO: check with cpython | ||
| char::try_from(ch) | ||
| .map(|x| x.is_ascii_digit()) | ||
| .unwrap_or(false) | ||
| rustpython_unicode::regex::is_unicode_digit(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_uni_space(ch: u32) -> bool { | ||
| // TODO: check with cpython | ||
| is_space(ch) | ||
| || matches!( | ||
| ch, | ||
| 0x0009 | ||
| | 0x000A | ||
| | 0x000B | ||
| | 0x000C | ||
| | 0x000D | ||
| | 0x001C | ||
| | 0x001D | ||
| | 0x001E | ||
| | 0x001F | ||
| | 0x0020 | ||
| | 0x0085 | ||
| | 0x00A0 | ||
| | 0x1680 | ||
| | 0x2000 | ||
| | 0x2001 | ||
| | 0x2002 | ||
| | 0x2003 | ||
| | 0x2004 | ||
| | 0x2005 | ||
| | 0x2006 | ||
| | 0x2007 | ||
| | 0x2008 | ||
| | 0x2009 | ||
| | 0x200A | ||
| | 0x2028 | ||
| | 0x2029 | ||
| | 0x202F | ||
| | 0x205F | ||
| | 0x3000 | ||
| ) | ||
| rustpython_unicode::regex::is_unicode_space(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) const fn is_uni_linebreak(ch: u32) -> bool { | ||
| matches!( | ||
| ch, | ||
| 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 | ||
| ) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_uni_alnum(ch: u32) -> bool { | ||
| // TODO: check with cpython | ||
| char::try_from(ch) | ||
| .map(|x| x.is_alphanumeric()) | ||
| .unwrap_or(false) | ||
| rustpython_unicode::regex::is_unicode_linebreak(ch) | ||
| } | ||
| #[inline] | ||
| pub(crate) fn is_uni_word(ch: u32) -> bool { | ||
| ch == '_' as u32 || is_uni_alnum(ch) | ||
| rustpython_unicode::regex::is_unicode_word(ch) | ||
| } | ||
| #[inline] | ||
| pub fn lower_unicode(ch: u32) -> u32 { | ||
| // TODO: check with cpython | ||
| char::try_from(ch) | ||
| .map(|x| x.to_lowercase().next().unwrap() as u32) | ||
| .unwrap_or(ch) | ||
| rustpython_unicode::regex::lower_unicode(ch) | ||
| } | ||
| #[inline] | ||
| pub fn upper_unicode(ch: u32) -> u32 { | ||
| // TODO: check with cpython | ||
| char::try_from(ch) | ||
| .map(|x| x.to_uppercase().next().unwrap() as u32) | ||
| .unwrap_or(ch) | ||
| rustpython_unicode::regex::upper_unicode(ch) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still need this function? Why not call `is_repr_printable` directly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed the redundant wrapper and now call
`rustpython_unicode::classify::is_repr_printable` directly from `escape.rs` in 4efa5da.