Skip to content
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 21 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ rustpython-vm = { path = "crates/vm", default-features = false, version = "0.5.0
rustpython-pylib = { path = "crates/pylib", version = "0.5.0" }
rustpython-stdlib = { path = "crates/stdlib", default-features = false, version = "0.5.0" }
rustpython-sre_engine = { path = "crates/sre_engine", version = "0.5.0" }
rustpython-unicode = { path = "crates/unicode", default-features = false, version = "0.5.0" }
rustpython-wtf8 = { path = "crates/wtf8", version = "0.5.0" }
rustpython-doc = { path = "crates/doc", version = "0.5.0" }

Expand Down
2 changes: 1 addition & 1 deletion crates/codegen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ std = ["thiserror/std", "itertools/use_std"]

[dependencies]
rustpython-compiler-core = { workspace = true }
rustpython-unicode = { workspace = true, default-features = false }
rustpython-literal = {workspace = true }
rustpython-wtf8 = { workspace = true }
ruff_python_ast = { workspace = true }
Expand All @@ -29,7 +30,6 @@ num-traits = { workspace = true }
thiserror = { workspace = true }
malachite-bigint = { workspace = true }
memchr = { workspace = true }
unicode_names2 = { workspace = true }

[dev-dependencies]
ruff_python_parser = { workspace = true }
Expand Down
4 changes: 3 additions & 1 deletion crates/codegen/src/string_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,9 @@ impl StringParser {
let name_and_ending = self.skip_bytes(close_idx + 1);
let name = &name_and_ending[..name_and_ending.len() - 1];

unicode_names2::character(name).ok_or_else(|| unreachable!())
rustpython_unicode::data::lookup(name)
.and_then(char::from_u32)
.ok_or_else(|| unreachable!())
}

/// Parse an escaped character, returning the new character.
Expand Down
2 changes: 1 addition & 1 deletion crates/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ wasm_js = ["getrandom/wasm_js"]

[dependencies]
rustpython-literal = { workspace = true }
rustpython-unicode = { workspace = true, default-features = false }
rustpython-wtf8 = { workspace = true }

ascii = { workspace = true }
Expand All @@ -29,7 +30,6 @@ malachite-q = { workspace = true }
malachite-base = { workspace = true }
num-traits = { workspace = true }
parking_lot = { workspace = true, optional = true }
unicode_names2 = { workspace = true }
radium = { workspace = true }

lock_api = "0.4"
Expand Down
2 changes: 1 addition & 1 deletion crates/common/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ pub mod errors {
let mut out = String::with_capacity(num_chars * 4);
for c in err_str.code_points() {
let c_u32 = c.to_u32();
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
if let Some(c_name) = rustpython_unicode::data::name(c_u32) {
write!(out, "\\N{{{c_name}}}").unwrap();
} else if c_u32 >= 0x10000 {
write!(out, "\\U{c_u32:08x}").unwrap();
Expand Down
2 changes: 1 addition & 1 deletion crates/literal/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ license = { workspace = true }
rust-version = { workspace = true }

[dependencies]
rustpython-unicode = { workspace = true, default-features = false }
rustpython-wtf8 = { workspace = true }

hexf-parse = "0.2.1"
is-macro.workspace = true
lexical-parse-float = { version = "1.0.6", features = ["format"] }
num-traits = { workspace = true }
icu_properties = { workspace = true }

[dev-dependencies]
rand = { workspace = true }
16 changes: 1 addition & 15 deletions crates/literal/src/char.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use icu_properties::props::{EnumeratedProperty, GeneralCategory};

/// According to Python, the following categories aren't printable:
/// * Cc (Other, Control)
/// * Cf (Other, Format)
Expand All @@ -10,17 +8,5 @@ use icu_properties::props::{EnumeratedProperty, GeneralCategory};
/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
/// * Zs (Separator, Space) other than ASCII space('\x20').
pub fn is_printable(c: char) -> bool {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need this function? Why not call is_repr_printable directly?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed the redundant wrapper and now call rustpython_unicode::classify::is_repr_printable directly from escape.rs in 4efa5da.

let cat = GeneralCategory::for_char(c);

!matches!(
cat,
GeneralCategory::SpaceSeparator
| GeneralCategory::LineSeparator
| GeneralCategory::ParagraphSeparator
| GeneralCategory::Control
| GeneralCategory::Format
| GeneralCategory::Surrogate
| GeneralCategory::PrivateUse
| GeneralCategory::Unassigned
)
rustpython_unicode::classify::is_repr_printable(c as u32)
}
1 change: 1 addition & 0 deletions crates/sre_engine/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ name = "benches"
harness = false

[dependencies]
rustpython-unicode = { workspace = true, default-features = false }
rustpython-wtf8 = { workspace = true }
num_enum = { workspace = true }
bitflags = { workspace = true }
Expand Down
102 changes: 14 additions & 88 deletions crates/sre_engine/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,135 +332,61 @@ const fn utf8_is_cont_byte(byte: u8) -> bool {
/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;

const fn is_py_ascii_whitespace(b: u8) -> bool {
matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
}

#[inline]
pub(crate) fn is_word(ch: u32) -> bool {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Importing rustpython_unicode::regex as unicode_regex would remove the need for all these helpers.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switched the SRE engine over to a direct rustpython_unicode::regex import and removed the pass-through helpers in 4efa5da.

ch == '_' as u32
|| u8::try_from(ch)
.map(|x| x.is_ascii_alphanumeric())
.unwrap_or(false)
rustpython_unicode::regex::is_word(ch)
}
#[inline]
pub(crate) fn is_space(ch: u32) -> bool {
u8::try_from(ch)
.map(is_py_ascii_whitespace)
.unwrap_or(false)
rustpython_unicode::regex::is_space(ch)
}
#[inline]
pub(crate) fn is_digit(ch: u32) -> bool {
u8::try_from(ch)
.map(|x| x.is_ascii_digit())
.unwrap_or(false)
}
#[inline]
pub(crate) fn is_loc_alnum(ch: u32) -> bool {
// FIXME: Ignore the locales
u8::try_from(ch)
.map(|x| x.is_ascii_alphanumeric())
.unwrap_or(false)
rustpython_unicode::regex::is_digit(ch)
}
#[inline]
pub(crate) fn is_loc_word(ch: u32) -> bool {
ch == '_' as u32 || is_loc_alnum(ch)
rustpython_unicode::regex::is_locale_word(ch)
}
#[inline]
pub(crate) const fn is_linebreak(ch: u32) -> bool {
ch == '\n' as u32
rustpython_unicode::regex::is_linebreak(ch)
}
#[inline]
pub fn lower_ascii(ch: u32) -> u32 {
u8::try_from(ch)
.map(|x| x.to_ascii_lowercase() as u32)
.unwrap_or(ch)
rustpython_unicode::regex::lower_ascii(ch)
}
#[inline]
pub(crate) fn lower_locate(ch: u32) -> u32 {
// FIXME: Ignore the locales
lower_ascii(ch)
rustpython_unicode::regex::lower_locale(ch)
}
#[inline]
pub(crate) fn upper_locate(ch: u32) -> u32 {
// FIXME: Ignore the locales
u8::try_from(ch)
.map(|x| x.to_ascii_uppercase() as u32)
.unwrap_or(ch)
rustpython_unicode::regex::upper_locale(ch)
}
#[inline]
pub(crate) fn is_uni_digit(ch: u32) -> bool {
// TODO: check with cpython
char::try_from(ch)
.map(|x| x.is_ascii_digit())
.unwrap_or(false)
rustpython_unicode::regex::is_unicode_digit(ch)
}
#[inline]
pub(crate) fn is_uni_space(ch: u32) -> bool {
// TODO: check with cpython
is_space(ch)
|| matches!(
ch,
0x0009
| 0x000A
| 0x000B
| 0x000C
| 0x000D
| 0x001C
| 0x001D
| 0x001E
| 0x001F
| 0x0020
| 0x0085
| 0x00A0
| 0x1680
| 0x2000
| 0x2001
| 0x2002
| 0x2003
| 0x2004
| 0x2005
| 0x2006
| 0x2007
| 0x2008
| 0x2009
| 0x200A
| 0x2028
| 0x2029
| 0x202F
| 0x205F
| 0x3000
)
rustpython_unicode::regex::is_unicode_space(ch)
}
#[inline]
pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
matches!(
ch,
0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029
)
}
#[inline]
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
// TODO: check with cpython
char::try_from(ch)
.map(|x| x.is_alphanumeric())
.unwrap_or(false)
rustpython_unicode::regex::is_unicode_linebreak(ch)
}
#[inline]
pub(crate) fn is_uni_word(ch: u32) -> bool {
ch == '_' as u32 || is_uni_alnum(ch)
rustpython_unicode::regex::is_unicode_word(ch)
}
#[inline]
pub fn lower_unicode(ch: u32) -> u32 {
// TODO: check with cpython
char::try_from(ch)
.map(|x| x.to_lowercase().next().unwrap() as u32)
.unwrap_or(ch)
rustpython_unicode::regex::lower_unicode(ch)
}
#[inline]
pub fn upper_unicode(ch: u32) -> u32 {
// TODO: check with cpython
char::try_from(ch)
.map(|x| x.to_uppercase().next().unwrap() as u32)
.unwrap_or(ch)
rustpython_unicode::regex::upper_unicode(ch)
}
8 changes: 1 addition & 7 deletions crates/stdlib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ flame-it = ["flame"]
[dependencies]
# rustpython crates
rustpython-derive = { workspace = true }
rustpython-unicode = { workspace = true, features = ["casefold"] }
rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
rustpython-common = { workspace = true }

Expand Down Expand Up @@ -76,13 +77,6 @@ pbkdf2 = { version = "0.12", features = ["hmac"] }
constant_time_eq = { workspace = true }

## unicode stuff
unicode_names2 = { workspace = true }
# update version all at the same time
icu_properties = { workspace = true }
icu_normalizer = { workspace = true }
unic-ucd-age = { workspace = true }
ucd = "0.1.1"

# compression
adler32 = "1.2.0"
crc32fast = "1.3.2"
Expand Down
Loading
Loading