From f76be43d265862414fb6ca11383c5891d1026d17 Mon Sep 17 00:00:00 2001 From: Antonio Yang Date: Sun, 9 Jun 2019 11:12:31 +0800 Subject: [PATCH] str.isprintable - check unicode type by unicode_categories - rm redundant check of empty string --- tests/snippets/strings.py | 7 +++++++ vm/Cargo.toml | 1 + vm/src/obj/objstr.rs | 26 ++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/tests/snippets/strings.py b/tests/snippets/strings.py index aaeaed5f3e6..19cd98269a1 100644 --- a/tests/snippets/strings.py +++ b/tests/snippets/strings.py @@ -216,3 +216,10 @@ def try_mutate_str(): for s, b, e in zip(ss, bs, ['u8', 'U8', 'utf-8', 'UTF-8', 'utf_8']): assert s.encode(e) == b # assert s.encode(encoding=e) == b + +# str.isprintable +assert "".isprintable() +assert " ".isprintable() +assert "abcdefg".isprintable() +assert not "abcdefg\n".isprintable() +assert "อด".isprintable() diff --git a/vm/Cargo.toml b/vm/Cargo.toml index ddc8ee20c02..cc7fe23b6c5 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -31,6 +31,7 @@ hexf = "0.1.0" indexmap = "1.0.2" crc = "^1.0.0" bincode = "1.1.4" +unicode_categories = "0.1.1" # TODO: release and publish to crates.io diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 40d19129685..8ea5cd6993f 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -1,3 +1,4 @@ +extern crate unicode_categories; extern crate unicode_xid; use std::fmt; @@ -27,6 +28,8 @@ use super::objsequence::PySliceableSequence; use super::objslice::PySlice; use super::objtype::{self, PyClassRef}; +use unicode_categories::UnicodeCategories; + /// str(object='') -> str /// str(bytes_or_buffer[, encoding[, errors]]) -> str /// @@ -519,6 +522,29 @@ impl PyString { } } + /// Return true if all characters in the string are printable or the string is empty, + /// false otherwise.
Nonprintable characters are those characters defined in the
+    /// Unicode character database as `Other` or `Separator`,
+    /// excepting the ASCII space (0x20) which is considered printable.
+    ///
+    /// All characters except those defined in the Unicode character
+    /// database under the following categories are considered printable:
+    /// * Cc (Other, Control)
+    /// * Cf (Other, Format)
+    /// * Cs (Other, Surrogate)
+    /// * Co (Other, Private Use)
+    /// * Cn (Other, Not Assigned) -- NOTE(review): confirm `is_other()` detects these
+    /// * Zl (Separator, Line) ('\u2028', LINE SEPARATOR)
+    /// * Zp (Separator, Paragraph) ('\u2029', PARAGRAPH SEPARATOR)
+    /// * Zs (Separator, Space) other than ASCII space ('\x20').
+    #[pymethod]
+    fn isprintable(&self, _vm: &VirtualMachine) -> bool {
+        self.value.chars().all(|c| match c {
+            '\u{0020}' => true,
+            _ => !(c.is_other() || c.is_separator()),
+        })
+    }
+
     // cpython's isspace ignores whitespace, including \t and \n, etc, unless the whole string is empty
     // which is why isspace is using is_ascii_whitespace. Same for isupper & islower
     #[pymethod]