From 2722bc06de8ab99d872e34736dd81d43c99f7c95 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:36:31 +0000 Subject: [PATCH 1/3] Initial plan From 5dd88ee5ae6e777732be18fe96abb03ad3a1711d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:57:44 +0000 Subject: [PATCH 2/3] Fix isalnum/isalpha to use Unicode general category checks; fix regex \\w for Mn characters Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/6f573a91-8811-486c-933d-7ba9a9067643 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- Cargo.lock | 1 + crates/sre_engine/Cargo.toml | 1 + crates/sre_engine/src/string.rs | 16 ++++++++++-- crates/vm/src/builtins/str.rs | 27 +++++++++++++++++++-- extra_tests/snippets/builtin_str_unicode.py | 9 +++++++ 5 files changed, 50 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 139dc98947e..7fa1a444646 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3249,6 +3249,7 @@ dependencies = [ "num_enum", "optional", "rustpython-wtf8", + "unic-ucd-category", ] [[package]] diff --git a/crates/sre_engine/Cargo.toml b/crates/sre_engine/Cargo.toml index 4f899e6b3e9..448f34509a6 100644 --- a/crates/sre_engine/Cargo.toml +++ b/crates/sre_engine/Cargo.toml @@ -16,6 +16,7 @@ harness = false [dependencies] rustpython-wtf8 = { workspace = true } +unic-ucd-category = { workspace = true } num_enum = { workspace = true } bitflags = { workspace = true } optional = { workspace = true } diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs index 489819bfb3e..7c42b0d8dad 100644 --- a/crates/sre_engine/src/string.rs +++ b/crates/sre_engine/src/string.rs @@ -1,4 +1,5 @@ use rustpython_wtf8::Wtf8; +use unic_ucd_category::GeneralCategory; #[derive(Debug, Clone, Copy)] pub struct StringCursor { @@ -441,9 +442,20 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool { } #[inline] pub(crate) fn is_uni_alnum(ch: u32) -> bool { - // TODO: check with cpython char::try_from(ch) - .map(|x| x.is_alphanumeric()) + .map(|c| { + matches!( + GeneralCategory::of(c), + GeneralCategory::UppercaseLetter + | GeneralCategory::LowercaseLetter + | GeneralCategory::TitlecaseLetter + | GeneralCategory::ModifierLetter + | GeneralCategory::OtherLetter + | GeneralCategory::DecimalNumber + | GeneralCategory::LetterNumber + | GeneralCategory::OtherNumber + ) + }) .unwrap_or(false) } #[inline] diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 8707c5cf769..fc724fb8664 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -945,7 +945,20 @@ impl PyStr { #[pymethod] fn isalnum(&self) -> bool { - !self.data.is_empty() && self.char_all(char::is_alphanumeric) + !self.data.is_empty() + && self.char_all(|c| { + matches!( + GeneralCategory::of(c), + GeneralCategory::UppercaseLetter + | GeneralCategory::LowercaseLetter + | GeneralCategory::TitlecaseLetter + | GeneralCategory::ModifierLetter + | GeneralCategory::OtherLetter + | GeneralCategory::DecimalNumber + | GeneralCategory::LetterNumber + | GeneralCategory::OtherNumber + ) + }) } #[pymethod] @@ -1056,7 +1069,17 @@ impl PyStr { #[pymethod] fn isalpha(&self) -> bool { - !self.data.is_empty() && self.char_all(char::is_alphabetic) + !self.data.is_empty() + && self.char_all(|c| { + matches!( + GeneralCategory::of(c), + GeneralCategory::UppercaseLetter + | GeneralCategory::LowercaseLetter + | GeneralCategory::TitlecaseLetter + | GeneralCategory::ModifierLetter + | GeneralCategory::OtherLetter + ) + }) } #[pymethod] diff --git a/extra_tests/snippets/builtin_str_unicode.py b/extra_tests/snippets/builtin_str_unicode.py index ca4a99199c3..fe908869f34 100644 --- a/extra_tests/snippets/builtin_str_unicode.py +++ b/extra_tests/snippets/builtin_str_unicode.py @@ -38,3 +38,12 @@ # TODO: add east_asian_width and mirrored # assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N' # assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a") + +# U+0345 COMBINING GREEK YPOGEGRAMMENI (category Mn) should not be alphanumeric. +# CPython's isalpha/isalnum use Unicode letter categories (Lu/Ll/Lt/Lm/Lo), +# not the broader Unicode Alphabetic derived property. +assert not "\u0345".isalpha(), "isalpha should not match Mn category characters" +assert not "\u0345".isalnum(), "isalnum should not match Mn category characters" + +import re +assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)" From fab1c0cc01724cf9bc8eb3f508e7cf130bb3d878 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Mar 2026 23:47:11 +0000 Subject: [PATCH 3/3] Fix import ordering: move import re to be sorted with import unicodedata Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/18cc7216-15cf-40d1-8726-23e21aa4c368 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- extra_tests/snippets/builtin_str_unicode.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extra_tests/snippets/builtin_str_unicode.py b/extra_tests/snippets/builtin_str_unicode.py index fe908869f34..08553c55c58 100644 --- a/extra_tests/snippets/builtin_str_unicode.py +++ b/extra_tests/snippets/builtin_str_unicode.py @@ -11,6 +11,7 @@ assert c == "👋👋👋" +import re import unicodedata assert unicodedata.category("a") == "Ll" @@ -44,6 +45,4 @@ # not the broader Unicode Alphabetic derived property. assert not "\u0345".isalpha(), "isalpha should not match Mn category characters" assert not "\u0345".isalnum(), "isalnum should not match Mn category characters" - -import re assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"