From 2722bc06de8ab99d872e34736dd81d43c99f7c95 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 27 Mar 2026 16:36:31 +0000
Subject: [PATCH 1/3] Initial plan


From 5dd88ee5ae6e777732be18fe96abb03ad3a1711d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 27 Mar 2026 16:57:44 +0000
Subject: [PATCH 2/3] Fix isalnum/isalpha to use Unicode general category
 checks; stop regex \w from matching Mn characters

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/6f573a91-8811-486c-933d-7ba9a9067643

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 Cargo.lock                                  |  1 +
 crates/sre_engine/Cargo.toml                |  1 +
 crates/sre_engine/src/string.rs             | 16 ++++++++++--
 crates/vm/src/builtins/str.rs               | 27 +++++++++++++++++++--
 extra_tests/snippets/builtin_str_unicode.py |  9 +++++++
 5 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 139dc98947e..7fa1a444646 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3249,6 +3249,7 @@ dependencies = [
  "num_enum",
  "optional",
  "rustpython-wtf8",
+ "unic-ucd-category",
 ]
 
 [[package]]
diff --git a/crates/sre_engine/Cargo.toml b/crates/sre_engine/Cargo.toml
index 4f899e6b3e9..448f34509a6 100644
--- a/crates/sre_engine/Cargo.toml
+++ b/crates/sre_engine/Cargo.toml
@@ -16,6 +16,7 @@ harness = false
 
 [dependencies]
 rustpython-wtf8 = { workspace = true }
+unic-ucd-category = { workspace = true }
 num_enum = { workspace = true }
 bitflags = { workspace = true }
 optional = { workspace = true }
diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs
index 489819bfb3e..7c42b0d8dad 100644
--- a/crates/sre_engine/src/string.rs
+++ b/crates/sre_engine/src/string.rs
@@ -1,4 +1,5 @@
 use rustpython_wtf8::Wtf8;
+use unic_ucd_category::GeneralCategory;
 
 #[derive(Debug, Clone, Copy)]
 pub struct StringCursor {
@@ -441,9 +442,20 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
 }
 #[inline]
 pub(crate) fn is_uni_alnum(ch: u32) -> bool {
-    // TODO: check with cpython
     char::try_from(ch)
-        .map(|x| x.is_alphanumeric())
+        .map(|c| {
+            matches!(
+                GeneralCategory::of(c),
+                GeneralCategory::UppercaseLetter
+                    | GeneralCategory::LowercaseLetter
+                    | GeneralCategory::TitlecaseLetter
+                    | GeneralCategory::ModifierLetter
+                    | GeneralCategory::OtherLetter
+                    | GeneralCategory::DecimalNumber
+                    | GeneralCategory::LetterNumber
+                    | GeneralCategory::OtherNumber
+            )
+        })
         .unwrap_or(false)
 }
 #[inline]
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
index 8707c5cf769..fc724fb8664 100644
--- a/crates/vm/src/builtins/str.rs
+++ b/crates/vm/src/builtins/str.rs
@@ -945,7 +945,20 @@ impl PyStr {
 
     #[pymethod]
     fn isalnum(&self) -> bool {
-        !self.data.is_empty() && self.char_all(char::is_alphanumeric)
+        !self.data.is_empty()
+            && self.char_all(|c| {
+                matches!(
+                    GeneralCategory::of(c),
+                    GeneralCategory::UppercaseLetter
+                        | GeneralCategory::LowercaseLetter
+                        | GeneralCategory::TitlecaseLetter
+                        | GeneralCategory::ModifierLetter
+                        | GeneralCategory::OtherLetter
+                        | GeneralCategory::DecimalNumber
+                        | GeneralCategory::LetterNumber
+                        | GeneralCategory::OtherNumber
+                )
+            })
     }
 
     #[pymethod]
@@ -1056,7 +1069,17 @@ impl PyStr {
 
     #[pymethod]
     fn isalpha(&self) -> bool {
-        !self.data.is_empty() && self.char_all(char::is_alphabetic)
+        !self.data.is_empty()
+            && self.char_all(|c| {
+                matches!(
+                    GeneralCategory::of(c),
+                    GeneralCategory::UppercaseLetter
+                        | GeneralCategory::LowercaseLetter
+                        | GeneralCategory::TitlecaseLetter
+                        | GeneralCategory::ModifierLetter
+                        | GeneralCategory::OtherLetter
+                )
+            })
     }
 
     #[pymethod]
diff --git a/extra_tests/snippets/builtin_str_unicode.py b/extra_tests/snippets/builtin_str_unicode.py
index ca4a99199c3..fe908869f34 100644
--- a/extra_tests/snippets/builtin_str_unicode.py
+++ b/extra_tests/snippets/builtin_str_unicode.py
@@ -38,3 +38,12 @@
 # TODO: add east_asian_width and mirrored
 # assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N'
 # assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")
+
+# U+0345 COMBINING GREEK YPOGEGRAMMENI (category Mn) should not be alphanumeric.
+# CPython's isalpha, which isalnum builds on, uses letter categories (Lu/Ll/Lt/Lm/Lo),
+# not the broader Unicode Alphabetic derived property.
+assert not "\u0345".isalpha(), "isalpha should not match Mn category characters"
+assert not "\u0345".isalnum(), "isalnum should not match Mn category characters"
+
+import re
+assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"

From fab1c0cc01724cf9bc8eb3f508e7cf130bb3d878 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 27 Mar 2026 23:47:11 +0000
Subject: [PATCH 3/3] Fix import ordering: move import re to be sorted with
 import unicodedata

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/18cc7216-15cf-40d1-8726-23e21aa4c368

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 extra_tests/snippets/builtin_str_unicode.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/extra_tests/snippets/builtin_str_unicode.py b/extra_tests/snippets/builtin_str_unicode.py
index fe908869f34..08553c55c58 100644
--- a/extra_tests/snippets/builtin_str_unicode.py
+++ b/extra_tests/snippets/builtin_str_unicode.py
@@ -11,6 +11,7 @@
 
 assert c == "👋👋👋"
 
+import re
 import unicodedata
 
 assert unicodedata.category("a") == "Ll"
@@ -44,6 +45,4 @@
 # not the broader Unicode Alphabetic derived property.
 assert not "\u0345".isalpha(), "isalpha should not match Mn category characters"
 assert not "\u0345".isalnum(), "isalnum should not match Mn category characters"
-
-import re
 assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"