From c784755bdc226c97658a5ff429dcfbc567e7b9b3 Mon Sep 17 00:00:00 2001 From: nattzn <38107965+nattzn@users.noreply.github.com> Date: Mon, 15 Jun 2026 16:02:50 +0900 Subject: [PATCH 1/3] mruby-regexp: Fix String#gsub to handle zero-width matches correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit String#gsub {} を切り出した rest ではなく、元の文字列 self と検索位置 pos で進める実装に変更した。 --- mrbgems/mruby-regexp/mrblib/string_regexp.rb | 21 ++++++++++---------- mrbgems/mruby-regexp/test/regexp.rb | 6 ++++++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/mrbgems/mruby-regexp/mrblib/string_regexp.rb b/mrbgems/mruby-regexp/mrblib/string_regexp.rb index e15e507a3a..454614a661 100644 --- a/mrbgems/mruby-regexp/mrblib/string_regexp.rb +++ b/mrbgems/mruby-regexp/mrblib/string_regexp.rb @@ -35,21 +35,22 @@ def gsub(pattern, replacement = nil, &block) end # block case: keep in Ruby to avoid VM callback from C parts = [] - rest = self - while rest.length > 0 - md = pattern.match(rest) + pos = 0 + while pos <= self.length + md = pattern.match(self, pos) break unless md - parts << md.pre_match + match_start = md.begin(0) + match_end = md.end(0) + parts << self[pos...match_start] parts << block.call(md[0]).to_s - matched_len = md[0].length - if matched_len == 0 - parts << rest[0] if rest.length > 0 - rest = rest[1..-1] || "" + if match_start == match_end + parts << self[match_end] if match_end < self.length + pos = match_end + 1 else - rest = md.post_match + pos = match_end end end - parts << rest + parts << self[pos..-1] parts.join end diff --git a/mrbgems/mruby-regexp/test/regexp.rb b/mrbgems/mruby-regexp/test/regexp.rb index e84ead98d4..0c99c60be4 100644 --- a/mrbgems/mruby-regexp/test/regexp.rb +++ b/mrbgems/mruby-regexp/test/regexp.rb @@ -349,6 +349,12 @@ assert_equal "HELLO WORLD", "hello world".gsub(/\w+/) { |m| m.upcase } end +assert("String#gsub with block and zero-width match") do + assert_equal "!abc", "abc".gsub(/^/) { "!" } + assert_equal "a!bc", "abc".gsub(/(?=b)/) { "!" } + assert_equal "!a!b!c!", "abc".gsub(//) { "!" } +end + assert("String#gsub date reformat") do result = "2026-03-21".gsub(/(\d+)-(\d+)-(\d+)/) { "#{$~[3]}/#{$~[2]}/#{$~[1]}" } assert_equal "21/03/2026", result From 9e211c54b05438a148cac2cc097ad4cb25c03401 Mon Sep 17 00:00:00 2001 From: nattzn <38107965+nattzn@users.noreply.github.com> Date: Mon, 15 Jun 2026 17:25:28 +0900 Subject: [PATCH 2/3] mruby-regexp: Fix String#gsub block offsets for UTF-8 strings Add UTF-8 zero-width match coverage to ensure gsub advances by a whole character without splitting multibyte strings. --- mrbgems/mruby-regexp/mrblib/string_regexp.rb | 17 ++++++++++++----- mrbgems/mruby-regexp/test/regexp.rb | 5 +++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/mrbgems/mruby-regexp/mrblib/string_regexp.rb b/mrbgems/mruby-regexp/mrblib/string_regexp.rb index 454614a661..c63e6e3b01 100644 --- a/mrbgems/mruby-regexp/mrblib/string_regexp.rb +++ b/mrbgems/mruby-regexp/mrblib/string_regexp.rb @@ -36,21 +36,28 @@ def gsub(pattern, replacement = nil, &block) # block case: keep in Ruby to avoid VM callback from C parts = [] pos = 0 - while pos <= self.length + len = self.bytesize + while pos <= len md = pattern.match(self, pos) break unless md match_start = md.begin(0) match_end = md.end(0) - parts << self[pos...match_start] + parts << self.byteslice(pos, match_start - pos) parts << block.call(md[0]).to_s if match_start == match_end - parts << self[match_end] if match_end < self.length - pos = match_end + 1 + rest = self.byteslice(match_end..-1) + if rest && rest.bytesize > 0 + char = rest[0] + parts << char + pos = match_end + char.bytesize + else + pos = match_end + 1 + end else pos = match_end end end - parts << self[pos..-1] + parts << self.byteslice(pos..-1) parts.join end diff --git a/mrbgems/mruby-regexp/test/regexp.rb b/mrbgems/mruby-regexp/test/regexp.rb index 0c99c60be4..6739f81254 100644 --- a/mrbgems/mruby-regexp/test/regexp.rb +++ b/mrbgems/mruby-regexp/test/regexp.rb @@ -353,6 +353,11 @@ assert_equal "!abc", "abc".gsub(/^/) { "!" } assert_equal "a!bc", "abc".gsub(/(?=b)/) { "!" } assert_equal "!a!b!c!", "abc".gsub(//) { "!" } + if __ENCODING__ == "UTF-8" + assert_equal "!いろは", "いろは".gsub(/^/) { "!" } + assert_equal "い!ろは", "いろは".gsub(/(?=ろ)/) { "!" } + assert_equal "!い!ろ!は!", "いろは".gsub(//) { "!" } + end end assert("String#gsub date reformat") do From 6d99e8dec81d5b8f47009b119d926ec75b3dd183 Mon Sep 17 00:00:00 2001 From: nattzn <38107965+nattzn@users.noreply.github.com> Date: Mon, 15 Jun 2026 17:39:48 +0900 Subject: [PATCH 3/3] mruby-regexp: Avoid multiline ^ match after final newline --- mrbgems/mruby-regexp/src/re_exec.c | 4 ++-- mrbgems/mruby-regexp/test/regexp.rb | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mrbgems/mruby-regexp/src/re_exec.c b/mrbgems/mruby-regexp/src/re_exec.c index 8f6e047319..39dd39165a 100644 --- a/mrbgems/mruby-regexp/src/re_exec.c +++ b/mrbgems/mruby-regexp/src/re_exec.c @@ -145,7 +145,7 @@ add_thread(pike_state *s, re_threadlist *list, continue; case RE_BOL: - if (sp == s->str || ((s->pat->flags & RE_FLAG_MULTILINE) && sp > s->str && sp[-1] == '\n')) { + if (sp == s->str || ((s->pat->flags & RE_FLAG_MULTILINE) && sp > s->str && sp < s->str_end && sp[-1] == '\n')) { pc++; continue; } return; @@ -452,7 +452,7 @@ bt_match(const mrb_regexp_pattern *pat, const char *str, const char *str_end, } case RE_BOL: - if (sp != str && !(pat->flags & RE_FLAG_MULTILINE && sp > str && sp[-1] == '\n')) return FALSE; + if (sp != str && !(pat->flags & RE_FLAG_MULTILINE && sp > str && sp < str_end && sp[-1] == '\n')) return FALSE; pc++; break; diff --git a/mrbgems/mruby-regexp/test/regexp.rb b/mrbgems/mruby-regexp/test/regexp.rb index 6739f81254..5c20700a00 100644 --- a/mrbgems/mruby-regexp/test/regexp.rb +++ b/mrbgems/mruby-regexp/test/regexp.rb @@ -353,6 +353,9 @@ assert_equal "!abc", "abc".gsub(/^/) { "!" } assert_equal "a!bc", "abc".gsub(/(?=b)/) { "!" } assert_equal "!a!b!c!", "abc".gsub(//) { "!" } + assert_equal "!\n", "\n".gsub(/^/m) { "!" } + assert_equal "!a\n", "a\n".gsub(/^/m) { "!" } + assert_equal "!a\n!b", "a\nb".gsub(/^/m) { "!" } if __ENCODING__ == "UTF-8" assert_equal "!いろは", "いろは".gsub(/^/) { "!" } assert_equal "い!ろは", "いろは".gsub(/(?=ろ)/) { "!" }