fix regex.scan to work correctly with utf8 strings

author: inoas <mail@inoas.com> 2022-10-25 23:01:23 +0200
committer: Louis Pilfold <louis@lpil.uk> 2022-10-27 15:13:03 +0100
commit: ed9405a0eb12061b3ce680f80e6d89b5ff518f21 (patch)
tree: 107d7ea2116fef3358c8ecd88ee84e8d8a2e315e
parent: c758631b79d594b884a8031182464e9251bd9ee4 (diff)
download: gleam_stdlib-ed9405a0eb12061b3ce680f80e6d89b5ff518f21.tar.gz
gleam_stdlib-ed9405a0eb12061b3ce680f80e6d89b5ff518f21.zip
2 files changed, 21 insertions, 6 deletions
diff --git a/src/gleam_stdlib.erl b/src/gleam_stdlib.erl
index dc3727c..04b73eb 100644
--- a/src/gleam_stdlib.erl
+++ b/src/gleam_stdlib.erl
@@ -200,16 +200,17 @@ regex_check(Regex, String) ->
 regex_split(Regex, String) ->
     re:split(String, Regex).
 
-regex_submatches(String, {S, L}) ->
-    SubMatch = string:slice(String, S, L),
-    case string:is_empty(SubMatch) of
+regex_submatches(String, {Start, Length}) ->
+    Binary = unicode:characters_to_binary(String, unicode, unicode),
+    BinarySlice = binary:part(Binary, {Start, Length}),
+    case string:is_empty(binary_to_list(BinarySlice)) of
         true -> none;
-        false -> {some, SubMatch}
+        false -> {some, BinarySlice}
     end.
 
-regex_matches(String, [{S, L} | Submatches]) ->
+regex_matches(String, [{Start, Length} | Submatches]) ->
     Submatches1 = lists:map(fun(X) -> regex_submatches(String, X) end, Submatches),
-    {match, binary:part(String, S, L), Submatches1}.
+    {match, binary:part(String, Start, Length), Submatches1}.
 
 regex_scan(Regex, String) ->
     case re:run(String, Regex, [global]) of
diff --git a/test/gleam/regex_test.gleam b/test/gleam/regex_test.gleam
index cdd6e68..30d197f 100644
--- a/test/gleam/regex_test.gleam
+++ b/test/gleam/regex_test.gleam
@@ -64,4 +64,18 @@ pub fn scan_test() {
     Match(content: "on a boat", submatches: [None, Some("boat")]),
     Match(content: "in a lake", submatches: [None, Some("lake")]),
   ])
+
+  assert Ok(re) = regex.from_string("answer (\\d+)")
+  regex.scan(re, "Is the answer 42?")
+  |> should.equal([Match(content: "answer 42", submatches: [Some("42")])])
+
+  assert Ok(re) = regex.from_string("(\\d+)")
+  regex.scan(re, "hello 42")
+  |> should.equal([Match(content: "42", submatches: [Some("42")])])
+
+  regex.scan(re, "你好 42")
+  |> should.equal([Match(content: "42", submatches: [Some("42")])])
+
+  regex.scan(re, "你好 42 世界")
+  |> should.equal([Match(content: "42", submatches: [Some("42")])])
 }
author	inoas <mail@inoas.com>	2022-10-25 23:01:23 +0200
committer	Louis Pilfold <louis@lpil.uk>	2022-10-27 15:13:03 +0100
commit	ed9405a0eb12061b3ce680f80e6d89b5ff518f21 (patch)
tree	107d7ea2116fef3358c8ecd88ee84e8d8a2e315e
parent	c758631b79d594b884a8031182464e9251bd9ee4 (diff)
download	gleam_stdlib-ed9405a0eb12061b3ce680f80e6d89b5ff518f21.tar.gz gleam_stdlib-ed9405a0eb12061b3ce680f80e6d89b5ff518f21.zip