diff options
author | Ryan M. Moore <rmm1047@gmail.com> | 2024-12-30 22:00:57 -0500 |
---|---|---|
committer | Louis Pilfold <louis@lpil.uk> | 2025-01-03 21:02:38 +0000 |
commit | 6f44f8382a7dad77c42d53193701ae8e49beb214 (patch) | |
tree | 013ef4d1e253a88ecfad21eaac82a22a77b871b9 | |
parent | c5d0edeaf6edd3280883497d931bdae8aa88afa5 (diff) | |
download | gleam_stdlib-6f44f8382a7dad77c42d53193701ae8e49beb214.tar.gz gleam_stdlib-6f44f8382a7dad77c42d53193701ae8e49beb214.zip |
Fix non-character handling in `string.utf_codepoint`
Treats `U+FFFE` and `U+FFFF` as valid Unicode codepoints rather than errors. See #778.
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rw-r--r-- | src/gleam/string.gleam | 1 | ||||
-rw-r--r-- | test/gleam/string_test.gleam | 32 |
3 files changed, 31 insertions, 4 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 60b92de..6945343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ - The deprecated `function.compose`, `function.constant`, `function.apply*`, `function.curry*`, `result.nil_error`, `list.concat`, `bool.compare`, and `bool.to_int` functions have been removed. +- Fixed a bug where `string.utf_codepoint` would treat valid Unicode codepoints + `U+FFFE` and `U+FFFF` as invalid. ## v0.51.0 - 2024-12-22 diff --git a/src/gleam/string.gleam b/src/gleam/string.gleam index 8802f09..fd43b29 100644 --- a/src/gleam/string.gleam +++ b/src/gleam/string.gleam @@ -809,7 +809,6 @@ pub fn from_utf_codepoints(utf_codepoints: List(UtfCodepoint)) -> String pub fn utf_codepoint(value: Int) -> Result(UtfCodepoint, Nil) { case value { i if i > 1_114_111 -> Error(Nil) - 65_534 | 65_535 -> Error(Nil) i if i >= 55_296 && i <= 57_343 -> Error(Nil) i if i < 0 -> Error(Nil) i -> Ok(unsafe_int_to_utf_codepoint(i)) diff --git a/test/gleam/string_test.gleam b/test/gleam/string_test.gleam index 4eddb9c..14e6476 100644 --- a/test/gleam/string_test.gleam +++ b/test/gleam/string_test.gleam @@ -702,17 +702,43 @@ pub fn from_utf_codepoints_test() { } pub fn utf_codepoint_test() { - string.utf_codepoint(1_114_444) + // Less than the lower bound on valid codepoints + string.utf_codepoint(-1) |> should.be_error - string.utf_codepoint(65_534) + // The lower bound on valid codepoints + string.utf_codepoint(0) + |> should.be_ok + + // The upper bound for valid code points + string.utf_codepoint(1_114_111) + |> should.be_ok + + // Greater than the upper bound on valid codepoints + string.utf_codepoint(1_114_112) |> should.be_error + // Non-characters U+FFFE and U+FFFF are valid codepoints. See (#778). + string.utf_codepoint(65_534) + |> should.be_ok + string.utf_codepoint(65_535) + |> should.be_ok + + // One less than the lowest "High-surrogate code point" + string.utf_codepoint(55_295) + |> should.be_ok + + // Lowest value of the "High-surrogate code point" (U+D800 to U+DBFF) string.utf_codepoint(55_296) |> should.be_error - string.utf_codepoint(-1) + // Highest value of the "Low-surrogate code point" (U+DC00 to U+DFFF) + string.utf_codepoint(57_343) |> should.be_error + + // One greater than the highest "Low-surrogate code point" + string.utf_codepoint(57_344) + |> should.be_ok } pub fn bit_array_utf_codepoint_test() { |