diff options
author | Giacomo Cavalieri <giacomo.cavalieri@icloud.com> | 2024-11-12 18:54:54 +0100 |
---|---|---|
committer | Louis Pilfold <louis@lpil.uk> | 2024-11-15 15:22:34 +0000 |
commit | 5fced08f04dd7c2dad7c644b1333926a9126d7c9 (patch) | |
tree | ea6cca57245fb6022f433ac7b9886a3979ebe2c9 | |
parent | 32f29ae21f8466f431ac92a264e9b6482bc63c1c (diff) | |
download | gleam_stdlib-5fced08f04dd7c2dad7c644b1333926a9126d7c9.tar.gz gleam_stdlib-5fced08f04dd7c2dad7c644b1333926a9126d7c9.zip |
add string and bytes tree modules
-rw-r--r-- | src/gleam/bytes_tree.gleam | 186 | ||||
-rw-r--r-- | src/gleam/string_tree.gleam | 248 | ||||
-rw-r--r-- | test/gleam/bytes_tree_test.gleam | 94 | ||||
-rw-r--r-- | test/gleam/string_tree_test.gleam | 149 |
4 files changed, 677 insertions, 0 deletions
diff --git a/src/gleam/bytes_tree.gleam b/src/gleam/bytes_tree.gleam new file mode 100644 index 0000000..f5b5f8b --- /dev/null +++ b/src/gleam/bytes_tree.gleam @@ -0,0 +1,186 @@ +//// `BytesTree` is a type used for efficiently building binary content to be +//// written to a file or a socket. Internally it is represented as tree so to +//// append or prepend to a bytes tree is a constant time operation that +//// allocates a new node in the tree without copying any of the content. When +//// writing to an output stream the tree is traversed and the content is sent +//// directly rather than copying it into a single buffer beforehand. +//// +//// If we append one bit array to another the bit arrays must be copied to a +//// new location in memory so that they can sit together. This behaviour +//// enables efficient reading of the data but copying can be expensive, +//// especially if we want to join many bit arrays together. +//// +//// BytesTree is different in that it can be joined together in constant +//// time using minimal memory, and then can be efficiently converted to a +//// bit array using the `to_bit_array` function. +//// +//// Byte trees are always byte aligned, so that a number of bits that is not +//// divisible by 8 will be padded with 0s. +//// +//// On Erlang this type is compatible with Erlang's iolists. + +// TODO: pad bit arrays to byte boundaries when adding to a tree. +import gleam/bit_array +import gleam/list +import gleam/string_tree.{type StringTree} + +pub opaque type BytesTree { + Bytes(BitArray) + Text(StringTree) + Many(List(BytesTree)) +} + +/// Create an empty `BytesTree`. Useful as the start of a pipe chaining many +/// trees together. +/// +pub fn new() -> BytesTree { + concat([]) +} + +/// Prepends a bit array to the start of a bytes tree. +/// +/// Runs in constant time. +/// +pub fn prepend(to second: BytesTree, prefix first: BitArray) -> BytesTree { + append_tree(from_bit_array(first), second) +} + +/// Appends a bit array to the end of a bytes tree. +/// +/// Runs in constant time. +/// +pub fn append(to first: BytesTree, suffix second: BitArray) -> BytesTree { + append_tree(first, from_bit_array(second)) +} + +/// Prepends a bytes tree onto the start of another. +/// +/// Runs in constant time. +/// +pub fn prepend_tree(to second: BytesTree, prefix first: BytesTree) -> BytesTree { + append_tree(first, second) +} + +/// Appends a bytes tree onto the end of another. +/// +/// Runs in constant time. +/// +@external(erlang, "gleam_stdlib", "iodata_append") +pub fn append_tree(to first: BytesTree, suffix second: BytesTree) -> BytesTree { + case second { + Many(trees) -> Many([first, ..trees]) + _ -> Many([first, second]) + } +} + +/// Prepends a string onto the start of a bytes tree. +/// +/// Runs in constant time when running on Erlang. +/// Runs in linear time with the length of the string otherwise. +/// +pub fn prepend_string(to second: BytesTree, prefix first: String) -> BytesTree { + append_tree(from_string(first), second) +} + +/// Appends a string onto the end of a bytes tree. +/// +/// Runs in constant time when running on Erlang. +/// Runs in linear time with the length of the string otherwise. +/// +pub fn append_string(to first: BytesTree, suffix second: String) -> BytesTree { + append_tree(first, from_string(second)) +} + +/// Joins a list of bytes trees into a single one. +/// +/// Runs in constant time. +/// +@external(erlang, "gleam_stdlib", "identity") +pub fn concat(trees: List(BytesTree)) -> BytesTree { + Many(trees) +} + +/// Joins a list of bit arrays into a single bytes tree. +/// +/// Runs in constant time. +/// +@external(erlang, "gleam_stdlib", "identity") +pub fn concat_bit_arrays(bits: List(BitArray)) -> BytesTree { + bits + |> list.map(fn(b) { from_bit_array(b) }) + |> concat() +} + +/// Creates a new bytes tree from a string. +/// +/// Runs in constant time when running on Erlang. +/// Runs in linear time otherwise. +/// +@external(erlang, "gleam_stdlib", "wrap_list") +pub fn from_string(string: String) -> BytesTree { + Text(string_tree.from_string(string)) +} + +/// Creates a new bytes tree from a string tree. +/// +/// Runs in constant time when running on Erlang. +/// Runs in linear time otherwise. +/// +@external(erlang, "gleam_stdlib", "wrap_list") +pub fn from_string_tree(tree: string_tree.StringTree) -> BytesTree { + Text(tree) +} + +/// Creates a new bytes tree from a bit array. +/// +/// Runs in constant time. +/// +@external(erlang, "gleam_stdlib", "wrap_list") +pub fn from_bit_array(bits: BitArray) -> BytesTree { + Bytes(bits) +} + +/// Turns a bytes tree into a bit array. +/// +/// Runs in linear time. +/// +/// When running on Erlang this function is implemented natively by the +/// virtual machine and is highly optimised. +/// +@external(erlang, "erlang", "list_to_bitstring") +pub fn to_bit_array(tree: BytesTree) -> BitArray { + [[tree]] + |> to_list([]) + |> list.reverse + |> bit_array.concat +} + +fn to_list(stack: List(List(BytesTree)), acc: List(BitArray)) -> List(BitArray) { + case stack { + [] -> acc + + [[], ..remaining_stack] -> to_list(remaining_stack, acc) + + [[Bytes(bits), ..rest], ..remaining_stack] -> + to_list([rest, ..remaining_stack], [bits, ..acc]) + + [[Text(tree), ..rest], ..remaining_stack] -> { + let bits = bit_array.from_string(string_tree.to_string(tree)) + to_list([rest, ..remaining_stack], [bits, ..acc]) + } + + [[Many(trees), ..rest], ..remaining_stack] -> + to_list([trees, rest, ..remaining_stack], acc) + } +} + +/// Returns the size of the bytes tree's content in bytes. +/// +/// Runs in linear time. +/// +@external(erlang, "erlang", "iolist_size") +pub fn byte_size(tree: BytesTree) -> Int { + [[tree]] + |> to_list([]) + |> list.fold(0, fn(acc, bits) { bit_array.byte_size(bits) + acc }) +} diff --git a/src/gleam/string_tree.gleam b/src/gleam/string_tree.gleam new file mode 100644 index 0000000..9aeb367 --- /dev/null +++ b/src/gleam/string_tree.gleam @@ -0,0 +1,248 @@ +import gleam/list + +/// `StringTree` is a type used for efficiently building text content to be +/// written to a file or a socket. Internally it is represented as tree so to +/// append or prepend to a string tree is a constant time operation that +/// allocates a new node in the tree without copying any of the content. When +/// writing to an output stream the tree is traversed and the content is sent +/// directly rather than copying it into a single buffer beforehand. +/// +/// On Erlang this type is compatible with Erlang's iodata. On JavaScript this +/// type is compatible with normal strings. +/// +/// The BEAM virtual machine has an optimisation for appending strings, where it +/// will mutate the string buffer when safe to do so, so if you are looking to +/// build a string through appending many small strings then you may get better +/// performance by not using a string tree. Always benchmark your performance +/// sensitive code. +/// +pub type StringTree + +/// Create an empty `StringTree`. Useful as the start of a pipe chaining many +/// trees together. +/// +pub fn new() -> StringTree { + do_from_strings([]) +} + +/// Prepends a `String` onto the start of some `StringTree`. +/// +/// Runs in constant time. +/// +pub fn prepend(to tree: StringTree, prefix prefix: String) -> StringTree { + append_tree(from_string(prefix), tree) +} + +/// Appends a `String` onto the end of some `StringTree`. +/// +/// Runs in constant time. +/// +pub fn append(to tree: StringTree, suffix second: String) -> StringTree { + append_tree(tree, from_string(second)) +} + +/// Prepends some `StringTree` onto the start of another. +/// +/// Runs in constant time. +/// +pub fn prepend_tree( + to tree: StringTree, + prefix prefix: StringTree, +) -> StringTree { + do_append(prefix, tree) +} + +/// Appends some `StringTree` onto the end of another. +/// +/// Runs in constant time. +/// +pub fn append_tree(to tree: StringTree, suffix suffix: StringTree) -> StringTree { + do_append(tree, suffix) +} + +@external(erlang, "gleam_stdlib", "iodata_append") +@external(javascript, "../gleam_stdlib.mjs", "add") +fn do_append(a: StringTree, b: StringTree) -> StringTree + +/// Converts a list of strings into a `StringTree`. +/// +/// Runs in constant time. +/// +pub fn from_strings(strings: List(String)) -> StringTree { + do_from_strings(strings) +} + +@external(erlang, "gleam_stdlib", "identity") +@external(javascript, "../gleam_stdlib.mjs", "concat") +fn do_from_strings(a: List(String)) -> StringTree + +/// Joins a list of trees into a single tree. +/// +/// Runs in constant time. +/// +pub fn concat(trees: List(StringTree)) -> StringTree { + do_concat(trees) +} + +@external(erlang, "gleam_stdlib", "identity") +@external(javascript, "../gleam_stdlib.mjs", "concat") +fn do_concat(trees: List(StringTree)) -> StringTree + +/// Converts a string into a `StringTree`. +/// +/// Runs in constant time. +/// +pub fn from_string(string: String) -> StringTree { + do_from_string(string) +} + +@external(erlang, "gleam_stdlib", "identity") +@external(javascript, "../gleam_stdlib.mjs", "identity") +fn do_from_string(string: String) -> StringTree + +/// Turns a `StringTree` into a `String` +/// +/// This function is implemented natively by the virtual machine and is highly +/// optimised. +/// +pub fn to_string(tree: StringTree) -> String { + do_to_string(tree) +} + +@external(erlang, "unicode", "characters_to_binary") +@external(javascript, "../gleam_stdlib.mjs", "identity") +fn do_to_string(tree: StringTree) -> String + +/// Returns the size of the `StringTree` in bytes. +/// +pub fn byte_size(tree: StringTree) -> Int { + do_byte_size(tree) +} + +@external(erlang, "erlang", "iolist_size") +@external(javascript, "../gleam_stdlib.mjs", "length") +fn do_byte_size(tree: StringTree) -> Int + +/// Joins the given trees into a new tree separated with the given string. +/// +pub fn join(trees: List(StringTree), with sep: String) -> StringTree { + trees + |> list.intersperse(from_string(sep)) + |> concat +} + +/// Converts a `StringTree` to a new one where the contents have been +/// lowercased. +/// +pub fn lowercase(tree: StringTree) -> StringTree { + do_lowercase(tree) +} + +@external(erlang, "string", "lowercase") +@external(javascript, "../gleam_stdlib.mjs", "lowercase") +fn do_lowercase(tree: StringTree) -> StringTree + +/// Converts a `StringTree` to a new one where the contents have been +/// uppercased. +/// +pub fn uppercase(tree: StringTree) -> StringTree { + do_uppercase(tree) +} + +@external(erlang, "string", "uppercase") +@external(javascript, "../gleam_stdlib.mjs", "uppercase") +fn do_uppercase(tree: StringTree) -> StringTree + +/// Converts a `StringTree` to a new one with the contents reversed. +/// +pub fn reverse(tree: StringTree) -> StringTree { + do_reverse(tree) +} + +@external(erlang, "string", "reverse") +fn do_reverse(tree: StringTree) -> StringTree { + tree + |> to_string + |> do_to_graphemes + |> list.reverse + |> from_strings +} + +@external(javascript, "../gleam_stdlib.mjs", "graphemes") +fn do_to_graphemes(string: String) -> List(String) + +/// Splits a `StringTree` on a given pattern into a list of trees. +/// +pub fn split(tree: StringTree, on pattern: String) -> List(StringTree) { + do_split(tree, pattern) +} + +type Direction { + All +} + +@external(javascript, "../gleam_stdlib.mjs", "split") +fn do_split(tree: StringTree, pattern: String) -> List(StringTree) { + erl_split(tree, pattern, All) +} + +@external(erlang, "string", "split") +fn erl_split(a: StringTree, b: String, c: Direction) -> List(StringTree) + +/// Replaces all instances of a pattern with a given string substitute. +/// +@external(erlang, "gleam_stdlib", "string_replace") +@external(javascript, "../gleam_stdlib.mjs", "string_replace") +pub fn replace( + in tree: StringTree, + each pattern: String, + with substitute: String, +) -> StringTree + +/// Compares two string trees to determine if they have the same textual +/// content. +/// +/// Comparing two string trees using the `==` operator may return `False` even +/// if they have the same content as they may have been build in different ways, +/// so using this function is often preferred. +/// +/// ## Examples +/// +/// ```gleam +/// from_strings(["a", "b"]) == from_string("ab") +/// // -> False +/// ``` +/// +/// ```gleam +/// is_equal(from_strings(["a", "b"]), from_string("ab")) +/// // -> True +/// ``` +/// +@external(erlang, "string", "equal") +pub fn is_equal(a: StringTree, b: StringTree) -> Bool { + a == b +} + +/// Inspects a `StringTree` to determine if it is equivalent to an empty string. +/// +/// ## Examples +/// +/// ```gleam +/// from_string("ok") |> is_empty +/// // -> False +/// ``` +/// +/// ```gleam +/// from_string("") |> is_empty +/// // -> True +/// ``` +/// +/// ```gleam +/// from_strings([]) |> is_empty +/// // -> True +/// ``` +/// +@external(erlang, "string", "is_empty") +pub fn is_empty(tree: StringTree) -> Bool { + from_string("") == tree +} diff --git a/test/gleam/bytes_tree_test.gleam b/test/gleam/bytes_tree_test.gleam new file mode 100644 index 0000000..1f7245a --- /dev/null +++ b/test/gleam/bytes_tree_test.gleam @@ -0,0 +1,94 @@ +import gleam/bytes_tree +import gleam/should +import gleam/string_tree + +pub fn tree_test() { + let data = + bytes_tree.from_bit_array(<<1>>) + |> bytes_tree.append(<<2>>) + |> bytes_tree.append(<<3>>) + |> bytes_tree.prepend(<<0>>) + + data + |> bytes_tree.to_bit_array + |> should.equal(<<0, 1, 2, 3>>) + + data + |> bytes_tree.byte_size + |> should.equal(4) +} + +pub fn tree_with_strings_test() { + let data = + bytes_tree.from_bit_array(<<1>>) + |> bytes_tree.append_string("2") + |> bytes_tree.append_string("3") + |> bytes_tree.prepend_string("0") + + data + |> bytes_tree.to_bit_array + |> should.equal(<<"0":utf8, 1, "2":utf8, "3":utf8>>) + + data + |> bytes_tree.byte_size + |> should.equal(4) +} + +pub fn tree_with_trees_test() { + let data = + bytes_tree.from_bit_array(<<1>>) + |> bytes_tree.append_tree(bytes_tree.from_bit_array(<<2>>)) + |> bytes_tree.append_tree(bytes_tree.from_bit_array(<<3>>)) + |> bytes_tree.prepend_tree(bytes_tree.from_bit_array(<<0>>)) + + data + |> bytes_tree.to_bit_array + |> should.equal(<<0, 1, 2, 3>>) + + data + |> bytes_tree.byte_size + |> should.equal(4) +} + +pub fn concat_test() { + [ + bytes_tree.from_bit_array(<<1, 2>>), + bytes_tree.from_bit_array(<<3, 4>>), + bytes_tree.from_bit_array(<<5, 6>>), + ] + |> bytes_tree.concat + |> bytes_tree.to_bit_array + |> should.equal(<<1, 2, 3, 4, 5, 6>>) +} + +pub fn concat_bit_arrays_test() { + bytes_tree.concat_bit_arrays([<<"h":utf8>>, <<"e":utf8>>, <<"y":utf8>>]) + |> bytes_tree.to_bit_array + |> should.equal(<<"hey":utf8>>) +} + +pub fn from_bit_array() { + // Regression test: no additional modification of the tree + bytes_tree.from_bit_array(<<>>) + |> bytes_tree.to_bit_array + |> should.equal(<<>>) +} + +pub fn from_string_test() { + // Regression test: no additional modification of the tree + bytes_tree.from_string("") + |> bytes_tree.to_bit_array + |> should.equal(<<>>) +} + +pub fn new_test() { + bytes_tree.new() + |> bytes_tree.to_bit_array + |> should.equal(<<>>) +} + +pub fn from_string_tree_test() { + string_tree.from_string("hello") + |> bytes_tree.from_string_tree + |> should.equal(bytes_tree.from_string("hello")) +} diff --git a/test/gleam/string_tree_test.gleam b/test/gleam/string_tree_test.gleam new file mode 100644 index 0000000..f2f14e3 --- /dev/null +++ b/test/gleam/string_tree_test.gleam @@ -0,0 +1,149 @@ +import gleam/should +import gleam/string_tree + +pub fn string_tree_test() { + let data = + string_tree.from_string("ello") + |> string_tree.append(",") + |> string_tree.append(" world!") + |> string_tree.prepend("H") + + data + |> string_tree.to_string + |> should.equal("Hello, world!") + + data + |> string_tree.byte_size + |> should.equal(13) + + let data = + string_tree.from_string("ello") + |> string_tree.append_tree(string_tree.from_string(",")) + |> string_tree.append_tree( + string_tree.concat([ + string_tree.from_string(" wo"), + string_tree.from_string("rld!"), + ]), + ) + |> string_tree.prepend_tree(string_tree.from_string("H")) + + data + |> string_tree.to_string + |> should.equal("Hello, world!") + + data + |> string_tree.byte_size + |> should.equal(13) +} + +pub fn reverse_test() { + "Ĺo͂řȩm̅" + |> string_tree.from_string + |> string_tree.reverse + |> string_tree.reverse + |> string_tree.to_string + |> should.equal("Ĺo͂řȩm̅") + + "Ĺo͂řȩm̅" + |> string_tree.from_string + |> string_tree.reverse + |> string_tree.to_string + |> should.equal("m̅ȩřo͂Ĺ") + + "👶🏿" + |> string_tree.from_string + |> string_tree.reverse + |> string_tree.reverse + |> string_tree.to_string + |> should.equal("👶🏿") + + "👶🏿" + |> string_tree.from_string + |> string_tree.reverse + |> string_tree.to_string + |> should.equal("👶🏿") +} + +pub fn lowercase_test() { + ["Gleam", "Gleam"] + |> string_tree.from_strings + |> string_tree.lowercase + |> string_tree.to_string + |> should.equal("gleamgleam") +} + +pub fn uppercase_test() { + ["Gleam", "Gleam"] + |> string_tree.from_strings + |> string_tree.uppercase + |> string_tree.to_string + |> should.equal("GLEAMGLEAM") +} + +pub fn split_test() { + "Gleam,Erlang,Elixir" + |> string_tree.from_string + |> string_tree.split(",") + |> should.equal([ + string_tree.from_string("Gleam"), + string_tree.from_string("Erlang"), + string_tree.from_string("Elixir"), + ]) + + ["Gleam, Erl", "ang,Elixir"] + |> string_tree.from_strings + |> string_tree.split(", ") + |> should.equal([ + string_tree.from_string("Gleam"), + string_tree.from_strings(["Erl", "ang,Elixir"]), + ]) +} + +pub fn is_equal_test() { + string_tree.from_string("12") + |> string_tree.is_equal(string_tree.from_strings(["1", "2"])) + |> should.be_true + + string_tree.from_string("12") + |> string_tree.is_equal(string_tree.from_string("12")) + |> should.be_true + + string_tree.from_string("12") + |> string_tree.is_equal(string_tree.from_string("2")) + |> should.be_false +} + +pub fn is_empty_test() { + string_tree.from_string("") + |> string_tree.is_empty + |> should.be_true + + string_tree.from_string("12") + |> string_tree.is_empty + |> should.be_false + + string_tree.from_strings([]) + |> string_tree.is_empty + |> should.be_true + + string_tree.from_strings(["", ""]) + |> string_tree.is_empty + |> should.be_true +} + +pub fn new_test() { + string_tree.new() + |> string_tree.to_string + |> should.equal("") +} + +pub fn join_test() { + [ + string_tree.from_string("Gleam"), + string_tree.from_string("Elixir"), + string_tree.from_string("Erlang"), + ] + |> string_tree.join(", ") + |> string_tree.to_string + |> should.equal("Gleam, Elixir, Erlang") +} |