diff options
author | Joshua Reusch <jreusch4@gmail.com> | 2024-11-18 02:11:26 +0100 |
---|---|---|
committer | Louis Pilfold <louis@lpil.uk> | 2024-11-25 17:49:59 +0000 |
commit | 5ea331e34ea203f1186c9c30f4d001590a69fbf2 (patch) | |
tree | 9914ee2fce4448078a18fc71a23498c972635f70 /src | |
parent | c26b2ddcac5af6736cd95af11e010b3bcb002374 (diff) | |
download | gleam_stdlib-5ea331e34ea203f1186c9c30f4d001590a69fbf2.tar.gz gleam_stdlib-5ea331e34ea203f1186c9c30f4d001590a69fbf2.zip |
use string patterns and unsafe binary loops
Diffstat (limited to 'src')
-rw-r--r-- | src/gleam/uri.gleam | 198 | ||||
-rw-r--r-- | src/gleam_stdlib.erl | 28 | ||||
-rw-r--r-- | src/gleam_stdlib.mjs | 8 |
3 files changed, 136 insertions, 98 deletions
diff --git a/src/gleam/uri.gleam b/src/gleam/uri.gleam index 811870f..7434548 100644 --- a/src/gleam/uri.gleam +++ b/src/gleam/uri.gleam @@ -120,23 +120,21 @@ fn parse_scheme_loop( pieces: UriPieces, size: Int, ) -> Result(UriPieces, Nil) { - case string.pop_grapheme(uri_string) { + case uri_string { // `/` is not allowed to appear in a scheme so we know it's over and we can // start parsing the authority with slashes. - Ok(#("/", _)) if size == 0 -> - parse_authority_with_slashes(uri_string, pieces) - Ok(#("/", _)) -> { - let scheme = string.slice(original, at_index: 0, length: size) + "/" <> _ if size == 0 -> parse_authority_with_slashes(uri_string, pieces) + "/" <> _ -> { + let scheme = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, scheme: Some(scheme)) parse_authority_with_slashes(uri_string, pieces) } // `?` is not allowed to appear in a schemem, in an authority, or in a path; // so if we see it we know it marks the beginning of the query part. - Ok(#("?", _)) if size == 0 -> - parse_query_with_question_mark(uri_string, pieces) - Ok(#("?", _)) -> { - let scheme = string.slice(original, at_index: 0, length: size) + "?" <> _ if size == 0 -> parse_query_with_question_mark(uri_string, pieces) + "?" <> _ -> { + let scheme = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, scheme: Some(scheme)) parse_query_with_question_mark(uri_string, pieces) } @@ -144,31 +142,34 @@ fn parse_scheme_loop( // `#` is not allowed to appear in a scheme, in an authority, in a path or // in a query; so if we see it we know it marks the beginning of the final // fragment. - Ok(#("#", rest)) if size == 0 -> parse_fragment(rest, pieces) - Ok(#("#", rest)) -> { - let scheme = string.slice(original, at_index: 0, length: size) + "#" <> rest if size == 0 -> parse_fragment(rest, pieces) + "#" <> rest -> { + let scheme = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, scheme: Some(scheme)) parse_fragment(rest, pieces) } // A colon marks the end of a uri scheme, but if it is not preceded by any // character then it's not a valid URI. - Ok(#(":", _)) if size == 0 -> Error(Nil) - Ok(#(":", rest)) -> { - let scheme = string.slice(original, at_index: 0, length: size) + ":" <> _ if size == 0 -> Error(Nil) + ":" <> rest -> { + let scheme = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, scheme: Some(scheme)) parse_authority_with_slashes(rest, pieces) } + // If we could get to the end of the string and we've met no special + // chars whatsoever, that means the entire string is just a long path. + "" -> Ok(UriPieces(..pieces, path: original)) + // In all other cases the first character is just a valid URI scheme // character and we just keep munching characters until we reach the end of // the uri scheme (or the end of the string and that would mean this is not // a valid uri scheme since we found no `:`). - Ok(#(_, rest)) -> parse_scheme_loop(original, rest, pieces, size + 1) - - // If we could get to the end of the string and we've met no special - // chars whatsoever, that means the entire string is just a long path. - Error(_) -> Ok(UriPieces(..pieces, path: original)) + _ -> { + let #(_, rest) = pop_codeunit(uri_string) + parse_scheme_loop(original, rest, pieces, size + 1) + } } } @@ -191,36 +192,38 @@ fn parse_authority_with_slashes_loop( pieces: UriPieces, size: Int, ) -> Result(UriPieces, Nil) { - case string.pop_grapheme(uri_string) { + case uri_string { // `/` marks the beginning of a path. - Ok(#("/", _)) -> { - let authority = string.slice(original, at_index: 0, length: size) + "/" <> _ -> { + let authority = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, authority_with_slashes: Some(authority)) parse_path(uri_string, pieces) } // `?` marks the beginning of the query with question mark. - Ok(#("?", _)) -> { - let authority = string.slice(original, at_index: 0, length: size) + "?" <> _ -> { + let authority = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, authority_with_slashes: Some(authority)) parse_query_with_question_mark(uri_string, pieces) } // `#` marks the beginning of the fragment part. - Ok(#("#", rest)) -> { - let authority = string.slice(original, at_index: 0, length: size) + "#" <> rest -> { + let authority = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, authority_with_slashes: Some(authority)) parse_fragment(rest, pieces) } + // If the string is over that means the entirety of the string was the + // authority and it has an empty path, query and fragment. + "" -> Ok(UriPieces(..pieces, authority_with_slashes: Some(original))) + // In all other cases the character is allowed to be part of the authority // so we just keep munching until we reach to its end. - Ok(#(_, rest)) -> + _ -> { + let #(_, rest) = pop_codeunit(uri_string) parse_authority_with_slashes_loop(original, rest, pieces, size + 1) - - // If the string is over that means the entirety of the string was the - // authority and it has an empty path, query and fragment. - Error(_) -> Ok(UriPieces(..pieces, authority_with_slashes: Some(original))) + } } } @@ -234,28 +237,31 @@ fn parse_path_loop( pieces: UriPieces, size: Int, ) -> Result(UriPieces, Nil) { - case string.pop_grapheme(uri_string) { + case uri_string { // `?` marks the beginning of the query with question mark. - Ok(#("?", _)) -> { - let path = string.slice(original, at_index: 0, length: size) + "?" <> _ -> { + let path = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, path: path) parse_query_with_question_mark(uri_string, pieces) } // `#` marks the beginning of the fragment part. - Ok(#("#", rest)) -> { - let path = string.slice(original, at_index: 0, length: size) + "#" <> rest -> { + let path = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, path: path) parse_fragment(rest, pieces) } - // In all other cases the character is allowed to be part of the path so we - // just keep munching until we reach to its end. - Ok(#(_, rest)) -> parse_path_loop(original, rest, pieces, size + 1) - // If the string is over that means the entirety of the string was the path // and it has an empty query and fragment. - Error(_) -> Ok(UriPieces(..pieces, path: original)) + "" -> Ok(UriPieces(..pieces, path: original)) + + // In all other cases the character is allowed to be part of the path so we + // just keep munching until we reach to its end. + _ -> { + let #(_, rest) = pop_codeunit(uri_string) + parse_path_loop(original, rest, pieces, size + 1) + } } } @@ -272,23 +278,24 @@ fn parse_query_with_question_mark_loop( pieces: UriPieces, size: Int, ) -> Result(UriPieces, Nil) { - case string.pop_grapheme(uri_string) { + case uri_string { // `#` marks the beginning of the fragment part. - Ok(#("#", rest)) -> { - let query = string.slice(original, at_index: 0, length: size) + "#" <> rest -> { + let query = codeunit_slice(original, at_index: 0, length: size) let pieces = UriPieces(..pieces, query_with_question_mark: Some(query)) parse_fragment(rest, pieces) } + // If the string is over that means the entirety of the string was the query + // and it has an empty fragment. + "" -> Ok(UriPieces(..pieces, query_with_question_mark: Some(original))) + // In all other cases the character is allowed to be part of the query so we // just keep munching until we reach to its end. - Ok(#(_, rest)) -> + _ -> { + let #(_, rest) = pop_codeunit(uri_string) parse_query_with_question_mark_loop(original, rest, pieces, size + 1) - - // If the string is over that means the entirety of the string was the query - // and it has an empty fragment. - Error(_) -> - Ok(UriPieces(..pieces, query_with_question_mark: Some(original))) + } } } @@ -300,8 +307,8 @@ fn noneify_query(x: Option(String)) -> Option(String) { case x { None -> None Some(x) -> - case string.pop_grapheme(x) { - Ok(#("?", query)) -> Some(query) + case x { + "?" <> query -> Some(query) _ -> None } } @@ -346,27 +353,30 @@ fn parse_authority_pieces(string: String) -> AuthorityPieces { } fn parse_userinfo_loop(original, string, pieces, size) { - case string.pop_grapheme(string) { - Error(_) -> parse_host(original, pieces) + case string { + "" -> parse_host(original, pieces) - Ok(#("@", rest)) -> { - let userinfo = string.slice(original, at_index: 0, length: size) + "@" <> rest -> { + let userinfo = codeunit_slice(original, at_index: 0, length: size) let pieces = AuthorityPieces(..pieces, userinfo: Some(userinfo)) parse_host(rest, pieces) } - Ok(#(_, rest)) -> parse_userinfo_loop(original, rest, pieces, size + 1) + _ -> { + let #(_, rest) = pop_codeunit(string) + parse_userinfo_loop(original, rest, pieces, size + 1) + } } } fn parse_host(string, pieces) { - case string.pop_grapheme(string) { - Ok(#("[", _)) -> parse_host_within_brackets(string, pieces) - Ok(#(":", rest)) -> { + case string { + "[" <> _ -> parse_host_within_brackets(string, pieces) + ":" <> rest -> { let pieces = AuthorityPieces(..pieces, host: Some("")) parse_port(rest, pieces) } - Ok(#(_, _)) -> parse_host_outside_of_brackets(string, pieces) - Error(_) -> AuthorityPieces(..pieces, host: Some("")) + "" -> AuthorityPieces(..pieces, host: Some("")) + _ -> parse_host_outside_of_brackets(string, pieces) } } @@ -375,51 +385,54 @@ fn parse_host_within_brackets(string, pieces) { } fn parse_host_within_brackets_loop(original, string, pieces, size) { - case string.pop_grapheme(string) { - Error(_) -> AuthorityPieces(..pieces, host: Some(string)) - Ok(#("]", rest)) -> { - let host = string.slice(original, at_index: 0, length: size + 1) + case string { + "" -> AuthorityPieces(..pieces, host: Some(string)) + "]" <> rest -> { + let host = codeunit_slice(original, at_index: 0, length: size + 1) let pieces = AuthorityPieces(..pieces, host: Some(host)) parse_port(rest, pieces) } - Ok(#(char, rest)) -> + _ -> { + let #(char, rest) = pop_codeunit(string) case is_valid_host_withing_brackets_char(char) { True -> parse_host_within_brackets_loop(original, rest, pieces, size + 1) False -> parse_host_outside_of_brackets_loop(original, rest, pieces, size + 1) } + } } } -fn is_valid_host_withing_brackets_char(char: String) -> Bool { - case char { - "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" -> True - "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" -> True - "w" | "x" | "y" | "z" -> True - "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" -> True - "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" -> True - "W" | "X" | "Y" | "Z" -> True - "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" -> True - "." | ":" -> True - _ -> False - } +fn is_valid_host_withing_brackets_char(char: Int) -> Bool { + // [0-9] + { 48 >= char && char <= 57 } + // [A-Z] + || { 65 >= char && char <= 90 } + // [a-z] + || { 97 >= char && char <= 122 } + // : + || char == 58 + // . + || char == 46 } fn parse_host_outside_of_brackets(string, pieces) { parse_host_outside_of_brackets_loop(string, string, pieces, 0) } -fn parse_host_outside_of_brackets_loop(original, string, pieces, size) { - case string.pop_grapheme(string) { - Error(_) -> AuthorityPieces(..pieces, host: Some(original)) - Ok(#(":", rest)) -> { - let host = string.slice(original, at_index: 0, length: size) +fn parse_host_outside_of_brackets_loop(original, str, pieces, size) { + case str { + "" -> AuthorityPieces(..pieces, host: Some(original)) + ":" <> rest -> { + let host = codeunit_slice(original, at_index: 0, length: size) let pieces = AuthorityPieces(..pieces, host: Some(host)) parse_port(rest, pieces) } - Ok(#(_, rest)) -> + _ -> { + let #(_, rest) = pop_codeunit(str) parse_host_outside_of_brackets_loop(original, rest, pieces, size + 1) + } } } @@ -430,6 +443,19 @@ fn parse_port(string, pieces) { } } +// WARN: this function returns invalid strings! +// We need to return a String anyways to have this as the representation on the +// JavaScript target. +// Alternatively, we could rewrite the entire code to use a single +// `fold_codeunits`-style loop and a state machine. +@external(erlang, "gleam_stdlib", "string_pop_codeunit") +@external(javascript, "../gleam_stdlib.mjs", "pop_codeunit") +fn pop_codeunit(str: String) -> #(Int, String) + +@external(erlang, "binary", "part") +@external(javascript, "../gleam_stdlib.mjs", "string_codeunit_slice") +fn codeunit_slice(str: String, at_index from: Int, length length: Int) -> String + fn extra_required(list: List(a), remaining: Int) -> Int { case list { _ if remaining == 0 -> 0 diff --git a/src/gleam_stdlib.erl b/src/gleam_stdlib.erl index 562ef23..ffea232 100644 --- a/src/gleam_stdlib.erl +++ b/src/gleam_stdlib.erl @@ -3,18 +3,19 @@ -export([ map_get/2, iodata_append/2, identity/1, decode_int/1, decode_bool/1, decode_float/1, decode_list/1, decode_option/2, decode_field/2, parse_int/1, - parse_float/1, less_than/2, string_pop_grapheme/1, string_starts_with/2, - wrap_list/1, string_ends_with/2, string_pad/4, decode_map/1, uri_parse/1, - bit_array_int_to_u32/1, bit_array_int_from_u32/1, decode_result/1, - bit_array_slice/3, decode_bit_array/1, compile_regex/2, regex_scan/2, - percent_encode/1, percent_decode/1, regex_check/2, regex_split/2, - base_decode64/1, parse_query/1, bit_array_concat/1, - bit_array_base64_encode/2, size_of_tuple/1, - decode_tuple/1, decode_tuple2/1, decode_tuple3/1, decode_tuple4/1, - decode_tuple5/1, decode_tuple6/1, tuple_get/2, classify_dynamic/1, print/1, - println/1, print_error/1, println_error/1, inspect/1, float_to_string/1, - int_from_base_string/2, utf_codepoint_list_to_string/1, contains_string/2, - crop_string/2, base16_decode/1, string_replace/3, regex_replace/3, slice/3, bit_array_to_int_and_size/1 + parse_float/1, less_than/2, string_pop_grapheme/1, string_pop_codeunit/1, + string_starts_with/2, wrap_list/1, string_ends_with/2, string_pad/4, + decode_map/1, uri_parse/1, bit_array_int_to_u32/1, bit_array_int_from_u32/1, + decode_result/1, bit_array_slice/3, decode_bit_array/1, compile_regex/2, + regex_scan/2, percent_encode/1, percent_decode/1, regex_check/2, + regex_split/2, base_decode64/1, parse_query/1, bit_array_concat/1, + bit_array_base64_encode/2, size_of_tuple/1, decode_tuple/1, decode_tuple2/1, + decode_tuple3/1, decode_tuple4/1, decode_tuple5/1, decode_tuple6/1, + tuple_get/2, classify_dynamic/1, print/1, println/1, print_error/1, + println_error/1, inspect/1, float_to_string/1, int_from_base_string/2, + utf_codepoint_list_to_string/1, contains_string/2, crop_string/2, + base16_decode/1, string_replace/3, regex_replace/3, slice/3, + bit_array_to_int_and_size/1 ]). %% Taken from OTP's uri_string module @@ -203,6 +204,9 @@ string_pop_grapheme(String) -> _ -> {error, nil} end. +string_pop_codeunit(<<Cp/integer, Rest/binary>>) -> {Cp, Rest}; +string_pop_codeunit(Binary) -> {0, Binary}. + bit_array_concat(BitArrays) -> list_to_bitstring(BitArrays). diff --git a/src/gleam_stdlib.mjs b/src/gleam_stdlib.mjs index a70309e..46bc72a 100644 --- a/src/gleam_stdlib.mjs +++ b/src/gleam_stdlib.mjs @@ -185,6 +185,10 @@ export function pop_grapheme(string) { } } +export function pop_codeunit(str) { + return [str.charCodeAt(0)|0, str.slice(1)] +} + export function lowercase(string) { return string.toLowerCase(); } @@ -256,6 +260,9 @@ export function string_slice(string, idx, len) { } } +export function string_codeunit_slice(str, from, length) { + return str.slice(from, from + length) +} export function crop_string(string, substring) { return string.substring(string.indexOf(substring)); } @@ -1004,3 +1011,4 @@ export function bit_array_starts_with(bits, prefix) { return true; } + |