diff options
author | Louis Pilfold <louis@lpil.uk> | 2021-09-04 18:11:04 +0100 |
---|---|---|
committer | Louis Pilfold <louis@lpil.uk> | 2021-09-04 18:11:04 +0100 |
commit | a54b1986daf44ebfd941849584175215306ad6d9 (patch) | |
tree | 586b723fe3bdebb0e4dfca7cc45778351a822f3a | |
parent | 3477ae008a46ffca6c79ee4f9ad8ec876d25ce31 (diff) | |
download | gleam_stdlib-a54b1986daf44ebfd941849584175215306ad6d9.tar.gz gleam_stdlib-a54b1986daf44ebfd941849584175215306ad6d9.zip |
Gleam URI parsing
-rw-r--r-- | src/gleam/uri.gleam | 224 | ||||
-rw-r--r-- | test/gleam/uri_test.gleam | 368 |
2 files changed, 443 insertions, 149 deletions
diff --git a/src/gleam/uri.gleam b/src/gleam/uri.gleam index a757b8b..e67837e 100644 --- a/src/gleam/uri.gleam +++ b/src/gleam/uri.gleam @@ -7,19 +7,20 @@ //// Query encoding (Form encoding) is defined in the w3c specification. //// https://www.w3.org/TR/html52/sec-forms.html#urlencoded-form-data -import gleam/option.{None, Option, Some} -import gleam/string -import gleam/int -import gleam/list - if erlang { - import gleam/result import gleam/dynamic.{Dynamic} - import gleam/map - import gleam/function - import gleam/pair } +import gleam/function +import gleam/int +import gleam/list +import gleam/map +import gleam/option.{None, Option, Some} +import gleam/pair +import gleam/regex +import gleam/result +import gleam/string + /// Type representing holding the parsed components of an URI. /// All components of a URI are optional, except the path. /// @@ -35,56 +36,161 @@ pub type Uri { ) } -if erlang { - pub external fn erl_parse(String) -> Dynamic = - "uri_string" "parse" +// Special thanks to Elixir for this algorithm +// +/// Parses a compliant URI string into the `Uri` Type. +/// If the string is not a valid URI string then an error is returned. +/// +/// The opposite operation is `uri.to_string` +/// +/// ## Examples +/// +/// ``` +/// > parse("https://example.com:1234/a/b?query=true#fragment") +/// +/// Ok(Uri(scheme: Some("https"), ...)) +/// ``` +/// +pub fn parse(uri_string: String) -> Uri { + // From https://tools.ietf.org/html/rfc3986#appendix-B + let pattern = + // 12 3 4 5 6 7 8 + "^(([a-z][a-z0-9\\+\\-\\.]*):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#.*)?" + let matches = + pattern + |> regex_submatches(uri_string) + |> pad_list(8) - type UriKey { - Scheme - Userinfo - Host - Port - Path - Query - Fragment + let #(scheme, authority, path, query, fragment) = case matches { + [ + _scheme_with_colon, + scheme, + authority_with_slashes, + _authority, + path, + query_with_question_mark, + _query, + fragment, + ] -> #( + scheme, + authority_with_slashes, + path, + query_with_question_mark, + fragment, + ) + _ -> #(None, None, None, None, None) } - /// Parses a compliant URI string into the `Uri` Type. - /// If the string is not a valid URI string then an error is returned. - /// - /// The opposite operation is `uri.to_string` - /// - /// ## Examples - /// - /// ``` - /// > parse("https://example.com:1234/a/b?query=true#fragment") - /// - /// Ok(Uri(scheme: Some("https"), ...)) - /// ``` - /// - pub fn parse(string: String) -> Result(Uri, Nil) { - try uri_map = - dynamic.map(erl_parse(string)) - |> result.nil_error - let get = fn(k: UriKey, decode_type: dynamic.Decoder(t)) -> Option(t) { - uri_map - |> map.get(dynamic.from(k)) - |> result.then(function.compose(decode_type, result.nil_error)) - |> option.from_result + let scheme = noneify_empty_string(scheme) + let path = option.unwrap(path, "") + let query = noneify_query(query) + let #(userinfo, host, port) = split_authority(authority) + let fragment = + fragment + |> option.to_result(Nil) + |> result.then(string.pop_grapheme) + |> result.map(pair.second) + |> option.from_result + let port = case port { + None -> + case scheme { + Some("ftp") -> Some(21) + Some("sftp") -> Some(22) + Some("tftp") -> Some(69) + Some("http") -> Some(80) + Some("https") -> Some(443) + Some("ldap") -> Some(389) + _ -> None + } + _ -> port + } + let scheme = + scheme + |> noneify_empty_string + |> option.map(string.lowercase) + Uri( + scheme: scheme, + userinfo: userinfo, + host: host, + port: port, + path: path, + query: query, + fragment: fragment, + ) +} + +fn regex_submatches(pattern: String, string: String) -> List(Option(String)) { + pattern + |> regex.compile(regex.Options(case_insensitive: True, multi_line: False)) + |> result.nil_error + |> result.map(regex.scan(_, string)) + |> result.then(list.head) + |> result.map(fn(m: regex.Match) { m.submatches }) + |> result.unwrap([]) +} + +fn noneify_query(x: Option(String)) -> Option(String) { + case x { + None -> None + Some(x) -> + case string.pop_grapheme(x) { + Ok(#("?", query)) -> Some(query) + _ -> None + } + } +} + +fn noneify_empty_string(x: Option(String)) -> Option(String) { + case x { + Some("") | None -> None + Some(_) -> x + } +} + +// Split an authority into its userinfo, host and port parts. +fn split_authority( + authority: Option(String), +) -> #(Option(String), Option(String), Option(Int)) { + case option.unwrap(authority, "") { + "" -> #(None, None, None) + "//" -> #(None, Some(""), None) + authority -> { + let matches = + "^(//)?((.*)@)?(\\[[a-zA-Z0-9:.]*\\]|[^:]*)(:(\\d*))?" + |> regex_submatches(authority) + |> pad_list(6) + case matches { + [_, _, userinfo, host, _, port] -> { + let userinfo = noneify_empty_string(userinfo) + let host = noneify_empty_string(host) + let port = + port + |> option.unwrap("") + |> int.parse + |> option.from_result + #(userinfo, host, port) + } + _ -> #(None, None, None) + } } + } +} + +fn pad_list(list: List(Option(a)), size: Int) -> List(Option(a)) { + list + |> list.append(list.repeat(None, extra_required(list, size))) +} - let uri = - Uri( - scheme: get(Scheme, dynamic.string), - userinfo: get(Userinfo, dynamic.string), - host: get(Host, dynamic.string), - port: get(Port, dynamic.int), - path: option.unwrap(get(Path, dynamic.string), ""), - query: get(Query, dynamic.string), - fragment: get(Fragment, dynamic.string), - ) - Ok(uri) +fn extra_required(list: List(a), remaining: Int) -> Int { + case list { + _ if remaining == 0 -> 0 + [] -> remaining + [_, ..xs] -> extra_required(xs, remaining - 1) } +} + +if erlang { + import gleam/io external fn erl_parse_query(String) -> Dynamic = "uri_string" "dissect_query" @@ -284,7 +390,15 @@ pub fn to_string(uri: Uri) -> String { pub fn origin(uri: Uri) -> Result(String, Nil) { let Uri(scheme: scheme, host: host, port: port, ..) = uri case scheme { - Some("https") | Some("http") -> { + Some("https") if port == Some(443) -> { + let origin = Uri(scheme, None, host, None, "", None, None) + Ok(to_string(origin)) + } + Some("http") if port == Some(80) -> { + let origin = Uri(scheme, None, host, None, "", None, None) + Ok(to_string(origin)) + } + Some(s) if s == "http" || s == "https" -> { let origin = Uri(scheme, None, host, port, "", None, None) Ok(to_string(origin)) } @@ -318,7 +432,7 @@ pub fn merge(base: Uri, relative: Uri) -> Result(Uri, Nil) { option.or(relative.scheme, base.scheme), None, relative.host, - relative.port, + option.or(relative.port, base.port), path, relative.query, relative.fragment, diff --git a/test/gleam/uri_test.gleam b/test/gleam/uri_test.gleam index f866e96..e184c94 100644 --- a/test/gleam/uri_test.gleam +++ b/test/gleam/uri_test.gleam @@ -1,48 +1,188 @@ +// TODO: IPv6 URI parse tests +// https://github.com/elixir-lang/elixir/blob/2d43b9670f54c4d8e0be1ee4d2ee8f99d7378480/lib/elixir/test/elixir/uri_test.exs import gleam/uri import gleam/should import gleam/option.{None, Some} +import gleam/string +import gleam/list + +pub fn full_parse_test() { + let parsed = + uri.parse("https://weebl:bob@example.com:1234/path?query=true#fragment") + should.equal(parsed.scheme, Some("https")) + should.equal(parsed.userinfo, Some("weebl:bob")) + should.equal(parsed.host, Some("example.com")) + should.equal(parsed.port, Some(1234)) + should.equal(parsed.path, "/path") + should.equal(parsed.query, Some("query=true")) + should.equal(parsed.fragment, Some("fragment")) +} -if erlang { - import gleam/string - import gleam/list - - pub fn full_parse_test() { - assert Ok(parsed) = - uri.parse("https://weebl:bob@example.com:1234/path?query=true#fragment") - should.equal(parsed.scheme, Some("https")) - should.equal(parsed.userinfo, Some("weebl:bob")) - should.equal(parsed.host, Some("example.com")) - should.equal(parsed.port, Some(1234)) - should.equal(parsed.path, "/path") - should.equal(parsed.query, Some("query=true")) - should.equal(parsed.fragment, Some("fragment")) - } - - pub fn parse_only_path_test() { - assert Ok(parsed) = uri.parse("") - should.equal(parsed.scheme, None) - should.equal(parsed.userinfo, None) - should.equal(parsed.host, None) - should.equal(parsed.port, None) - should.equal(parsed.path, "") - should.equal(parsed.query, None) - should.equal(parsed.fragment, None) - } - - pub fn parse_only_host_test() { - assert Ok(parsed) = uri.parse("//") - should.equal(parsed.scheme, None) - should.equal(parsed.userinfo, None) - should.equal(parsed.host, Some("")) - should.equal(parsed.port, None) - should.equal(parsed.path, "") - should.equal(parsed.query, None) - should.equal(parsed.fragment, None) - } - - pub fn error_parsing_uri_test() { - should.equal(uri.parse("::"), Error(Nil)) - } +pub fn parse_only_path_test() { + let parsed = uri.parse("") + should.equal(parsed.scheme, None) + should.equal(parsed.userinfo, None) + should.equal(parsed.host, None) + should.equal(parsed.port, None) + should.equal(parsed.path, "") + should.equal(parsed.query, None) + should.equal(parsed.fragment, None) +} + +pub fn parse_only_host_test() { + let parsed = uri.parse("//") + should.equal(parsed.scheme, None) + should.equal(parsed.userinfo, None) + should.equal(parsed.host, Some("")) + should.equal(parsed.port, None) + should.equal(parsed.path, "") + should.equal(parsed.query, None) + should.equal(parsed.fragment, None) +} + +pub fn colon_uri_test() { + let parsed = uri.parse("::") + should.equal(parsed.scheme, None) + should.equal(parsed.userinfo, None) + should.equal(parsed.host, None) + should.equal(parsed.port, None) + should.equal(parsed.path, "::") + should.equal(parsed.query, None) + should.equal(parsed.fragment, None) +} + +pub fn parse_scheme_test() { + uri.parse("http://one.com/path/to/something?one=two&two=one#fragment") + |> should.equal(uri.Uri( + scheme: Some("http"), + host: Some("one.com"), + path: "/path/to/something", + query: Some("one=two&two=one"), + fragment: Some("fragment"), + port: Some(80), + userinfo: None, + )) +} + +pub fn parse_https_scheme_test() { + uri.parse("https://foo.com") + |> should.equal(uri.Uri( + scheme: Some("https"), + host: Some("foo.com"), + path: "", + query: None, + fragment: None, + port: Some(443), + userinfo: None, + )) +} + +pub fn parse_file_scheme_test() { + uri.parse("file:///one/two/three") + |> should.equal(uri.Uri( + scheme: Some("file"), + host: Some(""), + path: "/one/two/three", + query: None, + fragment: None, + port: None, + userinfo: None, + )) +} + +pub fn parse_ftp_scheme_test() { + "ftp://user001:password@private.ftp-server.example.com/my_directory/my_file.txt" + |> uri.parse + |> should.equal(uri.Uri( + scheme: Some("ftp"), + host: Some("private.ftp-server.example.com"), + userinfo: Some("user001:password"), + path: "/my_directory/my_file.txt", + query: None, + fragment: None, + port: Some(21), + )) +} + +pub fn parse_sftp_scheme_test() { + "sftp://user001:password@private.ftp-server.example.com/my_directory/my_file.txt" + |> uri.parse + |> should.equal(uri.Uri( + scheme: Some("sftp"), + host: Some("private.ftp-server.example.com"), + userinfo: Some("user001:password"), + path: "/my_directory/my_file.txt", + query: None, + fragment: None, + port: Some(22), + )) +} + +pub fn parse_tftp_scheme_test() { + "tftp://user001:password@private.ftp-server.example.com/my_directory/my_file.txt" + |> uri.parse + |> should.equal(uri.Uri( + scheme: Some("tftp"), + host: Some("private.ftp-server.example.com"), + userinfo: Some("user001:password"), + path: "/my_directory/my_file.txt", + query: None, + fragment: None, + port: Some(69), + )) +} + +pub fn parse_ldap_scheme_test() { + "ldap:///dc=example,dc=com??sub?(givenName=John)" + |> uri.parse + |> should.equal(uri.Uri( + scheme: Some("ldap"), + host: Some(""), + userinfo: None, + path: "/dc=example,dc=com", + query: Some("?sub?(givenName=John)"), + fragment: None, + port: Some(389), + )) +} + +pub fn parse_ldap_2_scheme_test() { + "ldap://ldap.example.com/cn=John%20Doe,dc=foo,dc=com" + |> uri.parse + |> should.equal(uri.Uri( + scheme: Some("ldap"), + host: Some("ldap.example.com"), + userinfo: None, + path: "/cn=John%20Doe,dc=foo,dc=com", + query: None, + fragment: None, + port: Some(389), + )) +} + +pub fn parse_bad_uris_test() { + uri.parse("") + uri.parse("https:??@?F?@#>F//23/") + assert ":https" = uri.parse(":https").path + assert "https" = uri.parse("https").path +} + +pub fn parse_downcases_scheme() { + let uri = uri.parse("HTTPS://EXAMPLE.COM") + assert Some("https") = uri.scheme + assert Some("EXAMPLE.COM") = uri.host +} + +pub fn parse_empty_fragments_test() { + assert Some("") = uri.parse("http://example.com#").fragment + assert Some("") = uri.parse("http://example.com/#").fragment + assert Some("") = uri.parse("http://example.com/test#").fragment +} + +pub fn parse_empty_queries_test() { + assert Some("") = uri.parse("http://example.com?").query + assert Some("") = uri.parse("http://example.com/?").query + assert Some("") = uri.parse("http://example.com/test?").query } pub fn full_uri_to_string_test() { @@ -277,110 +417,150 @@ if erlang { should.equal(uri.path_segments("/weebl/../bob"), ["bob"]) } - pub fn origin_test() { - assert Ok(parsed) = uri.parse("http://example.test/path?weebl#bob") + pub fn origin1_test() { + let parsed = uri.parse("http://example.test/path?weebl#bob") uri.origin(parsed) |> should.equal(Ok("http://example.test/")) + } - assert Ok(parsed) = uri.parse("http://example.test:8080") + pub fn origin2_test() { + let parsed = uri.parse("http://example.test:8080") uri.origin(parsed) |> should.equal(Ok("http://example.test:8080/")) + } - assert Ok(parsed) = uri.parse("https://example.test") + pub fn origin3_test() { + let parsed = uri.parse("https://example.test") uri.origin(parsed) |> should.equal(Ok("https://example.test/")) + } - assert Ok(parsed) = uri.parse("http:///path") + pub fn origin4_test() { + let parsed = uri.parse("http:///path") uri.origin(parsed) |> should.equal(Ok("http://")) + } - assert Ok(parsed) = uri.parse("http://") + pub fn origin5_test() { + let parsed = uri.parse("http://") uri.origin(parsed) |> should.equal(Ok("http://")) + } - assert Ok(parsed) = uri.parse("/path") + pub fn origin6_test() { + let parsed = uri.parse("/path") uri.origin(parsed) |> should.equal(Error(Nil)) + } - assert Ok(parsed) = uri.parse("file:///dev/null") + pub fn origin7_test() { + let parsed = uri.parse("file:///dev/null") uri.origin(parsed) |> should.equal(Error(Nil)) } - pub fn merge_test() { - assert Ok(a) = uri.parse("/relative") - assert Ok(b) = uri.parse("") + pub fn merge1_test() { + let a = uri.parse("/relative") + let b = uri.parse("") uri.merge(a, b) |> should.equal(Error(Nil)) + } - assert Ok(a) = uri.parse("http://google.com/weebl") - assert Ok(b) = uri.parse("http://example.com/baz") + pub fn merge2_test() { + let a = uri.parse("http://google.com/weebl") + let b = uri.parse("http://example.com/baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(Ok(uri.parse("http://example.com/baz"))) + } - assert Ok(a) = uri.parse("http://google.com/weebl") - assert Ok(b) = uri.parse("http://example.com/.././bob/../../baz") + pub fn merge3_test() { + let a = uri.parse("http://google.com/weebl") + let b = uri.parse("http://example.com/.././bob/../../baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(Ok(uri.parse("http://example.com/baz"))) + } - assert Ok(a) = uri.parse("http://google.com/weebl") - assert Ok(b) = uri.parse("//example.com/baz") + pub fn merge4_test() { + let a = uri.parse("http://google.com/weebl") + let b = uri.parse("//example.com/baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(Ok(uri.parse("http://example.com/baz"))) + } - assert Ok(a) = uri.parse("http://google.com/weebl") - assert Ok(b) = uri.parse("//example.com/.././bob/../../../baz") + pub fn merge5_test() { + let a = uri.parse("http://google.com/weebl") + let b = uri.parse("//example.com/.././bob/../../../baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(Ok(uri.parse("http://example.com/baz"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/bob") - assert Ok(b) = uri.parse("/baz") + pub fn merge6_test() { + let a = uri.parse("http://example.com/weebl/bob") + let b = uri.parse("/baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(Ok(uri.parse("http://example.com/baz"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/bob") - assert Ok(b) = uri.parse("baz") + pub fn merge7_test() { + let a = uri.parse("http://example.com/weebl/bob") + let b = uri.parse("baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/baz")) + |> should.equal(Ok(uri.parse("http://example.com/weebl/baz"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/") - assert Ok(b) = uri.parse("baz") + pub fn merge8_test() { + let a = uri.parse("http://example.com/weebl/") + let b = uri.parse("baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/baz")) + |> should.equal(Ok(uri.parse("http://example.com/weebl/baz"))) + } - assert Ok(a) = uri.parse("http://example.com") - assert Ok(b) = uri.parse("baz") + pub fn merge9_test() { + let a = uri.parse("http://example.com") + let b = uri.parse("baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(Ok(uri.parse("http://example.com/baz"))) + } - assert Ok(a) = uri.parse("http://example.com") - assert Ok(b) = uri.parse("/.././bob/../../../baz") + pub fn merge10_test() { + let a = uri.parse("http://example.com") + let b = uri.parse("/.././bob/../../../baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(Ok(uri.parse("http://example.com/baz"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/bob") - assert Ok(b) = uri.parse("") + pub fn merge11_test() { + let a = uri.parse("http://example.com/weebl/bob") + let b = uri.parse("") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob")) + |> should.equal(Ok(uri.parse("http://example.com/weebl/bob"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/bob") - assert Ok(b) = uri.parse("#fragment") + pub fn merge12_test() { + let a = uri.parse("http://example.com/weebl/bob") + let b = uri.parse("#fragment") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob#fragment")) + |> should.equal(Ok(uri.parse("http://example.com/weebl/bob#fragment"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/bob") - assert Ok(b) = uri.parse("?query") + pub fn merge13_test() { + let a = uri.parse("http://example.com/weebl/bob") + let b = uri.parse("?query") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob?query")) + |> should.equal(Ok(uri.parse("http://example.com/weebl/bob?query"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/bob?query1") - assert Ok(b) = uri.parse("?query2") + pub fn merge14_test() { + let a = uri.parse("http://example.com/weebl/bob?query1") + let b = uri.parse("?query2") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob?query2")) + |> should.equal(Ok(uri.parse("http://example.com/weebl/bob?query2"))) + } - assert Ok(a) = uri.parse("http://example.com/weebl/bob?query") - assert Ok(b) = uri.parse("") + pub fn merge15_test() { + let a = uri.parse("http://example.com/weebl/bob?query") + let b = uri.parse("") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob?query")) + |> should.equal(Ok(uri.parse("http://example.com/weebl/bob?query"))) } } |