diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/url.ml | 149 | 
1 files changed, 149 insertions, 0 deletions
| diff --git a/src/url.ml b/src/url.ml new file mode 100644 index 0000000..5d36dae --- /dev/null +++ b/src/url.ml @@ -0,0 +1,149 @@ +(* Extract and validate the scheme part. +   As per the RFC: + +  Scheme names consist of a sequence of characters. The lower case +  letters "a"--"z", digits, and the characters plus ("+"), period +  ("."), and hyphen ("-") are allowed. For resiliency, programs +  interpreting URLs should treat upper case letters as equivalent to +  lower case in scheme names (e.g., allow "HTTP" as well as "http"). + *) +let split_scheme url = +  let aux url = +    let res = Pcre.exec ~pat:{|^([a-zA-Z0-9\.\-]+):(.*)$|} url in +    let scheme = Pcre.get_substring res 1 in +    let uri = Pcre.get_substring res 2 in +    (String.lowercase_ascii scheme, uri) +  in +  try Ok (aux url) +  with Not_found -> Error (Printf.sprintf {|"%s" is not a valid URL|} url) + +let is_scheme_allowed allowed_schemes scheme = +  match List.find_opt ((=) scheme) allowed_schemes with +  | Some _ -> Ok () +  | None -> Error (Printf.sprintf {|URL scheme "%s:" is not allowed|} scheme) + +let regex_matches regex s = +  try +    let _ = Pcre.exec ~rex:regex s in +    true +  with Not_found -> false + +let host_path_format = +  Pcre.regexp +  {|^//(?:[a-zA-Z0-9\-\._~]+(?::[a-zA-Z0-9\-\._~]*)?@)?([a-zA-Z0-9\-\._~]+|\[[a-zA-Z0-9:\.]+\])(?::([0-9]+))?(/.*)?$|} + +let host_name_format = Pcre.regexp {|^[a-zA-Z0-9]+([\-\._~]{1}[a-zA-Z0-9]+)*$|} +let ipv4_addr_format = Pcre.regexp {|^(([1-9]\d{0,2}|0)\.){3}([1-9]\d{0,2}|0)$|} +let ipv6_addr_format = Pcre.regexp {|^\[([a-z0-9:\.]+|[A-Z0-9:\.]+)\]$|} + +let is_port s = +  try +    let n = int_of_string s in +    if n > 0 && n < 65536 then true +    else false +  with Failure _ -> false + +let is_ipv4_octet s = +  try +    let n = int_of_string s in +    if n >= 0 && n < 256 then true +    else false +  with Failure _ -> false + +let is_ipv6_segment s = +  try +    let n = int_of_string ("0x" ^ s) in +    if n >= 0 && n < 65536 then true +    else false +  with Failure _ -> false + +let is_ipv4_addr s = +  let res = Pcre.exec ~rex:ipv4_addr_format s in +  let ipv4_addr_str = Pcre.get_substring res 0 in +  let ipv4_addr_l = String.split_on_char '.' ipv4_addr_str in +  List.for_all is_ipv4_octet ipv4_addr_l + +let is_ipv6_pure_addr s = +  let ipv6_addr_l = String.split_on_char ':' s in +  if List.length ipv6_addr_l > 8 || List.length ipv6_addr_l < 3 then false +  else +    let seg_str_l = List.filter (fun s -> String.length s > 0) ipv6_addr_l in +    List.for_all is_ipv6_segment seg_str_l + +let is_ipv6_dual_addr s = +  let ipv6_addr_l = List.rev (String.split_on_char ':' s) in +  match ipv6_addr_l with +  | [] -> false +  | h::t -> +      if not (is_ipv4_addr h) then false +      else +        if List.length t > 6 || List.length t < 2 then false +        else +          let seg_str_l = List.filter (fun s -> String.length s > 0) t in +          List.for_all is_ipv6_segment seg_str_l + +let is_ipv6_addr s = +  let res = Pcre.exec ~rex:ipv6_addr_format s in +  let ipv6_addr_str = Pcre.get_substring res 1 in +  try +    let typo = Pcre.exec ~pat:{|:::|} ipv6_addr_str in +    match typo with +    | _ -> false +  with Not_found -> +    is_ipv6_pure_addr ipv6_addr_str || is_ipv6_dual_addr ipv6_addr_str + +let host_path_matches s = +  try +    let res = Pcre.exec ~rex:host_path_format s in +    let substr = Pcre.get_substrings ~full_match:false res in +    let port_str = Array.get substr 1 in +    if String.length port_str > 0 && not (is_port port_str) then false +    else +      let host = Array.get substr 0 in +      match host with +      | host when regex_matches ipv6_addr_format host -> is_ipv6_addr host +      | host when regex_matches ipv4_addr_format host -> is_ipv4_addr host +      | host when regex_matches host_name_format host -> true +      | _ -> false +  with Not_found -> false; + +let validate_uri scheme uri = +  if host_path_matches uri then Ok () +  else Error (Printf.sprintf {|"%s" is not a valid URI for the %s URL scheme|} uri scheme) + +let validate_url allowed_schemes url = +  let (let*) = Result.bind in +  let* scheme, uri = split_scheme url in +  let* () = is_scheme_allowed allowed_schemes scheme in +  let* () = validate_uri scheme uri in +  Ok () + +let file_transport_schemes = ["http"; "https"; "ftp"; "sftp"; "scp"; "tftp"] + +let message_schemes = ["mailto"; "tel"; "sms"] + +let allowed_schemes = ref [] +let url = ref "" + +let args = [ +    ("--scheme", +     Arg.String (fun s -> allowed_schemes := s :: !allowed_schemes), +     "Allow only specified schemes"); +    ("--file-transport", +     Arg.Unit (fun () -> allowed_schemes := (List.append !allowed_schemes file_transport_schemes)), +     "Allow only file transport protocols (HTTP/S, FTP, SCP/SFTP, TFTP)"); +    ("--", Arg.Rest (fun s -> url := s), "Interpret next item as an argument"); +] + +let usage = Printf.sprintf "Usage: %s [OPTIONS] <URL>" Sys.argv.(0) + +let () = +  let () = Arg.parse args (fun s -> url := s) usage in +  (* Force all allowed scheme named to lowercase for ease of comparison. *) +  let allowed_schemes = List.map String.lowercase_ascii !allowed_schemes in +  let res = validate_url allowed_schemes !url in +  match res with +  | Ok () -> () +  | Error msg -> +    let () = Printf.fprintf stdout "%s" msg in +    exit 1 | 
