summaryrefslogtreecommitdiff
path: root/src/url.ml
blob: 2f3367b735dbf49fbd0a129456e2f15b72947a69 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
(* Extract and validate the scheme part.
   As per the RFC:

  Scheme names consist of a sequence of characters. The lower case
  letters "a"--"z", digits, and the characters plus ("+"), period
  ("."), and hyphen ("-") are allowed. For resiliency, programs
  interpreting URLs should treat upper case letters as equivalent to
  lower case in scheme names (e.g., allow "HTTP" as well as "http").
 *)
let split_scheme url =
  let aux url =
    let res = Pcre.exec ~pat:{|^([a-zA-Z0-9\.\-]+):(.*)$|} url in
    let scheme = Pcre.get_substring res 1 in
    let uri = Pcre.get_substring res 2 in
    (String.lowercase_ascii scheme, uri)
  in
  try Ok (aux url)
  with Not_found -> Error (Printf.sprintf {|"%s" is not a valid URL|} url)

let is_scheme_allowed allowed_schemes scheme =
  match List.find_opt ((=) scheme) allowed_schemes with
  | Some _ -> Ok ()
  | None -> Error (Printf.sprintf {|URL scheme "%s:" is not allowed|} scheme)

let regex_matches regex s =
  try
    let _ = Pcre.exec ~rex:regex s in
    true
  with Not_found -> false

let host_path_format =
  Pcre.regexp
  {|^//(?:[a-zA-Z0-9\-\._~]+(?::[a-zA-Z0-9\-\._~]*)?@)?([a-zA-Z0-9\-\._~]+|\[[a-zA-Z0-9:\.]+\])(?::([0-9]+))?(/.*)?$|}

let host_name_format = Pcre.regexp {|^[a-zA-Z0-9]+([\-\._~]{1}[a-zA-Z0-9]+)*$|}
let ipv4_addr_format = Pcre.regexp {|^(([1-9]\d{0,2}|0)\.){3}([1-9]\d{0,2}|0)$|}
let ipv6_addr_format = Pcre.regexp {|^\[([a-z0-9:\.]+|[A-Z0-9:\.]+)\]$|}

let is_port s =
  try
    let n = int_of_string s in
    if n > 0 && n < 65536 then true
    else false
  with Failure _ -> false

let is_ipv4_octet s =
  try
    let n = int_of_string s in
    if n >= 0 && n < 256 then true
    else false
  with Failure _ -> false

let is_ipv6_segment s =
  try
    let n = int_of_string ("0x" ^ s) in
    if n >= 0 && n < 65536 then true
    else false
  with Failure _ -> false

let is_ipv4_addr s =
  let res = Pcre.exec ~rex:ipv4_addr_format s in
  let ipv4_addr_str = Pcre.get_substring res 0 in
  let ipv4_addr_l = String.split_on_char '.' ipv4_addr_str in
  List.for_all is_ipv4_octet ipv4_addr_l

let is_ipv6_pure_addr s =
  let ipv6_addr_l = String.split_on_char ':' s in
  if List.length ipv6_addr_l > 8 || List.length ipv6_addr_l < 3 then false
  else
    let seg_str_l = List.filter (fun s -> String.length s > 0) ipv6_addr_l in
    List.for_all is_ipv6_segment seg_str_l

let is_ipv6_dual_addr s =
  let ipv6_addr_l = List.rev (String.split_on_char ':' s) in
  match ipv6_addr_l with
  | [] -> false
  | h::t ->
      if not (is_ipv4_addr h) then false
      else
        if List.length t > 6 || List.length t < 2 then false
        else
          let seg_str_l = List.filter (fun s -> String.length s > 0) t in
          List.for_all is_ipv6_segment seg_str_l

let is_ipv6_addr s =
  let res = Pcre.exec ~rex:ipv6_addr_format s in
  let ipv6_addr_str = Pcre.get_substring res 1 in
  try
    let typo = Pcre.exec ~pat:{|:::|} ipv6_addr_str in
    match typo with
    | _ -> false
  with Not_found ->
    is_ipv6_pure_addr ipv6_addr_str || is_ipv6_dual_addr ipv6_addr_str

let host_path_matches s =
  try
    let res = Pcre.exec ~rex:host_path_format s in
    let substr = Pcre.get_substrings ~full_match:false res in
    let port_str = Array.get substr 1 in
    if String.length port_str > 0 && not (is_port port_str) then false
    else
      let host = Array.get substr 0 in
      match host with
      | host when regex_matches ipv6_addr_format host -> is_ipv6_addr host
      | host when regex_matches ipv4_addr_format host -> is_ipv4_addr host
      | host when regex_matches host_name_format host -> true
      | _ -> false
  with Not_found -> false

let validate_uri scheme uri =
  if host_path_matches uri then Ok ()
  else Error (Printf.sprintf {|"%s" is not a valid URI for the %s URL scheme|} uri scheme)

let validate_url allowed_schemes url =
  let (let*) = Result.bind in
  let* scheme, uri = split_scheme url in
  let* () = is_scheme_allowed allowed_schemes scheme in
  let* () = validate_uri scheme uri in
  Ok ()

let file_transport_schemes = ["http"; "https"; "ftp"; "sftp"; "scp"; "tftp"]

let message_schemes = ["mailto"; "tel"; "sms"]

let allowed_schemes = ref []
let url = ref ""

let args = [
    ("--scheme",
     Arg.String (fun s -> allowed_schemes := s :: !allowed_schemes),
     "Allow only specified schemes");
    ("--file-transport",
     Arg.Unit (fun () -> allowed_schemes := (List.append !allowed_schemes file_transport_schemes)),
     "Allow only file transport protocols (HTTP/S, FTP, SCP/SFTP, TFTP)");
    ("--", Arg.Rest (fun s -> url := s), "Interpret next item as an argument");
]

let usage = Printf.sprintf "Usage: %s [OPTIONS] <URL>" Sys.argv.(0)

let () =
  let () = Arg.parse args (fun s -> url := s) usage in
  (* Force all allowed scheme named to lowercase for ease of comparison. *)
  let allowed_schemes = List.map String.lowercase_ascii !allowed_schemes in
  let res = validate_url allowed_schemes !url in
  match res with
  | Ok () -> ()
  | Error msg ->
    let () = Printf.fprintf stdout "%s" msg in
    exit 1