From f7a7471bf927543084e2f3ee8966bc5c0a920906 Mon Sep 17 00:00:00 2001 From: John Estabrook Date: Fri, 8 Mar 2024 14:24:54 -0600 Subject: T6111: Fix modification of String.escaped to leave UTF-8 bytes unescaped The existing fix still passed the string as a whole to Bytes.escaped (as does the standard library function): the exemption of chars with high bit set needs to occur in Bytes.escaped as well. --- src/util.ml | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/util.ml b/src/util.ml index 9c5df23..168589d 100644 --- a/src/util.ml +++ b/src/util.ml @@ -4,6 +4,8 @@ external lex_numeric_compare: string -> string -> int = "caml_lex_numeric_compar external length : string -> int = "%string_length" external unsafe_get : string -> int -> char = "%string_unsafe_get" +external char_code: char -> int = "%identity" +external char_chr: int -> char = "%identity" module B = Bytes @@ -16,13 +18,58 @@ let get_lexing_position lexbuf = let column = p.Lexing.pos_cnum - p.Lexing.pos_bol + 1 in (line_number, column) +(* Modification of Bytes.escaped to leave UTF-8 bytes unescaped *) +let escape_bytes s = + let char_code_zero = 48 in + let high_bit_set = 128 in + let n = ref 0 in + for i = 0 to B.length s - 1 do + n := !n + + (match B.unsafe_get s i with + | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2 + | ' ' .. '~' -> 1 + | c when (char_code c >= high_bit_set) -> 1 + | _ -> 4) + done; + if !n = B.length s then B.copy s else begin + let s' = B.create !n in + n := 0; + for i = 0 to B.length s - 1 do + begin match B.unsafe_get s i with + | ('\"' | '\\') as c -> + B.unsafe_set s' !n '\\'; incr n; B.unsafe_set s' !n c + | '\n' -> + B.unsafe_set s' !n '\\'; incr n; B.unsafe_set s' !n 'n' + | '\t' -> + B.unsafe_set s' !n '\\'; incr n; B.unsafe_set s' !n 't' + | '\r' -> + B.unsafe_set s' !n '\\'; incr n; B.unsafe_set s' !n 'r' + | '\b' -> + B.unsafe_set s' !n '\\'; incr n; B.unsafe_set s' !n 'b' + | (' ' .. '~') as c -> B.unsafe_set s' !n c + | c when (char_code c >= high_bit_set ) -> B.unsafe_set s' !n c + | c -> + let a = char_code c in + B.unsafe_set s' !n '\\'; + incr n; + B.unsafe_set s' !n (char_chr (char_code_zero + a / 100)); + incr n; + B.unsafe_set s' !n (char_chr (char_code_zero + (a / 10) mod 10)); + incr n; + B.unsafe_set s' !n (char_chr (char_code_zero + a mod 10)); + end; + incr n + done; + s' + end + (* Modification of String.escaped to leave UTF-8 bytes unescaped *) let escape_string s = let rec escape_if_needed s n i = if i >= n then s else match unsafe_get s i with | '\"' | '\\' | '\000'..'\031' | '\127' -> - bts (B.escaped (bos s)) + bts (escape_bytes (bos s)) | _ -> escape_if_needed s n (i+1) in escape_if_needed s (length s) 0 -- cgit v1.2.3