diff options
Diffstat (limited to 'dev-ml')
-rw-r--r-- | dev-ml/markup/files/test.patch | 273 | ||||
-rw-r--r-- | dev-ml/markup/files/uutf.patch | 1085 | ||||
-rw-r--r-- | dev-ml/markup/markup-0.7.2-r1.ebuild (renamed from dev-ml/markup/markup-0.7.2.ebuild) | 11 |
3 files changed, 1366 insertions, 3 deletions
diff --git a/dev-ml/markup/files/test.patch b/dev-ml/markup/files/test.patch new file mode 100644 index 000000000000..f2a525764831 --- /dev/null +++ b/dev-ml/markup/files/test.patch @@ -0,0 +1,273 @@ +Index: markup.ml-0.7.2/test/test_encoding.ml +=================================================================== +--- markup.ml-0.7.2.orig/test/test_encoding.ml ++++ markup.ml-0.7.2/test/test_encoding.ml +@@ -15,9 +15,9 @@ let test_ucs_4 (f : Encoding.t) name s1 + expect_error (1, 2) (`Decoding_error (bad_bytes, name)) + begin fun report -> + let chars = s1 |> string |> f ~report in +- next_option chars ok (assert_equal (Some (Char.code 'f'))); ++ next_option chars ok (assert_equal (Some ((Uchar.of_int (Char.code 'f'))))); + next_option chars ok (assert_equal (Some Uutf.u_rep)); +- next_option chars ok (assert_equal (Some (Char.code 'o'))); ++ next_option chars ok (assert_equal (Some ((Uchar.of_int (Char.code 'o'))))); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None) + end; +@@ -25,9 +25,9 @@ let test_ucs_4 (f : Encoding.t) name s1 + expect_error (2, 2) (`Decoding_error ("\x00\x00\x00", name)) + begin fun report -> + let chars = s2 |> string |> f ~report in +- next_option chars ok (assert_equal (Some (Char.code 'f'))); +- next_option chars ok (assert_equal (Some 0x000A)); +- next_option chars ok (assert_equal (Some (Char.code 'o'))); ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'f')))); ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x000A))); ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'o')))); + next_option chars ok (assert_equal (Some Uutf.u_rep)); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None) +@@ -38,12 +38,12 @@ let tests = [ + let s = "\xef\xbb\xbffoo\xf0\x9f\x90\x99bar\xa0more" in + expect_error (1, 8) (`Decoding_error ("\xa0", "utf-8")) begin fun report -> + let chars = s |> string |> utf_8 ~report in +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'])); +- next_option chars ok (assert_equal (Some 0x1F419)); +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r'])); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'])); ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419))); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r'])); + next_option chars ok (assert_equal (Some Uutf.u_rep)); + next_n 4 chars ok +- (assert_equal (List.map Char.code ['m'; 'o'; 'r'; 'e'])); ++ (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['m'; 'o'; 'r'; 'e'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None) + end); +@@ -53,11 +53,11 @@ let tests = [ + expect_error (1, 6) (`Decoding_error ("\xdc\x19", "utf-16be")) + begin fun report -> + let chars = s |> string |> utf_16be ~report in +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'])); +- next_option chars ok (assert_equal (Some 0x1F419)); +- next_option chars ok (assert_equal (Some (Char.code 'b'))); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'])); ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419))); ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'b')))); + next_option chars ok (assert_equal (Some Uutf.u_rep)); +- next_n 16 chars ok (assert_equal (List.map Char.code ['a'; 'r'])); ++ next_n 16 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['a'; 'r'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None) + end); +@@ -67,11 +67,11 @@ let tests = [ + expect_error (1, 6) (`Decoding_error ("\x19\xdc", "utf-16le")) + begin fun report -> + let chars = s |> string |> utf_16le ~report in +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'])); +- next_option chars ok (assert_equal (Some 0x1F419)); +- next_option chars ok (assert_equal (Some (Char.code 'b'))); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'])); ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419))); ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'b')))); + next_option chars ok (assert_equal (Some Uutf.u_rep)); +- next_n 16 chars ok (assert_equal (List.map Char.code ['a'; 'r'])); ++ next_n 16 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['a'; 'r'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None) + end); +@@ -79,7 +79,7 @@ let tests = [ + ("encoding.iso_8859_1" >:: fun _ -> + let chars = string "foo\xa0" |> iso_8859_1 in + next_n 4 chars +- ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'; '\xa0'])); ++ ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'; '\xa0'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None)); + +@@ -88,26 +88,26 @@ let tests = [ + expect_error (1, 4) (`Decoding_error ("\xa0", "us-ascii")) + begin fun report -> + let chars = s |> string |> us_ascii ~report in +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'])); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'])); + next_option chars ok (assert_equal (Some Uutf.u_rep)); +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r'])); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None) + end); + + ("encoding.windows_1251" >:: fun _ -> + let chars = string "foo\xe0\xe1\xe2bar" |> windows_1251 in +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'])); +- next_n 3 chars ok (assert_equal [0x0430; 0x0431; 0x0432]); +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r'])); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'])); ++ next_n 3 chars ok (assert_equal [Uchar.of_int 0x0430; Uchar.of_int 0x0431; Uchar.of_int 0x0432]); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None)); + + ("encoding.windows_1252" >:: fun _ -> + let chars = string "foo\x80\x83bar" |> windows_1252 in +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'])); +- next_n 2 chars ok (assert_equal [0x20AC; 0x0192]); +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r'])); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'])); ++ next_n 2 chars ok (assert_equal [Uchar.of_int 0x20AC; Uchar.of_int 0x0192]); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None)); + +@@ -137,7 +137,7 @@ let tests = [ + + ("encoding.ebcdic" >:: fun _ -> + let chars = string "\x86\x96\x96" |> ebcdic in +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'])); ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'])); + next_option chars ok (assert_equal None); + next_option chars ok (assert_equal None)); + ] +Index: markup.ml-0.7.2/test/test_html_tokenizer.ml +=================================================================== +--- markup.ml-0.7.2.orig/test/test_html_tokenizer.ml ++++ markup.ml-0.7.2/test/test_html_tokenizer.ml +@@ -134,7 +134,7 @@ let tests = [ + expect "�" + [ 1, 1, E (`Bad_token ("�", + reference, "out of range")); +- 1, 1, S (`Char Uutf.u_rep); ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 35, S `EOF]; + + expect "�" +@@ -142,22 +142,22 @@ let tests = [ + reference, "missing ';' at end")); + 1, 1, E (`Bad_token ("�", + reference, "out of range")); +- 1, 1, S (`Char Uutf.u_rep); ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 34, S `EOF]; + + expect "�" + [ 1, 1, E (`Bad_token ("�", reference, "out of range")); +- 1, 1, S (`Char Uutf.u_rep); ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 9, S `EOF]; + + expect "�" + [ 1, 1, E (`Bad_token ("�", reference, "out of range")); +- 1, 1, S (`Char Uutf.u_rep); ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 11, S `EOF]; + + expect "�" + [ 1, 1, E (`Bad_token ("�", reference, "out of range")); +- 1, 1, S (`Char Uutf.u_rep); ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 5, S `EOF]; + + expect "" +@@ -264,7 +264,7 @@ let tests = [ + expect ~state:`RCDATA "f\x00</foo>" + ([ 1, 1, S (`Char 0x66); + 1, 2, E (`Bad_token ("U+0000", "content", "null")); +- 1, 2, S (`Char Uutf.u_rep)] @ ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @ + (char_sequence ~start:3 "</foo>")); + + expect ~state:`RCDATA "<title>f</title >" +@@ -302,7 +302,7 @@ let tests = [ + expect ~state:`RAWTEXT "f\x00</foo>" + ([ 1, 1, S (`Char 0x66); + 1, 2, E (`Bad_token ("U+0000", "content", "null")); +- 1, 2, S (`Char Uutf.u_rep)] @ ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @ + (char_sequence ~start:3 "</foo>"))); + + ("html.tokenizer.script-data" >:: fun _ -> +@@ -330,7 +330,7 @@ let tests = [ + expect ~state:`Script_data "f<!--o\x00o" + ((char_sequence ~no_eof:true "f<!--o") @ + [1, 7, E (`Bad_token ("U+0000", "script", "null")); +- 1, 7, S (`Char Uutf.u_rep); ++ 1, 7, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 8, S (`Char 0x6F); + 1, 9, E (`Unexpected_eoi "script"); + 1, 9, S `EOF]); +@@ -363,7 +363,7 @@ let tests = [ + expect ~state:`Script_data "f<!--a-\x00-" + ((char_sequence ~no_eof:true "f<!--a-") @ + [ 1, 8, E (`Bad_token ("U+0000", "script", "null")); +- 1, 8, S (`Char Uutf.u_rep); ++ 1, 8, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 9, S (`Char 0x02D); + 1, 10, E (`Unexpected_eoi "script"); + 1, 10, S `EOF]); +@@ -371,7 +371,7 @@ let tests = [ + expect ~state:`Script_data "f<!--a--\x00--" + ((char_sequence ~no_eof:true "f<!--a--") @ + [ 1, 9, E (`Bad_token ("U+0000", "script", "null")); +- 1, 9, S (`Char Uutf.u_rep); ++ 1, 9, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 10, S (`Char 0x02D); + 1, 11, S (`Char 0x02D); + 1, 12, E (`Unexpected_eoi "script"); +@@ -380,14 +380,14 @@ let tests = [ + expect ~state:`Script_data "f<!--<script>\x00" + ((char_sequence ~no_eof:true "f<!--<script>") @ + [ 1, 14, E (`Bad_token ("U+0000", "script", "null")); +- 1, 14, S (`Char Uutf.u_rep); ++ 1, 14, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 15, E (`Unexpected_eoi "script"); + 1, 15, S `EOF]); + + expect ~state:`Script_data "f<!--<script>-\x00-" + ((char_sequence ~no_eof:true "f<!--<script>-") @ + [ 1, 15, E (`Bad_token ("U+0000", "script", "null")); +- 1, 15, S (`Char Uutf.u_rep); ++ 1, 15, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 16, S (`Char 0x2D); + 1, 17, E (`Unexpected_eoi "script"); + 1, 17, S `EOF]); +@@ -395,7 +395,7 @@ let tests = [ + expect ~state:`Script_data "f<!--<script>--\x00--" + ((char_sequence ~no_eof:true "f<!--<script>--") @ + [ 1, 16, E (`Bad_token ("U+0000", "script", "null")); +- 1, 16, S (`Char Uutf.u_rep); ++ 1, 16, S (`Char (Uchar.to_int Uutf.u_rep)); + 1, 17, S (`Char 0x2D); + 1, 18, S (`Char 0x2D); + 1, 19, E (`Unexpected_eoi "script"); +@@ -413,7 +413,7 @@ let tests = [ + expect ~state:`Script_data "f\x00</foo>" + ([ 1, 1, S (`Char 0x66); + 1, 2, E (`Bad_token ("U+0000", "content", "null")); +- 1, 2, S (`Char Uutf.u_rep)] @ ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @ + (char_sequence ~start:3 "</foo>"))); + + ("html.tokenizer.plaintext" >:: fun _ -> +@@ -424,7 +424,7 @@ let tests = [ + expect ~state:`PLAINTEXT "f\x00</foo>" + ([ 1, 1, S (`Char 0x66); + 1, 2, E (`Bad_token ("U+0000", "content", "null")); +- 1, 2, S (`Char Uutf.u_rep)] @ ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @ + (char_sequence ~start:3 "</foo>"))); + + ("html.tokenizer.comment" >:: fun _ -> +Index: markup.ml-0.7.2/test/test_input.ml +=================================================================== +--- markup.ml-0.7.2.orig/test/test_input.ml ++++ markup.ml-0.7.2/test/test_input.ml +@@ -71,7 +71,7 @@ let tests = [ + end); + + ("input.bom" >:: fun _ -> +- [0xFEFF; 0x66] ++ [Uchar.of_int 0xFEFF; Uchar.of_int 0x66] + |> of_list + |> preprocess is_valid_xml_char Error.ignore_errors + |> fst diff --git a/dev-ml/markup/files/uutf.patch b/dev-ml/markup/files/uutf.patch new file mode 100644 index 000000000000..f561084ee454 --- /dev/null +++ b/dev-ml/markup/files/uutf.patch @@ -0,0 +1,1085 @@ +Index: markup.ml-0.7.2/src/common.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/common.ml ++++ markup.ml-0.7.2/src/common.ml +@@ -134,7 +134,7 @@ let is_printable = is_in_range 0x0020 0x + let char c = + if is_printable c then begin + let buffer = Buffer.create 4 in +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + Buffer.contents buffer + end + else +Index: markup.ml-0.7.2/src/detect.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/detect.ml ++++ markup.ml-0.7.2/src/detect.ml +@@ -222,7 +222,7 @@ let meta_tag_prescan = + let rec iterate () = + next source throw (fun () -> k "") (function + | c when c = quote -> k (Buffer.contents buffer) +- | c -> add_utf_8 buffer (Char.code (Char.lowercase c)); iterate ()) ++ | c -> add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c))); iterate ()) + in + iterate () + in +@@ -236,7 +236,7 @@ let meta_tag_prescan = + push source c; + k (Buffer.contents buffer) + | c -> +- add_utf_8 buffer (Char.code (Char.lowercase c)); ++ add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c))); + iterate ()) + in + iterate () +@@ -315,7 +315,7 @@ let meta_tag_prescan = + k (Buffer.contents buffer) + + | Some c -> +- add_utf_8 buffer (Char.code (Char.lowercase c)); ++ add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c))); + iterate () + end + in +Index: markup.ml-0.7.2/src/encoding.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/encoding.ml ++++ markup.ml-0.7.2/src/encoding.ml +@@ -4,7 +4,7 @@ + open Common + open Kstream + +-type t = ?report:Error.parse_handler -> char Kstream.t -> int Kstream.t ++type t = ?report:Error.parse_handler -> char Kstream.t -> Uchar.t Kstream.t + + let wrap f = fun ?(report = Error.ignore_errors) s -> f report s + +@@ -24,8 +24,8 @@ let _uutf_decoder encoding name = + k Uutf.u_rep) + | `Await -> + next bytes throw +- (fun () -> Uutf.Manual.src decoder "" 0 0; run ()) +- (fun c -> Uutf.Manual.src decoder (String.make 1 c) 0 1; run ()) ++ (fun () -> Uutf.Manual.src decoder Bytes.empty 0 0; run ()) ++ (fun c -> Uutf.Manual.src decoder (Bytes.make 1 c) 0 1; run ()) + in + run ()) + |> make) +@@ -87,7 +87,7 @@ let _ucs_4_decoder arrange name = + let skip = + if !first then begin + first := false; +- scalar = Uutf.u_bom ++ scalar = Uchar.to_int Uutf.u_bom + end + else + false +@@ -96,9 +96,9 @@ let _ucs_4_decoder arrange name = + if skip then run () + else + if scalar = 0x000A then +- newline k scalar ++ newline k (Uchar.of_int scalar) + else +- char k scalar ++ char k (Uchar.of_int scalar) + + | [] -> empty () + +@@ -130,7 +130,7 @@ let code_page table = + + (fun _ bytes -> + (fun throw empty k -> +- next bytes throw empty (fun c -> k table.(Char.code c))) ++ next bytes throw empty (fun c -> k (Uchar.of_int table.(Char.code c)))) + |> make) + |> wrap + +Index: markup.ml-0.7.2/src/html_parser.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/html_parser.ml ++++ markup.ml-0.7.2/src/html_parser.ml +@@ -1022,7 +1022,7 @@ let parse requested_context report (toke + let frameset_ok = ref true in + let head_seen = ref false in + +- let add_character = Text.add text in ++ let add_character = (fun x y -> Text.add text x (Uchar.of_int y)) in + + set_foreign (fun () -> + Stack.current_element_is_foreign context open_elements); +@@ -2717,7 +2717,7 @@ let parse requested_context report (toke + | l, `Char 0 -> + report l (`Bad_token ("U+0000", "foreign content", "null")) !throw + (fun () -> +- add_character l Uutf.u_rep; ++ add_character l (Uchar.to_int Uutf.u_rep); + mode ()) + + | l, `Char (0x0009 | 0x000A | 0x000C | 0x000D | 0x0020 as c) -> +Index: markup.ml-0.7.2/src/html_tokenizer.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/html_tokenizer.ml ++++ markup.ml-0.7.2/src/html_tokenizer.ml +@@ -252,7 +252,7 @@ let tokenize report (input, get_location + report location + (`Bad_token (prefix ^ text ^ semicolon, "character reference", + "Windows-1252 character")) !throw (fun () -> +- k (Some (`One n))) ++ k (Some (`One (Uchar.of_int n)))) + + else + match n with +@@ -268,9 +268,9 @@ let tokenize report (input, get_location + (`Bad_token (prefix ^ text ^ semicolon, + "character reference", + "invalid HTML character")) !throw (fun () -> +- k (Some (`One n))) ++ k (Some (`One (Uchar.of_int n)))) + +- | n -> k (Some (`One n)) ++ | n -> k (Some (`One (Uchar.of_int n))) + end + end + in +@@ -366,6 +366,10 @@ let tokenize report (input, get_location + | _ -> unterminated ()) + in + ++ let ma = function ++ a, `One x -> (a, `One (Uchar.of_int x)) ++ | a, `Two (x,y) -> (a, `Two (Uchar.of_int x, Uchar.of_int y)) in ++ + let rec match_named best matched replace candidate = + next_option input !throw (function + | None -> finish best matched replace +@@ -377,8 +381,8 @@ let tokenize report (input, get_location + | `None -> finish best matched (v::replace) + | `Continue -> match_named best matched (v::replace) candidate + | `Match_and_continue m -> +- match_named (Some m) (v::(replace @ matched)) [] candidate +- | `Match m -> finish (Some m) (v::matched) []) ++ match_named (Some (ma m)) (v::(replace @ matched)) [] candidate ++ | `Match m -> finish (Some (ma m)) (v::matched) []) + in + match_named None [] [] "") + +@@ -409,11 +413,11 @@ let tokenize report (input, get_location + emit (l, `Char 0x0026) state + + | Some (`One c) -> +- emit (l, `Char c) state ++ emit (l, `Char (Uchar.to_int c)) state + + | Some (`Two (c, c')) -> +- emit (l, `Char c) (fun () -> +- emit (l, `Char c') state) ++ emit (l, `Char (Uchar.to_int c)) (fun () -> ++ emit (l, `Char (Uchar.to_int c')) state) + end + + (* 8.2.4.3. *) +@@ -427,7 +431,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () -> +- emit (l, `Char Uutf.u_rep) rcdata_state) ++ emit (l, `Char (Uchar.to_int Uutf.u_rep)) rcdata_state) + + | None -> + emit_eof () +@@ -444,7 +448,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () -> +- emit (l, `Char Uutf.u_rep) rawtext_state) ++ emit (l, `Char (Uchar.to_int Uutf.u_rep)) rawtext_state) + + | None -> + emit_eof () +@@ -461,7 +465,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () -> +- emit_character l Uutf.u_rep script_data_state) ++ emit_character l (Uchar.to_int Uutf.u_rep) script_data_state) + + | None -> + emit_eof () +@@ -475,7 +479,7 @@ let tokenize report (input, get_location + next_option input !throw begin function + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () -> +- emit (l, `Char Uutf.u_rep) plaintext_state) ++ emit (l, `Char (Uchar.to_int Uutf.u_rep)) plaintext_state) + + | None -> + emit_eof () +@@ -501,7 +505,7 @@ let tokenize report (input, get_location + end_tag_open_state l' tag + + | Some (_, c) when is_alphabetic c -> +- add_utf_8 tag._tag_name (to_lowercase c); ++ add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c)); + tag_name_state l' tag + + | Some (_, 0x003F) -> +@@ -529,7 +533,7 @@ let tokenize report (input, get_location + + next_option input !throw begin function + | Some (_, c) when is_alphabetic c -> +- add_utf_8 tag._tag_name (to_lowercase c); ++ add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c)); + tag_name_state l' tag + + | Some (_, 0x003E) -> +@@ -569,7 +573,7 @@ let tokenize report (input, get_location + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state + + | Some (_, c) -> +- add_utf_8 tag._tag_name (to_lowercase c); ++ add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c)); + tag_name_state l' tag + end + +@@ -589,7 +593,7 @@ let tokenize report (input, get_location + next_option input !throw begin function + | Some (_, c as v) when is_alphabetic c -> + let name_buffer = Buffer.create 32 in +- add_utf_8 name_buffer (to_lowercase c); ++ add_utf_8 name_buffer (Uchar.of_int (to_lowercase c)); + text_end_tag_name_state state l' (v::cs) name_buffer + + | maybe_v -> +@@ -618,7 +622,7 @@ let tokenize report (input, get_location + emit_tag l' (create_tag ()) + + | Some ((_, c) as v) when is_alphabetic c -> +- add_utf_8 name_buffer (to_lowercase c); ++ add_utf_8 name_buffer (Uchar.of_int (to_lowercase c)); + text_end_tag_name_state state l' (v::cs) name_buffer + + | maybe_v -> +@@ -676,7 +680,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () -> +- emit_character l Uutf.u_rep (fun () -> ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () -> + script_data_escaped_state l')) + + | None -> +@@ -699,7 +703,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () -> +- emit_character l Uutf.u_rep (fun () -> ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () -> + script_data_escaped_state l')) + + | None -> +@@ -725,7 +729,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () -> +- emit_character l Uutf.u_rep (fun () -> ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () -> + script_data_escaped_state l')) + + | None -> +@@ -745,7 +749,7 @@ let tokenize report (input, get_location + + | Some (_, c as v) when is_alphabetic c -> + let tag_buffer = Buffer.create 32 in +- add_utf_8 tag_buffer (to_lowercase c); ++ add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c)); + emit_characters (List.rev (v::cs)) (fun () -> + script_data_double_escape_start_state l' tag_buffer) + +@@ -765,7 +769,7 @@ let tokenize report (input, get_location + else script_data_escaped_state l') + + | Some (l, c) when is_alphabetic c -> +- add_utf_8 tag_buffer (to_lowercase c); ++ add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c)); + emit_character l c (fun () -> + script_data_double_escape_start_state l' tag_buffer) + +@@ -787,7 +791,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () -> +- emit_character l Uutf.u_rep (fun () -> ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () -> + script_data_double_escaped_state l')) + + | None -> +@@ -811,7 +815,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () -> +- emit_character l Uutf.u_rep (fun () -> ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () -> + script_data_double_escaped_state l')) + + | None -> +@@ -838,7 +842,7 @@ let tokenize report (input, get_location + + | Some (l, 0) -> + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () -> +- emit_character l Uutf.u_rep (fun () -> ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () -> + script_data_double_escaped_state l')) + + | None -> +@@ -872,7 +876,7 @@ let tokenize report (input, get_location + else script_data_double_escaped_state l') + + | Some (l, c) when is_alphabetic c -> +- add_utf_8 tag_buffer (to_lowercase c); ++ add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c)); + emit_character l c (fun () -> + script_data_double_escape_end_state l' tag_buffer) + +@@ -910,10 +914,10 @@ let tokenize report (input, get_location + | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D as c)) -> + report l (`Bad_token (char c, "attribute name", + "invalid start character")) !throw (fun () -> +- start_attribute c) ++ start_attribute (Uchar.of_int c)) + + | Some (_, c) -> +- start_attribute (to_lowercase c) ++ start_attribute (Uchar.of_int (to_lowercase c)) + end + + (* 8.2.4.35. *) +@@ -942,14 +946,14 @@ let tokenize report (input, get_location + | Some (l, (0x0022 | 0x0027 | 0x003C as c)) -> + report l (`Bad_token (char c, "attribute name", + "invalid name character")) !throw (fun () -> +- add_utf_8 name_buffer c; ++ add_utf_8 name_buffer (Uchar.of_int c); + attribute_name_state l' tag name_buffer) + + | None -> + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state + + | Some (_, c) -> +- add_utf_8 name_buffer (to_lowercase c); ++ add_utf_8 name_buffer (Uchar.of_int (to_lowercase c)); + attribute_name_state l' tag name_buffer + end + +@@ -985,13 +989,13 @@ let tokenize report (input, get_location + | Some (l, (0x0022 | 0x0027 | 0x003C as c)) -> + report l (`Bad_token (char c, "attribute name", + "invalid start character")) !throw (fun () -> +- start_next_attribute c) ++ start_next_attribute (Uchar.of_int c)) + + | None -> + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state + + | Some (_, c) -> +- start_next_attribute (to_lowercase c) ++ start_next_attribute (Uchar.of_int (to_lowercase c)) + end + + (* 8.2.4.37. *) +@@ -1030,13 +1034,13 @@ let tokenize report (input, get_location + | Some (l, (0x003C | 0x003D | 0x0060 as c)) -> + report l (`Bad_token (char c, "attribute value", + "invalid start character")) !throw (fun () -> +- start_value attribute_value_unquoted_state (Some c)) ++ start_value attribute_value_unquoted_state (Some (Uchar.of_int c))) + + | None -> + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state + + | Some (_, c) -> +- start_value attribute_value_unquoted_state (Some c) ++ start_value attribute_value_unquoted_state (Some (Uchar.of_int c)) + end + + (* 8.2.4.38 and 8.2.4.39. *) +@@ -1062,7 +1066,7 @@ let tokenize report (input, get_location + data_state + + | Some (_, c) -> +- add_utf_8 value_buffer c; ++ add_utf_8 value_buffer (Uchar.of_int c); + attribute_value_quoted_state quote l' tag name value_buffer + end + +@@ -1092,14 +1096,14 @@ let tokenize report (input, get_location + | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D | 0x0060 as c)) -> + report l (`Bad_token (char c, "attribute value", + "invalid character")) !throw (fun () -> +- add_utf_8 value_buffer c; ++ add_utf_8 value_buffer (Uchar.of_int c); + attribute_value_unquoted_state l' tag name value_buffer) + + | None -> + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state + + | Some (_, c) -> +- add_utf_8 value_buffer c; ++ add_utf_8 value_buffer (Uchar.of_int c); + attribute_value_unquoted_state l' tag name value_buffer + end + +@@ -1107,7 +1111,7 @@ let tokenize report (input, get_location + and character_reference_in_attribute allowed l value_buffer k = + consume_character_reference true (Some allowed) l begin function + | None -> +- add_utf_8 value_buffer 0x0026; ++ add_utf_8 value_buffer (Uchar.of_int 0x0026); + k () + + | Some (`One c) -> +@@ -1176,7 +1180,7 @@ let tokenize report (input, get_location + emit_comment l' buffer + + | Some (_, c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + consume () + end + in +@@ -1239,7 +1243,7 @@ let tokenize report (input, get_location + emit_comment l' buffer) + + | Some (_, c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer + end + +@@ -1266,7 +1270,7 @@ let tokenize report (input, get_location + + | Some (_, c) -> + Buffer.add_char buffer '-'; +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer + end + +@@ -1286,7 +1290,7 @@ let tokenize report (input, get_location + emit_comment l' buffer) + + | Some (_, c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer + end + +@@ -1308,7 +1312,7 @@ let tokenize report (input, get_location + + | Some (_, c) -> + Buffer.add_char buffer '-'; +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer + end + +@@ -1343,7 +1347,7 @@ let tokenize report (input, get_location + report l (`Bad_token ("--" ^ (char c), "comment", + "'--' should be in '-->'")) !throw (fun () -> + Buffer.add_string buffer "--"; +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer) + end + +@@ -1369,7 +1373,7 @@ let tokenize report (input, get_location + + | Some (_, c) -> + Buffer.add_string buffer "--!"; +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer + end + +@@ -1420,7 +1424,7 @@ let tokenize report (input, get_location + + | Some (_, c) -> + doctype._doctype_name <- +- add_doctype_char doctype._doctype_name (to_lowercase c); ++ add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c)); + doctype_name_state l' doctype + end + +@@ -1445,7 +1449,7 @@ let tokenize report (input, get_location + + | Some (_, c) -> + doctype._doctype_name <- +- add_doctype_char doctype._doctype_name (to_lowercase c); ++ add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c)); + doctype_name_state l' doctype + end + +@@ -1574,7 +1578,7 @@ let tokenize report (input, get_location + emit_doctype ~quirks:true l' doctype) + + | Some (_, c) -> +- add doctype c; ++ add doctype (Uchar.of_int c); + doctype_identifier_quoted_state add quote next_state l' doctype + end + +Index: markup.ml-0.7.2/src/html_writer.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/html_writer.ml ++++ markup.ml-0.7.2/src/html_writer.ml +@@ -8,7 +8,7 @@ let _escape_attribute s = + Uutf.String.fold_utf_8 (fun () _ -> function + | `Malformed _ -> () + | `Uchar c -> +- match c with ++ match (Uchar.to_int c) with + | 0x0026 -> Buffer.add_string buffer "&" + | 0x00A0 -> Buffer.add_string buffer " " + | 0x0022 -> Buffer.add_string buffer """ +@@ -21,7 +21,7 @@ let _escape_text s = + Uutf.String.fold_utf_8 (fun () _ -> function + | `Malformed _ -> () + | `Uchar c -> +- match c with ++ match (Uchar.to_int c) with + | 0x0026 -> Buffer.add_string buffer "&" + | 0x00A0 -> Buffer.add_string buffer " " + | 0x003C -> Buffer.add_string buffer "<" +Index: markup.ml-0.7.2/src/input.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/input.ml ++++ markup.ml-0.7.2/src/input.ml +@@ -27,13 +27,13 @@ let preprocess is_valid_char report sour + in + + let rec iterate () = +- next source throw empty (function ++ next source throw empty (fun x -> match Uchar.to_int x with + | 0xFEFF when !first_char -> first_char := false; iterate () + + | 0x0D -> +- next source throw newline (function ++ next source throw newline (fun y -> match Uchar.to_int y with + | 0x0A -> newline () +- | c -> push source c; newline ()) ++ | c -> push source (Uchar.of_int c); newline ()) + + | 0x0A -> newline () + +Index: markup.ml-0.7.2/src/input.mli +=================================================================== +--- markup.ml-0.7.2.orig/src/input.mli ++++ markup.ml-0.7.2/src/input.mli +@@ -4,5 +4,5 @@ + open Common + + val preprocess : +- (int -> bool) -> Error.parse_handler -> int Kstream.t -> ++ (int -> bool) -> Error.parse_handler -> Uchar.t Kstream.t -> + (location * int) Kstream.t * (unit -> location) +Index: markup.ml-0.7.2/src/markup.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/markup.ml ++++ markup.ml-0.7.2/src/markup.ml +@@ -187,7 +187,7 @@ sig + + val decode : + ?report:(location -> Error.t -> unit io) -> t -> +- (char, _) stream -> (int, async) stream ++ (char, _) stream -> (Uchar.t, async) stream + end + + val parse_xml : +Index: markup.ml-0.7.2/src/markup.mli +=================================================================== +--- markup.ml-0.7.2.orig/src/markup.mli ++++ markup.ml-0.7.2/src/markup.mli +@@ -194,7 +194,7 @@ sig + + val decode : + ?report:(location -> Error.t -> unit) -> t -> +- (char, 's) stream -> (int, 's) stream ++ (char, 's) stream -> (Uchar.t, 's) stream + (** Applies a decoder to a byte stream. Illegal input byte sequences result in + calls to the error handler [~report] with error kind [`Decoding_error]. + The illegal bytes are then skipped, and zero or more U+FFFD replacement +@@ -764,7 +764,7 @@ sig + + val decode : + ?report:(location -> Error.t -> unit io) -> Encoding.t -> +- (char, _) stream -> (int, async) stream ++ (char, _) stream -> (Uchar.t, async) stream + end + + (** {2 XML} *) +@@ -838,7 +838,7 @@ val kstream : ('a, _) stream -> 'a Kstre + val of_kstream : 'a Kstream.t -> ('a, _) stream + + val preprocess_input_stream : +- (int, 's) stream -> (location * int, 's) stream * (unit -> location) ++ (Uchar.t, 's) stream -> (location * int, 's) stream * (unit -> location) + + (**/**) + +Index: markup.ml-0.7.2/src/utility.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/utility.ml ++++ markup.ml-0.7.2/src/utility.ml +@@ -346,11 +346,11 @@ let xhtml_entity name = + + match lookup 0 with + | `One c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + Some (Buffer.contents buffer) + | `Two (c, c') -> +- add_utf_8 buffer c; +- add_utf_8 buffer c'; ++ add_utf_8 buffer (Uchar.of_int c); ++ add_utf_8 buffer (Uchar.of_int c'); + Some (Buffer.contents buffer) + + with Exit -> None +Index: markup.ml-0.7.2/src/xml_tokenizer.ml +=================================================================== +--- markup.ml-0.7.2.orig/src/xml_tokenizer.ml ++++ markup.ml-0.7.2/src/xml_tokenizer.ml +@@ -101,7 +101,7 @@ let tokenize report resolve_reference (i + end + + | _, c when filter c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + read () + + | l, c -> +@@ -133,7 +133,7 @@ let tokenize report resolve_reference (i + + | _, c when is_name_start_char c -> + let buffer = Buffer.create 32 in +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + let rec read () = + next input !throw unexpected_eoi begin function + | _, 0x003B -> +@@ -146,7 +146,7 @@ let tokenize report resolve_reference (i + end + + | _, c when is_name_char c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + read () + + | l, c -> +@@ -218,7 +218,7 @@ let tokenize report resolve_reference (i + report_if (not @@ is_name_start_char c) l (fun () -> + `Bad_token (char c, "attribute", "invalid start character")) + !throw (fun () -> +- add_utf_8 name_buffer c; ++ add_utf_8 name_buffer (Uchar.of_int c); + name_state ()) + end + +@@ -235,7 +235,7 @@ let tokenize report resolve_reference (i + report_if (not @@ is_name_start_char c) l (fun () -> + `Bad_token (char c, "attribute", "invalid name character")) + !throw (fun () -> +- add_utf_8 name_buffer c; ++ add_utf_8 name_buffer (Uchar.of_int c); + name_state ()) + end + +@@ -275,14 +275,14 @@ let tokenize report resolve_reference (i + report l + (`Bad_token ("&", "attribute", "replace with '&'")) + !throw (fun () -> +- add_utf_8 value_buffer 0x0026; ++ add_utf_8 value_buffer (Uchar.of_int 0x0026); + state ()) + end + + and handle_lt l state = + report l (`Bad_token ("<", "attribute", "replace with '<'")) !throw + (fun () -> +- add_utf_8 value_buffer 0x003C; ++ add_utf_8 value_buffer (Uchar.of_int 0x003C); + state ()) + + and quoted_value_state quote = +@@ -300,7 +300,7 @@ let tokenize report resolve_reference (i + quoted_value_state quote) + + | _, c -> +- add_utf_8 value_buffer c; ++ add_utf_8 value_buffer (Uchar.of_int c); + quoted_value_state quote + end + +@@ -317,7 +317,7 @@ let tokenize report resolve_reference (i + handle_lt l unquoted_value_state + + | _, c -> +- add_utf_8 value_buffer c; ++ add_utf_8 value_buffer (Uchar.of_int c); + unquoted_value_state () + end + +@@ -372,7 +372,7 @@ let tokenize report resolve_reference (i + report_if (not @@ is_name_start_char c) l (fun () -> + `Bad_token (char c, pi, "invalid start character")) !throw + (fun () -> +- add_utf_8 target_buffer c; ++ add_utf_8 target_buffer (Uchar.of_int c); + target_state ()) + end + +@@ -388,13 +388,13 @@ let tokenize report resolve_reference (i + report_if (not @@ is_name_char c) l (fun () -> + `Bad_token (char c, pi, "invalid name character")) !throw + (fun () -> +- add_utf_8 target_buffer c; ++ add_utf_8 target_buffer (Uchar.of_int c); + target_state ()) + end + + and text_state () = + next' pi finish_pi (fun (_, c) -> +- add_utf_8 text_buffer c; ++ add_utf_8 text_buffer (Uchar.of_int c); + text_state ()) + + and xml_declaration_state () = +@@ -572,7 +572,7 @@ let tokenize report resolve_reference (i + and initial_state () = + next input !throw (fun () -> emit_eoi ()) begin function + | l, (0x005D as c) -> +- add_character l c; ++ add_character l (Uchar.of_int c); + one_bracket_state l + + | l, 0x003C -> +@@ -583,7 +583,7 @@ let tokenize report resolve_reference (i + | None -> + report l (`Bad_token (char c, "text", "replace with '&'")) + !throw (fun () -> +- add_character l c; ++ add_character l (Uchar.of_int c); + initial_state ()) + + | Some s -> +@@ -591,14 +591,14 @@ let tokenize report resolve_reference (i + initial_state ()) + + | l, c -> +- add_character l c; ++ add_character l (Uchar.of_int c); + initial_state () + end + + and one_bracket_state l' = + next_option input !throw begin function + | Some (l, (0x005D as c)) -> +- add_character l c; ++ add_character l (Uchar.of_int c); + two_brackets_state l' l + + | v -> +@@ -611,11 +611,11 @@ let tokenize report resolve_reference (i + | Some (l, (0x003E as c)) -> + report l' (`Bad_token ("]]>", "text", "must end a CDATA section")) + !throw (fun () -> +- add_character l c; ++ add_character l (Uchar.of_int c); + initial_state ()) + + | Some (l, (0x005D as c)) -> +- add_character l c; ++ add_character l (Uchar.of_int c); + two_brackets_state l'' l + + | v -> +@@ -626,7 +626,7 @@ let tokenize report resolve_reference (i + and begin_markup_state l' = + let recover v = + lt_in_text l' (fun () -> +- add_character l' 0x003C; ++ add_character l' (Uchar.of_int 0x003C); + push_option input v; + initial_state ()) + in +@@ -648,7 +648,7 @@ let tokenize report resolve_reference (i + + | _, c when is_name_start_char c -> + let tag_name_buffer = Buffer.create 32 in +- add_utf_8 tag_name_buffer c; ++ add_utf_8 tag_name_buffer (Uchar.of_int c); + start_tag_state l' tag_name_buffer + + | l, c as v -> +@@ -660,7 +660,7 @@ let tokenize report resolve_reference (i + and start_tag_state l' buffer = + let recover v = + lt_in_text l' (fun () -> +- add_character l' 0x003C; ++ add_character l' (Uchar.of_int 0x003C); + add_string l' (Buffer.contents buffer); + push_option input v; + initial_state ()) +@@ -680,7 +680,7 @@ let tokenize report resolve_reference (i + attributes_state l' (Buffer.contents buffer) [] + + | _, c when is_name_char c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + start_tag_state l' buffer + + | l, c as v -> +@@ -731,8 +731,8 @@ let tokenize report resolve_reference (i + and end_tag_state l' = + let recover v = + lt_in_text l' (fun () -> +- add_character l' 0x003C; +- add_character l' 0x002F; ++ add_character l' (Uchar.of_int 0x003C); ++ add_character l' (Uchar.of_int 0x002F); + push_option input v; + initial_state ()) + in +@@ -743,7 +743,7 @@ let tokenize report resolve_reference (i + begin function + | _, c when is_name_start_char c -> + let name_buffer = Buffer.create 32 in +- add_utf_8 name_buffer c; ++ add_utf_8 name_buffer (Uchar.of_int c); + end_tag_name_state l' name_buffer + + | l, c as v -> +@@ -755,8 +755,8 @@ let tokenize report resolve_reference (i + and end_tag_name_state l' buffer = + let recover v = + lt_in_text l' (fun () -> +- add_character l' 0x003C; +- add_character l' 0x002F; ++ add_character l' (Uchar.of_int 0x003C); ++ add_character l' (Uchar.of_int 0x002F); + add_string l' (Buffer.contents buffer); + push_option input v; + initial_state ()) +@@ -773,7 +773,7 @@ let tokenize report resolve_reference (i + end_tag_whitespace_state false l' (Buffer.contents buffer) + + | _, c when is_name_char c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + end_tag_name_state l' buffer + + | l, c as v -> +@@ -821,8 +821,8 @@ let tokenize report resolve_reference (i + + | v -> + bad_comment_start "<!" l' (fun () -> +- add_character l' 0x003C; +- add_character l' 0x0021; ++ add_character l' (Uchar.of_int 0x003C); ++ add_character l' (Uchar.of_int 0x0021); + push_option input v; + initial_state ()) + end +@@ -834,9 +834,9 @@ let tokenize report resolve_reference (i + + | v -> + bad_comment_start "<!-" l' (fun () -> +- add_character l' 0x003C; +- add_character l' 0x0021; +- add_character l' 0x002D; ++ add_character l' (Uchar.of_int 0x003C); ++ add_character l' (Uchar.of_int 0x0021); ++ add_character l' (Uchar.of_int 0x002D); + push_option input v; + initial_state ()) + end +@@ -852,7 +852,7 @@ let tokenize report resolve_reference (i + comment_one_dash_state l' l buffer + + | _, c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer + end + +@@ -863,8 +863,8 @@ let tokenize report resolve_reference (i + comment_two_dashes_state false l' l'' buffer + + | _, c -> +- add_utf_8 buffer 0x002D; +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int 0x002D); ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer + end + +@@ -883,14 +883,14 @@ let tokenize report resolve_reference (i + + | _, 0x002D -> + recover (fun () -> +- add_utf_8 buffer 0x002D; ++ add_utf_8 buffer (Uchar.of_int 0x002D); + comment_two_dashes_state true l' l'' buffer) + + | _, c -> + recover (fun () -> +- add_utf_8 buffer 0x002D; +- add_utf_8 buffer 0x002D; +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int 0x002D); ++ add_utf_8 buffer (Uchar.of_int 0x002D); ++ add_utf_8 buffer (Uchar.of_int c); + comment_state l' buffer) + end + +@@ -905,9 +905,9 @@ let tokenize report resolve_reference (i + !throw (fun () -> + lt_in_text l' (fun () -> + push_list input cs; +- add_character l' 0x003C; +- add_character l' 0x0021; +- add_character l' 0x005B; ++ add_character l' (Uchar.of_int 0x003C); ++ add_character l' (Uchar.of_int 0x0021); ++ add_character l' (Uchar.of_int 0x005B); + initial_state ())) + end + +@@ -918,7 +918,7 @@ let tokenize report resolve_reference (i + cdata_one_bracket_state l' l + + | l, c -> +- add_character l c; ++ add_character l (Uchar.of_int c); + cdata_state l' + end + +@@ -929,8 +929,8 @@ let tokenize report resolve_reference (i + cdata_two_brackets_state l' l'' l + + | l, c -> +- add_character l'' 0x005D; +- add_character l c; ++ add_character l'' (Uchar.of_int 0x005D); ++ add_character l (Uchar.of_int c); + cdata_state l' + end + +@@ -941,13 +941,13 @@ let tokenize report resolve_reference (i + initial_state () + + | l, 0x005D -> +- add_character l'' 0x005D; ++ add_character l'' (Uchar.of_int 0x005D); + cdata_two_brackets_state l' l''' l + + | l, c -> +- add_character l'' 0x005D; +- add_character l''' 0x005D; +- add_character l c; ++ add_character l'' (Uchar.of_int 0x005D); ++ add_character l''' (Uchar.of_int 0x005D); ++ add_character l (Uchar.of_int c); + cdata_state l' + end + +@@ -963,9 +963,9 @@ let tokenize report resolve_reference (i + !throw (fun () -> + lt_in_text l' (fun () -> + push_list input cs; +- add_character l' 0x003C; +- add_character l' 0x0021; +- add_character l' 0x0044; ++ add_character l' (Uchar.of_int 0x003C); ++ add_character l' (Uchar.of_int 0x0021); ++ add_character l' (Uchar.of_int 0x0044); + initial_state ())) + end + +@@ -980,15 +980,15 @@ let tokenize report resolve_reference (i + emit_doctype l' buffer initial_state + + | _, (0x0022 | 0x0027 as c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + doctype_quoted_state (fun () -> doctype_state l' buffer) c l' buffer + + | _, (0x003C as c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + doctype_item_state (fun () -> doctype_state l' buffer) l' buffer + + | _, c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + doctype_state l' buffer + end + +@@ -996,11 +996,11 @@ let tokenize report resolve_reference (i + next input !throw (fun () -> unterminated_doctype l' buffer) + begin function + | _, c when c = quote -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + state () + + | _, c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + doctype_quoted_state state quote l' buffer + end + +@@ -1008,18 +1008,18 @@ let tokenize report resolve_reference (i + next input !throw (fun () -> unterminated_doctype l' buffer) + begin function + | _, (0x0021 as c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + doctype_declaration_state state l' buffer + + | l, (0x003F as c) -> +- add_utf_8 buffer c; +- let undo = tap (fun (_, c) -> add_utf_8 buffer c) input in ++ add_utf_8 buffer (Uchar.of_int c); ++ let undo = tap (fun (_, c) -> add_utf_8 buffer (Uchar.of_int c)) input in + parse_declaration_or_processing_instruction l (fun _ -> + undo (); + state ()) + + | _, c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + state () + end + +@@ -1027,16 +1027,16 @@ let tokenize report resolve_reference (i + next input !throw (fun () -> unterminated_doctype l' buffer) + begin function + | _, (0x003E as c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + state () + + | _, (0x0022 | 0x0027 as c) -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + doctype_quoted_state + (fun () -> doctype_declaration_state state l' buffer) c l' buffer + + | _, c -> +- add_utf_8 buffer c; ++ add_utf_8 buffer (Uchar.of_int c); + doctype_declaration_state state l' buffer + end + diff --git a/dev-ml/markup/markup-0.7.2.ebuild b/dev-ml/markup/markup-0.7.2-r1.ebuild index 235c575c1fb4..f70ac55cd716 100644 --- a/dev-ml/markup/markup-0.7.2.ebuild +++ b/dev-ml/markup/markup-0.7.2-r1.ebuild @@ -4,21 +4,21 @@ EAPI=5 -inherit findlib +inherit findlib eutils DESCRIPTION="Error-recovering streaming HTML5 and XML parsers" HOMEPAGE="https://github.com/aantron/markup.ml" SRC_URI="https://github.com/aantron/markup.ml/archive/${PV}.tar.gz -> ${P}.tar.gz" LICENSE="BSD" -SLOT="0/${PV}" +SLOT="0/${PV}p1" KEYWORDS="~amd64" IUSE="doc test" DEPEND=" dev-lang/ocaml:=[ocamlopt] dev-ml/lwt:=[ocamlopt] - dev-ml/uutf:=[ocamlopt] + >=dev-ml/uutf-1.0:=[ocamlopt] " RDEPEND="${DEPEND}" DEPEND="${DEPEND} @@ -26,6 +26,11 @@ DEPEND="${DEPEND} dev-ml/ocamlbuild" S="${WORKDIR}/${PN}.ml-${PV}" +src_prepare() { + epatch "${FILESDIR}/uutf.patch" \ + "${FILESDIR}/test.patch" +} + src_compile() { emake use doc && emake docs |