Skip to content

Commit 0ba7b77

Browse files
committed
addressed nojaf feedback
1 parent b35a76c commit 0ba7b77

File tree

1 file changed

+54
-38
lines changed

1 file changed

+54
-38
lines changed

src/FSharp.Formatting.Markdown/HtmlFormatting.fs

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -27,45 +27,61 @@ let internal encodeHighUnicode (text: string) =
27 27
if String.IsNullOrEmpty text then
28 28
text
29 29
else
30-
// Fast path: check if string needs encoding at all
31-
let needsEncoding =
32-
text
33-
|> Seq.exists (fun c ->
34-
let codePoint = int c
35-
Char.IsSurrogate c || (codePoint >= 0x2000 && codePoint <= 0x2BFF))
36-
37-
if not needsEncoding then
38-
text
39-
else
40-
// Tail-recursive function with StringBuilder accumulator
41-
let rec processChars i (sb: System.Text.StringBuilder) =
42-
if i >= text.Length then
43-
sb.ToString()
30+
// Single-pass encoding with lazy StringBuilder allocation
31+
let mutable sb: System.Text.StringBuilder voption = ValueNone
32+
let mutable i = 0
33+
34+
while i < text.Length do
35+
let c = text.[i]
36+
37+
let needsEncoding, codePoint, skipNext =
38+
// Check for surrogate pairs first (emojis and other characters outside BMP)
39+
if
40+
Char.IsHighSurrogate c
41+
&& i + 1 < text.Length
42+
&& Char.IsLowSurrogate text.[i + 1]
43+
then
44+
let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1])
45+
// Encode all characters outside BMP (>= 0x10000) as they're typically emojis
46+
true, fullCodePoint, true
44 47
else
45-
let c = text.[i]
46-
// Check for surrogate pairs first (emojis and other characters outside BMP)
47-
if
48-
Char.IsHighSurrogate c
49-
&& i + 1 < text.Length
50-
&& Char.IsLowSurrogate(text.[i + 1])
51-
then
52-
let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1])
53-
// Encode all characters outside BMP (>= 0x10000) as they're typically emojis
54-
sb.Append(sprintf "&#%d;" fullCodePoint) |> ignore
55-
processChars (i + 2) sb // Skip both surrogate chars
56-
else
57-
let codePoint = int c
58-
// Encode specific ranges that contain emojis and symbols:
59-
// U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc.
60-
// U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates)
61-
if codePoint >= 0x2000 && codePoint <= 0x2BFF then
62-
sb.Append(sprintf "&#%d;" codePoint) |> ignore
63-
else
64-
sb.Append c |> ignore
65-
66-
processChars (i + 1) sb
67-
68-
processChars 0 (System.Text.StringBuilder text.Length)
48+
let codePoint = int c
49+
// Encode specific ranges that contain emojis and symbols:
50+
// U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc.
51+
// U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates)
52+
(codePoint >= 0x2000 && codePoint <= 0x2BFF), codePoint, false
53+
54+
if needsEncoding then
55+
// Lazy initialization of StringBuilder only when needed
56+
match sb with
57+
| ValueNone ->
58+
let builder = System.Text.StringBuilder(text.Length + 16)
59+
60+
if i > 0 then
61+
builder.Append(text, 0, i) |> ignore
62+
63+
sb <- ValueSome builder
64+
| ValueSome _ -> ()
65+
66+
// Append HTML entity without using sprintf (avoid allocation)
67+
match sb with
68+
| ValueSome builder ->
69+
builder.Append "&#" |> ignore
70+
builder.Append codePoint |> ignore
71+
builder.Append ';' |> ignore
72+
| ValueNone -> ()
73+
else
74+
// Only append to StringBuilder if it was already initialized
75+
match sb with
76+
| ValueSome builder -> builder.Append c |> ignore
77+
| ValueNone -> ()
78+
79+
i <- i + (if skipNext then 2 else 1)
80+
81+
// Return original string if no encoding was needed
82+
match sb with
83+
| ValueNone -> text
84+
| ValueSome builder -> builder.ToString()
69 85

70 86
/// Basic escaping as done by Markdown including quotes
71 87
let internal htmlEncodeQuotes (code: string) =

0 commit comments

Comments
 (0)