@@ -21,6 +21,52 @@ open MarkdownUtils
let internal htmlEncode (code: string) =
    // '&' must be escaped first; otherwise the ampersands introduced by the
    // '<' and '>' replacements would themselves be escaped a second time.
    code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
2323
/// Encode emojis and symbol characters as HTML numeric character references (`&#NNNN;`).
///
/// Two classes of input are encoded:
///  - well-formed surrogate pairs (code points >= U+10000), emitted as a single reference for the pair;
///  - BMP characters in U+2000..U+2BFF (general punctuation, currency, arrows, math, dingbats, ...).
/// Everything else is copied through unchanged, including unpaired surrogates and
/// ordinary international text. A null or empty input is returned as-is.
let internal encodeHighUnicode (text: string) =
    // Inclusive bounds of the BMP range that gets encoded.
    let isEncodedBmp (c: char) = c >= '\u2000' && c <= '\u2BFF'

    if String.IsNullOrEmpty text then
        text
    // Fast path: when nothing in the string can require encoding, return it without allocating.
    elif not (text |> Seq.exists (fun c -> Char.IsSurrogate c || isEncodedBmp c)) then
        text
    else
        let sb = System.Text.StringBuilder(text.Length)

        // Writes the decimal numeric character reference for one code point.
        // Invariant culture keeps the digits independent of the current thread culture.
        let appendEntity (codePoint: int) =
            sb.Append("&#").Append(codePoint.ToString(System.Globalization.CultureInfo.InvariantCulture)).Append(';')
            |> ignore

        let mutable i = 0

        while i < text.Length do
            let c = text.[i]

            // A high surrogate followed by a low surrogate forms one code point outside the BMP.
            if Char.IsHighSurrogate c && i + 1 < text.Length && Char.IsLowSurrogate(text.[i + 1]) then
                appendEntity (Char.ConvertToUtf32(c, text.[i + 1]))
                i <- i + 2 // consume both halves of the pair
            else
                if isEncodedBmp c then
                    appendEntity (int c)
                else
                    // Includes unpaired surrogates, which are passed through untouched.
                    sb.Append(c) |> ignore

                i <- i + 1

        sb.ToString()
69+
/// Basic escaping as done by Markdown including quotes:
/// applies `htmlEncode` and then replaces every double quote with `&quot;`,
/// which makes the result usable inside a double-quoted HTML attribute value.
let internal htmlEncodeQuotes (code: string) =
    (htmlEncode code).Replace("\"", "&quot;")
@@ -78,7 +124,7 @@ let rec internal formatSpan (ctx: FormattingContext) span =
78124
79125 | AnchorLink( id, _) -> ctx.Writer.Write( " <a name=\" " + htmlEncodeQuotes id + " \" > </a>" )
80126 | EmbedSpans( cmd, _) -> formatSpans ctx ( cmd.Render())
81- | Literal( str, _) -> ctx.Writer.Write( str)
127+ | Literal( str, _) -> ctx.Writer.Write( encodeHighUnicode str)
82128 | HardLineBreak(_) -> ctx.Writer.Write( " <br />" + ctx.Newline)
83129 | IndirectLink( body, _, LookupKey ctx.Links ( link, title), _)
84130 | DirectLink( body, link, title, _) ->
0 commit comments