Skip to content

Commit b35a76c

Browse files
committed
markdown to html improvement
1 parent 226c357 commit b35a76c

File tree

2 files changed

+79
-1
lines changed

2 files changed

+79
-1
lines changed

src/FSharp.Formatting.Markdown/HtmlFormatting.fs

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,52 @@ open MarkdownUtils
2121
let internal htmlEncode (code: string) =
    // Ampersands are escaped first; otherwise the '&' introduced by the
    // "&lt;" / "&gt;" replacements below would itself be escaped again.
    let ampersandsEscaped = code.Replace("&", "&amp;")
    let lessThanEscaped = ampersandsEscaped.Replace("<", "&lt;")
    lessThanEscaped.Replace(">", "&gt;")
2323

24+
/// Replaces emoji and symbol characters with decimal HTML numeric character references.
/// Every well-formed surrogate pair (i.e. any code point above U+FFFF) and every BMP
/// character in the range U+2000..U+2BFF is written as `&#N;`. All other characters,
/// including unpaired surrogates, are copied through unchanged.
/// A null or empty input, or an input with nothing to encode, is returned as-is.
let internal encodeHighUnicode (text: string) =
    // True for BMP characters in the punctuation/symbol range that gets encoded.
    let inSymbolRange (ch: char) =
        let value = int ch
        value >= 0x2000 && value <= 0x2BFF

    if String.IsNullOrEmpty text then
        text
    elif not (text |> Seq.exists (fun ch -> Char.IsSurrogate ch || inSymbolRange ch)) then
        // Fast path: no surrogate and no symbol-range character, so hand back the input itself.
        text
    else
        let builder = System.Text.StringBuilder(text.Length)
        let mutable index = 0

        while index < text.Length do
            let current = text.[index]

            let startsPair =
                Char.IsHighSurrogate current
                && index + 1 < text.Length
                && Char.IsLowSurrogate(text.[index + 1])

            if startsPair then
                // A valid high/low pair forms one code point; emit a single entity
                // and step over both UTF-16 code units.
                let scalar = Char.ConvertToUtf32(current, text.[index + 1])
                builder.Append(sprintf "&#%d;" scalar) |> ignore
                index <- index + 2
            else
                if inSymbolRange current then
                    builder.Append(sprintf "&#%d;" (int current)) |> ignore
                else
                    // Ordinary text (and any unpaired surrogate) is kept verbatim.
                    builder.Append current |> ignore

                index <- index + 1

        builder.ToString()
69+
2470
/// Applies `htmlEncode` to the input and then additionally escapes
/// double quotes as `&quot;`.
let internal htmlEncodeQuotes (code: string) =
    let encoded = htmlEncode code
    encoded.Replace("\"", "&quot;")
@@ -78,7 +124,7 @@ let rec internal formatSpan (ctx: FormattingContext) span =
78124

79125
| AnchorLink(id, _) -> ctx.Writer.Write("<a name=\"" + htmlEncodeQuotes id + "\">&#160;</a>")
80126
| EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render())
81-
| Literal(str, _) -> ctx.Writer.Write(str)
127+
| Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str)
82128
| HardLineBreak(_) -> ctx.Writer.Write("<br />" + ctx.Newline)
83129
| IndirectLink(body, _, LookupKey ctx.Links (link, title), _)
84130
| DirectLink(body, link, title, _) ->

tests/FSharp.Markdown.Tests/Markdown.fs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () =
3030
|> Markdown.ToHtml
3131
|> should contain "<p><code>a &amp;gt; &amp; b</code></p>"
3232

33+
[<Test>]
let ``Emojis are encoded as HTML numeric entities`` () =
    let rendered = Markdown.ToHtml "Like this 🎉🚧⭐⚠️✅"

    // One decimal numeric character reference is expected per emoji in the input;
    // they are checked in the same order as they appear in the source text.
    [ "&#127881;" // 🎉 party popper
      "&#128679;" // 🚧 construction
      "&#11088;" // ⭐ star
      "&#9888;" // ⚠️ warning
      "&#9989;" ] // ✅ check mark
    |> List.iter (fun entity -> rendered |> should contain entity)
41+
42+
[<Test>]
let ``Regular text without emojis is not modified`` () =
    // Cyrillic and CJK text contains neither surrogates nor U+2000..U+2BFF
    // characters, so it should pass through untouched.
    let source = "This is regular text with пристаням Cyrillic and 中文 Chinese"
    let rendered = Markdown.ToHtml source

    rendered |> should contain "пристаням"
    rendered |> should contain "中文"
    // No numeric character references may appear for ordinary international text.
    rendered |> should not' (contain "&#")
49+
50+
[<Test>]
let ``List without blank line after heading`` () =
    // Regression check for the case reported in
    // https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382
    let source =
        """# This is my title
- this list
- should render"""

    let rendered = source |> Markdown.ToHtml

    // The heading and the list must come out as separate elements,
    // i.e. the list items must not be folded into the heading.
    [ "<h1>This is my title</h1>"
      "<ul>"
      "<li>this list</li>"
      "<li>should render</li>" ]
    |> List.iter (fun fragment -> rendered |> should contain fragment)
64+
3365
[<Test>]
3466
let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () =
3567
let doc = """<a href="mailto:[email protected]">hi</a>""" |> Markdown.Parse

0 commit comments

Comments
 (0)