Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/.gitkeep

This file was deleted.

5 changes: 5 additions & 0 deletions docusaurus.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,11 @@ const config: Config = {
themeConfig: {
// Replace with your project's social card
image: "img/comapeo-social-card.jpg",
// Table of Contents configuration for consistent heading display
tableOfContents: {
minHeadingLevel: 2,
maxHeadingLevel: 3,
},
navbar: {
// title: 'CoMapeo',
logo: {
Expand Down
1 change: 0 additions & 1 deletion i18n/.gitkeep

This file was deleted.

95 changes: 95 additions & 0 deletions scripts/notion-fetch/contentSanitizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,100 @@ describe("contentSanitizer", () => {
const result = scriptModule.sanitizeMarkdownContent(input);
expect(result).toBe("[tag](#tag)");
});

// Heading normalisation performed by sanitizeMarkdownContent: only the first
// H1 is kept, later H1s are demoted to H2, and headings without text are dropped.
describe("heading hierarchy fixes", () => {
  it("should keep the first H1 and convert subsequent H1s to H2s", () => {
    const input = `# First Title
Content here
# Second Title
More content
# Third Title`;
    const result = scriptModule.sanitizeMarkdownContent(input);
    expect(result).toContain("# First Title");
    expect(result).toContain("## Second Title");
    expect(result).toContain("## Third Title");
    // Exactly one line may still start with a single "# ".
    expect(result.match(/^# /gm)?.length).toBe(1);
  });

  it("should remove empty headings", () => {
    const input = `# Valid Title
#
## Valid H2
###
Content`;
    const result = scriptModule.sanitizeMarkdownContent(input);
    expect(result).toContain("# Valid Title");
    expect(result).toContain("## Valid H2");
    expect(result).not.toContain("#\n");
    // The empty headings in the input carry no trailing space, so searching
    // for "### " could never fail. Instead assert that no line consists
    // solely of hashes (optionally followed by spaces/tabs).
    expect(result).not.toMatch(/^#{1,6}[ \t]*$/m);
  });

  it("should preserve H2 and H3 headings unchanged", () => {
    // Levels H2 through H6 are all expected to pass through untouched.
    const input = `# Title
## Section
### Subsection
#### Deep heading
##### Deeper
###### Deepest`;
    const result = scriptModule.sanitizeMarkdownContent(input);
    expect(result).toBe(input);
  });

  it("should handle real Notion export pattern", () => {
    const input = `# Setting up your phone
### Checklist
# Related Content
### Why is it important
# Troubleshooting`;
    const result = scriptModule.sanitizeMarkdownContent(input);
    expect(result).toContain("# Setting up your phone");
    expect(result).toContain("## Related Content");
    expect(result).toContain("## Troubleshooting");
    expect(result).toContain("### Checklist");
    expect(result).toContain("### Why is it important");
  });

  it("should handle mixed content with headings", () => {
    const input = `# Main Title
Some **bold** content here.

## Regular Section
# Another Title (should become H2)
More content with [links](#).

### Subsection
Content here.`;
    const result = scriptModule.sanitizeMarkdownContent(input);
    expect(result).toContain("# Main Title");
    expect(result).toContain("## Another Title (should become H2)");
    expect(result).toContain("## Regular Section");
    expect(result).toContain("### Subsection");
  });

  it("should handle headings with special characters", () => {
    const input = `# Title [H1]
# Another Title: Subtitle
# Title with {brackets}`;
    const result = scriptModule.sanitizeMarkdownContent(input);
    expect(result).toContain("# Title [H1]");
    expect(result).toContain("## Another Title: Subtitle");
    // Note: the curly braces are stripped by the sanitizer's brace-removal
    // passes, so only the surrounding text is matched here.
    expect(result).toMatch(/## Title with.*brackets/);
  });

  it("should not affect code blocks with # symbols", () => {
    const input = `# Title
\`\`\`bash
# This is a comment in code
echo "# Not a heading"
\`\`\`
# Second Title`;
    const result = scriptModule.sanitizeMarkdownContent(input);
    expect(result).toContain("# Title");
    expect(result).toContain("## Second Title");
    // Content inside the fenced block must come back verbatim.
    expect(result).toContain("# This is a comment in code");
    expect(result).toContain('echo "# Not a heading"');
  });
});
});
});
85 changes: 74 additions & 11 deletions scripts/notion-fetch/contentSanitizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,63 @@ const isEmojiStyleObject = (snippet: string): boolean =>
const isEmojiImgTag = (snippet: string): boolean =>
snippet.includes('className="emoji"');

/**
 * Fixes heading hierarchy issues from Notion exports to ensure proper TOC generation.
 * - Keeps only the first H1 (page title)
 * - Converts subsequent H1s to H2s
 * - Removes empty headings (the line is blanked, not deleted, so line count is stable)
 *
 * A line only counts as a heading when its 1-6 leading hashes are followed by
 * whitespace or by the end of the line, so text such as `#tag` is left alone.
 * A trailing carriage return (CRLF input) is ignored while matching and
 * re-attached to whatever is returned for that line.
 *
 * @param content - The markdown content string with code blocks already masked
 * @param codeBlockPlaceholders - Placeholder strings standing in for masked code
 *   blocks; any line containing one of them is returned untouched
 * @returns Content with fixed heading hierarchy
 */
function fixHeadingHierarchy(
  content: string,
  codeBlockPlaceholders: string[]
): string {
  // 1-6 hashes, then either nothing or whitespace plus optional heading text.
  const headingPattern = /^(#{1,6})(?:\s+(.*))?$/;
  let firstH1Found = false;

  const fixedLines = content.split("\n").map((rawLine) => {
    // Set the CR of a CRLF line ending aside: `.` never matches "\r", so
    // leaving it in place would stop the heading pattern from matching.
    const eol = rawLine.endsWith("\r") ? "\r" : "";
    const line = eol ? rawLine.slice(0, -1) : rawLine;

    // Skip lines that are code block placeholders
    if (
      codeBlockPlaceholders.some((placeholder) => line.includes(placeholder))
    ) {
      return rawLine;
    }

    const headingMatch = headingPattern.exec(line);
    if (!headingMatch) return rawLine;

    const hashes = headingMatch[1] ?? "";
    const trimmedText = (headingMatch[2] ?? "").trim();

    // Remove empty headings (e.g., "# " or "#" with no content)
    if (trimmedText === "") {
      return eol;
    }

    // Only H1 needs attention; every other level passes through unchanged.
    if (hashes.length !== 1) {
      return rawLine;
    }

    if (!firstH1Found) {
      // Keep the first H1 as the page title
      firstH1Found = true;
      return rawLine;
    }

    // Convert subsequent H1s to H2s
    return `## ${trimmedText}${eol}`;
  });

  return fixedLines.join("\n");
}

/**
* Sanitizes markdown content to fix malformed HTML/JSX tags that cause MDX compilation errors
* @param content - The markdown content string
Expand All @@ -19,20 +76,26 @@ const isEmojiImgTag = (snippet: string): boolean =>
export function sanitizeMarkdownContent(content: string): string {
// Fix specific malformed patterns that cause MDX errors

// 0. Remove invalid curly brace expressions while preserving code fences and inline code
// Mask code fences (```...```) and inline code (`...`) to avoid altering them
// 0. Mask code fences (```...```) and inline code (`...`) to avoid altering them
const codeBlocks: string[] = [];
const codeSpans: string[] = [];
const codeBlockPlaceholders: string[] = [];

content = content.replace(/```[\s\S]*?```/g, (m) => {
codeBlocks.push(m);
return `__CODEBLOCK_${codeBlocks.length - 1}__`;
const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`;
codeBlockPlaceholders.push(placeholder);
return placeholder;
});
content = content.replace(/`[^`\n]*`/g, (m) => {
codeSpans.push(m);
return `__CODESPAN_${codeSpans.length - 1}__`;
});

// Aggressively strip all curly-brace expressions by unwrapping to inner text
// 1. Fix heading hierarchy for proper TOC generation (after masking code blocks)
content = fixHeadingHierarchy(content, codeBlockPlaceholders);

// 2. Aggressively strip all curly-brace expressions by unwrapping to inner text
// BUT preserve JSX style objects for emoji images
// Run a few passes to handle simple nesting like {{text}}
for (let i = 0; i < 5 && /\{[^{}]*\}/.test(content); i++) {
Expand All @@ -41,19 +104,19 @@ export function sanitizeMarkdownContent(content: string): string {
);
}

// 1. Fix malformed <link to section.> patterns (the main issue from the error)
// 3. Fix malformed <link to section.> patterns (the main issue from the error)
content = content.replace(
/<link\s+to\s+section\.?>/gi,
"[link to section](#section)"
);

// 2. Fix other malformed <link> tags with invalid attributes (spaces, dots in attr names)
// 4. Fix other malformed <link> tags with invalid attributes (spaces, dots in attr names)
content = content.replace(/<link\s+[^>]*[^\w\s"=-][^>]*>/g, "[link](#)");

// 3. Fix malformed <Link> tags with invalid attributes
// 5. Fix malformed <Link> tags with invalid attributes
content = content.replace(/<Link\s+[^>]*[^\w\s"=-][^>]*>/g, "[Link](#)");

// 4. Fix general malformed tags with dots or spaces in attribute names
// 6. Fix general malformed tags with dots or spaces in attribute names
// This catches patterns like <tag attr.name> or <tag attr value> (without quotes)
// BUT exclude emoji img tags which are valid HTML
content = content.replace(
Expand All @@ -74,7 +137,7 @@ export function sanitizeMarkdownContent(content: string): string {
}
);

// 5. Fix unquoted attribute values in JSX (e.g., <tag attr value> -> <tag attr="value">)
// 7. Fix unquoted attribute values in JSX (e.g., <tag attr value> -> <tag attr="value">)
// BUT exclude emoji img tags which are valid HTML
content = content.replace(
/<([a-zA-Z][a-zA-Z0-9]*)\s+([a-zA-Z_][a-zA-Z0-9_]*)\s+([^>\s"=]+)(\s|>)/g,
Expand All @@ -84,7 +147,7 @@ export function sanitizeMarkdownContent(content: string): string {
: `<${tagName} ${attrName}="${attrValue}"${suffix}`
);

// 6. Final hard cleanup: strip any remaining { ... } to avoid MDX/Acorn errors
// 8. Final hard cleanup: strip any remaining { ... } to avoid MDX/Acorn errors
// BUT preserve JSX style objects for emoji images
// Run a few passes to handle simple nesting like {{text}}.
for (let i = 0; i < 3 && /\{[^{}]*\}/.test(content); i++) {
Expand All @@ -93,7 +156,7 @@ export function sanitizeMarkdownContent(content: string): string {
);
}

// 7. Restore masked code blocks and inline code
// 9. Restore masked code blocks and inline code
content = content.replace(
/__CODEBLOCK_(\d+)__/g,
(_m, i) => codeBlocks[Number(i)]
Expand Down
1 change: 0 additions & 1 deletion static/images/.gitkeep

This file was deleted.

Loading