Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions scripts/notion-fetch/imageReplacer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,63 @@ Some text
"![alt text](https://example.com/image.png)"
);
});

it("should extract hyperlinked images", () => {
  // A linked image is reported as ONE match spanning the whole
  // `[![alt](img)](link)` construct, with the link target in `linkUrl`.
  const source =
    "[![alt text](https://example.com/image.png)](https://example.com/link)";

  const found = extractImageMatches(source);
  expect(found).toHaveLength(1);

  const [onlyMatch] = found;
  expect(onlyMatch).toMatchObject({
    idx: 0,
    alt: "alt text",
    url: "https://example.com/image.png",
    linkUrl: "https://example.com/link",
  });
  // `full` must include the link wrapper, i.e. equal the entire input.
  expect(onlyMatch.full).toBe(
    "[![alt text](https://example.com/image.png)](https://example.com/link)"
  );
});

it("should extract both regular and hyperlinked images", () => {
  // One plain image followed by one linked image, each on its own line.
  const source = `
![regular](https://example.com/regular.png)
[![hyperlinked](https://example.com/linked.png)](https://example.com)
`;

  const found = extractImageMatches(source);
  expect(found).toHaveLength(2);

  // Matches are expected in document order: plain image first.
  const [plainImage, linkedImage] = found;

  // The plain image carries no link target.
  expect(plainImage.alt).toBe("regular");
  expect(plainImage.url).toBe("https://example.com/regular.png");
  expect(plainImage.linkUrl).toBeUndefined();

  // The linked image exposes both the image URL and the wrapping link URL.
  expect(linkedImage.alt).toBe("hyperlinked");
  expect(linkedImage.url).toBe("https://example.com/linked.png");
  expect(linkedImage.linkUrl).toBe("https://example.com");
});

it("should handle hyperlinked images with escaped parentheses", () => {
  // Both the image URL and the link URL contain an escaped `\)`;
  // the extracted values must have the backslash removed.
  const source =
    "[![alt](https://example.com/image\\).png)](https://link.com/page\\))";

  const found = extractImageMatches(source);
  expect(found).toHaveLength(1);

  const [onlyMatch] = found;
  expect(onlyMatch.url).toBe("https://example.com/image).png");
  expect(onlyMatch.linkUrl).toBe("https://link.com/page)");
});

it("should handle hyperlinked images with empty alt text", () => {
  // An empty `![]` alt must still be recognised as a linked image.
  const source =
    "[![](https://example.com/image.png)](https://example.com/link)";

  const found = extractImageMatches(source);
  expect(found).toHaveLength(1);

  const [onlyMatch] = found;
  expect(onlyMatch.alt).toBe("");
  expect(onlyMatch.linkUrl).toBe("https://example.com/link");
});
});

describe("processAndReplaceImages", () => {
Expand Down Expand Up @@ -385,5 +442,51 @@ Some text after
expect(result.metrics).toHaveProperty("skippedResize");
expect(result.metrics).toHaveProperty("fullyProcessed");
});

it("should preserve hyperlinks when replacing image URLs", async () => {
  const source =
    "[![alt text](https://example.com/image.png)](https://example.com/link)";

  const { markdown: rewritten, stats } = await processAndReplaceImages(
    source,
    "test-file"
  );

  // Only the image URL is swapped for the local path; the surrounding
  // link wrapper and its target must survive untouched.
  expect(rewritten).toContain("/images/downloaded-image.png");
  expect(rewritten).toContain("https://example.com/link");
  expect(rewritten).toBe(
    "[![alt text](/images/downloaded-image.png)](https://example.com/link)"
  );
  expect(stats.successfulImages).toBe(1);
});

it("should handle multiple hyperlinked images", async () => {
  // Two linked images on consecutive lines, each with its own link target.
  const source = `
[![img1](https://example.com/1.png)](https://link1.com)
[![img2](https://example.com/2.png)](https://link2.com)
`;

  const { markdown: rewritten, stats } = await processAndReplaceImages(
    source,
    "test-file"
  );

  expect(stats.successfulImages).toBe(2);

  // Each image is rewritten independently and keeps its own link wrapper.
  const expectedSnippets = [
    "[![img1](/images/downloaded-1.png)](https://link1.com)",
    "[![img2](/images/downloaded-2.png)](https://link2.com)",
  ];
  for (const snippet of expectedSnippets) {
    expect(rewritten).toContain(snippet);
  }
});

it("should handle mix of regular and hyperlinked images", async () => {
  // A plain image and a linked image in the same document.
  const source = `
![regular](https://example.com/regular.png)
[![linked](https://example.com/linked.png)](https://example.com)
`;

  const { markdown: rewritten, stats } = await processAndReplaceImages(
    source,
    "test-file"
  );

  expect(stats.successfulImages).toBe(2);

  // The plain image stays plain; the linked image keeps its wrapper.
  const expectedSnippets = [
    "![regular](/images/downloaded-regular.png)",
    "[![linked](/images/downloaded-linked.png)](https://example.com)",
  ];
  for (const snippet of expectedSnippets) {
    expect(rewritten).toContain(snippet);
  }
});
});
});
79 changes: 71 additions & 8 deletions scripts/notion-fetch/imageReplacer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import { ProgressTracker } from "./progressTracker";
* Image match information extracted from markdown
*/
export interface ImageMatch {
/** Full markdown image syntax */
/** Full markdown image syntax (including link wrapper if present) */
full: string;
/** Image URL */
url: string;
Expand All @@ -41,6 +41,8 @@ export interface ImageMatch {
start: number;
/** End position in source markdown */
end: number;
/** Hyperlink URL if image is wrapped in a link */
linkUrl?: string;
}

/**
Expand Down Expand Up @@ -76,22 +78,60 @@ const MAX_CONCURRENT_IMAGES = 5;
/**
* Extracts all image matches from markdown content
*
* Uses an improved regex pattern that:
* - Matches until ')' not preceded by '\'
* - Allows spaces (trimmed)
* - Handles escaped parentheses in URLs
* Handles both regular images and hyperlinked images:
* - Regular: ![alt](url)
* - Hyperlinked: [![alt](img-url)](link-url)
*
* Uses improved regex patterns that:
* - Match until ')' not preceded by '\'
* - Allow spaces (trimmed)
* - Handle escaped parentheses in URLs
*
* @param sourceMarkdown - Source markdown content
* @returns Array of image matches with position information
*/
export function extractImageMatches(sourceMarkdown: string): ImageMatch[] {
// Improved URL pattern: match until a ')' not preceded by '\', allow spaces trimmed
const imgRegex = /!\[([^\]]*)\]\(\s*((?:\\\)|[^)])+?)\s*\)/g;
const imageMatches: ImageMatch[] = [];
let m: RegExpExecArray | null;
let tmpIndex = 0;
let safetyCounter = 0;

// First, extract hyperlinked images: [![alt](img-url)](link-url)
const hyperlinkedImgRegex =
/\[!\[([^\]]*)\]\(\s*((?:\\\)|[^)])+?)\s*\)\]\(\s*((?:\\\)|[^)])+?)\s*\)/g;
let m: RegExpExecArray | null;

while ((m = hyperlinkedImgRegex.exec(sourceMarkdown)) !== null) {
if (++safetyCounter > SAFETY_LIMIT) {
console.warn(
chalk.yellow(
`⚠️ Image match limit (${SAFETY_LIMIT}) reached; skipping remaining.`
)
);
break;
}
const start = m.index;
const full = m[0];
const end = start + full.length;
const rawImgUrl = m[2];
const rawLinkUrl = m[3];
const unescapedImgUrl = rawImgUrl.replace(/\\\)/g, ")");
const unescapedLinkUrl = rawLinkUrl.replace(/\\\)/g, ")");

imageMatches.push({
full,
url: unescapedImgUrl,
alt: m[1],
idx: tmpIndex++,
start,
end,
linkUrl: unescapedLinkUrl,
});
}

// Then, extract regular images: ![alt](url)
// But skip positions already matched by hyperlinked images
const imgRegex = /!\[([^\]]*)\]\(\s*((?:\\\)|[^)])+?)\s*\)/g;

while ((m = imgRegex.exec(sourceMarkdown)) !== null) {
if (++safetyCounter > SAFETY_LIMIT) {
console.warn(
Expand All @@ -101,11 +141,23 @@ export function extractImageMatches(sourceMarkdown: string): ImageMatch[] {
);
break;
}

const start = m.index;
const full = m[0];
const end = start + full.length;

// Skip if this position overlaps with a hyperlinked image
const overlaps = imageMatches.some(
(existing) => start >= existing.start && start < existing.end
);

if (overlaps) {
continue;
}

const rawUrl = m[2];
const unescapedUrl = rawUrl.replace(/\\\)/g, ")");

imageMatches.push({
full,
url: unescapedUrl,
Expand All @@ -116,6 +168,14 @@ export function extractImageMatches(sourceMarkdown: string): ImageMatch[] {
});
}

// Sort by start position to maintain order
imageMatches.sort((a, b) => a.start - b.start);

// Reassign indices after sorting
imageMatches.forEach((match, index) => {
match.idx = index;
});

return imageMatches;
}

Expand Down Expand Up @@ -296,6 +356,9 @@ export async function processAndReplaceImages(

let replacementText: string;
if (processResult.success && processResult.newPath) {
// Replace the image URL with the new local path
// This preserves the hyperlink wrapper if present, as match.full
// contains the complete markdown syntax: [![alt](url)](link) or ![alt](url)
replacementText = match.full.replace(
processResult.imageUrl!,
processResult.newPath
Expand Down
104 changes: 104 additions & 0 deletions scripts/notionClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,110 @@ const paragraphTransformer: BlockToMarkdown = async (block) => {

n2m.setCustomTransformer("paragraph", paragraphTransformer);

/**
 * Escapes a URL for use as a markdown link/image destination.
 *
 * An unescaped `)` would terminate the `(...)` destination early, so every
 * closing parenthesis is emitted as `\)`.
 */
function escapeImageMarkdownUrl(url: string): string {
  // The optional leading backslash is consumed so an existing `\)` is not doubled.
  return url.replace(/\\?\)/g, "\\)");
}

/**
 * Makes caption text safe to place between `![` and `]`.
 *
 * Line breaks are collapsed to a single space and square brackets are
 * backslash-escaped so they cannot close the alt text prematurely.
 */
function escapeImageAltText(text: string): string {
  return text
    .replace(/\s*[\r\n]+\s*/g, " ")
    .replace(/\\?([\[\]])/g, "\\$1");
}

/**
 * Trims characters that a greedy "until whitespace" URL match picks up from
 * the surrounding sentence: trailing `?!.,:*_~` and any trailing `)` that has
 * no matching `(` inside the URL (same rule GFM uses for bare autolinks).
 */
function trimCaptionUrl(rawUrl: string): string {
  let url = rawUrl;
  for (;;) {
    if (/[?!.,:*_~]$/.test(url)) {
      url = url.slice(0, -1);
      continue;
    }
    if (url.endsWith(")")) {
      const opening = url.split("(").length - 1;
      const closing = url.split(")").length - 1;
      if (closing > opening) {
        url = url.slice(0, -1);
        continue;
      }
    }
    return url;
  }
}

/**
 * Custom image transformer that preserves hyperlinks from Notion.
 * When an image has a hyperlink in Notion, this transformer wraps the
 * markdown image syntax with a link: [![alt](img-url)](link-url)
 *
 * The link target is looked up in this order (first hit wins):
 * 1. a caption rich-text item that carries a link annotation,
 * 2. a plain-text http(s) URL typed into the caption,
 * 3. a string `link` property on the image object,
 * 4. a string `link` property on the block itself.
 *
 * Alt text and URLs are escaped so captions containing brackets or line
 * breaks, and URLs containing `)`, still produce well-formed markdown.
 *
 * @param block - Notion block; anything that is not an image block with a
 *   usable URL yields an empty string.
 * @returns `![alt](url)`, or `[![alt](url)](link)` when a link was found.
 */
const imageTransformer: BlockToMarkdown = async (block) => {
  const imageBlock = block as any;

  if (imageBlock?.type !== "image") {
    return "";
  }

  const image = imageBlock.image;
  if (!image) {
    return "";
  }

  // Get image URL from external or file
  const imageUrl = image.external?.url || image.file?.url || image.url || "";

  if (typeof imageUrl !== "string" || !imageUrl) {
    return "";
  }

  // Check if image has a hyperlink
  // WORKAROUND: Since Notion's "Add link" feature doesn't expose links via the API,
  // we detect URLs in captions as an alternative approach
  let linkUrl = "";
  let altText = "";

  // Method 1: Check for links in caption rich_text (when URL is formatted as a link)
  if (image.caption && Array.isArray(image.caption)) {
    for (const captionItem of image.caption) {
      const annotatedUrl = captionItem.text?.link?.url;
      if (
        captionItem.type === "text" &&
        typeof annotatedUrl === "string" &&
        annotatedUrl
      ) {
        linkUrl = annotatedUrl;
        if (!IS_TEST_ENV) {
          console.log(chalk.green(`✓ Found link in caption: ${linkUrl}`));
        }
        // Don't use the linked text as alt text - it's the URL destination
        break;
      } else if (captionItem.plain_text) {
        // Use non-linked caption text as alt text
        altText += captionItem.plain_text || "";
      }
    }

    // Method 2: Check for plain text URLs in caption (fallback)
    // This catches cases where users type URLs without Notion converting them
    if (!linkUrl) {
      const fullCaption: string = image.caption
        .map((item: any) => item.plain_text || "")
        .join("");

      // Simple URL regex to detect http(s) URLs; the raw match may include
      // sentence punctuation, which trimCaptionUrl removes.
      const rawMatch = fullCaption.match(/https?:\/\/[^\s]+/);
      const detectedUrl = rawMatch ? trimCaptionUrl(rawMatch[0]) : "";

      // Require something after the scheme so "https://" alone is not a link.
      if (rawMatch && /^https?:\/\/[^\s]/.test(detectedUrl)) {
        linkUrl = detectedUrl;
        if (!IS_TEST_ENV) {
          console.log(
            chalk.green(`✓ Found plain text URL in caption: ${linkUrl}`)
          );
        }
        // Use the rest of the caption as alt text (the raw match is removed
        // so trimmed punctuation does not linger in the alt text)
        altText = fullCaption.replace(rawMatch[0], "").trim();
      } else {
        // No URL found, use full caption as alt text
        altText = fullCaption;
      }
    }
  }

  // Method 3: Check for dedicated link property on the image object (API support if added)
  // Only strings are accepted; an object here would otherwise render as "[object Object]".
  if (!linkUrl && typeof image.link === "string" && image.link) {
    linkUrl = image.link;
    if (!IS_TEST_ENV) {
      console.log(chalk.green(`✓ Found image link property: ${linkUrl}`));
    }
  }

  // Method 4: Check for link on the block level (API support if added)
  if (!linkUrl && typeof imageBlock.link === "string" && imageBlock.link) {
    linkUrl = imageBlock.link;
    if (!IS_TEST_ENV) {
      console.log(chalk.green(`✓ Found block-level link: ${linkUrl}`));
    }
  }

  // Generate markdown
  const imageMarkdown = `![${escapeImageAltText(altText)}](${escapeImageMarkdownUrl(imageUrl)})`;

  // If there's a hyperlink, wrap the image in a link
  if (linkUrl) {
    if (!IS_TEST_ENV) {
      console.log(chalk.green(`✓ Creating hyperlinked image: ${linkUrl}`));
    }
    return `[${imageMarkdown}](${escapeImageMarkdownUrl(linkUrl)})` as MarkdownBlock;
  }

  return imageMarkdown as MarkdownBlock;
};

n2m.setCustomTransformer("image", imageTransformer);

export const DATABASE_ID = resolvedDatabaseId;

// For v5 API compatibility - export data source ID
Expand Down