Fix hyperlinked images not rendering in markdown (#98)

luandro · web-flow · commit 7a63eb44388b · 2025-11-27T17:17:45.000-03:00
* fix(notion-fetch): preserve hyperlinks on images from Notion

  Images that are hyperlinked in Notion were losing their links during the markdown conversion process. This fix adds:

  1. Custom image transformer in notionClient.ts that detects hyperlinks on Notion image blocks (from caption rich_text or link property) and wraps images in markdown link syntax: [![alt](img-url)](link-url)
  2. Enhanced image extraction regex in imageReplacer.ts to handle both:
     - Regular images: ![alt](url)
     - Hyperlinked images: [![alt](img-url)](link-url)
  3. Updated image replacement logic to preserve hyperlink wrappers when replacing Notion image URLs with local paths
  4. Comprehensive tests for hyperlinked image handling

  Fixes #96

* fix(notion-fetch): improve hyperlink detection in image captions

  Based on research into Notion's linking behavior, hyperlinks on images are stored in the caption as link annotations. Updated the image transformer to:

  1. Check caption rich_text items for link annotations first
  2. Extract the link URL from the first linked caption item
  3. Use non-linked caption text as alt text
  4. Add fallback checks for image.link and block-level link properties
  5. Add debug logging to help diagnose hyperlink detection issues

  This should correctly detect and preserve image hyperlinks from Notion.

  Related to #96

* feat(notion-fetch): add workaround for image hyperlinks via captions

  Since Notion's "Add link" menu option doesn't expose hyperlinks via the API, implement a workaround that detects URLs in image captions:

  Method 1: Link annotations in caption rich_text
  - Detects when URLs are formatted as links in captions
  - Uses the link.url property from the rich_text annotation

  Method 2: Plain text URL detection (NEW)
  - Detects URLs typed as plain text in captions
  - Uses regex to extract http(s) URLs: /https?:\/\/[^\s]+/
  - Separates URL from alt text

  Methods 3 & 4: Future-proofing
  - Checks for image.link and block.link properties
  - Will work if Notion adds API support in the future

  Users can now make images clickable by:
  1. Typing a URL in the image caption (preferred workaround)
  2. Pasting a link in the caption (Notion converts it automatically)

  Note: The Notion UI "Add link" feature is not supported by the API. Images using that feature will not have clickable links in the output.

  Related to #96

* refactor(notion-fetch): clean up console logging in image transformer

  - Wrap console.log in IS_TEST_ENV check for cleaner test output
  - Use consistent ✓ prefix for success messages
  - Shorten log message for brevity
diff --git a/scripts/notion-fetch/imageReplacer.test.ts b/scripts/notion-fetch/imageReplacer.test.ts
@@ -159,6 +159,63 @@ Some text
         "![alt text](https://example.com/image.png)"
       );
     });
+
+    // A link-wrapped image yields one match that spans the whole wrapper and
+    // carries the outer link target in linkUrl.
+    it("should extract hyperlinked images", () => {
+      const source =
+        "[![alt text](https://example.com/image.png)](https://example.com/link)";
+      const found = extractImageMatches(source);
+
+      expect(found).toHaveLength(1);
+      const [only] = found;
+      expect(only).toMatchObject({
+        alt: "alt text",
+        url: "https://example.com/image.png",
+        linkUrl: "https://example.com/link",
+        idx: 0,
+      });
+      expect(only.full).toBe(source);
+    });
+
+    // Plain and link-wrapped images in one document come back in source order;
+    // only the wrapped one has a linkUrl.
+    it("should extract both regular and hyperlinked images", () => {
+      const source = [
+        "",
+        "![regular](https://example.com/regular.png)",
+        "[![hyperlinked](https://example.com/linked.png)](https://example.com)",
+        "      ",
+      ].join("\n");
+      const found = extractImageMatches(source);
+
+      expect(found).toHaveLength(2);
+      const [plain, wrapped] = found;
+
+      expect(plain.alt).toBe("regular");
+      expect(plain.url).toBe("https://example.com/regular.png");
+      expect(plain.linkUrl).toBeUndefined();
+
+      expect(wrapped.alt).toBe("hyperlinked");
+      expect(wrapped.url).toBe("https://example.com/linked.png");
+      expect(wrapped.linkUrl).toBe("https://example.com");
+    });
+
+    // "\)" inside either URL is treated as a literal ")" and unescaped in the
+    // extracted values.
+    it("should handle hyperlinked images with escaped parentheses", () => {
+      const source =
+        "[![alt](https://example.com/image\\).png)](https://link.com/page\\))";
+      const found = extractImageMatches(source);
+
+      expect(found).toHaveLength(1);
+      const { url, linkUrl } = found[0];
+      expect(url).toBe("https://example.com/image).png");
+      expect(linkUrl).toBe("https://link.com/page)");
+    });
+
+    // An empty alt text must not prevent the wrapper from being recognised.
+    it("should handle hyperlinked images with empty alt text", () => {
+      const source =
+        "[![](https://example.com/image.png)](https://example.com/link)";
+      const found = extractImageMatches(source);
+
+      expect(found).toHaveLength(1);
+      const { alt, linkUrl } = found[0];
+      expect(alt).toBe("");
+      expect(linkUrl).toBe("https://example.com/link");
+    });
   });
 
   describe("processAndReplaceImages", () => {
@@ -385,5 +442,51 @@ Some text after
       expect(result.metrics).toHaveProperty("skippedResize");
       expect(result.metrics).toHaveProperty("fullyProcessed");
     });
+
+    // Only the inner image URL is rewritten; the surrounding link wrapper and
+    // its target stay untouched.
+    it("should preserve hyperlinks when replacing image URLs", async () => {
+      const source =
+        "[![alt text](https://example.com/image.png)](https://example.com/link)";
+      const { markdown: output, stats } = await processAndReplaceImages(
+        source,
+        "test-file"
+      );
+
+      expect(output).toContain("/images/downloaded-image.png");
+      expect(output).toContain("https://example.com/link");
+      expect(output).toBe(
+        "[![alt text](/images/downloaded-image.png)](https://example.com/link)"
+      );
+      expect(stats.successfulImages).toBe(1);
+    });
+
+    // Each link-wrapped image is rewritten independently and keeps its own link.
+    it("should handle multiple hyperlinked images", async () => {
+      const source = [
+        "",
+        "[![img1](https://example.com/1.png)](https://link1.com)",
+        "[![img2](https://example.com/2.png)](https://link2.com)",
+        "      ",
+      ].join("\n");
+      const { markdown: output, stats } = await processAndReplaceImages(
+        source,
+        "test-file"
+      );
+
+      expect(stats.successfulImages).toBe(2);
+      expect(output).toContain(
+        "[![img1](/images/downloaded-1.png)](https://link1.com)"
+      );
+      expect(output).toContain(
+        "[![img2](/images/downloaded-2.png)](https://link2.com)"
+      );
+    });
+
+    // Plain and link-wrapped images can be mixed in the same document.
+    it("should handle mix of regular and hyperlinked images", async () => {
+      const source = [
+        "",
+        "![regular](https://example.com/regular.png)",
+        "[![linked](https://example.com/linked.png)](https://example.com)",
+        "      ",
+      ].join("\n");
+      const { markdown: output, stats } = await processAndReplaceImages(
+        source,
+        "test-file"
+      );
+
+      expect(stats.successfulImages).toBe(2);
+      expect(output).toContain("![regular](/images/downloaded-regular.png)");
+      expect(output).toContain(
+        "[![linked](/images/downloaded-linked.png)](https://example.com)"
+      );
+    });
   });
 });
diff --git a/scripts/notion-fetch/imageReplacer.ts b/scripts/notion-fetch/imageReplacer.ts
@@ -29,7 +29,7 @@ import { ProgressTracker } from "./progressTracker";
  * Image match information extracted from markdown
  */
 export interface ImageMatch {
-  /** Full markdown image syntax */
+  /** Full markdown image syntax (including link wrapper if present) */
   full: string;
   /** Image URL */
   url: string;
@@ -41,6 +41,8 @@ export interface ImageMatch {
   start: number;
   /** End position in source markdown */
   end: number;
+  /** Hyperlink URL if image is wrapped in a link */
+  linkUrl?: string;
 }
 
 /**
@@ -76,22 +78,60 @@ const MAX_CONCURRENT_IMAGES = 5;
 /**
  * Extracts all image matches from markdown content
  *
- * Uses an improved regex pattern that:
- * - Matches until ')' not preceded by '\'
- * - Allows spaces (trimmed)
- * - Handles escaped parentheses in URLs
+ * Handles both regular images and hyperlinked images:
+ * - Regular: ![alt](url)
+ * - Hyperlinked: [![alt](img-url)](link-url)
+ *
+ * Uses improved regex patterns that:
+ * - Match until ')' not preceded by '\'
+ * - Allow spaces (trimmed)
+ * - Handle escaped parentheses in URLs
  *
  * @param sourceMarkdown - Source markdown content
  * @returns Array of image matches with position information
  */
 export function extractImageMatches(sourceMarkdown: string): ImageMatch[] {
-  // Improved URL pattern: match until a ')' not preceded by '\', allow spaces trimmed
-  const imgRegex = /!\[([^\]]*)\]\(\s*((?:\\\)|[^)])+?)\s*\)/g;
   const imageMatches: ImageMatch[] = [];
-  let m: RegExpExecArray | null;
   let tmpIndex = 0;
   let safetyCounter = 0;
 
+  // First, extract hyperlinked images: [![alt](img-url)](link-url)
+  const hyperlinkedImgRegex =
+    /\[!\[([^\]]*)\]\(\s*((?:\\\)|[^)])+?)\s*\)\]\(\s*((?:\\\)|[^)])+?)\s*\)/g;
+  let m: RegExpExecArray | null;
+
+  while ((m = hyperlinkedImgRegex.exec(sourceMarkdown)) !== null) {
+    if (++safetyCounter > SAFETY_LIMIT) {
+      console.warn(
+        chalk.yellow(
+          `⚠️  Image match limit (${SAFETY_LIMIT}) reached; skipping remaining.`
+        )
+      );
+      break;
+    }
+    const start = m.index;
+    const full = m[0];
+    const end = start + full.length;
+    const rawImgUrl = m[2];
+    const rawLinkUrl = m[3];
+    const unescapedImgUrl = rawImgUrl.replace(/\\\)/g, ")");
+    const unescapedLinkUrl = rawLinkUrl.replace(/\\\)/g, ")");
+
+    imageMatches.push({
+      full,
+      url: unescapedImgUrl,
+      alt: m[1],
+      idx: tmpIndex++,
+      start,
+      end,
+      linkUrl: unescapedLinkUrl,
+    });
+  }
+
+  // Then, extract regular images: ![alt](url)
+  // But skip positions already matched by hyperlinked images
+  const imgRegex = /!\[([^\]]*)\]\(\s*((?:\\\)|[^)])+?)\s*\)/g;
+
   while ((m = imgRegex.exec(sourceMarkdown)) !== null) {
     if (++safetyCounter > SAFETY_LIMIT) {
       console.warn(
@@ -101,11 +141,23 @@ export function extractImageMatches(sourceMarkdown: string): ImageMatch[] {
       );
       break;
     }
+
     const start = m.index;
     const full = m[0];
     const end = start + full.length;
+
+    // Skip if this position overlaps with a hyperlinked image
+    const overlaps = imageMatches.some(
+      (existing) => start >= existing.start && start < existing.end
+    );
+
+    if (overlaps) {
+      continue;
+    }
+
     const rawUrl = m[2];
     const unescapedUrl = rawUrl.replace(/\\\)/g, ")");
+
     imageMatches.push({
       full,
       url: unescapedUrl,
@@ -116,6 +168,14 @@ export function extractImageMatches(sourceMarkdown: string): ImageMatch[] {
     });
   }
 
+  // Sort by start position to maintain order
+  imageMatches.sort((a, b) => a.start - b.start);
+
+  // Reassign indices after sorting
+  imageMatches.forEach((match, index) => {
+    match.idx = index;
+  });
+
   return imageMatches;
 }
 
@@ -296,6 +356,9 @@ export async function processAndReplaceImages(
 
     let replacementText: string;
     if (processResult.success && processResult.newPath) {
+      // Replace the image URL with the new local path
+      // This preserves the hyperlink wrapper if present, as match.full
+      // contains the complete markdown syntax: [![alt](url)](link) or ![alt](url)
       replacementText = match.full.replace(
         processResult.imageUrl!,
         processResult.newPath
diff --git a/scripts/notionClient.ts b/scripts/notionClient.ts
@@ -330,6 +330,110 @@ const paragraphTransformer: BlockToMarkdown = async (block) => {
 
 n2m.setCustomTransformer("paragraph", paragraphTransformer);
 
+/**
+ * Custom image transformer that preserves hyperlinks from Notion.
+ *
+ * Emits `![alt](img-url)` for a plain image and
+ * `[![alt](img-url)](link-url)` when a link target is found. The link target
+ * is taken from the first of these sources that yields one:
+ * 1. a caption rich_text item of type "text" carrying `text.link.url`;
+ * 2. an http(s) URL typed as plain text in the caption;
+ * 3. a `link` property on the image object;
+ * 4. a `link` property on the block itself.
+ * Sources 3 and 4 are future-proofing only: Notion's "Add link" feature does
+ * not expose links via the API, so captions are the working mechanism.
+ *
+ * Alt text and URLs are sanitised before being interpolated so that unusual
+ * captions or URLs cannot break the generated markdown: square brackets in
+ * the alt text become character references and ")" in a URL is written as
+ * "\)", the escape that extractImageMatches in imageReplacer.ts reverses.
+ *
+ * @param block - Block passed in by the markdown converter
+ * @returns Image markdown, or "" when the block is not an image or has no URL
+ */
+const imageTransformer: BlockToMarkdown = async (block) => {
+  const imageBlock = block as any;
+
+  if (imageBlock?.type !== "image") {
+    return "";
+  }
+
+  const image = imageBlock.image;
+  if (!image) {
+    return "";
+  }
+
+  // Get image URL from external or file
+  const imageUrl = image.external?.url || image.file?.url || image.url || "";
+  if (!imageUrl) {
+    return "";
+  }
+
+  // Success messages are suppressed under test to keep test output clean.
+  const logFound = (message: string): void => {
+    if (!IS_TEST_ENV) {
+      console.log(chalk.green(message));
+    }
+  };
+
+  // Alt text sits between "[" and "]": collapse whitespace (captions may hold
+  // line breaks) and encode square brackets so they cannot close the label.
+  const toAltText = (raw: string): string =>
+    raw
+      .replace(/\s+/g, " ")
+      .trim()
+      .replace(/\[/g, "&#91;")
+      .replace(/\]/g, "&#93;");
+
+  // A link destination ends at the first unescaped ")" and cannot contain raw
+  // whitespace. Only ")" is backslash-escaped because that is the one escape
+  // the image extraction regexes understand.
+  const toDestination = (raw: string): string =>
+    raw.trim().replace(/\s/g, "%20").replace(/\)/g, "\\)");
+
+  // Drops sentence punctuation, and a closing ")" that has no matching "(",
+  // from the end of a URL found in free text ("see https://example.com.").
+  const stripTrailingPunctuation = (url: string): string => {
+    let current = url;
+    for (;;) {
+      let next = current.replace(/[.,;:!?'"]+$/, "");
+      const opens = next.split("(").length - 1;
+      const closes = next.split(")").length - 1;
+      if (next.endsWith(")") && closes > opens) {
+        next = next.slice(0, -1);
+      }
+      if (next === current) {
+        return current;
+      }
+      current = next;
+    }
+  };
+
+  // Accepts either a bare URL string or an object exposing a string `url`;
+  // anything else is ignored rather than stringified into the output.
+  const readLink = (value: unknown): string => {
+    if (typeof value === "string") {
+      return value;
+    }
+    const nested = (value as { url?: unknown } | null | undefined)?.url;
+    return typeof nested === "string" ? nested : "";
+  };
+
+  // WORKAROUND: Since Notion's "Add link" feature doesn't expose links via the API,
+  // we detect URLs in captions as an alternative approach
+  let linkUrl = "";
+  let altText = "";
+
+  const captionItems: any[] = Array.isArray(image.caption) ? image.caption : [];
+
+  // Method 1: first caption item that is formatted as a link
+  const linkedIndex = captionItems.findIndex(
+    (item) => item?.type === "text" && item.text?.link?.url
+  );
+
+  if (linkedIndex !== -1) {
+    linkUrl = captionItems[linkedIndex].text.link.url;
+    logFound(`✓ Found link in caption: ${linkUrl}`);
+    // The linked item only supplies the destination; every other caption item
+    // (before or after it) contributes to the alt text.
+    altText = captionItems
+      .filter((_, index) => index !== linkedIndex)
+      .map((item) => item?.plain_text || "")
+      .join("");
+  } else {
+    // Method 2: URL typed as plain text somewhere in the caption (fallback)
+    const fullCaption = captionItems
+      .map((item) => item?.plain_text || "")
+      .join("");
+    const urlMatch = fullCaption.match(/https?:\/\/[^\s]+/);
+
+    if (urlMatch) {
+      linkUrl = stripTrailingPunctuation(urlMatch[0]);
+      logFound(`✓ Found plain text URL in caption: ${linkUrl}`);
+      // Remove the raw match (including any punctuation stripped from the
+      // link) so the alt text is just the remaining caption.
+      altText = fullCaption.replace(urlMatch[0], "");
+    } else {
+      // No URL found, use full caption as alt text
+      altText = fullCaption;
+    }
+  }
+
+  // Method 3: Check for dedicated link property on the image object (API support if added)
+  if (!linkUrl) {
+    const imageLevelLink = readLink(image.link);
+    if (imageLevelLink) {
+      linkUrl = imageLevelLink;
+      logFound(`✓ Found image link property: ${linkUrl}`);
+    }
+  }
+
+  // Method 4: Check for link on the block level (API support if added)
+  if (!linkUrl) {
+    const blockLevelLink = readLink(imageBlock.link);
+    if (blockLevelLink) {
+      linkUrl = blockLevelLink;
+      logFound(`✓ Found block-level link: ${linkUrl}`);
+    }
+  }
+
+  // Generate markdown
+  const imageMarkdown = `![${toAltText(altText)}](${toDestination(imageUrl)})`;
+
+  // Without a usable destination the image is emitted unwrapped.
+  const linkDestination = toDestination(linkUrl);
+  if (!linkDestination) {
+    return imageMarkdown as MarkdownBlock;
+  }
+
+  logFound(`✓ Creating hyperlinked image: ${linkUrl}`);
+  return `[${imageMarkdown}](${linkDestination})` as MarkdownBlock;
+};
+
+n2m.setCustomTransformer("image", imageTransformer);
+
 export const DATABASE_ID = resolvedDatabaseId;
 
 // For v5 API compatibility - export data source ID