Skip to content

Commit f5dc6b9

Browse files
committed
fix(notion-fetch): add final pass validation for missed S3 URLs
Adds a safety net to catch and fix any AWS S3 URLs that persist in the final markdown (e.g. due to callout processing or regex misses), ensuring all expiring URLs are replaced with local images.
1 parent 0922aca commit f5dc6b9

File tree

4 files changed

+202
-2
lines changed

4 files changed

+202
-2
lines changed

IMAGE_URL_EXPIRATION_SPEC.md

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,48 @@ if (markdownString?.parent) {
351351
**Complexity**: MEDIUM (requires API integration for full refresh)
352352
**Risk**: LOW (detection/logging only)
353353

354-
### Phase 3: Monitoring and Metrics (LOW PRIORITY - OPTIONAL/FUTURE WORK)
354+
### Phase 3: Final Pass Safety Net (HIGH PRIORITY) ⭐
355+
356+
**Goal**: Catch and fix any S3 URLs that remain in the final markdown (e.g., re-introduced by callouts or missed by initial regex)
357+
358+
**Changes**:
359+
360+
1. **Add `validateAndFixRemainingImages` in `imageReplacer.ts`**:
361+
- Scans final markdown for any remaining `amazonaws.com` URLs
362+
- Uses specific regex to target S3 paths
363+
- Re-runs `processAndReplaceImages` if found
364+
- Logs warnings if they persist
365+
366+
2. **Call in `processSinglePage`**:
367+
- Run this check just before writing the file (after all other processing)
368+
369+
**Specific Code Changes**:
370+
371+
```typescript
372+
// In imageReplacer.ts
373+
export async function validateAndFixRemainingImages(markdown, safeFilename) {
374+
const s3Regex = /!\[.*?\]\((https:\/\/prod-files-secure\.s3\.[a-z0-9-]+\.amazonaws\.com\/[^\)]+)\)/;
375+
if (s3Regex.test(markdown)) {
376+
console.warn(`Found S3 URLs in final markdown...`);
377+
return processAndReplaceImages(markdown, safeFilename);
378+
}
379+
return markdown;
380+
}
381+
382+
// In generateBlocks.ts
383+
markdownString.parent = await validateAndFixRemainingImages(
384+
markdownString.parent,
385+
safeFilename
386+
);
387+
```
388+
389+
**Benefits**:
390+
391+
- ✅ Catch-all safety net for edge cases
392+
- ✅ Handles re-introduced URLs from callouts/emojis
393+
- ✅ Provides final guarantee before file write
394+
395+
### Phase 4: Monitoring and Metrics (LOW PRIORITY - OPTIONAL/FUTURE WORK)
355396

356397
**Status**: NOT IMPLEMENTED - Future enhancement
357398

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
2+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
3+
import { processAndReplaceImages, validateAndFixRemainingImages } from "../imageReplacer";
4+
import { processImageWithFallbacks } from "../imageProcessing";
5+
6+
// Mock dependencies
7+
vi.mock("../imageProcessing", () => ({
8+
processImageWithFallbacks: vi.fn(),
9+
createProcessingMetrics: vi.fn(() => ({})),
10+
logProcessingMetrics: vi.fn(),
11+
logImageFailure: vi.fn(),
12+
}));
13+
14+
vi.mock("../imageValidation", () => ({
15+
validateAndSanitizeImageUrl: vi.fn((url) => ({
16+
isValid: true,
17+
sanitizedUrl: url,
18+
})),
19+
createFallbackImageMarkdown: vi.fn((full, url) => full), // Fallback keeps original
20+
}));
21+
22+
vi.mock("../markdownTransform", () => ({
23+
sanitizeMarkdownImages: vi.fn((md) => md),
24+
}));
25+
26+
vi.mock("../timeoutUtils", () => ({
27+
processBatch: vi.fn(async (items, processor) => {
28+
// Simple pass-through for testing
29+
const results = [];
30+
for (const item of items) {
31+
results.push({ status: "fulfilled", value: await processor(item) });
32+
}
33+
return results;
34+
}),
35+
}));
36+
37+
describe("Final Pass Image Validation", () => {
38+
beforeEach(() => {
39+
vi.clearAllMocks();
40+
});
41+
42+
it("should detect and fix remaining S3 URLs", async () => {
43+
const s3Url = "https://prod-files-secure.s3.us-west-2.amazonaws.com/test-image.jpg";
44+
const markdown = `Here is an image: ![Alt](${s3Url})`;
45+
const safeFilename = "test-page";
46+
47+
// First pass simulation: mock failure (fallback used)
48+
// This simulates the state BEFORE the final pass
49+
// In the actual code, processAndReplaceImages returns the markdown.
50+
// If it failed, it returned markdown with the original URL.
51+
52+
// Now we test validateAndFixRemainingImages
53+
// We mock processImageWithFallbacks to SUCCEED this time
54+
const { processImageWithFallbacks } = await import("../imageProcessing");
55+
vi.mocked(processImageWithFallbacks).mockResolvedValue({
56+
success: true,
57+
newPath: "/images/fixed.jpg",
58+
savedBytes: 100,
59+
fallbackUsed: false,
60+
});
61+
62+
const result = await validateAndFixRemainingImages(markdown, safeFilename);
63+
64+
// Expect the URL to be replaced
65+
expect(result).toContain("/images/fixed.jpg");
66+
expect(result).not.toContain(s3Url);
67+
expect(processImageWithFallbacks).toHaveBeenCalled();
68+
});
69+
70+
it("should not modify markdown if no S3 URLs are present", async () => {
71+
const markdown = "Here is a local image: ![Alt](/images/local.jpg)";
72+
const safeFilename = "test-page";
73+
74+
const { processImageWithFallbacks } = await import("../imageProcessing");
75+
76+
const result = await validateAndFixRemainingImages(markdown, safeFilename);
77+
78+
expect(result).toBe(markdown);
79+
expect(processImageWithFallbacks).not.toHaveBeenCalled();
80+
});
81+
82+
it("should handle multiple S3 URLs", async () => {
83+
const s3Url1 = "https://prod-files-secure.s3.us-west-2.amazonaws.com/img1.jpg";
84+
const s3Url2 = "https://prod-files-secure.s3.us-west-2.amazonaws.com/img2.jpg";
85+
const markdown = `![1](${s3Url1}) and ![2](${s3Url2})`;
86+
const safeFilename = "test-page";
87+
88+
const { processImageWithFallbacks } = await import("../imageProcessing");
89+
vi.mocked(processImageWithFallbacks).mockImplementation(async (url) => ({
90+
success: true,
91+
newPath: url.includes("img1") ? "/images/1.jpg" : "/images/2.jpg",
92+
savedBytes: 100,
93+
fallbackUsed: false,
94+
}));
95+
96+
const result = await validateAndFixRemainingImages(markdown, safeFilename);
97+
98+
expect(result).toContain("/images/1.jpg");
99+
expect(result).toContain("/images/2.jpg");
100+
expect(result).not.toContain("amazonaws.com");
101+
});
102+
});

scripts/notion-fetch/generateBlocks.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ import { LRUCache, validateCacheSize } from "./cacheStrategies";
2828
import { getImageCache, logImageFailure } from "./imageProcessing";
2929
import { setTranslationString, getI18NPath } from "./translationManager";
3030
import { loadBlocksForPage, loadMarkdownForPage } from "./cacheLoaders";
31-
import { processAndReplaceImages } from "./imageReplacer";
31+
import {
32+
processAndReplaceImages,
33+
validateAndFixRemainingImages,
34+
} from "./imageReplacer";
3235
import {
3336
processToggleSection,
3437
processHeadingSection,
@@ -329,6 +332,13 @@ async function processSinglePage(
329332
console.log(chalk.blue(` ↳ Processed callouts in markdown content`));
330333
}
331334

335+
// ✅ FINAL PASS: Check for any AWS S3 URLs that might have been re-introduced
336+
// (e.g. by callout processing or missed in the first pass) and fix them.
337+
markdownString.parent = await validateAndFixRemainingImages(
338+
markdownString.parent,
339+
safeFilename
340+
);
341+
332342
// Sanitize content to fix malformed HTML/JSX tags
333343
markdownString.parent = sanitizeMarkdownContent(markdownString.parent);
334344

scripts/notion-fetch/imageReplacer.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,3 +361,50 @@ export async function processAndReplaceImages(
361361
metrics,
362362
};
363363
}
364+
365+
/**
366+
* Validates final markdown for remaining S3 URLs and attempts to fix them.
367+
* This acts as a safety net for images missed by the initial pass or re-introduced
368+
* by subsequent processing (e.g. callouts).
369+
*
370+
* @param markdown - The final markdown content to check
371+
* @param safeFilename - Safe filename for logging
372+
* @returns The processed markdown (potentially with fixes applied)
373+
*/
374+
export async function validateAndFixRemainingImages(
375+
markdown: string,
376+
safeFilename: string
377+
): Promise<string> {
378+
// Regex for AWS S3 URLs in markdown image syntax
379+
// Matches: ![alt](https://prod-files-secure.s3...amazonaws.com/...)
380+
const s3Regex =
381+
/!\[.*?\]\((https:\/\/prod-files-secure\.s3\.[a-z0-9-]+\.amazonaws\.com\/[^\)]+)\)/;
382+
383+
if (!s3Regex.test(markdown)) {
384+
return markdown;
385+
}
386+
387+
console.warn(
388+
chalk.yellow(
389+
`⚠️ Found AWS S3 URLs in final markdown for ${safeFilename}. Running final replacement pass...`
390+
)
391+
);
392+
393+
// Re-run processAndReplaceImages
394+
const result = await processAndReplaceImages(markdown, safeFilename);
395+
396+
// Check if any remain (indicating persistent failure)
397+
if (s3Regex.test(result.markdown)) {
398+
console.warn(
399+
chalk.red(
400+
`❌ Failed to replace all S3 URLs in final pass for ${safeFilename}. Some images may expire.`
401+
)
402+
);
403+
} else {
404+
console.info(
405+
chalk.green(`✅ Successfully fixed remaining S3 URLs in ${safeFilename}`)
406+
);
407+
}
408+
409+
return result.markdown;
410+
}

0 commit comments

Comments
 (0)