diff --git a/app/_lib/posthog-server.ts b/app/_lib/posthog-server.ts
new file mode 100644
index 000000000..abcfd2af5
--- /dev/null
+++ b/app/_lib/posthog-server.ts
@@ -0,0 +1,214 @@
+import { PostHog } from "posthog-node";
+
+// Server-side PostHog client singleton
+let posthogClient: PostHog | null = null;
+
+/**
+ * Return the shared server-side PostHog client, creating it on first use.
+ *
+ * @returns The singleton `PostHog` instance.
+ * @throws Error when `NEXT_PUBLIC_POSTHOG_KEY` is not set.
+ */
+export function getPostHogServer(): PostHog {
+  if (!posthogClient) {
+    const apiKey = process.env.NEXT_PUBLIC_POSTHOG_KEY;
+    if (!apiKey) {
+      throw new Error("NEXT_PUBLIC_POSTHOG_KEY is not set");
+    }
+    posthogClient = new PostHog(apiKey, {
+      host: process.env.NEXT_PUBLIC_POSTHOG_HOST || "https://us.i.posthog.com",
+      // Flush events immediately in serverless environments
+      flushAt: 1,
+      flushInterval: 0,
+    });
+  }
+  return posthogClient;
+}
+
+// AI agent detection patterns with classification.
+// The first matching entry wins, so specific patterns must stay ahead of the
+// generic ones they overlap with (e.g. /github.copilot/ before /copilot/).
+const AI_AGENT_CLASSIFIERS: ReadonlyArray<{
+  pattern: RegExp;
+  type: string;
+  provider: string;
+}> = [
+  // OpenAI
+  { pattern: /GPTBot/i, type: "GPTBot", provider: "OpenAI" },
+  { pattern: /ChatGPT-User/i, type: "ChatGPT-User", provider: "OpenAI" },
+  { pattern: /OAI-SearchBot/i, type: "OAI-SearchBot", provider: "OpenAI" },
+
+  // Anthropic
+  { pattern: /ClaudeBot/i, type: "ClaudeBot", provider: "Anthropic" },
+  { pattern: /Claude-User/i, type: "Claude-User", provider: "Anthropic" },
+  {
+    pattern: /Claude-SearchBot/i,
+    type: "Claude-SearchBot",
+    provider: "Anthropic",
+  },
+  { pattern: /anthropic/i, type: "Anthropic-Agent", provider: "Anthropic" },
+
+  // Perplexity
+  { pattern: /PerplexityBot/i, type: "PerplexityBot", provider: "Perplexity" },
+  {
+    pattern: /Perplexity-User/i,
+    type: "Perplexity-User",
+    provider: "Perplexity",
+  },
+
+  // Google
+  { pattern: /Google-Extended/i, type: "Google-Extended", provider: "Google" },
+  { pattern: /Googlebot/i, type: "Googlebot", provider: "Google" },
+
+  // Amazon
+  { pattern: /Amazonbot/i, type: "Amazonbot", provider: "Amazon" },
+  { pattern: /amazonq/i, type: "Amazon-Q", provider: "Amazon" },
+  { pattern: /amazon-q/i, type: "Amazon-Q", provider: "Amazon" },
+
+  // Apple
+  {
+    pattern: /Applebot-Extended/i,
+    type: "Applebot-Extended",
+    provider: "Apple",
+  },
+
+  // Meta
+  { pattern: /meta-externalagent/i, type: "Meta-Agent", provider: "Meta" },
+
+  // ByteDance
+  { pattern: /Bytespider/i, type: "Bytespider", provider: "ByteDance" },
+
+  // Cohere
+  { pattern: /cohere-ai/i, type: "Cohere-AI", provider: "Cohere" },
+
+  // Common Crawl
+  { pattern: /CCBot/i, type: "CCBot", provider: "CommonCrawl" },
+
+  // Developer tools
+  { pattern: /cursor/i, type: "Cursor", provider: "Cursor" },
+  { pattern: /github.copilot/i, type: "GitHub-Copilot", provider: "GitHub" },
+  { pattern: /copilot/i, type: "Copilot", provider: "GitHub" },
+  { pattern: /codeium/i, type: "Codeium", provider: "Codeium" },
+  { pattern: /tabnine/i, type: "Tabnine", provider: "Tabnine" },
+
+  // Other AI services
+  { pattern: /gemini/i, type: "Gemini", provider: "Google" },
+  { pattern: /bard/i, type: "Bard", provider: "Google" },
+  { pattern: /phind/i, type: "Phind", provider: "Phind" },
+  { pattern: /you\.com/i, type: "You.com", provider: "You.com" },
+  { pattern: /ai21/i, type: "AI21", provider: "AI21" },
+  { pattern: /huggingface/i, type: "HuggingFace", provider: "HuggingFace" },
+
+  // Agent frameworks
+  { pattern: /langchain/i, type: "LangChain", provider: "LangChain" },
+  { pattern: /llamaindex/i, type: "LlamaIndex", provider: "LlamaIndex" },
+  { pattern: /autogpt/i, type: "AutoGPT", provider: "AutoGPT" },
+  { pattern: /agentgpt/i, type: "AgentGPT", provider: "AgentGPT" },
+  { pattern: /babyagi/i, type: "BabyAGI", provider: "BabyAGI" },
+
+  // Doc AI tools
+  { pattern: /kapa\.ai/i, type: "Kapa.ai", provider: "Kapa" },
+  { pattern: /mendable/i, type: "Mendable", provider: "Mendable" },
+  { pattern: /inkeep/i, type: "Inkeep", provider: "Inkeep" },
+  { pattern: /glean/i, type: "Glean", provider: "Glean" },
+];
+
+/** Result of matching a user agent against the known AI agent patterns. */
+export type AIAgentClassification = {
+  isAIAgent: boolean;
+  agentType: string | null;
+  agentProvider: string | null;
+};
+
+/**
+ * Classify a user agent string against the known AI agent patterns.
+ *
+ * @param userAgent - User agent string to test.
+ * @returns The type and provider of the first matching pattern, or a result
+ * with `isAIAgent: false` and null fields when nothing matches.
+ */
+export function classifyAIAgent(userAgent: string): AIAgentClassification {
+  const match = AI_AGENT_CLASSIFIERS.find(({ pattern }) =>
+    pattern.test(userAgent)
+  );
+  if (!match) {
+    return { isAIAgent: false, agentType: null, agentProvider: null };
+  }
+  return {
+    isAIAgent: true,
+    agentType: match.type,
+    agentProvider: match.provider,
+  };
+}
+
+/** Request details recorded for one server-side markdown request. */
+export type ServerPageViewEvent = {
+  distinctId: string;
+  pathname: string;
+  userAgent: string;
+  referer?: string;
+  ip?: string;
+  acceptHeader?: string;
+  acceptLanguage?: string;
+};
+
+/**
+ * Send a `server_markdown_request` event to PostHog and wait for the flush.
+ *
+ * Note: `event.ip` is accepted but is not included in the captured
+ * properties.
+ *
+ * @param event - Request details to record.
+ * @throws Error when `NEXT_PUBLIC_POSTHOG_KEY` is not set.
+ */
+export async function captureServerPageView(
+  event: ServerPageViewEvent
+): Promise<void> {
+  const posthog = getPostHogServer();
+  const classification = classifyAIAgent(event.userAgent);
+
+  // Use distinct event name to avoid double-counting with client-side $pageview
+  // This tracks server-side markdown requests (primarily from AI agents)
+  posthog.capture({
+    distinctId: event.distinctId,
+    event: "server_markdown_request",
+    properties: {
+      $current_url: event.pathname,
+      $pathname: event.pathname,
+      $referrer: event.referer,
+      $useragent: event.userAgent,
+
+      // AI agent classification
+      is_ai_agent: classification.isAIAgent,
+      ai_agent_type: classification.agentType,
+      ai_agent_provider: classification.agentProvider,
+
+      // Request metadata
+      request_accept_header: event.acceptHeader,
+      request_accept_language: event.acceptLanguage,
+      request_source: "server",
+
+      // Mark as server-side capture
+      $lib: "posthog-node",
+    },
+  });
+
+  // Flush immediately for serverless
+  await posthog.flush();
+}
+
+/**
+ * Shut down the shared client, if one exists, for graceful shutdown.
+ *
+ * The singleton is cleared before awaiting so that a failed shutdown does
+ * not leave a closed client cached; the next `getPostHogServer()` call
+ * creates a fresh one.
+ */
+export async function shutdownPostHog(): Promise<void> {
+  const client = posthogClient;
+  if (!client) {
+    return;
+  }
+  posthogClient = null;
+  await client.shutdown();
+}
diff --git a/app/api/markdown/[[...slug]]/route.ts b/app/api/markdown/[[...slug]]/route.ts
index eea63cc2e..a31f7965a 100644
--- a/app/api/markdown/[[...slug]]/route.ts
+++ b/app/api/markdown/[[...slug]]/route.ts
@@ -1,6 +1,7
@@
 import { access, readdir, readFile } from "node:fs/promises";
 import { join, normalize, resolve } from "node:path";
 import { type NextRequest, NextResponse } from "next/server";
+import { captureServerPageView } from "@/app/_lib/posthog-server";
 
 export const dynamic = "force-dynamic";
 
@@ -63,11 +64,109 @@ const TOOLKIT_CATEGORIES = [
   "social",
 ];
 
+// Number of hex digest characters kept in a distinct ID
+const DISTINCT_ID_LENGTH = 32;
+
+/**
+ * Derive an anonymous, semi-stable distinct ID from IP + user agent.
+ *
+ * Uses a truncated SHA-256 digest so that the whole input contributes to the
+ * ID and the raw IP cannot be decoded from it. A truncated base64 encoding
+ * would keep only the leading bytes (mostly the IP) and is reversible.
+ */
+async function buildDistinctId(
+  ip: string,
+  userAgent: string
+): Promise<string> {
+  const digest = await crypto.subtle.digest(
+    "SHA-256",
+    new TextEncoder().encode(`${ip}:${userAgent}`)
+  );
+  const hex = Array.from(new Uint8Array(digest), (byte) =>
+    byte.toString(16).padStart(2, "0")
+  ).join("");
+  return `server_${hex.slice(0, DISTINCT_ID_LENGTH)}`;
+}
+
+/**
+ * Track a markdown request for AI agent analytics.
+ *
+ * Best-effort: failures are logged and swallowed so tracking never changes
+ * the response. The promise settles only after the capture call completes,
+ * so a caller that awaits it waits for analytics delivery.
+ */
+async function trackMarkdownRequest(
+  request: NextRequest,
+  pathname: string
+): Promise<void> {
+  try {
+    const { headers } = request;
+    const userAgent = headers.get("user-agent") || "";
+
+    // x-forwarded-for may hold a comma-separated list; use its first entry
+    const ip =
+      headers.get("x-forwarded-for")?.split(",")[0]?.trim() ||
+      headers.get("x-real-ip") ||
+      "unknown";
+
+    await captureServerPageView({
+      distinctId: await buildDistinctId(ip, userAgent),
+      pathname,
+      userAgent,
+      referer: headers.get("referer") || undefined,
+      ip,
+      acceptHeader: headers.get("accept") || undefined,
+      acceptLanguage: headers.get("accept-language") || undefined,
+    });
+  } catch (error) {
+    // Log but don't throw - tracking errors should not affect the response
+    // biome-ignore lint/suspicious/noConsole: intentional error logging for debugging
+    console.error("[PostHog] Failed to track markdown request:", error);
+  }
+}
+
 type ToolkitMarkdownTarget = {
   category: string;
   toolkitId: string;
 };
 
+/**
+ * Try to serve clean pre-generated markdown.
+ *
+ * @returns The markdown response when the file is readable inside
+ * CLEAN_MARKDOWN_DIR, or null otherwise.
+ */
+async function tryServeCleanMarkdown(
+  request: NextRequest,
+  sanitizedPath: string
+): Promise<NextResponse | null> {
+  const cleanMarkdownPath = join(CLEAN_MARKDOWN_DIR, `${sanitizedPath}.md`);
+
+  try {
+    // Check containment before probing the filesystem
+    if (!isPathWithinDirectory(cleanMarkdownPath, CLEAN_MARKDOWN_DIR)) {
+      return null;
+    }
+    await access(cleanMarkdownPath);
+
+    const content = await readFile(cleanMarkdownPath, "utf-8");
+    await trackMarkdownRequest(request, sanitizedPath);
+
+    return new NextResponse(content, {
+      status: 200,
+      headers: {
+        "Content-Type": "text/markdown; charset=utf-8",
+        "Content-Disposition": "inline",
+        "Cache-Control": "public, max-age=3600",
+        Vary: "Accept, User-Agent",
+      },
+    });
+  } catch {
+    // Missing or unreadable file: let the caller fall back
+    return null;
+  }
+}
+
 /**
  * Check if a path matches the toolkit documentation pattern.
  * Handles both actual toolkit IDs and the [toolkitId] dynamic route pattern.
@@ -176,26 +275,9 @@ export async function GET(
       }
     } else {
       // Try clean markdown first (preferred)
-      // e.g., /en/home/quickstart -> public/_markdown/en/home/quickstart.md
-      const cleanMarkdownPath = join(CLEAN_MARKDOWN_DIR, `${sanitizedPath}.md`);
-
-      try {
-        await access(cleanMarkdownPath);
-        if (isPathWithinDirectory(cleanMarkdownPath, CLEAN_MARKDOWN_DIR)) {
-          const content = await readFile(cleanMarkdownPath, "utf-8");
-
-          return new NextResponse(content, {
-            status: 200,
-            headers: {
-              "Content-Type": "text/markdown; charset=utf-8",
-              "Content-Disposition": "inline",
-              "Cache-Control": "public, max-age=3600",
-              Vary: "Accept, User-Agent",
-            },
-          });
-        }
-      } catch {
-        // Clean markdown not found, fall back to raw MDX
+      const cleanResponse = await tryServeCleanMarkdown(request, sanitizedPath);
+      if (cleanResponse) {
+        return cleanResponse;
       }
 
       // Fallback: raw MDX file
@@ -220,6 +302,13 @@ export async function GET(
 
     const content = await readFile(filePath, "utf-8");
 
+    // Track server-side pageview for AI agent analytics
+    await trackMarkdownRequest(request, sanitizedPath);
+
+    const contentSource = filePath.includes(TOOLKIT_MARKDOWN_ROOT)
+      ? "toolkit-markdown"
+      : "raw-mdx";
+
     // Return the raw markdown with proper headers
     return new NextResponse(content, {
       status: 200,
@@ -227,7 +316,7 @@ export async function GET(
         "Content-Type": "text/markdown; charset=utf-8",
         "Content-Disposition": "inline",
         "Cache-Control": "public, max-age=3600, stale-while-revalidate=86400",
-        "X-Content-Source": "raw-mdx",
+        "X-Content-Source": contentSource,
         Vary: "Accept, User-Agent",
       },
     });
diff --git a/instrumentation-client.ts b/instrumentation-client.ts
index fb6483223..8d2756f97 100644
--- a/instrumentation-client.ts
+++ b/instrumentation-client.ts
@@ -4,6 +4,8 @@ posthog.init(process.env.NEXT_PUBLIC_POSTHOG_KEY as string, {
   api_host: process.env.NEXT_PUBLIC_POSTHOG_HOST || "https://us.i.posthog.com",
   ui_host: process.env.NEXT_PUBLIC_POSTHOG_UI_HOST || "https://us.posthog.com",
   defaults: "2025-11-30",
+  // Allow AI agent traffic to be captured (default filters out bots)
+  opt_out_useragent_filter: true,
   session_recording: {
     maskAllInputs: true,
     blockClass: "ph-no-capture",
diff --git a/package.json b/package.json
index 280ae089c..3751f359b 100644
--- a/package.json
+++ b/package.json
@@ -58,6 +58,7 @@
     "nextra": "4.6.0",
     "nextra-theme-docs": "4.6.0",
     "posthog-js": "1.321.2",
+    "posthog-node": "^5.24.15",
     "react": "19.2.3",
     "react-dom": "19.2.3",
     "react-hook-form": "7.65.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 3711aa4cf..51d389698 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -59,6 +59,9 @@ importers:
       posthog-js:
         specifier: 1.321.2
         version: 1.321.2
+      posthog-node:
+        specifier: ^5.24.15
+        version: 5.24.15
       react:
         specifier: 19.2.3
         version: 19.2.3
@@ -1011,6 +1014,9 @@ packages:
     cpu: [x64]
     os: [win32]
 
+  '@posthog/core@1.22.0':
+    resolution: {integrity: sha512-WkmOnq95aAOu6yk6r5LWr5cfXsQdpVbWDCwOxQwxSne8YV6GuZET1ziO5toSQXgrgbdcjrSz2/GopAfiL6iiAA==}
+
   '@posthog/core@1.9.1':
     resolution: {integrity: sha512-kRb1ch2dhQjsAapZmu6V66551IF2LnCbc1rnrQqnR7ArooVyJN9KOPXre16AJ3ObJz2eTfuP7x25BMyS2Y5Exw==}
 
@@ -3860,6 +3866,10 @@ packages:
   posthog-js@1.321.2:
     resolution: {integrity: sha512-h5852d9lYmSNjKWvjDkrmO9/awUU3jayNBEoEBUuMAdfDPc4yYYdxBJeDBxYnCFm6RjCLy4O+vmcwuCRC67EXA==}
 
+  posthog-node@5.24.15:
+    resolution: {integrity: sha512-0QnWVOZAPwEAlp+r3r0jIGfk2IaNYM/2YnEJJhBMJZXs4LpHcTu7mX42l+e95o9xX87YpVuZU0kOkmtQUxgnOA==}
+    engines: {node: ^20.20.0 || >=22.22.0}
+
   preact@10.28.0:
     resolution: {integrity: sha512-rytDAoiXr3+t6OIP3WGlDd0ouCUG1iCWzkcY3++Nreuoi17y6T5i/zRhe6uYfoVcxq6YU+sBtJouuRDsq8vvqA==}
 
@@ -5517,6 +5527,10 @@ snapshots:
   '@pagefind/windows-x64@1.4.0':
     optional: true
 
+  '@posthog/core@1.22.0':
+    dependencies:
+      cross-spawn: 7.0.6
+
   '@posthog/core@1.9.1':
     dependencies:
       cross-spawn: 7.0.6
@@ -9094,6 +9108,10 @@ snapshots:
       query-selector-shadow-dom: 1.0.1
       web-vitals: 4.2.4
 
+  posthog-node@5.24.15:
+    dependencies:
+      '@posthog/core': 1.22.0
+
   preact@10.28.0: {}
 
   prismjs@1.30.0: {}