Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 181 additions & 0 deletions app/_lib/posthog-server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import { PostHog } from "posthog-node";

// Server-side PostHog client singleton
let posthogClient: PostHog | null = null;

/**
 * Return the shared server-side PostHog client, constructing it on first use.
 *
 * The client is created with `flushAt: 1` and `flushInterval: 0`. Its host
 * comes from `NEXT_PUBLIC_POSTHOG_HOST`, falling back to
 * `https://us.i.posthog.com` when that variable is unset or empty.
 *
 * @returns The cached client instance.
 * @throws Error when no client exists yet and `NEXT_PUBLIC_POSTHOG_KEY` is
 *   unset or empty.
 */
export function getPostHogServer(): PostHog {
  // Fast path: reuse the instance built by an earlier call.
  if (posthogClient) {
    return posthogClient;
  }

  const apiKey = process.env.NEXT_PUBLIC_POSTHOG_KEY;
  if (!apiKey) {
    throw new Error("NEXT_PUBLIC_POSTHOG_KEY is not set");
  }

  const host =
    process.env.NEXT_PUBLIC_POSTHOG_HOST || "https://us.i.posthog.com";

  // Flush events immediately in serverless environments
  const client = new PostHog(apiKey, { host, flushAt: 1, flushInterval: 0 });
  posthogClient = client;
  return client;
}

// AI agent detection patterns with classification
const AI_AGENT_CLASSIFIERS: Array<{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keeping this list up-to-date ourselves seems tedious. Is there a package we can install from someone else who is maintaining this list?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... and if not, we should publish and own that package.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't change that much/is relatively stable. I think publishing such a package is a great idea! But perhaps we should wait to do so until this needs updating?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, cool - then move this into its own data file and we can keep it up-to-date more simply

pattern: RegExp;
type: string;
provider: string;
}> = [
// OpenAI
{ pattern: /GPTBot/i, type: "GPTBot", provider: "OpenAI" },
{ pattern: /ChatGPT-User/i, type: "ChatGPT-User", provider: "OpenAI" },
{ pattern: /OAI-SearchBot/i, type: "OAI-SearchBot", provider: "OpenAI" },

// Anthropic
{ pattern: /ClaudeBot/i, type: "ClaudeBot", provider: "Anthropic" },
{ pattern: /Claude-User/i, type: "Claude-User", provider: "Anthropic" },
{
pattern: /Claude-SearchBot/i,
type: "Claude-SearchBot",
provider: "Anthropic",
},
{ pattern: /anthropic/i, type: "Anthropic-Agent", provider: "Anthropic" },

// Perplexity
{ pattern: /PerplexityBot/i, type: "PerplexityBot", provider: "Perplexity" },
{
pattern: /Perplexity-User/i,
type: "Perplexity-User",
provider: "Perplexity",
},

// Google
{ pattern: /Google-Extended/i, type: "Google-Extended", provider: "Google" },
{ pattern: /Googlebot/i, type: "Googlebot", provider: "Google" },

// Amazon
{ pattern: /Amazonbot/i, type: "Amazonbot", provider: "Amazon" },
{ pattern: /amazonq/i, type: "Amazon-Q", provider: "Amazon" },
{ pattern: /amazon-q/i, type: "Amazon-Q", provider: "Amazon" },

// Apple
{
pattern: /Applebot-Extended/i,
type: "Applebot-Extended",
provider: "Apple",
},

// Meta
{ pattern: /meta-externalagent/i, type: "Meta-Agent", provider: "Meta" },

// ByteDance
{ pattern: /Bytespider/i, type: "Bytespider", provider: "ByteDance" },

// Cohere
{ pattern: /cohere-ai/i, type: "Cohere-AI", provider: "Cohere" },

// Common Crawl
{ pattern: /CCBot/i, type: "CCBot", provider: "CommonCrawl" },

// Developer tools
{ pattern: /cursor/i, type: "Cursor", provider: "Cursor" },
{ pattern: /github.copilot/i, type: "GitHub-Copilot", provider: "GitHub" },
{ pattern: /copilot/i, type: "Copilot", provider: "GitHub" },
{ pattern: /codeium/i, type: "Codeium", provider: "Codeium" },
{ pattern: /tabnine/i, type: "Tabnine", provider: "Tabnine" },

// Other AI services
{ pattern: /gemini/i, type: "Gemini", provider: "Google" },
{ pattern: /bard/i, type: "Bard", provider: "Google" },
{ pattern: /phind/i, type: "Phind", provider: "Phind" },
{ pattern: /you\.com/i, type: "You.com", provider: "You.com" },
{ pattern: /ai21/i, type: "AI21", provider: "AI21" },
{ pattern: /huggingface/i, type: "HuggingFace", provider: "HuggingFace" },

// Agent frameworks
{ pattern: /langchain/i, type: "LangChain", provider: "LangChain" },
{ pattern: /llamaindex/i, type: "LlamaIndex", provider: "LlamaIndex" },
{ pattern: /autogpt/i, type: "AutoGPT", provider: "AutoGPT" },
{ pattern: /agentgpt/i, type: "AgentGPT", provider: "AgentGPT" },
{ pattern: /babyagi/i, type: "BabyAGI", provider: "BabyAGI" },

// Doc AI tools
{ pattern: /kapa\.ai/i, type: "Kapa.ai", provider: "Kapa" },
{ pattern: /mendable/i, type: "Mendable", provider: "Mendable" },
{ pattern: /inkeep/i, type: "Inkeep", provider: "Inkeep" },
{ pattern: /glean/i, type: "Glean", provider: "Glean" },
];

/**
 * Result of matching a user-agent string against the AI agent classifiers,
 * as returned by `classifyAIAgent`.
 */
export type AIAgentClassification = {
// True when a classifier pattern matched the user agent.
isAIAgent: boolean;
// `type` of the matching classifier; null when nothing matched.
agentType: string | null;
// `provider` of the matching classifier; null when nothing matched.
agentProvider: string | null;
};

/**
 * Classify a user-agent string against the known AI agent patterns.
 *
 * Classifiers are tried in list order and the first match wins, so earlier
 * entries take precedence over later, broader ones.
 *
 * @param userAgent - User-agent string to test.
 * @returns The matching entry's type and provider with `isAIAgent: true`, or
 *   `isAIAgent: false` with both fields null when no pattern matches.
 */
export function classifyAIAgent(userAgent: string): AIAgentClassification {
  const match = AI_AGENT_CLASSIFIERS.find(({ pattern }) =>
    pattern.test(userAgent)
  );

  if (!match) {
    return { isAIAgent: false, agentType: null, agentProvider: null };
  }

  return {
    isAIAgent: true,
    agentType: match.type,
    agentProvider: match.provider,
  };
}

/**
 * Input for `captureServerPageView`: the request details recorded with a
 * server-side markdown request event.
 */
export type ServerPageViewEvent = {
// Identifier the event is attributed to.
distinctId: string;
// Requested path; sent as both `$current_url` and `$pathname`.
pathname: string;
// User-agent string; sent as `$useragent` and used for AI agent classification.
userAgent: string;
// Sent as `$referrer`.
referer?: string;
// NOTE(review): not read by captureServerPageView, so it is never sent — confirm this is intended.
ip?: string;
// Sent as `request_accept_header`.
acceptHeader?: string;
// Sent as `request_accept_language`.
acceptLanguage?: string;
};

/**
 * Send one `server_markdown_request` event to PostHog and wait for the flush.
 *
 * The user agent is classified first so the event carries the AI agent flag,
 * type, and provider alongside the request metadata.
 *
 * @param event - Request details to record; `event.ip` is not used here.
 * @returns Resolves after the client's `flush()` settles.
 * @throws Propagates errors from `getPostHogServer()` and from the flush.
 */
export async function captureServerPageView(event: ServerPageViewEvent) {
  const { distinctId, pathname, userAgent, referer, acceptHeader, acceptLanguage } =
    event;

  const client = getPostHogServer();
  const { isAIAgent, agentType, agentProvider } = classifyAIAgent(userAgent);

  const properties = {
    $current_url: pathname,
    $pathname: pathname,
    $referrer: referer,
    $useragent: userAgent,

    // Outcome of the user-agent classification
    is_ai_agent: isAIAgent,
    ai_agent_type: agentType,
    ai_agent_provider: agentProvider,

    // Details taken from the incoming request
    request_accept_header: acceptHeader,
    request_accept_language: acceptLanguage,
    request_source: "server",

    // Identifies the capture as server-side
    $lib: "posthog-node",
  };

  // A dedicated event name keeps these server-side markdown requests
  // (mostly AI agents) from being double-counted with client-side $pageview.
  client.capture({
    distinctId,
    event: "server_markdown_request",
    properties,
  });

  // Serverless: push the event out before the invocation can end.
  await client.flush();
}

/**
 * Shut down the shared PostHog client, if one exists, and drop the cached
 * instance so the next `getPostHogServer()` call builds a fresh client.
 *
 * The cache is cleared before `shutdown()` is awaited. Otherwise a rejected
 * shutdown would leave the dead client cached, and any caller running while
 * the shutdown is in flight would be handed a client that is shutting down.
 *
 * @returns Resolves once the client's `shutdown()` settles; resolves
 *   immediately when no client has been created.
 * @throws Propagates any rejection from the client's `shutdown()`.
 */
export async function shutdownPostHog() {
  const client = posthogClient;
  if (!client) {
    return;
  }

  // Detach first so nobody can pick up the client while it is shutting down.
  posthogClient = null;
  await client.shutdown();
}
105 changes: 84 additions & 21 deletions app/api/markdown/[[...slug]]/route.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { createHash } from "node:crypto";
import { access, readdir, readFile } from "node:fs/promises";
import { join, normalize, resolve } from "node:path";
import { type NextRequest, NextResponse } from "next/server";
import { captureServerPageView } from "@/app/_lib/posthog-server";

// Next.js route segment config.
// NOTE(review): presumably set so the handler runs on every request instead of
// being statically cached — confirm against the Next.js documentation.
export const dynamic = "force-dynamic";

Expand Down Expand Up @@ -63,11 +64,83 @@ const TOOLKIT_CATEGORIES = [
"social",
];

// Number of hex characters of the SHA-256 digest kept in a distinct ID
// (32 hex characters = 128 bits).
const DISTINCT_ID_LENGTH = 32;

/**
 * Track a markdown request for AI agent analytics.
 *
 * Best-effort: any failure is logged and swallowed, so tracking can never
 * change the response. The capture itself is awaited, so a caller that awaits
 * this function waits for the event to be sent.
 *
 * @param request - Incoming request; only its headers are read.
 * @param pathname - Path recorded with the event.
 * @returns Resolves once the capture attempt has finished; never rejects.
 */
async function trackMarkdownRequest(request: NextRequest, pathname: string) {
  try {
    const userAgent = request.headers.get("user-agent") || "";
    const referer = request.headers.get("referer") || undefined;
    const acceptHeader = request.headers.get("accept") || undefined;
    const acceptLanguage = request.headers.get("accept-language") || undefined;

    // First hop of x-forwarded-for, then x-real-ip, else a fixed placeholder.
    const ip =
      request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ||
      request.headers.get("x-real-ip") ||
      "unknown";

    // Semi-stable anonymous ID derived from IP + user agent. A digest is used
    // instead of truncated base64 because base64 is reversible (it would ship
    // the raw IP in the ID) and a short slice of it covers little beyond the IP.
    const distinctId = `server_${createHash("sha256")
      .update(`${ip}:${userAgent}`)
      .digest("hex")
      .slice(0, DISTINCT_ID_LENGTH)}`;

    await captureServerPageView({
      distinctId,
      pathname,
      userAgent,
      referer,
      ip,
      acceptHeader,
      acceptLanguage,
    });
  } catch (error) {
    // Log but don't throw - tracking errors should not affect the response
    // biome-ignore lint/suspicious/noConsole: intentional error logging for debugging
    console.error("[PostHog] Failed to track markdown request:", error);
  }
}

/**
 * A toolkit markdown document identified by its category and toolkit ID.
 * NOTE(review): presumably produced by matching a request path against the
 * toolkit documentation pattern — usage is not visible here, confirm.
 */
type ToolkitMarkdownTarget = {
category: string;
toolkitId: string;
};

/**
 * Try to serve clean pre-generated markdown for a sanitized request path.
 *
 * The candidate file is `<CLEAN_MARKDOWN_DIR>/<sanitizedPath>.md`. Its
 * containment in `CLEAN_MARKDOWN_DIR` is verified before any filesystem call,
 * so a path that escapes the directory is never probed or read.
 *
 * @param request - Incoming request, forwarded to request tracking.
 * @param sanitizedPath - Request path without the `.md` extension.
 * @returns A 200 markdown response when the file exists and is readable;
 *   null when the path falls outside the directory or any step throws.
 */
async function tryServeCleanMarkdown(
  request: NextRequest,
  sanitizedPath: string
): Promise<NextResponse | null> {
  const cleanMarkdownPath = join(CLEAN_MARKDOWN_DIR, `${sanitizedPath}.md`);

  try {
    // Validate the location first; only then touch the filesystem.
    if (!isPathWithinDirectory(cleanMarkdownPath, CLEAN_MARKDOWN_DIR)) {
      return null;
    }
    await access(cleanMarkdownPath);

    const content = await readFile(cleanMarkdownPath, "utf-8");
    await trackMarkdownRequest(request, sanitizedPath);

    return new NextResponse(content, {
      status: 200,
      headers: {
        "Content-Type": "text/markdown; charset=utf-8",
        "Content-Disposition": "inline",
        "Cache-Control": "public, max-age=3600",
        Vary: "Accept, User-Agent",
      },
    });
  } catch {
    // Missing or unreadable file: the caller falls back to another source.
    return null;
  }
}

/**
* Check if a path matches the toolkit documentation pattern.
* Handles both actual toolkit IDs and the [toolkitId] dynamic route pattern.
Expand Down Expand Up @@ -176,26 +249,9 @@ export async function GET(
}
} else {
// Try clean markdown first (preferred)
// e.g., /en/home/quickstart -> public/_markdown/en/home/quickstart.md
const cleanMarkdownPath = join(CLEAN_MARKDOWN_DIR, `${sanitizedPath}.md`);

try {
await access(cleanMarkdownPath);
if (isPathWithinDirectory(cleanMarkdownPath, CLEAN_MARKDOWN_DIR)) {
const content = await readFile(cleanMarkdownPath, "utf-8");

return new NextResponse(content, {
status: 200,
headers: {
"Content-Type": "text/markdown; charset=utf-8",
"Content-Disposition": "inline",
"Cache-Control": "public, max-age=3600",
Vary: "Accept, User-Agent",
},
});
}
} catch {
// Clean markdown not found, fall back to raw MDX
const cleanResponse = await tryServeCleanMarkdown(request, sanitizedPath);
if (cleanResponse) {
return cleanResponse;
}

// Fallback: raw MDX file
Expand All @@ -220,14 +276,21 @@ export async function GET(

const content = await readFile(filePath, "utf-8");

// Track server-side pageview for AI agent analytics
await trackMarkdownRequest(request, sanitizedPath);

const contentSource = filePath.includes(TOOLKIT_MARKDOWN_ROOT)
? "toolkit-markdown"
: "raw-mdx";

// Return the raw markdown with proper headers
return new NextResponse(content, {
status: 200,
headers: {
"Content-Type": "text/markdown; charset=utf-8",
"Content-Disposition": "inline",
"Cache-Control": "public, max-age=3600, stale-while-revalidate=86400",
"X-Content-Source": "raw-mdx",
"X-Content-Source": contentSource,
Vary: "Accept, User-Agent",
},
});
Expand Down
2 changes: 2 additions & 0 deletions instrumentation-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ posthog.init(process.env.NEXT_PUBLIC_POSTHOG_KEY as string, {
api_host: process.env.NEXT_PUBLIC_POSTHOG_HOST || "https://us.i.posthog.com",
ui_host: process.env.NEXT_PUBLIC_POSTHOG_UI_HOST || "https://us.posthog.com",
defaults: "2025-11-30",
// Allow AI agent traffic to be captured (default filters out bots)
opt_out_useragent_filter: true,
session_recording: {
maskAllInputs: true,
blockClass: "ph-no-capture",
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"nextra": "4.6.0",
"nextra-theme-docs": "4.6.0",
"posthog-js": "1.321.2",
"posthog-node": "^5.24.15",
"react": "19.2.3",
"react-dom": "19.2.3",
"react-hook-form": "7.65.0",
Expand Down
18 changes: 18 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.