Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,75 @@ class NoteContentFulltextExp extends Expression {
return; // Content too large or invalid
}
content = processedContent;

// Check note size and determine search strategy
const contentSize = content.length;
const isExtremeNote = contentSize > FUZZY_SEARCH_CONFIG.EXTREME_NOTE_SIZE_THRESHOLD;
const isLargeNote = contentSize > FUZZY_SEARCH_CONFIG.LARGE_NOTE_SIZE_THRESHOLD;
const isFuzzyOperator = this.operator === "~=" || this.operator === "~*";

// For extremely large notes (>5MB), only search title regardless of operator
if (isExtremeNote) {
const note = becca.notes[noteId];
const title = note.title || "";

log.info(`Note ${noteId} is ${(contentSize / (1024 * 1024)).toFixed(1)}MB - searching title only due to extreme size`);

// For fuzzy operators, use fuzzy matching on title
// For other operators, use exact/wildcard matching on title
const normalizedTitle = normalizeSearchText(title);
let titleMatches = false;

if (isFuzzyOperator) {
titleMatches = this.tokens.some(token =>
this.fuzzyMatchToken(normalizeSearchText(token), normalizedTitle)
);
} else {
// Apply the operator to title matching
titleMatches = this.tokens.every(token => {
const normalizedToken = normalizeSearchText(token);
if (this.operator === "*=*") return normalizedTitle.includes(normalizedToken);
if (this.operator === "=") return normalizedTitle === normalizedToken;
if (this.operator === "!=") return normalizedTitle !== normalizedToken;
if (this.operator === "*=") return normalizedTitle.endsWith(normalizedToken);
if (this.operator === "=*") return normalizedTitle.startsWith(normalizedToken);
return false;
});
}

if (titleMatches) {
resultNoteSet.add(becca.notes[noteId]);
}

return content;
}

// For large notes (250KB-5MB) with fuzzy operators, use optimized strategy
if (isLargeNote && isFuzzyOperator) {
const note = becca.notes[noteId];
const title = note.title || "";

log.info(`Note ${noteId} is ${(contentSize / 1024).toFixed(1)}KB - using optimized search (fuzzy on title, exact on content)`);

// Perform fuzzy search on title
const titleMatches = this.fuzzyMatchToken(normalizeSearchText(this.tokens[0]), normalizeSearchText(title));

// Perform exact match on content for all tokens
const contentMatches = this.tokens.every(token => {
const normalizedToken = normalizeSearchText(token);
const normalizedContent = normalizeSearchText(content);
return normalizedContent.includes(normalizedToken);
});

// Add to results if either title matches with fuzzy or content matches exactly
if (titleMatches || contentMatches) {
resultNoteSet.add(becca.notes[noteId]);
}

return content;
}

// Standard search logic for non-large notes or non-fuzzy operators
if (this.tokens.length === 1) {
const [token] = this.tokens;

Expand Down Expand Up @@ -250,11 +318,6 @@ class NoteContentFulltextExp extends Expression {
return false;
}

// Warn about large word counts but still attempt matching
if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
}

// Find positions of each token
const tokenPositions: number[][] = this.tokens.map(token => {
const normalizedToken = normalizeSearchText(token);
Expand Down
31 changes: 9 additions & 22 deletions apps/server/src/services/search/utils/text_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,13 @@ export const FUZZY_SEARCH_CONFIG = {
MAX_EDIT_DISTANCE: 2,
// Maximum proximity distance for phrase matching (in words)
MAX_PHRASE_PROXIMITY: 10,
// Large note threshold - above this, fuzzy operators use an optimized search strategy (fuzzy on title, exact match on content)
LARGE_NOTE_SIZE_THRESHOLD: 250000, // 250KB - switch to title-only fuzzy for performance
// Extreme note threshold - above this, skip content search entirely
EXTREME_NOTE_SIZE_THRESHOLD: 5 * 1024 * 1024, // 5MB - title search only
// Absolute hard limits for extreme cases - only to prevent system crashes
ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM
ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing
// Performance warning thresholds - inform user but still attempt search
PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact
PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact
// Progressive processing thresholds for very large content
PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing
PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing
// Performance thresholds
EARLY_TERMINATION_THRESHOLD: 3,
} as const;
Expand Down Expand Up @@ -204,7 +202,8 @@ export function validateFuzzySearchTokens(tokens: string[], operator: string): {

/**
* Validates and preprocesses content for search operations.
* Philosophy: Try to search everything! Only block truly extreme cases that could crash the system.
* Only blocks truly extreme cases that could crash the system.
* Large notes (above LARGE_NOTE_SIZE_THRESHOLD, 250KB) are handled with an optimized search strategy instead.
*
* @param content The content to validate and preprocess
* @param noteId The note ID (for logging purposes)
Expand All @@ -222,28 +221,16 @@ export function validateAndPreprocessContent(content: string, noteId?: string):
return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE);
}

// Warn about very large content but still process it
if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) {
console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`);
}

// For word count, be even more permissive - only block truly extreme cases
// For word count, only block truly extreme cases
const wordCount = content.split(/\s+/).length;
if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`);
// Only in truly extreme cases, truncate to prevent system crash
return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' ');
}

// Warn about high word counts but still process them
if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`);
}

// Progressive processing warning for very large content
if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) {
console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`);
}
// Notes above LARGE_NOTE_SIZE_THRESHOLD (250KB) will use optimized search strategy
// (handled in note_content_fulltext.ts)

return content;
}
Expand Down