Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions app/lib/persistence/document-store/text-chunker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Text chunking utilities
export interface ChunkOptions {
  chunkSize?: number; // max characters per chunk (default 500)
  chunkOverlap?: number; // characters shared between consecutive chunks (default 50)
  separators?: string[]; // preferred break points, tried in order (default: paragraph, line, sentence, word)
}

/**
 * Splits long text into overlapping chunks, preferring to break at natural
 * separators (paragraphs, lines, sentences) and falling back to word
 * boundaries, so chunks rarely cut mid-word.
 */
export class TextChunker {
  private _defaultOptions: Required<ChunkOptions> = {
    chunkSize: 500,
    chunkOverlap: 50,
    separators: ['\n\n', '\n', '. ', ' '],
  };

  constructor(private _options: ChunkOptions = {}) {
    this._options = { ...this._defaultOptions, ..._options };
  }

  /**
   * Splits `text` into trimmed chunks of at most `chunkSize` characters with
   * `chunkOverlap` characters of overlap. Whitespace-only chunks are dropped.
   * Text no longer than `chunkSize` is returned as a single chunk.
   */
  chunk(text: string): string[] {
    const { chunkSize, chunkOverlap, separators } = { ...this._defaultOptions, ...this._options };
    const chunks: string[] = [];

    // If text is shorter than chunk size, return it as a single chunk
    if (text.length <= chunkSize) {
      return [text];
    }

    let startIndex = 0;

    while (startIndex < text.length) {
      const endIndex = Math.min(startIndex + chunkSize, text.length);
      let chunkEnd = endIndex;

      // Try to find a natural breaking point (skipped when this is the tail)
      if (endIndex < text.length) {
        let foundSeparator = false;

        for (const separator of separators) {
          const lastSeparatorIndex = text.lastIndexOf(separator, endIndex);

          if (lastSeparatorIndex > startIndex) {
            chunkEnd = lastSeparatorIndex + separator.length;
            foundSeparator = true;
            break;
          }
        }

        // If no separator found, break at word boundary
        if (!foundSeparator) {
          const lastSpaceIndex = text.lastIndexOf(' ', endIndex);

          if (lastSpaceIndex > startIndex) {
            chunkEnd = lastSpaceIndex + 1;
          }
        }
      }

      // Add the chunk
      chunks.push(text.slice(startIndex, chunkEnd).trim());

      /*
       * Stop once the final chunk reaches the end of the text. Without this,
       * `chunkEnd - chunkOverlap` lands before text.length and the loop
       * re-emits the tail chunk forever (infinite loop for any overlap > 0).
       */
      if (chunkEnd >= text.length) {
        break;
      }

      /*
       * Step back by the overlap but always make forward progress: a
       * separator found just after startIndex could otherwise yield
       * chunkEnd - chunkOverlap <= startIndex and loop forever.
       */
      startIndex = Math.max(chunkEnd - chunkOverlap, startIndex + 1);
    }

    // Filter out empty chunks and those that are only whitespace
    return chunks.filter((chunk) => chunk.trim().length > 0);
  }
}
186 changes: 186 additions & 0 deletions app/lib/persistence/document-store/vector-store.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
import { FeatureExtractionPipeline, pipeline } from '@xenova/transformers';
import { createRxDatabase, type RxDatabase, type RxCollection, type RxDocument } from 'rxdb';
import { getRxStorageDexie } from 'rxdb/plugins/storage-dexie';
import { TextChunker, type ChunkOptions } from './text-chunker';

// Define the document schema type
// Shape of one stored record: raw text plus its embedding vector.
// Must stay structurally in sync with `vectorStoreSchema` below.
interface VectorDocument {
  id: string; // primary key; caller-supplied or crypto.randomUUID()
  content: string; // raw (possibly chunked) document text
  embedding: number[]; // mean-pooled, normalized vector produced by the embedder
  metadata?: Record<string, any>; // arbitrary caller metadata (e.g. chunkIndex, totalChunks)
}

// Define the collection schema
// RxDB JSON schema for the `documents` collection; mirrors the
// VectorDocument interface. Bump `version` (and provide a migration
// strategy) whenever this shape changes.
const vectorStoreSchema = {
  version: 0,
  primaryKey: 'id',
  type: 'object',
  properties: {
    id: {
      type: 'string',
      maxLength: 100, // RxDB requires maxLength on string primary keys
    },
    content: {
      type: 'string',
    },
    embedding: {
      // Dense embedding vector, one number per dimension
      type: 'array',
      items: {
        type: 'number',
      },
    },
    metadata: {
      // Free-form object; not indexed or validated further
      type: 'object',
    },
  },
  required: ['id', 'content', 'embedding'],
};

/**
 * Client-side vector store: embeds text with a local transformer model
 * (Xenova/all-MiniLM-L6-v2) and persists documents plus embeddings in an
 * RxDB database backed by Dexie (IndexedDB).
 *
 * Call `initialize()` before any other method; call `close()` when done.
 */
export class VectorStore {
  private _db: RxDatabase | undefined;
  private _collection: RxCollection | undefined;
  private _embedder: Promise<FeatureExtractionPipeline>;
  private _chunker: TextChunker;
  private _initialized: boolean = false;

  get isInitialized(): boolean {
    return this._initialized;
  }

  constructor(chunkOptions: ChunkOptions = {}) {
    /*
     * Starts the (potentially large) model download immediately; the promise
     * is only awaited on the first embedding request.
     * NOTE(review): a download failure here surfaces as a rejected promise on
     * first use, not at construction — confirm that is the intended UX.
     */
    this._embedder = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
    this._chunker = new TextChunker(chunkOptions);
  }

  /** Creates (or opens) the RxDB database and its `documents` collection. */
  async initialize(dbName: string = 'doc-store'): Promise<void> {
    this._db = await createRxDatabase({
      name: dbName,
      storage: getRxStorageDexie(),
    });

    await this._db.addCollections({
      documents: {
        schema: vectorStoreSchema,
      },
    });

    this._collection = this._db.documents;
    this._initialized = true;
  }

  /**
   * Returns the live collection, or throws if `initialize()` has not
   * completed (or the store has been closed). Centralizes the guard that was
   * previously duplicated in every public method.
   */
  private _requireCollection(): RxCollection {
    if (!this._initialized) {
      throw new Error('Vector store not initialized');
    }

    if (!this._collection) {
      throw new Error('Collection not initialized');
    }

    return this._collection;
  }

  /** Embeds `text` into a mean-pooled, normalized vector. */
  private async _getEmbedding(text: string): Promise<number[]> {
    const pipe = await this._embedder;
    const output = await pipe(text, { pooling: 'mean', normalize: true });

    return Array.from(output.data);
  }

  /**
   * Embeds `content` and inserts it as a single document.
   *
   * @param content Raw text to store.
   * @param metadata Optional caller metadata persisted alongside the text.
   * @param id Optional primary key; defaults to a random UUID.
   * @throws Error if the store is not initialized.
   */
  async addDocument(content: string, metadata?: Record<string, any>, id?: string): Promise<RxDocument<VectorDocument>> {
    const collection = this._requireCollection();

    const embedding = await this._getEmbedding(content);
    const docId = id || crypto.randomUUID();

    const document: VectorDocument = {
      id: docId,
      content,
      embedding,
      metadata,
    };

    return await collection.insert(document);
  }

  /** Inserts several documents concurrently; rejects if any insert fails. */
  async addDocuments(
    documents: Array<{ content: string; metadata?: Record<string, any>; id?: string }>,
  ): Promise<RxDocument<VectorDocument>[]> {
    const promises = documents.map((doc) => this.addDocument(doc.content, doc.metadata, doc.id));
    return Promise.all(promises);
  }

  /**
   * Splits `content` with the configured TextChunker and stores each chunk
   * as its own document, tagging chunks with their index, the total count,
   * and a preview of the original text.
   */
  async addDocumentWithChunking(
    content: string,
    metadata?: Record<string, any>,
    baseId?: string,
  ): Promise<RxDocument<VectorDocument>[]> {
    const chunks = this._chunker.chunk(content);

    const documents = chunks.map((chunk, index) => ({
      content: chunk,
      metadata: {
        ...metadata,
        chunkIndex: index,
        totalChunks: chunks.length,

        // Preview of the original text; only add an ellipsis when truncated
        originalText: content.length > 100 ? `${content.slice(0, 100)}...` : content,
      },
      id: baseId ? `${baseId}-chunk-${index}` : undefined,
    }));

    return this.addDocuments(documents);
  }

  /**
   * Cosine similarity of two equal-length vectors, in [-1, 1].
   * Returns 0 for a zero-norm vector instead of dividing by zero (NaN).
   */
  private _cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      throw new Error('Vectors must have the same length');
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    if (normA === 0 || normB === 0) {
      return 0;
    }

    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  /**
   * Returns the `k` stored documents most similar to `query`, ranked by
   * cosine similarity (descending).
   *
   * NOTE(review): this is a linear scan over every stored document — fine
   * for small stores, but O(docs × dims) per query; no ANN index is used.
   */
  async similaritySearch(
    query: string,
    k: number = 5,
  ): Promise<Array<{ content: string; similarity: number; metadata?: Record<string, any> }>> {
    const collection = this._requireCollection();

    const queryEmbedding = await this._getEmbedding(query);
    const documents = await collection.find().exec();

    const results = documents.map((doc) => ({
      content: doc.content,
      metadata: doc.metadata,
      similarity: this._cosineSimilarity(queryEmbedding, doc.embedding),
    }));

    return results.sort((a, b) => b.similarity - a.similarity).slice(0, k);
  }

  /** Removes the document with the given primary key (no-op if absent). */
  async deleteDocument(id: string): Promise<void> {
    await this._requireCollection().findOne(id).remove();
  }

  /**
   * Destroys the underlying database and resets initialization state so that
   * `isInitialized` reflects reality and subsequent calls fail fast instead
   * of operating on a destroyed database.
   */
  async close(): Promise<void> {
    await this._db?.destroy();
    this._db = undefined;
    this._collection = undefined;
    this._initialized = false;
  }
}
2 changes: 1 addition & 1 deletion app/routes/api.check-env-key.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { LoaderFunction } from '@remix-run/node';
import type { LoaderFunction } from '@remix-run/cloudflare';
import { providerBaseUrlEnvKeys } from '~/utils/constants';

export const loader: LoaderFunction = async ({ context, request }) => {
Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
"@uiw/codemirror-theme-vscode": "^4.23.6",
"@unocss/reset": "^0.61.9",
"@webcontainer/api": "1.3.0-internal.10",
"@xenova/transformers": "^2.17.2",
"@xterm/addon-fit": "^0.10.0",
"@xterm/addon-web-links": "^0.11.0",
"@xterm/xterm": "^5.5.0",
Expand Down Expand Up @@ -101,6 +102,7 @@
"remark-gfm": "^4.0.0",
"remix-island": "^0.2.0",
"remix-utils": "^7.7.0",
"rxdb": "^15.39.0",
"shiki": "^1.24.0",
"unist-util-visit": "^5.0.0"
},
Expand Down
Loading
Loading