"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.XenovaEmbeddingService = void 0; const transformers_1 = require("@xenova/transformers"); const path_1 = __importDefault(require("path")); const fs_1 = __importDefault(require("fs")); class XenovaEmbeddingService { extractor = null; modelId = 'Xenova/all-MiniLM-L6-v2'; initPromise = null; /** * Lazy load the model (initialize on first call) */ async ensureInitialized() { if (this.extractor) return; if (!this.initPromise) { this.initPromise = this.initialize(); } await this.initPromise; } async initialize() { console.log(`[EmbeddingService] Loading model: ${this.modelId}...`); const start = Date.now(); // Configure local path // Try to find 'models' directory relative to this file // src/core/embedding-service.ts -> ../../models // dist/core/embedding-service.js -> ../../models const projectRoot = path_1.default.resolve(__dirname, '../../'); const localModelDir = path_1.default.join(projectRoot, 'models'); const modelPath = path_1.default.join(localModelDir, this.modelId); // Check if local model exists if (fs_1.default.existsSync(modelPath)) { console.log(`[EmbeddingService] Found local model at: ${modelPath}`); transformers_1.env.localModelPath = localModelDir; transformers_1.env.allowRemoteModels = false; // Force local usage } else { console.log(`[EmbeddingService] Local model not found, attempting remote download...`); // Use default behavior (remote download to cache) } this.extractor = await (0, transformers_1.pipeline)('feature-extraction', this.modelId, { quantized: true, // @ts-ignore progress_callback: (progress) => { if (progress.status === 'downloading') { console.log(`[EmbeddingService] Downloading: ${progress.file} (${progress.progress}%)`); } } }); console.log(`[EmbeddingService] Model loaded in ${Date.now() - start}ms`); } async embed(text) { await this.ensureInitialized(); if (!this.extractor) throw new Error('Model failed to initialize'); // Preprocess text const cleaned = this.cleanText(text); // Generate embedding const output = await this.extractor(cleaned, { pooling: 'mean', // Mean pooling normalize: true // L2 normalization }); // Convert to regular array return Array.from(output.data); } async embedBatch(texts) { await this.ensureInitialized(); if (!this.extractor) throw new Error('Model failed to initialize'); const cleaned = texts.map(t => this.cleanText(t)); // Batch inference const output = await this.extractor(cleaned, { pooling: 'mean', normalize: true }); // Split results by batch const dim = this.getDimensions(); const results = []; // output.data is a flattened Float32Array of all embeddings for (let i = 0; i < texts.length; i++) { const start = i * dim; const end = start + dim; // @ts-ignore results.push(Array.from(output.data.slice(start, end))); } return results; } getDimensions() { return 384; // all-MiniLM-L6-v2 fixed dimensions } getModelInfo() { return { name: 'all-MiniLM-L6-v2', version: 'ONNX', dimensions: 384, maxTokens: 256 }; } /** * Text cleaning (remove noise, improve quality) */ cleanText(text) { return text .trim() .replace(/\s+/g, ' ') // Merge excess whitespace .substring(0, 500); // Truncate overly long text (model specific) } } exports.XenovaEmbeddingService = XenovaEmbeddingService;