Select Git revision
Embeddings.js
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
Embeddings.js 13.66 KiB
import { Ollama } from 'ollama';
import { aiFilterModelsByName } from "../controllers/AI.js";
import { ChromaClient } from "chromadb";
// embeddings
import { Chroma } from "@langchain/community/vectorstores/chroma";
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
// loaders - https://js.langchain.com/v0.1/docs/modules/data_connection/document_loaders/
import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
import {
JSONLoader,
JSONLinesLoader,
} from "langchain/document_loaders/fs/json";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
import fs from 'fs';
import path from 'path';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import { MultiFileLoader } from "langchain/document_loaders/fs/multi_file";
import { ScoreThresholdRetriever } from 'langchain/retrievers/score_threshold';
// ---------------------------------------------------------
// SHARED CLIENTS AND SETTINGS
// Every endpoint and name below is read from environment
// variables once, when this module is loaded.
// ---------------------------------------------------------

// Ollama client, host taken from AI_API_URL
export const ollama = new Ollama({ host: process.env.AI_API_URL });

// ChromaDB client, path taken from VECTOR_API_URL (module-private)
const chroma = new ChromaClient({ path: process.env.VECTOR_API_URL });

// Ollama embedder: model RAG_MODEL_NAME served from AI_API_URL
export const embeddings = new OllamaEmbeddings({
  baseUrl: process.env.AI_API_URL,
  model: process.env.RAG_MODEL_NAME,
  maxConcurrency: 5,
});

// Settings object handed to every Chroma vector store call in this file
export const chromaVSsettings = {
  collectionName: process.env.VECTOR_COLLECTION_NAME,
  url: process.env.VECTOR_API_URL,
  collectionMetadata: { 'hnsw:space': 'cosine' },
};
// PROVIDE VECTOR STORE CONNECTION
// The connection is established once while this module loads (top-level await).
// If the heartbeat or the collection lookup fails, the module throws and the import fails.
// predefine vectorStoreConnection in module scope so it can be exported after the try block
let vectorStoreConnection;
try {
  // check if the vector DB is reachable
  await chroma.heartbeat();
  // create connection to the configured collection
  vectorStoreConnection = await Chroma.fromExistingCollection(embeddings, chromaVSsettings);
} catch (error) {
  // keep the underlying failure attached as `cause` so the root problem stays diagnosable
  throw new Error(
    `Error creating VectorDB connection on ${process.env['VECTOR_API_URL']}`,
    { cause: error }
  );
}
// export vectorStoreConnection
export default vectorStoreConnection;
// PROVIDE RETRIEVER
export const retriever = vectorStoreConnection.asRetriever();
// Alternative retriever configurations, kept for reference:
// export const retriever = vectorStoreConnection.asRetriever(1);
// export const retriever = ScoreThresholdRetriever.fromVectorStore(vectorStoreConnection, {
// minSimilarityScore: 0.1, // Finds results with at least this similarity score
// maxK: 100, // The maximum K value to use. Use it based to your chunk size to make sure you don't run out of tokens
// kIncrement: 2, // How much to increase K by each time. It'll fetch N results, then N + kIncrement, then N + kIncrement * 2, etc.
// });
/** *******************************************************
 * REMOVE VECTOR DB
 *
 * Express handler that deletes the collection named by VECTOR_COLLECTION_NAME.
 * Responds with 404 when the vector DB is not reachable or the collection
 * does not exist. Unexpected failures are handed to `next` instead of being
 * left as an unhandled promise rejection.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response used for the status / JSON output
 * @param {Function} [next] - error callback; when absent the error is rethrown
 * @returns {Promise<*>} whatever `res.json` / `next` returns
 */
export const removeVectorDb = async (req, res, next) => {
  try {
    // check if vDB is running, exit if not
    const vectorDBrunning = await isVectorDbAvailable();
    if (!vectorDBrunning) {
      return res.status(404).json({ vectorDBrunning });
    }
    // exit if the collection doesn't exist
    if (!(await isCollectionAvailable())) {
      return res.status(404).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
    }
    // delete collection
    await chroma.deleteCollection({ name: process.env['VECTOR_COLLECTION_NAME'] });
    return res.json({ 'message': 'VectorDB removed.' });
  } catch (error) {
    // forward to the error callback when one was supplied
    if (typeof next === 'function') {
      return next(error);
    }
    throw error;
  }
};
/** *******************************************************
 * CHECK STATUS OF VECTOR DB
 *
 * Express handler reporting whether the vector DB is reachable, together with
 * the collection and its item count. A missing collection is created first.
 * Responds with 404 when the vector DB is not reachable. Unexpected failures
 * are handed to `next` instead of being left as an unhandled promise rejection.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response used for the status / JSON output
 * @param {Function} [next] - error callback; when absent the error is rethrown
 * @returns {Promise<*>} whatever `res.json` / `next` returns
 */
export const getStatus = async (req, res, next) => {
  try {
    // check if vDB is running, exit if not
    const vectorDBrunning = await isVectorDbAvailable();
    if (!vectorDBrunning) {
      return res.status(404).json({ vectorDBrunning });
    }
    // use the existing collection or create it on demand
    let collection = await isCollectionAvailable();
    if (!collection) {
      collection = await createCollection();
    }
    // get collection count
    const itemCount = await collection.count();
    // return status
    return res.json({ vectorDBrunning, collection, itemCount });
  } catch (error) {
    // forward to the error callback when one was supplied
    if (typeof next === 'function') {
      return next(error);
    }
    throw error;
  }
};
/** *******************************************************
 * CREATE EMBEDDINGS
 *
 * Express handler that loads every file below the RAG folder and embeds it
 * into the vector store. Responds with 500 when the collection is missing.
 * The embedding model named by RAG_MODEL_NAME is pulled first when
 * `aiFilterModelsByName` reports no match. Unexpected failures are handed to
 * `next` when it is supplied, matching the other handlers in this file.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response used for the status / JSON output
 * @param {Function} [next] - error callback; when absent the error is rethrown
 * @returns {Promise<*>} whatever `res.json` / `next` returns
 */
export const createEmbeddings = async (req, res, next) => {
  try {
    // check if collection is available
    const collection = await isCollectionAvailable();
    if (!collection) {
      return res.status(500).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
    }
    // test if model is available
    const models = await aiFilterModelsByName(process.env['RAG_MODEL_NAME']);
    // install model if missing
    if (!models.length) {
      console.info('Embedding Model not found. Installing ...');
      await ollama.pull({ model: process.env['RAG_MODEL_NAME'] });
    }
    // load RAG files
    const docs = await directoryLoader();
    // embed
    await embedder(docs);
    return res.json({ 'message': 'Embeddings created.' });
  } catch (error) {
    // forward to the error callback when one was supplied
    if (typeof next === 'function') {
      return next(error);
    }
    throw error;
  }
};
/** *******************************************************
 * UPDATE EMBEDDINGS
 *
 * Express handler that synchronises the vector store with the RAG folder:
 *  - files without embeddings are embedded,
 *  - files modified after their embedding timestamp (or whose embeddings carry
 *    no usable timestamp) have their embeddings deleted and are embedded again,
 *  - embeddings whose source file no longer exists are deleted, but only when
 *    RAG_DELETE_EMBEDDINGS is the string 'true'.
 * Responds with 500 when the collection is missing. Unexpected failures are
 * handed to `next` instead of being left as an unhandled promise rejection.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response used for the status / JSON output
 * @param {Function} [next] - error callback; when absent the error is rethrown
 * @returns {Promise<*>} whatever `res.json` / `next` returns
 */
export const updateEmbeddings = async (req, res, next) => {
  try {
    // check if collection is available
    const collection = await isCollectionAvailable();
    if (!collection) {
      return res.status(500).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
    }

    // #################
    // GET CURRENT STATE
    // #################
    // local files mapped to their mtime (unix seconds)
    const currentRAGFiles = await getCurrentRAGFiles();
    const embeddingsBefore = await collection.count();
    const allCurrentEmbeddings = await collection.get();
    // embedded source -> timestamp; a Map keeps "known source" separate from
    // "falsy timestamp". For several chunks of one source the last one wins.
    const embeddedSources = new Map();
    for (const metadata of allCurrentEmbeddings.metadatas ?? []) {
      // entries without metadata or source cannot be matched to a file
      if (metadata?.source == null) continue;
      embeddedSources.set(metadata.source, metadata.timestamp);
    }

    // #################
    // DECIDE WHAT TO DO
    // #################
    const files2embed = [];
    const outdatedEmbeddings = [];
    for (const [file, modifiedAt] of Object.entries(currentRAGFiles)) {
      // RAGFile not in Embeddings => insert
      if (!embeddedSources.has(file)) {
        files2embed.push(file);
        continue;
      }
      const embeddedAt = Number(embeddedSources.get(file));
      // whatever is left in the map afterwards has no file on disk anymore
      embeddedSources.delete(file);
      // RAGFile in Embeddings but outdated => update.
      // A missing or unusable timestamp cannot prove freshness, so it counts as
      // outdated; the old chunks are removed instead of being duplicated.
      if (!Number.isFinite(embeddedAt) || modifiedAt > embeddedAt) {
        files2embed.push(file);
        outdatedEmbeddings.push(file);
      }
      // otherwise the embedding is up to date => do nothing
    }
    const orphanedSources = [...embeddedSources.keys()];

    // #################
    // PROCESS DECISIONS
    // #################
    // delete outdated embeddings
    if (outdatedEmbeddings.length > 0) {
      await deleteEmbeddingsByFileNames(outdatedEmbeddings);
    }
    // embed missing and outdated files
    if (files2embed.length > 0) {
      const docs = await fileLoader(files2embed);
      await embedder(docs);
    }
    // Embedding not in RAGFiles => delete from Embeddings (opt-in)
    const deleteOrphans = process.env.RAG_DELETE_EMBEDDINGS === 'true' && orphanedSources.length > 0;
    if (deleteOrphans) {
      await deleteEmbeddingsByFileNames(orphanedSources);
    }

    // #################
    // FINALIZE
    // #################
    const embeddingsAfter = await collection.count();
    return res.json({
      message: 'Embeddings updated.',
      updated: outdatedEmbeddings.length,
      added: files2embed.length - outdatedEmbeddings.length,
      // only report sources that were actually removed
      deleted: deleteOrphans ? orphanedSources.length : 0,
      embeddingsBefore,
      embeddingsAfter
    });
  } catch (error) {
    // forward to the error callback when one was supplied
    if (typeof next === 'function') {
      return next(error);
    }
    throw error;
  }
};
/** *******************************************************
####################### FUNCTIONS #######################
******************************************************* */
/** *******************************************************
 * COLLECT THE PATHS OF ALL FILES BELOW A DIRECTORY
 * Walks directoryPath depth-first and returns, for every regular file found,
 * directoryPath joined with the file's relative location. Sub-directories are
 * descended into; entries that are neither directory nor file are ignored.
 * written by copilot
 */
function readDirectoryRecursive(directoryPath) {
  const dirents = fs.readdirSync(directoryPath, { withFileTypes: true });
  return dirents.flatMap((dirent) => {
    const location = path.join(directoryPath, dirent.name);
    // directories contribute everything found beneath them
    if (dirent.isDirectory()) {
      return readDirectoryRecursive(location);
    }
    // regular files contribute themselves, anything else contributes nothing
    return dirent.isFile() ? [location] : [];
  });
}
/** *******************************************************
 * GET CURRENT RAG FILES
 *
 * Lists every file below <cwd>/<RAG_FOLDER> together with its modification
 * time. Paths ending in '.gitkeep' are left out.
 *
 * @returns {Promise<Object<string, number>>} file path -> mtime in unix seconds
 */
export const getCurrentRAGFiles = async () => {
  // set directory path
  const directoryPath = path.join(process.cwd(), process.env.RAG_FOLDER);
  const currentRAGFiles = {};
  for (const file of readDirectoryRecursive(directoryPath)) {
    // The listing holds full paths, so the placeholder is matched by suffix;
    // an exact comparison against '.gitkeep' could never be true here.
    if (file.endsWith('.gitkeep')) continue;
    // mtime is already a Date; turn it into a unix timestamp in seconds
    const { mtime } = fs.statSync(file);
    currentRAGFiles[file] = Math.floor(mtime.getTime() / 1000);
  }
  return currentRAGFiles;
};
/** *******************************************************
 * CHECK IF VECTOR DB IS AVAILABLE
 * Resolves to true when the Chroma heartbeat succeeds and to false when
 * anything inside the check throws. Performance marks are recorded before
 * the heartbeat and after a successful one.
 */
export const isVectorDbAvailable = async () => {
  try {
    performance.mark('isVectorDbAvailable:start');
    // only the outcome matters, the heartbeat value itself is not used
    await chroma.heartbeat();
    performance.mark('isVectorDbAvailable:end');
  } catch {
    return false;
  }
  return true;
};
/** *******************************************************
 * CHECK IF VECTOR DB COLLECTION IS AVAILABLE
 *
 * Resolves to the collection named by VECTOR_COLLECTION_NAME, or to false
 * when the vector DB is not reachable or no such collection is listed.
 *
 * NOTE(review): depending on the installed chromadb client version,
 * listCollections() appears to yield either objects carrying a `name` or
 * plain name strings - both shapes are accepted here; confirm against the
 * version in use.
 *
 * @returns {Promise<object|false>} the collection, or false
 */
export const isCollectionAvailable = async () => {
  performance.mark('isCollectionAvailable:start');
  // return false if vector db is not available
  if (!(await isVectorDbAvailable())) {
    return false;
  }
  const wantedName = process.env['VECTOR_COLLECTION_NAME'];
  // get active collections
  const collections = await chroma.listCollections();
  // check if required collection exists (entry is either a name or has a name)
  const exists = collections.some(
    (entry) => (typeof entry === 'string' ? entry : entry?.name) === wantedName
  );
  performance.mark('isCollectionAvailable:end');
  if (!exists) {
    return false;
  }
  // return collection
  return await chroma.getCollection({ name: wantedName });
};
/** *******************************************************
 * CREATE VECTOR DB COLLECTION
 *
 * Creates the collection named by VECTOR_COLLECTION_NAME. The collection
 * metadata from chromaVSsettings is passed along so a collection created here
 * carries the same metadata the vector store settings in this file declare.
 *
 * @returns {Promise<object>} the created collection
 * @throws rethrows whatever chroma.createCollection rejects with, after logging it
 */
export const createCollection = async () => {
  try {
    return await chroma.createCollection({
      name: process.env['VECTOR_COLLECTION_NAME'],
      metadata: chromaVSsettings.collectionMetadata,
    });
  } catch (error) {
    console.error('Error creating VectorDB collection:', error);
    // This is not an Express handler, so there is no `next` to call;
    // let the caller decide how to report the failure.
    throw error;
  }
};
/** *******************************************************
 * LOAD WHOLE FOLDER RECURSIVELY
 * Builds a DirectoryLoader for <cwd>/<RAG_FOLDER> with one loader factory per
 * supported file extension and resolves to whatever its load() yields.
 */
export const directoryLoader = async () => {
  const ragRoot = path.join(process.cwd(), process.env.RAG_FOLDER);
  // TODO simply skip not mentioned file types
  // file extension -> factory creating the matching document loader
  const loaderByExtension = {
    '.json': (file) => new JSONLoader(file, '/texts'),
    '.jsonl': (file) => new JSONLinesLoader(file, '/html'),
    '.txt': (file) => new TextLoader(file),
    '.csv': (file) => new CSVLoader(file, 'text'),
    '.pdf': (file) => new PDFLoader(file, 'text'),
  };
  const folderLoader = new DirectoryLoader(ragRoot, loaderByExtension);
  const documents = await folderLoader.load();
  return documents;
};
/** *******************************************************
 * LOAD SPECIFIC DOCUMENT
 * Builds a MultiFileLoader for the given file paths with one loader factory
 * per supported file extension and resolves to whatever its load() yields.
 */
export const fileLoader = async (docs = []) => {
  // file extension -> factory creating the matching document loader
  const loaderByExtension = {
    '.json': (file) => new JSONLoader(file, '/texts'),
    '.jsonl': (file) => new JSONLinesLoader(file, '/html'),
    '.txt': (file) => new TextLoader(file),
    '.csv': (file) => new CSVLoader(file, 'text'),
    '.pdf': (file) => new PDFLoader(file, 'text'),
  };
  const filesLoader = new MultiFileLoader(docs, loaderByExtension);
  const documents = await filesLoader.load();
  return documents;
};
/** *******************************************************
 * EMBED GIVEN DOCS
 * Splits the docs into chunks, stamps each chunk's metadata with the current
 * unix time in seconds and stores the chunks through Chroma.fromDocuments
 * using the shared embedder and vector store settings.
 */
export const embedder = async (docs) => {
  // chunk docs
  const textSplitter = new RecursiveCharacterTextSplitter();
  const pieces = await textSplitter.splitDocuments(docs);
  // stamp every chunk with the time (unix seconds) at which it is processed
  for (const piece of pieces) {
    piece.metadata.timestamp = Math.floor(Date.now() / 1000);
  }
  // store into vector db
  const store = await Chroma.fromDocuments(pieces, embeddings, chromaVSsettings);
  return store;
};
/** *******************************************************
 * DELETE EMBEDDINGS BY FILE NAME
 *
 * Deletes, one file name at a time, every embedding whose `source` metadata
 * equals that file name.
 *
 * @param {string[]} [fileNames=[]] - source values whose embeddings are removed
 * @returns {Promise<true>} true once all deletions were issued
 * @throws {Error} when the collection is not available
 */
export const deleteEmbeddingsByFileNames = async (fileNames = []) => {
  // check if collection is available
  const collection = await isCollectionAvailable();
  if (!collection) {
    // This helper has no response object to write to, so the problem is
    // reported to the caller as an error.
    throw new Error(`VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.`);
  }
  // delete from embeddings
  for (const fileName of fileNames) {
    await collection.delete({ where: { source: fileName } });
  }
  return true;
};