// Embeddings.js


import { Ollama } from 'ollama';
import { aiFilterModelsByName } from "../controllers/AI.js";
import { ChromaClient } from "chromadb";
// embeddings
import { Chroma } from "@langchain/community/vectorstores/chroma";
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
// loaders - https://js.langchain.com/v0.1/docs/modules/data_connection/document_loaders/
import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
import {
  JSONLoader,
  JSONLinesLoader,
} from "langchain/document_loaders/fs/json";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
import fs from 'fs';
import path from 'path';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import { MultiFileLoader } from "langchain/document_loaders/fs/multi_file";
import { ScoreThresholdRetriever } from 'langchain/retrievers/score_threshold';


// Ollama client (exported); host is read from AI_API_URL
export const ollama = new Ollama({ host: process.env.AI_API_URL });

// Chroma client used inside this file for heartbeat checks and collection management
const chroma = new ChromaClient({ path: process.env.VECTOR_API_URL });

// Embedding function: runs the model named in RAG_MODEL_NAME against the Ollama host
export const embeddings = new OllamaEmbeddings({
  baseUrl: process.env.AI_API_URL,
  model: process.env.RAG_MODEL_NAME,
  maxConcurrency: 5,
});

// One settings object for every LangChain Chroma vector store call in this file
export const chromaVSsettings = {
  collectionName: process.env.VECTOR_COLLECTION_NAME,
  url: process.env.VECTOR_API_URL,
  collectionMetadata: {
    "hnsw:space": "cosine",
  },
};
// PROVIDE VECTOR STORE CONNECTION
// Declared outside the try block so it can be exported and reused below.
let vectorStoreConnection;
try {
  // check if the vector DB is reachable before binding to the collection
  await chroma.heartbeat();
  // create connection
  vectorStoreConnection = await Chroma.fromExistingCollection(embeddings, chromaVSsettings);
} catch (error) {
  // Abort module loading, but keep the underlying failure attached as `cause`
  // so the real reason (network error, missing collection, ...) stays visible.
  throw new Error(
    `Error creating VectorDB connection on ${process.env['VECTOR_API_URL']}`,
    { cause: error }
  );
}
// export vectorStoreConnection
export default vectorStoreConnection;
// PROVIDE RETRIEVER
export const retriever = vectorStoreConnection.asRetriever();
// export const retriever = vectorStoreConnection.asRetriever(1);
// export const retriever = ScoreThresholdRetriever.fromVectorStore(vectorStoreConnection, {
//   minSimilarityScore: 0.1, // Finds results with at least this similarity score
//   maxK: 100, // The maximum K value to use. Use it based to your chunk size to make sure you don't run out of tokens
//   kIncrement: 2, // How much to increase K by each time. It'll fetch N results, then N + kIncrement, then N + kIncrement * 2, etc.
// });


/** *******************************************************
 * REMOVE VECTOR DB
 *
 * Request handler that deletes the collection named by VECTOR_COLLECTION_NAME.
 * Responds 404 when the vector DB is unreachable or the collection is missing.
 * Any unexpected failure is forwarded to `next` instead of being left as an
 * unhandled promise rejection.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response object providing `status()` and `json()`
 * @param {Function} next - called with the error when something fails
 */
export const removeVectorDb = async (req, res, next) => {
  try {
    // check if vDB is running
    const vectorDBrunning = await isVectorDbAvailable();
    if (!vectorDBrunning) {
      return res.status(404).json({ vectorDBrunning });
    }
    // exit if the collection doesn't exist
    if (!(await isCollectionAvailable())) {
      return res.status(404).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
    }

    // delete collection
    await chroma.deleteCollection({ name: process.env['VECTOR_COLLECTION_NAME'] });
    return res.json({ 'message': 'VectorDB removed.' });
  } catch (error) {
    return next(error);
  }
};


/** *******************************************************
 * CHECK STATUS OF VECTOR DB
 *
 * Request handler reporting whether the vector DB is reachable and how many
 * items the collection holds. A missing collection is created on the fly.
 * Responds 404 with `{ vectorDBrunning }` when the DB is unreachable, otherwise
 * with `{ vectorDBrunning, collection, itemCount }`. Failures while creating or
 * counting are forwarded to `next` rather than left as unhandled rejections.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response object providing `status()` and `json()`
 * @param {Function} next - called with the error when something fails
 */
export const getStatus = async (req, res, next) => {
  try {
    // check if vDB is running
    const vectorDBrunning = await isVectorDbAvailable();
    if (!vectorDBrunning) {
      return res.status(404).json({ vectorDBrunning });
    }
    // use the existing collection, or create it when it is missing
    let collection = await isCollectionAvailable();
    if (!collection) {
      collection = await createCollection();
    }
    // get collection count
    const itemCount = await collection.count();
    // return status
    return res.json({ vectorDBrunning, collection, itemCount });
  } catch (error) {
    return next(error);
  }
};

/** *******************************************************
 * CREATE EMBEDDINGS
 *
 * Request handler that embeds every file of the RAG folder into the collection.
 * Responds 500 when the collection does not exist. When the embedding model
 * named by RAG_MODEL_NAME is not installed it is pulled first.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response object providing `status()` and `json()`
 * @param {Function} [next] - receives any error; when omitted the error is
 *   rethrown, so callers invoking the handler with two arguments still get a
 *   rejected promise as before
 */
export const createEmbeddings = async (req, res, next) => {
  try {
    // check if collection is available
    const collection = await isCollectionAvailable();
    if (!collection) {
      return res.status(500).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
    }
    // test if model is available
    const models = await aiFilterModelsByName(process.env['RAG_MODEL_NAME']);
    // install model if missing
    if (!models.length) {
      console.info('Embedding Model not found. Installing ...');
      await ollama.pull({ model: process.env['RAG_MODEL_NAME'] });
    }
    // load RAG files and embed them
    const docs = await directoryLoader();
    await embedder(docs);

    return res.json({ 'message': 'Embeddings created.' });
  } catch (error) {
    if (typeof next === 'function') {
      return next(error);
    }
    throw error;
  }
};

/**
 * Decide which RAG files must be (re-)embedded and which embeddings no longer
 * have a file behind them.
 *
 * @param {Object<string, number>} currentRAGFiles - file path => mtime
 * @param {Map<string, number>} embeddedAtBySource - source path => embed timestamp
 * @returns {{files2embed: string[], outdatedEmbeddings: string[], orphanedEmbeddings: string[]}}
 */
function planEmbeddingSync(currentRAGFiles, embeddedAtBySource) {
  const files2embed = [];
  const outdatedEmbeddings = [];
  for (const [file, modifiedAt] of Object.entries(currentRAGFiles)) {
    // RAGFile not in Embeddings => insert
    if (!embeddedAtBySource.has(file)) {
      files2embed.push(file);
      continue;
    }
    // RAGFile in Embeddings but outdated => update.
    // Written as `!(a <= b)` so an embedding without a usable timestamp is
    // replaced instead of being duplicated next to its old chunks.
    if (!(modifiedAt <= embeddedAtBySource.get(file))) {
      files2embed.push(file);
      outdatedEmbeddings.push(file);
    }
    // otherwise the embedding is up to date => nothing to do
  }
  // Embedding whose source is not among the RAG files
  const orphanedEmbeddings = [...embeddedAtBySource.keys()]
    .filter((source) => !Object.hasOwn(currentRAGFiles, source));
  return { files2embed, outdatedEmbeddings, orphanedEmbeddings };
}

/** *******************************************************
 * UPDATE EMBEDDINGS
 *
 * Request handler that brings the collection in sync with the RAG folder:
 * new files are embedded, files modified after their embedding are replaced,
 * and - only when RAG_DELETE_EMBEDDINGS is 'true' - embeddings whose source
 * file is gone are deleted. Responds 500 when the collection does not exist.
 *
 * The response reports `updated`, `added`, `deleted`, `embeddingsBefore` and
 * `embeddingsAfter`; `deleted` is 0 when orphan deletion is disabled, because
 * nothing was removed in that case.
 *
 * @param {object} req - incoming request (not read)
 * @param {object} res - response object providing `status()` and `json()`
 * @param {Function} next - called with the error when something fails
 */
export const updateEmbeddings = async (req, res, next) => {
  try {
    // check if collection is available
    const collection = await isCollectionAvailable();
    if (!collection) {
      return res.status(500).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
    }

    // #################
    // GET CURRENT STATE
    // #################
    // local files incl. mtime
    const currentRAGFiles = await getCurrentRAGFiles();

    // current embeddings, keyed like currentRAGFiles
    const embeddingsBefore = await collection.count();
    const { metadatas } = await collection.get();
    const embeddedAtBySource = new Map();
    for (const metadata of metadatas) {
      // entries without metadata or without a source cannot be matched to a file
      if (metadata?.source == null) continue;
      embeddedAtBySource.set(metadata.source, metadata.timestamp);
    }

    // #################
    // DECIDE WHAT TO DO
    // #################
    const { files2embed, outdatedEmbeddings, orphanedEmbeddings } =
      planEmbeddingSync(currentRAGFiles, embeddedAtBySource);

    // #################
    // PROCESS DECISIONS
    // #################
    // delete outdated embeddings before their files are embedded again
    if (outdatedEmbeddings.length > 0) {
      await deleteEmbeddingsByFileNames(outdatedEmbeddings);
    }

    // embed missing and outdated files
    if (files2embed.length > 0) {
      const docs = await fileLoader(files2embed);
      await embedder(docs);
    }

    // Embedding not in RAGFiles => delete from Embeddings (opt-in)
    const deleteOrphans = process.env.RAG_DELETE_EMBEDDINGS === 'true' && orphanedEmbeddings.length > 0;
    if (deleteOrphans) {
      await deleteEmbeddingsByFileNames(orphanedEmbeddings);
    }

    // #################
    // FINALIZE
    // #################
    const embeddingsAfter = await collection.count();

    return res.json({
      message: 'Embeddings updated.',
      updated: outdatedEmbeddings.length,
      added: files2embed.length - outdatedEmbeddings.length,
      deleted: deleteOrphans ? orphanedEmbeddings.length : 0,
      embeddingsBefore,
      embeddingsAfter
    });
  } catch (error) {
    return next(error);
  }
};


/** *******************************************************
 ####################### FUNCTIONS #######################
 ******************************************************* */


/** *******************************************************
* COLLECT ALL FILES BELOW A DIRECTORY
*
* Walks `directoryPath` depth-first and returns the path of every regular
* file found, each one built by joining the directory path with the entry
* name. Entries that are neither a directory nor a file are skipped.
*
* @param {string} directoryPath - directory to start from
* @returns {string[]} file paths in directory listing order
*/
function readDirectoryRecursive(directoryPath) {
  return fs
    .readdirSync(directoryPath, { withFileTypes: true })
    .flatMap((dirent) => {
      const entryPath = path.join(directoryPath, dirent.name);
      // descend into sub directories, splicing their files in place
      if (dirent.isDirectory()) {
        return readDirectoryRecursive(entryPath);
      }
      // keep regular files, drop everything else
      return dirent.isFile() ? [entryPath] : [];
    });
}

/** *******************************************************
* GET CURRENT RAG FILES
*
* Lists every file below `<cwd>/<RAG_FOLDER>` except paths ending in
* `.gitkeep`, together with its modification time.
*
* @returns {Promise<Object<string, number>>} file path => mtime in whole
*   seconds since the Unix epoch
*/
export const getCurrentRAGFiles = async () => {
  const ragRoot = path.join(process.cwd(), process.env.RAG_FOLDER);
  const mtimeByFile = {};
  for (const filePath of readDirectoryRecursive(ragRoot)) {
    // placeholder files are not RAG content
    if (filePath.endsWith('.gitkeep')) continue;
    const { mtime } = fs.statSync(filePath);
    // milliseconds => whole seconds
    mtimeByFile[filePath] = Math.floor(mtime.getTime() / 1000);
  }
  return mtimeByFile;
};


/** *******************************************************
* CHECK IF VECTOR DB IS AVAILABLE
*
* Sends a heartbeat to the Chroma server. Any failure is treated as
* "not available" on purpose, so this function never rejects.
*
* @returns {Promise<boolean>} true when the heartbeat succeeded
*/
export const isVectorDbAvailable = async () => {
  try {
    performance.mark('isVectorDbAvailable:start');
    await chroma.heartbeat();
    // only reached when the heartbeat answered
    performance.mark('isVectorDbAvailable:end');
  } catch {
    return false;
  }
  return true;
};

/** *******************************************************
* CHECK IF VECTOR DB COLLECTION IS AVAILABLE
*
* Looks for the collection named by VECTOR_COLLECTION_NAME among the
* collections listed by the Chroma server.
*
* @returns {Promise<object|false>} the collection when it exists, false when
*   the vector DB is unreachable or the collection is not listed
*/
export const isCollectionAvailable = async () => {
  performance.mark('isCollectionAvailable:start');
  // nothing to look up while the vector db is down
  if (!(await isVectorDbAvailable())) return false;

  const wantedName = process.env['VECTOR_COLLECTION_NAME'];
  const listed = await chroma.listCollections();
  const isListed = listed.some((entry) => entry.name === wantedName);
  performance.mark('isCollectionAvailable:end');

  // hand back the collection itself so callers can work with it directly
  return isListed ? await chroma.getCollection({ name: wantedName }) : false;
};


/** *******************************************************
* CREATE VECTOR DB COLLECTION
*
* Creates the collection named by VECTOR_COLLECTION_NAME.
* NOTE(review): only the name is passed, so the collection is created without
* the `collectionMetadata` from chromaVSsettings - confirm that is intended.
*
* @returns {Promise<object>} the collection returned by the Chroma client
* @throws {Error} when creation fails; the underlying error is attached as
*   `cause`. (This helper has no `next` callback in scope, so the failure is
*   reported to the caller by rejecting.)
*/
export const createCollection = async () => {
  const name = process.env['VECTOR_COLLECTION_NAME'];
  try {
    return await chroma.createCollection({ name });
  } catch (error) {
    console.error('Error creating VectorDB collection:', error);
    throw new Error(`Error creating VectorDB collection ${name}`, { cause: error });
  }
};


/** *******************************************************
* LOAD WHOLE FOLDER RECURSIVELY
*
* Loads the documents below `<cwd>/<RAG_FOLDER>` through a DirectoryLoader
* that picks a loader per file extension.
*
* @returns {Promise<Array>} whatever the DirectoryLoader's `load()` resolves to
*/
export const directoryLoader = async () => {
  const ragRoot = path.join(process.cwd(), process.env.RAG_FOLDER);
  // TODO simply skip not mentioned file types
  const loaderByExtension = {
    ".json": (filePath) => new JSONLoader(filePath, "/texts"),
    ".jsonl": (filePath) => new JSONLinesLoader(filePath, "/html"),
    ".txt": (filePath) => new TextLoader(filePath),
    ".csv": (filePath) => new CSVLoader(filePath, "text"),
    ".pdf": (filePath) => new PDFLoader(filePath, "text"),
  };
  const folderLoader = new DirectoryLoader(ragRoot, loaderByExtension);
  return await folderLoader.load();
};

/** *******************************************************
* LOAD SPECIFIC DOCUMENTS
*
* Loads the given files through a MultiFileLoader that picks a loader per
* file extension.
*
* @param {string[]} [docs=[]] - paths of the files to load
* @returns {Promise<Array>} whatever the MultiFileLoader's `load()` resolves to
*/
export const fileLoader = async (docs = []) => {
  const loaderByExtension = {
    ".json": (filePath) => new JSONLoader(filePath, "/texts"),
    ".jsonl": (filePath) => new JSONLinesLoader(filePath, "/html"),
    ".txt": (filePath) => new TextLoader(filePath),
    ".csv": (filePath) => new CSVLoader(filePath, "text"),
    ".pdf": (filePath) => new PDFLoader(filePath, "text"),
  };
  const filesLoader = new MultiFileLoader(docs, loaderByExtension);
  return await filesLoader.load();
};


/** *******************************************************
* EMBED GIVEN DOCS
*
* Splits the documents into chunks, stamps every chunk's metadata with the
* current time in whole Unix seconds and stores the chunks in the vector DB
* using the module's embeddings and Chroma settings.
*
* @param {Array} docs - documents to split and embed
* @returns {Promise<object>} the vector store returned by `Chroma.fromDocuments`
*/
export const embedder = async (docs) => {
  // chunk docs
  const chunks = await new RecursiveCharacterTextSplitter().splitDocuments(docs);

  // remember when each chunk was embedded
  for (const { metadata } of chunks) {
    metadata.timestamp = Math.floor(Date.now() / 1000);
  }

  // store into vector db
  return await Chroma.fromDocuments(chunks, embeddings, chromaVSsettings);
};


/** *******************************************************
 * DELETE EMBEDDINGS BY FILE NAME
 *
 * Deletes, one file after another, every embedding whose `source` metadata
 * equals one of the given file names.
 *
 * @param {string[]} [fileNames=[]] - `source` values to delete
 * @returns {Promise<boolean>} true once all deletions have completed
 * @throws {Error} when the collection is not available. (This helper is not a
 *   request handler and has no `res` in scope, so it reports the problem by
 *   rejecting.)
 */
export const deleteEmbeddingsByFileNames = async (fileNames = []) => {
  // check if collection is available
  const collection = await isCollectionAvailable();
  if (!collection) {
    throw new Error(`VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.`);
  }

  // delete from embeddings
  for (const fileName of fileNames) {
    await collection.delete({ where: { source: fileName } });
  }
  return true;
};