Skip to content
Snippets Groups Projects
Select Git revision
  • 02f5be355a453122c193d01398f959d7429b9e84
  • master default
  • v0.1.11.1
  • v0.1.11
  • v0.1.10.1
  • v0.1.10
  • v0.1.9.8
  • v0.1.9.7
  • v0.1.9.6
  • v0.1.2
  • v0.1.1
  • v0.1
12 results

test_to_lin.py

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    Embeddings.js 13.66 KiB
    
    import { Ollama } from 'ollama';
    import { aiFilterModelsByName } from "../controllers/AI.js";
    import { ChromaClient } from "chromadb";
    // embeddings
    import { Chroma } from "@langchain/community/vectorstores/chroma";
    import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama";
    import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
    // loaders - https://js.langchain.com/v0.1/docs/modules/data_connection/document_loaders/
    import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
    import {
      JSONLoader,
      JSONLinesLoader,
    } from "langchain/document_loaders/fs/json";
    import { TextLoader } from "langchain/document_loaders/fs/text";
    import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
    import fs from 'fs';
    import path from 'path';
    import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
    import { MultiFileLoader } from "langchain/document_loaders/fs/multi_file";
    import { ScoreThresholdRetriever } from 'langchain/retrievers/score_threshold';
    
    
    // OLLAMA CLIENT
    // Talks to the Ollama host configured through AI_API_URL.
    export const ollama = new Ollama({ host: process.env.AI_API_URL });

    // CHROMA-DB CLIENT
    // Low-level client, configured through VECTOR_API_URL; used in this module
    // for heartbeat checks and collection management.
    const chroma = new ChromaClient({ path: process.env.VECTOR_API_URL });

    // OLLAMA EMBEDDER
    // Embedding function backed by the model named in RAG_MODEL_NAME.
    export const embeddings = new OllamaEmbeddings({
      baseUrl: process.env.AI_API_URL,
      model: process.env.RAG_MODEL_NAME,
      maxConcurrency: 5,
    });

    // UNIFIED CHROMA VECTORSTORE SETTINGS
    // One settings object, passed to every LangChain Chroma call in this module.
    export const chromaVSsettings = {
      collectionName: process.env.VECTOR_COLLECTION_NAME,
      url: process.env.VECTOR_API_URL,
      collectionMetadata: { 'hnsw:space': 'cosine' },
    };
    // PROVIDE VECTOR STORE CONNECTION
    // Declared outside the try block so it can be exported below.
    let vectorStoreConnection;
    try {
      // check if vectorDB is reachable before trying to attach to the collection
      await chroma.heartbeat();
      // create connection
      vectorStoreConnection = await Chroma.fromExistingCollection(embeddings, chromaVSsettings);
    } catch (error) {
      // Module loading fails if the connection can't be established. The
      // underlying failure is attached as `cause` so the root problem
      // (network, auth, missing collection, ...) stays visible to whoever
      // catches or logs this error.
      throw new Error(
        `Error creating VectorDB connection on ${process.env['VECTOR_API_URL']}`,
        { cause: error }
      );
    }
    // export vectorStoreConnection
    export default vectorStoreConnection;
    // PROVIDE RETRIEVER
    export const retriever = vectorStoreConnection.asRetriever();
    // Alternative retriever configurations, kept for reference:
    // export const retriever = vectorStoreConnection.asRetriever(1);
    // export const retriever = ScoreThresholdRetriever.fromVectorStore(vectorStoreConnection, {
    //   minSimilarityScore: 0.1, // Finds results with at least this similarity score
    //   maxK: 100, // The maximum K value to use. Use it based to your chunk size to make sure you don't run out of tokens
    //   kIncrement: 2, // How much to increase K by each time. It'll fetch N results, then N + kIncrement, then N + kIncrement * 2, etc.
    // });
    
    
    
    /** *******************************************************
     * REMOVE VECTOR DB
     *
     * Express handler that deletes the collection named by
     * VECTOR_COLLECTION_NAME.
     *
     * Responses:
     *  - 404 `{ vectorDBrunning }` when the vector DB does not answer
     *  - 404 `{ error }` when the collection does not exist
     *  - 200 `{ message }` once the collection has been deleted
     */
    export const removeVectorDb = async (req, res, next) => {
      // both failure modes answer with 404, only the payload differs
      const notFound = (payload) => res.status(404).json(payload);

      // is the vector DB up at all?
      const vectorDBrunning = await isVectorDbAvailable();
      if (!vectorDBrunning) return notFound({ vectorDBrunning });

      // is there anything to delete?
      const collectionExists = Boolean(await isCollectionAvailable());
      if (!collectionExists) {
        return notFound({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
      }

      // drop the collection and confirm
      const name = process.env['VECTOR_COLLECTION_NAME'];
      await chroma.deleteCollection({ name });
      return res.json({ message: 'VectorDB removed.' });
    };
    
    
    /** *******************************************************
     * CHECK STATUS OF VECTOR DB
     *
     * Express handler reporting whether the vector DB is reachable and
     * how many items the configured collection holds. A missing
     * collection is created on the fly via createCollection().
     *
     * Responses:
     *  - 404 `{ vectorDBrunning }` when the vector DB does not answer
     *  - 200 `{ vectorDBrunning, collection, itemCount }` otherwise
     */
    export const getStatus = async (req, res, next) => {
      const vectorDBrunning = await isVectorDbAvailable();
      if (!vectorDBrunning) {
        // nothing more to report without a running DB
        return res.status(404).json({ vectorDBrunning });
      }

      // use the existing collection, or create it when it is missing
      const collection = (await isCollectionAvailable()) || (await createCollection());

      // number of items currently stored in the collection
      const itemCount = await collection.count();

      return res.json({ vectorDBrunning, collection, itemCount });
    };
    
    /** *******************************************************
     * CREATE EMBEDDINGS
     *
     * Express handler that embeds everything found by directoryLoader()
     * into the configured collection. If the embedding model named by
     * RAG_MODEL_NAME is not reported by aiFilterModelsByName(), it is
     * pulled through Ollama first.
     *
     * Responses:
     *  - 500 `{ error }` when the collection is not available
     *  - 200 `{ message }` once the documents have been embedded
     */
    export const createEmbeddings = async (req, res) => {
      // without a collection there is nowhere to store embeddings
      if (!(await isCollectionAvailable())) {
        return res.status(500).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
      }

      // make sure the embedding model is installed
      const installedModels = await aiFilterModelsByName(process.env['RAG_MODEL_NAME']);
      if (!installedModels.length) {
        console.info('Embedding Model not found. Installing ...');
        await ollama.pull({ model: process.env['RAG_MODEL_NAME'] });
      }

      // load the RAG files, then chunk + embed + store them
      const ragDocs = await directoryLoader();
      await embedder(ragDocs);

      return res.json({ message: 'Embeddings created.' });
    };
    
    /** *******************************************************
     * UPDATE EMBEDDINGS
     *
     * Express handler that synchronises the collection with the files
     * reported by getCurrentRAGFiles():
     *  - files without any embedding are embedded,
     *  - files modified after their embedding timestamp are deleted
     *    from the collection and embedded again,
     *  - embeddings whose source file no longer exists are deleted, but
     *    only when RAG_DELETE_EMBEDDINGS is the string 'true'.
     *
     * Responses:
     *  - 500 `{ error }` when the collection is not available
     *  - 200 `{ message, updated, added, deleted, embeddingsBefore,
     *    embeddingsAfter }`; `updated`/`added`/`deleted` count files,
     *    `embeddingsBefore`/`embeddingsAfter` are collection.count() values.
     *    `deleted` is the number of orphaned sources actually removed
     *    (0 when deletion is disabled).
     */
    export const updateEmbeddings = async (req, res, next) => {
      // check if collection is available
      const collection = await isCollectionAvailable();
      if (!collection) {
        return res.status(500).json({ error: `VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.` });
      }

      // #################
      // GET CURRENT STATE
      // #################
      // local files: { [filePath]: mtime as unix timestamp }
      const currentRAGFiles = await getCurrentRAGFiles();

      // stored embeddings: source -> timestamp written by embedder()
      const embeddingsBefore = await collection.count();
      const storedEmbeddings = await collection.get();
      const embeddedAt = new Map();
      for (const metadata of storedEmbeddings.metadatas ?? []) {
        // entries without metadata or source cannot be matched to a file
        if (metadata?.source == null) continue;
        // several chunks share one source; the last one seen wins
        embeddedAt.set(metadata.source, metadata.timestamp);
      }

      // #################
      // DECIDE WHAT TO DO
      // #################
      const files2embed = [];
      const outdatedEmbeddings = [];
      // starts with every embedded source; whatever is still in here after
      // the loop has no matching file any more
      const orphanedSources = new Set(embeddedAt.keys());

      for (const [file, modifiedAt] of Object.entries(currentRAGFiles)) {
        // RAGFile not in Embeddings => insert
        if (!embeddedAt.has(file)) {
          files2embed.push(file);
          continue;
        }
        orphanedSources.delete(file);
        // RAGFile in Embeddings but outdated => update.
        // A missing timestamp compares as "not up to date", so such
        // embeddings are replaced instead of being duplicated.
        const isUpToDate = modifiedAt <= embeddedAt.get(file);
        if (!isUpToDate) {
          files2embed.push(file);
          outdatedEmbeddings.push(file);
        }
        // RAGFile in Embeddings and up to date => nothing to do
      }

      // #################
      // PROCESS DECISIONS
      // #################
      // delete outdated embeddings before their files are embedded again
      if (outdatedEmbeddings.length > 0) {
        await deleteEmbeddingsByFileNames(outdatedEmbeddings);
      }

      // embed new and outdated files
      if (files2embed.length > 0) {
        const docs = await fileLoader(files2embed);
        await embedder(docs);
      }

      // Embedding not in RAGFiles => delete from Embeddings (opt-in)
      const orphanedFiles = [...orphanedSources];
      let deleted = 0;
      if (process.env.RAG_DELETE_EMBEDDINGS === 'true' && orphanedFiles.length > 0) {
        await deleteEmbeddingsByFileNames(orphanedFiles);
        deleted = orphanedFiles.length;
      }

      // #################
      // FINALIZE
      // #################
      const embeddingsAfter = await collection.count();

      return res.json({
        message: 'Embeddings updated.',
        updated: outdatedEmbeddings.length,
        added: files2embed.length - outdatedEmbeddings.length,
        deleted,
        embeddingsBefore,
        embeddingsAfter
      });
    };
    
    
    /** *******************************************************
     ####################### FUNCTIONS #######################
     ******************************************************* */
    
    
    /** *******************************************************
     * READ DIRECTORY RECURSIVELY AND RETURN THE PATHS OF ALL FILES
     * written by copilot
     *
     * Returned paths are `directoryPath` joined with the entry names, so
     * they are absolute whenever `directoryPath` is. Entries that are
     * neither a directory nor a regular file are skipped.
     *
     * @param {string} directoryPath - directory to walk
     * @returns {string[]} paths of all regular files below directoryPath
     */
    function readDirectoryRecursive(directoryPath) {
      return fs
        .readdirSync(directoryPath, { withFileTypes: true })
        .flatMap((dirent) => {
          const entryPath = path.join(directoryPath, dirent.name);
          // descend into sub-directories; their files are flattened into the result
          if (dirent.isDirectory()) {
            return readDirectoryRecursive(entryPath);
          }
          // keep regular files, drop everything else
          return dirent.isFile() ? [entryPath] : [];
        });
    }
    
    /** *******************************************************
     * GET CURRENT RAG FILES
     *
     * Collects every file below `<cwd>/<RAG_FOLDER>` (paths ending in
     * `.gitkeep` excluded) together with its modification time.
     *
     * @returns {Promise<Object<string, number>>} map of file path to
     *   mtime as unix timestamp (whole seconds)
     */
    export const getCurrentRAGFiles = async () => {
      // RAG_FOLDER is resolved relative to the current working directory
      const ragRoot = path.join(process.cwd(), process.env.RAG_FOLDER);

      const currentRAGFiles = {};
      for (const file of readDirectoryRecursive(ragRoot)) {
        // placeholder files are not content
        if (file.endsWith('.gitkeep')) continue;
        // mtime (a Date) -> unix timestamp in whole seconds
        const { mtime } = fs.statSync(file);
        currentRAGFiles[file] = Math.floor(mtime.getTime() / 1000);
      }
      return currentRAGFiles;
    };
    
    
    /** *******************************************************
     * CHECK IF VECTOR DB IS AVAILABLE
     *
     * Sends a heartbeat to the Chroma server. Any failure is treated as
     * "not available"; the error itself is intentionally not propagated.
     *
     * @returns {Promise<boolean>} true when the heartbeat succeeded
     */
    export const isVectorDbAvailable = async () => {
      try {
        // performance marks bracket the heartbeat round trip;
        // the end mark is only set when the heartbeat succeeds
        performance.mark('isVectorDbAvailable:start');
        await chroma.heartbeat();
        performance.mark('isVectorDbAvailable:end');
      } catch {
        return false;
      }
      return true;
    };
    
    /** *******************************************************
     * CHECK IF VECTOR DB COLLECTION IS AVAILABLE
     *
     * @returns {Promise<object|false>} the collection named by
     *   VECTOR_COLLECTION_NAME as returned by chroma.getCollection(),
     *   or false when the vector DB is down or the collection is not
     *   listed
     */
    export const isCollectionAvailable = async () => {
      performance.mark('isCollectionAvailable:start');

      // no DB, no collection
      const dbIsUp = await isVectorDbAvailable();
      if (!dbIsUp) {
        return false;
      }

      // look for the configured name among the existing collections
      const existingNames = (await chroma.listCollections()).map((entry) => entry.name);
      const isListed = existingNames.includes(process.env['VECTOR_COLLECTION_NAME']);
      performance.mark('isCollectionAvailable:end');

      if (!isListed) {
        return false;
      }
      // hand back the collection itself so callers can use it directly
      return await chroma.getCollection({ name: process.env['VECTOR_COLLECTION_NAME'] });
    };
    
    
    /** *******************************************************
     * CREATE VECTOR DB COLLECTION
     *
     * Creates the collection named by VECTOR_COLLECTION_NAME.
     *
     * NOTE(review): the collection is created without the
     * `collectionMetadata` from chromaVSsettings ("hnsw:space": "cosine");
     * confirm whether a collection created here should carry it too.
     *
     * @returns {Promise<object>} the collection returned by chroma.createCollection()
     * @throws rethrows whatever chroma.createCollection() rejected with
     */
    export const createCollection = async () => {
      try {
        return await chroma.createCollection({
          name: process.env['VECTOR_COLLECTION_NAME']
        });
      } catch (error) {
        console.error('Error creating VectorDB collection:', error);
        // This helper is not an Express middleware: there is no `next` in
        // scope, so the error is handed back to the caller instead.
        throw error;
      }
    };
    
    
    /** *******************************************************
     * LOAD WHOLE FOLDER RECURSIVELY
     *
     * Loads the documents below `<cwd>/<RAG_FOLDER>` through a
     * DirectoryLoader, with one loader factory per file extension.
     *
     * @returns {Promise<*>} whatever DirectoryLoader#load() resolves to
     */
    export const directoryLoader = async () => {
      const ragRoot = path.join(process.cwd(), process.env.RAG_FOLDER);

      // TODO simply skip not mentioned file types
      // extension -> factory creating the loader for one file of that type
      const loadersByExtension = {
        ".json": (filePath) => new JSONLoader(filePath, "/texts"),
        ".jsonl": (filePath) => new JSONLinesLoader(filePath, "/html"),
        ".txt": (filePath) => new TextLoader(filePath),
        ".csv": (filePath) => new CSVLoader(filePath, "text"),
        ".pdf": (filePath) => new PDFLoader(filePath, "text"),
      };

      return new DirectoryLoader(ragRoot, loadersByExtension).load();
    };
    
    /** *******************************************************
     * LOAD SPECIFIC DOCUMENTS
     *
     * Loads exactly the given files through a MultiFileLoader, with one
     * loader factory per file extension.
     *
     * @param {string[]} [docs=[]] - paths of the files to load
     * @returns {Promise<*>} whatever MultiFileLoader#load() resolves to
     */
    export const fileLoader = async (docs = []) => {
      // extension -> factory creating the loader for one file of that type
      const loadersByExtension = {
        ".json": (filePath) => new JSONLoader(filePath, "/texts"),
        ".jsonl": (filePath) => new JSONLinesLoader(filePath, "/html"),
        ".txt": (filePath) => new TextLoader(filePath),
        ".csv": (filePath) => new CSVLoader(filePath, "text"),
        ".pdf": (filePath) => new PDFLoader(filePath, "text"),
      };

      return new MultiFileLoader(docs, loadersByExtension).load();
    };
    
    
    /** *******************************************************
     * EMBED GIVEN DOCS
     *
     * Splits the documents into chunks, stamps every chunk with the
     * current unix time and stores the chunks through
     * Chroma.fromDocuments() using the module-wide embedder and settings.
     *
     * @param {Array} docs - documents as produced by the loaders
     * @returns {Promise<*>} whatever Chroma.fromDocuments() resolves to
     */
    export const embedder = async (docs) => {
      // chunk docs with the splitter's default settings
      const chunks = await new RecursiveCharacterTextSplitter().splitDocuments(docs);

      // remember when each chunk was embedded (unix timestamp, whole seconds)
      for (const { metadata } of chunks) {
        metadata.timestamp = Math.floor(Date.now() / 1000);
      }

      // store into vector db
      return Chroma.fromDocuments(chunks, embeddings, chromaVSsettings);
    };
    
    
    /** *******************************************************
     * DELETE EMBEDDINGS BY FILE NAME
     *
     * Removes every embedding whose `source` metadata equals one of the
     * given file names. Deletions run one after another.
     *
     * @param {string[]} [fileNames=[]] - `source` values to delete
     * @returns {Promise<true>} resolves to true once all deletions are done
     * @throws {Error} when the configured collection is not available
     */
    export const deleteEmbeddingsByFileNames = async (fileNames = []) => {
      // check if collection is available
      const collection = await isCollectionAvailable();
      if (!collection) {
        // This is a plain helper without access to an Express response, so
        // the problem is reported to the caller as an error.
        throw new Error(`VectorDB collection ${process.env['VECTOR_COLLECTION_NAME']} not found.`);
      }

      // delete from embeddings
      for (const fileName of fileNames) {
        await collection.delete({ where: { source: fileName } });
      }
      return true;
    };