From 252735d14f361de9981989dffa6769f4f682002e Mon Sep 17 00:00:00 2001
From: Timm Lehmberg <timm.lehmberg@awhamburg.de>
Date: Wed, 25 Sep 2024 00:30:50 +0200
Subject: [PATCH] Add GraphRAG/fragate/settings.yaml
---
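Notes:

The new settings file follows the stock `graphrag init` template and targets
the OpenAI API directly. GRAPHRAG_API_KEY is resolved from the environment
(GraphRAG's CLI also reads a .env file in the project root), so no key
material lands in the repo. For an Azure OpenAI deployment, the commented
keys in the llm block would be filled in instead; a minimal sketch, where
the instance, API version, and deployment name are placeholders rather than
values from this repo:

    llm:
      api_key: ${GRAPHRAG_API_KEY}
      type: azure_openai_chat
      model: gpt-3.5-turbo
      model_supports_json: true
      api_base: https://<instance>.openai.azure.com
      api_version: 2024-02-15-preview
      deployment_name: <azure_model_deployment_name>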
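As the "##" comments in the file note, each per-task section
(entity_extraction, summarize_descriptions, claim_extraction,
community_reports) accepts local llm / parallelization / async_mode
overrides. A hypothetical override that runs only entity extraction on a
different model, while everything else keeps the global gpt-3.5-turbo
settings (the model name here is illustrative, not part of this patch):

    entity_extraction:
      llm:
        api_key: ${GRAPHRAG_API_KEY}
        type: openai_chat
        model: gpt-4-turbo-preview  # illustrative; any chat model works here
      prompt: "prompts/entity_extraction.txt"
      entity_types: [organization,person,geo,event]
      max_gleanings: 1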
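Chunking note: with size 2048 and overlap 512, the window advances
2048 - 512 = 1536 tokens per chunk, so consecutive chunks share 25% of
their tokens, and group_by_columns: [id] keeps chunks from crossing
document boundaries.
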
GraphRAG/fragate/settings.yaml | 152 +++++++++++++++++++++++++++++++++
1 file changed, 152 insertions(+)
create mode 100644 GraphRAG/fragate/settings.yaml
diff --git a/GraphRAG/fragate/settings.yaml b/GraphRAG/fragate/settings.yaml
new file mode 100644
index 0000000..e2ecf4a
--- /dev/null
+++ b/GraphRAG/fragate/settings.yaml
@@ -0,0 +1,152 @@
+encoding_model: cl100k_base
+skip_workflows: []
+llm:
+ api_key: ${GRAPHRAG_API_KEY}
+ type: openai_chat # or azure_openai_chat
+ model: gpt-3.5-turbo
+ model_supports_json: true # recommended if this is available for your model.
+ # max_tokens: 4000
+ # request_timeout: 180.0
+ # api_base: https://<instance>.openai.azure.com
+ # api_version: 2024-02-15-preview
+ # organization: <organization_id>
+ # deployment_name: <azure_model_deployment_name>
+ # tokens_per_minute: 150_000 # set a leaky bucket throttle
+ # requests_per_minute: 10_000 # set a leaky bucket throttle
+ # max_retries: 10
+ # max_retry_wait: 10.0
+ # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+ # concurrent_requests: 25 # the number of parallel inflight requests that may be made
+ # temperature: 0 # temperature for sampling
+ # top_p: 1 # top-p sampling
+ # n: 1 # Number of completions to generate
+
+parallelization:
+ stagger: 0.3
+ # num_threads: 50 # the number of threads to use for parallel processing
+
+async_mode: threaded # or asyncio
+
+embeddings:
+ ## parallelization: override the global parallelization settings for embeddings
+ async_mode: threaded # or asyncio
+ # target: required # or all
+ llm:
+ api_key: ${GRAPHRAG_API_KEY}
+ type: openai_embedding # or azure_openai_embedding
+ model: text-embedding-3-small
+ # api_base: https://<instance>.openai.azure.com
+ # api_version: 2024-02-15-preview
+ # organization: <organization_id>
+ # deployment_name: <azure_model_deployment_name>
+ # tokens_per_minute: 150_000 # set a leaky bucket throttle
+ # requests_per_minute: 10_000 # set a leaky bucket throttle
+ # max_retries: 10
+ # max_retry_wait: 10.0
+ # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+ # concurrent_requests: 25 # the number of parallel inflight requests that may be made
+ # batch_size: 16 # the number of documents to send in a single request
+ # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
+
+
+chunks:
+ size: 2048
+ overlap: 512
+ group_by_columns: [id] # by default, we don't allow chunks to cross documents
+
+input:
+ type: file # or blob
+ file_type: text # or csv
+ base_dir: "data"
+ file_encoding: utf-8
+ file_pattern: ".*\\.txt$"
+
+cache:
+ type: file # or blob
+ base_dir: "cache"
+ # connection_string: <azure_blob_storage_connection_string>
+ # container_name: <azure_blob_storage_container_name>
+
+storage:
+ type: file # or blob
+ base_dir: "output/${timestamp}/artifacts"
+ # connection_string: <azure_blob_storage_connection_string>
+ # container_name: <azure_blob_storage_container_name>
+
+reporting:
+ type: file # or console, blob
+ base_dir: "output/${timestamp}/reports"
+ # connection_string: <azure_blob_storage_connection_string>
+ # container_name: <azure_blob_storage_container_name>
+
+entity_extraction:
+ ## llm: override the global llm settings for this task
+ ## parallelization: override the global parallelization settings for this task
+ ## async_mode: override the global async_mode settings for this task
+ prompt: "prompts/entity_extraction.txt"
+ entity_types: [organization,person,geo,event]
+ max_gleanings: 1
+
+summarize_descriptions:
+ ## llm: override the global llm settings for this task
+ ## parallelization: override the global parallelization settings for this task
+ ## async_mode: override the global async_mode settings for this task
+ prompt: "prompts/summarize_descriptions.txt"
+ max_length: 500
+
+claim_extraction:
+ ## llm: override the global llm settings for this task
+ ## parallelization: override the global parallelization settings for this task
+ ## async_mode: override the global async_mode settings for this task
+ # enabled: true
+ prompt: "prompts/claim_extraction.txt"
+ description: "Any claims or facts that could be relevant to information discovery."
+ max_gleanings: 1
+
+community_reports:
+ ## llm: override the global llm settings for this task
+ ## parallelization: override the global parallelization settings for this task
+ ## async_mode: override the global async_mode settings for this task
+ prompt: "prompts/community_report.txt"
+ max_length: 2000
+ max_input_length: 8000
+
+cluster_graph:
+ max_cluster_size: 10
+
+embed_graph:
+ enabled: false # if true, will generate node2vec embeddings for nodes
+ # num_walks: 10
+ # walk_length: 40
+ # window_size: 2
+ # iterations: 3
+ # random_seed: 597832
+
+umap:
+ enabled: false # if true, will generate UMAP embeddings for nodes
+
+snapshots:
+ graphml: false
+ raw_entities: false
+ top_level_nodes: false
+
+local_search:
+ # text_unit_prop: 0.5
+ # community_prop: 0.1
+ # conversation_history_max_turns: 5
+ # top_k_mapped_entities: 10
+ # top_k_relationships: 10
+ # llm_temperature: 0 # temperature for sampling
+ # llm_top_p: 1 # top-p sampling
+ # llm_n: 1 # Number of completions to generate
+ # max_tokens: 12000
+
+global_search:
+ # llm_temperature: 0 # temperature for sampling
+ # llm_top_p: 1 # top-p sampling
+ # llm_n: 1 # Number of completions to generate
+ # max_tokens: 12000
+ # data_max_tokens: 12000
+ # map_max_tokens: 1000
+ # reduce_max_tokens: 2000
+ # concurrency: 32
--
GitLab