diff --git a/GraphRAG/fragate/settings.yaml b/GraphRAG/fragate/settings.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e2ecf4a299e36e3885c33e21a125f4d568a7ad85
--- /dev/null
+++ b/GraphRAG/fragate/settings.yaml
@@ -0,0 +1,155 @@
+
+encoding_model: cl100k_base
+skip_workflows: []
+llm:
+  api_key: ${GRAPHRAG_API_KEY}
+  type: openai_chat # or azure_openai_chat
+  model: gpt-3.5-turbo
+  model_supports_json: true # recommended if this is available for your model.
+  # max_tokens: 4000
+  # request_timeout: 180.0
+  # api_base: https://<instance>.openai.azure.com
+  # api_version: 2024-02-15-preview
+  # organization: <organization_id>
+  # deployment_name: <azure_model_deployment_name>
+  # tokens_per_minute: 150_000 # set a leaky bucket throttle
+  # requests_per_minute: 10_000 # set a leaky bucket throttle
+  # max_retries: 10
+  # max_retry_wait: 10.0
+  # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+  # concurrent_requests: 25 # the number of parallel inflight requests that may be made
+  # temperature: 0 # temperature for sampling
+  # top_p: 1 # top-p sampling
+  # n: 1 # Number of completions to generate
+
+parallelization:
+  stagger: 0.3
+  # num_threads: 50 # the number of threads to use for parallel processing
+
+async_mode: threaded # or asyncio
+
+embeddings:
+  ## parallelization: override the global parallelization settings for embeddings
+  async_mode: threaded # or asyncio
+  # target: required # or all
+  llm:
+    api_key: ${GRAPHRAG_API_KEY}
+    type: openai_embedding # or azure_openai_embedding
+    model: text-embedding-3-small
+    # api_base: https://<instance>.openai.azure.com
+    # api_version: 2024-02-15-preview
+    # organization: <organization_id>
+    # deployment_name: <azure_model_deployment_name>
+    # tokens_per_minute: 150_000 # set a leaky bucket throttle
+    # requests_per_minute: 10_000 # set a leaky bucket throttle
+    # max_retries: 10
+    # max_retry_wait: 10.0
+    # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+    # concurrent_requests: 25 # the number of parallel inflight requests that may be made
+    # batch_size: 16 # the number of documents to send in a single request
+    # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
+
+
+
+
+chunks:
+  size: 2048
+  overlap: 512
+  group_by_columns: [id] # by default, we don't allow chunks to cross documents
+
+input:
+  type: file # or blob
+  file_type: text # or csv
+  base_dir: "data"
+  file_encoding: utf-8
+  file_pattern: ".*\\.txt$"
+
+cache:
+  type: file # or blob
+  base_dir: "cache"
+  # connection_string: <azure_blob_storage_connection_string>
+  # container_name: <azure_blob_storage_container_name>
+
+storage:
+  type: file # or blob
+  base_dir: "output/${timestamp}/artifacts"
+  # connection_string: <azure_blob_storage_connection_string>
+  # container_name: <azure_blob_storage_container_name>
+
+reporting:
+  type: file # or console, blob
+  base_dir: "output/${timestamp}/reports"
+  # connection_string: <azure_blob_storage_connection_string>
+  # container_name: <azure_blob_storage_container_name>
+
+entity_extraction:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  prompt: "prompts/entity_extraction.txt"
+  entity_types: [organization,person,geo,event]
+  max_gleanings: 1
+
+summarize_descriptions:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  prompt: "prompts/summarize_descriptions.txt"
+  max_length: 500
+
+claim_extraction:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  # enabled: true
+  prompt: "prompts/claim_extraction.txt"
+  description: "Any claims or facts that could be relevant to information discovery."
+  max_gleanings: 1
+
+community_reports:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  prompt: "prompts/community_report.txt"
+  max_length: 2000
+  max_input_length: 8000
+
+cluster_graph:
+  max_cluster_size: 10
+
+embed_graph:
+  enabled: false # if true, will generate node2vec embeddings for nodes
+  # num_walks: 10
+  # walk_length: 40
+  # window_size: 2
+  # iterations: 3
+  # random_seed: 597832
+
+umap:
+  enabled: false # if true, will generate UMAP embeddings for nodes
+
+snapshots:
+  graphml: false
+  raw_entities: false
+  top_level_nodes: false
+
+local_search:
+  # text_unit_prop: 0.5
+  # community_prop: 0.1
+  # conversation_history_max_turns: 5
+  # top_k_mapped_entities: 10
+  # top_k_relationships: 10
+  # llm_temperature: 0 # temperature for sampling
+  # llm_top_p: 1 # top-p sampling
+  # llm_n: 1 # Number of completions to generate
+  # max_tokens: 12000
+
+global_search:
+  # llm_temperature: 0 # temperature for sampling
+  # llm_top_p: 1 # top-p sampling
+  # llm_n: 1 # Number of completions to generate
+  # max_tokens: 12000
+  # data_max_tokens: 12000
+  # map_max_tokens: 1000
+  # reduce_max_tokens: 2000
+  # concurrency: 32
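
This file is the per-project configuration consumed by the GraphRAG pipeline: `${GRAPHRAG_API_KEY}` is expanded from the environment (or a `.env` file alongside `settings.yaml`), `input.base_dir: "data"` points at the `.txt` corpus, and artifacts land under `output/<timestamp>/artifacts`. Below is a minimal driver sketch, assuming a graphrag 0.x install where indexing and querying are exposed as `python -m graphrag.index` / `python -m graphrag.query`; the placeholder API key and the sample question are illustrative, not part of the repo:

```python
import os
import subprocess

# Root directory containing the settings.yaml added in this diff.
ROOT = "GraphRAG/fragate"

# settings.yaml resolves ${GRAPHRAG_API_KEY} from the environment;
# "sk-..." is a placeholder, not a real key.
env = {**os.environ, "GRAPHRAG_API_KEY": "sk-..."}

# Build the index: reads data/*.txt per the `input` section and writes
# artifacts under output/<timestamp>/artifacts per the `storage` section.
subprocess.run(
    ["python", "-m", "graphrag.index", "--root", ROOT],
    env=env,
    check=True,
)

# Query the finished index: "global" search summarizes over community
# reports, while "local" search works from entity neighborhoods.
subprocess.run(
    ["python", "-m", "graphrag.query",
     "--root", ROOT, "--method", "global",
     "What are the main topics in the corpus?"],
    env=env,
    check=True,
)
```

Note that the config swaps the stock defaults for cheaper/larger settings (`gpt-3.5-turbo` instead of a GPT-4-class chat model, and 2048-token chunks with 512-token overlap), so extraction quality and token costs will differ from the documented GraphRAG baselines.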