From fcbbe8c759e9b92a81f2b146e04af750af67b4e8 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Wed, 22 Feb 2023 14:47:22 -0600
Subject: [PATCH] Read content plugin configs from Khoj config YAML

Configure external text content plugins via the Khoj YAML
Reuse existing TextContentConfig definition for external text content plugins
---
 src/khoj/utils/rawconfig.py |  3 ++-
 tests/data/config.yml       | 11 +++++++++++
 tests/test_cli.py           | 15 ++++++++++++++-
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index 4fbe6543..389e80f6 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -1,7 +1,7 @@
 # System Packages
 import json
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Dict, Optional
 
 # External Packages
 from pydantic import BaseModel, validator
@@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
     image: Optional[ImageContentConfig]
     music: Optional[TextContentConfig]
     markdown: Optional[TextContentConfig]
+    plugins: Optional[Dict[str, TextContentConfig]]
 
 
 class TextSearchConfig(ConfigBase):
diff --git a/tests/data/config.yml b/tests/data/config.yml
index 41603972..6d3aa35b 100644
--- a/tests/data/config.yml
+++ b/tests/data/config.yml
@@ -6,6 +6,17 @@ content-type:
     embeddings-file: ".note_embeddings.pt"
     index-header-entries: true
 
+  plugins:
+    content_plugin_1:
+      input-files: [ "content_plugin_1_new.jsonl.gz" ]
+      compressed-jsonl: "content_plugin_1.jsonl.gz"
+      embeddings-file: "content_plugin_1_embeddings.pt"
+
+    content_plugin_2:
+      input-filter: [ "*2_new.jsonl.gz" ]
+      compressed-jsonl: "content_plugin_2.jsonl.gz"
+      embeddings-file: "content_plugin_2_embeddings.pt"
+
 search-type:
   asymmetric:
     encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b7e18460..98b2353c 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -43,8 +43,21 @@ def test_cli_config_from_file():
     assert actual_args.no_gui == True
     assert actual_args.regenerate == True
     assert actual_args.config is not None
+    assert actual_args.verbose == 3
+
+    # Ensure content config is loaded from file
     assert actual_args.config.content_type.org.input_files == [
         Path("~/first_from_config.org"),
         Path("~/second_from_config.org"),
     ]
-    assert actual_args.verbose == 3
+    assert len(actual_args.config.content_type.plugins.keys()) == 2
+    assert actual_args.config.content_type.plugins["content_plugin_1"].input_files == [
+        Path("content_plugin_1_new.jsonl.gz")
+    ]
+    assert actual_args.config.content_type.plugins["content_plugin_2"].input_filter == ["*2_new.jsonl.gz"]
+    assert actual_args.config.content_type.plugins["content_plugin_1"].compressed_jsonl == Path(
+        "content_plugin_1.jsonl.gz"
+    )
+    assert actual_args.config.content_type.plugins["content_plugin_2"].embeddings_file == Path(
+        "content_plugin_2_embeddings.pt"
+    )