From e3cd8b415061c5167861c7ca8435b4eb521a712a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 22:59:10 -0700 Subject: [PATCH] Only index files returned by input-filter globs in fs_syncer Prevent .org, .pdf etc. suffixed directories under `input-filter' from being evaluated as files. Explicitly filter the results of input-filter globs to only index files, not directories, for each text type Add test to prevent regression Closes #448 --- src/khoj/utils/fs_syncer.py | 5 +++++ tests/test_text_search.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 1745b760..12c4e5dc 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -1,5 +1,6 @@ import logging import glob +import os from typing import Optional from bs4 import BeautifulSoup @@ -53,6 +54,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: filtered_file for jsonl_file_filter in input_filter for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files) @@ -102,6 +104,7 @@ def get_org_files(config: TextContentConfig): filtered_file for org_file_filter in org_file_filter for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_org_files = sorted(absolute_org_files | filtered_org_files) @@ -146,6 +149,7 @@ def get_markdown_files(config: TextContentConfig): filtered_file for markdown_file_filter in markdown_file_filter for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) @@ -194,6 +198,7 @@ def get_pdf_files(config: TextContentConfig): filtered_file for pdf_file_filter in pdf_file_filter for filtered_file in 
glob.glob(get_absolute_path(pdf_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 60246a61..179718fa 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -31,6 +31,22 @@ def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_n get_org_files(org_config_with_only_new_file) +# ---------------------------------------------------------------------------------------------------- +def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path): + # Arrange + orgfile = tmp_path / "directory.org" / "file.org" + orgfile.parent.mkdir() + with open(orgfile, "w") as f: + f.write("* Heading\n- List item\n") + org_content_config = TextContentConfig( + input_filter=[f"{tmp_path}/**/*"], compressed_jsonl="test.jsonl", embeddings_file="test.pt" + ) + + # Act + # should not raise IsADirectoryError and return orgfile + assert get_org_files(org_content_config) == {f"{orgfile}": "* Heading\n- List item\n"} + + # ---------------------------------------------------------------------------------------------------- def test_text_search_setup_with_empty_file_raises_error( org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig