mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 05:39:12 +00:00
Only index files returned by input-filter globs in fs_syncer
Prevent directories with .org, .pdf, etc. suffixes that match `input-filter` from being evaluated as files. Explicitly filter the results of the input-filter globs so that only files, not directories, are indexed for each text type. Add a test to prevent regression. Closes #448.
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import glob
|
import glob
|
||||||
|
import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
@@ -53,6 +54,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||||||
filtered_file
|
filtered_file
|
||||||
for jsonl_file_filter in input_filter
|
for jsonl_file_filter in input_filter
|
||||||
for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
|
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
|
||||||
@@ -102,6 +104,7 @@ def get_org_files(config: TextContentConfig):
|
|||||||
filtered_file
|
filtered_file
|
||||||
for org_file_filter in org_file_filter
|
for org_file_filter in org_file_filter
|
||||||
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_org_files = sorted(absolute_org_files | filtered_org_files)
|
all_org_files = sorted(absolute_org_files | filtered_org_files)
|
||||||
@@ -146,6 +149,7 @@ def get_markdown_files(config: TextContentConfig):
|
|||||||
filtered_file
|
filtered_file
|
||||||
for markdown_file_filter in markdown_file_filter
|
for markdown_file_filter in markdown_file_filter
|
||||||
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
|
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
|
||||||
@@ -194,6 +198,7 @@ def get_pdf_files(config: TextContentConfig):
|
|||||||
filtered_file
|
filtered_file
|
||||||
for pdf_file_filter in pdf_file_filter
|
for pdf_file_filter in pdf_file_filter
|
||||||
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
|
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
|
||||||
|
|||||||
@@ -31,6 +31,22 @@ def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_n
|
|||||||
get_org_files(org_config_with_only_new_file)
|
get_org_files(org_config_with_only_new_file)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path):
|
||||||
|
# Arrange
|
||||||
|
orgfile = tmp_path / "directory.org" / "file.org"
|
||||||
|
orgfile.parent.mkdir()
|
||||||
|
with open(orgfile, "w") as f:
|
||||||
|
f.write("* Heading\n- List item\n")
|
||||||
|
org_content_config = TextContentConfig(
|
||||||
|
input_filter=[f"{tmp_path}/**/*"], compressed_jsonl="test.jsonl", embeddings_file="test.pt"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# should not raise IsADirectoryError and return orgfile
|
||||||
|
assert get_org_files(org_content_config) == {f"{orgfile}": "* Heading\n- List item\n"}
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_text_search_setup_with_empty_file_raises_error(
|
def test_text_search_setup_with_empty_file_raises_error(
|
||||||
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
|
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
|
||||||
|
|||||||
Reference in New Issue
Block a user