mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 21:29:11 +00:00
Support multiple input-filters to configure content to index via khoj.yml
- Update existing code and tests to process input-filters as a list instead of a str - Test that the `text_to_jsonl` get-files methods work with a combination of `input-files` and `input-filters` Resolves #84
This commit is contained in:
@@ -63,15 +63,19 @@ def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
def get_beancount_files(beancount_files=None, beancount_file_filter=None):
|
||||
def get_beancount_files(beancount_files=None, beancount_file_filters=None):
|
||||
"Get Beancount files to process"
|
||||
absolute_beancount_files, filtered_beancount_files = set(), set()
|
||||
if beancount_files:
|
||||
absolute_beancount_files = {get_absolute_path(beancount_file)
|
||||
for beancount_file
|
||||
in beancount_files}
|
||||
if beancount_file_filter:
|
||||
filtered_beancount_files = set(glob.glob(get_absolute_path(beancount_file_filter)))
|
||||
if beancount_file_filters:
|
||||
filtered_beancount_files = {
|
||||
filtered_file
|
||||
for beancount_file_filter in beancount_file_filters
|
||||
for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
|
||||
}
|
||||
|
||||
all_beancount_files = absolute_beancount_files | filtered_beancount_files
|
||||
|
||||
|
||||
@@ -63,13 +63,17 @@ def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
def get_markdown_files(markdown_files=None, markdown_file_filter=None):
|
||||
def get_markdown_files(markdown_files=None, markdown_file_filters=None):
|
||||
"Get Markdown files to process"
|
||||
absolute_markdown_files, filtered_markdown_files = set(), set()
|
||||
if markdown_files:
|
||||
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
|
||||
if markdown_file_filter:
|
||||
filtered_markdown_files = set(glob.glob(get_absolute_path(markdown_file_filter)))
|
||||
if markdown_file_filters:
|
||||
filtered_markdown_files = {
|
||||
filtered_file
|
||||
for markdown_file_filter in markdown_file_filters
|
||||
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
|
||||
}
|
||||
|
||||
all_markdown_files = absolute_markdown_files | filtered_markdown_files
|
||||
|
||||
|
||||
@@ -68,15 +68,19 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None):
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
def get_org_files(org_files=None, org_file_filter=None):
|
||||
def get_org_files(org_files=None, org_file_filters=None):
|
||||
"Get Org files to process"
|
||||
absolute_org_files, filtered_org_files = set(), set()
|
||||
if org_files:
|
||||
absolute_org_files = {get_absolute_path(org_file)
|
||||
for org_file
|
||||
in org_files}
|
||||
if org_file_filter:
|
||||
filtered_org_files = set(glob.glob(get_absolute_path(org_file_filter)))
|
||||
if org_file_filters:
|
||||
filtered_org_files = {
|
||||
filtered_file
|
||||
for org_file_filter in org_file_filters
|
||||
for filtered_file in glob.glob(get_absolute_path(org_file_filter))
|
||||
}
|
||||
|
||||
all_org_files = absolute_org_files | filtered_org_files
|
||||
|
||||
|
||||
@@ -241,7 +241,11 @@ def setup(config: ImageContentConfig, search_config: ImageSearchConfig, regenera
|
||||
image_directories = [resolve_absolute_path(directory, strict=True) for directory in config.input_directories]
|
||||
absolute_image_files = set(extract_entries(image_directories))
|
||||
if config.input_filter:
|
||||
filtered_image_files = set(glob.glob(get_absolute_path(config.input_filter)))
|
||||
filtered_image_files = {
|
||||
filtered_file
|
||||
for input_filter in config.input_filter
|
||||
for filtered_file in glob.glob(get_absolute_path(input_filter))
|
||||
}
|
||||
|
||||
all_image_files = sorted(list(absolute_image_files | filtered_image_files))
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from typing import List, Optional
|
||||
from pydantic import BaseModel, validator
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import to_snake_case_from_dash
|
||||
from src.utils.helpers import to_snake_case_from_dash, is_none_or_empty
|
||||
|
||||
class ConfigBase(BaseModel):
|
||||
class Config:
|
||||
@@ -15,27 +15,27 @@ class ConfigBase(BaseModel):
|
||||
|
||||
class TextContentConfig(ConfigBase):
|
||||
input_files: Optional[List[Path]]
|
||||
input_filter: Optional[str]
|
||||
input_filter: Optional[List[str]]
|
||||
compressed_jsonl: Path
|
||||
embeddings_file: Path
|
||||
index_heading_entries: Optional[bool] = False
|
||||
|
||||
@validator('input_filter')
|
||||
def input_filter_or_files_required(cls, input_filter, values, **kwargs):
|
||||
if input_filter is None and ('input_files' not in values or values["input_files"] is None):
|
||||
if is_none_or_empty(input_filter) and ('input_files' not in values or values["input_files"] is None):
|
||||
raise ValueError("Either input_filter or input_files required in all content-type.<text_search> section of Khoj config file")
|
||||
return input_filter
|
||||
|
||||
class ImageContentConfig(ConfigBase):
|
||||
input_directories: Optional[List[Path]]
|
||||
input_filter: Optional[str]
|
||||
input_filter: Optional[List[str]]
|
||||
embeddings_file: Path
|
||||
use_xmp_metadata: bool
|
||||
batch_size: int
|
||||
|
||||
@validator('input_filter')
|
||||
def input_filter_or_directories_required(cls, input_filter, values, **kwargs):
|
||||
if input_filter is None and ('input_directories' not in values or values["input_directories"] is None):
|
||||
if is_none_or_empty(input_filter) and ('input_directories' not in values or values["input_directories"] is None):
|
||||
raise ValueError("Either input_filter or input_directories required in all content-type.image section of Khoj config file")
|
||||
return input_filter
|
||||
|
||||
|
||||
Reference in New Issue
Block a user