Support multiple input-filters to configure content to index via khoj.yml

- Update existing code and tests to process input-filters as a list
  instead of a str
- Test that the `text_to_jsonl' get-files methods work with a combination
  of `input-files' and `input-filters'

Resolves #84
This commit is contained in:
Debanjum Singh Solanky
2022-09-12 10:39:39 +03:00
parent 940c8fac8c
commit a701ad08b9
11 changed files with 138 additions and 37 deletions

View File

@@ -63,15 +63,19 @@ def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
return entries_with_ids
def get_beancount_files(beancount_files=None, beancount_file_filter=None):
def get_beancount_files(beancount_files=None, beancount_file_filters=None):
"Get Beancount files to process"
absolute_beancount_files, filtered_beancount_files = set(), set()
if beancount_files:
absolute_beancount_files = {get_absolute_path(beancount_file)
for beancount_file
in beancount_files}
if beancount_file_filter:
filtered_beancount_files = set(glob.glob(get_absolute_path(beancount_file_filter)))
if beancount_file_filters:
filtered_beancount_files = {
filtered_file
for beancount_file_filter in beancount_file_filters
for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
}
all_beancount_files = absolute_beancount_files | filtered_beancount_files

View File

@@ -63,13 +63,17 @@ def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
return entries_with_ids
def get_markdown_files(markdown_files=None, markdown_file_filter=None):
def get_markdown_files(markdown_files=None, markdown_file_filters=None):
"Get Markdown files to process"
absolute_markdown_files, filtered_markdown_files = set(), set()
if markdown_files:
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
if markdown_file_filter:
filtered_markdown_files = set(glob.glob(get_absolute_path(markdown_file_filter)))
if markdown_file_filters:
filtered_markdown_files = {
filtered_file
for markdown_file_filter in markdown_file_filters
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
}
all_markdown_files = absolute_markdown_files | filtered_markdown_files

View File

@@ -68,15 +68,19 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None):
return entries_with_ids
def get_org_files(org_files=None, org_file_filter=None):
def get_org_files(org_files=None, org_file_filters=None):
"Get Org files to process"
absolute_org_files, filtered_org_files = set(), set()
if org_files:
absolute_org_files = {get_absolute_path(org_file)
for org_file
in org_files}
if org_file_filter:
filtered_org_files = set(glob.glob(get_absolute_path(org_file_filter)))
if org_file_filters:
filtered_org_files = {
filtered_file
for org_file_filter in org_file_filters
for filtered_file in glob.glob(get_absolute_path(org_file_filter))
}
all_org_files = absolute_org_files | filtered_org_files