Only index files returned by input-filter globs in fs_syncer

Prevent directories with .org, .pdf etc. suffixes that match an
`input-filter' glob from being evaluated as files.

Explicitly filter the results of input-filter globs to index only files,
not directories, for each text type

Add test to prevent regression

Closes #448
This commit is contained in:
Debanjum Singh Solanky
2023-10-17 22:59:10 -07:00
parent 51363d280d
commit e3cd8b4150
2 changed files with 21 additions and 0 deletions

View File

@@ -1,5 +1,6 @@
import logging
import glob
import os
from typing import Optional
from bs4 import BeautifulSoup
@@ -53,6 +54,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
filtered_file
for jsonl_file_filter in input_filter
for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
@@ -102,6 +104,7 @@ def get_org_files(config: TextContentConfig):
filtered_file
for org_file_filter in org_file_filter
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_org_files = sorted(absolute_org_files | filtered_org_files)
@@ -146,6 +149,7 @@ def get_markdown_files(config: TextContentConfig):
filtered_file
for markdown_file_filter in markdown_file_filter
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
@@ -194,6 +198,7 @@ def get_pdf_files(config: TextContentConfig):
filtered_file
for pdf_file_filter in pdf_file_filter
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)