mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Read text files as UTF-8 instead of the default OS locale encoding
On Windows, the default locale encoding isn't UTF-8. Khoj had regressed to reading files in the OS-specified locale encoding, e.g. cp1252, cp949, etc. It now explicitly uses UTF-8 encoding to read text files for indexing. Resolves #495, resolves #472.
This commit is contained in:
@@ -72,7 +72,7 @@ async def update(
|
|||||||
raise HTTPException(status_code=401, detail="Invalid API Key")
|
raise HTTPException(status_code=401, detail="Invalid API Key")
|
||||||
state.config_lock.acquire()
|
state.config_lock.acquire()
|
||||||
try:
|
try:
|
||||||
logger.info(f"📬 Updating content index via API call by {client}")
|
logger.info(f"📬 Updating content index via API call by {client} client")
|
||||||
org_files: Dict[str, str] = {}
|
org_files: Dict[str, str] = {}
|
||||||
markdown_files: Dict[str, str] = {}
|
markdown_files: Dict[str, str] = {}
|
||||||
pdf_files: Dict[str, str] = {}
|
pdf_files: Dict[str, str] = {}
|
||||||
@@ -139,7 +139,8 @@ async def update(
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"🚨 Failed to {force} update {t} content index triggered via API call by {client}: {e}", exc_info=True
|
f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}",
|
||||||
|
exc_info=True,
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
state.config_lock.release()
|
state.config_lock.release()
|
||||||
@@ -154,7 +155,7 @@ async def update(
|
|||||||
host=host,
|
host=host,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"📪 Content index updated via API call by {client}")
|
logger.info(f"📪 Content index updated via API call by {client} client")
|
||||||
return Response(content="OK", status_code=200)
|
return Response(content="OK", status_code=200)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
import glob
|
import glob
|
||||||
import base64
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
@@ -69,7 +68,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||||||
|
|
||||||
filename_to_content_map = {}
|
filename_to_content_map = {}
|
||||||
for file in all_target_files:
|
for file in all_target_files:
|
||||||
with open(file, "r") as f:
|
with open(file, "r", encoding="utf8") as f:
|
||||||
try:
|
try:
|
||||||
plaintext_content = f.read()
|
plaintext_content = f.read()
|
||||||
if file.endswith(("html", "htm", "xml")):
|
if file.endswith(("html", "htm", "xml")):
|
||||||
@@ -115,7 +114,7 @@ def get_org_files(config: TextContentConfig):
|
|||||||
|
|
||||||
filename_to_content_map = {}
|
filename_to_content_map = {}
|
||||||
for file in all_org_files:
|
for file in all_org_files:
|
||||||
with open(file, "r") as f:
|
with open(file, "r", encoding="utf8") as f:
|
||||||
try:
|
try:
|
||||||
filename_to_content_map[file] = f.read()
|
filename_to_content_map[file] = f.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -137,7 +136,7 @@ def get_markdown_files(config: TextContentConfig):
|
|||||||
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
|
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
"Get Markdown files to process"
|
# Get markdown files to process
|
||||||
absolute_markdown_files, filtered_markdown_files = set(), set()
|
absolute_markdown_files, filtered_markdown_files = set(), set()
|
||||||
if markdown_files:
|
if markdown_files:
|
||||||
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
|
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
|
||||||
@@ -164,7 +163,7 @@ def get_markdown_files(config: TextContentConfig):
|
|||||||
|
|
||||||
filename_to_content_map = {}
|
filename_to_content_map = {}
|
||||||
for file in all_markdown_files:
|
for file in all_markdown_files:
|
||||||
with open(file, "r") as f:
|
with open(file, "r", encoding="utf8") as f:
|
||||||
try:
|
try:
|
||||||
filename_to_content_map[file] = f.read()
|
filename_to_content_map[file] = f.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -1,26 +1,25 @@
|
|||||||
# System Packages
|
# System Packages
|
||||||
import logging
|
import logging
|
||||||
|
import locale
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import pytest
|
import pytest
|
||||||
from khoj.utils.config import SearchModels
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils.state import content_index, search_models
|
from khoj.utils.state import content_index, search_models
|
||||||
from khoj.search_type import text_search
|
from khoj.search_type import text_search
|
||||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
|
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||||
|
from khoj.utils.config import SearchModels
|
||||||
from khoj.utils.fs_syncer import get_org_files
|
from khoj.utils.fs_syncer import get_org_files
|
||||||
|
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
|
||||||
|
|
||||||
|
|
||||||
# Test
|
# Test
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_text_search_setup_with_missing_file_raises_error(
|
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
|
||||||
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
# Ensure file mentioned in org.input-files is missing
|
# Ensure file mentioned in org.input-files is missing
|
||||||
single_new_file = Path(org_config_with_only_new_file.input_files[0])
|
single_new_file = Path(org_config_with_only_new_file.input_files[0])
|
||||||
@@ -29,7 +28,7 @@ def test_text_search_setup_with_missing_file_raises_error(
|
|||||||
# Act
|
# Act
|
||||||
# Generate notes embeddings during asymmetric setup
|
# Generate notes embeddings during asymmetric setup
|
||||||
with pytest.raises(FileNotFoundError):
|
with pytest.raises(FileNotFoundError):
|
||||||
data = get_org_files(org_config_with_only_new_file)
|
get_org_files(org_config_with_only_new_file)
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@@ -48,6 +47,7 @@ def test_text_search_setup_with_empty_file_raises_error(
|
|||||||
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
||||||
# Arrange
|
# Arrange
|
||||||
data = get_org_files(content_config.org)
|
data = get_org_files(content_config.org)
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Regenerate notes embeddings during asymmetric setup
|
# Regenerate notes embeddings during asymmetric setup
|
||||||
notes_model = text_search.setup(
|
notes_model = text_search.setup(
|
||||||
|
|||||||
Reference in New Issue
Block a user