Read text files as UTF-8 instead of the default OS locale encoding

On Windows, the default locale encoding isn't UTF-8. Khoj had regressed to
reading files in the OS-specified locale encoding, e.g. cp1252, cp949, etc.

It now explicitly uses UTF-8 encoding to read text files for indexing.

Resolves #495, resolves #472
This commit is contained in:
Debanjum Singh Solanky
2023-10-17 21:31:15 -07:00
parent 3d4576ae38
commit d9d133dfb9
3 changed files with 14 additions and 14 deletions

View File

@@ -72,7 +72,7 @@ async def update(
raise HTTPException(status_code=401, detail="Invalid API Key")
state.config_lock.acquire()
try:
logger.info(f"📬 Updating content index via API call by {client}")
logger.info(f"📬 Updating content index via API call by {client} client")
org_files: Dict[str, str] = {}
markdown_files: Dict[str, str] = {}
pdf_files: Dict[str, str] = {}
@@ -139,7 +139,8 @@ async def update(
except Exception as e:
logger.error(
f"🚨 Failed to {force} update {t} content index triggered via API call by {client}: {e}", exc_info=True
f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}",
exc_info=True,
)
finally:
state.config_lock.release()
@@ -154,7 +155,7 @@ async def update(
host=host,
)
logger.info(f"📪 Content index updated via API call by {client}")
logger.info(f"📪 Content index updated via API call by {client} client")
return Response(content="OK", status_code=200)

View File

@@ -1,6 +1,5 @@
import logging
import glob
import base64
from typing import Optional
from bs4 import BeautifulSoup
@@ -69,7 +68,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
filename_to_content_map = {}
for file in all_target_files:
with open(file, "r") as f:
with open(file, "r", encoding="utf8") as f:
try:
plaintext_content = f.read()
if file.endswith(("html", "htm", "xml")):
@@ -115,7 +114,7 @@ def get_org_files(config: TextContentConfig):
filename_to_content_map = {}
for file in all_org_files:
with open(file, "r") as f:
with open(file, "r", encoding="utf8") as f:
try:
filename_to_content_map[file] = f.read()
except Exception as e:
@@ -137,7 +136,7 @@ def get_markdown_files(config: TextContentConfig):
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
return {}
"Get Markdown files to process"
# Get markdown files to process
absolute_markdown_files, filtered_markdown_files = set(), set()
if markdown_files:
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
@@ -164,7 +163,7 @@ def get_markdown_files(config: TextContentConfig):
filename_to_content_map = {}
for file in all_markdown_files:
with open(file, "r") as f:
with open(file, "r", encoding="utf8") as f:
try:
filename_to_content_map[file] = f.read()
except Exception as e:

View File

@@ -1,26 +1,25 @@
# System Packages
import logging
import locale
from pathlib import Path
import os
# External Packages
import pytest
from khoj.utils.config import SearchModels
# Internal Packages
from khoj.utils.state import content_index, search_models
from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.utils.config import SearchModels
from khoj.utils.fs_syncer import get_org_files
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
# Test
# ----------------------------------------------------------------------------------------------------
def test_text_search_setup_with_missing_file_raises_error(
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
):
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
# Arrange
# Ensure file mentioned in org.input-files is missing
single_new_file = Path(org_config_with_only_new_file.input_files[0])
@@ -29,7 +28,7 @@ def test_text_search_setup_with_missing_file_raises_error(
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(FileNotFoundError):
data = get_org_files(org_config_with_only_new_file)
get_org_files(org_config_with_only_new_file)
# ----------------------------------------------------------------------------------------------------
@@ -48,6 +47,7 @@ def test_text_search_setup_with_empty_file_raises_error(
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
# Arrange
data = get_org_files(content_config.org)
# Act
# Regenerate notes embeddings during asymmetric setup
notes_model = text_search.setup(