From fbb7747dcc2d03a3b137a7d882412281409cee6c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 6 Feb 2023 18:45:57 -0300 Subject: [PATCH 1/2] Read Markdown file as utf8 instead of the default encoding used by OS - Background 1. Obsidian stores markdown notes as utf8[1] 2. By default, the python `open' command uses the OS locale encoding[2] This was causing the `UnicodeDecodeError: codec can't decode byte' error - Fix - Read markdown files as utf8 The Obsidian plugin is the main use-case for markdown files in khoj currently and that stores md files as utf8. Do not assume utf8 for other content types like org-mode, beancount for now. - Fail if error in reading file as utf8, instead of ignoring errors. Would rather have user realize that their files are not going to get indexed correctly. [1]: https://forum.obsidian.md/t/better-handle-md-files-not-stored-in-utf8-format/13524/3 [2]: https://docs.python.org/3/library/functions.html#open --- src/processor/markdown/markdown_to_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 9b326884..822cad0c 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -97,7 +97,7 @@ class MarkdownToJsonl(TextToJsonl): entries = [] entry_to_file_map = [] for markdown_file in markdown_files: - with open(markdown_file) as f: + with open(markdown_file, 'r', encoding='utf8') as f: markdown_content = f.read() markdown_entries_per_file = [] for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): From c11f7b47e4fc4c1a213a3ca2415fc8ae0158e429 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 6 Feb 2023 21:05:34 -0300 Subject: [PATCH 2/2] Update workflow to run backend tests for all supported python versions --- .github/workflows/test.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index 2902a56f..6ac0f873 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,13 +24,20 @@ jobs: test: name: Run Tests runs-on: ubuntu-latest - steps: + strategy: + fail-fast: false + matrix: + python_version: + - '3.8' + - '3.9' + - '3.10' + steps: - uses: actions/checkout@v3 - - name: Set up Python 3.10 + - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: ${{ matrix.python_version }} - name: Install Dependencies run: |