From 6e70b914c2e8792dc7c8fb44d555b4864dcc691c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 20:42:26 -0700 Subject: [PATCH] Remove unused dump_jsonl method The entries index is stored in gzipped jsonl files for each content type --- src/khoj/processor/github/github_to_jsonl.py | 7 ++----- src/khoj/processor/jsonl/jsonl_to_jsonl.py | 7 ++----- src/khoj/processor/markdown/markdown_to_jsonl.py | 7 ++----- src/khoj/processor/notion/notion_to_jsonl.py | 7 ++----- src/khoj/processor/org_mode/org_to_jsonl.py | 7 ++----- src/khoj/processor/pdf/pdf_to_jsonl.py | 7 ++----- src/khoj/utils/jsonl.py | 13 +------------ tests/conftest.py | 6 +++--- 8 files changed, 16 insertions(+), 45 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index 9dbdc093..91dbd6da 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -13,7 +13,7 @@ from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry @@ -97,10 +97,7 @@ class GithubToJsonl(TextToJsonl): jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) # Compress JSONL formatted Data - if self.config.compressed_jsonl.suffix == ".gz": - compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) - elif self.config.compressed_jsonl.suffix == ".jsonl": - dump_jsonl(jsonl_data, self.config.compressed_jsonl) + compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) return entries_with_ids diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py index c033f522..3c824545 100644 --- 
a/src/khoj/processor/jsonl/jsonl_to_jsonl.py +++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py @@ -7,7 +7,7 @@ from typing import List # Internal Packages from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, timer -from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import load_jsonl, compress_jsonl_data from khoj.utils.rawconfig import Entry @@ -48,10 +48,7 @@ class JsonlToJsonl(TextToJsonl): jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 2da5bd4c..b6acbfbb 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -10,7 +10,7 @@ from typing import List from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer from khoj.utils.constants import empty_escape_sequences -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry, TextContentConfig @@ -61,10 +61,7 @@ class MarkdownToJsonl(TextToJsonl): jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_jsonl.py index d4cd78f3..489f0341 100644 --- 
a/src/khoj/processor/notion/notion_to_jsonl.py +++ b/src/khoj/processor/notion/notion_to_jsonl.py @@ -8,7 +8,7 @@ import requests from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry, NotionContentConfig from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry from enum import Enum @@ -250,9 +250,6 @@ class NotionToJsonl(TextToJsonl): jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries) # Compress JSONL formatted Data - if self.config.compressed_jsonl.suffix == ".gz": - compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) - elif self.config.compressed_jsonl.suffix == ".jsonl": - dump_jsonl(jsonl_data, self.config.compressed_jsonl) + compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) return entries_with_ids diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index b00a6c50..b3bc06fd 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -8,7 +8,7 @@ from typing import Iterable, List from khoj.processor.org_mode import orgnode from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry, TextContentConfig from khoj.utils import state @@ -62,10 +62,7 @@ class OrgToJsonl(TextToJsonl): jsonl_data = self.convert_org_entries_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py 
b/src/khoj/processor/pdf/pdf_to_jsonl.py index e41fd976..f8a20692 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -10,7 +10,7 @@ from langchain.document_loaders import PyPDFLoader # Internal Packages from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry @@ -55,10 +55,7 @@ class PdfToJsonl(TextToJsonl): jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git a/src/khoj/utils/jsonl.py b/src/khoj/utils/jsonl.py index c9576810..ed779e79 100644 --- a/src/khoj/utils/jsonl.py +++ b/src/khoj/utils/jsonl.py @@ -20,7 +20,7 @@ def load_jsonl(input_path): # Open JSONL file if input_path.suffix == ".gz": jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8") - elif input_path.suffix == ".jsonl": + else: jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8") # Read JSONL file @@ -36,17 +36,6 @@ def load_jsonl(input_path): return data -def dump_jsonl(jsonl_data, output_path): - "Write List of JSON objects to JSON line file" - # Create output directory, if it doesn't exist - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as f: - f.write(jsonl_data) - - logger.debug(f"Wrote jsonl data to {output_path}") - - def compress_jsonl_data(jsonl_data, output_path): # Create output directory, if it doesn't exist output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/conftest.py b/tests/conftest.py index a92d33ca..07c5156f 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -90,7 +90,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config: content_config.org = TextContentConfig( input_files=None, input_filter=["tests/data/org/*.org"], - compressed_jsonl=content_dir.joinpath("notes.jsonl"), + compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"), embeddings_file=content_dir.joinpath("note_embeddings.pt"), ) @@ -101,7 +101,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config: content_config.plugins = { "plugin1": TextContentConfig( - input_files=[content_dir.joinpath("notes.jsonl")], + input_files=[content_dir.joinpath("notes.jsonl.gz")], input_filter=None, compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"), embeddings_file=content_dir.joinpath("plugin_embeddings.pt"), @@ -142,7 +142,7 @@ def md_content_config(tmp_path_factory): content_config.markdown = TextContentConfig( input_files=None, input_filter=["tests/data/markdown/*.markdown"], - compressed_jsonl=content_dir.joinpath("markdown.jsonl"), + compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"), embeddings_file=content_dir.joinpath("markdown_embeddings.pt"), )