mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 05:39:12 +00:00
Remove unused dump_jsonl method
The entries index is stored in gzipped jsonl files for each content type
This commit is contained in:
@@ -13,7 +13,7 @@ from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
|||||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
|
||||||
@@ -97,10 +97,7 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if self.config.compressed_jsonl.suffix == ".gz":
|
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
|
||||||
elif self.config.compressed_jsonl.suffix == ".jsonl":
|
|
||||||
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
|
|
||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from typing import List
|
|||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.helpers import get_absolute_path, timer
|
from khoj.utils.helpers import get_absolute_path, timer
|
||||||
from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import load_jsonl, compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
|
||||||
@@ -48,10 +48,7 @@ class JsonlToJsonl(TextToJsonl):
|
|||||||
jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries)
|
jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if output_file.suffix == ".gz":
|
compress_jsonl_data(jsonl_data, output_file)
|
||||||
compress_jsonl_data(jsonl_data, output_file)
|
|
||||||
elif output_file.suffix == ".jsonl":
|
|
||||||
dump_jsonl(jsonl_data, output_file)
|
|
||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from typing import List
|
|||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||||
from khoj.utils.constants import empty_escape_sequences
|
from khoj.utils.constants import empty_escape_sequences
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry, TextContentConfig
|
from khoj.utils.rawconfig import Entry, TextContentConfig
|
||||||
|
|
||||||
|
|
||||||
@@ -61,10 +61,7 @@ class MarkdownToJsonl(TextToJsonl):
|
|||||||
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if output_file.suffix == ".gz":
|
compress_jsonl_data(jsonl_data, output_file)
|
||||||
compress_jsonl_data(jsonl_data, output_file)
|
|
||||||
elif output_file.suffix == ".jsonl":
|
|
||||||
dump_jsonl(jsonl_data, output_file)
|
|
||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import requests
|
|||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
@@ -250,9 +250,6 @@ class NotionToJsonl(TextToJsonl):
|
|||||||
jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)
|
jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if self.config.compressed_jsonl.suffix == ".gz":
|
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
|
||||||
elif self.config.compressed_jsonl.suffix == ".jsonl":
|
|
||||||
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
|
|
||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from typing import Iterable, List
|
|||||||
from khoj.processor.org_mode import orgnode
|
from khoj.processor.org_mode import orgnode
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry, TextContentConfig
|
from khoj.utils.rawconfig import Entry, TextContentConfig
|
||||||
from khoj.utils import state
|
from khoj.utils import state
|
||||||
|
|
||||||
@@ -62,10 +62,7 @@ class OrgToJsonl(TextToJsonl):
|
|||||||
jsonl_data = self.convert_org_entries_to_jsonl(entries)
|
jsonl_data = self.convert_org_entries_to_jsonl(entries)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if output_file.suffix == ".gz":
|
compress_jsonl_data(jsonl_data, output_file)
|
||||||
compress_jsonl_data(jsonl_data, output_file)
|
|
||||||
elif output_file.suffix == ".jsonl":
|
|
||||||
dump_jsonl(jsonl_data, output_file)
|
|
||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from langchain.document_loaders import PyPDFLoader
|
|||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
|
||||||
@@ -55,10 +55,7 @@ class PdfToJsonl(TextToJsonl):
|
|||||||
jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)
|
jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if output_file.suffix == ".gz":
|
compress_jsonl_data(jsonl_data, output_file)
|
||||||
compress_jsonl_data(jsonl_data, output_file)
|
|
||||||
elif output_file.suffix == ".jsonl":
|
|
||||||
dump_jsonl(jsonl_data, output_file)
|
|
||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def load_jsonl(input_path):
|
|||||||
# Open JSONL file
|
# Open JSONL file
|
||||||
if input_path.suffix == ".gz":
|
if input_path.suffix == ".gz":
|
||||||
jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8")
|
jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8")
|
||||||
elif input_path.suffix == ".jsonl":
|
else:
|
||||||
jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8")
|
jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8")
|
||||||
|
|
||||||
# Read JSONL file
|
# Read JSONL file
|
||||||
@@ -36,17 +36,6 @@ def load_jsonl(input_path):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def dump_jsonl(jsonl_data, output_path):
|
|
||||||
"Write List of JSON objects to JSON line file"
|
|
||||||
# Create output directory, if it doesn't exist
|
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
with open(output_path, "w", encoding="utf-8") as f:
|
|
||||||
f.write(jsonl_data)
|
|
||||||
|
|
||||||
logger.debug(f"Wrote jsonl data to {output_path}")
|
|
||||||
|
|
||||||
|
|
||||||
def compress_jsonl_data(jsonl_data, output_path):
|
def compress_jsonl_data(jsonl_data, output_path):
|
||||||
# Create output directory, if it doesn't exist
|
# Create output directory, if it doesn't exist
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
|||||||
content_config.org = TextContentConfig(
|
content_config.org = TextContentConfig(
|
||||||
input_files=None,
|
input_files=None,
|
||||||
input_filter=["tests/data/org/*.org"],
|
input_filter=["tests/data/org/*.org"],
|
||||||
compressed_jsonl=content_dir.joinpath("notes.jsonl"),
|
compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
|
||||||
embeddings_file=content_dir.joinpath("note_embeddings.pt"),
|
embeddings_file=content_dir.joinpath("note_embeddings.pt"),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -101,7 +101,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
|||||||
|
|
||||||
content_config.plugins = {
|
content_config.plugins = {
|
||||||
"plugin1": TextContentConfig(
|
"plugin1": TextContentConfig(
|
||||||
input_files=[content_dir.joinpath("notes.jsonl")],
|
input_files=[content_dir.joinpath("notes.jsonl.gz")],
|
||||||
input_filter=None,
|
input_filter=None,
|
||||||
compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
|
compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
|
||||||
embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
|
embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
|
||||||
@@ -142,7 +142,7 @@ def md_content_config(tmp_path_factory):
|
|||||||
content_config.markdown = TextContentConfig(
|
content_config.markdown = TextContentConfig(
|
||||||
input_files=None,
|
input_files=None,
|
||||||
input_filter=["tests/data/markdown/*.markdown"],
|
input_filter=["tests/data/markdown/*.markdown"],
|
||||||
compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
|
compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"),
|
||||||
embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
|
embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user