mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Remove unused Entry-to-JSONL converters from the text-to-entries classes and their tests
This was used earlier, when the index was a plaintext JSONL file. Now that documents are indexed in a DB, this function is no longer required. Simplify the org, md, pdf, and plaintext to-entries tests by removing the entry-to-JSONL conversion step.
This commit is contained in:
@@ -123,8 +123,3 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_markdown_maps_to_jsonl(entries: List[Entry]):
|
|
||||||
"Convert each Markdown entry to JSON and collate as JSONL"
|
|
||||||
return "".join([f"{entry.to_json()}\n" for entry in entries])
|
|
||||||
|
|||||||
@@ -146,8 +146,3 @@ class OrgToEntries(TextToEntries):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_org_entries_to_jsonl(entries: Iterable[Entry]) -> str:
|
|
||||||
"Convert each Org-Mode entry to JSON and collate as JSONL"
|
|
||||||
return "".join([f"{entry_dict.to_json()}\n" for entry_dict in entries])
|
|
||||||
|
|||||||
@@ -106,8 +106,3 @@ class PdfToEntries(TextToEntries):
|
|||||||
logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")
|
logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_pdf_maps_to_jsonl(entries: List[Entry]):
|
|
||||||
"Convert each PDF entry to JSON and collate as JSONL"
|
|
||||||
return "".join([f"{entry.to_json()}\n" for entry in entries])
|
|
||||||
|
|||||||
@@ -87,8 +87,3 @@ class PlaintextToEntries(TextToEntries):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_entries_to_jsonl(entries: List[Entry]):
|
|
||||||
"Convert each entry to JSON and collate as JSONL"
|
|
||||||
return "".join([f"{entry.to_json()}\n" for entry in entries])
|
|
||||||
|
|||||||
@@ -244,11 +244,6 @@ class TextToEntries(ABC):
|
|||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_text_maps_to_jsonl(entries: List[Entry]) -> str:
|
|
||||||
# Convert each entry to JSON and write to JSONL file
|
|
||||||
return "".join([f"{entry.to_json()}\n" for entry in entries])
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_field(field: str) -> str:
|
def clean_field(field: str) -> str:
|
||||||
return field.replace("\0", "") if not is_none_or_empty(field) else ""
|
return field.replace("\0", "") if not is_none_or_empty(field) else ""
|
||||||
|
|||||||
@@ -23,18 +23,14 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
|
||||||
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(entries) == 1
|
||||||
# Ensure raw entry with no headings do not get heading prefix prepended
|
# Ensure raw entry with no headings do not get heading prefix prepended
|
||||||
assert not jsonl_data[0]["raw"].startswith("#")
|
assert not entries[0].raw.startswith("#")
|
||||||
# Ensure compiled entry has filename prepended as top level heading
|
# Ensure compiled entry has filename prepended as top level heading
|
||||||
assert expected_heading in jsonl_data[0]["compiled"]
|
assert entries[0].compiled.startswith(expected_heading)
|
||||||
# Ensure compiled entry also includes the file name
|
# Ensure compiled entry also includes the file name
|
||||||
assert str(tmp_path) in jsonl_data[0]["compiled"]
|
assert str(tmp_path) in entries[0].compiled
|
||||||
|
|
||||||
|
|
||||||
def test_single_markdown_entry_to_jsonl(tmp_path):
|
def test_single_markdown_entry_to_jsonl(tmp_path):
|
||||||
@@ -52,12 +48,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
|
|||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
|
||||||
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(entries) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
||||||
@@ -79,12 +71,8 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
|||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
|
||||||
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 2
|
assert len(entries) == 2
|
||||||
# Ensure entry compiled strings include the markdown files they originate from
|
# Ensure entry compiled strings include the markdown files they originate from
|
||||||
assert all([tmp_path.stem in entry.compiled for entry in entries])
|
assert all([tmp_path.stem in entry.compiled for entry in entries])
|
||||||
|
|
||||||
|
|||||||
@@ -26,18 +26,15 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
|
|||||||
for index_heading_entries in [True, False]:
|
for index_heading_entries in [True, False]:
|
||||||
# Act
|
# Act
|
||||||
# Extract entries into jsonl from specified Org files
|
# Extract entries into jsonl from specified Org files
|
||||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries)
|
||||||
OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries)
|
|
||||||
)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
if index_heading_entries:
|
if index_heading_entries:
|
||||||
# Entry with empty body indexed when index_heading_entries set to True
|
# Entry with empty body indexed when index_heading_entries set to True
|
||||||
assert len(jsonl_data) == 1
|
assert len(entries) == 1
|
||||||
else:
|
else:
|
||||||
# Entry with empty body ignored when index_heading_entries set to False
|
# Entry with empty body ignored when index_heading_entries set to False
|
||||||
assert is_none_or_empty(jsonl_data)
|
assert is_none_or_empty(entries)
|
||||||
|
|
||||||
|
|
||||||
def test_entry_split_when_exceeds_max_words():
|
def test_entry_split_when_exceeds_max_words():
|
||||||
@@ -58,15 +55,12 @@ def test_entry_split_when_exceeds_max_words():
|
|||||||
entries = OrgToEntries.extract_org_entries(org_files=data)
|
entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Split each entry from specified Org files by max words
|
# Split each entry from specified Org files by max words
|
||||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4)
|
||||||
TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4)
|
|
||||||
)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 2
|
assert len(entries) == 2
|
||||||
# Ensure compiled entries split by max_words start with entry heading (for search context)
|
# Ensure compiled entries split by max_words start with entry heading (for search context)
|
||||||
assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data])
|
assert all([entry.compiled.startswith(expected_heading) for entry in entries])
|
||||||
|
|
||||||
|
|
||||||
def test_entry_split_drops_large_words():
|
def test_entry_split_drops_large_words():
|
||||||
@@ -103,11 +97,10 @@ def test_entry_with_body_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(OrgToEntries.extract_org_entries(org_files=data))
|
entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(entries) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_file_with_entry_after_intro_text_to_jsonl(tmp_path):
|
def test_file_with_entry_after_intro_text_to_jsonl(tmp_path):
|
||||||
@@ -127,12 +120,8 @@ Intro text
|
|||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entries = OrgToEntries.extract_org_entries(org_files=data)
|
entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
|
||||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 2
|
assert len(entries) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_file_with_no_headings_to_jsonl(tmp_path):
|
def test_file_with_no_headings_to_jsonl(tmp_path):
|
||||||
@@ -150,12 +139,8 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entries = OrgToEntries.extract_org_entries(org_files=data)
|
entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
|
||||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(entries) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_get_org_files(tmp_path):
|
def test_get_org_files(tmp_path):
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import json
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
|
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
|
||||||
@@ -17,12 +16,8 @@ def test_single_page_pdf_to_jsonl():
|
|||||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||||
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Pdf Files
|
|
||||||
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(entries) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_multi_page_pdf_to_jsonl():
|
def test_multi_page_pdf_to_jsonl():
|
||||||
@@ -35,12 +30,8 @@ def test_multi_page_pdf_to_jsonl():
|
|||||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||||
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Pdf Files
|
|
||||||
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 6
|
assert len(entries) == 6
|
||||||
|
|
||||||
|
|
||||||
def test_ocr_page_pdf_to_jsonl():
|
def test_ocr_page_pdf_to_jsonl():
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import json
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -31,16 +30,12 @@ def test_plaintext_file(tmp_path):
|
|||||||
for entry in entries:
|
for entry in entries:
|
||||||
entry.file = str(Path(entry.file).absolute())
|
entry.file = str(Path(entry.file).absolute())
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
|
||||||
jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(entries)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(entries) == 1
|
||||||
# Ensure raw entry with no headings do not get heading prefix prepended
|
# Ensure raw entry with no headings do not get heading prefix prepended
|
||||||
assert not jsonl_data[0]["raw"].startswith("#")
|
assert not entries[0].raw.startswith("#")
|
||||||
# Ensure compiled entry has filename prepended as top level heading
|
# Ensure compiled entry has filename prepended as top level heading
|
||||||
assert jsonl_data[0]["compiled"] == f"{filename}\n{raw_entry}"
|
assert entries[0].compiled == f"{filename}\n{raw_entry}"
|
||||||
|
|
||||||
|
|
||||||
def test_get_plaintext_files(tmp_path):
|
def test_get_plaintext_files(tmp_path):
|
||||||
|
|||||||
Reference in New Issue
Block a user