mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 05:29:12 +00:00
Use new Text Entry class to track text entries in Intermediate Format
- Context
- The app maintains all text content in a standard, intermediate format
- The intermediate format was loaded, passed around as a dictionary
for easier, faster updates to the intermediate format schema initially
- The intermediate format is reasonably stable now, given it's usage
by all 3 text content types currently implemented
- Changes
- Concretize text entries into `Entries' class instead of using dictionaries
- Code is updated to load, pass around entries as `Entries' objects
instead of as dictionaries
- `text_search' and `text_to_jsonl' methods are annotated with
type hints for the new `Entries' type
- Code and Tests referencing entries are updated to use class style
access patterns instead of the previous dictionary access patterns
- Move `mark_entries_for_update' method into `TextToJsonl' base class
- This is a more natural location for the method as it is only
(to be) used by `text_to_jsonl' classes
- Avoid circular reference issues on importing `Entries' class
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
# Standard Packages
|
||||
import json
|
||||
import glob
|
||||
import re
|
||||
import logging
|
||||
@@ -7,9 +6,10 @@ import time
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.text_to_jsonl import TextToJsonl
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils.rawconfig import Entry
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -40,7 +40,7 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
if not previous_entries:
|
||||
entries_with_ids = list(enumerate(current_entries))
|
||||
else:
|
||||
entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
|
||||
entries_with_ids = self.mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
|
||||
end = time.time()
|
||||
logger.debug(f"Identify new or updated entries: {end - start} seconds")
|
||||
|
||||
@@ -110,17 +110,17 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
||||
@staticmethod
|
||||
def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
|
||||
def convert_markdown_entries_to_maps(parsed_entries: list[str], entry_to_file_map) -> list[Entry]:
|
||||
"Convert each Markdown entries into a dictionary"
|
||||
entry_maps = []
|
||||
for entry in entries:
|
||||
entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry]}'})
|
||||
entries = []
|
||||
for parsed_entry in parsed_entries:
|
||||
entries.append(Entry(compiled=parsed_entry, raw=parsed_entry, file=f'{entry_to_file_map[parsed_entry]}'))
|
||||
|
||||
logger.info(f"Converted {len(entries)} markdown entries to dictionaries")
|
||||
logger.info(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||
|
||||
return entry_maps
|
||||
return entries
|
||||
|
||||
@staticmethod
|
||||
def convert_markdown_maps_to_jsonl(entries):
|
||||
"Convert each Markdown entries to JSON and collate as JSONL"
|
||||
return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
|
||||
def convert_markdown_maps_to_jsonl(entries: list[Entry]):
|
||||
"Convert each Markdown entry to JSON and collate as JSONL"
|
||||
return ''.join([f'{entry.to_json()}\n' for entry in entries])
|
||||
|
||||
Reference in New Issue
Block a user