mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 13:22:12 +00:00
Use single func to handle indexing from scratch and incrementally
The previous regenerate mechanism did not deduplicate entries with the same key, so entries looked different between regenerate and update. Having a single func, mark_entries_for_update, to handle both scenarios will avoid this divergence. Update all text_to_jsonl methods to use the above method for generating the index from scratch.
This commit is contained in:
@@ -15,7 +15,6 @@ from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
|||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from khoj.utils import state
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -38,7 +37,7 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
else:
|
else:
|
||||||
return
|
return
|
||||||
|
|
||||||
def process(self, previous_entries=None):
|
def process(self, previous_entries=[]):
|
||||||
current_entries = []
|
current_entries = []
|
||||||
for repo in self.config.repos:
|
for repo in self.config.repos:
|
||||||
current_entries += self.process_repo(repo)
|
current_entries += self.process_repo(repo)
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class JsonlToJsonl(TextToJsonl):
|
class JsonlToJsonl(TextToJsonl):
|
||||||
# Define Functions
|
# Define Functions
|
||||||
def process(self, previous_entries=None):
|
def process(self, previous_entries=[]):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
input_jsonl_files, input_jsonl_filter, output_file = (
|
input_jsonl_files, input_jsonl_filter, output_file = (
|
||||||
self.config.input_files,
|
self.config.input_files,
|
||||||
@@ -38,15 +38,9 @@ class JsonlToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
if not previous_entries:
|
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||||
entries_with_ids = list(enumerate(current_entries))
|
current_entries, previous_entries, key="compiled", logger=logger
|
||||||
else:
|
)
|
||||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
|
||||||
current_entries,
|
|
||||||
previous_entries,
|
|
||||||
key="compiled",
|
|
||||||
logger=logger,
|
|
||||||
)
|
|
||||||
|
|
||||||
with timer("Write entries to JSONL file", logger):
|
with timer("Write entries to JSONL file", logger):
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ class MarkdownToJsonl(TextToJsonl):
|
|||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
# Define Functions
|
# Define Functions
|
||||||
def process(self, previous_entries=None):
|
def process(self, previous_entries=[]):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
markdown_files, markdown_file_filter, output_file = (
|
markdown_files, markdown_file_filter, output_file = (
|
||||||
self.config.input_files,
|
self.config.input_files,
|
||||||
@@ -51,12 +51,9 @@ class MarkdownToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
if not previous_entries:
|
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||||
entries_with_ids = list(enumerate(current_entries))
|
current_entries, previous_entries, key="compiled", logger=logger
|
||||||
else:
|
)
|
||||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
|
||||||
current_entries, previous_entries, key="compiled", logger=logger
|
|
||||||
)
|
|
||||||
|
|
||||||
with timer("Write markdown entries to JSONL file", logger):
|
with timer("Write markdown entries to JSONL file", logger):
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ class NotionToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
self.body_params = {"page_size": 100}
|
self.body_params = {"page_size": 100}
|
||||||
|
|
||||||
def process(self, previous_entries=None):
|
def process(self, previous_entries=[]):
|
||||||
current_entries = []
|
current_entries = []
|
||||||
|
|
||||||
# Get all pages
|
# Get all pages
|
||||||
@@ -240,12 +240,9 @@ class NotionToJsonl(TextToJsonl):
|
|||||||
def update_entries_with_ids(self, current_entries, previous_entries):
|
def update_entries_with_ids(self, current_entries, previous_entries):
|
||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
if not previous_entries:
|
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||||
entries_with_ids = list(enumerate(current_entries))
|
current_entries, previous_entries, key="compiled", logger=logger
|
||||||
else:
|
)
|
||||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
|
||||||
current_entries, previous_entries, key="compiled", logger=logger
|
|
||||||
)
|
|
||||||
|
|
||||||
with timer("Write Notion entries to JSONL file", logger):
|
with timer("Write Notion entries to JSONL file", logger):
|
||||||
# Process Each Entry from all Notion entries
|
# Process Each Entry from all Notion entries
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class OrgToJsonl(TextToJsonl):
|
|||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
# Define Functions
|
# Define Functions
|
||||||
def process(self, previous_entries: List[Entry] = None):
|
def process(self, previous_entries: List[Entry] = []):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
org_files, org_file_filter, output_file = (
|
org_files, org_file_filter, output_file = (
|
||||||
self.config.input_files,
|
self.config.input_files,
|
||||||
@@ -51,9 +51,7 @@ class OrgToJsonl(TextToJsonl):
|
|||||||
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||||
|
|
||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
if not previous_entries:
|
with timer("Identify new or updated entries", logger):
|
||||||
entries_with_ids = list(enumerate(current_entries))
|
|
||||||
else:
|
|
||||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||||
current_entries, previous_entries, key="compiled", logger=logger
|
current_entries, previous_entries, key="compiled", logger=logger
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class PdfToJsonl(TextToJsonl):
|
class PdfToJsonl(TextToJsonl):
|
||||||
# Define Functions
|
# Define Functions
|
||||||
def process(self, previous_entries=None):
|
def process(self, previous_entries=[]):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
pdf_files, pdf_file_filter, output_file = (
|
pdf_files, pdf_file_filter, output_file = (
|
||||||
self.config.input_files,
|
self.config.input_files,
|
||||||
@@ -45,12 +45,9 @@ class PdfToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
if not previous_entries:
|
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||||
entries_with_ids = list(enumerate(current_entries))
|
current_entries, previous_entries, key="compiled", logger=logger
|
||||||
else:
|
)
|
||||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
|
||||||
current_entries, previous_entries, key="compiled", logger=logger
|
|
||||||
)
|
|
||||||
|
|
||||||
with timer("Write PDF entries to JSONL file", logger):
|
with timer("Write PDF entries to JSONL file", logger):
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ class TextToJsonl(ABC):
|
|||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def process(self, previous_entries: List[Entry] = None) -> List[Tuple[int, Entry]]:
|
def process(self, previous_entries: List[Entry] = []) -> List[Tuple[int, Entry]]:
|
||||||
...
|
...
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@@ -176,10 +176,10 @@ def setup(
|
|||||||
) -> TextContent:
|
) -> TextContent:
|
||||||
# Map notes in text files to (compressed) JSONL formatted file
|
# Map notes in text files to (compressed) JSONL formatted file
|
||||||
config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
|
config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
|
||||||
previous_entries = (
|
previous_entries = []
|
||||||
extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None
|
if config.compressed_jsonl.exists() and not regenerate:
|
||||||
)
|
previous_entries = extract_entries(config.compressed_jsonl)
|
||||||
entries_with_indices = text_to_jsonl(config).process(previous_entries or [])
|
entries_with_indices = text_to_jsonl(config).process(previous_entries)
|
||||||
|
|
||||||
# Extract Updated Entries
|
# Extract Updated Entries
|
||||||
entries = extract_entries(config.compressed_jsonl)
|
entries = extract_entries(config.compressed_jsonl)
|
||||||
|
|||||||
Reference in New Issue
Block a user