mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 05:29:12 +00:00
Create and use a context manager to time code
Use the timer context manager in all places where code was being timed - Benefits - Deduplicate timing code scattered across codebase. - Provides single place to manage perf timing code - Use consistent timing log patterns
This commit is contained in:
@@ -4,6 +4,7 @@ import hashlib
|
||||
import time
|
||||
import logging
|
||||
from typing import Callable
|
||||
from src.utils.helpers import timer
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.rawconfig import Entry, TextContentConfig
|
||||
@@ -40,35 +41,31 @@ class TextToJsonl(ABC):
|
||||
|
||||
def mark_entries_for_update(self, current_entries: list[Entry], previous_entries: list[Entry], key='compiled', logger=None) -> list[tuple[int, Entry]]:
|
||||
# Hash all current and previous entries to identify new entries
|
||||
start = time.time()
|
||||
current_entry_hashes = list(map(TextToJsonl.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextToJsonl.hash_func(key), previous_entries))
|
||||
end = time.time()
|
||||
logger.debug(f"Hash previous, current entries: {end - start} seconds")
|
||||
with timer("Hash previous, current entries", logger):
|
||||
current_entry_hashes = list(map(TextToJsonl.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextToJsonl.hash_func(key), previous_entries))
|
||||
|
||||
start = time.time()
|
||||
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
|
||||
hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
|
||||
with timer("Identify, Mark, Combine new, existing entries", logger):
|
||||
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
|
||||
hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
|
||||
|
||||
# All entries that did not exist in the previous set are to be added
|
||||
new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes)
|
||||
# All entries that exist in both current and previous sets are kept
|
||||
existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
|
||||
# All entries that did not exist in the previous set are to be added
|
||||
new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes)
|
||||
# All entries that exist in both current and previous sets are kept
|
||||
existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
|
||||
|
||||
# Mark new entries with -1 id to flag for later embeddings generation
|
||||
new_entries = [
|
||||
(-1, hash_to_current_entries[entry_hash])
|
||||
for entry_hash in new_entry_hashes
|
||||
]
|
||||
# Set id of existing entries to their previous ids to reuse their existing encoded embeddings
|
||||
existing_entries = [
|
||||
(previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
|
||||
for entry_hash in existing_entry_hashes
|
||||
]
|
||||
# Mark new entries with -1 id to flag for later embeddings generation
|
||||
new_entries = [
|
||||
(-1, hash_to_current_entries[entry_hash])
|
||||
for entry_hash in new_entry_hashes
|
||||
]
|
||||
# Set id of existing entries to their previous ids to reuse their existing encoded embeddings
|
||||
existing_entries = [
|
||||
(previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
|
||||
for entry_hash in existing_entry_hashes
|
||||
]
|
||||
|
||||
existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
|
||||
entries_with_ids = existing_entries_sorted + new_entries
|
||||
end = time.time()
|
||||
logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds")
|
||||
existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
|
||||
entries_with_ids = existing_entries_sorted + new_entries
|
||||
|
||||
return entries_with_ids
|
||||
Reference in New Issue
Block a user