From e90ab5341ac5c9a7e9a559b5a57478a24ec6a7b3 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Mon, 23 Jun 2025 19:04:06 -0700 Subject: [PATCH] Add context uri field to deeplink line number in original doc --- src/khoj/database/models/__init__.py | 1 + src/khoj/processor/content/text_to_entries.py | 3 +++ src/khoj/processor/conversation/utils.py | 2 +- src/khoj/routers/helpers.py | 17 ++++++++++------- src/khoj/search_type/text_search.py | 2 ++ src/khoj/utils/rawconfig.py | 11 +++++++++++ 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 4a952de4..1a10f962 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -23,6 +23,7 @@ logger = logging.getLogger(__name__) class Context(PydanticBaseModel): compiled: str file: str + uri: str query: Optional[str] = None diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index 8e0b3322..bac55aa4 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -99,6 +99,7 @@ class TextToEntries(ABC): entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw) entry.heading = TextToEntries.clean_field(entry.heading) entry.file = TextToEntries.clean_field(entry.file) + entry_uri = TextToEntries.clean_field(entry_uri) chunked_entries.append( Entry( @@ -107,6 +108,7 @@ class TextToEntries(ABC): heading=entry.heading, file=entry.file, corpus_id=corpus_id, + uri=entry_uri, ) ) @@ -192,6 +194,7 @@ class TextToEntries(ABC): file_type=file_type, hashed_value=entry_hash, corpus_id=entry.corpus_id, + url=entry.uri, search_model=model, file_object=file_object, ) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index f4deb8d9..58b36baa 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -646,7 +646,7 @@ def generate_chatml_messages_with_context( if not is_none_or_empty(chat.context): references = "\n\n".join( - {f"# File: {item.file}\n## {item.compiled}\n" for item in chat.context or [] if isinstance(item, dict)} + {f"# URI: {item.uri}\n## {item.compiled}\n" for item in chat.context or [] if isinstance(item, dict)} ) message_context += [{"type": "text", "text": f"{prompts.notes_conversation.format(references=references)}"}] diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 9877233e..0643d134 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -1265,6 +1265,7 @@ async def search_documents( "query": item.additional["query"], "compiled": item["entry"], "file": item.additional["file"], + "uri": item.additional["uri"], } for item in search_results ] @@ -2867,6 +2868,7 @@ async def view_file_content( { "query": query, "file": path, + "uri": path, "compiled": filtered_text, } ] @@ -2878,7 +2880,7 @@ async def view_file_content( logger.error(error_msg, exc_info=True) # Return an error result in the expected format - yield [{"query": query, "file": path, "compiled": error_msg}] + yield [{"query": query, "file": path, "uri": path, "compiled": error_msg}] async def grep_files( @@ -2982,7 +2984,7 @@ async def grep_files( max_results, ) if not line_matches: - yield {"query": query, "file": path_prefix, "compiled": "No matches found."} + yield {"query": query, "file": path_prefix, "uri": path_prefix, "compiled": "No matches found."} return # Truncate matched lines list if too long @@ -2991,7 +2993,7 @@ async def grep_files( f"... {len(line_matches) - max_results} more results found. Use stricter regex or path to narrow down results." ] - yield {"query": query, "file": path_prefix or "", "compiled": "\n".join(line_matches)} + yield {"query": query, "file": path_prefix, "uri": path_prefix, "compiled": "\n".join(line_matches)} except Exception as e: error_msg = f"Error using grep files tool: {str(e)}" @@ -3000,6 +3002,7 @@ async def grep_files( { "query": _generate_query(0, 0, path_prefix or "", regex_pattern, lines_before, lines_after), "file": path_prefix, + "uri": path_prefix, "compiled": error_msg, } ] @@ -3032,7 +3035,7 @@ async def list_files( file_objects = await FileObjectAdapters.aget_file_objects_by_path_prefix(user, path) if not file_objects: - yield {"query": _generate_query(0, path, pattern), "file": path, "compiled": "No files found."} + yield {"query": _generate_query(0, path, pattern), "file": path, "uri": path, "compiled": "No files found."} return # Extract file names from file objects @@ -3047,7 +3050,7 @@ async def list_files( query = _generate_query(len(files), path, pattern) if not files: - yield {"query": query, "file": path, "compiled": "No files found."} + yield {"query": query, "file": path, "uri": path, "compiled": "No files found."} return # Truncate the list if it's too long @@ -3057,9 +3060,9 @@ async def list_files( f"... {len(files) - max_files} more files found. Use glob pattern to narrow down results." ] - yield {"query": query, "file": path, "compiled": "\n- ".join(files)} + yield {"query": query, "file": path, "uri": path, "compiled": "\n- ".join(files)} except Exception as e: error_msg = f"Error listing files in {path}: {str(e)}" logger.error(error_msg, exc_info=True) - yield {"query": query, "file": path, "compiled": error_msg} + yield {"query": query, "file": path, "uri": path, "compiled": error_msg} diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 3fafa44b..b2b3453b 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -157,6 +157,7 @@ def collate_results(hits, dedupe=True): "additional": { "source": hit.file_source, "file": hit.file_path, + "uri": hit.url, "compiled": hit.compiled, "heading": hit.heading, }, @@ -180,6 +181,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]): "additional": { "source": hit.additional["source"], "file": hit.additional["file"], + "uri": hit.additional["uri"], "query": hit.additional["query"], "compiled": hit.additional["compiled"], "heading": hit.additional["heading"], diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index e0248a66..e3662db5 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -176,6 +176,7 @@ class Entry: compiled: str heading: Optional[str] file: Optional[str] + uri: Optional[str] = None corpus_id: str def __init__( @@ -184,6 +185,7 @@ class Entry: compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None, + uri: Optional[str] = None, corpus_id: uuid.UUID = None, ): self.raw = raw @@ -191,6 +193,14 @@ class Entry: self.heading = heading self.file = file self.corpus_id = str(corpus_id) + if uri: + self.uri = uri + elif file and (file.startswith("http") or file.startswith("file://")): + self.uri = file + elif file: + self.uri = f"file://{file}" + else: + self.uri = None def to_json(self) -> str: return json.dumps(self.__dict__, ensure_ascii=False) @@ -206,4 +216,5 @@ class Entry: file=dictionary.get("file", None), heading=dictionary.get("heading", None), corpus_id=dictionary.get("corpus_id", None), + uri=dictionary.get("uri", None), )