Add context uri field to deep-link to the line number in the original doc

This commit is contained in:
Debanjum
2025-06-23 19:04:06 -07:00
parent 820b4523fd
commit e90ab5341a
6 changed files with 28 additions and 8 deletions

View File

@@ -23,6 +23,7 @@ logger = logging.getLogger(__name__)
class Context(PydanticBaseModel):
compiled: str
file: str
uri: str
query: Optional[str] = None

View File

@@ -99,6 +99,7 @@ class TextToEntries(ABC):
entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw)
entry.heading = TextToEntries.clean_field(entry.heading)
entry.file = TextToEntries.clean_field(entry.file)
entry_uri = TextToEntries.clean_field(entry_uri)
chunked_entries.append(
Entry(
@@ -107,6 +108,7 @@ class TextToEntries(ABC):
heading=entry.heading,
file=entry.file,
corpus_id=corpus_id,
uri=entry_uri,
)
)
@@ -192,6 +194,7 @@ class TextToEntries(ABC):
file_type=file_type,
hashed_value=entry_hash,
corpus_id=entry.corpus_id,
url=entry.uri,
search_model=model,
file_object=file_object,
)

View File

@@ -646,7 +646,7 @@ def generate_chatml_messages_with_context(
if not is_none_or_empty(chat.context):
references = "\n\n".join(
{f"# File: {item.file}\n## {item.compiled}\n" for item in chat.context or [] if isinstance(item, dict)}
{f"# URI: {item.uri}\n## {item.compiled}\n" for item in chat.context or [] if isinstance(item, dict)}
)
message_context += [{"type": "text", "text": f"{prompts.notes_conversation.format(references=references)}"}]

View File

@@ -1265,6 +1265,7 @@ async def search_documents(
"query": item.additional["query"],
"compiled": item["entry"],
"file": item.additional["file"],
"uri": item.additional["uri"],
}
for item in search_results
]
@@ -2867,6 +2868,7 @@ async def view_file_content(
{
"query": query,
"file": path,
"uri": path,
"compiled": filtered_text,
}
]
@@ -2878,7 +2880,7 @@ async def view_file_content(
logger.error(error_msg, exc_info=True)
# Return an error result in the expected format
yield [{"query": query, "file": path, "compiled": error_msg}]
yield [{"query": query, "file": path, "uri": path, "compiled": error_msg}]
async def grep_files(
@@ -2982,7 +2984,7 @@ async def grep_files(
max_results,
)
if not line_matches:
yield {"query": query, "file": path_prefix, "compiled": "No matches found."}
yield {"query": query, "file": path_prefix, "uri": path_prefix, "compiled": "No matches found."}
return
# Truncate matched lines list if too long
@@ -2991,7 +2993,7 @@ async def grep_files(
f"... {len(line_matches) - max_results} more results found. Use stricter regex or path to narrow down results."
]
yield {"query": query, "file": path_prefix or "", "compiled": "\n".join(line_matches)}
yield {"query": query, "file": path_prefix, "uri": path_prefix, "compiled": "\n".join(line_matches)}
except Exception as e:
error_msg = f"Error using grep files tool: {str(e)}"
@@ -3000,6 +3002,7 @@ async def grep_files(
{
"query": _generate_query(0, 0, path_prefix or "", regex_pattern, lines_before, lines_after),
"file": path_prefix,
"uri": path_prefix,
"compiled": error_msg,
}
]
@@ -3032,7 +3035,7 @@ async def list_files(
file_objects = await FileObjectAdapters.aget_file_objects_by_path_prefix(user, path)
if not file_objects:
yield {"query": _generate_query(0, path, pattern), "file": path, "compiled": "No files found."}
yield {"query": _generate_query(0, path, pattern), "file": path, "uri": path, "compiled": "No files found."}
return
# Extract file names from file objects
@@ -3047,7 +3050,7 @@ async def list_files(
query = _generate_query(len(files), path, pattern)
if not files:
yield {"query": query, "file": path, "compiled": "No files found."}
yield {"query": query, "file": path, "uri": path, "compiled": "No files found."}
return
# Truncate the list if it's too long
@@ -3057,9 +3060,9 @@ async def list_files(
f"... {len(files) - max_files} more files found. Use glob pattern to narrow down results."
]
yield {"query": query, "file": path, "compiled": "\n- ".join(files)}
yield {"query": query, "file": path, "uri": path, "compiled": "\n- ".join(files)}
except Exception as e:
error_msg = f"Error listing files in {path}: {str(e)}"
logger.error(error_msg, exc_info=True)
yield {"query": query, "file": path, "compiled": error_msg}
yield {"query": query, "file": path, "uri": path, "compiled": error_msg}

View File

@@ -157,6 +157,7 @@ def collate_results(hits, dedupe=True):
"additional": {
"source": hit.file_source,
"file": hit.file_path,
"uri": hit.url,
"compiled": hit.compiled,
"heading": hit.heading,
},
@@ -180,6 +181,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]):
"additional": {
"source": hit.additional["source"],
"file": hit.additional["file"],
"uri": hit.additional["uri"],
"query": hit.additional["query"],
"compiled": hit.additional["compiled"],
"heading": hit.additional["heading"],

View File

@@ -176,6 +176,7 @@ class Entry:
compiled: str
heading: Optional[str]
file: Optional[str]
uri: Optional[str] = None
corpus_id: str
def __init__(
@@ -184,6 +185,7 @@ class Entry:
compiled: str = None,
heading: Optional[str] = None,
file: Optional[str] = None,
uri: Optional[str] = None,
corpus_id: uuid.UUID = None,
):
self.raw = raw
@@ -191,6 +193,14 @@ class Entry:
self.heading = heading
self.file = file
self.corpus_id = str(corpus_id)
if uri:
self.uri = uri
elif file and (file.startswith("http") or file.startswith("file://")):
self.uri = file
elif file:
self.uri = f"file://{file}"
else:
self.uri = None
def to_json(self) -> str:
return json.dumps(self.__dict__, ensure_ascii=False)
@@ -206,4 +216,5 @@ class Entry:
file=dictionary.get("file", None),
heading=dictionary.get("heading", None),
corpus_id=dictionary.get("corpus_id", None),
uri=dictionary.get("uri", None),
)