Merge branch 'master' into support-incremental-updates-of-embeddings

2026-03-04 21:29:12 +00:00 · 2022-09-10 22:11:43 +03:00
parent 030fab9bb2 ed8d432fdd
commit ebd5039bd1
20 changed files with 225 additions and 70 deletions
--- a/src/interface/desktop/main_window.py
+++ b/src/interface/desktop/main_window.py
@@ -92,7 +92,7 @@ class MainWindow(QtWidgets.QMainWindow):
        search_type_layout = QtWidgets.QVBoxLayout(search_type_settings)
        enable_search_type = SearchCheckBox(f"Search {search_type.name}", search_type)
        # Add file browser to set input files for given search type
-        input_files = FileBrowser(file_input_text, search_type, current_content_files)
+        input_files = FileBrowser(file_input_text, search_type, current_content_files or [])

        # Set enabled/disabled based on checkbox state
        enable_search_type.setChecked(current_content_files is not None and len(current_content_files) > 0)
--- a/src/interface/emacs/khoj.el
+++ b/src/interface/emacs/khoj.el
@@ -5,7 +5,7 @@
 ;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
 ;; Description: Natural, Incremental Search for your Second Brain
 ;; Keywords: search, org-mode, outlines, markdown, beancount, ledger, image
-;; Version: 0.1.6
+;; Version: 0.1.9
 ;; Package-Requires: ((emacs "27.1"))
 ;; URL: http://github.com/debanjum/khoj/interface/emacs

--- a/src/main.py
+++ b/src/main.py
@@ -3,8 +3,12 @@ import os
 import signal
 import sys
 import logging
+import warnings
 from platform import system

+# Ignore non-actionable warnings
+warnings.filterwarnings("ignore", message=r'snapshot_download.py has been made private', category=FutureWarning)
+
 # External Packages
 import uvicorn
 from fastapi import FastAPI
@@ -63,6 +67,9 @@ def run():
    args = cli(state.cli_args)
    set_state(args)

+    # Create app directory, if it doesn't exist
+    state.config_file.parent.mkdir(parents=True, exist_ok=True)
+
    # Setup Logger
    if args.verbose == 0:
        logger.setLevel(logging.WARN)
--- a/src/processor/org_mode/orgnode.py
+++ b/src/processor/org_mode/orgnode.py
@@ -41,8 +41,13 @@ from os.path import relpath
 indent_regex = re.compile(r'^\s*')

 def normalize_filename(filename):
-   file_relative_to_home = f'~/{relpath(filename, start=Path.home())}'
-   escaped_filename = f'{file_relative_to_home}'.replace("[","\[").replace("]","\]")
+   "Normalize and escape filename for rendering"
+   if not Path(filename).is_absolute():
+      # Normalize relative filename to a path relative to the home directory (rendered as ~/...)
+      normalized_filename = f'~/{relpath(filename, start=Path.home())}'
+   else:
+      normalized_filename = filename
+   escaped_filename = f'{normalized_filename}'.replace("[","\[").replace("]","\]")
   return escaped_filename

 def makelist(filename):
@@ -61,7 +66,7 @@ def makelist(filename):

   todos         = { "TODO": "", "WAITING": "", "ACTIVE": "",
                     "DONE": "", "CANCELLED": "", "FAILED": ""} # populated from #+SEQ_TODO line
-   level         = 0
+   level         = ""
   heading       = ""
   bodytext      = ""
   tags          = list()      # set of all tags in headline
@@ -73,6 +78,7 @@ def makelist(filename):
   propdict      = dict()
   in_properties_drawer = False
   in_logbook_drawer = False
+   file_title = f'{filename}'

   for line in f:
       ctr += 1
@@ -111,6 +117,16 @@ def makelist(filename):
              kwlist = re.findall(r'([A-Z]+)\(', line)
              for kw in kwlist: todos[kw] = ""

+           # Set file title from #+TITLE keyword line(s), if present; multiple titles are joined with spaces
+           title_search = re.search(r'^#\+TITLE:\s*(.*)$', line)
+           if title_search and title_search.group(1).strip() != '':
+               title_text = title_search.group(1).strip()
+               if file_title == f'{filename}':
+                  file_title = title_text
+               else:
+                  file_title += f' {title_text}'
+               continue
+
           # Ignore Properties Drawers Completely
           if re.search(':PROPERTIES:', line):
              in_properties_drawer=True
@@ -167,7 +183,7 @@ def makelist(filename):
               bodytext = bodytext + line

   # write out last node
-   thisNode = Orgnode(level, heading, bodytext, tags)
+   thisNode = Orgnode(level, heading or file_title, bodytext, tags)
   thisNode.setProperties(propdict)
   if sched_date:
      thisNode.setScheduled(sched_date)
@@ -196,8 +212,12 @@ def makelist(filename):
          n.setHeading(prtysrch.group(2))

       # Set SOURCE property to a file+heading based org-mode link to the entry
-       escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]")
-       n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]'
+       if n.Level() == 0:
+         n.properties['LINE'] = f'file:{normalize_filename(filename)}::0'
+         n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}]]'
+       else:
+         escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]")
+         n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]'

   return nodelist

--- a/src/search_filter/word_filter.py
+++ b/src/search_filter/word_filter.py
@@ -27,7 +27,7 @@ class WordFilter(BaseFilter):

    def load(self, entries, regenerate=False):
        start = time.time()
-        self.cache = {}  # Clear cache on reload of filter
+        self.cache = {}  # Clear cache on filter (re-)load
        entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
        # Create map of words to entries they exist in
        for entry_index, entry in enumerate(entries):
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@@ -11,7 +11,7 @@ from src.search_filter.base_filter import BaseFilter

 # Internal Packages
 from src.utils import state
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
+from src.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import TextSearchConfig, TextContentConfig
 from src.utils.jsonl import load_jsonl
@@ -187,6 +187,8 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon

    # Extract Updated Entries
    entries = extract_entries(config.compressed_jsonl)
+    if is_none_or_empty(entries):
+        raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}")
    top_k = min(len(entries), top_k)  # top_k hits can't be more than the total entries in corpus

    # Compute or Load Embeddings
--- a/src/utils/cli.py
+++ b/src/utils/cli.py
@@ -1,6 +1,7 @@
 # Standard Packages
 import argparse
 import pathlib
+from importlib.metadata import version

 # Internal Packages
 from src.utils.helpers import resolve_absolute_path
@@ -17,9 +18,15 @@ def cli(args=None):
    parser.add_argument('--host', type=str, default='127.0.0.1', help="Host address of the server. Default: 127.0.0.1")
    parser.add_argument('--port', '-p', type=int, default=8000, help="Port of the server. Default: 8000")
    parser.add_argument('--socket', type=pathlib.Path, help="Path to UNIX socket for server. Use to run server behind reverse proxy. Default: /tmp/uvicorn.sock")
+    parser.add_argument('--version', '-V', action='store_true', help="Print the installed Khoj version and exit")

    args = parser.parse_args(args)

+    if args.version:
+        # Show version of khoj installed and exit
+        print(version('khoj-assistant'))
+        exit(0)
+
    # Normalize config_file path to absolute path
    args.config_file = resolve_absolute_path(args.config_file)

--- a/src/utils/jsonl.py
+++ b/src/utils/jsonl.py
@@ -44,7 +44,7 @@ def dump_jsonl(jsonl_data, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(jsonl_data)

-    logger.info(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}')
+    logger.info(f'Wrote jsonl data to {output_path}')


 def compress_jsonl_data(jsonl_data, output_path):