From d4d7dbaca60d10c9e8dcde0c290d83320f6cf7b7 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 21 Jul 2022 20:22:24 +0400 Subject: [PATCH] Support Natural Search on Markdown Files - Reason: Allow natural search on markdown based notes, documentation, websites etc - Details: - Create markdown processor to extract Markdown entries (identified by Heading) into standard jsonl format required by text_search - Update API, Configs to support interfacing with new markdown type - Update Emacs, Web clients to support interfacing with new markdown type via API - Update Readme to mention markdown is also supported Closes #35 --- README.org | 2 +- src/interface/emacs/khoj.el | 29 ++++-- src/interface/web/index.html | 1 + src/main.py | 15 ++- src/processor/markdown/__init__.py | 0 src/processor/markdown/markdown_to_jsonl.py | 108 ++++++++++++++++++++ src/utils/config.py | 2 + src/utils/rawconfig.py | 1 + 8 files changed, 149 insertions(+), 9 deletions(-) create mode 100644 src/processor/markdown/__init__.py create mode 100644 src/processor/markdown/markdown_to_jsonl.py diff --git a/README.org b/README.org index 428adfb0..1e60b17b 100644 --- a/README.org +++ b/README.org @@ -16,7 +16,7 @@ #+end_src *** 2. Configure - - [Required] Update [[./docker-compose.yml][docker-compose.yml]] to mount your images, org-mode notes and beancount directories + - [Required] Update [[./docker-compose.yml][docker-compose.yml]] to mount your images, (org-mode or markdown) notes and beancount directories - [Optional] Edit application configuration in [[./config/sample_config.yml][sample_config.yml]] *** 3.
Run diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 82359649..6ca9ee1d 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -3,8 +3,8 @@ ;; Copyright (C) 2021-2022 Debanjum Singh Solanky ;; Author: Debanjum Singh Solanky -;; Version: 0.1 -;; Keywords: search, org-mode, outlines +;; Version: 1.0 +;; Keywords: search, org-mode, outlines, markdown, image ;; URL: http://github.com/debanjum/khoj/interface/emacs ;; This file is NOT part of GNU Emacs. @@ -27,7 +27,7 @@ ;;; Commentary: ;; This package provides natural language search on org-mode notes, -;; beancount transactions and images. +;; markdown files, beancount transactions and images. ;; It is a wrapper that interfaces with transformer based ML models. ;; The models search capabilities are exposed via the Khoj HTTP API @@ -46,6 +46,18 @@ :group 'khoj :type 'integer) +(defun khoj--extract-entries-as-markdown (json-response query) + "Convert json response from API to markdown entries" + ;; remove leading (, ) or SPC from extracted entries string + (replace-regexp-in-string + "^[\(\) ]" "" + ;; extract entries from response as single string and convert to entries + (format "# %s\n%s" + query + (mapcar + (lambda (args) (format "%s" (cdr (assoc 'entry args)))) + json-response)))) + (defun khoj--extract-entries-as-org (json-response query) "Convert json response from API to org-mode entries" ;; remove leading (, ) or SPC from extracted entries string @@ -59,7 +71,7 @@ json-response)))) (defun khoj--extract-entries-as-images (json-response query) - "Convert json response from API to org-mode entries with images" + "Convert json response from API to html with images" ;; remove leading (, ) or SPC from extracted entries string (replace-regexp-in-string "[\(\) ]$" "" @@ -103,6 +115,7 @@ ((equal buffer-name "Music.org") "music") ((equal file-extension "bean") "ledger") ((equal file-extension "org") "notes") + ((or (equal file-extension "markdown") (equal file-extension 
"md")) "markdown") (t "notes")))) (defun khoj--construct-api-query (query search-type) @@ -111,10 +124,10 @@ ;;;###autoload (defun khoj (query) - "Khoj on org-mode content via khoj API" + "Search your content naturally using the Khoj API" (interactive "sQuery: ") (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) - (search-type (completing-read "Type: " '("notes" "ledger" "music" "image") nil t default-type)) + (search-type (completing-read "Type: " '("notes" "markdown" "ledger" "music" "image") nil t default-type)) (url (khoj--construct-api-query query search-type)) (buff (get-buffer-create (format "*Khoj (q:%s t:%s)*" query search-type)))) ;; get json response from api @@ -122,17 +135,19 @@ (let ((inhibit-read-only t)) (erase-buffer) (url-insert-file-contents url))) - ;; convert json response to org-mode entries + ;; render json response into formatted entries (with-current-buffer buff (let ((inhibit-read-only t) (json-response (json-parse-buffer :object-type 'alist))) (erase-buffer) (insert (cond ((or (equal search-type "notes") (equal search-type "music")) (khoj--extract-entries-as-org json-response query)) + ((equal search-type "markdown") (khoj--extract-entries-as-markdown json-response query)) ((equal search-type "ledger") (khoj--extract-entries-as-ledger json-response query)) ((equal search-type "image") (khoj--extract-entries-as-images json-response query)) (t (format "%s" json-response)))) (cond ((equal search-type "notes") (org-mode)) + ((equal search-type "markdown") (markdown-mode)) ((equal search-type "ledger") (beancount-mode)) ((equal search-type "music") (progn (org-mode) (org-music-mode))) diff --git a/src/interface/web/index.html b/src/interface/web/index.html index c096ce4c..7e84af22 100644 --- a/src/interface/web/index.html +++ b/src/interface/web/index.html @@ -63,6 +63,7 @@ diff --git a/src/main.py b/src/main.py index 515c64a8..e847f8e1 100644 --- a/src/main.py +++ b/src/main.py @@ -14,6 +14,7 @@ from fastapi.templating 
import Jinja2Templates from src.search_type import image_search, text_search from src.processor.org_mode.org_to_jsonl import org_to_jsonl from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl +from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl from src.utils.helpers import get_absolute_path, get_from_dict from src.utils.cli import cli from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel @@ -80,6 +81,13 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): # collate and return results return text_search.collate_results(hits, entries, results_count) + if (t == SearchType.Markdown or t == None) and model.markdown_search: + # query markdown files + hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[explicit_filter, date_filter]) + + # collate and return results + return text_search.collate_results(hits, entries, results_count) + if (t == SearchType.Ledger or t == None) and model.ledger_search: # query transactions hits, entries = text_search.query(user_query, model.ledger_search, filters=[explicit_filter, date_filter]) @@ -88,7 +96,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): return text_search.collate_results(hits, entries, results_count) if (t == SearchType.Image or t == None) and model.image_search: - # query transactions + # query images hits = image_search.query(user_query, results_count, model.image_search) output_directory = f'{os.getcwd()}/{web_directory}' @@ -172,6 +180,11 @@ def initialize_search(config: FullConfig, regenerate: bool, t: SearchType = None # Extract Entries, Generate Music Embeddings model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose) + # Initialize Markdown Search + if (t == SearchType.Markdown or t == None) and
config.content_type.markdown: + # Extract Entries, Generate Markdown Embeddings + model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose) + # Initialize Ledger Search if (t == SearchType.Ledger or t == None) and config.content_type.ledger: # Extract Entries, Generate Ledger Embeddings diff --git a/src/processor/markdown/__init__.py b/src/processor/markdown/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py new file mode 100644 index 00000000..c2133ede --- /dev/null +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# Standard Packages +import json +import argparse +import pathlib +import glob +import re + +# Internal Packages +from src.utils.helpers import get_absolute_path, is_none_or_empty +from src.utils.constants import empty_escape_sequences +from src.utils.jsonl import dump_jsonl, compress_jsonl_data + + +# Define Functions +def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, verbose=0): + # Input Validation + if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): + print("At least one of markdown-files or markdown-file-filter is required to be specified") + exit(1) + + # Get Markdown Files to Process + markdown_files = get_markdown_files(markdown_files, markdown_file_filter, verbose) + + # Extract Entries from specified Markdown files + entries = extract_markdown_entries(markdown_files) + + # Process Each Entry from All Notes Files + jsonl_data = convert_markdown_entries_to_jsonl(entries, verbose=verbose) + + # Compress JSONL formatted Data + if output_file.suffix == ".gz": + compress_jsonl_data(jsonl_data, output_file, verbose=verbose) + elif output_file.suffix == ".jsonl": + dump_jsonl(jsonl_data, output_file, verbose=verbose) 
+ + return entries + + +def get_markdown_files(markdown_files=None, markdown_file_filter=None, verbose=0): + "Get Markdown files to process" + absolute_markdown_files, filtered_markdown_files = set(), set() + if markdown_files: + absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} + if markdown_file_filter: + filtered_markdown_files = set(glob.glob(get_absolute_path(markdown_file_filter))) + + all_markdown_files = absolute_markdown_files | filtered_markdown_files + + files_with_non_markdown_extensions = { + md_file + for md_file + in all_markdown_files + if not md_file.endswith(".md") and not md_file.endswith('.markdown') + } + + if any(files_with_non_markdown_extensions): + print(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}") + + if verbose > 0: + print(f'Processing files: {all_markdown_files}') + + return all_markdown_files + + +def extract_markdown_entries(markdown_files): + "Extract entries by heading from specified Markdown files" + + # Regex to extract Markdown Entries by Heading + markdown_heading_regex = r'^#' + + entries = [] + for markdown_file in markdown_files: + with open(markdown_file) as f: + markdown_content = f.read() + entries.extend([f'#{entry.strip(empty_escape_sequences)}' + for entry + in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)]) + + return entries + + +def convert_markdown_entries_to_jsonl(entries, verbose=0): + "Convert each Markdown entries to JSON and collate as JSONL" + jsonl = '' + for entry in entries: + entry_dict = {'compiled': entry, 'raw': entry} + # Convert Dictionary to JSON and Append to JSONL string + jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n' + + if verbose > 0: + print(f"Converted {len(entries)} to jsonl format") + + return jsonl + + +if __name__ == '__main__': + # Setup Argument Parser + parser = argparse.ArgumentParser(description="Map Markdown entries into (compressed) JSONL 
format") + parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz") + parser.add_argument('--input-files', '-i', nargs='*', help="List of markdown files to process") + parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for markdown files to process") + parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0") + args = parser.parse_args() + + # Map notes in Markdown files to (compressed) JSONL formatted file + markdown_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose) diff --git a/src/utils/config.py b/src/utils/config.py index c9e41b68..3836801d 100644 --- a/src/utils/config.py +++ b/src/utils/config.py @@ -11,6 +11,7 @@ class SearchType(str, Enum): Notes = "notes" Ledger = "ledger" Music = "music" + Markdown = "markdown" Image = "image" @@ -39,6 +40,7 @@ class SearchModels(): notes_search: TextSearchModel = None ledger_search: TextSearchModel = None music_search: TextSearchModel = None + markdown_search: TextSearchModel = None image_search: ImageSearchModel = None diff --git a/src/utils/rawconfig.py b/src/utils/rawconfig.py index a355d6a4..1a514226 100644 --- a/src/utils/rawconfig.py +++ b/src/utils/rawconfig.py @@ -31,6 +31,7 @@ class ContentConfig(ConfigBase): ledger: Optional[TextContentConfig] image: Optional[ImageContentConfig] music: Optional[TextContentConfig] + markdown: Optional[TextContentConfig] class TextSearchConfig(ConfigBase): encoder: Optional[str]