mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Support Natural Search on Markdown Files
- Reason:
Allow natural search on markdown based notes, documentation,
websites etc
- Details:
- Create markdown processor to extract Markdown entries (identified by
Heading) into standard jsonl format required by text_search
- Update API, Configs to support interfacing with new markdown type
- Update Emacs, Web clients to support interfacing with new markdown
type via API
- Update Readme to mention that markdown is also supported
Closes #35
This commit is contained in:
@@ -16,7 +16,7 @@
|
||||
#+end_src
|
||||
|
||||
*** 2. Configure
|
||||
- [Required] Update [[./docker-compose.yml][docker-compose.yml]] to mount your images, org-mode notes and beancount directories
|
||||
- [Required] Update [[./docker-compose.yml][docker-compose.yml]] to mount your images, (org-mode or markdown) notes and beancount directories
|
||||
- [Optional] Edit application configuration in [[./config/sample_config.yml][sample_config.yml]]
|
||||
|
||||
*** 3. Run
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
|
||||
|
||||
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
|
||||
;; Version: 0.1
|
||||
;; Keywords: search, org-mode, outlines
|
||||
;; Version: 1.0
|
||||
;; Keywords: search, org-mode, outlines, markdown, image
|
||||
;; URL: http://github.com/debanjum/khoj/interface/emacs
|
||||
|
||||
;; This file is NOT part of GNU Emacs.
|
||||
@@ -27,7 +27,7 @@
|
||||
;;; Commentary:
|
||||
|
||||
;; This package provides natural language search on org-mode notes,
|
||||
;; beancount transactions and images.
|
||||
;; markdown files, beancount transactions and images.
|
||||
;; It is a wrapper that interfaces with transformer based ML models.
|
||||
;; The models search capabilities are exposed via the Khoj HTTP API
|
||||
|
||||
@@ -46,6 +46,18 @@
|
||||
:group 'khoj
|
||||
:type 'integer)
|
||||
|
||||
(defun khoj--extract-entries-as-markdown (json-response query)
  "Render JSON-RESPONSE from the API as markdown text under a QUERY heading."
  (let* (;; pull the `entry' field out of every result as a string
         (entry-texts (mapcar
                       (lambda (result) (format "%s" (alist-get 'entry result)))
                       json-response))
         ;; printing the list with %s yields "(entry1 entry2 ...)" below the heading
         (rendered (format "# %s\n%s" query entry-texts)))
    ;; remove a leading (, ) or SPC from the start of each line of the rendered string
    (replace-regexp-in-string "^[\(\) ]" "" rendered)))
|
||||
|
||||
(defun khoj--extract-entries-as-org (json-response query)
|
||||
"Convert json response from API to org-mode entries"
|
||||
;; remove leading (, ) or SPC from extracted entries string
|
||||
@@ -59,7 +71,7 @@
|
||||
json-response))))
|
||||
|
||||
(defun khoj--extract-entries-as-images (json-response query)
|
||||
"Convert json response from API to org-mode entries with images"
|
||||
"Convert json response from API to html with images"
|
||||
;; remove leading (, ) or SPC from extracted entries string
|
||||
(replace-regexp-in-string
|
||||
"[\(\) ]$" ""
|
||||
@@ -103,6 +115,7 @@
|
||||
((equal buffer-name "Music.org") "music")
|
||||
((equal file-extension "bean") "ledger")
|
||||
((equal file-extension "org") "notes")
|
||||
((or (equal file-extension "markdown") (equal file-extension "md")) "markdown")
|
||||
(t "notes"))))
|
||||
|
||||
(defun khoj--construct-api-query (query search-type)
|
||||
@@ -111,10 +124,10 @@
|
||||
|
||||
;;;###autoload
|
||||
(defun khoj (query)
|
||||
"Khoj on org-mode content via khoj API"
|
||||
"Search your content naturally using the Khoj API"
|
||||
(interactive "sQuery: ")
|
||||
(let* ((default-type (khoj--buffer-name-to-search-type (buffer-name)))
|
||||
(search-type (completing-read "Type: " '("notes" "ledger" "music" "image") nil t default-type))
|
||||
(search-type (completing-read "Type: " '("notes" "markdown" "ledger" "music" "image") nil t default-type))
|
||||
(url (khoj--construct-api-query query search-type))
|
||||
(buff (get-buffer-create (format "*Khoj (q:%s t:%s)*" query search-type))))
|
||||
;; get json response from api
|
||||
@@ -122,17 +135,19 @@
|
||||
(let ((inhibit-read-only t))
|
||||
(erase-buffer)
|
||||
(url-insert-file-contents url)))
|
||||
;; convert json response to org-mode entries
|
||||
;; render json response into formatted entries
|
||||
(with-current-buffer buff
|
||||
(let ((inhibit-read-only t)
|
||||
(json-response (json-parse-buffer :object-type 'alist)))
|
||||
(erase-buffer)
|
||||
(insert
|
||||
(cond ((or (equal search-type "notes") (equal search-type "music")) (khoj--extract-entries-as-org json-response query))
|
||||
((equal search-type "markdown") (khoj--extract-entries-as-markdown json-response query))
|
||||
((equal search-type "ledger") (khoj--extract-entries-as-ledger json-response query))
|
||||
((equal search-type "image") (khoj--extract-entries-as-images json-response query))
|
||||
(t (format "%s" json-response))))
|
||||
(cond ((equal search-type "notes") (org-mode))
|
||||
((equal search-type "markdown") (markdown-mode))
|
||||
((equal search-type "ledger") (beancount-mode))
|
||||
((equal search-type "music") (progn (org-mode)
|
||||
(org-music-mode)))
|
||||
|
||||
@@ -63,6 +63,7 @@
|
||||
<select id="type">
|
||||
<option value="image">Image</option>
|
||||
<option value="notes">Notes</option>
|
||||
<option value="markdown">Markdown</option>
|
||||
<option value="ledger">Ledger</option>
|
||||
<option value="music">Music</option>
|
||||
</select>
|
||||
|
||||
15
src/main.py
15
src/main.py
@@ -14,6 +14,7 @@ from fastapi.templating import Jinja2Templates
|
||||
from src.search_type import image_search, text_search
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
||||
from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
|
||||
from src.utils.helpers import get_absolute_path, get_from_dict
|
||||
from src.utils.cli import cli
|
||||
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
||||
@@ -80,6 +81,13 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
|
||||
# collate and return results
|
||||
return text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
if (t == SearchType.Markdown or t == None) and model.notes_search:
|
||||
# query markdown files
|
||||
hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[explicit_filter, date_filter])
|
||||
|
||||
# collate and return results
|
||||
return text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
if (t == SearchType.Ledger or t == None) and model.ledger_search:
|
||||
# query transactions
|
||||
hits, entries = text_search.query(user_query, model.ledger_search, filters=[explicit_filter, date_filter])
|
||||
@@ -88,7 +96,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
|
||||
return text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
if (t == SearchType.Image or t == None) and model.image_search:
|
||||
# query transactions
|
||||
# query images
|
||||
hits = image_search.query(user_query, results_count, model.image_search)
|
||||
output_directory = f'{os.getcwd()}/{web_directory}'
|
||||
|
||||
@@ -172,6 +180,11 @@ def initialize_search(config: FullConfig, regenerate: bool, t: SearchType = None
|
||||
# Extract Entries, Generate Music Embeddings
|
||||
model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)
|
||||
|
||||
# Initialize Markdown Search
|
||||
if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
|
||||
# Extract Entries, Generate Markdown Embeddings
|
||||
model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)
|
||||
|
||||
# Initialize Ledger Search
|
||||
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
||||
# Extract Entries, Generate Ledger Embeddings
|
||||
|
||||
0
src/processor/markdown/__init__.py
Normal file
0
src/processor/markdown/__init__.py
Normal file
108
src/processor/markdown/markdown_to_jsonl.py
Normal file
108
src/processor/markdown/markdown_to_jsonl.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Standard Packages
|
||||
import json
|
||||
import argparse
|
||||
import pathlib
|
||||
import glob
|
||||
import re
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
|
||||
|
||||
# Define Functions
|
||||
def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, verbose=0):
    """Convert Markdown files to JSONL and write the result to output_file.

    Args:
        markdown_files: Markdown file paths to process; may be None or empty.
        markdown_file_filter: File filter forwarded to get_markdown_files;
            may be None or empty.
        output_file: Destination path object. Its ``suffix`` picks the writer:
            ".gz" is written compressed, ".jsonl" is written as plain text.
            Any other suffix results in nothing being written.
        verbose: Verbosity level forwarded to the helper functions.

    Returns:
        The list of entries extracted from the Markdown files.

    Raises:
        SystemExit: With status 1 when neither markdown_files nor
            markdown_file_filter is specified.
    """
    # Input Validation
    if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
        print("At least one of markdown-files or markdown-file-filter is required to be specified")
        # Raise SystemExit directly: the exit() helper is injected by the site
        # module for interactive sessions and is not guaranteed to exist
        raise SystemExit(1)

    # Get Markdown Files to Process
    markdown_files = get_markdown_files(markdown_files, markdown_file_filter, verbose)

    # Extract Entries from specified Markdown files
    entries = extract_markdown_entries(markdown_files)

    # Process Each Entry from All Notes Files
    jsonl_data = convert_markdown_entries_to_jsonl(entries, verbose=verbose)

    # Write JSONL formatted Data, compressed or plain depending on the file suffix
    if output_file.suffix == ".gz":
        compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
    elif output_file.suffix == ".jsonl":
        dump_jsonl(jsonl_data, output_file, verbose=verbose)

    return entries
|
||||
|
||||
|
||||
def get_markdown_files(markdown_files=None, markdown_file_filter=None, verbose=0):
    """Get Markdown files to process.

    Args:
        markdown_files: Iterable of Markdown file paths; may be None or empty.
        markdown_file_filter: Glob pattern matching further files to include;
            may be None or empty.
        verbose: When greater than 0, print the resolved set of files.

    Returns:
        A set with the union of the explicitly listed files and the files
        matched by the glob pattern, each passed through get_absolute_path.
        Files without a ".md" or ".markdown" extension are still returned;
        they only trigger a warning.
    """
    absolute_markdown_files, filtered_markdown_files = set(), set()
    if markdown_files:
        absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
    if markdown_file_filter:
        filtered_markdown_files = set(glob.glob(get_absolute_path(markdown_file_filter)))

    all_markdown_files = absolute_markdown_files | filtered_markdown_files

    files_with_non_markdown_extensions = {
        md_file
        for md_file in all_markdown_files
        if not md_file.endswith((".md", ".markdown"))
    }

    # Check the set itself for emptiness; any() would test the truthiness of
    # each path string instead
    if files_with_non_markdown_extensions:
        print(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}")

    if verbose > 0:
        print(f'Processing files: {all_markdown_files}')

    return all_markdown_files
|
||||
|
||||
|
||||
def extract_markdown_entries(markdown_files):
    """Extract entries by heading from specified Markdown files.

    Each file is split at every line that starts with '#'. The split consumes
    that '#', so it is added back in front of every non-empty chunk after the
    chunk is stripped of the characters in empty_escape_sequences.

    Args:
        markdown_files: Iterable of paths of the Markdown files to read.

    Returns:
        A list of entry strings, each starting with '#'. Text that precedes
        the first heading of a file is returned as an entry as well.
    """
    # Regex to extract Markdown Entries by Heading
    markdown_heading_regex = r'^#'

    entries = []
    for markdown_file in markdown_files:
        with open(markdown_file) as f:
            markdown_content = f.read()

        for section in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
            entry_text = section.strip(empty_escape_sequences)
            # A file starting with a heading (or an empty file) yields an empty
            # first chunk; skip it instead of emitting a bare '#' entry
            if entry_text:
                entries.append(f'#{entry_text}')

    return entries
|
||||
|
||||
|
||||
def convert_markdown_entries_to_jsonl(entries, verbose=0):
    """Convert each Markdown entry to JSON and collate them as JSONL.

    Args:
        entries: Sequence of Markdown entry strings.
        verbose: When greater than 0, print how many entries were converted.

    Returns:
        A string with one JSON object per line, each line terminated by a
        newline. Every object stores the entry under both the 'compiled' and
        the 'raw' key. Returns an empty string when there are no entries.
    """
    # Serialize every entry to its own newline-terminated JSON line and join
    # them once, rather than growing the string with repeated concatenation
    jsonl = ''.join(
        f"{json.dumps({'compiled': entry, 'raw': entry}, ensure_ascii=False)}\n"
        for entry in entries
    )

    if verbose > 0:
        print(f"Converted {len(entries)} to jsonl format")

    return jsonl
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Setup Argument Parser
    parser = argparse.ArgumentParser(description="Map Markdown entries into (compressed) JSONL format")
    parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of markdown files to process")
    # The filter is expanded with glob.glob by get_markdown_files, so describe it as a glob pattern
    parser.add_argument('--input-filter', type=str, default=None, help="Glob pattern filter for markdown files to process")
    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
    args = parser.parse_args()

    # Map notes in Markdown files to (compressed) JSONL formatted file
    markdown_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)
|
||||
@@ -11,6 +11,7 @@ class SearchType(str, Enum):
|
||||
Notes = "notes"
|
||||
Ledger = "ledger"
|
||||
Music = "music"
|
||||
Markdown = "markdown"
|
||||
Image = "image"
|
||||
|
||||
|
||||
@@ -39,6 +40,7 @@ class SearchModels():
|
||||
notes_search: TextSearchModel = None
|
||||
ledger_search: TextSearchModel = None
|
||||
music_search: TextSearchModel = None
|
||||
markdown_search: TextSearchModel = None
|
||||
image_search: ImageSearchModel = None
|
||||
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ class ContentConfig(ConfigBase):
|
||||
ledger: Optional[TextContentConfig]
|
||||
image: Optional[ImageContentConfig]
|
||||
music: Optional[TextContentConfig]
|
||||
markdown: Optional[TextContentConfig]
|
||||
|
||||
class TextSearchConfig(ConfigBase):
|
||||
encoder: Optional[str]
|
||||
|
||||
Reference in New Issue
Block a user