Support Natural Search on Markdown Files

- Reason: Allow natural search on markdown based notes, documentation, websites, etc. - Details: - Create markdown processor to extract Markdown entries (identified by Heading) into standard jsonl format required by text_search - Update API, Configs to support interfacing with new markdown type - Update Emacs, Web clients to support interfacing with new markdown type via API - Update Readme to mention markdown is also supported Closes #35
2026-03-04 21:29:12 +00:00 · 2022-07-21 20:22:24 +04:00
parent 0602d018c0
commit d4d7dbaca6
8 changed files with 149 additions and 9 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -14,6 +14,7 @@ from fastapi.templating import Jinja2Templates
 from src.search_type import image_search, text_search
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
+from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
 from src.utils.helpers import get_absolute_path, get_from_dict
 from src.utils.cli import cli
 from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@@ -80,6 +81,13 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
        # collate and return results
        return text_search.collate_results(hits, entries, results_count)

+    if (t == SearchType.Markdown or t == None) and model.notes_search:
+        # query markdown files
+        hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[explicit_filter, date_filter])
+
+        # collate and return results
+        return text_search.collate_results(hits, entries, results_count)
+
    if (t == SearchType.Ledger or t == None) and model.ledger_search:
        # query transactions
        hits, entries = text_search.query(user_query, model.ledger_search, filters=[explicit_filter, date_filter])
@@ -88,7 +96,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
        return text_search.collate_results(hits, entries, results_count)

    if (t == SearchType.Image or t == None) and model.image_search:
-        # query transactions
+        # query images
        hits = image_search.query(user_query, results_count, model.image_search)
        output_directory = f'{os.getcwd()}/{web_directory}'

@@ -172,6 +180,11 @@ def initialize_search(config: FullConfig, regenerate: bool, t: SearchType = None
        # Extract Entries, Generate Music Embeddings
        model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)

+    # Initialize Markdown Search
+    if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
+        # Extract Entries, Generate Markdown Embeddings
+        model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)
+
    # Initialize Ledger Search
    if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
        # Extract Entries, Generate Ledger Embeddings