Drop Support for Org Music, Ledger Content Types

Removing unused content types reduces the amount of Khoj code that has to be maintained.

- 0f993b3 Drop support for Ledger as a separate content type
   Khoj will soon get a generic plain-text content type (see "Index plain text files", #237).
   That, combined with a file filter, should suffice for searching through Ledger transactions.

- c9db532 Remove unused org-music as an indexable content type from Khoj
   Org-music was just a custom content type that worked with org-music.
   It was mostly only useful for me.
This commit is contained in:
Debanjum
2023-07-02 17:48:29 -07:00
committed by GitHub
19 changed files with 22 additions and 697 deletions

View File

@@ -4,7 +4,7 @@
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
;; Description: An AI personal assistant for your digital brain
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, beancount, image
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
;; Version: 0.7.0
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs
@@ -29,8 +29,7 @@
;;; Commentary:
;; Create an AI personal assistant for your `org-mode', `markdown' notes,
;; `beancount' transactions, PDFs and images. This package exposes
;; two assistance modes, search and chat:
;; PDFs and images. The assistant exposes 2 modes, search and chat:
;;
;; Chat provides faster answers, iterative discovery and assisted
;; creativity. It requires your OpenAI API key to access GPT models
@@ -93,10 +92,8 @@
:group 'khoj
:type '(choice (const "org")
(const "markdown")
(const "ledger")
(const "image")
(const "pdf")
(const "music")))
(const "pdf")))
;; --------------------------
@@ -120,9 +117,7 @@
(declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
(declare-function org-element-type "org-mode" (ELEMENT))
(declare-function beancount-mode "beancount" ())
(declare-function markdown-mode "markdown-mode" ())
(declare-function org-music-mode "org-music" ())
(declare-function which-key--show-keymap "which-key" (KEYMAP-NAME KEYMAP &optional PRIOR-ARGS ALL
NO-PAGING FILTER))
@@ -137,22 +132,16 @@ NO-PAGING FILTER))
"C-x m | markdown\n")
(when (member 'org enabled-content-types)
"C-x o | org-mode\n")
(when (member 'ledger enabled-content-types)
"C-x l | ledger\n")
(when (member 'image enabled-content-types)
"C-x i | image\n")
(when (member 'pdf enabled-content-types)
"C-x p | pdf\n")
(when (member 'music enabled-content-types)
"C-x M | music\n"))))
"C-x p | pdf\n"))))
(defvar khoj--rerank nil "Track when re-rank of results triggered.")
(defvar khoj--reference-count 0 "Track number of references currently in chat buffer.")
;; Interactive commands bound in the search keymap.  Each content-type
;; command only records the chosen type in `khoj--content-type'.
(defun khoj--search-markdown ()
  "Select `markdown' as the content-type to search."
  (interactive)
  (setq khoj--content-type "markdown"))

(defun khoj--search-org ()
  "Select `org-mode' as the content-type to search."
  (interactive)
  (setq khoj--content-type "org"))

(defun khoj--search-ledger ()
  "Select `ledger' as the content-type to search."
  (interactive)
  (setq khoj--content-type "ledger"))

(defun khoj--search-images ()
  "Select image as the content-type to search."
  (interactive)
  (setq khoj--content-type "image"))

(defun khoj--search-music ()
  "Select music as the content-type to search."
  (interactive)
  (setq khoj--content-type "music"))

(defun khoj--search-pdf ()
  "Select pdf as the content-type to search."
  (interactive)
  (setq khoj--content-type "pdf"))

(defun khoj--improve-rank ()
  "Re-run the incremental search with cross-encoder re-ranking enabled."
  (interactive)
  (khoj--incremental-search t))
(defun khoj--make-search-keymap (&optional existing-keymap)
@@ -164,14 +153,10 @@ NO-PAGING FILTER))
(define-key kmap (kbd "C-x m") #'khoj--search-markdown))
(when (member 'org enabled-content-types)
(define-key kmap (kbd "C-x o") #'khoj--search-org))
(when (member 'ledger enabled-content-types)
(define-key kmap (kbd "C-x l") #'khoj--search-ledger))
(when (member 'image enabled-content-types)
(define-key kmap (kbd "C-x i") #'khoj--search-images))
(when (member 'pdf enabled-content-types)
(define-key kmap (kbd "C-x p") #'khoj--search-pdf))
(when (member 'music enabled-content-types)
(define-key kmap (kbd "C-x M") #'khoj--search-music))
kmap))
(defvar khoj--keymap nil "Track Khoj keymap in this variable.")
@@ -538,18 +523,6 @@ CONFIG is json obtained from Khoj config API."
;; remove leading (, ) or SPC from extracted entries string
(replace-regexp-in-string "^[\(\) ]" "")))
(defun khoj--extract-entries-as-ledger (json-response query)
"Convert JSON-RESPONSE, QUERY from API to ledger entries.
The `entry' value of each element of JSON-RESPONSE is rendered followed by
a blank line.  QUERY is rendered as a `;;' comment heading above the entries.
Return the rendered results as a single string."
;; `thread-last' feeds each form's result in as the LAST argument of the next form.
(thread-last json-response
;; extract and render entries from API response
(mapcar (lambda (args) (format "%s\n\n" (cdr (assoc 'entry args)))))
;; Set query as heading in rendered results buffer
;; NOTE: the whole list of entry strings is passed to the second %s, so it is
;; printed in list form, i.e. wrapped in parentheses with the elements
;; separated by a space.  The two replacements below clean that up.
(format ";; %s\n\n%s\n" query)
;; remove leading (, ) or SPC from extracted entries string
;; (one such character at the start of each line)
(replace-regexp-in-string "^[\(\) ]" "")
;; remove trailing (, ) or SPC from extracted entries string
;; (one such character at the end of each line)
(replace-regexp-in-string "[\(\) ]$" "")))
(defun khoj--extract-entries-as-pdf (json-response query)
"Convert QUERY, JSON-RESPONSE from API with PDF results to `org-mode' entries."
(thread-last
@@ -621,8 +594,6 @@ CONFIG is json obtained from Khoj config API."
(let ((enabled-content-types (khoj--get-enabled-content-types))
(file-extension (file-name-extension buffer-name)))
(cond
((and (member 'music enabled-content-types) (equal buffer-name "Music.org")) "music")
((and (member 'ledger enabled-content-types) (or (equal file-extension "bean") (equal file-extension "beancount"))) "ledger")
((and (member 'org enabled-content-types) (equal file-extension "org")) "org")
((and (member 'org enabled-content-types) (equal file-extension "pdf")) "pdf")
((and (member 'markdown enabled-content-types) (or (equal file-extension "markdown") (equal file-extension "md"))) "markdown")
@@ -678,10 +649,9 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE."
(json-response (json-parse-buffer :object-type 'alist)))
(erase-buffer)
(insert
(cond ((or (equal content-type "org") (equal content-type "music")) (khoj--extract-entries-as-org json-response query))
(cond ((equal content-type "org") (khoj--extract-entries-as-org json-response query))
((equal content-type "markdown") (khoj--extract-entries-as-markdown json-response query))
((equal content-type "pdf") (khoj--extract-entries-as-pdf json-response query))
((equal content-type "ledger") (khoj--extract-entries-as-ledger json-response query))
((equal content-type "image") (khoj--extract-entries-as-images json-response query))
(t (khoj--extract-entries json-response query))))
(cond ((or (equal content-type "all")
@@ -696,9 +666,6 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE."
(org-set-startup-visibility)))
((equal content-type "markdown") (progn (markdown-mode)
(visual-line-mode)))
((equal content-type "ledger") (beancount-mode))
((equal content-type "music") (progn (org-mode)
(org-music-mode)))
((equal content-type "image") (progn (shr-render-region (point-min) (point-max))
(goto-char (point-min))))
(t (fundamental-mode))))
@@ -920,7 +887,7 @@ RECEIVE-DATE is the message receive date."
(remove-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search))
(defun khoj-incremental ()
"Natural, Incremental Search for your personal notes, transactions and music."
"Natural, Incremental Search for your personal notes and documents."
(interactive)
(let* ((khoj-buffer-name (get-buffer-create khoj--search-buffer-name)))
;; switch to khoj results buffer
@@ -1014,7 +981,7 @@ Paragraph only starts at first text after blank line."
;; set content type to: last used > based on current buffer > default type
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
;; dynamically set choices to content types enabled on khoj backend
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "ledger" "music" "image")))
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image")))
(transient-define-suffix khoj--search-command (&optional args)
(interactive (list (transient-args transient-current-command)))
@@ -1074,7 +1041,7 @@ Paragraph only starts at first text after blank line."
;;;###autoload
(defun khoj ()
"Provide natural, search assistance for your notes, transactions and images."
"Provide natural, search assistance for your notes, documents and images."
(interactive)
(when khoj-auto-setup
(khoj-setup t))

View File

@@ -112,46 +112,6 @@ Rule everything\n\
\n"))))
(ert-deftest khoj-tests--extract-entries-as-ledger ()
"Test `json-response', `query' from API formatted as beancount ledger."
;; Fixture: a JSON array with two search results, each carrying an `entry'
;; (the transaction text), a `score' and `additional' metadata.
;; NOTE(review): in the second result the `entry' date is 14242-04-01 while
;; its `compiled' date is 4242-04-01.  Only `entry' appears in the expected
;; output below, so the mismatch does not affect this test -- confirm whether
;; it is intentional.
(let ((user-query "Become God")
(json-response-from-khoj-backend
(json-read-from-string
"[\
{\
\"entry\": \"4242-04-01 * \\\"Penance Center\\\" \\\"Book Stay for 10,000 Years\\\"\\n Expenses:Health:Mental 15 GOLD\\n Assets:Commodities:Gold\",\
\"score\": \"0.42\",\
\"additional\": {\
\"file\": \"/home/ravan/ledger.beancount\",\
\"compiled\": \"4242-04-01 * \\\"Penance Center\\\" \\\"Book Stay for 10,000 Years\\\" Expenses:Health:Mental 15 GOLD Assets:Commodities:Gold\"\
}\
},\
{\
\"entry\": \"14242-04-01 * \\\"Brahma\\\" \\\"Boon for Invincibility from Higher Beings\\\"\\n Income:Health -1,00,00,000 LIFE\\n Assets:Commodities:Life\",\
\"score\": \"0.42\",\
\"additional\": {\
\"file\": \"/home/ravan/ledger.beancount\",\
\"compiled\": \"4242-04-01 * \\\"Brahma\\\" \\\"Boon for Invincibility from Higher Beings\\\" Income:Health -1,00,00,000 LIFE Assets:Commodities:Life\"\
}\
}]\
")))
;; Expected rendering: the query as a `;;' heading, then each `entry'
;; followed by a blank line.
(should
(equal
(khoj--extract-entries-as-ledger json-response-from-khoj-backend user-query)
";; Become God\n\
\n\
4242-04-01 * \"Penance Center\" \"Book Stay for 10,000 Years\"\n\
Expenses:Health:Mental 15 GOLD\n\
Assets:Commodities:Gold\n\
\n\
14242-04-01 * \"Brahma\" \"Boon for Invincibility from Higher Beings\"\n\
Income:Health -1,00,00,000 LIFE\n\
Assets:Commodities:Life\n\
\n\
\n\
"))))
;; -------------------------------------
;; Test Helpers for Find Similar Feature

View File

@@ -12,7 +12,6 @@ from fastapi.staticfiles import StaticFiles
# Internal Packages
from khoj.processor.conversation.gpt import summarize
from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
@@ -106,18 +105,6 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
# Initialize Org Music Search
if (t == state.SearchType.Music or t == None) and config.content_type.music and config.search_type.asymmetric:
logger.info("🎺 Setting up search for org-music")
# Extract Entries, Generate Music Embeddings
model.music_search = text_search.setup(
OrgToJsonl,
config.content_type.music,
search_config=config.search_type.asymmetric,
regenerate=regenerate,
filters=[DateFilter(), WordFilter()],
)
# Initialize Markdown Search
if (
(t == state.SearchType.Markdown or t == None)
@@ -134,18 +121,6 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
# Initialize Ledger Search
if (t == state.SearchType.Ledger or t == None) and config.content_type.ledger and config.search_type.symmetric:
logger.info("💸 Setting up search for ledger")
# Extract Entries, Generate Ledger Embeddings
model.ledger_search = text_search.setup(
BeancountToJsonl,
config.content_type.ledger,
search_config=config.search_type.symmetric,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
# Initialize PDF Search
if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf and config.search_type.asymmetric:
logger.info("🖨️ Setting up search for pdf")

View File

@@ -47,12 +47,6 @@
}).join("\n");
}
/**
 * Render ledger search results as HTML.
 *
 * Ledger entries are plain text, so HTML-significant characters are escaped
 * before interpolation. Otherwise an entry containing `<`, `&` or quotes
 * would be parsed as markup instead of being displayed literally.
 *
 * @param {string} query - The search query (unused; kept so the signature matches the other renderers' call sites).
 * @param {Array<{entry: *}>} data - Search results; each item's `entry` is rendered.
 * @returns {string} One `<div class="results-ledger">` per result, joined by newlines.
 */
function render_ledger(query, data) {
    // Map each HTML-significant character to its entity.
    const html_entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
    const escape_html = function (text) {
        // String() mirrors the implicit conversion template literals perform.
        return String(text).replace(/[&<>"']/g, function (character) {
            return html_entities[character];
        });
    };

    return data.map(function (item) {
        return `<div class="results-ledger">` + `<p>${escape_html(item.entry)}</p>` + `</div>`;
    }).join("\n");
}
function render_pdf(query, data) {
return data.map(function (item) {
let compiled_lines = item.additional.compiled.split("\n");
@@ -88,12 +82,8 @@
results = render_markdown(query, data);
} else if (type === "org") {
results = render_org(query, data, "org-");
} else if (type === "music") {
results = render_org(query, data, "music-");
} else if (type === "image") {
results = data.map(render_image).join('');
} else if (type === "ledger") {
results = render_ledger(query, data);
} else if (type === "pdf") {
results = render_pdf(query, data);
} else if (type === "github" || type === "all") {
@@ -362,8 +352,7 @@
white-space: pre-wrap;
}
.results-pdf,
.results-plugin,
.results-ledger {
.results-plugin {
text-align: left;
white-space: pre-line;
}
@@ -371,17 +360,14 @@
.results-github {
text-align: left;
}
.results-music,
.results-org {
text-align: left;
white-space: pre-line;
}
.results-music h3,
.results-org h3 {
margin: 20px 0 0 0;
font-size: larger;
}
span.music-task-status,
span.org-task-status {
color: white;
padding: 3.5px 3.5px 0;
@@ -390,15 +376,12 @@
background-color: #eab308;
font-size: medium;
}
span.music-task-status.todo,
span.org-task-status.todo {
background-color: #3b82f6
}
span.music-task-status.done,
span.org-task-status.done {
background-color: #22c55e;
}
span.music-task-tag,
span.org-task-tag {
color: white;
padding: 3.5px 3.5px 0;

View File

@@ -143,23 +143,15 @@ search_type = """
Objective: Extract search type from user query and return information as JSON
Allowed search types are listed below:
- search-type=["notes","ledger","image","music", "pdf"]
- search-type=["notes", "image", "pdf"]
Some examples are given below for reference:
Q:What fiction book was I reading last week about AI starship?
A:{ "search-type": "notes" }
Q: What did the lease say about early termination
A: { "search-type": "pdf" }
Q:Play some calm classical music?
A:{ "search-type": "music" }
Q:How much did I spend at Subway for dinner last time?
A:{ "search-type": "ledger" }
Q:What was that popular Sri lankan song that Alex had mentioned?
A:{ "search-type": "music" }
Q:Can you recommend a movie to watch from my notes?
A:{ "search-type": "notes" }
Q:When did I buy Groceries last?
A:{ "search-type": "ledger" }
Q:When did I go surfing last?
A:{ "search-type": "notes" }
Q:"""

View File

@@ -1,133 +0,0 @@
# Standard Packages
import glob
import re
import logging
from typing import List
# Internal Packages
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.constants import empty_escape_sequences
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.rawconfig import Entry
logger = logging.getLogger(__name__)
class BeancountToJsonl(TextToJsonl):
    """Convert Beancount ledger files into entries serialized as JSONL.

    Transactions are extracted from the configured Beancount files, wrapped
    in `Entry` objects and written to the configured (optionally gzipped)
    JSONL output file.
    """

    # Define Functions
    def process(self, previous_entries=None):
        """Extract transactions from the configured files and write them as JSONL.

        Args:
            previous_entries: Entries from an earlier run. When falsy, every
                current entry is simply numbered; otherwise the current
                entries are merged with the previous ones via
                `TextToJsonl.mark_entries_for_update`.

        Returns:
            The `entries_with_ids` sequence. Its elements are indexed as
            pairs whose second item is the entry.

        Raises:
            SystemExit: with exit code 1 when neither input files nor an
                input filter are configured.
        """
        # Extract required fields from config
        beancount_files, beancount_file_filter, output_file = (
            self.config.input_files,
            self.config.input_filter,
            self.config.compressed_jsonl,
        )

        # Input Validation
        if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter):
            print("At least one of beancount-files or beancount-file-filter is required to be specified")
            # Raise SystemExit directly instead of calling `exit`, which is
            # only injected by the `site` module and may not be available.
            raise SystemExit(1)

        # Get Beancount Files to Process
        beancount_files = BeancountToJsonl.get_beancount_files(beancount_files, beancount_file_filter)

        # Extract Entries from specified Beancount files
        with timer("Parse transactions from Beancount files into dictionaries", logger):
            current_entries = BeancountToJsonl.convert_transactions_to_maps(
                *BeancountToJsonl.extract_beancount_transactions(beancount_files)
            )

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated transaction", logger):
            if not previous_entries:
                entries_with_ids = list(enumerate(current_entries))
            else:
                entries_with_ids = TextToJsonl.mark_entries_for_update(
                    current_entries, previous_entries, key="compiled", logger=logger
                )

        with timer("Write transactions to JSONL file", logger):
            # Process Each Entry from All Notes Files
            entries = [entry_with_id[1] for entry_with_id in entries_with_ids]
            jsonl_data = BeancountToJsonl.convert_transaction_maps_to_jsonl(entries)

            # Compress JSONL formatted Data
            # NOTE: nothing is written when the suffix is neither ".gz" nor ".jsonl".
            if output_file.suffix == ".gz":
                compress_jsonl_data(jsonl_data, output_file)
            elif output_file.suffix == ".jsonl":
                dump_jsonl(jsonl_data, output_file)

        return entries_with_ids

    @staticmethod
    def get_beancount_files(beancount_files=None, beancount_file_filters=None):
        """Get Beancount files to process.

        Args:
            beancount_files: Optional iterable of explicit file paths.
            beancount_file_filters: Optional iterable of glob patterns,
                expanded recursively.

        Returns:
            Sorted list of the de-duplicated union of both sources.
        """
        absolute_beancount_files, filtered_beancount_files = set(), set()
        if beancount_files:
            absolute_beancount_files = {get_absolute_path(beancount_file) for beancount_file in beancount_files}
        if beancount_file_filters:
            filtered_beancount_files = {
                filtered_file
                for beancount_file_filter in beancount_file_filters
                for filtered_file in glob.glob(get_absolute_path(beancount_file_filter), recursive=True)
            }

        all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files)

        # str.endswith accepts a tuple of suffixes
        files_with_non_beancount_extensions = {
            beancount_file
            for beancount_file in all_beancount_files
            if not beancount_file.endswith((".bean", ".beancount"))
        }
        # Test the set itself; any() would test the truthiness of its elements
        if files_with_non_beancount_extensions:
            print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")

        logger.debug(f"Processing files: {all_beancount_files}")

        return all_beancount_files

    @staticmethod
    def extract_beancount_transactions(beancount_files):
        """Extract entries from specified Beancount files.

        Each file is split on blank (whitespace-only) lines. Chunks that
        begin with a `YYYY-MM-DD` date followed by a `*` or `!` flag are kept
        as transactions.

        Args:
            beancount_files: Iterable of paths to read.

        Returns:
            Tuple `(entries, transaction_to_file_map)`: the transaction
            strings in file order, and a dict from transaction text to the
            file it came from. Because the dict is keyed by transaction text,
            identical transactions appearing in several files map to the last
            file processed.
        """
        # Initialize Regex for extracting Beancount Entries
        # Inside a character class `|` is a literal, not alternation, so the
        # flag set is spelled `[*!]` to avoid also accepting `|` as a flag.
        transaction_regex = r"^\n?\d{4}-\d{2}-\d{2} [*!] "
        # Raw string: same character class (newline, CR, tab, space) without
        # the invalid `\ ` escape or a placeholder-less f-string.
        empty_newline = r"^[\n\r\t ]*$"

        entries = []
        transaction_to_file_map = []
        for beancount_file in beancount_files:
            with open(beancount_file) as f:
                ledger_content = f.read()
                transactions_per_file = [
                    entry.strip(empty_escape_sequences)
                    for entry in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
                    if re.match(transaction_regex, entry)
                ]
                transaction_to_file_map += zip(transactions_per_file, [beancount_file] * len(transactions_per_file))
                entries.extend(transactions_per_file)
        return entries, dict(transaction_to_file_map)

    @staticmethod
    def convert_transactions_to_maps(parsed_entries: List[str], transaction_to_file_map) -> List[Entry]:
        """Convert each parsed Beancount transaction into an Entry.

        Args:
            parsed_entries: Transaction strings.
            transaction_to_file_map: Mapping from transaction text to its
                source file; must contain every item of `parsed_entries`.

        Returns:
            One `Entry` per transaction, with the transaction text used for
            both `compiled` and `raw`.
        """
        entries = [
            Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{transaction_to_file_map[parsed_entry]}")
            for parsed_entry in parsed_entries
        ]

        logger.debug(f"Converted {len(parsed_entries)} transactions to dictionaries")

        return entries

    @staticmethod
    def convert_transaction_maps_to_jsonl(entries: List[Entry]) -> str:
        """Convert each Beancount transaction entry to JSON and collate as JSONL.

        Returns:
            A string with one `entry.to_json()` result per line, each line
            terminated by a newline.
        """
        return "".join([f"{entry.to_json()}\n" for entry in entries])

View File

@@ -171,11 +171,9 @@ async def search(
defiltered_query = filter.defilter(user_query)
encoded_asymmetric_query = None
if t == SearchType.All or (t != SearchType.Ledger and t != SearchType.Image):
if t == SearchType.All or t != SearchType.Image:
text_search_models: List[TextSearchModel] = [
model
for model_name, model in state.model.__dict__.items()
if isinstance(model, TextSearchModel) and model_name != "ledger_search"
model for model in state.model.__dict__.values() if isinstance(model, TextSearchModel)
]
if text_search_models:
with timer("Encoding query took", logger=logger):
@@ -244,33 +242,6 @@ async def search(
)
]
if (t == SearchType.Ledger) and state.model.ledger_search:
# query transactions
search_futures += [
executor.submit(
text_search.query,
user_query,
state.model.ledger_search,
rank_results=r or False,
score_threshold=score_threshold,
dedupe=dedupe or True,
)
]
if (t == SearchType.Music or t == SearchType.All) and state.model.music_search:
# query music library
search_futures += [
executor.submit(
text_search.query,
user_query,
state.model.music_search,
question_embedding=encoded_asymmetric_query,
rank_results=r or False,
score_threshold=score_threshold,
dedupe=dedupe or True,
)
]
if (t == SearchType.Image) and state.model.image_search:
# query images
search_futures += [

View File

@@ -16,7 +16,7 @@ import json
web_client = APIRouter()
templates = Jinja2Templates(directory=constants.web_directory)
VALID_CONTENT_TYPES = ["org", "ledger", "markdown", "pdf"]
VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf"]
# Create Routes
@@ -60,7 +60,7 @@ if not state.demo:
@web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse)
def content_config_page(request: Request, content_type: str):
if content_type not in VALID_CONTENT_TYPES:
if content_type not in VALID_TEXT_CONTENT_TYPES:
return templates.TemplateResponse("config.html", context={"request": request})
default_copy = constants.default_config.copy()

View File

@@ -19,8 +19,6 @@ if TYPE_CHECKING:
class SearchType(str, Enum):
All = "all"
Org = "org"
Ledger = "ledger"
Music = "music"
Markdown = "markdown"
Image = "image"
Pdf = "pdf"
@@ -61,8 +59,6 @@ class ImageSearchModel:
@dataclass
class SearchModels:
org_search: TextSearchModel = None
ledger_search: TextSearchModel = None
music_search: TextSearchModel = None
markdown_search: TextSearchModel = None
pdf_search: TextSearchModel = None
image_search: ImageSearchModel = None

View File

@@ -22,12 +22,6 @@ default_config = {
"compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz",
"embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt",
},
"ledger": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
"embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
},
"pdf": {
"input-files": None,
"input-filter": None,
@@ -41,12 +35,6 @@ default_config = {
"batch-size": 50,
"use-xmp-metadata": False,
},
"music": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/music/music.jsonl.gz",
"embeddings-file": "~/.khoj/content/music/music_embeddings.pt",
},
"github": {
"pat-token": None,
"repos": [],

View File

@@ -72,9 +72,7 @@ class ImageContentConfig(ConfigBase):
class ContentConfig(ConfigBase):
org: Optional[TextContentConfig]
ledger: Optional[TextContentConfig]
image: Optional[ImageContentConfig]
music: Optional[TextContentConfig]
markdown: Optional[TextContentConfig]
pdf: Optional[TextContentConfig]
github: Optional[GithubContentConfig]