mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 21:29:08 +00:00
Wire up PDF to jsonl processor to Khoj server layer (API, config)
- Specify PDF content to index via khoj.yml
- Index PDF content on app start and on reconfigure
- Expose PDF as a search type via API
This commit is contained in:
@@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
|
||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils import constants, state
|
||||
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
||||
@@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||
)
|
||||
|
||||
# Initialize PDF Search
|
||||
if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
|
||||
logger.info("💸 Setting up search for pdf")
|
||||
# Extract Entries, Generate PDF Embeddings
|
||||
model.pdf_search = text_search.setup(
|
||||
PdfToJsonl,
|
||||
config.content_type.pdf,
|
||||
search_config=config.search_type.asymmetric,
|
||||
regenerate=regenerate,
|
||||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||
)
|
||||
|
||||
# Initialize Image Search
|
||||
if (t == state.SearchType.Image or t == None) and config.content_type.image:
|
||||
logger.info("🌄 Setting up search for images")
|
||||
|
||||
@@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
|
||||
return "Beancount Files (*.bean *.beancount)"
|
||||
elif search_type == SearchType.Markdown:
|
||||
return "Markdown Files (*.md *.markdown)"
|
||||
elif search_type == SearchType.Pdf:
|
||||
return "Pdf Files (*.pdf)"
|
||||
elif search_type == SearchType.Music:
|
||||
return "Org-Music Files (*.org)"
|
||||
elif search_type == SearchType.Image:
|
||||
|
||||
@@ -109,6 +109,17 @@ def search(
|
||||
with timer("Collating results took", logger):
|
||||
results = text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
|
||||
# query pdf files
|
||||
with timer("Query took", logger):
|
||||
hits, entries = text_search.query(
|
||||
user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
|
||||
)
|
||||
|
||||
# collate and return results
|
||||
with timer("Collating results took", logger):
|
||||
results = text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
|
||||
# query transactions
|
||||
with timer("Query took", logger):
|
||||
|
||||
@@ -22,6 +22,7 @@ class SearchType(str, Enum):
|
||||
Music = "music"
|
||||
Markdown = "markdown"
|
||||
Image = "image"
|
||||
Pdf = "pdf"
|
||||
|
||||
|
||||
class ProcessorType(str, Enum):
|
||||
@@ -61,6 +62,7 @@ class SearchModels:
|
||||
ledger_search: TextSearchModel = None
|
||||
music_search: TextSearchModel = None
|
||||
markdown_search: TextSearchModel = None
|
||||
pdf_search: TextSearchModel = None
|
||||
image_search: ImageSearchModel = None
|
||||
plugin_search: Dict[str, TextSearchModel] = None
|
||||
|
||||
|
||||
@@ -28,6 +28,12 @@ default_config = {
|
||||
"compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
|
||||
},
|
||||
"pdf": {
|
||||
"input-files": None,
|
||||
"input-filter": None,
|
||||
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
|
||||
},
|
||||
"image": {
|
||||
"input-directories": None,
|
||||
"input-filter": None,
|
||||
|
||||
@@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
|
||||
image: Optional[ImageContentConfig]
|
||||
music: Optional[TextContentConfig]
|
||||
markdown: Optional[TextContentConfig]
|
||||
pdf: Optional[TextContentConfig]
|
||||
plugins: Optional[Dict[str, TextContentConfig]]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user