mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-04 21:29:12 +00:00
Accept file deletion requests by clients during sync
- Remove the unused full_corpus boolean. The full_corpus=False code path wasn't being used (except in a test).
- The full_corpus=True code path, which was the one in use, ignored file deletion requests sent by clients during sync. It is unclear why this was done.
- Add a unit test to prevent regressions and to show that file deletion requests sent by clients during sync are no longer ignored.
This commit is contained in:
@@ -19,16 +19,11 @@ class DocxToEntries(TextToEntries):
|
||||
super().__init__()
|
||||
|
||||
# Define Functions
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
# Extract required fields from config
|
||||
if not full_corpus:
|
||||
deletion_file_names = set([file for file in files if files[file] == b""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
else:
|
||||
deletion_file_names = None
|
||||
deletion_file_names = set([file for file in files if files[file] == b""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
|
||||
# Extract Entries from specified Docx files
|
||||
with timer("Extract entries from specified DOCX files", logger):
|
||||
|
||||
@@ -48,9 +48,7 @@ class GithubToEntries(TextToEntries):
|
||||
else:
|
||||
return
|
||||
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
if self.config.pat_token is None or self.config.pat_token == "":
|
||||
logger.error(f"Github PAT token is not set. Skipping github content")
|
||||
raise ValueError("Github PAT token is not set. Skipping github content")
|
||||
|
||||
@@ -20,16 +20,11 @@ class ImageToEntries(TextToEntries):
|
||||
super().__init__()
|
||||
|
||||
# Define Functions
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
# Extract required fields from config
|
||||
if not full_corpus:
|
||||
deletion_file_names = set([file for file in files if files[file] == b""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
else:
|
||||
deletion_file_names = None
|
||||
deletion_file_names = set([file for file in files if files[file] == b""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
|
||||
# Extract Entries from specified image files
|
||||
with timer("Extract entries from specified Image files", logger):
|
||||
|
||||
@@ -19,16 +19,11 @@ class MarkdownToEntries(TextToEntries):
|
||||
super().__init__()
|
||||
|
||||
# Define Functions
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
# Extract required fields from config
|
||||
if not full_corpus:
|
||||
deletion_file_names = set([file for file in files if files[file] == ""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
else:
|
||||
deletion_file_names = None
|
||||
deletion_file_names = set([file for file in files if files[file] == ""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
|
||||
max_tokens = 256
|
||||
# Extract Entries from specified Markdown files
|
||||
|
||||
@@ -78,9 +78,7 @@ class NotionToEntries(TextToEntries):
|
||||
|
||||
self.body_params = {"page_size": 100}
|
||||
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
current_entries = []
|
||||
|
||||
# Get all pages
|
||||
|
||||
@@ -20,15 +20,10 @@ class OrgToEntries(TextToEntries):
|
||||
super().__init__()
|
||||
|
||||
# Define Functions
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
if not full_corpus:
|
||||
deletion_file_names = set([file for file in files if files[file] == ""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
else:
|
||||
deletion_file_names = None
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
deletion_file_names = set([file for file in files if files[file] == ""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
|
||||
# Extract Entries from specified Org files
|
||||
max_tokens = 256
|
||||
|
||||
@@ -22,16 +22,11 @@ class PdfToEntries(TextToEntries):
|
||||
super().__init__()
|
||||
|
||||
# Define Functions
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
# Extract required fields from config
|
||||
if not full_corpus:
|
||||
deletion_file_names = set([file for file in files if files[file] == b""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
else:
|
||||
deletion_file_names = None
|
||||
deletion_file_names = set([file for file in files if files[file] == b""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
|
||||
# Extract Entries from specified Pdf files
|
||||
with timer("Extract entries from specified PDF files", logger):
|
||||
|
||||
@@ -20,15 +20,10 @@ class PlaintextToEntries(TextToEntries):
|
||||
super().__init__()
|
||||
|
||||
# Define Functions
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
if not full_corpus:
|
||||
deletion_file_names = set([file for file in files if files[file] == ""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
else:
|
||||
deletion_file_names = None
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
deletion_file_names = set([file for file in files if files[file] == ""])
|
||||
files_to_process = set(files) - deletion_file_names
|
||||
files = {file: files[file] for file in files_to_process}
|
||||
|
||||
# Extract Entries from specified plaintext files
|
||||
with timer("Extract entries from specified Plaintext files", logger):
|
||||
|
||||
@@ -31,9 +31,7 @@ class TextToEntries(ABC):
|
||||
self.date_filter = DateFilter()
|
||||
|
||||
@abstractmethod
|
||||
def process(
|
||||
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
||||
) -> Tuple[int, int]:
|
||||
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
||||
...
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -1313,7 +1313,6 @@ def configure_content(
|
||||
files: Optional[dict[str, dict[str, str]]],
|
||||
regenerate: bool = False,
|
||||
t: Optional[state.SearchType] = state.SearchType.All,
|
||||
full_corpus: bool = True,
|
||||
user: KhojUser = None,
|
||||
) -> bool:
|
||||
success = True
|
||||
@@ -1344,7 +1343,6 @@ def configure_content(
|
||||
OrgToEntries,
|
||||
files.get("org"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -1362,7 +1360,6 @@ def configure_content(
|
||||
MarkdownToEntries,
|
||||
files.get("markdown"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
)
|
||||
|
||||
@@ -1379,7 +1376,6 @@ def configure_content(
|
||||
PdfToEntries,
|
||||
files.get("pdf"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
)
|
||||
|
||||
@@ -1398,7 +1394,6 @@ def configure_content(
|
||||
PlaintextToEntries,
|
||||
files.get("plaintext"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
)
|
||||
|
||||
@@ -1418,7 +1413,6 @@ def configure_content(
|
||||
GithubToEntries,
|
||||
None,
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
config=github_config,
|
||||
)
|
||||
@@ -1439,7 +1433,6 @@ def configure_content(
|
||||
NotionToEntries,
|
||||
None,
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
config=notion_config,
|
||||
)
|
||||
@@ -1459,7 +1452,6 @@ def configure_content(
|
||||
ImageToEntries,
|
||||
files.get("image"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -1472,7 +1464,6 @@ def configure_content(
|
||||
DocxToEntries,
|
||||
files.get("docx"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
)
|
||||
except Exception as e:
|
||||
|
||||
@@ -80,6 +80,6 @@ async def notion_auth_callback(request: Request, background_tasks: BackgroundTas
|
||||
notion_redirect = str(request.app.url_path_for("notion_config_page"))
|
||||
|
||||
# Trigger an async job to configure_content. Let it run without blocking the response.
|
||||
background_tasks.add_task(run_in_executor, configure_content, {}, False, SearchType.Notion, True, user)
|
||||
background_tasks.add_task(run_in_executor, configure_content, {}, False, SearchType.Notion, user)
|
||||
|
||||
return RedirectResponse(notion_redirect)
|
||||
|
||||
@@ -199,17 +199,16 @@ def setup(
|
||||
text_to_entries: Type[TextToEntries],
|
||||
files: dict[str, str],
|
||||
regenerate: bool,
|
||||
full_corpus: bool = True,
|
||||
user: KhojUser = None,
|
||||
config=None,
|
||||
) -> None:
|
||||
) -> Tuple[int, int]:
|
||||
if config:
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process(
|
||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||
files=files, user=user, regenerate=regenerate
|
||||
)
|
||||
else:
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_entries().process(
|
||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||
files=files, user=user, regenerate=regenerate
|
||||
)
|
||||
|
||||
if files:
|
||||
@@ -219,6 +218,8 @@ def setup(
|
||||
f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names[:10]} ..."
|
||||
)
|
||||
|
||||
return num_new_embeddings, num_deleted_embeddings
|
||||
|
||||
|
||||
def cross_encoder_score(query: str, hits: List[SearchResponse], search_model_name: str) -> List[SearchResponse]:
|
||||
"""Score all retrieved entries using the cross-encoder"""
|
||||
|
||||
@@ -122,7 +122,7 @@ def get_org_files(config: TextContentConfig):
|
||||
logger.debug("At least one of org-files or org-file-filter is required to be specified")
|
||||
return {}
|
||||
|
||||
"Get Org files to process"
|
||||
# Get Org files to process
|
||||
absolute_org_files, filtered_org_files = set(), set()
|
||||
if org_files:
|
||||
absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
|
||||
|
||||
Reference in New Issue
Block a user