Send more text file types from Desktop app and improve indexing them

- Allow syncing more file types from desktop app to index on server
  - Use `file-type' package to identify valid text file types on Desktop app

- Split plaintext entries into smaller logical units than a whole file
  Since the text splitting upgrades in #645, compiled chunks have more
  logical splits like paragraph, sentence.
  Show those (potentially) smaller snippets to the user as references

- Tangential Fix:
  Initialize unbound currentTime variable for error log timestamp
This commit is contained in:
Debanjum Singh Solanky
2024-04-03 01:49:15 +05:30
parent 89915dcb4c
commit 7ff1bd9f8b
5 changed files with 101 additions and 16 deletions

View File

@@ -1,4 +1,5 @@
const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron'); const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron');
const FileType = require('file-type');
const todesktop = require("@todesktop/runtime"); const todesktop = require("@todesktop/runtime");
const khojPackage = require('./package.json'); const khojPackage = require('./package.json');
@@ -111,22 +112,31 @@ function filenameToMimeType (filename) {
} }
} }
function processDirectory(filesToPush, folder) { async function isPlainTextFile(filePath) {
const fileType = await FileType.fromFile(filePath);
if (!fileType) {
return false;
}
return fileType.mime.startsWith('text/');
}
async function processDirectory(filesToPush, folder) {
const files = fs.readdirSync(folder.path, { withFileTypes: true, recursive: true }); const files = fs.readdirSync(folder.path, { withFileTypes: true, recursive: true });
for (const file of files) { for (const file of files) {
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { const filePath = path.join(folder.path, file.name);
if (file.isFile() && await isPlainTextFile(filePath)) {
console.log(`Add ${file.name} in ${folder.path} for indexing`); console.log(`Add ${file.name} in ${folder.path} for indexing`);
filesToPush.push(path.join(folder.path, file.name)); filesToPush.push(filePath);
} }
if (file.isDirectory()) { if (file.isDirectory()) {
processDirectory(filesToPush, {'path': path.join(folder.path, file.name)}); await processDirectory(filesToPush, {'path': path.join(folder.path, file.name)});
} }
} }
} }
function pushDataToKhoj (regenerate = false) { async function pushDataToKhoj (regenerate = false) {
// Don't sync if token or hostURL is not set or if already syncing // Don't sync if token or hostURL is not set or if already syncing
if (store.get('khojToken') === '' || store.get('hostURL') === '' || syncing === true) { if (store.get('khojToken') === '' || store.get('hostURL') === '' || syncing === true) {
const win = BrowserWindow.getAllWindows()[0]; const win = BrowserWindow.getAllWindows()[0];
@@ -148,7 +158,7 @@ function pushDataToKhoj (regenerate = false) {
// Collect paths of all indexable files in configured folders // Collect paths of all indexable files in configured folders
for (const folder of folders) { for (const folder of folders) {
processDirectory(filesToPush, folder); await processDirectory(filesToPush, folder);
} }
const lastSync = store.get('lastSync') || []; const lastSync = store.get('lastSync') || [];
@@ -222,6 +232,7 @@ function pushDataToKhoj (regenerate = false) {
} else if (error?.code === 'ECONNREFUSED') { } else if (error?.code === 'ECONNREFUSED') {
state["error"] = `Could not connect to Khoj server. Ensure you can connect to it at ${error.address}:${error.port}.`; state["error"] = `Could not connect to Khoj server. Ensure you can connect to it at ${error.address}:${error.port}.`;
} else { } else {
currentTime = new Date();
state["error"] = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`; state["error"] = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`;
} }
}) })
@@ -240,7 +251,7 @@ pushDataToKhoj();
async function handleFileOpen (type) { async function handleFileOpen (type) {
let { canceled, filePaths } = {canceled: true, filePaths: []}; let { canceled, filePaths } = {canceled: true, filePaths: []};
if (type === 'file') { if (type === 'file') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files", extensions: validFileTypes}] })); ({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files" }] }));
} else if (type === 'folder') { } else if (type === 'folder') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openDirectory' ]})); ({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openDirectory' ]}));
} }
@@ -331,7 +342,7 @@ async function removeFolder (event, folderPath) {
async function syncData (regenerate = false) { async function syncData (regenerate = false) {
try { try {
pushDataToKhoj(regenerate); await pushDataToKhoj(regenerate);
const date = new Date(); const date = new Date();
console.log('Pushing data to Khoj at: ', date); console.log('Pushing data to Khoj at: ', date);
} catch (err) { } catch (err) {
@@ -343,7 +354,7 @@ async function deleteAllFiles () {
try { try {
store.set('files', []); store.set('files', []);
store.set('folders', []); store.set('folders', []);
pushDataToKhoj(true); await pushDataToKhoj(true);
const date = new Date(); const date = new Date();
console.log('Pushing data to Khoj at: ', date); console.log('Pushing data to Khoj at: ', date);
} catch (err) { } catch (err) {
@@ -366,9 +377,9 @@ const createWindow = (tab = 'chat.html') => {
} }
}) })
const job = new cron('0 */10 * * * *', function() { const job = new cron('0 */10 * * * *', async function() {
try { try {
pushDataToKhoj(); await pushDataToKhoj();
const date = new Date(); const date = new Date();
console.log('Pushing data to Khoj at: ', date); console.log('Pushing data to Khoj at: ', date);
win.webContents.send('update-state', state); win.webContents.send('update-state', state);

View File

@@ -20,6 +20,7 @@
"axios": "^1.6.4", "axios": "^1.6.4",
"cron": "^2.4.3", "cron": "^2.4.3",
"electron-store": "^8.1.0", "electron-store": "^8.1.0",
"fs": "^0.0.1-security" "fs": "^0.0.1-security",
"file-type": "^16.2.0"
} }
} }

View File

@@ -62,6 +62,11 @@
lodash.once "^4.1.1" lodash.once "^4.1.1"
semver "^7.3.2" semver "^7.3.2"
"@tokenizer/token@^0.3.0":
version "0.3.0"
resolved "https://registry.yarnpkg.com/@tokenizer/token/-/token-0.3.0.tgz#fe98a93fe789247e998c75e74e9c7c63217aa276"
integrity sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==
"@types/cacheable-request@^6.0.1": "@types/cacheable-request@^6.0.1":
version "6.0.3" version "6.0.3"
resolved "https://registry.yarnpkg.com/@types/cacheable-request/-/cacheable-request-6.0.3.tgz#a430b3260466ca7b5ca5bfd735693b36e7a9d183" resolved "https://registry.yarnpkg.com/@types/cacheable-request/-/cacheable-request-6.0.3.tgz#a430b3260466ca7b5ca5bfd735693b36e7a9d183"
@@ -471,6 +476,15 @@ fd-slicer@~1.1.0:
dependencies: dependencies:
pend "~1.2.0" pend "~1.2.0"
file-type@^16.2.0:
version "16.5.4"
resolved "https://registry.yarnpkg.com/file-type/-/file-type-16.5.4.tgz#474fb4f704bee427681f98dd390058a172a6c2fd"
integrity sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw==
dependencies:
readable-web-to-node-stream "^3.0.0"
strtok3 "^6.2.4"
token-types "^4.1.1"
fill-range@^7.0.1: fill-range@^7.0.1:
version "7.0.1" version "7.0.1"
resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40" resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40"
@@ -668,6 +682,11 @@ human-signals@^2.1.0:
resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0"
integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==
ieee754@^1.2.1:
version "1.2.1"
resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.2.1.tgz#8eb7a10a63fff25d15a57b001586d177d1b0d352"
integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==
ignore@^5.2.0: ignore@^5.2.0:
version "5.2.4" version "5.2.4"
resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.2.4.tgz#a291c0c6178ff1b960befe47fcdec301674a6324" resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.2.4.tgz#a291c0c6178ff1b960befe47fcdec301674a6324"
@@ -686,7 +705,7 @@ inflight@^1.0.4:
once "^1.3.0" once "^1.3.0"
wrappy "1" wrappy "1"
inherits@2: inherits@2, inherits@^2.0.3:
version "2.0.4" version "2.0.4"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c" resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
@@ -979,6 +998,11 @@ path-type@^4.0.0:
resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==
peek-readable@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/peek-readable/-/peek-readable-4.1.0.tgz#4ece1111bf5c2ad8867c314c81356847e8a62e72"
integrity sha512-ZI3LnwUv5nOGbQzD9c2iDG6toheuXSZP5esSHBjopsXH4dg19soufvpUGA3uohi5anFtGb2lhAVdHzH6R/Evvg==
pend@~1.2.0: pend@~1.2.0:
version "1.2.0" version "1.2.0"
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50" resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
@@ -1029,6 +1053,22 @@ quick-lru@^5.1.1:
resolved "https://registry.yarnpkg.com/quick-lru/-/quick-lru-5.1.1.tgz#366493e6b3e42a3a6885e2e99d18f80fb7a8c932" resolved "https://registry.yarnpkg.com/quick-lru/-/quick-lru-5.1.1.tgz#366493e6b3e42a3a6885e2e99d18f80fb7a8c932"
integrity sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA== integrity sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==
readable-stream@^3.6.0:
version "3.6.2"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.2.tgz#56a9b36ea965c00c5a93ef31eb111a0f11056967"
integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==
dependencies:
inherits "^2.0.3"
string_decoder "^1.1.1"
util-deprecate "^1.0.1"
readable-web-to-node-stream@^3.0.0:
version "3.0.2"
resolved "https://registry.yarnpkg.com/readable-web-to-node-stream/-/readable-web-to-node-stream-3.0.2.tgz#5d52bb5df7b54861fd48d015e93a2cb87b3ee0bb"
integrity sha512-ePeK6cc1EcKLEhJFt/AebMCLL+GgSKhuygrZ/GLaKZYEecIgIECf4UaUuaByiGtzckwR4ain9VzUh95T1exYGw==
dependencies:
readable-stream "^3.6.0"
require-from-string@^2.0.2: require-from-string@^2.0.2:
version "2.0.2" version "2.0.2"
resolved "https://registry.yarnpkg.com/require-from-string/-/require-from-string-2.0.2.tgz#89a7fdd938261267318eafe14f9c32e598c36909" resolved "https://registry.yarnpkg.com/require-from-string/-/require-from-string-2.0.2.tgz#89a7fdd938261267318eafe14f9c32e598c36909"
@@ -1077,6 +1117,11 @@ run-parallel@^1.1.9:
dependencies: dependencies:
queue-microtask "^1.2.2" queue-microtask "^1.2.2"
safe-buffer@~5.2.0:
version "5.2.1"
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6"
integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
sax@^1.2.4: sax@^1.2.4:
version "1.2.4" version "1.2.4"
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"
@@ -1133,11 +1178,26 @@ sprintf-js@^1.1.2:
resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.2.tgz#da1765262bf8c0f571749f2ad6c26300207ae673" resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.2.tgz#da1765262bf8c0f571749f2ad6c26300207ae673"
integrity sha512-VE0SOVEHCk7Qc8ulkWw3ntAzXuqf7S2lvwQaDLRnUeIEaKNQJzV6BwmLKhOqT61aGhfUMrXeaBk+oDGCzvhcug== integrity sha512-VE0SOVEHCk7Qc8ulkWw3ntAzXuqf7S2lvwQaDLRnUeIEaKNQJzV6BwmLKhOqT61aGhfUMrXeaBk+oDGCzvhcug==
string_decoder@^1.1.1:
version "1.3.0"
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e"
integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
dependencies:
safe-buffer "~5.2.0"
strip-final-newline@^2.0.0: strip-final-newline@^2.0.0:
version "2.0.0" version "2.0.0"
resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad" resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad"
integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA== integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==
strtok3@^6.2.4:
version "6.3.0"
resolved "https://registry.yarnpkg.com/strtok3/-/strtok3-6.3.0.tgz#358b80ffe6d5d5620e19a073aa78ce947a90f9a0"
integrity sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==
dependencies:
"@tokenizer/token" "^0.3.0"
peek-readable "^4.1.0"
sumchecker@^3.0.1: sumchecker@^3.0.1:
version "3.0.1" version "3.0.1"
resolved "https://registry.yarnpkg.com/sumchecker/-/sumchecker-3.0.1.tgz#6377e996795abb0b6d348e9b3e1dfb24345a8e42" resolved "https://registry.yarnpkg.com/sumchecker/-/sumchecker-3.0.1.tgz#6377e996795abb0b6d348e9b3e1dfb24345a8e42"
@@ -1152,6 +1212,14 @@ to-regex-range@^5.0.1:
dependencies: dependencies:
is-number "^7.0.0" is-number "^7.0.0"
token-types@^4.1.1:
version "4.2.1"
resolved "https://registry.yarnpkg.com/token-types/-/token-types-4.2.1.tgz#0f897f03665846982806e138977dbe72d44df753"
integrity sha512-6udB24Q737UD/SDsKAHI9FCRP7Bqc9D/MQUV02ORQg5iskjtLJlZJNdN4kKtcdtwCeWIwIHDGaUsTsCCAa8sFQ==
dependencies:
"@tokenizer/token" "^0.3.0"
ieee754 "^1.2.1"
type-fest@^0.13.1: type-fest@^0.13.1:
version "0.13.1" version "0.13.1"
resolved "https://registry.yarnpkg.com/type-fest/-/type-fest-0.13.1.tgz#0172cb5bce80b0bd542ea348db50c7e21834d934" resolved "https://registry.yarnpkg.com/type-fest/-/type-fest-0.13.1.tgz#0172cb5bce80b0bd542ea348db50c7e21834d934"
@@ -1179,6 +1247,11 @@ uri-js@^4.2.2:
dependencies: dependencies:
punycode "^2.1.0" punycode "^2.1.0"
util-deprecate@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==
which@^2.0.1: which@^2.0.1:
version "2.0.2" version "2.0.2"
resolved "https://registry.yarnpkg.com/which/-/which-2.0.2.tgz#7c6a8dd0a636a0327e10b59c9286eee93f3f51b1" resolved "https://registry.yarnpkg.com/which/-/which-2.0.2.tgz#7c6a8dd0a636a0327e10b59c9286eee93f3f51b1"

View File

@@ -47,7 +47,7 @@ class PlaintextToEntries(TextToEntries):
# Split entries by max tokens supported by model # Split entries by max tokens supported by model
with timer("Split entries by max token size supported by model", logger): with timer("Split entries by max token size supported by model", logger):
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256, raw_is_compiled=True)
# Identify, mark and merge any new entries with previous entries # Identify, mark and merge any new entries with previous entries
with timer("Identify new or updated entries", logger): with timer("Identify new or updated entries", logger):

View File

@@ -59,7 +59,7 @@ class TextToEntries(ABC):
@staticmethod @staticmethod
def split_entries_by_max_tokens( def split_entries_by_max_tokens(
entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500 entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500, raw_is_compiled: bool = False
) -> List[Entry]: ) -> List[Entry]:
"Split entries if compiled entry length exceeds the max tokens supported by the ML model." "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
chunked_entries: List[Entry] = [] chunked_entries: List[Entry] = []
@@ -94,7 +94,7 @@ class TextToEntries(ABC):
# Clean entry of unwanted characters like \0 character # Clean entry of unwanted characters like \0 character
compiled_entry_chunk = TextToEntries.clean_field(compiled_entry_chunk) compiled_entry_chunk = TextToEntries.clean_field(compiled_entry_chunk)
entry.raw = TextToEntries.clean_field(entry.raw) entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw)
entry.heading = TextToEntries.clean_field(entry.heading) entry.heading = TextToEntries.clean_field(entry.heading)
entry.file = TextToEntries.clean_field(entry.file) entry.file = TextToEntries.clean_field(entry.file)