mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-04 21:29:12 +00:00
Use Magika to more robustly identify text files to send for indexing
- `file-type' doesn't handle mis-labelled files or files without extensions well - Only show supported file types in file selector dialog on Desktop app Use Magika to get list of text file extensions. Combine with other supported extensions to get complete list of supported file extensions. Use it to limit selectable files in the File Open dialog. Note: Folder selector will index text files with no extensions as well
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron');
|
||||
const FileType = require('file-type');
|
||||
const Magika = require('magika').MagikaNode;
|
||||
const todesktop = require("@todesktop/runtime");
|
||||
const khojPackage = require('./package.json');
|
||||
|
||||
@@ -15,8 +15,8 @@ const KHOJ_URL = 'https://app.khoj.dev';
|
||||
|
||||
const Store = require('electron-store');
|
||||
|
||||
const validFileTypes = ['org', 'md', 'markdown', 'txt', 'html', 'xml', 'pdf']
|
||||
|
||||
const magika = new Magika();
|
||||
let validFileTypes;
|
||||
const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
|
||||
|
||||
const schema = {
|
||||
@@ -68,6 +68,7 @@ const schema = {
|
||||
}
|
||||
};
|
||||
|
||||
let isMagikaLoaded = false;
|
||||
let syncing = false;
|
||||
let state = {}
|
||||
const store = new Store({ schema });
|
||||
@@ -113,11 +114,19 @@ function filenameToMimeType (filename) {
|
||||
}
|
||||
|
||||
async function isPlainTextFile(filePath) {
|
||||
const fileType = await FileType.fromFile(filePath);
|
||||
if (!fileType) {
|
||||
if (!isMagikaLoaded) {
|
||||
await magika.load();
|
||||
isMagikaLoaded = true;
|
||||
}
|
||||
try {
|
||||
const fileContent = fs.readFileSync(filePath);
|
||||
const fileType = await magika.identifyBytes(fileContent);
|
||||
const fileLabel = magika.config.labels.filter(l => l.name == fileType.label)?.[0]
|
||||
return fileLabel?.is_text
|
||||
} catch (err) {
|
||||
console.error("Failed to identify file type: ", err);
|
||||
return false;
|
||||
}
|
||||
return fileType.mime.startsWith('text/');
|
||||
}
|
||||
|
||||
async function processDirectory(filesToPush, folder) {
|
||||
@@ -249,9 +258,18 @@ async function pushDataToKhoj (regenerate = false) {
|
||||
pushDataToKhoj();
|
||||
|
||||
async function handleFileOpen (type) {
|
||||
if (!isMagikaLoaded) {
|
||||
await magika.load();
|
||||
isMagikaLoaded = true;
|
||||
validFileTypes = [
|
||||
"org", "md", "pdf",
|
||||
// all text file extensions known to Magika
|
||||
...magika.config.labels.filter(l => l.is_text == true).map(l => l.name)];
|
||||
}
|
||||
|
||||
let { canceled, filePaths } = {canceled: true, filePaths: []};
|
||||
if (type === 'file') {
|
||||
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files" }] }));
|
||||
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files", extensions: validFileTypes }] }));
|
||||
} else if (type === 'folder') {
|
||||
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openDirectory' ]}));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user