Use Magika to more robustly identify text files to send for indexing

- `file-type' doesn't handle mis-labelled files or files without
   extensions well

- Only show supported file types in file selector dialog on Desktop app
  Use Magika to get list of text file extensions. Combine with other
  supported extensions to get complete list of supported file extensions.
  Use it to limit selectable files in the File Open dialog.

  Note: Folder selector will index text files with no extensions as well
This commit is contained in:
Debanjum Singh Solanky
2024-04-09 16:07:18 +05:30
parent f040418cf1
commit f2dc9709b7
3 changed files with 646 additions and 59 deletions

View File

@@ -1,5 +1,5 @@
const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron');
const FileType = require('file-type');
const Magika = require('magika').MagikaNode;
const todesktop = require("@todesktop/runtime");
const khojPackage = require('./package.json');
@@ -15,8 +15,8 @@ const KHOJ_URL = 'https://app.khoj.dev';
const Store = require('electron-store');
const validFileTypes = ['org', 'md', 'markdown', 'txt', 'html', 'xml', 'pdf']
const magika = new Magika();
let validFileTypes;
const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
const schema = {
@@ -68,6 +68,7 @@ const schema = {
}
};
let isMagikaLoaded = false;
let syncing = false;
let state = {}
const store = new Store({ schema });
@@ -113,11 +114,19 @@ function filenameToMimeType (filename) {
}
async function isPlainTextFile(filePath) {
const fileType = await FileType.fromFile(filePath);
if (!fileType) {
if (!isMagikaLoaded) {
await magika.load();
isMagikaLoaded = true;
}
try {
const fileContent = fs.readFileSync(filePath);
const fileType = await magika.identifyBytes(fileContent);
const fileLabel = magika.config.labels.filter(l => l.name == fileType.label)?.[0]
return fileLabel?.is_text
} catch (err) {
console.error("Failed to identify file type: ", err);
return false;
}
return fileType.mime.startsWith('text/');
}
async function processDirectory(filesToPush, folder) {
@@ -249,9 +258,18 @@ async function pushDataToKhoj (regenerate = false) {
pushDataToKhoj();
async function handleFileOpen (type) {
if (!isMagikaLoaded) {
await magika.load();
isMagikaLoaded = true;
validFileTypes = [
"org", "md", "pdf",
// all text file extensions known to Magika
...magika.config.labels.filter(l => l.is_text == true).map(l => l.name)];
}
let { canceled, filePaths } = {canceled: true, filePaths: []};
if (type === 'file') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files" }] }));
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files", extensions: validFileTypes }] }));
} else if (type === 'folder') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openDirectory' ]}));
}