diff --git a/pyproject.toml b/pyproject.toml index f02b5559..59adf952 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,6 +88,7 @@ dependencies = [ "anthropic == 0.26.1", "docx2txt == 0.8", "google-generativeai == 0.8.3", + "pyjson5 == 1.6.7", ] dynamic = ["version"] diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 21a95a29..079f3fea 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -5,6 +5,7 @@ import math import mimetypes import os import queue +import re import uuid from dataclasses import dataclass from datetime import datetime @@ -14,6 +15,7 @@ from time import perf_counter from typing import Any, Callable, Dict, List, Optional import PIL.Image +import pyjson5 import requests import tiktoken import yaml @@ -538,6 +540,47 @@ def clean_code_python(code: str): return code.strip().removeprefix("```python").removesuffix("```") +def load_complex_json(json_str): + """ + Preprocess a raw JSON string to escape unescaped double quotes within value strings, + while preserving the JSON structure and already escaped quotes. + """ + + def replace_unescaped_quotes(match): + # Get the content between colons and commas/end braces + content = match.group(1) + # Replace unescaped double, single quotes that aren't already escaped + # Uses negative lookbehind to avoid replacing already escaped quotes + # Replace " with \" + processed_dq = re.sub(r'(?