From 70b7e7c73afd514f51271df3add368d170ca316f Mon Sep 17 00:00:00 2001 From: Debanjum Date: Tue, 26 Nov 2024 15:35:23 -0800 Subject: [PATCH] Improve load of complex json objects. Use it to pick tool, run code Gemini doesn't work well when trying to output json objects. Using it to output raw json strings with complex, multi-line structures requires more intense clean-up of raw json string for parsing --- src/khoj/processor/conversation/utils.py | 41 ++++++++++++++++++++++++ src/khoj/processor/tools/run_code.py | 6 ++-- src/khoj/routers/research.py | 6 ++-- tests/test_conversation_utils.py | 12 +++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 21a95a29..efd3c51d 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -5,6 +5,7 @@ import math import mimetypes import os import queue +import re import uuid from dataclasses import dataclass from datetime import datetime @@ -538,6 +539,46 @@ def clean_code_python(code: str): return code.strip().removeprefix("```python").removesuffix("```") +def load_complex_json(json_str): + """ + Preprocess a raw JSON string to escape unescaped double quotes within value strings, + while preserving the JSON structure and already escaped quotes. + """ + + def replace_unescaped_quotes(match): + # Get the content between colons and commas/end braces + content = match.group(1) + # Replace unescaped double, single quotes that aren't already escaped + # Uses negative lookbehind to avoid replacing already escaped quotes + # Replace " with \" + processed_dq = re.sub(r'(?