diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 0c3064b9..6e5d2fb6 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -697,8 +697,9 @@ def clean_code_python(code: str): def load_complex_json(json_str): """ - Preprocess a raw JSON string to escape unescaped double quotes within value strings, - while preserving the JSON structure and already escaped quotes. + Preprocess a raw JSON string to + - escape unescaped double quotes within value strings while preserving the JSON structure and already escaped quotes. + - remove suffix after the first valid JSON object, """ def replace_unescaped_quotes(match): @@ -726,9 +727,20 @@ def load_complex_json(json_str): for loads in json_loaders_to_try: try: return loads(processed) - except (json.JSONDecodeError, pyjson5.Json5Exception) as e: - errors.append(f"{type(e).__name__}: {str(e)}") + except (json.JSONDecodeError, pyjson5.Json5Exception) as e_load: + loader_name = loads.__name__ + errors.append(f"{loader_name} (initial parse): {type(e_load).__name__}: {str(e_load)}") + # Handle plain text suffixes by slicing at error position + if hasattr(e_load, "pos") and 0 < e_load.pos < len(processed): + try: + sliced = processed[: e_load.pos].strip() + if sliced: + return loads(sliced) + except Exception as e_slice: + errors.append( + f"{loader_name} after slice at {e_load.pos}: {type(e_slice).__name__}: {str(e_slice)}" + ) # If all loaders fail, raise the aggregated error raise ValueError( f"Failed to load JSON with errors: {'; '.join(errors)}\n\n" diff --git a/tests/test_conversation_utils.py b/tests/test_conversation_utils.py index b1fdad30..eb613e46 100644 --- a/tests/test_conversation_utils.py +++ b/tests/test_conversation_utils.py @@ -175,16 +175,46 @@ class TestTruncateMessage: assert truncated_chat_history[0] != copy_big_chat_message, "Original message should be modified" -def test_load_complex_raw_json_string(): - # Arrange - raw_json = r"""{"key": "value with unescaped " and unescaped \' and escaped \" and escaped \\'"}""" - expeced_json = {"key": "value with unescaped \" and unescaped \\' and escaped \" and escaped \\'"} +class TestLoadComplexJson: + def test_load_complex_raw_json_string(self): + # Arrange + raw_json = r"""{"key": "value with unescaped " and unescaped \' and escaped \" and escaped \\'"}""" + expected_json = {"key": "value with unescaped \" and unescaped \\' and escaped \" and escaped \\'"} - # Act - parsed_json = utils.load_complex_json(raw_json) + # Act + parsed_json = utils.load_complex_json(raw_json) - # Assert - assert parsed_json == expeced_json + # Assert + assert parsed_json == expected_json + + def test_load_complex_json_with_python_code(self): + # Arrange + raw_json = r"""{"python": "import os\nvalue = \"\"\"\nfirst line of "text"\nsecond line of 'text'\n\"\"\"\nprint(value)"}""" + expected_json = { + "python": 'import os\nvalue = """\nfirst line of "text"\nsecond line of \'text\'\n"""\nprint(value)' + } + + # Act + parsed_json = utils.load_complex_json(raw_json) + + # Assert + assert parsed_json == expected_json + + def test_load_complex_json_inline(self): + # Arrange + raw_json = """ + {"key1": "value1", "key2": "value2"}plain text suffix + """ + expected_json = { + "key1": "value1", + "key2": "value2", + } + + # Act + parsed_json = utils.load_complex_json(raw_json) + + # Assert + assert parsed_json == expected_json def generate_content(count, suffix=""):