From 452c794e93eafc6abf898cf17cf8accd19360456 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Mon, 18 Aug 2025 16:52:59 -0700 Subject: [PATCH] Make regex search tool results look more like grep results --- src/khoj/routers/helpers.py | 8 +++++--- src/khoj/utils/helpers.py | 5 ++++- tests/test_grep_files.py | 18 +++++++++--------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index ffd384b3..a905aa02 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -3023,6 +3023,7 @@ async def grep_files( file_matches = await FileObjectAdapters.aget_file_objects_by_regex(user, db_pattern, path_prefix) line_matches = [] + line_matches_count = 0 for file_object in file_matches: lines = file_object.raw_text.split("\n") matched_line_numbers = [] @@ -3031,6 +3032,7 @@ async def grep_files( for i, line in enumerate(lines, 1): if regex.search(line): matched_line_numbers.append(i) + line_matches_count += len(matched_line_numbers) # Build context for each match for line_num in matched_line_numbers: @@ -3047,10 +3049,10 @@ async def grep_files( if current_line_num == line_num: # This is the matching line, mark it - context_lines.append(f"{file_object.file_name}:{current_line_num}:> {line_content}") + context_lines.append(f"{file_object.file_name}:{current_line_num}: {line_content}") else: # This is a context line - context_lines.append(f"{file_object.file_name}:{current_line_num}: {line_content}") + context_lines.append(f"{file_object.file_name}-{current_line_num}- {line_content}") # Add separator between matches if showing context if lines_before > 0 or lines_after > 0: @@ -3065,7 +3067,7 @@ async def grep_files( # Check if no results found max_results = 1000 query = _generate_query( - len([m for m in line_matches if ":>" in m]), + line_matches_count, len(file_matches), path_prefix, regex_pattern, diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index f8e5a07c..a200e2f4 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -613,9 +613,12 @@ tools_for_research_llm = { Helpful to answer questions for which all relevant notes or documents are needed to complete the search. Example: "Notes that mention Tom". You need to know all the correct keywords or regex patterns for this tool to be useful. - REMEMBER: + IMPORTANT: - The regex pattern will ONLY match content on a single line. Multi-line matches are NOT supported (even if you use \\n). + TIPS: + - The output follows a grep-like format. Matches are prefixed with the file path and line number. Useful to combine with viewing file around specific line numbers. + An optional path prefix can restrict search to specific files/directories. Use lines_before, lines_after to show context around matches. """ diff --git a/tests/test_grep_files.py b/tests/test_grep_files.py index 22828e19..96d9b3e7 100644 --- a/tests/test_grep_files.py +++ b/tests/test_grep_files.py @@ -46,8 +46,8 @@ async def test_grep_files_simple_match(default_user: KhojUser): assert len(results) == 1 result = results[0] assert "Found 2 matches for 'hello' in 1 documents" in result["query"] - assert "test.txt:1:> hello world" in result["compiled"] - assert "test.txt:3:> hello again" in result["compiled"] + assert "test.txt:1: hello world" in result["compiled"] + assert "test.txt:3: hello again" in result["compiled"] @pytest.mark.django_db @@ -110,7 +110,7 @@ async def test_grep_files_with_path_prefix(default_user: KhojUser): result = results[0] assert "Found 1 matches for 'hello' in 1 documents" in result["query"] assert "in dir1/" in result["query"] - assert "dir1/test1.txt:1:> hello from dir1" in result["compiled"] + assert "dir1/test1.txt:1: hello from dir1" in result["compiled"] assert "dir2/test2.txt" not in result["compiled"] @@ -142,9 +142,9 @@ async def test_grep_files_with_context(default_user: KhojUser): result = results[0] assert "Found 1 matches for 'match' in 1 documents" in result["query"] assert "Showing 1 lines before and 1 lines after" in result["query"] - assert "test.txt:2: line 2" in result["compiled"] - assert "test.txt:3:> line 3 (match)" in result["compiled"] - assert "test.txt:4: line 4" in result["compiled"] + assert "test.txt-2- line 2" in result["compiled"] + assert "test.txt:3: line 3 (match)" in result["compiled"] + assert "test.txt-4- line 4" in result["compiled"] assert "line 1" not in result["compiled"] assert "line 5" not in result["compiled"] @@ -199,8 +199,8 @@ async def test_grep_files_multiple_files(default_user: KhojUser): assert len(results) == 1 result = results[0] assert "Found 2 matches for 'hello' in 2 documents" in result["query"] - assert "file1.txt:1:> hello from file1" in result["compiled"] - assert "file2.txt:1:> hello from file2" in result["compiled"] + assert "file1.txt:1: hello from file1" in result["compiled"] + assert "file2.txt:1: hello from file2" in result["compiled"] @pytest.mark.parametrize( @@ -272,4 +272,4 @@ async def test_grep_files_financial_entries_regex_patterns( # All patterns should find the sailing entry assert f"Found {expected_matches} matches" in result["query"] - assert 'ledger.txt:8:> 1984-06-24 * "Center for Boats" "Sailing" #bob' in result["compiled"] + assert 'ledger.txt:8: 1984-06-24 * "Center for Boats" "Sailing" #bob' in result["compiled"]