diff --git a/src/khoj/processor/operator/grounding_agent.py b/src/khoj/processor/operator/grounding_agent.py index 65cb5c14..d6126cbe 100644 --- a/src/khoj/processor/operator/grounding_agent.py +++ b/src/khoj/processor/operator/grounding_agent.py @@ -9,7 +9,7 @@ from khoj.processor.conversation.utils import construct_structured_message from khoj.processor.operator.operator_actions import * from khoj.processor.operator.operator_agent_base import AgentActResult from khoj.processor.operator.operator_environment_base import EnvState -from khoj.utils.helpers import convert_image_to_png, get_chat_usage_metrics +from khoj.utils.helpers import get_chat_usage_metrics logger = logging.getLogger(__name__) @@ -256,7 +256,7 @@ back() # Use this to go back to the previous page. # Construct grounding LLM input (using only the latest user prompt + image) # We don't pass the full history here, as grounding depends on the *current* state + NL action - screenshots = [f"data:image/png;base64,{convert_image_to_png(current_state.screenshot)}"] + screenshots = [f"data:image/webp;base64,{current_state.screenshot}"] grounding_messages_content = construct_structured_message( grounding_user_prompt, screenshots, self.model.name, vision_enabled=True ) diff --git a/src/khoj/processor/operator/grounding_agent_uitars.py b/src/khoj/processor/operator/grounding_agent_uitars.py index 21a3109d..daba4629 100644 --- a/src/khoj/processor/operator/grounding_agent_uitars.py +++ b/src/khoj/processor/operator/grounding_agent_uitars.py @@ -19,7 +19,7 @@ from PIL import Image from khoj.processor.operator.operator_actions import * from khoj.processor.operator.operator_environment_base import EnvState -from khoj.utils.helpers import convert_image_to_png, get_chat_usage_metrics +from khoj.utils.helpers import get_chat_usage_metrics logger = logging.getLogger(__name__) @@ -257,11 +257,8 @@ class GroundingAgentUitars: self.thoughts ), "The number of observations and actions should be the same." - screenshot = convert_image_to_png(env_state.screenshot) - self.history_images.append(base64.b64decode(screenshot)) - - base64_image = screenshot - self.observations.append({"screenshot": base64_image, "accessibility_tree": None}) + self.history_images.append(base64.b64decode(env_state.screenshot)) + self.observations.append({"screenshot": env_state.screenshot, "accessibility_tree": None}) user_prompt = self.prompt_template.format( instruction=instruction, action_space=self.prompt_action_space, language=self.language diff --git a/src/khoj/processor/operator/operator_agent_binary.py b/src/khoj/processor/operator/operator_agent_binary.py index 4010b84e..0ec69bc2 100644 --- a/src/khoj/processor/operator/operator_agent_binary.py +++ b/src/khoj/processor/operator/operator_agent_binary.py @@ -17,11 +17,7 @@ from khoj.processor.operator.operator_agent_base import ( ) from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult from khoj.routers.helpers import send_message_to_model_wrapper -from khoj.utils.helpers import ( - convert_image_to_png, - get_openai_async_client, - is_none_or_empty, -) +from khoj.utils.helpers import get_openai_async_client, is_none_or_empty logger = logging.getLogger(__name__) @@ -132,7 +128,7 @@ Focus on the visual action and provide all necessary context. if is_none_or_empty(self.messages): query_text = f"**Main Objective**: {self.query}" - query_screenshot = [f"data:image/png;base64,{convert_image_to_png(current_state.screenshot)}"] + query_screenshot = [f"data:image/webp;base64,{current_state.screenshot}"] first_message_content = construct_structured_message( message=query_text, images=query_screenshot, diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 0a902629..047c59b8 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -598,20 +598,6 @@ def convert_image_data_uri(image_data_uri: str, target_format: str = "png") -> s return output_data_uri -def convert_image_to_png(image_base64: str) -> str: - """Convert base64 image to png format for wider support""" - image_bytes = base64.b64decode(image_base64) - image_io = io.BytesIO(image_bytes) - with Image.open(image_io) as original_image: - output_image_io = io.BytesIO() - original_image.save(output_image_io, "PNG") - - # Encode the WebP image back to base64 - output_image_bytes = output_image_io.getvalue() - output_image_io.close() - return base64.b64encode(output_image_bytes).decode("utf-8") - - def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]: """ Truncate large output files and drop image file data from code results.