mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 13:25:11 +00:00
Remove unnecessary image conversion to PNG in binary operator agent.
The conversion is already handled by the AI model interaction handlers in khoj server core.
This commit is contained in:
@@ -9,7 +9,7 @@ from khoj.processor.conversation.utils import construct_structured_message
|
|||||||
from khoj.processor.operator.operator_actions import *
|
from khoj.processor.operator.operator_actions import *
|
||||||
from khoj.processor.operator.operator_agent_base import AgentActResult
|
from khoj.processor.operator.operator_agent_base import AgentActResult
|
||||||
from khoj.processor.operator.operator_environment_base import EnvState
|
from khoj.processor.operator.operator_environment_base import EnvState
|
||||||
from khoj.utils.helpers import convert_image_to_png, get_chat_usage_metrics
|
from khoj.utils.helpers import get_chat_usage_metrics
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -256,7 +256,7 @@ back() # Use this to go back to the previous page.
|
|||||||
|
|
||||||
# Construct grounding LLM input (using only the latest user prompt + image)
|
# Construct grounding LLM input (using only the latest user prompt + image)
|
||||||
# We don't pass the full history here, as grounding depends on the *current* state + NL action
|
# We don't pass the full history here, as grounding depends on the *current* state + NL action
|
||||||
screenshots = [f"data:image/png;base64,{convert_image_to_png(current_state.screenshot)}"]
|
screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
|
||||||
grounding_messages_content = construct_structured_message(
|
grounding_messages_content = construct_structured_message(
|
||||||
grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
|
grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ from PIL import Image
|
|||||||
|
|
||||||
from khoj.processor.operator.operator_actions import *
|
from khoj.processor.operator.operator_actions import *
|
||||||
from khoj.processor.operator.operator_environment_base import EnvState
|
from khoj.processor.operator.operator_environment_base import EnvState
|
||||||
from khoj.utils.helpers import convert_image_to_png, get_chat_usage_metrics
|
from khoj.utils.helpers import get_chat_usage_metrics
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -257,11 +257,8 @@ class GroundingAgentUitars:
|
|||||||
self.thoughts
|
self.thoughts
|
||||||
), "The number of observations and actions should be the same."
|
), "The number of observations and actions should be the same."
|
||||||
|
|
||||||
screenshot = convert_image_to_png(env_state.screenshot)
|
self.history_images.append(base64.b64decode(env_state.screenshot))
|
||||||
self.history_images.append(base64.b64decode(screenshot))
|
self.observations.append({"screenshot": env_state.screenshot, "accessibility_tree": None})
|
||||||
|
|
||||||
base64_image = screenshot
|
|
||||||
self.observations.append({"screenshot": base64_image, "accessibility_tree": None})
|
|
||||||
|
|
||||||
user_prompt = self.prompt_template.format(
|
user_prompt = self.prompt_template.format(
|
||||||
instruction=instruction, action_space=self.prompt_action_space, language=self.language
|
instruction=instruction, action_space=self.prompt_action_space, language=self.language
|
||||||
|
|||||||
@@ -17,11 +17,7 @@ from khoj.processor.operator.operator_agent_base import (
|
|||||||
)
|
)
|
||||||
from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
|
from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
|
||||||
from khoj.routers.helpers import send_message_to_model_wrapper
|
from khoj.routers.helpers import send_message_to_model_wrapper
|
||||||
from khoj.utils.helpers import (
|
from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
|
||||||
convert_image_to_png,
|
|
||||||
get_openai_async_client,
|
|
||||||
is_none_or_empty,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -132,7 +128,7 @@ Focus on the visual action and provide all necessary context.
|
|||||||
|
|
||||||
if is_none_or_empty(self.messages):
|
if is_none_or_empty(self.messages):
|
||||||
query_text = f"**Main Objective**: {self.query}"
|
query_text = f"**Main Objective**: {self.query}"
|
||||||
query_screenshot = [f"data:image/png;base64,{convert_image_to_png(current_state.screenshot)}"]
|
query_screenshot = [f"data:image/webp;base64,{current_state.screenshot}"]
|
||||||
first_message_content = construct_structured_message(
|
first_message_content = construct_structured_message(
|
||||||
message=query_text,
|
message=query_text,
|
||||||
images=query_screenshot,
|
images=query_screenshot,
|
||||||
|
|||||||
@@ -598,20 +598,6 @@ def convert_image_data_uri(image_data_uri: str, target_format: str = "png") -> s
|
|||||||
return output_data_uri
|
return output_data_uri
|
||||||
|
|
||||||
|
|
||||||
def convert_image_to_png(image_base64: str) -> str:
|
|
||||||
"""Convert base64 image to png format for wider support"""
|
|
||||||
image_bytes = base64.b64decode(image_base64)
|
|
||||||
image_io = io.BytesIO(image_bytes)
|
|
||||||
with Image.open(image_io) as original_image:
|
|
||||||
output_image_io = io.BytesIO()
|
|
||||||
original_image.save(output_image_io, "PNG")
|
|
||||||
|
|
||||||
# Encode the WebP image back to base64
|
|
||||||
output_image_bytes = output_image_io.getvalue()
|
|
||||||
output_image_io.close()
|
|
||||||
return base64.b64encode(output_image_bytes).decode("utf-8")
|
|
||||||
|
|
||||||
|
|
||||||
def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]:
|
def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Truncate large output files and drop image file data from code results.
|
Truncate large output files and drop image file data from code results.
|
||||||
|
|||||||
Reference in New Issue
Block a user