fix: initialize MCP tools in session with simpler code

ducphamle2 · ducphamle2 · commit 82e250078e7c · 2025-04-01T22:47:53.000-07:00
chore: revert to original CodeAct prompts

chore: re-add system prompt

chore: remove playwright mcp custom

chore: remove unused
diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts
@@ -31,11 +31,6 @@ const messageActions = {
       store.dispatch(addAssistantMessage(message.message));
     }
   },
-  [ActionType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT]: (message: ActionMessage) => {
-    if (!message.args.thought && message.message) {
-      store.dispatch(addAssistantMessage(message.message));
-    }
-  },
   [ActionType.WRITE]: (message: ActionMessage) => {
     const { path, content } = message.args;
     store.dispatch(setActiveFilepath(path));
diff --git a/frontend/src/services/observations.ts b/frontend/src/services/observations.ts
@@ -31,7 +31,6 @@ export function handleObservationMessage(message: ObservationMessage) {
       break;
     case ObservationType.BROWSE:
     case ObservationType.BROWSE_INTERACTIVE:
-    case ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT:
       if (message.extras?.screenshot) {
         store.dispatch(setScreenshotSrc(message.extras?.screenshot));
       }
@@ -220,19 +219,6 @@ export function handleObservationMessage(message: ObservationMessage) {
           }),
         );
         break;
-      case ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT:
-        store.dispatch(
-          addAssistantObservation({
-            ...baseObservation,
-            observation: ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT,
-            extras: {
-              url: String(message.extras.url || ""),
-              screenshot: String(message.extras.screenshot || ""),
-              trigger_by_action: String(message.extras.trigger_by_action || ""),
-            },
-          }),
-        );
-        break;
       case "error":
         store.dispatch(
           addAssistantObservation({
diff --git a/frontend/src/types/action-type.tsx b/frontend/src/types/action-type.tsx
@@ -38,9 +38,6 @@ enum ActionType {
 
   // Changes the state of the agent, e.g. to paused or running
   CHANGE_AGENT_STATE = "change_agent_state",
-
-  // Take a screenshot of the browser
-  PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = "playwright_mcp_browser_screenshot",
 }
 
 export default ActionType;
diff --git a/frontend/src/types/core/base.ts b/frontend/src/types/core/base.ts
@@ -12,8 +12,7 @@ export type OpenHandsEventType =
   | "reject"
   | "think"
   | "finish"
-  | "error"
-  | "playwright_mcp_browser_screenshot";
+  | "error";
 
 interface OpenHandsBaseEvent {
   id: number;
diff --git a/frontend/src/types/core/observations.ts b/frontend/src/types/core/observations.ts
@@ -1,5 +1,4 @@
 import { AgentState } from "../agent-state";
-import ObservationType from "../observation-type";
 import { OpenHandsObservationEvent } from "./base";
 
 export interface AgentStateChangeObservation
@@ -110,15 +109,6 @@ export interface AgentThinkObservation
   };
 }
 
-export interface PlaywrightMcpBrowserScreenshotObservation
-  extends OpenHandsObservationEvent<ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT> {
-  source: "agent";
-  extras: {
-    url: string;
-    screenshot: string;
-    trigger_by_action: string;
-  };
-}
 export type OpenHandsObservation =
   | AgentStateChangeObservation
   | AgentThinkObservation
@@ -130,5 +120,4 @@ export type OpenHandsObservation =
   | WriteObservation
   | ReadObservation
   | EditObservation
-  | ErrorObservation
-  | PlaywrightMcpBrowserScreenshotObservation;
+  | ErrorObservation;
diff --git a/frontend/src/types/observation-type.tsx b/frontend/src/types/observation-type.tsx
@@ -11,8 +11,6 @@ enum ObservationType {
   // Interactive browsing
   BROWSE_INTERACTIVE = "browse_interactive",
 
-  PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot',
-
   // The output of a command
   RUN = "run",
 
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
@@ -205,8 +205,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                 action = McpAction(
                     name=tool_call.function.name, arguments=tool_call.function.arguments
                 )
-                action.set_hard_timeout(120)
-                logger.warning(f'MCP action in function_calling.py: {action}')
+                # action.set_hard_timeout(120)
+                logger.debug(f'MCP action in function_calling.py: {action}')
 
             # We only add thought to the first action
             if i == 0:
diff --git a/openhands/agenthub/codeact_agent/prompts/system_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/system_prompt.j2
@@ -1,4 +1,4 @@
-You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.
+You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
 
 <ROLE>
 Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
diff --git a/openhands/agenthub/codeact_agent/prompts/user_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/user_prompt.j2
@@ -1,5 +0,0 @@
-You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.
-
-<PROBLEM_SOLVING_WORKFLOW>
-Take screenshot of important actions you take related to web-browsing.
-</PROBLEM_SOLVING_WORKFLOW>
diff --git a/openhands/core/schema/observation.py b/openhands/core/schema/observation.py
@@ -52,7 +52,3 @@ class ObservationType(str, Enum):
 
     MCP = 'mcp'
     """Result of a MCP Server operation"""
-
-    PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot'
-    """Result of a Playwright MCP Browser Screenshot operation. The response is a base64 encoded string of the screenshot, which should be streamed to the client using the correct format matching
-    browsergym's screenshot format."""
diff --git a/openhands/events/observation/__init__.py b/openhands/events/observation/__init__.py
@@ -45,5 +45,4 @@
     'RecallObservation',
     'RecallType',
     'MCPObservation',
-    'PlaywrightMcpBrowserScreenshotObservation',
 ]
diff --git a/openhands/events/observation/playwright_mcp.py b/openhands/events/observation/playwright_mcp.py
diff --git a/openhands/events/serialization/observation.py b/openhands/events/serialization/observation.py
@@ -27,9 +27,6 @@
 )
 from openhands.events.observation.mcp import MCPObservation
 from openhands.events.observation.observation import Observation
-from openhands.events.observation.playwright_mcp import (
-    PlaywrightMcpBrowserScreenshotObservation,
-)
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
 
@@ -50,7 +47,6 @@
     AgentThinkObservation,
     RecallObservation,
     MCPObservation,
-    PlaywrightMcpBrowserScreenshotObservation,
 )
 
 OBSERVATION_TYPE_TO_CLASS = {
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
@@ -1,5 +1,4 @@
 from typing import Generator
-import json
 
 from litellm import ModelResponse
 
@@ -40,9 +39,6 @@
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.mcp import MCPObservation
 from openhands.events.observation.observation import Observation
-from openhands.events.observation.playwright_mcp import (
-    PlaywrightMcpBrowserScreenshotObservation,
-)
 from openhands.events.serialization.event import truncate_content
 from openhands.utils.prompt import PromptManager, RepositoryInfo, RuntimeInfo
 
@@ -337,22 +333,6 @@ def _process_observation(
         elif isinstance(obs, MCPObservation):
             # logger.warning(f'MCPObservation: {obs}')
             message = Message(role='assistant', content=[TextContent(text=obs.content)])
-        elif isinstance(obs, PlaywrightMcpBrowserScreenshotObservation):
-            text = 'Image: Current webpage screenshot\n'
-            screenshot_content = json.loads(obs.content)
-            logger.debug(
-                f'screenshot_content in conversation_memory: {screenshot_content}'
-            )
-            if 'url' in screenshot_content:
-                text += f'URL: {screenshot_content["url"]}\n'
-
-            # We don't actually need to screenshot fed into the LLM. We can use snapshots. Meanwhile, the screenshot will be streamed to the user.
-            message = Message(
-                role='assistant',
-                content=[
-                    TextContent(text=text),
-                ],
-            )
         elif isinstance(obs, IPythonRunCellObservation):
             text = obs.content
             # replace base64 images with a placeholder
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
@@ -24,7 +24,6 @@
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
 from fastapi.security import APIKeyHeader
-from mcp.types import ImageContent
 from openhands_aci.editor.editor import OHEditor
 from openhands_aci.editor.exceptions import ToolError
 from openhands_aci.editor.results import ToolResult
@@ -59,11 +58,7 @@
     Observation,
 )
 from openhands.events.observation.mcp import MCPObservation
-from openhands.events.observation.playwright_mcp import (
-    PlaywrightMcpBrowserScreenshotObservation,
-)
 from openhands.events.serialization import event_from_dict, event_to_dict
-from openhands.mcp.mcp_base import ToolResult as MCPToolResult
 from openhands.runtime.browser import browse
 from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
@@ -559,32 +554,8 @@ async def call_tool_mcp(self, action: McpAction) -> Observation:
         for agent in mcp_agents:
             await agent.cleanup()
 
-        # special case for browser screenshot of playwright_mcp
-        if action.name == 'browser_screenshot':
-            return self.playwright_mcp_browser_screenshot(action, response)
-
         return MCPObservation(content=f'MCP result:{response}')
 
-    def playwright_mcp_browser_screenshot(
-        self, action: McpAction, response: MCPToolResult
-    ) -> Observation:
-        # example response:
-        """
-        {
-            "type": "image",
-            "data": "image/jpeg;base64,/9j/4AA...",
-            "mimeType": "image/jpeg",
-            "url": "https://www.google.com"
-        }
-        """
-        screenshot_content: ImageContent = response.output
-        return PlaywrightMcpBrowserScreenshotObservation(
-            content=f'{response}',
-            url=screenshot_content.url if screenshot_content.url is not None else '',
-            trigger_by_action=action.name,
-            screenshot=f'data:image/png;base64,{screenshot_content.data}',
-        )
-
     def close(self):
         self.memory_monitor.stop_monitoring()
         if self.bash_session is not None:
@@ -609,9 +580,6 @@ def close(self):
         help='BrowserGym environment used for browser evaluation',
         default=None,
     )
-    parser.add_argument(
-        '--runtime-mode', type=str, help='docker | others', default='others'
-    )
 
     # example: python client.py 8000 --working-dir /workspace --plugins JupyterRequirement
     args = parser.parse_args()
diff --git a/openhands/runtime/impl/docker/docker_runtime.py b/openhands/runtime/impl/docker/docker_runtime.py
@@ -1,9 +1,8 @@
+import os
 from functools import lru_cache
 from typing import Callable
 from uuid import UUID
 
-import os 
-
 import docker
 import httpx
 import tenacity
@@ -89,9 +88,13 @@ def __init__(
         self._vscode_port = -1
         self._app_ports: list[int] = []
 
-        if os.environ.get("DOCKER_HOST_ADDR"):
-            logger.info(f'Using DOCKER_HOST_IP: {os.environ["DOCKER_HOST_ADDR"]} for local_runtime_url')
-            self.config.sandbox.local_runtime_url = f'http://{os.environ["DOCKER_HOST_ADDR"]}'
+        if os.environ.get('DOCKER_HOST_ADDR'):
+            logger.info(
+                f'Using DOCKER_HOST_IP: {os.environ["DOCKER_HOST_ADDR"]} for local_runtime_url'
+            )
+            self.config.sandbox.local_runtime_url = (
+                f'http://{os.environ["DOCKER_HOST_ADDR"]}'
+            )
 
         self.docker_client: docker.DockerClient = self._init_docker_client()
         self.api_url = f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
@@ -284,7 +287,6 @@ def _init_container(self):
             server_port=self._container_port,
             plugins=self.plugins,
             app_config=self.config,
-            runtime_mode='docker',
         )
 
         try:
diff --git a/openhands/runtime/utils/command.py b/openhands/runtime/utils/command.py
@@ -18,7 +18,6 @@ def get_action_execution_server_startup_command(
     python_prefix: list[str] = DEFAULT_PYTHON_PREFIX,
     override_user_id: int | None = None,
     override_username: str | None = None,
-    runtime_mode: str = 'others',
 ) -> list[str]:
     sandbox_config = app_config.sandbox
 
@@ -56,8 +55,6 @@ def get_action_execution_server_startup_command(
         '--user-id',
         str(user_id),
         *browsergym_args,
-        '--runtime-mode',
-        runtime_mode,
     ]
 
     return base_cmd
diff --git a/openhands/server/session/agent_session.py b/openhands/server/session/agent_session.py
diff --git a/openhands/server/session/session.py b/openhands/server/session/session.py

Original file line number	Diff line number	Diff line change
`@@ -38,9 +38,6 @@ enum ActionType {`
`38`	`38`
`39`	`39`	`// Changes the state of the agent, e.g. to paused or running`
`40`	`40`	`CHANGE_AGENT_STATE = "change_agent_state",`
`41`		`-`
`42`		`- // Take a screenshot of the browser`
`43`		`- PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = "playwright_mcp_browser_screenshot",`
`44`	`41`	`}`
`45`	`42`
`46`	`43`	`export default ActionType;`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.`
	`1`	`+You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.`
`2`	`2`
`3`	`3`	`<ROLE>`
`4`	`4`	`Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.`
Original file line number	Diff line number	Diff line change
`@@ -45,5 +45,4 @@`
`45`	`45`	`'RecallObservation',`
`46`	`46`	`'RecallType',`
`47`	`47`	`'MCPObservation',`
`48`		`- 'PlaywrightMcpBrowserScreenshotObservation',`
`49`	`48`	`]`
Original file line number	Diff line number	Diff line change
 from openhands.core.exceptions import AgentRuntimeUnavailableError
 from openhands.core.logger import OpenHandsLoggerAdapter
 from openhands.core.schema.agent import AgentState
 -from openhands.core.setup import create_mcp_agents
 from openhands.events.action import ChangeAgentStateAction, MessageAction
 from openhands.events.event import Event, EventSource
 from openhands.events.stream import EventStream
 from openhands.integrations.provider import PROVIDER_TOKEN_TYPE, ProviderHandler
 -from openhands.mcp.mcp_agent import convert_mcp_agents_to_tools
 from openhands.memory.memory import Memory
 from openhands.microagent.microagent import BaseMicroAgent
 from openhands.runtime import get_runtime_cls
         finished = False  # For monitoring
         runtime_connected = False
         try:
 -            # Initialize MCP agents first before creating runtime and controller
 -            try:
 -                # Log MCP configuration to help with debugging
 -                self.logger.info(f'MCP SSE servers: {config.mcp.sse.mcp_servers}')
 -                self.logger.info(f'MCP stdio commands: {config.mcp.stdio.commands}')
 -                self.logger.info(f'MCP stdio args: {config.mcp.stdio.args}')
+-
 -                # Check if MCP servers are available
 -                if not config.mcp.sse.mcp_servers and not config.mcp.stdio.commands:
 -                    self.logger.warning(
 -                        'No MCP servers or commands configured. MCP integration will not work.'
 -                    )
 -                else:
 -                    self.logger.info('Initializing MCP agents for server mode...')
 -                    mcp_agents = await create_mcp_agents(
 -                        config.mcp.sse.mcp_servers,
 -                        config.mcp.stdio.commands,
 -                        config.mcp.stdio.args,
 -                    )
+-
 -                    # Give some time for MCP connections to stabilize
 -                    await asyncio.sleep(1)
+-
 -                    # For CodeActAgent and similar agents that use the tools attribute
 -                    if hasattr(agent, 'tools'):
 -                        try:
 -                            # Convert MCP agents to tools format for CodeActAgent
 -                            mcp_tools = convert_mcp_agents_to_tools(mcp_agents)
 -                            self.logger.info(
 -                                f"MCP tools created: {[tool.get('function', {}).get('name', '<unnamed>') for tool in mcp_tools]}"
 -                            )
+-
 -                            # If agent already has tools, extend them; otherwise create a new list
 -                            if isinstance(agent.tools, list):
 -                                agent.tools.extend(mcp_tools)
 -                            else:
 -                                agent.tools = mcp_tools
+-
 -                            self.logger.info(
 -                                f'Agent now has {len(agent.tools)} tools including MCP tools'
 -                            )
 -                        except Exception as e:
 -                            self.logger.error(
 -                                f'Error converting MCP agents to tools: {str(e)}',
 -                                exc_info=True,
 -                            )
+-
 -                    # Log MCP agents status
 -                    for idx, mcp_agent in enumerate(mcp_agents):
 -                        self.logger.info(
 -                            f'MCP Agent {idx} connection type: {mcp_agent.connection_type}'
 -                        )
 -                        self.logger.info(
 -                            f"MCP Agent {idx} available tools: {list(mcp_agent.mcp_clients.tool_map.keys()) if hasattr(mcp_agent, 'mcp_clients') and hasattr(mcp_agent.mcp_clients, 'tool_map') else 'No tools available'}"
 -                        )
 -                        await mcp_agent.cleanup()
+-
 -                    self.logger.info(
 -                        f'Successfully initialized {len(mcp_agents)} MCP agents'
 -                    )
 -            except Exception as e:
 -                self.logger.error(f'Error initializing MCP agents: {e}', exc_info=True)
+-
             self._create_security_analyzer(config.security.security_analyzer)
             runtime_connected = await self._create_runtime(
                 runtime_name=runtime_name,
             return False
         if selected_repository and git_provider_tokens:
 -            await self.runtime.clone_repo(git_provider_tokens,
 -                                          selected_repository,
 -                                          selected_branch)
 +            await self.runtime.clone_repo(
 +                git_provider_tokens, selected_repository, selected_branch
 +            )
             await call_sync_from_async(self.runtime.maybe_run_setup_script)
         self.logger.debug(
Original file line number	Diff line number	Diff line change
+)
 from openhands.core.logger import OpenHandsLoggerAdapter
 from openhands.core.schema import AgentState
 +from openhands.core.setup import create_mcp_agents
 from openhands.events.action import MessageAction, NullAction
 from openhands.events.event import Event, EventSource
 from openhands.events.observation import (
 from openhands.events.serialization import event_from_dict, event_to_dict
 from openhands.events.stream import EventStreamSubscriber
 from openhands.llm.llm import LLM
 +from openhands.mcp.mcp_agent import convert_mcp_agents_to_tools
 from openhands.server.session.agent_session import AgentSession
 from openhands.server.session.conversation_init_data import ConversationInitData
 from openhands.server.settings import Settings
             self.logger.info(f'Enabling default condenser: {default_condenser_config}')
             agent_config.condenser = default_condenser_config
 -        agent = Agent.get_cls(agent_cls)(llm, agent_config)
 +        mcp_agents = await create_mcp_agents(
 +            self.config.mcp.sse.mcp_servers,
 +            self.config.mcp.stdio.commands,
 +            self.config.mcp.stdio.args,
 +        )
 +        mcp_tools = convert_mcp_agents_to_tools(mcp_agents)
 +        agent = Agent.get_cls(agent_cls)(llm, agent_config, mcp_tools)
 +        # close all mcp agents after extracting tools
 +        for mcp_agent in mcp_agents:
 +            await mcp_agent.cleanup()
         git_provider_tokens = None
         selected_repository = None