fix: initialize mcp tools in session with simpler code

ducphamle2 · ducphamle2 · commit 93ea7a258a7d · 2025-04-01T22:43:48.000-07:00
chore: revert original prompts codeact

chore: re-add system prompt

chore: remove playwright mcp custom
diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts
@@ -31,11 +31,6 @@ const messageActions = {
       store.dispatch(addAssistantMessage(message.message));
     }
   },
-  [ActionType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT]: (message: ActionMessage) => {
-    if (!message.args.thought && message.message) {
-      store.dispatch(addAssistantMessage(message.message));
-    }
-  },
   [ActionType.WRITE]: (message: ActionMessage) => {
     const { path, content } = message.args;
     store.dispatch(setActiveFilepath(path));
diff --git a/frontend/src/services/observations.ts b/frontend/src/services/observations.ts
@@ -31,7 +31,6 @@ export function handleObservationMessage(message: ObservationMessage) {
       break;
     case ObservationType.BROWSE:
     case ObservationType.BROWSE_INTERACTIVE:
-    case ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT:
       if (message.extras?.screenshot) {
         store.dispatch(setScreenshotSrc(message.extras?.screenshot));
       }
@@ -220,19 +219,6 @@ export function handleObservationMessage(message: ObservationMessage) {
           }),
         );
         break;
-      case ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT:
-        store.dispatch(
-          addAssistantObservation({
-            ...baseObservation,
-            observation: ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT,
-            extras: {
-              url: String(message.extras.url || ""),
-              screenshot: String(message.extras.screenshot || ""),
-              trigger_by_action: String(message.extras.trigger_by_action || ""),
-            },
-          }),
-        );
-        break;
       case "error":
         store.dispatch(
           addAssistantObservation({
diff --git a/frontend/src/types/action-type.tsx b/frontend/src/types/action-type.tsx
@@ -38,9 +38,6 @@ enum ActionType {
 
   // Changes the state of the agent, e.g. to paused or running
   CHANGE_AGENT_STATE = "change_agent_state",
-
-  // Take a screenshot of the browser
-  PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = "playwright_mcp_browser_screenshot",
 }
 
 export default ActionType;
diff --git a/frontend/src/types/core/base.ts b/frontend/src/types/core/base.ts
@@ -12,8 +12,7 @@ export type OpenHandsEventType =
   | "reject"
   | "think"
   | "finish"
-  | "error"
-  | "playwright_mcp_browser_screenshot";
+  | "error";
 
 interface OpenHandsBaseEvent {
   id: number;
diff --git a/frontend/src/types/core/observations.ts b/frontend/src/types/core/observations.ts
@@ -110,15 +110,7 @@ export interface AgentThinkObservation
   };
 }
 
-export interface PlaywrightMcpBrowserScreenshotObservation
-  extends OpenHandsObservationEvent<ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT> {
-  source: "agent";
-  extras: {
-    url: string;
-    screenshot: string;
-    trigger_by_action: string;
-  };
-}
+
 export type OpenHandsObservation =
   | AgentStateChangeObservation
   | AgentThinkObservation
@@ -130,5 +122,4 @@ export type OpenHandsObservation =
   | WriteObservation
   | ReadObservation
   | EditObservation
-  | ErrorObservation
-  | PlaywrightMcpBrowserScreenshotObservation;
+  | ErrorObservation;
diff --git a/frontend/src/types/observation-type.tsx b/frontend/src/types/observation-type.tsx
@@ -11,8 +11,6 @@ enum ObservationType {
   // Interactive browsing
   BROWSE_INTERACTIVE = "browse_interactive",
 
-  PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot',
-
   // The output of a command
   RUN = "run",
 
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
@@ -205,7 +205,7 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                 action = McpAction(
                     name=tool_call.function.name, arguments=tool_call.function.arguments
                 )
-                action.set_hard_timeout(120)
+                # action.set_hard_timeout(120)
                 logger.warning(f'MCP action in function_calling.py: {action}')
 
             # We only add thought to the first action
diff --git a/openhands/agenthub/codeact_agent/prompts/system_prompt copy.j2 b/openhands/agenthub/codeact_agent/prompts/system_prompt copy.j2
@@ -0,0 +1,64 @@
+You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
+
+<ROLE>
+Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
+* If the user asks a question, like "why is X happening", don't try to fix the problem. Just give an answer to the question.
+</ROLE>
+
+<EFFICIENCY>
+* Each action you take is somewhat expensive. Wherever possible, combine multiple actions into a single action, e.g. combine multiple bash commands into one, using sed and grep to edit/view multiple files at once.
+* When exploring the codebase, use efficient tools like find, grep, and git commands with appropriate filters to minimize unnecessary operations.
+</EFFICIENCY>
+
+<FILE_SYSTEM_GUIDELINES>
+* When a user provides a file path, do NOT assume it's relative to the current working directory. First explore the file system to locate the file before working on it.
+* If asked to edit a file, edit the file directly, rather than creating a new file with a different filename.
+* For global search-and-replace operations, consider using `sed` instead of opening file editors multiple times.
+</FILE_SYSTEM_GUIDELINES>
+
+<CODE_QUALITY>
+* Write clean, efficient code with minimal comments. Avoid redundancy in comments: Do not repeat information that can be easily inferred from the code itself.
+* When implementing solutions, focus on making the minimal changes needed to solve the problem.
+* Before implementing any changes, first thoroughly understand the codebase through exploration.
+* If you are adding a lot of code to a function or file, consider splitting the function or file into smaller pieces when appropriate.
+</CODE_QUALITY>
+
+<VERSION_CONTROL>
+* When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
+* Exercise caution with git operations. Do NOT make potentially dangerous changes (e.g., pushing to main, deleting repositories) unless explicitly asked to do so.
+* When committing changes, use `git status` to see all modified files, and stage all files necessary for the commit. Use `git commit -a` whenever possible.
+</VERSION_CONTROL>
+
+<PROBLEM_SOLVING_WORKFLOW>
+1. EXPLORATION: Thoroughly explore relevant files and understand the context before proposing solutions
+2. ANALYSIS: Consider multiple approaches and select the most promising one
+3. TESTING:
+   * For bug fixes: Create tests to verify issues before implementing fixes
+   * For new features: Consider test-driven development when appropriate
+   * If the repository lacks testing infrastructure and implementing tests would require extensive setup, consult with the user before investing time in building testing infrastructure
+4. IMPLEMENTATION: Make focused, minimal changes to address the problem
+5. VERIFICATION: Test your implementation thoroughly, including edge cases
+</PROBLEM_SOLVING_WORKFLOW>
+
+<SECURITY>
+* Only use GITHUB_TOKEN and other credentials in ways the user has explicitly requested and would expect.
+* Use APIs to work with GitHub or other platforms, unless the user asks otherwise or your task requires browsing.
+</SECURITY>
+
+<ENVIRONMENT_SETUP>
+* When user asks you to run an application, don't stop if the application is not installed. Instead, please install the application and run the command again.
+* If you encounter missing dependencies:
+  1. First, look around in the repository for existing dependency files (requirements.txt, pyproject.toml, package.json, Gemfile, etc.)
+  2. If dependency files exist, use them to install all dependencies at once (e.g., `pip install -r requirements.txt`, `npm install`, etc.)
+  3. Only install individual packages directly if no dependency files are found or if only specific packages are needed
+* Similarly, if you encounter missing dependencies for essential tools requested by the user, install them when possible.
+</ENVIRONMENT_SETUP>
+
+<TROUBLESHOOTING>
+* If you've made repeated attempts to solve a problem but tests still fail or the user reports it's still broken:
+  1. Step back and reflect on 5-7 different possible sources of the problem
+  2. Assess the likelihood of each possible cause
+  3. Methodically address the most likely causes, starting with the highest probability
+  4. Document your reasoning process
+* When you run into any major issue while executing a plan from the user, please don't try to directly work around it. Instead, propose a new plan and confirm with the user before proceeding.
+</TROUBLESHOOTING>
diff --git a/openhands/agenthub/codeact_agent/prompts/system_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/system_prompt.j2
@@ -1,4 +1,4 @@
-You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.
+You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
 
 <ROLE>
 Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
diff --git a/openhands/agenthub/codeact_agent/prompts/user_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/user_prompt.j2
@@ -1,5 +0,0 @@
-You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.
-
-<PROBLEM_SOLVING_WORKFLOW>
-Take screenshot of important actions you take related to web-browsing.
-</PROBLEM_SOLVING_WORKFLOW>
diff --git a/openhands/core/schema/observation.py b/openhands/core/schema/observation.py
@@ -52,7 +52,3 @@ class ObservationType(str, Enum):
 
     MCP = 'mcp'
     """Result of a MCP Server operation"""
-
-    PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot'
-    """Result of a Playwright MCP Browser Screenshot operation. The response is a base64 encoded string of the screenshot, which should be streamed to the client using the correct format matching
-    browsergym's screenshot format."""
diff --git a/openhands/events/observation/__init__.py b/openhands/events/observation/__init__.py
@@ -45,5 +45,4 @@
     'RecallObservation',
     'RecallType',
     'MCPObservation',
-    'PlaywrightMcpBrowserScreenshotObservation',
 ]
diff --git a/openhands/events/observation/playwright_mcp.py b/openhands/events/observation/playwright_mcp.py
diff --git a/openhands/events/serialization/observation.py b/openhands/events/serialization/observation.py
@@ -27,9 +27,6 @@
 )
 from openhands.events.observation.mcp import MCPObservation
 from openhands.events.observation.observation import Observation
-from openhands.events.observation.playwright_mcp import (
-    PlaywrightMcpBrowserScreenshotObservation,
-)
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
 
@@ -50,7 +47,6 @@
     AgentThinkObservation,
     RecallObservation,
     MCPObservation,
-    PlaywrightMcpBrowserScreenshotObservation,
 )
 
 OBSERVATION_TYPE_TO_CLASS = {
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
@@ -1,5 +1,4 @@
 from typing import Generator
-import json
 
 from litellm import ModelResponse
 
@@ -40,9 +39,6 @@
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.mcp import MCPObservation
 from openhands.events.observation.observation import Observation
-from openhands.events.observation.playwright_mcp import (
-    PlaywrightMcpBrowserScreenshotObservation,
-)
 from openhands.events.serialization.event import truncate_content
 from openhands.utils.prompt import PromptManager, RepositoryInfo, RuntimeInfo
 
@@ -337,22 +333,6 @@ def _process_observation(
         elif isinstance(obs, MCPObservation):
             # logger.warning(f'MCPObservation: {obs}')
             message = Message(role='assistant', content=[TextContent(text=obs.content)])
-        elif isinstance(obs, PlaywrightMcpBrowserScreenshotObservation):
-            text = 'Image: Current webpage screenshot\n'
-            screenshot_content = json.loads(obs.content)
-            logger.debug(
-                f'screenshot_content in conversation_memory: {screenshot_content}'
-            )
-            if 'url' in screenshot_content:
-                text += f'URL: {screenshot_content["url"]}\n'
-
-            # We don't actually need to screenshot fed into the LLM. We can use snapshots. Meanwhile, the screenshot will be streamed to the user.
-            message = Message(
-                role='assistant',
-                content=[
-                    TextContent(text=text),
-                ],
-            )
         elif isinstance(obs, IPythonRunCellObservation):
             text = obs.content
             # replace base64 images with a placeholder
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
@@ -24,7 +24,6 @@
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
 from fastapi.security import APIKeyHeader
-from mcp.types import ImageContent
 from openhands_aci.editor.editor import OHEditor
 from openhands_aci.editor.exceptions import ToolError
 from openhands_aci.editor.results import ToolResult
@@ -59,11 +58,7 @@
     Observation,
 )
 from openhands.events.observation.mcp import MCPObservation
-from openhands.events.observation.playwright_mcp import (
-    PlaywrightMcpBrowserScreenshotObservation,
-)
 from openhands.events.serialization import event_from_dict, event_to_dict
-from openhands.mcp.mcp_base import ToolResult as MCPToolResult
 from openhands.runtime.browser import browse
 from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
@@ -559,32 +554,8 @@ async def call_tool_mcp(self, action: McpAction) -> Observation:
         for agent in mcp_agents:
             await agent.cleanup()
 
-        # special case for browser screenshot of playwright_mcp
-        if action.name == 'browser_screenshot':
-            return self.playwright_mcp_browser_screenshot(action, response)
-
         return MCPObservation(content=f'MCP result:{response}')
 
-    def playwright_mcp_browser_screenshot(
-        self, action: McpAction, response: MCPToolResult
-    ) -> Observation:
-        # example response:
-        """
-        {
-            "type": "image",
-            "data": "image/jpeg;base64,/9j/4AA...",
-            "mimeType": "image/jpeg",
-            "url": "https://www.google.com"
-        }
-        """
-        screenshot_content: ImageContent = response.output
-        return PlaywrightMcpBrowserScreenshotObservation(
-            content=f'{response}',
-            url=screenshot_content.url if screenshot_content.url is not None else '',
-            trigger_by_action=action.name,
-            screenshot=f'data:image/png;base64,{screenshot_content.data}',
-        )
-
     def close(self):
         self.memory_monitor.stop_monitoring()
         if self.bash_session is not None:
diff --git a/openhands/server/session/agent_session.py b/openhands/server/session/agent_session.py
diff --git a/openhands/server/session/session.py b/openhands/server/session/session.py

Original file line number	Diff line number	Diff line change
`@@ -38,9 +38,6 @@ enum ActionType {`
`38`	`38`
`39`	`39`	`// Changes the state of the agent, e.g. to paused or running`
`40`	`40`	`CHANGE_AGENT_STATE = "change_agent_state",`
`41`		`-`
`42`		`- // Take a screenshot of the browser`
`43`		`- PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = "playwright_mcp_browser_screenshot",`
`44`	`41`	`}`
`45`	`42`
`46`	`43`	`export default ActionType;`
Original file line number	Diff line number	Diff line change
`@@ -205,7 +205,7 @@ def response_to_actions(response: ModelResponse) -> list[Action]:`
`205`	`205`	`action = McpAction(`
`206`	`206`	`name=tool_call.function.name, arguments=tool_call.function.arguments`
`207`	`207`	`)`
`208`		`- action.set_hard_timeout(120)`
	`208`	`+ # action.set_hard_timeout(120)`
`209`	`209`	`logger.warning(f'MCP action in function_calling.py: {action}')`
`210`	`210`
`211`	`211`	`# We only add thought to the first action`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.`
	`1`	`+You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.`
`2`	`2`
`3`	`3`	`<ROLE>`
`4`	`4`	`Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.`
Original file line number	Diff line number	Diff line change
`@@ -45,5 +45,4 @@`
`45`	`45`	`'RecallObservation',`
`46`	`46`	`'RecallType',`
`47`	`47`	`'MCPObservation',`
`48`		`- 'PlaywrightMcpBrowserScreenshotObservation',`
`49`	`48`	`]`