Skip to content

Commit 93ea7a2

Browse files
committed
fix: initialize mcp tools in session w simpler code
chore: revert original prompts codeact chore: re-add system prompt chore: remove playwright mcp custom
1 parent a1de26e commit 93ea7a2

File tree

18 files changed

+84
-193
lines changed

18 files changed

+84
-193
lines changed

frontend/src/services/actions.ts

-5
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,6 @@ const messageActions = {
3131
store.dispatch(addAssistantMessage(message.message));
3232
}
3333
},
34-
[ActionType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT]: (message: ActionMessage) => {
35-
if (!message.args.thought && message.message) {
36-
store.dispatch(addAssistantMessage(message.message));
37-
}
38-
},
3934
[ActionType.WRITE]: (message: ActionMessage) => {
4035
const { path, content } = message.args;
4136
store.dispatch(setActiveFilepath(path));

frontend/src/services/observations.ts

-14
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ export function handleObservationMessage(message: ObservationMessage) {
3131
break;
3232
case ObservationType.BROWSE:
3333
case ObservationType.BROWSE_INTERACTIVE:
34-
case ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT:
3534
if (message.extras?.screenshot) {
3635
store.dispatch(setScreenshotSrc(message.extras?.screenshot));
3736
}
@@ -220,19 +219,6 @@ export function handleObservationMessage(message: ObservationMessage) {
220219
}),
221220
);
222221
break;
223-
case ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT:
224-
store.dispatch(
225-
addAssistantObservation({
226-
...baseObservation,
227-
observation: ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT,
228-
extras: {
229-
url: String(message.extras.url || ""),
230-
screenshot: String(message.extras.screenshot || ""),
231-
trigger_by_action: String(message.extras.trigger_by_action || ""),
232-
},
233-
}),
234-
);
235-
break;
236222
case "error":
237223
store.dispatch(
238224
addAssistantObservation({

frontend/src/types/action-type.tsx

-3
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,6 @@ enum ActionType {
3838

3939
// Changes the state of the agent, e.g. to paused or running
4040
CHANGE_AGENT_STATE = "change_agent_state",
41-
42-
// Take a screenshot of the browser
43-
PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = "playwright_mcp_browser_screenshot",
4441
}
4542

4643
export default ActionType;

frontend/src/types/core/base.ts

+1-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@ export type OpenHandsEventType =
1212
| "reject"
1313
| "think"
1414
| "finish"
15-
| "error"
16-
| "playwright_mcp_browser_screenshot";
15+
| "error";
1716

1817
interface OpenHandsBaseEvent {
1918
id: number;

frontend/src/types/core/observations.ts

+2-11
Original file line numberDiff line numberDiff line change
@@ -110,15 +110,7 @@ export interface AgentThinkObservation
110110
};
111111
}
112112

113-
export interface PlaywrightMcpBrowserScreenshotObservation
114-
extends OpenHandsObservationEvent<ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT> {
115-
source: "agent";
116-
extras: {
117-
url: string;
118-
screenshot: string;
119-
trigger_by_action: string;
120-
};
121-
}
113+
122114
export type OpenHandsObservation =
123115
| AgentStateChangeObservation
124116
| AgentThinkObservation
@@ -130,5 +122,4 @@ export type OpenHandsObservation =
130122
| WriteObservation
131123
| ReadObservation
132124
| EditObservation
133-
| ErrorObservation
134-
| PlaywrightMcpBrowserScreenshotObservation;
125+
| ErrorObservation;

frontend/src/types/observation-type.tsx

-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ enum ObservationType {
1111
// Interactive browsing
1212
BROWSE_INTERACTIVE = "browse_interactive",
1313

14-
PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot',
15-
1614
// The output of a command
1715
RUN = "run",
1816

openhands/agenthub/codeact_agent/function_calling.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
205205
action = McpAction(
206206
name=tool_call.function.name, arguments=tool_call.function.arguments
207207
)
208-
action.set_hard_timeout(120)
208+
# action.set_hard_timeout(120)
209209
logger.warning(f'MCP action in function_calling.py: {action}')
210210

211211
# We only add thought to the first action
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
2+
3+
<ROLE>
4+
Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
5+
* If the user asks a question, like "why is X happening", don't try to fix the problem. Just give an answer to the question.
6+
</ROLE>
7+
8+
<EFFICIENCY>
9+
* Each action you take is somewhat expensive. Wherever possible, combine multiple actions into a single action, e.g. combine multiple bash commands into one, using sed and grep to edit/view multiple files at once.
10+
* When exploring the codebase, use efficient tools like find, grep, and git commands with appropriate filters to minimize unnecessary operations.
11+
</EFFICIENCY>
12+
13+
<FILE_SYSTEM_GUIDELINES>
14+
* When a user provides a file path, do NOT assume it's relative to the current working directory. First explore the file system to locate the file before working on it.
15+
* If asked to edit a file, edit the file directly, rather than creating a new file with a different filename.
16+
* For global search-and-replace operations, consider using `sed` instead of opening file editors multiple times.
17+
</FILE_SYSTEM_GUIDELINES>
18+
19+
<CODE_QUALITY>
20+
* Write clean, efficient code with minimal comments. Avoid redundancy in comments: Do not repeat information that can be easily inferred from the code itself.
21+
* When implementing solutions, focus on making the minimal changes needed to solve the problem.
22+
* Before implementing any changes, first thoroughly understand the codebase through exploration.
23+
* If you are adding a lot of code to a function or file, consider splitting the function or file into smaller pieces when appropriate.
24+
</CODE_QUALITY>
25+
26+
<VERSION_CONTROL>
27+
* When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
28+
* Exercise caution with git operations. Do NOT make potentially dangerous changes (e.g., pushing to main, deleting repositories) unless explicitly asked to do so.
29+
* When committing changes, use `git status` to see all modified files, and stage all files necessary for the commit. Use `git commit -a` whenever possible.
30+
</VERSION_CONTROL>
31+
32+
<PROBLEM_SOLVING_WORKFLOW>
33+
1. EXPLORATION: Thoroughly explore relevant files and understand the context before proposing solutions
34+
2. ANALYSIS: Consider multiple approaches and select the most promising one
35+
3. TESTING:
36+
* For bug fixes: Create tests to verify issues before implementing fixes
37+
* For new features: Consider test-driven development when appropriate
38+
* If the repository lacks testing infrastructure and implementing tests would require extensive setup, consult with the user before investing time in building testing infrastructure
39+
4. IMPLEMENTATION: Make focused, minimal changes to address the problem
40+
5. VERIFICATION: Test your implementation thoroughly, including edge cases
41+
</PROBLEM_SOLVING_WORKFLOW>
42+
43+
<SECURITY>
44+
* Only use GITHUB_TOKEN and other credentials in ways the user has explicitly requested and would expect.
45+
* Use APIs to work with GitHub or other platforms, unless the user asks otherwise or your task requires browsing.
46+
</SECURITY>
47+
48+
<ENVIRONMENT_SETUP>
49+
* When the user asks you to run an application, don't stop if the application is not installed. Instead, please install the application and run the command again.
50+
* If you encounter missing dependencies:
51+
1. First, look around in the repository for existing dependency files (requirements.txt, pyproject.toml, package.json, Gemfile, etc.)
52+
2. If dependency files exist, use them to install all dependencies at once (e.g., `pip install -r requirements.txt`, `npm install`, etc.)
53+
3. Only install individual packages directly if no dependency files are found or if only specific packages are needed
54+
* Similarly, if you encounter missing dependencies for essential tools requested by the user, install them when possible.
55+
</ENVIRONMENT_SETUP>
56+
57+
<TROUBLESHOOTING>
58+
* If you've made repeated attempts to solve a problem but tests still fail or the user reports it's still broken:
59+
1. Step back and reflect on 5-7 different possible sources of the problem
60+
2. Assess the likelihood of each possible cause
61+
3. Methodically address the most likely causes, starting with the highest probability
62+
4. Document your reasoning process
63+
* When you run into any major issue while executing a plan from the user, please don't try to directly work around it. Instead, propose a new plan and confirm with the user before proceeding.
64+
</TROUBLESHOOTING>

openhands/agenthub/codeact_agent/prompts/system_prompt.j2

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.
1+
You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
22

33
<ROLE>
44
Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
Original file line numberDiff line numberDiff line change
@@ -1,5 +0,0 @@
1-
You are Thesis Capsule agent, a helpful AI assistant that can interact with a computer to solve tasks.
2-
3-
<PROBLEM_SOLVING_WORKFLOW>
4-
Take screenshot of important actions you take related to web-browsing.
5-
</PROBLEM_SOLVING_WORKFLOW>

openhands/core/schema/observation.py

-4
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,3 @@ class ObservationType(str, Enum):
5252

5353
MCP = 'mcp'
5454
"""Result of a MCP Server operation"""
55-
56-
PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot'
57-
"""Result of a Playwright MCP Browser Screenshot operation. The response is a base64 encoded string of the screenshot, which should be streamed to the client using the correct format matching
58-
browsergym's screenshot format."""

openhands/events/observation/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,4 @@
4545
'RecallObservation',
4646
'RecallType',
4747
'MCPObservation',
48-
'PlaywrightMcpBrowserScreenshotObservation',
4948
]

openhands/events/observation/playwright_mcp.py

-22
This file was deleted.

openhands/events/serialization/observation.py

-4
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,6 @@
2727
)
2828
from openhands.events.observation.mcp import MCPObservation
2929
from openhands.events.observation.observation import Observation
30-
from openhands.events.observation.playwright_mcp import (
31-
PlaywrightMcpBrowserScreenshotObservation,
32-
)
3330
from openhands.events.observation.reject import UserRejectObservation
3431
from openhands.events.observation.success import SuccessObservation
3532

@@ -50,7 +47,6 @@
5047
AgentThinkObservation,
5148
RecallObservation,
5249
MCPObservation,
53-
PlaywrightMcpBrowserScreenshotObservation,
5450
)
5551

5652
OBSERVATION_TYPE_TO_CLASS = {

openhands/memory/conversation_memory.py

-20
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from typing import Generator
2-
import json
32

43
from litellm import ModelResponse
54

@@ -40,9 +39,6 @@
4039
from openhands.events.observation.error import ErrorObservation
4140
from openhands.events.observation.mcp import MCPObservation
4241
from openhands.events.observation.observation import Observation
43-
from openhands.events.observation.playwright_mcp import (
44-
PlaywrightMcpBrowserScreenshotObservation,
45-
)
4642
from openhands.events.serialization.event import truncate_content
4743
from openhands.utils.prompt import PromptManager, RepositoryInfo, RuntimeInfo
4844

@@ -337,22 +333,6 @@ def _process_observation(
337333
elif isinstance(obs, MCPObservation):
338334
# logger.warning(f'MCPObservation: {obs}')
339335
message = Message(role='assistant', content=[TextContent(text=obs.content)])
340-
elif isinstance(obs, PlaywrightMcpBrowserScreenshotObservation):
341-
text = 'Image: Current webpage screenshot\n'
342-
screenshot_content = json.loads(obs.content)
343-
logger.debug(
344-
f'screenshot_content in conversation_memory: {screenshot_content}'
345-
)
346-
if 'url' in screenshot_content:
347-
text += f'URL: {screenshot_content["url"]}\n'
348-
349-
# We don't actually need to screenshot fed into the LLM. We can use snapshots. Meanwhile, the screenshot will be streamed to the user.
350-
message = Message(
351-
role='assistant',
352-
content=[
353-
TextContent(text=text),
354-
],
355-
)
356336
elif isinstance(obs, IPythonRunCellObservation):
357337
text = obs.content
358338
# replace base64 images with a placeholder

openhands/runtime/action_execution_server.py

-29
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
from fastapi.exceptions import RequestValidationError
2525
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
2626
from fastapi.security import APIKeyHeader
27-
from mcp.types import ImageContent
2827
from openhands_aci.editor.editor import OHEditor
2928
from openhands_aci.editor.exceptions import ToolError
3029
from openhands_aci.editor.results import ToolResult
@@ -59,11 +58,7 @@
5958
Observation,
6059
)
6160
from openhands.events.observation.mcp import MCPObservation
62-
from openhands.events.observation.playwright_mcp import (
63-
PlaywrightMcpBrowserScreenshotObservation,
64-
)
6561
from openhands.events.serialization import event_from_dict, event_to_dict
66-
from openhands.mcp.mcp_base import ToolResult as MCPToolResult
6762
from openhands.runtime.browser import browse
6863
from openhands.runtime.browser.browser_env import BrowserEnv
6964
from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
@@ -559,32 +554,8 @@ async def call_tool_mcp(self, action: McpAction) -> Observation:
559554
for agent in mcp_agents:
560555
await agent.cleanup()
561556

562-
# special case for browser screenshot of playwright_mcp
563-
if action.name == 'browser_screenshot':
564-
return self.playwright_mcp_browser_screenshot(action, response)
565-
566557
return MCPObservation(content=f'MCP result:{response}')
567558

568-
def playwright_mcp_browser_screenshot(
569-
self, action: McpAction, response: MCPToolResult
570-
) -> Observation:
571-
# example response:
572-
"""
573-
{
574-
"type": "image",
575-
"data": "image/jpeg;base64,/9j/4AA...",
576-
"mimeType": "image/jpeg",
577-
"url": "https://www.google.com"
578-
}
579-
"""
580-
screenshot_content: ImageContent = response.output
581-
return PlaywrightMcpBrowserScreenshotObservation(
582-
content=f'{response}',
583-
url=screenshot_content.url if screenshot_content.url is not None else '',
584-
trigger_by_action=action.name,
585-
screenshot=f'data:image/png;base64,{screenshot_content.data}',
586-
)
587-
588559
def close(self):
589560
self.memory_monitor.stop_monitoring()
590561
if self.bash_session is not None:

0 commit comments

Comments
 (0)