@@ -48,14 +48,16 @@ async def get_next_action(model, messages, objective, session_id):
48
48
if model == "gpt-4-with-ocr" :
49
49
operation = await call_gpt_4_vision_preview_ocr (messages , objective , model )
50
50
return operation , None
51
- elif model == "agent-1" :
51
+ if model == "agent-1" :
52
52
return "coming soon"
53
- elif model == "gemini-pro-vision" :
53
+ if model == "gemini-pro-vision" :
54
54
return call_gemini_pro_vision (messages , objective ), None
55
- elif model == "llava" :
56
- operation = call_ollama_llava (messages ), None
57
- return operation
58
-
55
+ if model == "llava" :
56
+ operation = call_ollama_llava (messages )
57
+ return operation , None
58
+ if model == "claude-3-with-ocr" :
59
+ operation = await call_claude_3_with_ocr (messages , objective , model )
60
+ return operation , None
59
61
raise ModelNotRecognizedException (model )
60
62
61
63
@@ -261,7 +263,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
261
263
result = reader .readtext (screenshot_filename )
262
264
263
265
text_element_index = get_text_element (
264
- result , text_to_click , screenshot_filename
266
+ result , text_to_click [: 3 ] , screenshot_filename
265
267
)
266
268
coordinates = get_text_coordinates (
267
269
result , text_element_index , screenshot_filename
@@ -528,6 +530,159 @@ def call_ollama_llava(messages):
528
530
return call_ollama_llava (messages )
529
531
530
532
533
async def call_claude_3_with_ocr(messages, objective, model):
    """Ask Claude 3 Opus for the next operation(s) based on a screenshot.

    Captures the screen, downsizes it to a 2560px-wide PNG, sends it with a
    user prompt to the Anthropic Messages API, parses the JSON list of
    operations the model returns, and resolves each "click" operation's
    target text to screen coordinates via EasyOCR.

    Args:
        messages: Conversation history. ``messages[0]`` must hold the system
            prompt (the Anthropic API takes it as a separate argument). The
            vision message and the assistant reply are appended in place.
        objective: The session objective, forwarded to confirm_system_prompt.
        model: Model label, used for prompt confirmation and log output.

    Returns:
        list: Operation dicts from the model; "click" operations gain
        ``"x"`` and ``"y"`` coordinate keys.

    Raises:
        Exception: Wraps any failure in the pipeline (API, JSON, OCR).
    """
    if config.verbose:
        print("[call_claude_3_with_ocr]")

    try:
        # Brief pause before hitting the API. Must be awaited: the original
        # used time.sleep(1), which blocks the whole asyncio event loop
        # inside this coroutine.
        import asyncio

        await asyncio.sleep(1)
        client = config.initialize_anthropic()

        confirm_system_prompt(messages, objective, model)
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img = Image.open(img_file)

            # Calculate the new dimensions while maintaining the aspect ratio
            original_width, original_height = img.size
            aspect_ratio = original_width / original_height
            new_width = 2560  # Adjust this value to achieve the desired file size
            new_height = int(new_width / aspect_ratio)

            # Resize the image (done inside the `with` so the source file is
            # still open while PIL lazily reads pixel data).
            img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # Save the resized image to a BytesIO object
            img_buffer = io.BytesIO()
            img_resized.save(img_buffer, format='PNG')
            img_buffer.seek(0)

            # Encode the resized image as base64
            img_data = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": img_data,
                    },
                },
                {"type": "text", "text": user_prompt + "**REMEMBER** Only output json format, do not append any other text."},
            ],
        }
        messages.append(vision_message)

        # anthropic api expect system prompt as an separate argument
        response = client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=3000,
            system=messages[0]["content"],
            messages=messages[1:],
        )

        content = response.content[0].text
        content = clean_json(content)
        content_str = content
        try:
            content = json.loads(content)
        except json.JSONDecodeError as e:
            if config.verbose:
                print(
                    f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] JSONDecodeError: {e} {ANSI_RESET}"
                )
            # One repair round-trip: ask the model to fix its own invalid JSON.
            response = client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=3000,
                system=f"This json string is not valid, when using with json.loads(content) \
                it throws the following error: {e}, return correct json string. **REMEMBER** Only output json format, do not append any other text.",
                messages=[{"role": "user", "content": content}],
            )
            content = response.content[0].text
            content = clean_json(content)
            content_str = content
            content = json.loads(content)

        if config.verbose:
            print(
                f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] content: {content} {ANSI_RESET}"
            )
        processed_content = []

        for operation in content:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_claude_3_ocr][click] text_to_click",
                        text_to_click,
                    )
                # Initialize EasyOCR Reader
                reader = easyocr.Reader(["en"])

                # Read the screenshot
                result = reader.readtext(screenshot_filename)

                # Match on the first 3 characters only, consistent with the
                # GPT-4 OCR path above.
                text_element_index = get_text_element(
                    result, text_to_click[:3], screenshot_filename
                )
                coordinates = get_text_coordinates(
                    result, text_element_index, screenshot_filename
                )

                # add `coordinates`` to `content`
                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_claude_3_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_claude_3_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_claude_3_ocr][click] final operation",
                        operation,
                    )
                processed_content.append(operation)

            else:
                processed_content.append(operation)

        # Record the (raw string) assistant reply in the running conversation.
        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        # Preserve the cause chain for debugging; callers catching Exception
        # are unaffected.
        raise Exception(e) from e
        # return gpt_4_fallback(messages, objective, model)
531
686
def get_last_assistant_message (messages ):
532
687
"""
533
688
Retrieve the last message from the assistant in the messages array.
0 commit comments