From 0a85a1912b7cbd3d79bc4318013cf7651150e14a Mon Sep 17 00:00:00 2001
From: Hadrien
Date: Sun, 9 Feb 2025 11:53:29 +0100
Subject: [PATCH 1/5] add other ollama visual llms (to squash w/ first)

---
 operate/models/apis.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index d0ccb0c4..273b6046 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -50,8 +50,8 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     if model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
-    if model == "llava":
-        operation = call_ollama_llava(messages)
+    if model == "llava" or model == "llava:13b" or "bakllava" or "llava-llama3":
+        operation = call_ollama_llava(messages, model)
         return operation, None
     if model == "claude-3":
         operation = await call_claude_3_with_ocr(messages, objective, model)
@@ -558,9 +558,11 @@ async def call_gpt_4o_labeled(messages, objective, model):
         return call_gpt_4o(messages)
 
 
-def call_ollama_llava(messages):
+def call_ollama_llava(messages, model):
+    if model == "":
+        model = "llava"
     if config.verbose:
-        print("[call_ollama_llava]")
+        print(f"[call_ollama_llava] model {model}")
     time.sleep(1)
     try:
         model = config.initialize_ollama()
@@ -590,8 +592,8 @@ def call_ollama_llava(messages):
         }
         messages.append(vision_message)
 
-        response = model.chat(
-            model="llava",
+        response = ollama.chat(
+            model=model,
             messages=messages,
         )
 
@@ -633,7 +635,7 @@ def call_ollama_llava(messages):
         )
         if config.verbose:
             traceback.print_exc()
-        return call_ollama_llava(messages)
+        return call_ollama_llava(messages, model)
 
 
 async def call_claude_3_with_ocr(messages, objective, model):

From 3393989caa9bfad2e14f0ea7cbf063e130587f14 Mon Sep 17 00:00:00 2001
From: Hadrien
Date: Sun, 9 Feb 2025 11:53:29 +0100
Subject: [PATCH 2/5] exhaustive "llava" match

---
 operate/models/apis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 273b6046..b34975df 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -50,7 +50,7 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     if model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
-    if model == "llava" or model == "llava:13b" or "bakllava" or "llava-llama3":
+    if "llava" in model:
         operation = call_ollama_llava(messages, model)
         return operation, None
     if model == "claude-3":
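The reason PATCH 2 immediately rewrites the condition introduced in PATCH 1 is a Python truthiness pitfall: in an `or` chain, a bare non-empty string literal such as `"bakllava"` is always truthy, so the PATCH 1 branch matched every model name. A minimal sketch of the pitfall and the fix; the function names are invented for illustration, not taken from the codebase:

```python
def routes_to_ollama_patch1(model: str) -> bool:
    # PATCH 1 condition: the bare literals "bakllava" and "llava-llama3"
    # are truthy, so this evaluates True for ANY model name.
    return bool(model == "llava" or model == "llava:13b" or "bakllava" or "llava-llama3")


def routes_to_ollama_patch2(model: str) -> bool:
    # PATCH 2 condition: a substring test covers "llava", "llava:13b",
    # "bakllava", "llava-llama3", and future llava-based tags.
    return "llava" in model


assert routes_to_ollama_patch1("gpt-4o") is True    # wrongly matched
assert routes_to_ollama_patch2("gpt-4o") is False   # correctly rejected
assert routes_to_ollama_patch2("bakllava") is True
assert routes_to_ollama_patch2("llava-llama3") is True
```

The substring check also keeps `get_next_action` forward-compatible with new llava-derived tags without touching the routing again.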
From 77f3755970a9100dbfa6bd59714f194f3c21827b Mon Sep 17 00:00:00 2001
From: Hadrien
Date: Sun, 9 Feb 2025 11:53:30 +0100
Subject: [PATCH 3/5] logs

---
 operate/models/apis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index b34975df..4228669d 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -626,7 +626,7 @@ def call_ollama_llava(messages, model):
 
     except Exception as e:
         print(
-            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[llava] That did not work. Trying again {ANSI_RESET}",
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying again {ANSI_RESET}",
             e,
         )
         print(
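After PATCHes 1 through 3, `call_ollama_llava` amounts to appending a screenshot-bearing user message and forwarding the user-selected model name to `ollama.chat`. A stripped-down sketch of that call path; it assumes the `ollama` Python package is installed, `ollama serve` is running, and the screenshot path is hypothetical:

```python
import ollama  # official Ollama Python client


def ask_local_vision_model(model: str, prompt: str, screenshot_path: str) -> str:
    """Send one prompt plus one screenshot to a local Ollama vision model."""
    vision_message = {
        "role": "user",
        "content": prompt,
        # the `images` field accepts file paths or raw image bytes
        "images": [screenshot_path],
    }
    response = ollama.chat(
        model=model,  # e.g. "llava", "llava:13b", "bakllava", "llava-llama3"
        messages=[vision_message],
    )
    return response["message"]["content"]


# hypothetical usage; requires `ollama pull llava` and `ollama serve` first:
# print(ask_local_vision_model("llava", "Describe this screen", "screenshots/screenshot.png"))
```

Passing the model name through, rather than hard-coding `"llava"` as before, is what lets one code path drive llava:13b, bakllava, and llava-llama3 alike.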
From 8ffb5953947206bebaa29899396102512e2c98f9 Mon Sep 17 00:00:00 2001
From: Hadrien
Date: Sun, 9 Feb 2025 11:53:30 +0100
Subject: [PATCH 4/5] fix: by default, try it in ollama

---
 README.md              | 16 ++++++++--------
 operate/models/apis.py | 14 ++++----------
 2 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index ab24691c..0a74bda8 100644
--- a/README.md
+++ b/README.md
@@ -76,28 +76,28 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a
 operate -m claude-3
 ```
 
-#### Try LLaVa Hosted Through Ollama `-m llava`
-If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
+#### Try a Model Hosted Through Ollama `-m llama3.2-vision`
+If you wish to experiment with the Self-Operating Computer Framework using a local model such as LLaVA on your own machine, you can with Ollama!
 *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview*
 
 First, install Ollama on your machine from https://ollama.ai/download.
 
-Once Ollama is installed, pull the LLaVA model:
+Once Ollama is installed, pull a vision model:
 ```
-ollama pull llava
+ollama pull llama3.2-vision
 ```
 
 This will download the model on your machine which takes approximately 5 GB of storage.
 
-When Ollama has finished pulling LLaVA, start the server:
+When Ollama has finished pulling llama3.2-vision, start the server:
 ```
 ollama serve
 ```
 
-That's it! Now start `operate` and select the LLaVA model:
+That's it! Now start `operate` and select the model:
 ```
-operate -m llava
+operate -m llama3.2-vision
 ```
 
-**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
+**Important:** Error rates when using self-hosted models are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
 
 Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 4228669d..43493bcb 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -50,14 +50,11 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     if model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
-    if "llava" in model:
-        operation = call_ollama_llava(messages, model)
-        return operation, None
     if model == "claude-3":
         operation = await call_claude_3_with_ocr(messages, objective, model)
         return operation, None
-    raise ModelNotRecognizedException(model)
-
+    operation = call_ollama_llava(model, messages)
+    return operation, None
 
 def call_gpt_4o(messages):
@@ -557,10 +554,7 @@ async def call_gpt_4o_labeled(messages, objective, model):
         traceback.print_exc()
         return call_gpt_4o(messages)
 
-
-def call_ollama_llava(messages, model):
-    if model == "":
-        model = "llava"
+def call_ollama_llava(model, messages):
     if config.verbose:
         print(f"[call_ollama_llava] model {model}")
     time.sleep(1)
@@ -635,7 +629,7 @@ def call_ollama_llava(messages, model):
         )
         if config.verbose:
             traceback.print_exc()
-        return call_ollama_llava(messages, model)
+        return call_ollama_llava(model, messages)
 
 
 async def call_claude_3_with_ocr(messages, objective, model):

From ebc89df9d59c352332e790d42da3e36cb3355ac2 Mon Sep 17 00:00:00 2001
From: Hadrien
Date: Sun, 9 Feb 2025 11:53:30 +0100
Subject: [PATCH 5/5] chore: replace call_ollama_llava with call_ollama

---
 operate/models/apis.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 43493bcb..1b8a2fc4 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -53,7 +53,7 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "claude-3":
         operation = await call_claude_3_with_ocr(messages, objective, model)
         return operation, None
-    operation = call_ollama_llava(model, messages)
+    operation = call_ollama(model, messages)
     return operation, None
 
 def call_gpt_4o(messages):
@@ -554,9 +554,9 @@ async def call_gpt_4o_labeled(messages, objective, model):
         traceback.print_exc()
         return call_gpt_4o(messages)
 
-def call_ollama_llava(model, messages):
+def call_ollama(model, messages):
     if config.verbose:
-        print(f"[call_ollama_llava] model {model}")
+        print(f"[call_ollama] model {model}")
     time.sleep(1)
     try:
         model = config.initialize_ollama()
@@ -575,7 +575,7 @@ def call_ollama(model, messages):
 
         if config.verbose:
             print(
-                "[call_ollama_llava] user_prompt",
+                "[call_ollama] user_prompt",
                 user_prompt,
             )
 
@@ -603,7 +603,7 @@ def call_ollama(model, messages):
         assistant_message = {"role": "assistant", "content": content}
         if config.verbose:
             print(
-                "[call_ollama_llava] content",
+                "[call_ollama] content",
                 content,
             )
         content = json.loads(content)
@@ -629,7 +629,7 @@ def call_ollama(model, messages):
         )
         if config.verbose:
             traceback.print_exc()
-        return call_ollama_llava(model, messages)
+        return call_ollama(model, messages)
 
 
 async def call_claude_3_with_ocr(messages, objective, model):
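Taken together, PATCH 4 and PATCH 5 leave `get_next_action` with explicit branches for hosted models and an Ollama fallthrough in place of the old `raise ModelNotRecognizedException(model)`. A self-contained sketch of that final routing shape, with stubbed handlers and the async plumbing dropped for brevity; the real signatures live in operate/models/apis.py:

```python
# Stubs standing in for the real API-calling functions in apis.py.
def call_gemini_pro_vision(messages, objective):
    return {"backend": "gemini", "ops": []}  # stub

def call_claude_3_with_ocr(messages, objective, model):
    return {"backend": "claude", "ops": []}  # stub

def call_ollama(model, messages):
    return {"backend": "ollama", "model": model, "ops": []}  # stub


def get_next_action(model, messages, objective):
    if model == "gemini-pro-vision":
        return call_gemini_pro_vision(messages, objective), None
    if model == "claude-3":
        return call_claude_3_with_ocr(messages, objective, model), None
    # Default: hand any unrecognized name to Ollama, e.g. "llama3.2-vision".
    # A typo in -m now surfaces as an Ollama error rather than an early
    # ModelNotRecognizedException.
    return call_ollama(model, messages), None


print(get_next_action("llama3.2-vision", [], "open a browser"))
# -> ({'backend': 'ollama', 'model': 'llama3.2-vision', 'ops': []}, None)
```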