@@ -48,14 +48,16 @@ async def get_next_action(model, messages, objective, session_id):
48
48
if model == "gpt-4-with-ocr" :
49
49
operation = await call_gpt_4_vision_preview_ocr (messages , objective , model )
50
50
return operation , None
51
- elif model == "agent-1" :
51
+ if model == "agent-1" :
52
52
return "coming soon"
53
- elif model == "gemini-pro-vision" :
53
+ if model == "gemini-pro-vision" :
54
54
return call_gemini_pro_vision (messages , objective ), None
55
- elif model == "llava" :
56
- operation = call_ollama_llava (messages ), None
57
- return operation
58
-
55
+ if model == "llava" :
56
+ operation = call_ollama_llava (messages )
57
+ return operation , None
58
+ if model == "claude-3-with-ocr" :
59
+ operation = await call_claude_3_with_ocr (messages , objective , model )
60
+ return operation , None
59
61
raise ModelNotRecognizedException (model )
60
62
61
63
@@ -261,7 +263,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
261
263
result = reader .readtext (screenshot_filename )
262
264
263
265
text_element_index = get_text_element (
264
- result , text_to_click , screenshot_filename
266
+ result , text_to_click [: 3 ] , screenshot_filename
265
267
)
266
268
coordinates = get_text_coordinates (
267
269
result , text_element_index , screenshot_filename
@@ -528,6 +530,159 @@ def call_ollama_llava(messages):
528
530
return call_ollama_llava (messages )
529
531
530
532
533
async def call_claude_3_with_ocr(messages, objective, model):
    """Ask Claude 3 Opus for the next operation(s) based on a screenshot.

    Captures the screen, downsizes it to a 2560px-wide PNG, sends it with a
    user prompt to the Anthropic Messages API, parses the JSON list of
    operations the model returns, and resolves each "click" operation's
    target text to screen coordinates via EasyOCR.

    Args:
        messages: Conversation history. ``messages[0]`` must hold the system
            prompt (the Anthropic API takes it as a separate argument). The
            vision message and the assistant reply are appended in place.
        objective: The session objective, forwarded to confirm_system_prompt.
        model: Model label, used for prompt confirmation and log output.

    Returns:
        list: Operation dicts from the model; "click" operations gain
        ``"x"`` and ``"y"`` coordinate keys.

    Raises:
        Exception: Wraps any failure in the pipeline (API, JSON, OCR).
    """
    if config.verbose:
        print("[call_claude_3_with_ocr]")

    try:
        # Brief pause before hitting the API. Must be awaited: the original
        # used time.sleep(1), which blocks the whole asyncio event loop
        # inside this coroutine.
        import asyncio

        await asyncio.sleep(1)
        client = config.initialize_anthropic()

        confirm_system_prompt(messages, objective, model)
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img = Image.open(img_file)

            # Calculate the new dimensions while maintaining the aspect ratio
            original_width, original_height = img.size
            aspect_ratio = original_width / original_height
            new_width = 2560  # Adjust this value to achieve the desired file size
            new_height = int(new_width / aspect_ratio)

            # Resize the image (done inside the `with` so the source file is
            # still open while PIL lazily reads pixel data).
            img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # Save the resized image to a BytesIO object
            img_buffer = io.BytesIO()
            img_resized.save(img_buffer, format='PNG')
            img_buffer.seek(0)

            # Encode the resized image as base64
            img_data = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": img_data,
                    },
                },
                {"type": "text", "text": user_prompt + "**REMEMBER** Only output json format, do not append any other text."},
            ],
        }
        messages.append(vision_message)

        # anthropic api expect system prompt as an separate argument
        response = client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=3000,
            system=messages[0]["content"],
            messages=messages[1:],
        )

        content = response.content[0].text
        content = clean_json(content)
        content_str = content
        try:
            content = json.loads(content)
        except json.JSONDecodeError as e:
            if config.verbose:
                print(
                    f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] JSONDecodeError: {e} {ANSI_RESET}"
                )
            # One repair round-trip: ask the model to fix its own invalid JSON.
            response = client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=3000,
                system=f"This json string is not valid, when using with json.loads(content) \
                it throws the following error: {e}, return correct json string. **REMEMBER** Only output json format, do not append any other text.",
                messages=[{"role": "user", "content": content}],
            )
            content = response.content[0].text
            content = clean_json(content)
            content_str = content
            content = json.loads(content)

        if config.verbose:
            print(
                f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] content: {content} {ANSI_RESET}"
            )
        processed_content = []

        for operation in content:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_claude_3_ocr][click] text_to_click",
                        text_to_click,
                    )
                # Initialize EasyOCR Reader
                reader = easyocr.Reader(["en"])

                # Read the screenshot
                result = reader.readtext(screenshot_filename)

                # Match on the first 3 characters only, consistent with the
                # GPT-4 OCR path above.
                text_element_index = get_text_element(
                    result, text_to_click[:3], screenshot_filename
                )
                coordinates = get_text_coordinates(
                    result, text_element_index, screenshot_filename
                )

                # add `coordinates`` to `content`
                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_claude_3_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_claude_3_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_claude_3_ocr][click] final operation",
                        operation,
                    )
                processed_content.append(operation)

            else:
                processed_content.append(operation)

        # Record the (raw string) assistant reply in the running conversation.
        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        # Preserve the cause chain for debugging; callers catching Exception
        # are unaffected.
        raise Exception(e) from e
        # return gpt_4_fallback(messages, objective, model)
531
686
def get_last_assistant_message (messages ):
532
687
"""
533
688
Retrieve the last message from the assistant in the messages array.
0 commit comments