1
+ # dds cloudapi for DINO-X
2
+ from dds_cloudapi_sdk import Config
3
+ from dds_cloudapi_sdk import Client
4
+ from dds_cloudapi_sdk .tasks .dinox import DinoxTask
5
+ from dds_cloudapi_sdk .tasks .detection import DetectionTask
6
+ from dds_cloudapi_sdk import TextPrompt
7
+ from dds_cloudapi_sdk import DetectionModel
8
+ from dds_cloudapi_sdk import DetectionTarget
9
+
10
+ # using supervision for visualization
11
+ import cv2
12
+ import numpy as np
13
+ import supervision as sv
14
+ import os
15
+
16
+ """
17
+ Hyper Parameters
18
+ """
19
+ API_TOKEN = "Your API token"
20
+ VIDEO_PATH = "./assets/demo.mp4"
21
+ OUTPUT_PATH = "./annotated_demo_video.mp4"
22
+ TEXT_PROMPT = "wheel . eye . helmet . mouse . mouth . vehicle . steering wheel . ear . nose"
23
+
24
def process_video_with_dino_x():
    """Run DINO-X open-vocabulary detection on every frame of VIDEO_PATH and
    write an annotated copy to OUTPUT_PATH.

    Each frame is saved to a temporary JPEG, uploaded to the DDS cloud
    service, detected via a DinoxTask with TEXT_PROMPT, then annotated with
    supervision box/label annotators and appended to the output video.

    Returns:
        None. Side effects: writes OUTPUT_PATH, creates and removes a
        temporary frame file, prints progress/errors to stdout.
    """
    # Step 1: Initialize config and client
    config = Config(API_TOKEN)
    client = Client(config)

    # Map each prompt category to a stable integer id for supervision.
    # Strip BEFORE filtering so whitespace-only fragments are dropped too.
    classes = [x.strip().lower() for x in TEXT_PROMPT.split('.') if x.strip()]
    class_name_to_id = {name: idx for idx, name in enumerate(classes)}

    # Open video and fail fast if the path is wrong or the codec is missing.
    cap = cv2.VideoCapture(VIDEO_PATH)
    if not cap.isOpened():
        print(f"Error processing video: cannot open {VIDEO_PATH}")
        return

    # Get video properties; fall back to 30 fps when the container reports 0,
    # which would otherwise produce an unplayable output file.
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30

    # Initialize video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (width, height))

    # Annotators are loop-invariant renderers; create them once, not per frame.
    box_annotator = sv.BoxAnnotator()
    label_annotator = sv.LabelAnnotator()

    # Temporary frame for upload (the cloud API takes a URL, not raw pixels).
    temp_frame_path = "./temp_frame.jpg"

    try:
        # Process each frame
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Save current frame temporarily, then upload and run detection.
            cv2.imwrite(temp_frame_path, frame)
            image_url = client.upload_file(temp_frame_path)
            task = DinoxTask(
                image_url=image_url,
                prompts=[TextPrompt(text=TEXT_PROMPT)]
            )
            client.run_task(task)
            predictions = task.result.objects

            # Decode prediction results. Skip any category the model returns
            # that is not in the prompt — indexing class_name_to_id directly
            # would raise KeyError and abort the whole video.
            boxes = []
            confidences = []
            class_names = []
            class_ids = []
            for obj in predictions:
                cls_name = obj.category.lower().strip()
                cls_id = class_name_to_id.get(cls_name)
                if cls_id is None:
                    continue
                boxes.append(obj.bbox)
                confidences.append(obj.score)
                class_names.append(cls_name)
                class_ids.append(cls_id)

            annotated_frame = frame
            if boxes:
                # sv.Detections requires an (N, 4) array; an empty list would
                # become a 1-D array and crash, so annotate only when there
                # are detections and pass the raw frame through otherwise.
                detections = sv.Detections(
                    xyxy=np.array(boxes),
                    class_id=np.array(class_ids)
                )
                labels = [
                    f"{class_name} {confidence:.2f}"
                    for class_name, confidence
                    in zip(class_names, confidences)
                ]
                annotated_frame = box_annotator.annotate(
                    scene=frame.copy(), detections=detections
                )
                annotated_frame = label_annotator.annotate(
                    scene=annotated_frame,
                    detections=detections,
                    labels=labels
                )

            # Write (possibly un-)annotated frame
            out.write(annotated_frame)

    except Exception as e:
        # Best-effort: report and fall through to cleanup so the partial
        # output video is still finalized.
        print(f"Error processing video: {e}")

    finally:
        # Clean up resources even if a frame failed mid-stream.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

        # Remove temporary frame
        if os.path.exists(temp_frame_path):
            os.remove(temp_frame_path)

    print(f"Annotated video saved to {OUTPUT_PATH}")
124
+
125
def main():
    """Script entry point: annotate the configured demo video with DINO-X."""
    process_video_with_dino_x()


if __name__ == '__main__':
    main()
0 commit comments