diff --git a/.gitignore b/.gitignore index 2df6ebfd..730a692e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,12 @@ **/__pycache__/** .vscode +.idea build/ **/build **/build/** + +target/ +**/target +**/target/** \ No newline at end of file diff --git a/models/object_tracking_vittrack/README.md b/models/object_tracking_vittrack/README.md index ad3f0a3e..c99e8c97 100644 --- a/models/object_tracking_vittrack/README.md +++ b/models/object_tracking_vittrack/README.md @@ -40,6 +40,21 @@ cmake --build build ./build/opencv_zoo_object_tracking_vittrack -h ``` +## Java + +Install Maven to get started. + +```shell +# tracking on camera input +mvn compile exec:java -q + +# tracking on video +mvn compile exec:java -q -Dexec.args="-i /path/to/video" + +# get help messages +mvn compile exec:java -q -Dexec.args="-h" +``` + # Example outputs <img src="example_outputs/vittrack_demo.gif" style="zoom:200%;" /> diff --git a/models/object_tracking_vittrack/demo.java b/models/object_tracking_vittrack/demo.java new file mode 100644 index 00000000..353c3f84 --- /dev/null +++ b/models/object_tracking_vittrack/demo.java @@ -0,0 +1,221 @@ +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.UnixStyleUsageFormatter; +import org.bytedeco.javacpp.BytePointer; +import org.bytedeco.opencv.global.opencv_dnn; +import org.bytedeco.opencv.opencv_core.*; +import org.bytedeco.opencv.opencv_video.TrackerVit; +import org.bytedeco.opencv.opencv_videoio.VideoCapture; +import org.bytedeco.opencv.opencv_videoio.VideoWriter; + +import static org.bytedeco.opencv.global.opencv_highgui.*; +import static org.bytedeco.opencv.global.opencv_imgproc.*; +import static org.bytedeco.opencv.global.opencv_videoio.CAP_PROP_FPS; + +/** + * Command-line demo that tracks a user-selected region of interest in a video file or camera stream with TrackerVit. + */ +public class demo { + + // Valid combinations of backends and targets + static int[][] backendTargetPairs = { + {opencv_dnn.DNN_BACKEND_OPENCV, opencv_dnn.DNN_TARGET_CPU}, + {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA}, +
{opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA_FP16}, + {opencv_dnn.DNN_BACKEND_TIMVX, opencv_dnn.DNN_TARGET_NPU}, + {opencv_dnn.DNN_BACKEND_CANN, opencv_dnn.DNN_TARGET_NPU} + }; + + /** Command-line options, populated by JCommander from the program arguments. */ + static class Args { + @Parameter(names = {"--help", "-h"}, order = 0, help = true, + description = "Print help message.") + boolean help; + @Parameter(names = {"--input", "-i"}, order = 1, + description = "Set path to the input video. Omit for using default camera.") + String input; + @Parameter(names = {"--model_path", "-m"}, order = 2, + description = "Set model path.") + String modelPath = "object_tracking_vittrack_2023sep.onnx"; + @Parameter(names = {"--backend_target", "-bt"}, order = 3, + description = "Choose one of the backend-target pair to run this demo:" + + " 0: OpenCV implementation + CPU," + + " 1: CUDA + GPU (CUDA), " + + " 2: CUDA + GPU (CUDA FP16)," + + " 3: TIM-VX + NPU," + + " 4: CANN + NPU") + int backendTarget = 0; + @Parameter(names = {"--save", "-s"}, order = 4, + description = "Specify to save a file with results.") + boolean save; + @Parameter(names = {"--vis", "-v"}, order = 5, arity = 1, + description = "Specify to open a new window to show results.") + boolean vis = true; + } + + /** Outcome of one tracker update: located flag, bounding box and tracking score. */ + static class TrackingResult { + boolean isLocated; + Rect bbox; + float score; + } + + /** Thin wrapper around a TrackerVit created from a model path, a backend id and a target id. */ + static class VitTrack { + private final TrackerVit model; + + VitTrack(String modelPath, int backendId, int targetId) { + final TrackerVit.Params params = new TrackerVit.Params(); + params.net(new BytePointer(modelPath)) + .backend(backendId) + .target(targetId); + model = TrackerVit.create(params); + } + + void init(Mat image, Rect roi) { + model.init(image, roi); + } + + TrackingResult infer(Mat image) { + final TrackingResult result = new TrackingResult(); + result.bbox = new Rect(); + result.isLocated = model.update(image, result.bbox); + result.score = model.getTrackingScore(); + return result; + } + } + + /** + * Returns a clone of the image annotated with the FPS (when fps >= 0) and either the box and its score + * (when isLocated and score >= 0.3) or a centered "Target lost!" message; the input image is not modified. + */ + static Mat visualize(Mat image, Rect bbox, float score, boolean isLocated,
double fps, Scalar boxColor, + Scalar textColor, double fontScale, int fontSize) { + final Mat output = image.clone(); + final int h = output.rows(); + final int w = output.cols(); + if (fps >= 0) { + putText(output, String.format("FPS: %.2f", fps), new Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale, + textColor); + } + + if (isLocated && score >= 0.3) { + rectangle(output, bbox, boxColor, 2, LINE_8, 0); + putText(output, String.format("%.2f", score), new Point(bbox.x(), bbox.y() + 25), + FONT_HERSHEY_DUPLEX, fontScale, textColor, fontSize, LINE_8, false); + } else { + final Size textSize = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, new int[]{0}); + final int textX = (w - textSize.width()) / 2; + final int textY = (h - textSize.height()) / 2; + putText(output, "Target lost!", new Point(textX, textY), FONT_HERSHEY_DUPLEX, + fontScale, new Scalar(0, 0, 255, 0), fontSize, LINE_8, false); + } + + return output; + } + + /** + * Parses the options, asks the user to select a ROI on the first frame, then tracks it frame by frame. + * Execute: mvn compile exec:java -q -Dexec.args="" + */ + public static void main(String[] argv) { + final Args args = new Args(); + final JCommander jc = JCommander.newBuilder() + .addObject(args) + .build(); + jc.setUsageFormatter(new UnixStyleUsageFormatter(jc)); + jc.parse(argv); + if (args.help) { + jc.usage(); + return; + } + // Reject indices outside the pair table instead of failing with ArrayIndexOutOfBoundsException + if (args.backendTarget < 0 || args.backendTarget >= backendTargetPairs.length) { + System.err.printf("Error: --backend_target must be between 0 and %d%n", backendTargetPairs.length - 1); + return; + } + final int backendId = backendTargetPairs[args.backendTarget][0]; + final int targetId = backendTargetPairs[args.backendTarget][1]; + VitTrack tracker = new VitTrack(args.modelPath, backendId, targetId); + + final VideoCapture video = new VideoCapture(); + if (args.input == null) { + video.open(0); + } else { + video.open(args.input); + } + if (!video.isOpened()) { + System.err.println("Error: Could not open video source"); + return; + } + + Mat firstFrame = new Mat(); + video.read(firstFrame); + + if (firstFrame.empty()) { + System.err.println("No frames grabbed!"); + return; + } + + Mat firstFrameCopy = firstFrame.clone(); + putText(firstFrameCopy, "1.
Drag a bounding box to track.", new Point(0, 25), FONT_HERSHEY_SIMPLEX, 1, new Scalar(0, 255, 0, 0)); + putText(firstFrameCopy, "2. Press ENTER to confirm", new Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, new Scalar(0, 255, 0, 0)); + final Rect roi = selectROI("VitTrack Demo", firstFrameCopy); + + if (roi.area() == 0) { + System.err.println("No ROI is selected! Exiting..."); + return; + } else { + System.out.printf("Selected ROI: (x: %d, y: %d, width: %d, height: %d)%n", roi.x(), roi.y(), roi.width(), + roi.height()); + } + + // Create VideoWriter if save option is specified + final VideoWriter outputVideo = new VideoWriter(); + if (args.save) { + final Size frameSize = firstFrame.size(); + outputVideo.open("output.mp4", VideoWriter.fourcc((byte) 'm', (byte) 'p', (byte) '4', (byte) 'v'), + video.get(CAP_PROP_FPS), frameSize); + if (!outputVideo.isOpened()) { + System.err.println("Error: Could not create output video stream"); + return; + } + } + + // Initialize tracker with ROI + tracker.init(firstFrame, roi); + + // Track frame by frame + final TickMeter tm = new TickMeter(); + while (waitKey(1) < 0) { + video.read(firstFrame); + if (firstFrame.empty()) { + System.out.println("End of video"); + break; + } + + // Inference + tm.start(); + final TrackingResult result = tracker.infer(firstFrame); + tm.stop(); + + // Visualize: visualize() already draws on its own clone, so the captured frame is passed directly + final Mat frame = visualize(firstFrame, result.bbox, result.score, result.isLocated, tm.getFPS(), + new Scalar(0, 255, 0, 0), new Scalar(0, 255, 0, 0), 1.0, 1); + + if (args.save) { + outputVideo.write(frame); + } + if (args.vis) { + imshow("VitTrack Demo", frame); + } + tm.reset(); + } + if (args.save) { + outputVideo.release(); + } + + video.release(); + } + +} diff --git a/models/object_tracking_vittrack/pom.xml b/models/object_tracking_vittrack/pom.xml new file mode 100644 index 00000000..6b58bac1 --- /dev/null +++ b/models/object_tracking_vittrack/pom.xml @@ -0,0 +1,31 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<project
xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>opencv_zoo</groupId> + <artifactId>demo</artifactId> + <version>1.0.0-SNAPSHOT</version> + </parent> + + <artifactId>object_tracking_vittrack</artifactId> + + <build> + <sourceDirectory>${project.basedir}</sourceDirectory> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.3.0</version> + <configuration> + <executable>java</executable> + <mainClass>demo</mainClass> + </configuration> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/models/pom.xml b/models/pom.xml new file mode 100644 index 00000000..a38928f3 --- /dev/null +++ b/models/pom.xml @@ -0,0 +1,105 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>opencv_zoo</groupId> + <artifactId>demo</artifactId> + <version>1.0.0-SNAPSHOT</version> + <name>OpenCV Zoo demo application</name> + <packaging>pom</packaging> + + <properties> + <!-- Pin the Java level and source encoding so the build does not depend on the maven-compiler-plugin defaults of the installed Maven. --> + <maven.compiler.source>1.8</maven.compiler.source> + <maven.compiler.target>1.8</maven.compiler.target> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <build> + <sourceDirectory>${project.basedir}</sourceDirectory> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.3.0</version> + <configuration> + <executable>java</executable> + <mainClass>demo</mainClass> + </configuration> + </plugin> + </plugins> + </build> + + <modules> + <module>object_tracking_vittrack</module> + <module>text_detection_ppocr</module> + </modules> + + <dependencies> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>javacv-platform</artifactId> + <version>1.5.10</version> +
<exclusions> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>flycapture-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>libdc1394-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>libfreenect-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>libfreenect2-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>librealsense-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>librealsense2-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>videoinput-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>artoolkitplus-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>leptonica-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>tesseract-platform</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>opencv-platform-gpu</artifactId> + <version>4.9.0-1.5.10</version> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>cuda-platform-redist</artifactId> + <version>12.3-8.9-1.5.10</version> + </dependency> + <dependency> + <groupId>com.beust</groupId> + <artifactId>jcommander</artifactId> + <version>1.82</version> + </dependency> + </dependencies> +</project> \ No newline at end of file diff --git a/models/text_detection_ppocr/README.md b/models/text_detection_ppocr/README.md index 1a875d1c..21367f77 100644 --- a/models/text_detection_ppocr/README.md +++ b/models/text_detection_ppocr/README.md @@ -43,6 +43,19 @@ cmake --build build ./build/opencv_zoo_text_detection_ppocr -h ``` +### Java + +Install Maven to get started with: + +```shell +# detect
on camera input +mvn compile exec:java -q +# detect on an image +mvn compile exec:java -q -Dexec.args="--input /path/to/image -v" +# get help messages +mvn compile exec:java -q -Dexec.args="--help" +``` + ### Example outputs  diff --git a/models/text_detection_ppocr/demo.java b/models/text_detection_ppocr/demo.java new file mode 100644 index 00000000..b65a02a5 --- /dev/null +++ b/models/text_detection_ppocr/demo.java @@ -0,0 +1,233 @@ +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.UnixStyleUsageFormatter; +import org.bytedeco.javacpp.FloatPointer; +import org.bytedeco.javacv.CanvasFrame; +import org.bytedeco.javacv.OpenCVFrameConverter; +import org.bytedeco.opencv.global.opencv_dnn; +import org.bytedeco.opencv.opencv_core.*; +import org.bytedeco.opencv.opencv_dnn.TextDetectionModel_DB; +import org.bytedeco.opencv.opencv_videoio.VideoCapture; + +import java.util.AbstractMap; +import java.util.Map; + +import static org.bytedeco.opencv.global.opencv_imgcodecs.imwrite; +import static org.bytedeco.opencv.global.opencv_imgproc.*; + +/** + * Command-line demo that runs a TextDetectionModel_DB text detector on an image, a video file or the default camera. + */ +public class demo { + + // Valid combinations of backends and targets + static int[][] backendTargetPairs = { + {opencv_dnn.DNN_BACKEND_OPENCV, opencv_dnn.DNN_TARGET_CPU}, + {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA}, + {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA_FP16}, + {opencv_dnn.DNN_BACKEND_TIMVX, opencv_dnn.DNN_TARGET_NPU}, + {opencv_dnn.DNN_BACKEND_CANN, opencv_dnn.DNN_TARGET_NPU} + }; + + /** Command-line options, populated by JCommander from the program arguments. */ + static class Args { + @Parameter(names = {"--help", "-h"}, order = 0, help = true, + description = "Print help message.") + boolean help; + @Parameter(names = {"--model", "-m"}, order = 1, + description = "Set model type.") + String model = "text_detection_en_ppocrv3_2023may.onnx"; + @Parameter(names = {"--input", "-i"}, order = 2, + description = "Path to input image or video file.
Skip this argument to capture frames from a camera.") + String input; + @Parameter(names = "--width", order = 3, + description = "Resize input image to certain width, It should be multiple by 32.") + int width = 736; + @Parameter(names = "--height", order = 4, + description = "Resize input image to certain height, It should be multiple by 32.") + int height = 736; + @Parameter(names = "--binary_threshold", order = 5, + description = "Threshold of the binary map.") + float binaryThreshold = 0.3f; + @Parameter(names = "--polygon_threshold", order = 6, + description = "Threshold of polygons.") + float polygonThreshold = 0.5f; + @Parameter(names = "--max_candidates", order = 7, + description = "Set maximum number of polygon candidates.") + int maxCandidates = 200; + @Parameter(names = "--unclip_ratio", order = 8, + description = "The unclip ratio of the detected text region, which determines the output size.") + double unclipRatio = 2.0; + @Parameter(names = {"--save", "-s"}, order = 9, + description = "Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.") + boolean save; + @Parameter(names = {"--viz", "-v"}, order = 10, + description = "Specify to open a new window to show results.
Invalid in case of camera input.") + boolean viz; + @Parameter(names = {"--backend", "-bt"}, order = 11, + description = "Choose one of computation backends:" + + " 0: OpenCV implementation + CPU," + + " 1: CUDA + GPU (CUDA), " + + " 2: CUDA + GPU (CUDA FP16)," + + " 3: TIM-VX + NPU," + + " 4: CANN + NPU") + int backend = 0; + } + + /** Wrapper around a TextDetectionModel_DB configured with a fixed input size, thresholds, backend and target. */ + static class PPOCRDet { + private final TextDetectionModel_DB model; + private final Size inputSize; + + public PPOCRDet(String modelPath, Size inputSize, + float binaryThreshold, float polygonThreshold, int maxCandidates, double unclipRatio, + int backendId, int targetId) { + this.inputSize = inputSize; + + model = new TextDetectionModel_DB(modelPath); + model.setPreferableBackend(backendId); + model.setPreferableTarget(targetId); + + model.setBinaryThreshold(binaryThreshold); + model.setPolygonThreshold(polygonThreshold); + model.setUnclipRatio(unclipRatio); + model.setMaxCandidates(maxCandidates); + + model.setInputParams(1.0 / 255.0, inputSize, + new Scalar(122.67891434, 116.66876762, 104.00698793, 0), true, false); + } + + /** Runs detection on an image that must match the configured input size and returns the polygons paired with their confidences. */ + public Map.Entry<PointVectorVector, FloatPointer> infer(Mat image) { + if (image.rows() != inputSize.height()) { + throw new IllegalArgumentException("height of input image != net input size"); + } + if (image.cols() != inputSize.width()) { + throw new IllegalArgumentException("width of input image != net input size"); + } + final PointVectorVector pt = new PointVectorVector(); + final FloatPointer confidences = new FloatPointer(); + model.detect(image, pt, confidences); + return new AbstractMap.SimpleEntry<>(pt, confidences); + } + } + + /** Returns a copy of the image with the FPS text (when fps > 0) and the polygons from results drawn as polylines. */ + static Mat visualize(Mat image, Map.Entry<PointVectorVector, FloatPointer> results, double fps, Scalar boxColor, + Scalar textColor, boolean isClosed, int thickness) { + final Mat output = new Mat(); + image.copyTo(output); + if (fps > 0) { + putText(output, String.format("FPS: %.2f", fps), new Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor); + } + final PointVectorVector
pvv = results.getKey(); + final MatVector matVector = new MatVector(); + for (int i = 0; i < pvv.size(); i++) { + final PointVector pv = pvv.get(i); + final Point pts = new Point(pv.size()); + for (int j = 0; j < pv.size(); j++) { + pts.position(j).x(pv.get(j).x()).y(pv.get(j).y()); + } + matVector.push_back(new Mat(pts.position(0))); + } + polylines(output, matVector, isClosed, boxColor, thickness, LINE_AA, 0); + matVector.close(); + return output; + } + + /** + * Parses the options, then detects text on every frame read from the input file or the default camera and shows and/or saves the result. + * Execute: mvn compile exec:java -q -Dexec.args="" + */ + public static void main(String[] argv) { + final Args args = new Args(); + final JCommander jc = JCommander.newBuilder() + .addObject(args) + .build(); + jc.setUsageFormatter(new UnixStyleUsageFormatter(jc)); + jc.parse(argv); + if (args.help) { + jc.usage(); + return; + } + if (args.backend < 0 || args.backend >= backendTargetPairs.length) { + throw new IllegalArgumentException(String.format("--backend must be between 0 and %d", backendTargetPairs.length - 1)); + } + final int[] backendTargetPair = backendTargetPairs[args.backend]; + if (args.model == null || args.model.isEmpty()) { + throw new IllegalArgumentException("Model name is empty"); + } + final Size inpSize = new Size(args.width, args.height); + + final PPOCRDet model = new PPOCRDet(args.model, inpSize, + args.binaryThreshold, args.polygonThreshold, args.maxCandidates, args.unclipRatio, + backendTargetPair[0], backendTargetPair[1]); + + final VideoCapture cap = new VideoCapture(); + if (args.input != null) { + cap.open(args.input); + } else { + cap.open(0); + } + if (!cap.isOpened()) { + throw new IllegalArgumentException("Cannot open video or file"); + } + Mat originalImage = new Mat(); + + final OpenCVFrameConverter.ToMat converter = new OpenCVFrameConverter.ToMat(); + CanvasFrame mainframe = null; + if (args.input == null || args.viz) { + mainframe = new CanvasFrame(args.model + " Demo", CanvasFrame.getDefaultGamma() / 2.2); + mainframe.setDefaultCloseOperation(javax.swing.JFrame.EXIT_ON_CLOSE); + mainframe.setVisible(true); + } + + final Scalar boxColor = new Scalar(0, 255, 0, 0); + final Scalar textColor = new Scalar(0, 0, 255, 0); + final TickMeter tm = new TickMeter(); + +
while (cap.read(originalImage)) { + final int originalW = originalImage.cols(); + final int originalH = originalImage.rows(); + final double scaleHeight = originalH / (double) inpSize.height(); + final double scaleWidth = originalW / (double) inpSize.width(); + final Mat image = new Mat(); + resize(originalImage, image, inpSize); + + // inference + tm.start(); + final Map.Entry<PointVectorVector, FloatPointer> results = model.infer(image); + tm.stop(); + // Scale the results bounding box + final PointVectorVector pvv = results.getKey(); + for (int i = 0; i < pvv.size(); i++) { + final PointVector pts = pvv.get(i); + for (int j = 0; j < pts.size(); j++) { + pts.get(j).x((int) (pts.get(j).x() * scaleWidth)); + pts.get(j).y((int) (pts.get(j).y() * scaleHeight)); + } + } + + originalImage = visualize(originalImage, results, tm.getFPS(), boxColor, textColor, true, 2); + tm.reset(); + if (args.input != null) { + if (args.save) { + System.out.println("Result image saved to result.jpg"); + imwrite("result.jpg", originalImage); + } + if (args.viz) { + mainframe.showImage(converter.convert(originalImage)); + } + } else { + mainframe.showImage(converter.convert(originalImage)); + } + + // clear + pvv.close(); + image.close(); + } + cap.release(); // free the capture device or file once no more frames can be read + tm.close(); + } + +} diff --git a/models/text_detection_ppocr/pom.xml b/models/text_detection_ppocr/pom.xml new file mode 100644 index 00000000..8571a0dd --- /dev/null +++ b/models/text_detection_ppocr/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>opencv_zoo</groupId> + <artifactId>demo</artifactId> + <version>1.0.0-SNAPSHOT</version> + </parent> + + <artifactId>text_detection_ppocr</artifactId> + +</project> \ No newline at end of file