# predict_cv.py

import os
import cv2
import glob
import shutil
import time
import logging as log
import torch
from torchvision import transforms as tf
from pprint import pformat
from PIL import Image, ImageOps
import numpy as np
import sys
sys.path.insert(0, '.')

from utils.envs import initEnv
import models


def py_cpu_nms(dets, thresh=0.5):
    """Greedy non-maximum suppression implemented with NumPy.

    Boxes are visited in descending score order. Each visited box is kept
    and every remaining box whose IoU with it exceeds ``thresh`` is dropped.
    Widths and heights are computed with a ``+ 1`` term, i.e. coordinates
    are treated as inclusive pixel indices.

    Args:
        dets: Array-like of shape (N, 5+) whose first five columns are
            ``x1, y1, x2, y2, score``. Lists of rows are accepted.
        thresh: IoU threshold; a box is suppressed when its IoU with an
            already kept box is strictly greater than this value.

    Returns:
        List of plain ``int`` row indices into ``dets`` for the kept boxes,
        ordered by descending score. Empty when ``dets`` is empty.
    """
    # Accept plain Python lists as well as arrays.
    dets = np.asarray(dets)
    if dets.size == 0:
        # Covers both shape (0, 5) and the 1-D result of np.array([]),
        # which cannot be column-indexed below.
        return []

    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        best, rest = order[0], order[1:]
        keep.append(int(best))

        # Intersection rectangle between the best box and every other one.
        xx1 = np.maximum(x1[best], x1[rest])
        yy1 = np.maximum(y1[best], y1[rest])
        xx2 = np.minimum(x2[best], x2[rest])
        yy2 = np.minimum(y2[best], y2[rest])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[best] + areas[rest] - inter)

        # Only boxes that do not overlap too much survive to the next round.
        order = rest[ovr <= thresh]

    return keep


def detect(net, img_path, use_cuda, network_size, nms_thresh):
    """Run the detector on one image file and return NMS-filtered boxes.

    The image is read with OpenCV, converted to an RGB PIL image, resized and
    padded to ``network_size`` by ``process_data`` and fed to ``net``. The
    predicted boxes are mapped back from network coordinates to pixel
    coordinates of the original image and filtered with ``py_cpu_nms``.
    NMS is applied to all boxes together, regardless of class label.
    Timing information for each stage is printed to stdout.

    Args:
        net: Callable model. ``net(tensor)`` must return a pair whose first
            element is indexable; its item 0 is iterated to obtain detection
            objects exposing ``x_top_left``, ``y_top_left``, ``width``,
            ``height``, ``confidence`` and ``class_label``.
        img_path: Path of the image file to process.
        use_cuda: When true, the input tensor is moved to the GPU.
        network_size: ``(width, height)`` of the network input.
        nms_thresh: Overlap threshold forwarded to ``py_cpu_nms``.

    Returns:
        List of ``(class_label, conf, [x1, y1, x2, y2])`` tuples with integer
        box coordinates. The list is empty when the image cannot be read or
        when the network produced no detections.
    """
    data = cv2.imread(img_path)
    if data is None:
        # cv2.imread reports a missing/unreadable file by returning None;
        # without this guard cvtColor fails with an opaque OpenCV error.
        log.warning('Could not read image: %s', img_path)
        return []
    data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
    data = Image.fromarray(data)
    orig_width, orig_height = data.size
    netw, neth = network_size

    # Scale and padding used to letterbox the image; needed later to map the
    # predicted boxes back to original-image coordinates.
    scale = min(float(netw) / orig_width, float(neth) / orig_height)
    pad_w = (netw - orig_width * scale) / 2.0
    pad_h = (neth - orig_height * scale) / 2.0

    st = time.time()
    data = process_data(data, network_size)
    print("process time:", time.time()-st)
    s22 = time.time()
    data = tf.ToTensor()(data)
    s33 = time.time()
    print("totensor time:",s33-s22)
    data = data.unsqueeze(0)
    print("unsqueeze time:",time.time()-s33)

    if use_cuda:
        data = data.cuda()
    with torch.no_grad():
        s1 = time.time()
        output, _ = net(data)
        print("inference time:",time.time()-s1)

    res, res_label = [], []
    # convert x,y,w,h to x1,y1,x2,y2 in original-image coordinates
    for o in output[0]:
        xmin = o.x_top_left
        ymin = o.y_top_left
        xmax = xmin + o.width
        ymax = ymin + o.height
        conf = o.confidence
        class_label = o.class_label

        x1 = max(0, float(xmin - pad_w) / scale)
        x2 = min(orig_width - 1, float(xmax - pad_w) / scale)
        y1 = max(0, float(ymin - pad_h) / scale)
        y2 = min(orig_height - 1, float(ymax - pad_h) / scale)
        # `res` feeds NMS (float coords + score); `res_label` keeps the
        # integer coords and class label for the final result.
        res.append([x1, y1, x2, y2, conf])
        res_label.append([int(x1), int(y1), int(x2), int(y2), conf, class_label])
    if not res:
        return []

    # do nms
    t11 = time.time()
    nms_keep = py_cpu_nms(np.array(res), nms_thresh)
    print("nms time:",time.time()-t11)

    final_res = []
    for index in nms_keep:
        x1, y1, x2, y2, conf, class_label = res_label[index]
        final_res.append((class_label, conf, [x1, y1, x2, y2]))
    end = time.time()
    print("detect time(process time + inference + nms): ", end-st)
    return final_res


def process_data(img, dimension):
    """Letterbox a PIL image to the requested network input size.

    The image is scaled (nearest-neighbour resampling) so that its limiting
    side matches the target while the aspect ratio is kept, then padded with
    gray (value 127) borders, split between both sides, up to ``dimension``.

    Args:
        img: PIL image to transform.
        dimension: Target ``(width, height)``.

    Returns:
        The input image itself when it already has the target size,
        otherwise the resized and, if needed, padded image.
    """
    target_w, target_h = dimension
    cur_w, cur_h = img.size
    if (cur_w, cur_h) == (target_w, target_h):
        return img

    # The side that is relatively larger determines the scaling factor.
    if cur_w / target_w >= cur_h / target_h:
        ratio = target_w / cur_w
    else:
        ratio = target_h / cur_h

    if ratio != 1:
        resized_size = (int(ratio * cur_w), int(ratio * cur_h))
        img = img.resize(resized_size, Image.NEAREST)
        cur_w, cur_h = img.size

    if (cur_w, cur_h) == (target_w, target_h):
        return img

    # One fill value per channel; a 2-D pixel array means a single channel.
    pixels = np.array(img)
    n_channels = pixels.shape[2] if len(pixels.shape) > 2 else 1
    gray = (127,) * n_channels

    # Split the missing pixels between both sides; when the amount is odd
    # the right/bottom border receives the extra pixel.
    half_w = (target_w - cur_w) / 2
    half_h = (target_h - cur_h) / 2
    border = (int(half_w), int(half_h), int(half_w+.5), int(half_h+.5))
    return ImageOps.expand(img, border=border, fill=gray)


class HyperParams:
    """Settings for a detection run, read from a configuration mapping.

    Every key listed below is required; a missing key raises ``KeyError``.
    ``cuda`` starts as True and is switched off when
    ``torch.cuda.is_available()`` reports no usable device.
    """

    def __init__(self, config):
        """Copy the needed entries of ``config`` onto the instance.

        Args:
            config: Mapping providing ``labels``, ``data_root_dir``,
                ``model_name``, ``dataset``, ``nworkers``, ``pin_mem``,
                ``input_shape``, ``batch_size``, ``weights``,
                ``conf_thresh``, ``nms_thresh`` and ``results``.
        """
        self.cuda = True

        # Model / data identity.
        labels = config['labels']
        self.labels = labels
        self.classes = len(labels)
        data_root = config['data_root_dir']
        self.data_root = data_root
        self.model_name = config['model_name']

        # Dataset file path is derived from the data root and dataset name.
        dataset_name = config['dataset']
        self.testfile = f'{data_root}/{dataset_name}.pkl'

        # Loader, network and post-processing settings.
        self.nworkers = config['nworkers']
        self.pin_mem = config['pin_mem']
        self.network_size = config['input_shape']
        self.batch = config['batch_size']
        self.weights = config['weights']
        self.conf_thresh = config['conf_thresh']
        self.nms_thresh = config['nms_thresh']
        self.results = config['results']

        # Fall back to CPU when CUDA was requested but is not usable.
        if self.cuda:
            cuda_ok = torch.cuda.is_available()
            status = 'CUDA enabled' if cuda_ok else 'CUDA not available'
            log.debug(status)
            print(status)
            if not cuda_ok:
                self.cuda = False


def voc_test(hyper_params, img_dir="test_img"):
    """Run detection on every ``*.jpg`` in a directory and save annotated copies.

    The results directory (``hyper_params.results``) is deleted if it exists
    and created again. The model named by ``hyper_params.model_name`` is
    looked up in the ``models`` module, built, switched to eval mode and
    optionally moved to the GPU. For each image with at least one detection,
    the boxes and ``label:confidence`` captions are drawn and the image is
    written once to the results directory under its original file name.
    Progress is printed to stdout.

    Args:
        hyper_params: Object providing ``model_name``, ``cuda``, ``weights``,
            ``conf_thresh``, ``network_size``, ``labels``, ``nms_thresh``,
            ``results`` and ``classes``.
        img_dir: Directory searched for ``*.jpg`` files. Defaults to
            ``"test_img"``.
    """
    model_name = hyper_params.model_name
    use_cuda = hyper_params.cuda
    weights = hyper_params.weights
    conf_thresh = hyper_params.conf_thresh
    network_size = hyper_params.network_size
    labels = hyper_params.labels
    nms_thresh = hyper_params.nms_thresh
    save_dir = hyper_params.results

    # Start from an empty results directory; makedirs also creates missing
    # parent directories, which os.mkdir would fail on.
    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    test_args = {'conf_thresh': conf_thresh, 'network_size': network_size, 'labels': labels}
    net = models.__dict__[model_name](hyper_params.classes, weights, train_flag=2, test_args=test_args)
    net.eval()
    log.info('Net structure\n%s', net)
    if use_cuda:
        net.cuda()

    # Sorted so that images are processed in a reproducible order.
    for img_path in sorted(glob.glob(os.path.join(img_dir, "*.jpg"))):
        print("--------------------------------")
        result = detect(net, img_path, use_cuda, network_size, nms_thresh)
        if not result:
            print("detect nothing")
            continue

        img = cv2.imread(img_path)
        for class_label, conf, (x1, y1, x2, y2) in result:
            # Keep the caption inside the image when the box touches the
            # top or left edge.
            cv2.putText(img, class_label + ":"+str(conf)[:5], (max(0, x1), max(15, y1)), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2)
            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0))

        # Write the annotated image once, after all boxes have been drawn.
        out_path = os.path.join(save_dir, os.path.basename(img_path))
        saved = cv2.imwrite(out_path, img)
        print("detect: {}".format(result))
        if saved:
            print("save: {} -> {}".format(img_path, out_path))
        else:
            log.warning('Could not write annotated image: %s', out_path)


if __name__ == '__main__':
    # Script entry point: build the configuration for the selected model,
    # log it, then run detection with the derived hyper-parameters.
    # train_flag=2 is the same flag value voc_test passes to the model
    # constructor. Alternative model name: "Yolov3".
    train_flag = 2
    model_name = "TinyYolov3"
    config = initEnv(train_flag=train_flag, model_name=model_name)
    log.info('Config\n\n%s\n', pformat(config))
    voc_test(HyperParams(config))