
Commit a7fbb3d

Update to opset 11

1 parent d03b68b commit a7fbb3d

12 files changed (+59, -49 lines)


INFERENCE.md
Lines changed: 4 additions & 2 deletions

````diff
@@ -62,10 +62,12 @@ odtk export model.pth model_fp32.plan --full-precision --size 800 1280
 
 In order to use INT8 precision with TensorRT, you need to provide calibration images (images that are representative of what will be seen at runtime) that will be used to rescale the network.
 ```bash
-odtk export model.pth model_int8.plan --int8 --calibration-images /data/val/ --calibration-batches 10 --calibration-table model_calibration_table
+odtk export model.pth model_int8.plan --int8 --calibration-images /data/val/ --calibration-batches 2 --calibration-table model_calibration_table
 ```
 
-This will randomly select 20 images from `/data/val/` to calibrate the network for INT8 precision. The results from calibration will be saved to `model_calibration_table` that can be used to create subsequent INT8 engines for this model without needed to recalibrate.
+This will randomly select 16 images from `/data/val/` to calibrate the network for INT8 precision. The results from calibration will be saved to `model_calibration_table` that can be used to create subsequent INT8 engines for this model without needed to recalibrate.
+
+**NOTE:** Number of images in `/data/val/` must be greater than or equal to the kOPT(middle) optimization profile from `--dynamic-batch-opts`. Here, the default kOPT is 8.
 
 Build an INT8 engine for a previously calibrated model:
 ```bash
````
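
The sizing rule behind the new NOTE can be sketched in a few lines of Python. The function below is illustrative only (it is not part of the repo); the arithmetic mirrors the selection logic added to `retinanet/main.py` further down: calibration wants `calibration_batches * kOPT` images, where kOPT is the middle value of `--dynamic-batch-opts`.

```python
def calibration_images_needed(calibration_batches, dynamic_batch_opts=(1, 8, 16)):
    """Illustrative only: how many calibration images the export step expects."""
    kopt = dynamic_batch_opts[1]       # middle value = kOPT of the TRT optimization profile
    return calibration_batches * kopt  # e.g. 2 batches * 8 = 16 images

print(calibration_images_needed(2))    # 16, matching the example above
```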

csrc/engine.cpp
Lines changed: 6 additions & 4 deletions

```diff
@@ -93,7 +93,7 @@ Engine::~Engine() {
 }
 
 Engine::Engine(const char *onnx_model, size_t onnx_size, const vector<int>& dynamic_batch_opts,
-    size_t batch, string precision, float score_thresh, int top_n, const vector<vector<float>>& anchors,
+    string precision, float score_thresh, int top_n, const vector<vector<float>>& anchors,
     bool rotated, float nms_thresh, int detections_per_im, const vector<string>& calibration_images,
     string model_name, string calibration_table, bool verbose, size_t workspace_size) {
 
@@ -134,9 +134,9 @@ Engine::Engine(const char *onnx_model, size_t onnx_size, const vector<int>& dyna
 
     std::unique_ptr<Int8EntropyCalibrator> calib;
     if (int8) {
-        // Calibration is performed using kOPT values of the profile.
-        // Calibration input data size must match this profile.
         builderConfig->setFlag(BuilderFlag::kINT8);
+        // Calibration is performed using kOPT values of the profile.
+        // Calibration batch size must match this profile.
         builderConfig->setCalibrationProfile(profile);
         ImageStream stream(dynamic_batch_opts[1], inputDims, calibration_images);
         calib = std::unique_ptr<Int8EntropyCalibrator>(new Int8EntropyCalibrator(stream, model_name, calibration_table));
@@ -201,6 +201,8 @@ Engine::Engine(const char *onnx_model, size_t onnx_size, const vector<int>& dyna
     network->destroy();
     builderConfig->destroy();
     builder->destroy();
+
+    _prepare();
 }
 
 void Engine::save(const string &path) {
@@ -236,4 +238,4 @@ int Engine::getStride() {
     return 1;
 }
 
-}
+}
```

csrc/engine.h
Lines changed: 3 additions & 4 deletions

```diff
@@ -29,7 +29,6 @@
 
 #include <cuda_runtime.h>
 
-
 using namespace std;
 using namespace nvinfer1;
 
@@ -43,9 +42,9 @@ class Engine {
 
     // Create engine from serialized onnx model
 
-    Engine(const char *onnx_model, size_t onnx_size, const vector<int>& dynamic_batch_opts, size_t batch,
-        string precision, float score_thresh, int top_n, const vector<vector<float>>& anchors, bool rotated,
-        float nms_thresh, int detections_per_im, const vector<string>& calibration_images,
+    Engine(const char *onnx_model, size_t onnx_size, const vector<int>& dynamic_batch_opts,
+        string precision, float score_thresh, int top_n, const vector<vector<float>>& anchors,
+        bool rotated, float nms_thresh, int detections_per_im, const vector<string>& calibration_images,
         string model_name, string calibration_table, bool verbose, size_t workspace_size=(1ULL << 30));
 
     ~Engine();
```

csrc/extensions.cpp
Lines changed: 2 additions & 2 deletions

```diff
@@ -183,8 +183,8 @@ vector<at::Tensor> infer(retinanet::Engine &engine, at::Tensor data, bool rotate
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     pybind11::class_<retinanet::Engine>(m, "Engine")
-        .def(pybind11::init<const char *, size_t, const vector<int>&, size_t, string, float,
-            int, const vector<vector<float>>&, bool, float, int, const vector<string>&, string, string, bool>())
+        .def(pybind11::init<const char *, size_t, const vector<int>&, string, float, int,
+            const vector<vector<float>>&, bool, float, int, const vector<string>&, string, string, bool>())
        .def("save", &retinanet::Engine::save)
        .def("infer", &retinanet::Engine::infer)
        .def_property_readonly("stride", &retinanet::Engine::getStride)
```
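
Seen from the Python side, the rebuilt binding is constructed the same way `retinanet/model.py` does further down; the `batch` argument simply disappears from the fourth position. A hedged sketch, with placeholder argument values and assuming the compiled extension imports as `retinanet._C`:

```python
from retinanet._C import Engine  # compiled from csrc/extensions.cpp

# Placeholder values for illustration; see Model.export in retinanet/model.py
# for how the repo actually fills these in.
engine = Engine(
    onnx_bytes, len(onnx_bytes),   # serialized ONNX model and its size
    [1, 8, 16],                    # dynamic_batch_opts (kMIN, kOPT, kMAX)
    'FP16', 0.05, 1000,            # precision, score_thresh, top_n
    anchors,                       # per-stride anchor lists
    False, 0.5, 100,               # rotated, nms_thresh, detections_per_im
    [], '', '',                    # calibration_images, model_name, calibration_table
    False,                         # verbose
)
```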

extras/cppapi/README.md
Lines changed: 1 addition & 1 deletion

````diff
@@ -32,7 +32,7 @@ msbuild retinanet_infer.sln
 
 If you don't have an ONNX core model, generate one from your RetinaNet model:
 ```bash
-retinanet export model.pth model.onnx
+odtk export model.pth model.onnx
 ```
 
 Load the ONNX core model and export it to a RetinaNet TensorRT engine (using FP16 precision):
````

extras/cppapi/export.cpp
Lines changed: 3 additions & 3 deletions

```diff
@@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
 
     // Define default RetinaNet parameters to use for TRT export
     const vector<int> dynamic_batch_opts{1, 8, 16};
-    int batch = 1;
+    int calibration_batches = 2; // must be >= 1
     float score_thresh = 0.05f;
     int top_n = 1000;
     size_t workspace_size =(1ULL << 30);
@@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
     }
 
     // For INT8 calibration, after setting COCO_PATH on line 10:
-    // const vector<string> calibration_files = glob(dynamic_batch_opts[1]);
+    // const vector<string> calibration_files = glob(calibration_batches*dynamic_batch_opts[1]);
     const vector<string> calibration_files;
     string model_name = "";
     string calibration_table = argc == 4 ? string(argv[3]) : "";
@@ -97,7 +97,7 @@ int main(int argc, char *argv[]) {
         precision = "INT8";
 
     cout << "Building engine..." << endl;
-    auto engine = retinanet::Engine(buffer, size, dynamic_batch_opts, batch, precision, score_thresh, top_n,
+    auto engine = retinanet::Engine(buffer, size, dynamic_batch_opts, precision, score_thresh, top_n,
        anchors, ROTATED, nms_thresh, detections_per_im, calibration_files, model_name, calibration_table, verbose, workspace_size);
    engine.save(string(argv[2]));
 
```

extras/cppapi/infervideo.cpp
Lines changed: 1 addition & 1 deletion

```diff
@@ -95,7 +95,7 @@ int main(int argc, char *argv[]) {
         cout << "Inferring on frame: " << count <<"/" << nframes << endl;
         count++;
         vector<void *> buffers = { data_d, scores_d, boxes_d, classes_d };
-        engine.infer(buffers);
+        engine.infer(buffers, 1);
 
         cudaMemcpy(scores.get(), scores_d, sizeof(float) * num_det, cudaMemcpyDeviceToHost);
         cudaMemcpy(boxes.get(), boxes_d, sizeof(float) * num_det * 4, cudaMemcpyDeviceToHost);
```

retinanet/dali.py
Lines changed: 4 additions & 6 deletions

```diff
@@ -32,7 +32,7 @@ def __init__(self, batch_size, num_threads, path, training, annotations, world,
 
         self.decode_train = ops.ImageDecoderSlice(device="mixed", output_type=types.RGB)
         self.decode_infer = ops.ImageDecoder(device="mixed", output_type=types.RGB)
-        self.bbox_crop = ops.RandomBBoxCrop(device='cpu', ltrb=True, scaling=[0.3, 1.0],
+        self.bbox_crop = ops.RandomBBoxCrop(device='cpu', bbox_layout="xyXY", scaling=[0.3, 1.0],
            thresholds=[0.1, 0.3, 0.5, 0.7, 0.9])
 
         self.bbox_flip = ops.BbFlip(device='cpu', ltrb=True)
@@ -122,7 +122,7 @@ def __init__(self, path, resize, max_size, batch_size, stride, world, annotation
            augment_brightness=augment_brightness,
            augment_contrast=augment_contrast, augment_hue=augment_hue,
            augment_saturation=augment_saturation)
-
+
         self.pipe.build()
 
     def __repr__(self):
@@ -149,7 +149,7 @@ def __iter__(self):
             id = int(dali_ids.at(batch)[0])
 
             # Convert dali tensor to pytorch
-            dali_tensor = dali_data.at(batch)
+            dali_tensor = dali_data[batch]
             tensor_shape = dali_tensor.shape()
 
             datum = torch.zeros(dali_tensor.shape(), dtype=torch.float, device=torch.device('cuda'))
@@ -158,7 +158,7 @@ def __iter__(self):
 
             # Calculate image resize ratio to rescale boxes
             prior_size = dali_attrs.as_cpu().at(batch)
-            resized_size = dali_resize_img.at(batch).shape()
+            resized_size = dali_resize_img[batch].shape()
             ratio = max(resized_size) / max(prior_size)
 
             if self.training:
@@ -192,12 +192,10 @@ def __iter__(self):
 
             if self.training:
                 pyt_targets = pyt_targets.cuda(non_blocking=True)
-
                 yield data, pyt_targets
 
             else:
                 ids = torch.Tensor(ids).int().cuda(non_blocking=True)
                 ratios = torch.Tensor(ratios).cuda(non_blocking=True)
-
                 yield data, ids, ratios
 
```

retinanet/infer.py
Lines changed: 0 additions & 5 deletions

```diff
@@ -33,10 +33,6 @@ def infer(model, path, detections_file, resize, max_size, batch_size, mixed_prec
 
     # Prepare dataset
     if verbose: print('Preparing dataset...')
-    data_iterator = (DaliDataIterator if use_dali else DataIterator)(
-        path, resize, max_size, batch_size, stride,
-        world, annotations, training=False)
-
     if rotated_bbox:
         if use_dali: raise NotImplementedError("This repo does not currently support DALI for rotated bbox.")
         data_iterator = RotatedDataIterator(path, resize, max_size, batch_size, stride,
@@ -45,7 +41,6 @@ def infer(model, path, detections_file, resize, max_size, batch_size, mixed_prec
         data_iterator = (DaliDataIterator if use_dali else DataIterator)(
             path, resize, max_size, batch_size, stride,
             world, annotations, training=False)
-
     if verbose: print(data_iterator)
 
     # Prepare model
```

retinanet/main.py
Lines changed: 12 additions & 9 deletions

```diff
@@ -98,12 +98,10 @@ def parse(args):
     parser_export.add_argument('--size', metavar='height width', type=int, nargs='+',
                                help='input size (square) or sizes (h w) to use when generating TensorRT engine',
                                default=[1280])
-    parser_export.add_argument('--batch', metavar='size', type=int, help='max batch size to use for TensorRT engine',
-                               default=2)
     parser_export.add_argument('--full-precision', help='export in full instead of half precision', action='store_true')
     parser_export.add_argument('--int8', help='calibrate model and export in int8 precision', action='store_true')
     parser_export.add_argument('--calibration-batches', metavar='size', type=int,
-                               help='number of batches to use for int8 calibration', default=4)
+                               help='number of batches to use for int8 calibration', default=2)
     parser_export.add_argument('--calibration-images', metavar='path', type=str,
                                help='path to calibration images to use for int8 calibration', default="")
     parser_export.add_argument('--calibration-table', metavar='path', type=str,
@@ -163,7 +161,7 @@ def worker(rank, args, world, model, state):
     torch.cuda.set_device(rank)
     torch.distributed.init_process_group(backend='nccl', init_method='env://')
 
-    if args.batch % world != 0:
+    if (args.command != 'export') and (args.batch % world != 0):
         raise RuntimeError('Batch size should be a multiple of the number of GPUs')
 
     if model and model.angles is not None:
@@ -204,11 +202,16 @@ def worker(rank, args, world, model, state):
             for ex in file_extensions:
                 calibration_files += glob.glob("{}/*{}".format(args.calibration_images, ex), recursive=True)
             # Only need enough images for specified num of calibration batches
-            if len(calibration_files) >= args.calibration_batches * args.batch:
-                calibration_files = calibration_files[:(args.calibration_batches * args.batch)]
+            if len(calibration_files) >= args.calibration_batches * args.dynamic_batch_opts[1]:
+                calibration_files = calibration_files[:(args.calibration_batches * args.dynamic_batch_opts[1])]
             else:
-                print('Only found enough images for {} batches. Continuing anyway...'.format(
-                    len(calibration_files) // args.batch))
+                # Number of images for calibration must be greater than or equal to the kOPT optimization profile
+                if len(calibration_files) >= args.dynamic_batch_opts[1]:
+                    print('Only found enough images for {} batches. Continuing anyway...'.format(
+                        len(calibration_files) // args.dynamic_batch_opts[1]))
+                else:
+                    raise RuntimeError('Not enough images found for calibration. ({} < {})'
+                                       .format(len(calibration_files), args.dynamic_batch_opts[1]))
 
             random.shuffle(calibration_files)
 
@@ -218,7 +221,7 @@ def worker(rank, args, world, model, state):
         elif not args.full_precision:
             precision = "FP16"
 
-        exported = model.export(input_size, args.dynamic_batch_opts, args.batch, precision, calibration_files,
+        exported = model.export(input_size, args.dynamic_batch_opts, precision, calibration_files,
                                 args.calibration_table, args.verbose, onnx_only=onnx_only)
         if onnx_only:
             with open(args.export, 'wb') as out:
```
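
For reference, the new calibration-file selection above reduces to the following standalone sketch. The helper is hypothetical (not part of the repo's API), but the behaviour mirrors the added guard: take `calibration_batches * kOPT` files when possible, fall back to fewer whole batches with a warning, and fail only when there are not even `kOPT` images.

```python
def select_calibration_files(files, calibration_batches, dynamic_batch_opts):
    """Illustrative mirror of the guard added in retinanet/main.py above."""
    kopt = dynamic_batch_opts[1]            # middle (kOPT) batch size of the profile
    wanted = calibration_batches * kopt
    if len(files) >= wanted:
        return files[:wanted]
    if len(files) >= kopt:
        print('Only found enough images for {} batches. Continuing anyway...'.format(len(files) // kopt))
        return files
    raise RuntimeError('Not enough images found for calibration. ({} < {})'.format(len(files), kopt))

# e.g. select_calibration_files(images, calibration_batches=2, dynamic_batch_opts=[1, 8, 16])
```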

retinanet/model.py
Lines changed: 20 additions & 11 deletions

```diff
@@ -15,9 +15,17 @@
 class Model(nn.Module):
     'RetinaNet - https://arxiv.org/abs/1708.02002'
 
-    def __init__(self, backbones='ResNet50FPN', classes=80,
-                 ratios=[1.0, 2.0, 0.5], scales=[4 * 2 ** (i / 3) for i in range(3)],
-                 angles=None, rotated_bbox=False, anchor_ious=[0.4, 0.5], config={}):
+    def __init__(
+        self,
+        backbones='ResNet50FPN',
+        classes=80,
+        ratios=[1.0, 2.0, 0.5],
+        scales=[4 * 2 ** (i / 3) for i in range(3)],
+        angles=None,
+        rotated_bbox=False,
+        anchor_ious=[0.4, 0.5],
+        config={}
+    ):
         super().__init__()
 
         if not isinstance(backbones, list):
@@ -242,15 +250,16 @@ def load(cls, filename, rotated_bbox=False):
 
         return model, state
 
-    def export(self, size, dynamic_batch_opts, batch, precision, calibration_files, calibration_table, verbose, onnx_only=False):
+    def export(self, size, dynamic_batch_opts, precision, calibration_files, calibration_table, verbose, onnx_only=False):
 
-        import torch.onnx.symbolic_opset10 as onnx_symbolic
+        import torch.onnx.symbolic_opset11 as onnx_symbolic
         def upsample_nearest2d(g, input, output_size, *args):
-            # Currently, TRT 5.1/6.0/7.0 ONNX Parser does not support all ONNX ops
+            # Currently, TRT 7.1 ONNX Parser does not support all ONNX ops
             # needed to support dynamic upsampling ONNX forumlation
             # Here we hardcode scale=2 as a temporary workaround
             scales = g.op("Constant", value_t=torch.tensor([1., 1., 2., 2.]))
-            return g.op("Resize", input, scales, mode_s="nearest")
+            empty_tensor = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32))
+            return g.op("Resize", input, empty_tensor, scales, mode_s="nearest", nearest_mode_s="floor")
 
         onnx_symbolic.upsample_nearest2d = upsample_nearest2d
 
@@ -265,7 +274,7 @@ def upsample_nearest2d(g, input, output_size, *args):
         dynamic_axes = {input_names[0]: {0:'batch'}}
         for _, name in enumerate(output_names):
             dynamic_axes[name] = dynamic_axes[input_names[0]]
-        extra_args = {'opset_version': 10, 'verbose': verbose,
+        extra_args = {'opset_version': 11, 'verbose': verbose,
                       'input_names': input_names, 'output_names': output_names,
                       'dynamic_axes': dynamic_axes}
         torch.onnx.export(self.cuda(), zero_input, onnx_bytes, **extra_args)
@@ -284,6 +293,6 @@ def upsample_nearest2d(g, input, output_size, *args):
             anchors = [generate_anchors_rotated(stride, self.ratios, self.scales,
                 self.angles)[0].view(-1).tolist() for stride in self.strides]
 
-        return Engine(onnx_bytes.getvalue(), len(onnx_bytes.getvalue()), dynamic_batch_opts, batch,
-                      precision, self.threshold, self.top_n, anchors, self.rotated_bbox, self.nms,
-                      self.detections, calibration_files, model_name, calibration_table, verbose)
+        return Engine(onnx_bytes.getvalue(), len(onnx_bytes.getvalue()), dynamic_batch_opts, precision,
+                      self.threshold, self.top_n, anchors, self.rotated_bbox, self.nms, self.detections,
+                      calibration_files, model_name, calibration_table, verbose)
```
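
The opset change above matters because opset 11's `Resize` takes `(X, roi, scales[, sizes])` rather than opset 10's `(X, scales)`, which is why the symbolic now inserts an empty ROI constant and pins `nearest_mode` to `floor`. A minimal, self-contained check, independent of this repo and assuming `torch` and `onnx` are installed, that an opset-11 export of nearest-neighbour upsampling produces a `Resize` node:

```python
import io

import onnx
import torch
import torch.nn as nn

# Tiny module exercising the op this commit re-maps onto the opset-11 Resize signature.
upsample = nn.Upsample(scale_factor=2, mode="nearest")

buf = io.BytesIO()
torch.onnx.export(upsample, torch.randn(1, 3, 8, 8), buf, opset_version=11)

graph = onnx.load_from_string(buf.getvalue()).graph
print([node.op_type for node in graph.node])  # expect a Resize node in the list
```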

retinanet/train.py
Lines changed: 3 additions & 1 deletion

```diff
@@ -31,10 +31,12 @@ def train(model, state, path, annotations, val_path, val_annotations, resize, ma
     # Setup optimizer and schedule
     optimizer = SGD(model.parameters(), lr=lr, weight_decay=regularization_l2, momentum=0.9)
 
+    loss_scale = "dynamic" if use_dali else "128.0"
+
     model, optimizer = amp.initialize(model, optimizer,
                                       opt_level='O2' if mixed_precision else 'O0',
                                       keep_batchnorm_fp32=True,
-                                      loss_scale=128.0,
+                                      loss_scale=loss_scale,
                                       verbosity=is_master)
 
     if world > 1:
```
