
Commit c465c14

Author: zhangming8 (committed)
default seed=None; pre-allocate gpu memory
1 parent 5d6faac commit c465c14

File tree: 5 files changed, +33 -7 lines changed


README.md (+1, -1)

@@ -57,7 +57,7 @@ mAP was reevaluated on COCO val2017 and test2017, and some results are slightly
 
 d. Some tips:
    Ⅰ You can also change params in 'train.sh'(these params will replace opt.xxx in config.py) and use 'nohup sh train.sh &' to train
-   Ⅱ If you want to close mulit-size training, change opt.random_size = None or (20, 21) in 'config.py' or set random_size=None in 'train.sh'
+   Ⅱ If you want to close mulit-size training, change opt.random_size = None in 'config.py' or set random_size=None in 'train.sh'
    Ⅲ Mulit-gpu train: change opt.gpus = "3,5,6,7"
    Ⅳ Visualized log by tensorboard: tensorboard --logdir exp/your_exp_id/logs_2021-08-xx-xx-xx and visit http://localhost:6006
 Your can also use the following shell scripts:

config.py (+5, -3)

@@ -29,7 +29,7 @@ def update_nano_tiny(cfg, inp_params):
 # opt.dataset_path = r"D:\work\public_dataset\coco2017" # Windows system
 opt.backbone = "CSPDarknet-s" # CSPDarknet-nano, CSPDarknet-tiny, CSPDarknet-s, CSPDarknet-m, l, x
 opt.input_size = (640, 640)
-opt.random_size = (14, 26) # None; multi-size train: from 448 to 800, random sample an int value and *32 as input size
+opt.random_size = (14, 26) # None; multi-size train: from 448(14*32) to 832(26*32), set None to disable it
 opt.test_size = (640, 640) # evaluate size
 opt.gpus = "0" # "-1" "0" "3,4,5" "0,1,2,3,4,5,6,7" # -1 for cpu
 opt.batch_size = 24
@@ -78,7 +78,7 @@ def update_nano_tiny(cfg, inp_params):
 opt.shear = 2.0
 opt.perspective = 0.0
 opt.enable_mixup = True
-opt.seed = 0
+opt.seed = None # 0
 opt.data_num_workers = 4
 
 opt.momentum = 0.9
@@ -94,6 +94,7 @@ def update_nano_tiny(cfg, inp_params):
 opt.use_amp = False # True
 opt.cuda_benchmark = True
 opt.nms_thresh = 0.65
+opt.occupy_mem = False # pre-allocate gpu memory for training to avoid memory Fragmentation.
 
 opt.rgb_means = [0.485, 0.456, 0.406]
 opt.std = [0.229, 0.224, 0.225]
@@ -132,6 +133,7 @@ def update_nano_tiny(cfg, inp_params):
     opt.cuda_benchmark = False
 if opt.reid_dim > 0:
     assert opt.tracking_id_nums is not None
-
+if opt.random_size is None:
+    opt.test_size = opt.input_size
 os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpus_str
 print("\n{} final config: {}\n{}".format("-" * 20, "-" * 20, opt))

train.py (+5, -3)

@@ -19,7 +19,7 @@
 from models.yolox import get_model
 from models.post_process import yolox_post_process
 from utils.lr_scheduler import LRScheduler
-from utils.util import AverageMeter, write_log, configure_module
+from utils.util import AverageMeter, write_log, configure_module, occupy_mem
 from utils.model_utils import EMA, save_model, load_model, ensure_same, clip_grads
 from utils.data_parallel import set_device, _DataParallel
 from utils.logger import Logger
@@ -98,8 +98,7 @@ def run_epoch(model_with_loss, optimizer, scaler, ema, phase, epoch, data_iter,
                 avg_loss_stats[l] = AverageMeter()
             avg_loss_stats[l].update(loss_stats[l], inps.size(0))
             Bar.suffix = Bar.suffix + '|{} {:.4f} '.format(l, avg_loss_stats[l].avg)
-        Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s({dt.avg:.3f}s) |Net {bt.avg:.3f}s'.format(dt=data_time,
-                                                                                                 bt=batch_time)
+        Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s |Batch {bt.val:.3f}s'.format(dt=data_time, bt=batch_time)
         if opt.print_iter > 0 and iter_id % opt.print_iter == 0:
             print('{}| {}'.format(opt.exp_id, Bar.suffix))
             logger.write('{}| {}\n'.format(opt.exp_id, Bar.suffix))
@@ -178,6 +177,7 @@ def train(model, scaler, train_loader, val_loader, optimizer, lr_scheduler, star
             if loss_dict_val['loss'] <= best:
                 best = loss_dict_val['loss']
                 save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch, model)
+            del loss_dict_val, preds
 
         save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), epoch,
                    model) if epoch % opt.save_epoch == 0 else ""
@@ -227,6 +227,8 @@ def main():
 
     # DP
     opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
+    if opt.occupy_mem and opt.device.type != 'cpu':
+        occupy_mem(opt.device)
     model, optimizer = set_device(model, optimizer, opt)
     train(model, scaler, train_loader, val_loader, optimizer, lr_scheduler, start_epoch, opt.accumulate, no_aug)

train.sh (+3)

@@ -19,3 +19,6 @@ python train.py gpus='0' backbone="CSPDarknet-s" num_epochs=300 exp_id="coco_CSP
 
 # resume 'model_last.pth', include weight, optimizer, scaler and epoch
 #python train.py gpus='0' backbone="CSPDarknet-s" num_epochs=300 exp_id="coco_CSPDarknet-s_640x640" use_amp=True val_intervals=2 data_num_workers=6 metric="ap" batch_size=48 load_model="exp/coco_CSPDarknet-s_640x640/model_last.pth" resume=True
+
+# GPU memory changes with the input size when multi-size training, which can be avoided by pre allocating memory
+#python train.py gpus='0' backbone="CSPDarknet-tiny" num_epochs=300 exp_id="coco_CSPDarknet-tiny_416x416" use_amp=True val_intervals=2 data_num_workers=6 metric="ap" batch_size=128 occupy_mem=True
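The new comment in train.sh points at why pre-allocation matters: with multi-size training the batch footprint changes as the input size changes. A rough back-of-the-envelope check of just the input tensor (assumptions: float32 inputs, batch_size=128 as in the command above, and the (14, 26) multiplier range from config.py):

    # Input-tensor size across the multi-size range; activations scale roughly
    # in proportion, which is why reserved GPU memory fluctuates without
    # pre-allocation.
    batch = 128
    for k in (14, 20, 26):
        side = k * 32
        mib = batch * 3 * side * side * 4 / 2 ** 20
        print("input {}x{} -> {:.0f} MiB".format(side, side, mib))

The exp_id in that command suggests a 416x416 base size for the tiny backbone, so the actual multiplier range there may differ; the arithmetic above only illustrates the trend.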

utils/util.py (+19)

@@ -60,6 +60,25 @@ def sync_time(inputs):
     return time.time()
 
 
+def get_total_and_free_memory_in_mb(cuda_device):
+    devices_info_str = os.popen("nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader")
+    devices_info = devices_info_str.read().strip().split("\n")
+    total, used = devices_info[int(cuda_device)].split(",")
+    return int(total), int(used)
+
+
+def occupy_mem(cuda_device, mem_ratio=0.9):
+    """
+    pre-allocate gpu memory for training to avoid memory Fragmentation.
+    """
+    total, used = get_total_and_free_memory_in_mb(0)
+    max_mem = int(total * mem_ratio)
+    block_mem = max_mem - used
+    x = torch.FloatTensor(256, 1024, block_mem).to(cuda_device)
+    del x
+    time.sleep(5)
+
+
 def gpu_mem_usage():
     """
     Compute the GPU memory usage for the current device (MB).
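Two details worth spelling out about occupy_mem as added here. First, the sizing: block_mem is in MiB (nvidia-smi with --format=nounits reports MiB), and torch.FloatTensor(256, 1024, block_mem) holds 256*1024*block_mem float32 values, i.e. 256*1024*4 bytes = 1 MiB per unit of block_mem, so the throwaway tensor spans roughly block_mem MiB. Second, deleting the tensor does not return the memory to the driver; it stays in PyTorch's caching allocator, which is what lets later, differently-sized batches reuse it. A small sketch of how that could be observed (an illustration, not part of the commit):

    import torch

    def check_reserved(cuda_device="cuda:0", block_mem=1024):
        # Allocate ~block_mem MiB, drop it, and show that PyTorch keeps the
        # block cached ("reserved") rather than returning it to the driver.
        x = torch.FloatTensor(256, 1024, block_mem).to(cuda_device)
        del x
        reserved_mib = torch.cuda.memory_reserved(cuda_device) / 2 ** 20
        print("reserved after del: {:.0f} MiB".format(reserved_mib))
        return reserved_mib

Also note that occupy_mem calls get_total_and_free_memory_in_mb with a hard-coded 0, so the size of the pre-allocated block is always computed from GPU 0 as listed by nvidia-smi, even when cuda_device points elsewhere.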
