thorgas
diff --git a/‎README.md
+1 b/‎README.md
+1
diff --git a/‎single_stage_detector/Dockerfile
+20 b/‎single_stage_detector/Dockerfile
+20
diff --git a/‎single_stage_detector/download_dataset.sh
+7 b/‎single_stage_detector/download_dataset.sh
+7
diff --git a/‎single_stage_detector/requirements.txt
+12 b/‎single_stage_detector/requirements.txt
+12
diff --git a/‎single_stage_detector/ssd/README.md
+71 b/‎single_stage_detector/ssd/README.md
+71
diff --git a/‎single_stage_detector/ssd/base_model.py
+206 b/‎single_stage_detector/ssd/base_model.py
+206
@@ -14,6 +14,7 @@ We provide reference implementations for each of the 7 benchmarks in the MLPerf
 
 * image_classification - Resnet-50 v1 applied to Imagenet.
 * object_detection - Mask R-CNN applied to COCO. 
+* single_stage_detector - SSD applied to COCO 2017.
 * speech_recognition - DeepSpeech2 applied to Librispeech.
 * translation - Transformer applied to WMT English-German.
 * recommendation - Neural Collaborative Filtering applied to MovieLens 20 Million (ml-20m).
 
@@ -0,0 +1,20 @@
+# Base image: PyTorch 0.4 with CUDA 9 / cuDNN 7.
+FROM pytorch/pytorch:0.4_cuda9_cudnn7
+
+# Set working directory
+WORKDIR /mlperf
+
+# System packages: Tk bindings for Python 3 and pip.
+# NOTE(review): python3-tk is presumably needed by matplotlib's Tk backend - confirm.
+RUN apt-get update && \
+    apt-get install -y python3-tk python-pip
+
+# Necessary pip packages
+RUN pip install --upgrade pip
+# NOTE(review): Cython is installed in its own step before pycocotools,
+# presumably because pycocotools needs it at build time - confirm.
+RUN pip install Cython==0.28.4 \
+                matplotlib==2.2.2
+RUN python3 -m pip install pycocotools==2.0.0
+
+# Copy SSD code
+WORKDIR /mlperf
+COPY . .
+# Install the pinned Python dependencies listed in requirements.txt.
+RUN pip install -r requirements.txt
+
+# Containers start in the ssd/ sub-directory.
+WORKDIR /mlperf/ssd
@@ -0,0 +1,7 @@
+# Get COCO 2017 data sets
+dir=$(pwd)
+mkdir /coco; cd /coco
+curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip
+curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip
+curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip
+cd $dir
@@ -0,0 +1,12 @@
+cycler==0.10.0
+kiwisolver==1.0.1
+matplotlib==2.2.2
+numpy==1.14.5
+Pillow==5.2.0
+pycocotools==2.0.0
+pyparsing==2.2.0
+python-dateutil==2.7.3
+pytz==2018.5
+six==1.11.0
+torch==0.4.0
+torchvision==0.2.1
@@ -0,0 +1,71 @@
+# 1. Problem
+Object detection.
+
+# 2. Directions
+
+### Steps to configure machine
+From Source
+
+Standard script.
+
+From Docker
+1. Checkout the MLPerf repository
+```
+git clone https://github.com/mlperf/reference.git
+```
+2. Install CUDA and Docker
+```
+source reference/install_cuda_docker.sh
+```
+3. Build the docker image for the single stage detection task
+```
+# Build from Dockerfile
+cd reference/single_stage_detector/
+sudo docker build -t mlperf/single_stage_detector .
+```
+
+### Steps to download data
+```
+cd reference/single_stage_detector/
+source download_dataset.sh
+```
+
+### Steps to run benchmark.
+From Source
+
+Run the run_and_time.sh script
+```
+cd reference/single_stage_detector/ssd
+source run_and_time.sh SEED TARGET
+```
+where SEED is the random seed for a run, TARGET is the quality target from Section 5 below.
+
+Docker Image
+```
+sudo nvidia-docker run -v /coco:/coco -t -i --rm --ipc=host mlperf/single_stage_detector ./run_and_time.sh SEED TARGET
+```
+
+# 3. Dataset/Environment
+### Publication/Attribution
+Microsoft COCO: COmmon Objects in Context. 2017.
+
+### Training and test data separation
+Train on 2017 COCO train data set, compute mAP on 2017 COCO val data set.
+
+# 4. Model.
+### Publication/Attribution
+Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. In the Proceedings of the European Conference on Computer Vision (ECCV), 2016.
+
+Backbone is ResNet34 pretrained on ILSVRC 2012 (from torchvision). Modifications to the backbone networks: remove conv_5x residual blocks, change the first 3x3 convolution of the conv_4x block from stride 2 to stride 1 (this increases the resolution of the feature map to which detector heads are attached), attach all 6 detector heads to the output of the last conv_4x residual block. Thus detections are attached to 38x38, 19x19, 10x10, 5x5, 3x3, and 1x1 feature maps. Convolutions in the detector layers are followed by batch normalization layers.
+
+# 5. Quality.
+### Quality metric
+Metric is COCO box mAP (averaged over IoU of 0.5:0.95), computed over 2017 COCO val data.
+
+### Quality target
+mAP of 0.212
+
+### Evaluation frequency
+
+### Evaluation thoroughness
+All the images in COCO 2017 val data set.
@@ -0,0 +1,206 @@
+"""
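+    Backbone networks (ResNet18/ResNet34), L2Norm layer and loss for SSD,
+    plus helpers to load pretrained (e.g. vgg16) weights and save them to a
+    separate file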
+"""
+
+#from torchvision.models.vgg import vgg16
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+from torch.autograd import Variable
+from collections import OrderedDict
+
+from torchvision.models.resnet import resnet18, resnet34, resnet50
+
+def _ModifyConvStrideDilation(conv, stride=(1, 1), padding=None):
+    conv.stride = stride
+
+    if padding is not None:
+        conv.padding = padding
+
+def _ModifyBlock(block, bottleneck=False, **kwargs):
+    for m in list(block.children()):
+        if bottleneck:
+           _ModifyConvStrideDilation(m.conv2, **kwargs)
+        else:
+           _ModifyConvStrideDilation(m.conv1, **kwargs)
+
+        if m.downsample is not None:
+            # need to make sure no padding for the 1x1 residual connection
+            _ModifyConvStrideDilation(list(m.downsample.children())[0], **kwargs)
+
+class ResNet18(nn.Module):
+    def __init__(self):
+        super().__init__()
+        rn18 = resnet18(pretrained=True)
+
+
+        # discard last Resnet block, avrpooling and classification FC
+        # layer1 = up to and including conv3 block
+        self.layer1 = nn.Sequential(*list(rn18.children())[:6])
+        # layer2 = conv4 block only
+        self.layer2 = nn.Sequential(*list(rn18.children())[6:7])
+
+        # modify conv4 if necessary
+        # Always deal with stride in first block
+        modulelist = list(self.layer2.children())
+        _ModifyBlock(modulelist[0], stride=(1,1))
+
+    def forward(self, data):
+        layer1_activation = self.layer1(data)
+        x = layer1_activation
+        layer2_activation = self.layer2(x)
+
+        # Only need the output of conv4
+        return [layer2_activation]
+
+class ResNet34(nn.Module):
+    def __init__(self):
+        super().__init__()
+        rn34 = resnet34(pretrained=True)
+
+        # discard last Resnet block, avrpooling and classification FC
+        self.layer1 = nn.Sequential(*list(rn34.children())[:6])
+        self.layer2 = nn.Sequential(*list(rn34.children())[6:7])
+        # modify conv4 if necessary
+        # Always deal with stride in first block
+        modulelist = list(self.layer2.children())
+        _ModifyBlock(modulelist[0], stride=(1,1))
+
+
+    def forward(self, data):
+        layer1_activation = self.layer1(data)
+        x = layer1_activation
+        layer2_activation = self.layer2(x)
+
+        return [layer2_activation]
+
+class L2Norm(nn.Module):
+    """
+       Scale shall be learnable according to original paper
+       scale: initial scale number
+       chan_num: L2Norm channel number (norm over all channels)
+    """
+    def __init__(self, scale=20, chan_num=512):
+        super(L2Norm, self).__init__()
+        # Scale across channels
+        self.scale = \
+            nn.Parameter(torch.Tensor([scale]*chan_num).view(1, chan_num, 1, 1))
+
+    def forward(self, data):
+        # normalize accross channel
+        return self.scale*data*data.pow(2).sum(dim=1, keepdim=True).clamp(min=1e-12).rsqrt()
+
+
+
+def tailor_module(src_model, src_dir, tgt_model, tgt_dir):
+    state = torch.load(src_dir)
+    src_model.load_state_dict(state)
+    src_state = src_model.state_dict()
+    # only need features
+    keys1 = src_state.keys()
+    keys1 = [k for k in src_state.keys() if k.startswith("features")]
+    keys2 = tgt_model.state_dict().keys()
+
+    assert len(keys1) == len(keys2)
+    state = OrderedDict()
+
+    for k1, k2 in zip(keys1, keys2):
+        # print(k1, k2)
+        state[k2] = src_state[k1]
+    #diff_keys = state.keys() - target_model.state_dict().keys()
+    #print("Different Keys:", diff_keys)
+    # Remove unecessary keys
+    #for k in diff_keys:
+    #    state.pop(k)
+    tgt_model.load_state_dict(state)
+    torch.save(tgt_model.state_dict(), tgt_dir)
+
+# Default vgg16 in pytorch seems different from ssd
+def make_layers(cfg, batch_norm=False):
+    layers = []
+    in_channels = 3
+    for v in cfg:
+        if v == 'M':
+            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
+        elif v == 'C':
+            # Notice ceil_mode is true
+            layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
+        else:
+            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
+            if batch_norm:
+                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
+            else:
+                layers += [conv2d, nn.ReLU(inplace=True)]
+            in_channels = v
+    return layers
+
+class Loss(nn.Module):
+    """
+        Implements the loss as the sum of the followings:
+        1. Confidence Loss: All labels, with hard negative mining
+        2. Localization Loss: Only on positive labels
+        Suppose input dboxes has the shape 8732x4
+    """
+
+    def __init__(self, dboxes):
+        """
+            dboxes: default boxes object; it must expose `scale_xy` and
+                `scale_wh` and be callable as dboxes(order="xywh")
+        """
+        super(Loss, self).__init__()
+        # Reciprocals of the default-box scales, applied when encoding
+        # ground-truth locations in _loc_vec
+        self.scale_xy = 1.0/dboxes.scale_xy
+        self.scale_wh = 1.0/dboxes.scale_wh
+
+        # reduce=False keeps the per-element loss so it can be masked later
+        self.sl1_loss = nn.SmoothL1Loss(reduce=False)
+        # Default boxes transposed and given a leading dimension of size 1
+        # (8732x4 -> 1x4x8732 under the assumption in the class docstring);
+        # stored as a parameter that is excluded from gradient computation
+        self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0),
+            requires_grad=False)
+        # Two factor are from following links
+        # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
+        self.con_loss = nn.CrossEntropyLoss(reduce=False)
+
+    def _loc_vec(self, loc):
+        """
+            Generate Location Vectors
+
+            Encodes `loc` relative to the default boxes: the first two
+            rows become scaled offsets divided by the default box size,
+            the remaining rows become the scaled log of the size ratio.
+        """
+        gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ]
+        gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log()
+
+        return torch.cat((gxy, gwh), dim=1).contiguous()
+
+    def forward(self, ploc, plabel, gloc, glabel):
+        """
+            ploc, plabel: Nx4x8732, Nxlabel_numx8732
+                predicted location and labels
+
+            gloc, glabel: Nx4x8732, Nx8732
+                ground truth location and labels
+
+            Returns the combined localization and confidence loss,
+            normalised per sample by its number of positive boxes and
+            averaged over the batch dimension.
+        """
+
+        # Positive boxes are those with a label greater than 0
+        # NOTE(review): label 0 is presumably the background class - confirm
+        mask = glabel > 0
+        pos_num = mask.sum(dim=1)
+
+        vec_gd = self._loc_vec(gloc)
+
+        # sum on four coordinates, and mask
+        sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1)
+        sl1 = (mask.float()*sl1).sum(dim=1)
+
+        # hard negative mining
+        con = self.con_loss(plabel, glabel)
+
+        # positive boxes will never be selected: their loss is zeroed on a
+        # copy so they sort last among the candidates
+        con_neg = con.clone()
+        con_neg[mask] = 0
+        # Sorting the sort indices yields, for every box, its rank in the
+        # descending order of negative confidence loss
+        _, con_idx = con_neg.sort(dim=1, descending=True)
+        _, con_rank = con_idx.sort(dim=1)
+
+        # number of negative three times positive
+        neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1)
+        neg_mask = con_rank < neg_num
+
+        # Confidence loss over the positives plus the selected negatives
+        closs = (con*(mask.float() + neg_mask.float())).sum(dim=1)
+
+        # avoid no object detected: samples without any positive box are
+        # zeroed by num_mask, and the clamp prevents a division by zero
+        total_loss = sl1 + closs
+        num_mask = (pos_num > 0).float()
+        pos_num = pos_num.float().clamp(min=1e-6)
+
+        ret = (total_loss*num_mask/pos_num).mean(dim=0)
+        return ret
+