Commit b3d60bd

psychedelicious authored and hipsterusername committed
feat(nodes): add NormalMapInvocation
Similar to the existing node, but without any resizing, and with a revised model-loading API that uses the model manager. All code related to the invocation now lives in the Invoke repo. Unfortunately, this includes a whole git repo for EfficientNet. I believe we could use the package `timm` instead of this (a rough sketch follows the baseline network below), but it's beyond me.
1 parent fd42da5 commit b3d60bd

40 files changed: +6234 −0 lines
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.fields import ImageField, InputField, WithBoard, WithMetadata
from invokeai.app.invocations.primitives import ImageOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.image_util.normal_bae import NormalMapDetector
from invokeai.backend.image_util.normal_bae.nets.NNET import NNET


@invocation(
    "normal_map",
    title="Normal Map",
    tags=["controlnet", "normal"],
    category="controlnet",
    version="1.0.0",
)
class NormalMapInvocation(BaseInvocation, WithMetadata, WithBoard):
    """Generates a normal map."""

    image: ImageField = InputField(description="The image to process")

    def invoke(self, context: InvocationContext) -> ImageOutput:
        image = context.images.get_pil(self.image.image_name, "RGB")
        loaded_model = context.models.load_remote_model(NormalMapDetector.get_model_url(), NormalMapDetector.load_model)

        with loaded_model as model:
            assert isinstance(model, NNET)
            detector = NormalMapDetector(model)
            normal_map = detector.run(image=image)

        image_dto = context.images.save(image=normal_map)
        return ImageOutput.build(image_dto)
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Caroline Chan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
# Adapted from https://github.com/huggingface/controlnet_aux

import pathlib
import types

import cv2
import huggingface_hub
import numpy as np
import torch
import torchvision.transforms as transforms
from einops import rearrange
from PIL import Image

from invokeai.backend.image_util.normal_bae.nets.NNET import NNET
from invokeai.backend.image_util.util import np_to_pil, pil_to_np, resize_to_multiple


class NormalMapDetector:
    """Simple wrapper around the Normal BAE model for normal map generation."""

    hf_repo_id = "lllyasviel/Annotators"
    hf_filename = "scannet.pt"

    @classmethod
    def get_model_url(cls) -> str:
        """Get the URL to download the model from the Hugging Face Hub."""
        return huggingface_hub.hf_hub_url(cls.hf_repo_id, cls.hf_filename)

    @classmethod
    def load_model(cls, model_path: pathlib.Path) -> NNET:
        """Load the model from a file."""

        args = types.SimpleNamespace()
        args.mode = "client"
        args.architecture = "BN"
        args.pretrained = "scannet"
        args.sampling_ratio = 0.4
        args.importance_ratio = 0.7

        model = NNET(args)

        ckpt = torch.load(model_path, map_location="cpu")["model"]
        # Strip the "module." prefix that DataParallel training leaves on
        # checkpoint keys so they match the bare model's state dict.
        load_dict = {}
        for k, v in ckpt.items():
            if k.startswith("module."):
                k_ = k.replace("module.", "")
                load_dict[k_] = v
            else:
                load_dict[k] = v

        model.load_state_dict(load_dict)
        model.eval()

        return model

    def __init__(self, model: NNET) -> None:
        self.model = model
        self.norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def to(self, device: torch.device):
        self.model.to(device)
        return self

    def run(self, image: Image.Image):
        """Processes an image and returns the detected normal map."""

        device = next(iter(self.model.parameters())).device
        np_image = pil_to_np(image)

        height, width, _channels = np_image.shape

        # The model requires the image to be a multiple of 8
        np_image = resize_to_multiple(np_image, 8)

        image_normal = np_image

        with torch.no_grad():
            image_normal = torch.from_numpy(image_normal).float().to(device)
            image_normal = image_normal / 255.0
            image_normal = rearrange(image_normal, "h w c -> 1 c h w")
            image_normal = self.norm(image_normal)

            normal = self.model(image_normal)
            # normal[0] holds the model's intermediate predictions; take the
            # final one and keep the first three of its four channels (the
            # normal vector; the fourth is the concentration parameter kappa).
            normal = normal[0][-1][:, :3]
            # Map normals from [-1, 1] to [0, 1]
            normal = ((normal + 1) * 0.5).clip(0, 1)

            normal = rearrange(normal[0], "c h w -> h w c").cpu().numpy()
            normal_image = (normal * 255.0).clip(0, 255).astype(np.uint8)

        # Back to the original size
        output_image = cv2.resize(normal_image, (width, height), interpolation=cv2.INTER_LINEAR)

        return np_to_pil(output_image)
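
As a sanity check, here is a minimal standalone sketch of the detector outside the invocation flow. The input and output file names are hypothetical, and the checkpoint is fetched directly with `huggingface_hub` rather than through the model manager, which normally handles the download and caching:

import pathlib

import huggingface_hub
from PIL import Image

from invokeai.backend.image_util.normal_bae import NormalMapDetector

# Fetch scannet.pt into the local Hugging Face cache; in the app, the model
# manager downloads and caches it via get_model_url() instead.
ckpt_path = pathlib.Path(
    huggingface_hub.hf_hub_download(NormalMapDetector.hf_repo_id, NormalMapDetector.hf_filename)
)

model = NormalMapDetector.load_model(ckpt_path)
detector = NormalMapDetector(model)

# Hypothetical input/output paths.
normal_map = detector.run(image=Image.open("input.png").convert("RGB"))
normal_map.save("normal_map.png")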
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .submodules.encoder import Encoder
from .submodules.decoder import Decoder


class NNET(nn.Module):
    def __init__(self, args):
        super(NNET, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(args)

    def get_1x_lr_params(self):  # lr/10 learning rate
        return self.encoder.parameters()

    def get_10x_lr_params(self):  # lr learning rate
        return self.decoder.parameters()

    def forward(self, img, **kwargs):
        return self.decoder(self.encoder(img), **kwargs)
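
Note that `NNET` defines no defaults for `args`. A minimal construction sketch, mirroring the namespace that `NormalMapDetector.load_model` builds above:

import types

from invokeai.backend.image_util.normal_bae.nets.NNET import NNET

# Same hyperparameters as NormalMapDetector.load_model uses.
args = types.SimpleNamespace(
    mode="client",
    architecture="BN",
    pretrained="scannet",
    sampling_ratio=0.4,
    importance_ratio=0.7,
)
model = NNET(args)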

invokeai/backend/image_util/normal_bae/nets/__init__.py

Whitespace-only changes.
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .submodules.submodules import UpSampleBN, norm_normalize


# This is the baseline encoder-decoder we used in the ablation study
class NNET(nn.Module):
    def __init__(self, args=None):
        super(NNET, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(num_classes=4)

    def forward(self, x, **kwargs):
        out = self.decoder(self.encoder(x), **kwargs)

        # Bilinearly upsample the output to match the input resolution
        up_out = F.interpolate(out, size=[x.size(2), x.size(3)], mode='bilinear', align_corners=False)

        # L2-normalize the first three channels / ensure positive value for concentration parameters (kappa)
        up_out = norm_normalize(up_out)
        return up_out

    def get_1x_lr_params(self):  # lr/10 learning rate
        return self.encoder.parameters()

    def get_10x_lr_params(self):  # lr learning rate
        modules = [self.decoder]
        for m in modules:
            yield from m.parameters()


# Encoder
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()

        basemodel_name = 'tf_efficientnet_b5_ap'
        basemodel = torch.hub.load('rwightman/gen-efficientnet-pytorch', basemodel_name, pretrained=True)

        # Remove last layer
        basemodel.global_pool = nn.Identity()
        basemodel.classifier = nn.Identity()

        self.original_model = basemodel

    def forward(self, x):
        features = [x]
        for k, v in self.original_model._modules.items():
            if (k == 'blocks'):
                for ki, vi in v._modules.items():
                    features.append(vi(features[-1]))
            else:
                features.append(v(features[-1]))
        return features


# Decoder (no pixel-wise MLP, no uncertainty-guided sampling)
class Decoder(nn.Module):
    def __init__(self, num_classes=4):
        super(Decoder, self).__init__()
        self.conv2 = nn.Conv2d(2048, 2048, kernel_size=1, stride=1, padding=0)
        self.up1 = UpSampleBN(skip_input=2048 + 176, output_features=1024)
        self.up2 = UpSampleBN(skip_input=1024 + 64, output_features=512)
        self.up3 = UpSampleBN(skip_input=512 + 40, output_features=256)
        self.up4 = UpSampleBN(skip_input=256 + 24, output_features=128)
        self.conv3 = nn.Conv2d(128, num_classes, kernel_size=3, stride=1, padding=1)

    def forward(self, features):
        x_block0, x_block1, x_block2, x_block3, x_block4 = features[4], features[5], features[6], features[8], features[11]
        x_d0 = self.conv2(x_block4)
        x_d1 = self.up1(x_d0, x_block3)
        x_d2 = self.up2(x_d1, x_block2)
        x_d3 = self.up3(x_d2, x_block1)
        x_d4 = self.up4(x_d3, x_block0)
        out = self.conv3(x_d4)
        return out


if __name__ == '__main__':
    model = NNET()
    x = torch.rand(2, 3, 480, 640)
    out = model(x)
    print(out.shape)
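
The commit message above floats `timm` as a replacement for the vendored gen-efficientnet-pytorch checkout. A rough, untested sketch of what that swap might look like; the timm model name and the use of `features_only` are assumptions, and since the hand-rolled `forward` above collects twelve feature maps (the decoder reads indices 4, 5, 6, 8, and 11), the timm feature taps would need to be matched up before this could be a drop-in replacement:

import timm
import torch.nn as nn


class TimmEncoder(nn.Module):
    """Hypothetical timm-backed encoder; not a drop-in replacement as written."""

    def __init__(self):
        super().__init__()
        # features_only=True makes the model return intermediate feature maps
        # (one per stage) instead of classification logits.
        self.backbone = timm.create_model(
            "tf_efficientnet_b5_ap",  # assumed timm alias for the same weights
            pretrained=True,
            features_only=True,
        )

    def forward(self, x):
        # Returns a list of stage feature maps; the Decoder above expects the
        # specific taps produced by the gen-efficientnet forward loop, so the
        # indexing would need to be reconciled.
        return self.backbone(x)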

invokeai/backend/image_util/normal_bae/nets/submodules/__init__.py

Whitespace-only changes.
