diff --git a/keras_hub/src/models/cspnet/cspnet_backbone.py b/keras_hub/src/models/cspnet/cspnet_backbone.py index b66425feba..460c019e6b 100644 --- a/keras_hub/src/models/cspnet/cspnet_backbone.py +++ b/keras_hub/src/models/cspnet/cspnet_backbone.py @@ -81,7 +81,7 @@ class CSPNetBackbone(FeaturePyramidBackbone): # Pretrained backbone model = keras_hub.models.CSPNetBackbone.from_preset( - "cspdarknet53_ra_imagenet" + "csp_darknet_53_ra_imagenet" ) model(input_data) @@ -357,18 +357,6 @@ def apply(x): dtype=dtype, name=f"{name}_bottleneck_block_bn_3", )(x) - if activation == "leaky_relu": - x = layers.LeakyReLU( - negative_slope=0.01, - dtype=dtype, - name=f"{name}_bottleneck_block_activation_3", - )(x) - else: - x = layers.Activation( - activation, - dtype=dtype, - name=f"{name}_bottleneck_block_activation_3", - )(x) x = layers.add( [x, shortcut], dtype=dtype, name=f"{name}_bottleneck_block_add" @@ -673,6 +661,13 @@ def apply(x): name=f"{name}_csp_activation_1", )(x) else: + if strides > 1: + x = layers.ZeroPadding2D( + 1, + data_format=data_format, + dtype=dtype, + name=f"{name}_csp_conv_pad_1", + )(x) x = layers.Conv2D( filters=down_chs, kernel_size=3, @@ -882,6 +877,13 @@ def apply(x): name=f"{name}_cs3_activation_1", )(x) else: + if strides > 1: + x = layers.ZeroPadding2D( + 1, + data_format=data_format, + dtype=dtype, + name=f"{name}_cs3_conv_pad_1", + )(x) x = layers.Conv2D( filters=down_chs, kernel_size=3, @@ -1062,6 +1064,13 @@ def apply(x): name=f"{name}_dark_activation_1", )(x) else: + if strides > 1: + x = layers.ZeroPadding2D( + 1, + data_format=data_format, + dtype=dtype, + name=f"{name}_dark_conv_pad_1", + )(x) x = layers.Conv2D( filters=filters, kernel_size=3, @@ -1091,18 +1100,18 @@ def apply(x): dtype=dtype, name=f"{name}_dark_activation_1", )(x) - for i in range(depth): - x = block_fn( - filters=block_channels, - dilation=dilation, - bottle_ratio=bottle_ratio, - groups=groups, - activation=activation, - data_format=data_format, - 
channel_axis=channel_axis, - dtype=dtype, - name=f"{name}_block_{i}", - )(x) + for i in range(depth): + x = block_fn( + filters=block_channels, + dilation=dilation, + bottle_ratio=bottle_ratio, + groups=groups, + activation=activation, + data_format=data_format, + channel_axis=channel_axis, + dtype=dtype, + name=f"{name}_block_{i}", + )(x) return x return apply @@ -1135,6 +1144,13 @@ def apply(x): or (i == last_idx and strides > 2 and not pooling) else 1 ) + if conv_strides > 1: + x = layers.ZeroPadding2D( + (kernel_size - 1) // 2, + data_format=data_format, + dtype=dtype, + name=f"csp_stem_pad_{i}", + )(x) x = layers.Conv2D( filters=chs, kernel_size=kernel_size, @@ -1167,10 +1183,19 @@ def apply(x): if pooling == "max": assert strides > 2 + # Use manual padding to handle edge case scenario to ignore zero's + # as max value instead consider negative values from Leaky Relu type + # of activations. + pad_width = [[1, 1], [1, 1]] + if data_format == "channels_last": + pad_width += [[0, 0]] + else: + pad_width = [[0, 0]] + pad_width + pad_width = [[0, 0]] + pad_width + x = ops.pad(x, pad_width=pad_width, constant_values=float("-inf")) x = layers.MaxPooling2D( pool_size=3, strides=2, - padding="same", data_format=data_format, dtype=dtype, name="csp_stem_pool", diff --git a/keras_hub/src/models/cspnet/cspnet_backbone_test.py b/keras_hub/src/models/cspnet/cspnet_backbone_test.py index 3b8681d3d9..efe7c6de21 100644 --- a/keras_hub/src/models/cspnet/cspnet_backbone_test.py +++ b/keras_hub/src/models/cspnet/cspnet_backbone_test.py @@ -22,6 +22,7 @@ def setUp(self): "expand_ratio": (2.0,) + (1.0,), "block_type": "dark_block", "stage_type": "csp", + "stem_padding": "same", } self.input_size = 64 self.input_data = ops.ones((2, self.input_size, self.input_size, 3)) @@ -38,9 +39,9 @@ def test_backbone_basics(self, stage_type, block_type): "stage_type": stage_type, }, input_data=self.input_data, - expected_output_shape=(2, 6, 6, 48), + expected_output_shape=(2, 8, 8, 48), 
expected_pyramid_output_keys=["P2", "P3", "P4"], - expected_pyramid_image_sizes=[(30, 30), (14, 14), (6, 6)], + expected_pyramid_image_sizes=[(32, 32), (16, 16), (8, 8)], ) @pytest.mark.large diff --git a/keras_hub/src/models/cspnet/cspnet_presets.py b/keras_hub/src/models/cspnet/cspnet_presets.py index 8b090b56bf..af9b1dcff1 100644 --- a/keras_hub/src/models/cspnet/cspnet_presets.py +++ b/keras_hub/src/models/cspnet/cspnet_presets.py @@ -6,11 +6,46 @@ "description": ( "A CSP-DarkNet (Cross-Stage-Partial) image classification model" " pre-trained on the Randomly Augmented ImageNet 1k dataset at " - "a 224x224 resolution." + "a 256x256 resolution." ), - "params": 26652512, + "params": 27642184, "path": "cspnet", }, - "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_darknet_53_ra_imagenet/1", + "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_darknet_53_ra_imagenet/2", + }, + "csp_resnext_50_ra_imagenet": { + "metadata": { + "description": ( + "A CSP-ResNeXt (Cross-Stage-Partial) image classification model" + " pre-trained on the Randomly Augmented ImageNet 1k dataset at " + "a 256x256 resolution." + ), + "params": 20569896, + "path": "cspnet", + }, + "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_resnext_50_ra_imagenet/1", + }, + "csp_resnet_50_ra_imagenet": { + "metadata": { + "description": ( + "A CSP-ResNet (Cross-Stage-Partial) image classification model" + " pre-trained on the Randomly Augmented ImageNet 1k dataset at " + "a 256x256 resolution." + ), + "params": 21616168, + "path": "cspnet", + }, + "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_resnet_50_ra_imagenet/1", + }, + "darknet_53_imagenet": { + "metadata": { + "description": ( + "A DarkNet image classification model pre-trained on the " + "ImageNet 1k dataset at a 256x256 resolution." 
+ ), + "params": 41609928, + "path": "cspnet", + }, + "kaggle_handle": "kaggle://keras/cspdarknet/keras/darknet_53_imagenet/1", }, } diff --git a/keras_hub/src/utils/timm/convert_cspnet.py b/keras_hub/src/utils/timm/convert_cspnet.py index 161edab23f..35a159e591 100644 --- a/keras_hub/src/utils/timm/convert_cspnet.py +++ b/keras_hub/src/utils/timm/convert_cspnet.py @@ -17,10 +17,69 @@ def convert_backbone_config(timm_config): bottle_ratio = (0.5,) + (1.0,) block_ratio = (1.0,) + (0.5,) expand_ratio = (2.0,) + (1.0,) + stem_padding = "same" + stem_pooling = None stage_type = "csp" + groups = 1 block_type = "dark_block" down_growth = True - stackwise_strides = 2 + stackwise_strides = [2, 2, 2, 2, 2] + avg_down = False + cross_linear = False + elif timm_architecture == "cspresnet50": + stem_filters = 64 + stem_kernel_size = 7 + stem_strides = 4 + stackwise_depth = [3, 3, 5, 2] + stackwise_strides = [1, 2, 2, 2] + stackwise_num_filters = [128, 256, 512, 1024] + block_type = "bottleneck_block" + stage_type = "csp" + bottle_ratio = [0.5] + block_ratio = [1.0] + expand_ratio = [2.0] + stem_padding = "valid" + stem_pooling = "max" + avg_down = False + groups = 1 + down_growth = False + cross_linear = True + elif timm_architecture == "cspresnext50": + stem_filters = 64 + stem_kernel_size = 7 + stem_strides = 4 + stackwise_depth = [3, 3, 5, 2] + stackwise_num_filters = [256, 512, 1024, 2048] + bottle_ratio = [1.0] + block_ratio = [0.5] + expand_ratio = [1.0] + stage_type = "csp" + block_type = "bottleneck_block" + stem_pooling = "max" + stackwise_strides = [1, 2, 2, 2] + groups = 32 + stem_padding = "valid" + avg_down = False + down_growth = False + cross_linear = True + elif timm_architecture == "darknet53": + stem_filters = 32 + stem_kernel_size = 3 + stem_strides = 1 + stackwise_depth = [1, 2, 8, 8, 4] + stackwise_num_filters = [64, 128, 256, 512, 1024] + bottle_ratio = [0.5] + block_ratio = [1.0] + groups = 1 + expand_ratio = [1.0] + stage_type = "dark" + block_type = 
"dark_block" + stem_pooling = None + stackwise_strides = [2, 2, 2, 2, 2] + stem_padding = "same" + avg_down = False + down_growth = False + cross_linear = False else: raise ValueError( f"Currently, the architecture {timm_architecture} is not supported." @@ -38,6 +97,11 @@ def convert_backbone_config(timm_config): block_type=block_type, stackwise_strides=stackwise_strides, down_growth=down_growth, + stem_pooling=stem_pooling, + stem_padding=stem_padding, + avg_down=avg_down, + cross_linear=cross_linear, + groups=groups, ) @@ -81,21 +145,36 @@ def port_batch_normalization(hf_weight_prefix, keras_layer_name): stackwise_depth = backbone.stackwise_depth stage_type = backbone.stage_type block_type = backbone.block_type + strides = backbone.stackwise_strides for idx, block in enumerate(stackwise_depth): - port_conv2d( - f"stages.{idx}.conv_down.conv", - f"stage_{idx}_{stage_type}_conv_down_1", - ) - port_batch_normalization( - f"stages.{idx}.conv_down.bn", f"stage_{idx}_{stage_type}_bn_1" - ) - port_conv2d( - f"stages.{idx}.conv_exp.conv", f"stage_{idx}_{stage_type}_conv_exp" - ) - port_batch_normalization( - f"stages.{idx}.conv_exp.bn", f"stage_{idx}_{stage_type}_bn_2" - ) + if strides[idx] != 1 or stage_type == "dark": + if strides[idx] == 2 and backbone.avg_down: + port_conv2d( + f"stages.{idx}.conv_down.1.conv", + f"stage_{idx}_{stage_type}_conv_down_1", + ) + port_batch_normalization( + f"stages.{idx}.conv_down.1.bn", + f"stage_{idx}_{stage_type}_bn_1", + ) + else: + port_conv2d( + f"stages.{idx}.conv_down.conv", + f"stage_{idx}_{stage_type}_conv_down_1", + ) + port_batch_normalization( + f"stages.{idx}.conv_down.bn", + f"stage_{idx}_{stage_type}_bn_1", + ) + if stage_type != "dark": + port_conv2d( + f"stages.{idx}.conv_exp.conv", + f"stage_{idx}_{stage_type}_conv_exp", + ) + port_batch_normalization( + f"stages.{idx}.conv_exp.bn", f"stage_{idx}_{stage_type}_bn_2" + ) for i in range(block): port_conv2d( @@ -133,16 +212,8 @@ def 
port_batch_normalization(hf_weight_prefix, keras_layer_name): f"stages.{idx}.conv_transition_b.bn", f"stage_{idx}_{stage_type}_transition_b_bn", ) - port_conv2d( - f"stages.{idx}.conv_transition.conv", - f"stage_{idx}_{stage_type}_conv_transition", - ) - port_batch_normalization( - f"stages.{idx}.conv_transition.bn", - f"stage_{idx}_{stage_type}_transition_bn", - ) - else: + if stage_type != "dark": port_conv2d( f"stages.{idx}.conv_transition.conv", f"stage_{idx}_{stage_type}_conv_transition", diff --git a/keras_hub/src/utils/timm/convert_cspnet_test.py b/keras_hub/src/utils/timm/convert_cspnet_test.py index dcddca8ae5..15104a691a 100644 --- a/keras_hub/src/utils/timm/convert_cspnet_test.py +++ b/keras_hub/src/utils/timm/convert_cspnet_test.py @@ -6,15 +6,15 @@ from keras_hub.src.tests.test_case import TestCase -class TimmDenseNetBackboneTest(TestCase): +class TimmCSPNetBackboneTest(TestCase): @pytest.mark.large - def test_convert_densenet_backbone(self): + def test_convert_cspnet_backbone(self): model = Backbone.from_preset("hf://timm/cspdarknet53.ra_in1k") - outputs = model.predict(ops.ones((1, 224, 224, 3))) - self.assertEqual(outputs.shape, (1, 5, 5, 1024)) + outputs = model.predict(ops.ones((1, 256, 256, 3))) + self.assertEqual(outputs.shape, (1, 8, 8, 1024)) @pytest.mark.large - def test_convert_densenet_classifier(self): + def test_convert_cspnet_classifier(self): model = ImageClassifier.from_preset("hf://timm/cspdarknet53.ra_in1k") outputs = model.predict(ops.ones((1, 512, 512, 3))) self.assertEqual(outputs.shape, (1, 1000)) diff --git a/keras_hub/src/utils/timm/preset_loader.py b/keras_hub/src/utils/timm/preset_loader.py index 26ad0d99a1..18b6180e68 100644 --- a/keras_hub/src/utils/timm/preset_loader.py +++ b/keras_hub/src/utils/timm/preset_loader.py @@ -16,17 +16,17 @@ class TimmPresetLoader(PresetLoader): def __init__(self, preset, config): super().__init__(preset, config) architecture = self.config["architecture"] - if "resnet" in architecture: + if 
architecture.startswith("resnet"): self.converter = convert_resnet - elif "csp" in architecture: + elif architecture.startswith(("csp", "dark")): self.converter = convert_cspnet - elif "densenet" in architecture: + elif architecture.startswith("densenet"): self.converter = convert_densenet - elif "mobilenet" in architecture: + elif architecture.startswith("mobilenet"): self.converter = convert_mobilenet - elif "vgg" in architecture: + elif architecture.startswith("vgg"): self.converter = convert_vgg - elif "efficientnet" in architecture: + elif architecture.startswith("efficientnet"): self.converter = convert_efficientnet else: raise ValueError( diff --git a/tools/checkpoint_conversion/convert_cspnet_checkpoints.py b/tools/checkpoint_conversion/convert_cspnet_checkpoints.py index 56d18486e9..57678a91d0 100644 --- a/tools/checkpoint_conversion/convert_cspnet_checkpoints.py +++ b/tools/checkpoint_conversion/convert_cspnet_checkpoints.py @@ -2,6 +2,12 @@ python tools/checkpoint_conversion/convert_cspnet_checkpoints.py \ --preset csp_darknet_53_ra_imagenet --upload_uri kaggle://keras/cspdarknet/keras/csp_darknet_53_ra_imagenet +python tools/checkpoint_conversion/convert_cspnet_checkpoints.py \ + --preset csp_resnext_50_ra_imagenet --upload_uri kaggle://keras/cspdarknet/keras/csp_resnext_50_ra_imagenet +python tools/checkpoint_conversion/convert_cspnet_checkpoints.py \ + --preset csp_resnet_50_ra_imagenet --upload_uri kaggle://keras/cspdarknet/keras/csp_resnet_50_ra_imagenet +python tools/checkpoint_conversion/convert_cspnet_checkpoints.py \ + --preset darknet_53_imagenet --upload_uri kaggle://keras/cspdarknet/keras/darknet_53_imagenet """ import os @@ -19,6 +25,9 @@ PRESET_MAP = { "csp_darknet_53_ra_imagenet": "timm/cspdarknet53.ra_in1k", + "csp_resnext_50_ra_imagenet": "cspresnext50.ra_in1k", + "csp_resnet_50_ra_imagenet": "cspresnet50.ra_in1k", + "darknet_53_imagenet": "darknet53.c2ns_in1k", } FLAGS = flags.FLAGS @@ -40,8 +49,8 @@ def validate_output(keras_model, 
timm_model): file = keras.utils.get_file( origin=( - "https://storage.googleapis.com/keras-cv/" - "models/paligemma/cow_beach_1.png" + "https://upload.wikimedia.org/wikipedia/" + "commons/a/aa/California_quail.jpg" ) ) image = PIL.Image.open(file) @@ -63,6 +72,7 @@ def validate_output(keras_model, timm_model): timm_batch = keras.ops.transpose(keras_preprocessed, axes=(0, 3, 1, 2)) timm_batch = torch.from_numpy(np.array(timm_batch)) timm_outputs = timm_model(timm_batch).detach().numpy() + timm_outputs = keras.ops.softmax(timm_outputs, axis=-1) timm_label = np.argmax(timm_outputs[0]) # Call with Keras.