Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Differential Binarization model from PaddleOCR to Keras3 #1739

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions keras_hub/api/layers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
from keras_hub.src.models.densenet.densenet_image_converter import (
DenseNetImageConverter,
)
from keras_hub.src.models.differential_binarization.differential_binarization_image_converter import (
DifferentialBinarizationImageConverter,
)
from keras_hub.src.models.efficientnet.efficientnet_image_converter import (
EfficientNetImageConverter,
)
Expand Down
10 changes: 10 additions & 0 deletions keras_hub/api/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@
from keras_hub.src.models.densenet.densenet_image_classifier_preprocessor import (
DenseNetImageClassifierPreprocessor,
)
from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)
from keras_hub.src.models.differential_binarization.differential_binarization_ocr import (
DifferentialBinarizationOCR,
)
from keras_hub.src.models.differential_binarization.differential_binarization_preprocessor import (
DifferentialBinarizationPreprocessor,
)
from keras_hub.src.models.distil_bert.distil_bert_backbone import (
DistilBertBackbone,
)
Expand Down Expand Up @@ -189,6 +198,7 @@
from keras_hub.src.models.image_segmenter_preprocessor import (
ImageSegmenterPreprocessor,
)
from keras_hub.src.models.image_text_detector import ImageTextDetector
from keras_hub.src.models.image_to_image import ImageToImage
from keras_hub.src.models.inpaint import Inpaint
from keras_hub.src.models.llama3.llama3_backbone import Llama3Backbone
Expand Down
9 changes: 9 additions & 0 deletions keras_hub/src/models/differential_binarization/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)
from keras_hub.src.models.differential_binarization.differential_binarization_presets import (
backbone_presets,
)
from keras_hub.src.utils.preset_utils import register_presets

# Runs at package import time: hands the preset table and the backbone class
# to `register_presets` (presumably this is what makes the presets
# discoverable through the class — confirm against `preset_utils`).
register_presets(backbone_presets, DifferentialBinarizationBackbone)
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import keras
from keras import layers

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.models.backbone import Backbone


@keras_hub_export("keras_hub.models.DifferentialBinarizationBackbone")
class DifferentialBinarizationBackbone(Backbone):
    """Differential Binarization architecture for scene text detection.

    This class implements the Differential Binarization architecture for
    detecting text in natural images, described in
    [Real-time Scene Text Detection with Differentiable Binarization](
    https://arxiv.org/abs/1911.08947).

    The backbone architecture in this class contains the feature pyramid
    network and model heads. The model takes the input of `image_encoder`
    and returns a dict with the entries `"probability_maps"` and
    `"threshold_maps"`.

    Args:
        image_encoder: A `keras_hub.models.ResNetBackbone` instance. Its
            `pyramid_outputs` must provide the levels `"P2"`, `"P3"`,
            `"P4"` and `"P5"`.
        fpn_channels: int. The number of channels to output by the feature
            pyramid network. Defaults to 256.
        head_kernel_list: list of ints. The kernel sizes of probability map
            and threshold map heads. Defaults to `[3, 2, 2]`. The list is
            copied, so later changes to the passed object do not affect
            the model's config.
        dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The dtype
            to use for the model's computations and weights. It is applied
            to the feature pyramid network and the heads; `image_encoder`
            is instantiated by the caller and keeps its own dtype.
    """

    def __init__(
        self,
        image_encoder,
        fpn_channels=256,
        head_kernel_list=None,
        dtype=None,
        **kwargs,
    ):
        # Resolve the default here instead of in the signature and take a
        # copy, so that no list object is shared between instances (or with
        # the caller) through `self.head_kernel_list`.
        if head_kernel_list is None:
            head_kernel_list = [3, 2, 2]
        else:
            head_kernel_list = list(head_kernel_list)

        # === Functional Model ===
        inputs = image_encoder.input
        x = image_encoder.pyramid_outputs
        x = diffbin_fpn_model(x, out_channels=fpn_channels, dtype=dtype)

        probability_maps = diffbin_head(
            x,
            in_channels=fpn_channels,
            kernel_list=head_kernel_list,
            name="head_prob",
            dtype=dtype,
        )
        threshold_maps = diffbin_head(
            x,
            in_channels=fpn_channels,
            kernel_list=head_kernel_list,
            name="head_thresh",
            dtype=dtype,
        )

        outputs = {
            "probability_maps": probability_maps,
            "threshold_maps": threshold_maps,
        }

        super().__init__(inputs=inputs, outputs=outputs, dtype=dtype, **kwargs)

        # === Config ===
        self.image_encoder = image_encoder
        self.fpn_channels = fpn_channels
        self.head_kernel_list = head_kernel_list

    def get_config(self):
        """Return the constructor arguments as a serializable dict.

        The nested `image_encoder` is stored in serialized form so that
        `from_config()` can rebuild it.
        """
        config = super().get_config()
        config["fpn_channels"] = self.fpn_channels
        config["head_kernel_list"] = self.head_kernel_list
        config["image_encoder"] = keras.layers.serialize(self.image_encoder)
        return config

    @classmethod
    def from_config(cls, config):
        """Create a backbone from the output of `get_config()`.

        The passed `config` is left untouched. `image_encoder` is only
        deserialized when it is still a serialized dict, so an already
        instantiated encoder is accepted as well.
        """
        config = dict(config)
        image_encoder = config["image_encoder"]
        if isinstance(image_encoder, dict):
            config["image_encoder"] = keras.layers.deserialize(image_encoder)
        # Let the base class handle the remaining entries and instantiate
        # `cls`.
        return super().from_config(config)


def diffbin_fpn_model(inputs, out_channels, dtype=None):
    """Build the feature pyramid network on top of the encoder features.

    Args:
        inputs: dict of feature maps with the keys `"P2"`, `"P3"`, `"P4"`
            and `"P5"`. The fixed upsampling factors below assume that each
            level has half the spatial resolution of the previous one.
        out_channels: int. Number of channels of the lateral and top-down
            feature maps. Each level contributes `out_channels // 4`
            channels to the result.
        dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The
            dtype passed to every layer created here.

    Returns:
        The per-level feature maps, upsampled to the resolution of `"P2"`
        and concatenated along the last axis in the order P5, P4, P3, P2.
    """
    # NOTE: layers are created in a fixed order and with fixed names; keep
    # both unchanged so existing weights remain loadable.

    def lateral_conv(layer_name):
        # Pointwise projection of one encoder level to `out_channels`.
        return layers.Conv2D(
            out_channels,
            kernel_size=1,
            use_bias=False,
            name=layer_name,
            dtype=dtype,
        )

    def merge_down(coarser, lateral, layer_name):
        # Upsample the coarser map and add the lateral skip connection.
        add = layers.Add(name=layer_name, dtype=dtype)
        return add([layers.UpSampling2D(dtype=dtype)(coarser), lateral])

    def featuremap_conv(layer_name):
        # 3x3 convolution producing this level's share of the output.
        return layers.Conv2D(
            out_channels // 4,
            kernel_size=3,
            padding="same",
            use_bias=False,
            name=layer_name,
            dtype=dtype,
        )

    # lateral layers composing the FPN's bottom-up pathway using
    # pointwise convolutions of ResNet's pyramid outputs
    lateral_p2 = lateral_conv("neck_lateral_p2")(inputs["P2"])
    lateral_p3 = lateral_conv("neck_lateral_p3")(inputs["P3"])
    lateral_p4 = lateral_conv("neck_lateral_p4")(inputs["P4"])
    lateral_p5 = lateral_conv("neck_lateral_p5")(inputs["P5"])

    # top-down fusion pathway consisting of upsampling layers with
    # skip connections
    topdown_p5 = lateral_p5
    topdown_p4 = merge_down(topdown_p5, lateral_p4, "neck_topdown_p4")
    topdown_p3 = merge_down(topdown_p4, lateral_p3, "neck_topdown_p3")
    topdown_p2 = merge_down(topdown_p3, lateral_p2, "neck_topdown_p2")

    # construct merged feature maps for each pyramid level
    featuremap_p5 = featuremap_conv("neck_featuremap_p5")(topdown_p5)
    featuremap_p4 = featuremap_conv("neck_featuremap_p4")(topdown_p4)
    featuremap_p3 = featuremap_conv("neck_featuremap_p3")(topdown_p3)
    featuremap_p2 = featuremap_conv("neck_featuremap_p2")(topdown_p2)

    # bring every level to the resolution of P2 before concatenating
    featuremap_p5 = layers.UpSampling2D((8, 8), dtype=dtype)(featuremap_p5)
    featuremap_p4 = layers.UpSampling2D((4, 4), dtype=dtype)(featuremap_p4)
    featuremap_p3 = layers.UpSampling2D((2, 2), dtype=dtype)(featuremap_p3)
    featuremap = layers.Concatenate(axis=-1, dtype=dtype)(
        [featuremap_p5, featuremap_p4, featuremap_p3, featuremap_p2]
    )
    return featuremap


def diffbin_head(inputs, in_channels, kernel_list, name, dtype=None):
    """Build one prediction head on top of the FPN feature map.

    The head is a convolution followed by two stride-2 transposed
    convolutions; the first two stages are each followed by batch
    normalization and ReLU, the last one applies a sigmoid and has a single
    output channel.

    Args:
        inputs: The feature map returned by `diffbin_fpn_model`.
        in_channels: int. The FPN channel count; the hidden layers of the
            head use `in_channels // 4` channels.
        kernel_list: sequence of three ints. Kernel sizes of the
            convolution and of the two transposed convolutions, in that
            order.
        name: str. Prefix for the names of all layers of this head.
        dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The
            dtype passed to every layer created here. Defaults to `None`.

    Returns:
        A single-channel map with values in `[0, 1]`.
    """
    hidden_channels = in_channels // 4
    # Bias values are drawn uniformly from [-1/sqrt(c), 1/sqrt(c)], with c
    # being the number of hidden channels.
    bias_bound = 1.0 / (hidden_channels * 1.0) ** 0.5

    def uniform_bias():
        # A fresh initializer per layer, created right where it is used.
        return keras.initializers.RandomUniform(
            minval=-bias_bound,
            maxval=bias_bound,
        )

    x = layers.Conv2D(
        hidden_channels,
        kernel_size=kernel_list[0],
        padding="same",
        use_bias=False,
        name=f"{name}_conv0_weights",
        dtype=dtype,
    )(inputs)
    x = layers.BatchNormalization(
        beta_initializer=keras.initializers.Constant(1e-4),
        gamma_initializer=keras.initializers.Constant(1.0),
        name=f"{name}_conv0_bn",
        dtype=dtype,
    )(x)
    x = layers.ReLU(name=f"{name}_conv0_relu", dtype=dtype)(x)
    x = layers.Conv2DTranspose(
        hidden_channels,
        kernel_size=kernel_list[1],
        strides=2,
        padding="valid",
        bias_initializer=uniform_bias(),
        name=f"{name}_conv1_weights",
        dtype=dtype,
    )(x)
    x = layers.BatchNormalization(
        beta_initializer=keras.initializers.Constant(1e-4),
        gamma_initializer=keras.initializers.Constant(1.0),
        name=f"{name}_conv1_bn",
        dtype=dtype,
    )(x)
    x = layers.ReLU(name=f"{name}_conv1_relu", dtype=dtype)(x)
    x = layers.Conv2DTranspose(
        1,
        kernel_size=kernel_list[2],
        strides=2,
        padding="valid",
        activation="sigmoid",
        bias_initializer=uniform_bias(),
        name=f"{name}_conv2_weights",
        dtype=dtype,
    )(x)
    return x
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from keras import ops

from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)
from keras_hub.src.models.differential_binarization.differential_binarization_preprocessor import (
DifferentialBinarizationPreprocessor,
)
from keras_hub.src.models.resnet.resnet_backbone import ResNetBackbone
from keras_hub.src.tests.test_case import TestCase


class DifferentialBinarizationTest(TestCase):
    """Basic tests for `DifferentialBinarizationBackbone`."""

    def setUp(self):
        # Small ResNet encoder for 32x32 RGB inputs.
        encoder = ResNetBackbone(
            input_conv_filters=[4],
            input_conv_kernel_sizes=[7],
            stackwise_num_filters=[64, 4, 4, 4],
            stackwise_num_blocks=[3, 4, 6, 3],
            stackwise_num_strides=[1, 2, 2, 2],
            block_type="bottleneck_block",
            image_shape=(32, 32, 3),
        )
        self.image_encoder = encoder
        self.preprocessor = DifferentialBinarizationPreprocessor()
        # Batch of two all-ones images matching the encoder's `image_shape`.
        self.images = ops.ones((2, 32, 32, 3))
        self.init_kwargs = {
            "image_encoder": encoder,
            "fpn_channels": 16,
            "head_kernel_list": [3, 2, 2],
        }

    def test_backbone_basics(self):
        # Both heads are expected to return one single-channel map per
        # image at the input resolution.
        map_shape = (2, 32, 32, 1)
        self.run_backbone_test(
            cls=DifferentialBinarizationBackbone,
            init_kwargs=self.init_kwargs,
            input_data=self.images,
            expected_output_shape={
                "probability_maps": map_shape,
                "threshold_maps": map_shape,
            },
            # The mixed precision check does not pass: it checks against
            # each sublayer of the model, and the ResNet backbone, which is
            # instantiated separately, therefore has the wrong dtype.
            run_mixed_precision_check=False,
            run_quantization_check=False,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)


@keras_hub_export("keras_hub.layers.DifferentialBinarizationImageConverter")
class DifferentialBinarizationImageConverter(ImageConverter):
    """Image converter tied to `DifferentialBinarizationBackbone`.

    The class defines no image operations of its own; it only sets
    `backbone_cls` and otherwise relies on the `ImageConverter` base class.
    """

    # NOTE(review): per the PR discussion, resizing/rescaling is expected to
    # come from `ImageConverter`, and the model needs input resolutions that
    # are multiples of 32 — nothing in this class enforces that; confirm.
    backbone_cls = DifferentialBinarizationBackbone
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be some resizing/rescaling ops here, right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Depends. Basically these image operations are implemented in the super class, ImageConverter, and can be used as depicted in the demo colab I've added in the PR description. Dedicated code in this class might make sense to resize to resolutions of multiples of 32, which the model requires. On the other hand, it might be confusing for the user if the masks that are predicted have different resolutions than the input.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might want to look into SegFormer for this. The output masks will need to be resized as well.

Loading
Loading