Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Differential Binarization model from PaddleOCR to Keras3 #1739

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions keras_hub/api/layers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
from keras_hub.src.models.densenet.densenet_image_converter import (
DenseNetImageConverter,
)
from keras_hub.src.models.differential_binarization.differential_binarization_image_converter import (
DifferentialBinarizationImageConverter,
)
from keras_hub.src.models.efficientnet.efficientnet_image_converter import (
EfficientNetImageConverter,
)
Expand Down
9 changes: 9 additions & 0 deletions keras_hub/api/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@
from keras_hub.src.models.densenet.densenet_image_classifier_preprocessor import (
DenseNetImageClassifierPreprocessor,
)
from keras_hub.src.models.differential_binarization.differential_binarization import (
DifferentialBinarization,
)
from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)
from keras_hub.src.models.differential_binarization.differential_binarization_preprocessor import (
DifferentialBinarizationPreprocessor,
)
from keras_hub.src.models.distil_bert.distil_bert_backbone import (
DistilBertBackbone,
)
Expand Down
9 changes: 9 additions & 0 deletions keras_hub/src/models/differential_binarization/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)
from keras_hub.src.models.differential_binarization.differential_binarization_presets import (
backbone_presets,
)
from keras_hub.src.utils.preset_utils import register_presets

# Associate the backbone preset table with the backbone class at import time.
register_presets(backbone_presets, DifferentialBinarizationBackbone)
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import math

import keras
from keras import layers

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)
from keras_hub.src.models.differential_binarization.differential_binarization_preprocessor import (
DifferentialBinarizationPreprocessor,
)
from keras_hub.src.models.differential_binarization.losses import DBLoss
from keras_hub.src.models.image_segmenter import ImageSegmenter


@keras_hub_export("keras_hub.models.DifferentialBinarization")
class DifferentialBinarization(ImageSegmenter):
    """Differential Binarization scene text detection task.

    A Keras model implementing the Differential Binarization
    architecture for scene text detection, described in
    [Real-time Scene Text Detection with Differentiable Binarization](
    https://arxiv.org/abs/1911.08947).

    Two identically structured heads are applied to the backbone output,
    one producing probability maps and one producing threshold maps. A
    differentiable step function combines them into approximate binary
    maps. The model output is the concatenation, along the last axis, of
    the probability, threshold and binary maps (in that order).

    Args:
        backbone: A `keras_hub.models.DifferentialBinarizationBackbone`
            instance.
        head_kernel_list: list of three ints. The kernel sizes of the
            convolution and the two transposed convolutions in each of
            the probability and threshold heads. Defaults to `[3, 2, 2]`.
        step_function_k: float. `k` parameter used within the differential
            binarization step function. Defaults to `50.0`.
        preprocessor: `None`, a `keras_hub.models.Preprocessor` instance,
            a `keras.Layer` instance, or a callable. If `None` no
            preprocessing will be applied to the inputs.

    Examples:
    ```python
    input_data = np.ones(shape=(8, 224, 224, 3))

    image_encoder = keras_hub.models.ResNetBackbone.from_preset(
        "resnet_vd_50_imagenet"
    )
    backbone = keras_hub.models.DifferentialBinarizationBackbone(
        image_encoder
    )
    detector = keras_hub.models.DifferentialBinarization(
        backbone=backbone
    )

    detector(input_data)
    ```
    """

    backbone_cls = DifferentialBinarizationBackbone
    preprocessor_cls = DifferentialBinarizationPreprocessor

    def __init__(
        self,
        backbone,
        head_kernel_list=None,
        step_function_k=50.0,
        preprocessor=None,
        **kwargs,
    ):
        # Build a fresh list per instance: a list literal as the default
        # would be shared by every model created without this argument.
        if head_kernel_list is None:
            head_kernel_list = [3, 2, 2]
        else:
            head_kernel_list = list(head_kernel_list)

        # === Functional model ===
        inputs = backbone.input
        x = backbone(inputs)
        probability_maps = diffbin_head(
            x,
            in_channels=backbone.fpn_channels,
            kernel_list=head_kernel_list,
            name="head_prob",
        )
        threshold_maps = diffbin_head(
            x,
            in_channels=backbone.fpn_channels,
            kernel_list=head_kernel_list,
            name="head_thresh",
        )
        binary_maps = step_function(
            probability_maps, threshold_maps, k=step_function_k
        )
        outputs = layers.Concatenate(axis=-1)(
            [probability_maps, threshold_maps, binary_maps]
        )

        super().__init__(inputs=inputs, outputs=outputs, **kwargs)

        # === Config ===
        self.backbone = backbone
        self.head_kernel_list = head_kernel_list
        self.step_function_k = step_function_k
        self.preprocessor = preprocessor

    def compile(
        self,
        optimizer="auto",
        loss="auto",
        **kwargs,
    ):
        """Configures the `DifferentialBinarization` task for training.

        `DifferentialBinarization` extends the default compilation signature
        of `keras.Model.compile` with defaults for `optimizer` and `loss`.
        To override these defaults, pass any value to these arguments
        during compilation.

        Args:
            optimizer: `"auto"`, an optimizer name, or a `keras.Optimizer`
                instance. Defaults to `"auto"`, which uses SGD with
                learning rate 0.007, weight decay 0.0001 and momentum 0.9.
                See `keras.Model.compile` and `keras.optimizers` for more
                info on possible `optimizer` values.
            loss: `"auto"`, a loss name, or a `keras.losses.Loss` instance.
                Defaults to `"auto"`, in which case `DBLoss` is used. See
                `keras.Model.compile` and `keras.losses` for more info on
                possible `loss` values.
            **kwargs: See `keras.Model.compile` for a full list of
                arguments supported by the compile method.
        """
        if optimizer == "auto":
            optimizer = keras.optimizers.SGD(
                learning_rate=0.007, weight_decay=0.0001, momentum=0.9
            )
        if loss == "auto":
            loss = DBLoss()
        super().compile(
            optimizer=optimizer,
            loss=loss,
            **kwargs,
        )

    def get_config(self):
        """Return the task config, extended with the head settings."""
        # Backbone serialized in `super`
        config = super().get_config()
        config.update(
            {
                "head_kernel_list": self.head_kernel_list,
                "step_function_k": self.step_function_k,
            }
        )
        return config


def step_function(x, y, k):
    """Differentiable approximation of the step function `x > y`.

    Computes `1 / (1 + exp(-k * (x - y)))`, i.e. a sigmoid of the
    difference between `x` and `y`, sharpened by the factor `k`.

    Args:
        x: first operand (the probability maps at the call site in this
            file).
        y: second operand (the threshold maps at the call site in this
            file).
        k: steepness factor applied to `x - y`.

    Returns:
        The element-wise result of the expression above.
    """
    scaled_margin = k * (x - y)
    denominator = 1.0 + keras.ops.exp(-scaled_margin)
    return 1.0 / denominator


def diffbin_head(inputs, in_channels, kernel_list, name):
    """Build one prediction head on top of `inputs`.

    The head is a bias-free convolution followed by two stride-2
    transposed convolutions. The first two stages are each followed by
    batch normalization and ReLU; the last stage has a single output
    channel and a sigmoid activation.

    Args:
        inputs: input tensor of the head.
        in_channels: int. The two hidden stages use `in_channels // 4`
            filters.
        kernel_list: sequence of ints. Elements 0, 1 and 2 are the kernel
            sizes of the three stages, in order.
        name: string prefix used for all layer names of this head.

    Returns:
        The output tensor of the final transposed convolution.
    """

    def new_bias_initializer():
        # A separate initializer instance is created for each transposed
        # convolution, right where the layer is constructed.
        bound = 1.0 / math.sqrt(in_channels // 4 * 1.0)
        return keras.initializers.RandomUniform(minval=-bound, maxval=bound)

    def new_batch_norm(layer_name):
        return layers.BatchNormalization(
            beta_initializer=keras.initializers.Constant(1e-4),
            gamma_initializer=keras.initializers.Constant(1.0),
            name=layer_name,
        )

    # Stage 0: convolution + batch norm + ReLU.
    stage0 = layers.Conv2D(
        in_channels // 4,
        kernel_size=kernel_list[0],
        padding="same",
        use_bias=False,
        name=f"{name}_conv0_weights",
    )(inputs)
    stage0 = new_batch_norm(f"{name}_conv0_bn")(stage0)
    stage0 = layers.ReLU(name=f"{name}_conv0_relu")(stage0)

    # Stage 1: stride-2 transposed convolution + batch norm + ReLU.
    stage1 = layers.Conv2DTranspose(
        in_channels // 4,
        kernel_size=kernel_list[1],
        strides=2,
        padding="valid",
        bias_initializer=new_bias_initializer(),
        name=f"{name}_conv1_weights",
    )(stage0)
    stage1 = new_batch_norm(f"{name}_conv1_bn")(stage1)
    stage1 = layers.ReLU(name=f"{name}_conv1_relu")(stage1)

    # Stage 2: stride-2 transposed convolution to one sigmoid channel.
    return layers.Conv2DTranspose(
        1,
        kernel_size=kernel_list[2],
        strides=2,
        padding="valid",
        activation="sigmoid",
        bias_initializer=new_bias_initializer(),
        name=f"{name}_conv2_weights",
    )(stage1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import keras
from keras import layers

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.models.backbone import Backbone


@keras_hub_export("keras_hub.models.DifferentialBinarizationBackbone")
class DifferentialBinarizationBackbone(Backbone):
    """Differential Binarization backbone with a feature pyramid network.

    A Keras model implementing the Differential Binarization
    architecture for scene text detection, described in
    [Real-time Scene Text Detection with Differentiable Binarization](
    https://arxiv.org/abs/1911.08947).

    This class contains the backbone architecture containing the feature
    pyramid network. It consumes the `pyramid_outputs` of the image
    encoder (levels `"P2"` to `"P5"`) and returns a single fused feature
    map.

    Args:
        image_encoder: A `keras_hub.models.ResNetBackbone` instance.
        fpn_channels: int. The number of channels to output by the feature
            pyramid network. Defaults to 256.
    """

    def __init__(
        self,
        image_encoder,
        fpn_channels=256,
        **kwargs,
    ):
        # === Functional model ===
        inputs = image_encoder.input
        x = image_encoder.pyramid_outputs
        x = diffbin_fpn_model(x, out_channels=fpn_channels)

        super().__init__(inputs=inputs, outputs=x, **kwargs)

        # === Config ===
        self.image_encoder = image_encoder
        self.fpn_channels = fpn_channels

    def get_config(self):
        """Return the config, including the serialized image encoder."""
        config = super().get_config()
        config["fpn_channels"] = self.fpn_channels
        config["image_encoder"] = keras.layers.serialize(self.image_encoder)
        return config

    @classmethod
    def from_config(cls, config):
        """Create a backbone from the output of `get_config()`.

        Args:
            config: dict. `"image_encoder"` may be either a serialized
                layer config or an already constructed encoder.

        Returns:
            A new instance of `cls`.
        """
        # Work on a shallow copy so the caller's dict is left untouched.
        config = dict(config)
        # Only deserialize when the encoder is still in serialized form.
        if isinstance(config.get("image_encoder"), dict):
            config["image_encoder"] = keras.layers.deserialize(
                config["image_encoder"]
            )
        return cls(**config)


def diffbin_fpn_model(inputs, out_channels):
    """Build the feature pyramid network that fuses the encoder features.

    Each of the four pyramid levels is projected to `out_channels` with a
    1x1 convolution. The projections are merged top-down (upsample the
    coarser level, add the finer one). Every merged level is then reduced
    to `out_channels // 4` channels by a 3x3 convolution, upsampled by
    8, 4 and 2 for levels 5, 4 and 3 respectively, and all four results
    are concatenated along the last axis.

    Args:
        inputs: mapping with the keys `"P2"`, `"P3"`, `"P4"` and `"P5"`
            holding the feature tensors of the pyramid levels.
        out_channels: int. Number of filters of the 1x1 projections.

    Returns:
        The concatenated feature tensor.
    """

    def lateral(level):
        # 1x1 projection of one encoder level.
        return layers.Conv2D(
            out_channels,
            kernel_size=1,
            use_bias=False,
            name=f"neck_in{level}",
        )(inputs[f"P{level}"])

    def smooth(level, features):
        # 3x3 convolution reducing a merged level to a quarter of the width.
        return layers.Conv2D(
            out_channels // 4,
            kernel_size=3,
            padding="same",
            use_bias=False,
            name=f"neck_p{level}",
        )(features)

    lateral2 = lateral(2)
    lateral3 = lateral(3)
    lateral4 = lateral(4)
    lateral5 = lateral(5)

    # Top-down pathway.
    merged4 = layers.Add(name="add1")(
        [layers.UpSampling2D()(lateral5), lateral4]
    )
    merged3 = layers.Add(name="add2")(
        [layers.UpSampling2D()(merged4), lateral3]
    )
    merged2 = layers.Add(name="add3")(
        [layers.UpSampling2D()(merged3), lateral2]
    )

    reduced5 = smooth(5, lateral5)
    reduced4 = smooth(4, merged4)
    reduced3 = smooth(3, merged3)
    reduced2 = smooth(2, merged2)

    # Upsample the coarser levels before concatenating with level 2.
    resized5 = layers.UpSampling2D((8, 8))(reduced5)
    resized4 = layers.UpSampling2D((4, 4))(reduced4)
    resized3 = layers.UpSampling2D((2, 2))(reduced3)

    return layers.Concatenate(axis=-1)(
        [resized5, resized4, resized3, reduced2]
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from keras import ops

from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)
from keras_hub.src.models.differential_binarization.differential_binarization_preprocessor import (
DifferentialBinarizationPreprocessor,
)
from keras_hub.src.models.resnet.resnet_backbone import ResNetBackbone
from keras_hub.src.tests.test_case import TestCase


class DifferentialBinarizationTest(TestCase):
    """Basic checks for `DifferentialBinarizationBackbone`."""

    def setUp(self):
        self.batch_size = 2
        # Single source of truth for the input resolution; it is used for
        # the sample images, the encoder and the expected output shape.
        self.image_size = 224
        self.images = ops.ones(
            (self.batch_size, self.image_size, self.image_size, 3)
        )
        self.image_encoder = ResNetBackbone(
            input_conv_filters=[64],
            input_conv_kernel_sizes=[7],
            stackwise_num_filters=[64, 128, 256, 512],
            stackwise_num_blocks=[3, 4, 6, 3],
            stackwise_num_strides=[1, 2, 2, 2],
            block_type="bottleneck_block",
            image_shape=(self.image_size, self.image_size, 3),
        )
        self.preprocessor = DifferentialBinarizationPreprocessor()
        self.init_kwargs = {
            "image_encoder": self.image_encoder,
        }

    def test_backbone_basics(self):
        # The expected feature map is a quarter of the input resolution
        # (224 -> 56) with 256 channels.
        feature_size = self.image_size // 4
        self.run_backbone_test(
            cls=DifferentialBinarizationBackbone,
            init_kwargs=self.init_kwargs,
            input_data=self.images,
            expected_output_shape=(
                self.batch_size,
                feature_size,
                feature_size,
                256,
            ),
            # The mixed precision check inspects every sublayer. The
            # ResNet encoder is instantiated separately, so its dtype
            # does not match and the check currently fails.
            run_mixed_precision_check=False,
            run_quantization_check=False,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
from keras_hub.src.models.differential_binarization.differential_binarization_backbone import (
DifferentialBinarizationBackbone,
)


@keras_hub_export("keras_hub.layers.DifferentialBinarizationImageConverter")
class DifferentialBinarizationImageConverter(ImageConverter):
    """Image converter paired with `DifferentialBinarizationBackbone`.

    This subclass defines no behavior of its own; it only sets
    `backbone_cls`. Everything else comes from `ImageConverter`.
    """

    backbone_cls = DifferentialBinarizationBackbone
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be some resizing/rescaling ops here, right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Depends. Basically these image operations are implemented in the super class, ImageConverter, and can be used as depicted in the demo colab I've added in the PR description. Dedicated code in this class might make sense to resize to resolutions of multiples of 32, which the model requires. On the other hand, it might be confusing for the user if the masks that are predicted have different resolutions than the input.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might want to look at SegFormer for this. The output masks will need to be resized as well.

Loading
Loading