Adaptive resolution plus review #43

Closed

wants to merge 19 commits into from

Changes from 5 commits
4 changes: 4 additions & 0 deletions face_swapper/README.md
@@ -29,6 +29,7 @@ This `config.ini` utilizes the MegaFace dataset to train the Face Swapper model.
[training.dataset]
file_pattern = .datasets/vggface2/**/*.jpg
warp_template = vgg_face_hq_to_arcface_128_v2
transform_size = 256
batch_mode = equal
batch_ratio = 0.2
```
@@ -52,6 +53,7 @@ motion_extractor_path = .models/motion_extractor.pt
encoder_type = unet-pro
identity_channels = 512
output_channels = 4096
output_size = 256
num_blocks = 2
```

@@ -71,6 +73,7 @@ attribute_weight = 10
reconstruction_weight = 20
identity_weight = 20
gaze_weight = 0
gaze_scale_factor = 1
pose_weight = 0
expression_weight = 0
```
@@ -95,6 +98,7 @@ resume_path = .outputs/last.ckpt
directory_path = .exports
source_path = .outputs/last.ckpt
target_path = .exports/face_swapper.onnx
target_size = 256
ir_version = 10
opset_version = 15
```
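As an aside, a minimal sketch of how these new keys are presumably consumed — assuming the project's `CONFIG` is a standard `configparser` instance (the accessor calls match those visible in `exporting.py` below):

```python
# Minimal sketch, assuming config.ini lives in the working directory.
from configparser import ConfigParser

CONFIG = ConfigParser()
CONFIG.read('config.ini')

transform_size = CONFIG.getint('training.dataset', 'transform_size')    # e.g. 256
output_size = CONFIG.getint('training.model.generator', 'output_size')  # e.g. 256
target_size = CONFIG.getint('exporting', 'target_size')                 # e.g. 256
```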
4 changes: 4 additions & 0 deletions face_swapper/config.ini
@@ -1,6 +1,7 @@
[training.dataset]
file_pattern =
warp_template =
transform_size =
batch_mode =
batch_ratio =

@@ -18,6 +19,7 @@ motion_extractor_path =
encoder_type =
identity_channels =
output_channels =
output_size =
num_blocks =

[training.model.discriminator]
@@ -33,6 +35,7 @@ attribute_weight =
reconstruction_weight =
identity_weight =
gaze_weight =
gaze_scale_factor =
pose_weight =
expression_weight =

@@ -51,6 +54,7 @@ resume_path =
directory_path =
source_path =
target_path =
target_size =
ir_version =
opset_version =

5 changes: 3 additions & 2 deletions face_swapper/src/dataset.py
@@ -12,9 +12,10 @@


class DynamicDataset(Dataset[Tensor]):
def __init__(self, file_pattern : str, warp_template : WarpTemplate, batch_mode : BatchMode, batch_ratio : float) -> None:
def __init__(self, file_pattern : str, warp_template : WarpTemplate, transform_size : int, batch_mode : BatchMode, batch_ratio : float) -> None:
self.file_paths = glob.glob(file_pattern)
self.warp_template = warp_template
self.transform_size = transform_size
self.batch_mode = batch_mode
self.batch_ratio = batch_ratio
self.transforms = self.compose_transforms()
@@ -38,7 +39,7 @@ def compose_transforms(self) -> transforms:
[
AugmentTransform(),
transforms.ToPILImage(),
transforms.Resize((256, 256), interpolation = transforms.InterpolationMode.BICUBIC),
transforms.Resize((self.transform_size, self.transform_size), interpolation = transforms.InterpolationMode.BICUBIC),
transforms.ToTensor(),
WarpTransform(self.warp_template),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
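A hypothetical construction of the extended dataset, with values mirroring the README example above (`warp_template` is passed as the literal string the README suggests):

```python
# Hypothetical usage; all argument values come from the README example above.
dataset = DynamicDataset(
    file_pattern = '.datasets/vggface2/**/*.jpg',
    warp_template = 'vgg_face_hq_to_arcface_128_v2',
    transform_size = 256,
    batch_mode = 'equal',
    batch_ratio = 0.2
)
# every sample is now resized to (transform_size, transform_size) before warping
```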
3 changes: 2 additions & 1 deletion face_swapper/src/exporting.py
@@ -13,6 +13,7 @@ def export() -> None:
directory_path = CONFIG.get('exporting', 'directory_path')
source_path = CONFIG.get('exporting', 'source_path')
target_path = CONFIG.get('exporting', 'target_path')
target_size = CONFIG.getint('exporting', 'target_size')
ir_version = CONFIG.getint('exporting', 'ir_version')
opset_version = CONFIG.getint('exporting', 'opset_version')

@@ -21,5 +22,5 @@ def export() -> None:
model.eval()
model.ir_version = torch.tensor(ir_version)
source_tensor = torch.randn(1, 512)
target_tensor = torch.randn(1, 3, 256, 256)
target_tensor = torch.randn(1, 3, target_size, target_size)
torch.onnx.export(model, (source_tensor, target_tensor), target_path, input_names = [ 'source', 'target' ], output_names = [ 'output' ], opset_version = opset_version)
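A quick post-export sanity check one might run — assuming `onnxruntime` is installed and the export above ran with `target_size = 256`:

```python
import onnxruntime

session = onnxruntime.InferenceSession('.exports/face_swapper.onnx', providers = [ 'CPUExecutionProvider' ])
for model_input in session.get_inputs():
    print(model_input.name, model_input.shape)
# expected: source [1, 512] and target [1, 3, 256, 256]
```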
2 changes: 1 addition & 1 deletion face_swapper/src/helper.py
@@ -27,7 +27,7 @@ def warp_tensor(input_tensor : Tensor, warp_template : WarpTemplate) -> Tensor:

def calc_embedding(embedder : EmbedderModule, input_tensor : Tensor, padding : Padding) -> Embedding:
crop_tensor = warp_tensor(input_tensor, 'arcface_128_v2_to_arcface_112_v2')
crop_tensor = nn.functional.interpolate(crop_tensor, size = (112, 112), mode = 'area')
crop_tensor = nn.functional.interpolate(crop_tensor, size = 112, mode = 'area')
crop_tensor[:, :, :padding[0], :] = 0
crop_tensor[:, :, 112 - padding[1]:, :] = 0
crop_tensor[:, :, :, :padding[2]] = 0
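This change is behavior-preserving: `interpolate` broadcasts a bare `int` across both spatial dimensions of a 4D tensor, so the two forms are identical — a quick check:

```python
import torch
import torch.nn.functional as F

input_tensor = torch.randn(1, 3, 128, 128)
tuple_form = F.interpolate(input_tensor, size = (112, 112), mode = 'area')
int_form = F.interpolate(input_tensor, size = 112, mode = 'area')
assert torch.equal(tuple_form, int_form)
```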
7 changes: 4 additions & 3 deletions face_swapper/src/models/generator.py
@@ -16,13 +16,14 @@ def __init__(self) -> None:
encoder_type = CONFIG.get('training.model.generator', 'encoder_type')
identity_channels = CONFIG.getint('training.model.generator', 'identity_channels')
output_channels = CONFIG.getint('training.model.generator', 'output_channels')
output_size = CONFIG.getint('training.model.generator', 'output_size')
num_blocks = CONFIG.getint('training.model.generator', 'num_blocks')

if encoder_type == 'unet':
self.encoder = UNet()
self.encoder = UNet(output_size)
if encoder_type == 'unet-pro':
self.encoder = UNetPro()
self.generator = AAD(identity_channels, output_channels, num_blocks)
self.encoder = UNetPro(output_size)
self.generator = AAD(identity_channels, output_channels, output_size, num_blocks)
self.encoder.apply(init_weight)
self.generator.apply(init_weight)

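A hypothetical smoke test, assuming the module's class is named `Generator`, its forward signature matches the ONNX export in `exporting.py` (a 512-dim source embedding plus a target image at `output_size`), and the configured values are those from the README:

```python
import torch

generator = Generator()  # reads encoder_type, identity_channels, output_size, ... from config.ini
source_embedding = torch.randn(1, 512)       # identity_channels = 512
target_tensor = torch.randn(1, 3, 256, 256)  # output_size = 256
output_tensor = generator(source_embedding, target_tensor)
```

Worth noting in review: the two `if` branches are the only assignments to `self.encoder`, so an unrecognized `encoder_type` would leave it unset and fail at the first forward pass.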
10 changes: 8 additions & 2 deletions face_swapper/src/models/loss.py
@@ -169,9 +169,15 @@ def forward(self, target_tensor : Tensor, output_tensor : Tensor) -> Tuple[Tensor, Tensor]:
return gaze_loss, weighted_gaze_loss

def detect_gaze(self, input_tensor : Tensor) -> Gaze:
crop_tensor = input_tensor[:, :, 60: 224, 16: 205]
scale_factor = CONFIG.getint('training.losses', 'gaze_scale_factor')
y_min = int(60 * scale_factor)

@henryruhs (Contributor, Author) commented on Mar 5, 2025:
I wonder where the magic numbers come from and if we could use a torchvision transform instead?

crop_tensor = F.interpolate(input_tensor, scale_factor = scale_factor, mode = 'bicubic')
crop_tensor = crop_tensor[:, :, 60:224, 16:205]

y_max = int(224 * scale_factor)
x_min = int(16 * scale_factor)
x_max = int(205 * scale_factor)

crop_tensor = input_tensor[:, :, y_min:y_max, x_min:x_max]
crop_tensor = (crop_tensor + 1) * 0.5
crop_tensor = transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ], std = [ 0.229, 0.224, 0.225 ])(crop_tensor)
crop_tensor = nn.functional.interpolate(crop_tensor, size = (448, 448), mode = 'bicubic')
crop_tensor = nn.functional.interpolate(crop_tensor, size = 448, mode = 'bicubic')
pitch_tensor, yaw_tensor = self.gazer(crop_tensor)
return pitch_tensor, yaw_tensor
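To make the coordinate scaling concrete, a standalone sketch (`crop_gaze_region` is a hypothetical helper; the 60:224, 16:205 box is assumed to be tuned for 256×256 input and to scale linearly):

```python
import torch

def crop_gaze_region(input_tensor : torch.Tensor, scale_factor : int = 1) -> torch.Tensor:
    # scale the 256-based crop box up to the actual input resolution
    y_min, y_max = 60 * scale_factor, 224 * scale_factor
    x_min, x_max = 16 * scale_factor, 205 * scale_factor
    return input_tensor[:, :, y_min:y_max, x_min:x_max]

print(crop_gaze_region(torch.randn(1, 3, 512, 512), scale_factor = 2).shape)
# torch.Size([1, 3, 328, 378])
```

One caveat: since `gaze_scale_factor` is read with `getint`, the 1.5× factor a 384 model would need cannot be expressed.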
79 changes: 52 additions & 27 deletions face_swapper/src/networks/aad.py
@@ -5,25 +5,49 @@


class AAD(nn.Module):
def __init__(self, identity_channels : int, output_channels : int, num_blocks : int) -> None:
def __init__(self, identity_channels : int, output_channels : int, output_size : int, num_blocks : int) -> None:
super().__init__()
self.identity_channels = identity_channels
self.output_channels = output_channels
self.output_size = output_size
self.num_blocks = num_blocks
self.pixel_shuffle_up_sample = PixelShuffleUpSample(identity_channels, output_channels)
self.layers = self.create_layers(identity_channels, num_blocks)
self.layers = self.create_layers()

@staticmethod
def create_layers(identity_channels : int, num_blocks : int) -> nn.ModuleList:
return nn.ModuleList(
def create_layers(self) -> nn.ModuleList:
layers = nn.ModuleList(
[
AdaptiveFeatureModulation(1024, 1024, 1024, identity_channels, num_blocks),
AdaptiveFeatureModulation(1024, 1024, 2048, identity_channels, num_blocks),
AdaptiveFeatureModulation(1024, 1024, 1024, identity_channels, num_blocks),
AdaptiveFeatureModulation(1024, 512, 512, identity_channels, num_blocks),
AdaptiveFeatureModulation(512, 256, 256, identity_channels, num_blocks),
AdaptiveFeatureModulation(256, 128, 128, identity_channels, num_blocks),
AdaptiveFeatureModulation(128, 64, 64, identity_channels, num_blocks),
AdaptiveFeatureModulation(64, 3, 64, identity_channels, num_blocks)
AdaptiveFeatureModulation(1024, 1024, 1024, self.identity_channels, self.num_blocks),
AdaptiveFeatureModulation(1024, 1024, 2048, self.identity_channels, self.num_blocks),
AdaptiveFeatureModulation(1024, 1024, 1024, self.identity_channels, self.num_blocks),
AdaptiveFeatureModulation(1024, 512, 512, self.identity_channels, self.num_blocks),
AdaptiveFeatureModulation(512, 256, 256, self.identity_channels, self.num_blocks),
AdaptiveFeatureModulation(256, 128, 128, self.identity_channels, self.num_blocks),
AdaptiveFeatureModulation(128, 64, 64, self.identity_channels, self.num_blocks)
])

if self.output_size in [ 384, 512, 768, 1024 ]:
layers.append(AdaptiveFeatureModulation(64, 32, 32, self.identity_channels, self.num_blocks))
if self.output_size in [ 512, 768, 1024 ]:
layers.append(AdaptiveFeatureModulation(32, 16, 16, self.identity_channels, self.num_blocks))
if self.output_size in [ 768, 1024 ]:
layers.append(AdaptiveFeatureModulation(16, 8, 8, self.identity_channels, self.num_blocks))
if self.output_size == 1024:
layers.append(AdaptiveFeatureModulation(8, 4, 4, self.identity_channels, self.num_blocks))

if self.output_size == 256:
layers.append(AdaptiveFeatureModulation(64, 3, 64, self.identity_channels, self.num_blocks))
if self.output_size == 384:
layers.append(AdaptiveFeatureModulation(32, 3, 32, self.identity_channels, self.num_blocks))
if self.output_size == 512:
layers.append(AdaptiveFeatureModulation(16, 3, 16, self.identity_channels, self.num_blocks))
if self.output_size == 768:
layers.append(AdaptiveFeatureModulation(8, 3, 8, self.identity_channels, self.num_blocks))
if self.output_size == 1024:
layers.append(AdaptiveFeatureModulation(4, 3, 4, self.identity_channels, self.num_blocks))

return layers

def forward(self, source_embedding : Embedding, target_attributes : Attributes) -> Tensor:
temp_tensors = self.pixel_shuffle_up_sample(source_embedding)

@@ -41,37 +65,38 @@ def __init__(self, input_channels : int, output_channels : int, attribute_channels : int, identity_channels : int, num_blocks : int) -> None:
super().__init__()
self.input_channels = input_channels
self.output_channels = output_channels
self.primary_layers = self.create_primary_layers(input_channels, output_channels, attribute_channels, identity_channels, num_blocks)
self.shortcut_layers = self.create_shortcut_layers(input_channels, output_channels, attribute_channels, identity_channels)
self.attribute_channels = attribute_channels
self.identity_channels = identity_channels
self.num_blocks = num_blocks
self.primary_layers = self.create_primary_layers()
self.shortcut_layers = self.create_shortcut_layers()

@staticmethod
def create_primary_layers(input_channels : int, output_channels : int, attribute_channels : int, identity_channels : int, num_blocks : int) -> nn.ModuleList:
def create_primary_layers(self) -> nn.ModuleList:
primary_layers = nn.ModuleList()

for index in range(num_blocks):
for index in range(self.num_blocks):
primary_layers.extend(
[
FeatureModulation(input_channels, attribute_channels, identity_channels),
FeatureModulation(self.input_channels, self.attribute_channels, self.identity_channels),
nn.ReLU(inplace = True)
])

if index < num_blocks - 1:
primary_layers.append(nn.Conv2d(input_channels, input_channels, kernel_size = 3, padding = 1, bias = False))
if index < self.num_blocks - 1:
primary_layers.append(nn.Conv2d(self.input_channels, self.input_channels, kernel_size = 3, padding = 1, bias = False))
else:
primary_layers.append(nn.Conv2d(input_channels, output_channels, kernel_size = 3, padding = 1, bias = False))
primary_layers.append(nn.Conv2d(self.input_channels, self.output_channels, kernel_size = 3, padding = 1, bias = False))

return primary_layers

@staticmethod
def create_shortcut_layers(input_channels : int, output_channels : int, attribute_channels : int, identity_channels : int) -> nn.ModuleList:
def create_shortcut_layers(self) -> nn.ModuleList:
shortcut_layers = nn.ModuleList()

if input_channels > output_channels:
if self.input_channels > self.output_channels:
shortcut_layers.extend(
[
FeatureModulation(input_channels, attribute_channels, identity_channels),
FeatureModulation(self.input_channels, self.attribute_channels, self.identity_channels),
nn.ReLU(inplace = True),
nn.Conv2d(input_channels, output_channels, kernel_size = 3, padding = 1, bias = False)
nn.Conv2d(self.input_channels, self.output_channels, kernel_size = 3, padding = 1, bias = False)
])

return shortcut_layers
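The branch ladder above encodes a fixed set of resolutions; purely as an illustration (not code from the PR), the `(input_channels, output_channels)` pairs appended after the shared trunk are:

```python
# Illustration only: stages appended per output_size, read off the branches above.
# Every resolution ends in a 3-channel RGB projection.
EXTRA_STAGES = {
    256: [(64, 3)],
    384: [(64, 32), (32, 3)],
    512: [(64, 32), (32, 16), (16, 3)],
    768: [(64, 32), (32, 16), (16, 8), (8, 3)],
    1024: [(64, 32), (32, 16), (16, 8), (8, 4), (4, 3)],
}
```

An `output_size` outside this set would fall through every branch and produce no RGB head at all, which might deserve a guard.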
23 changes: 13 additions & 10 deletions face_swapper/src/networks/nld.py
@@ -6,25 +6,28 @@
class NLD(nn.Module):
def __init__(self, input_channels : int, num_filters : int, num_layers : int, kernel_size : int) -> None:
super().__init__()
self.layers = self.create_layers(input_channels, num_filters, num_layers, kernel_size)
self.input_channels = input_channels
self.num_filters = num_filters
self.num_layers = num_layers
self.kernel_size = kernel_size
self.layers = self.create_layers()
self.sequences = nn.Sequential(*self.layers)

@staticmethod
def create_layers(input_channels : int, num_filters : int, num_layers : int, kernel_size : int) -> nn.ModuleList:
padding = math.ceil((kernel_size - 1) / 2)
current_filters = num_filters
def create_layers(self) -> nn.ModuleList:
padding = math.ceil((self.kernel_size - 1) / 2)
current_filters = self.num_filters
layers = nn.ModuleList(
[
nn.Conv2d(input_channels, current_filters, kernel_size = kernel_size, stride = 2, padding = padding),
nn.Conv2d(self.input_channels, current_filters, kernel_size = self.kernel_size, stride = 2, padding = padding),
nn.LeakyReLU(0.2, True)
])

for _ in range(1, num_layers):
for _ in range(1, self.num_layers):
previous_filters = current_filters
current_filters = min(current_filters * 2, 512)
layers +=\
[
nn.Conv2d(previous_filters, current_filters, kernel_size = kernel_size, stride = 2, padding = padding),
nn.Conv2d(previous_filters, current_filters, kernel_size = self.kernel_size, stride = 2, padding = padding),
nn.InstanceNorm2d(current_filters),
nn.LeakyReLU(0.2, True)
]
@@ -33,10 +36,10 @@ def create_layers(input_channels : int, num_filters : int, num_layers : int, kernel_size : int) -> nn.ModuleList:
current_filters = min(current_filters * 2, 512)
layers +=\
[
nn.Conv2d(previous_filters, current_filters, kernel_size = kernel_size, padding = padding),
nn.Conv2d(previous_filters, current_filters, kernel_size = self.kernel_size, padding = padding),
nn.InstanceNorm2d(current_filters),
nn.LeakyReLU(0.2, True),
nn.Conv2d(current_filters, 1, kernel_size = kernel_size, padding = padding)
nn.Conv2d(current_filters, 1, kernel_size = self.kernel_size, padding = padding)
]
return layers

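The NLD refactor only moves constructor arguments onto `self`; behavior is unchanged. A hypothetical shape check with typical PatchGAN-style settings:

```python
import torch

discriminator = NLD(input_channels = 3, num_filters = 64, num_layers = 3, kernel_size = 4)
patch_logits = discriminator.sequences(torch.randn(1, 3, 256, 256))
print(patch_logits.shape)  # a logit map over overlapping image patches
```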