# MIT License
#
# Copyright (c) 2018 Haoxintong
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Residual Attention network, implemented in Gluon."""
from mxnet.gluon import nn
from mxnet.gluon.model_zoo.vision.resnet import BottleneckV2
from ..nn.basic_blocks import FrBase
__all__ = ["AttentionNet", "AttentionNetFace",
"get_attention_net", "get_attention_face",
"attention_net56", "attention_net92", "attention_net128",
"attention_net164", "attention_net236", "attention_net452"
]


class AttentionBlock(nn.HybridBlock):
def __init__(self, channels, out_size, stage, p=1, t=2, r=1, **kwargs):
r"""Residual Attention Block from
`"Residual Attention Network for Image Classification"
<https://arxiv.org/abs/1704.06904>`_ paper.
        Parameters
        ----------
        channels : int
            Number of output channels.
        out_size : int
            Size of the output feature map; currently only square feature
            maps are supported.
        stage : int
            Stage index, as described in Figure 2 of the paper.
        p : int
            Number of pre-processing Residual Units before the split into
            the trunk and mask branches.
        t : int
            Number of Residual Units in the trunk branch.
        r : int
            Number of Residual Units between adjacent pooling layers in the
            mask branch.
        **kwargs
            Additional keyword arguments passed to ``nn.HybridBlock``.
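
        Examples
        --------
        A minimal shape check (a sketch; the input is assumed to be NCHW
        with spatial size ``out_size`` and ``channels`` channels, since the
        residual units here do not downsample):

        >>> import mxnet as mx
        >>> blk = AttentionBlock(256, 56, stage=1)
        >>> blk.initialize()
        >>> x = mx.nd.random.uniform(shape=(1, 256, 56, 56))
        >>> blk(x).shape
        (1, 256, 56, 56)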
"""
super().__init__(**kwargs)
with self.name_scope():
self.pre = nn.HybridSequential()
for i in range(p):
self.pre.add(BottleneckV2(channels, 1, prefix='pre_%d_' % i))
self.trunk_branch = nn.HybridSequential()
for i in range(t):
self.trunk_branch.add(BottleneckV2(channels, 1, prefix='trunk_%d_' % i))
self.mask_branch = _MaskBlock(channels, r, out_size, stage, prefix='mask_')
self.post = nn.HybridSequential()
for i in range(p):
self.post.add(BottleneckV2(channels, 1, prefix='post_%d_' % i))

    def hybrid_forward(self, F, x, *args, **kwargs):
x = self.pre(x)
mask = self.mask_branch(x)
trunk = self.trunk_branch(x)
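        # Attention residual learning from the paper: H(x) = (1 + M(x)) * T(x),
        # so the soft mask gates trunk features without suppressing them to zero.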
out = (1 + mask) * trunk
out = self.post(out)
return out


class _UpSampleBlock(nn.HybridBlock):
def __init__(self, out_size, **kwargs):
super().__init__(**kwargs)
self._size = out_size

    def hybrid_forward(self, F, x, *args, **kwargs):
return F.contrib.BilinearResize2D(x, height=self._size, width=self._size)


class _MaskBlock(nn.HybridBlock):
def __init__(self, channels, r, out_size, stage, **kwargs):
super().__init__(**kwargs)
self._stage = stage
with self.name_scope():
self._make_layers(channels, r, stage, out_size)

    def _make_layers(self, channels, r, stage, out_size):
if stage <= 1:
self.down_sample_1 = nn.MaxPool2D(3, 2, 1)
self.down_res_unit_1 = nn.HybridSequential()
for i in range(r):
self.down_res_unit_1.add(BottleneckV2(channels, 1, prefix="down_res1_%d_" % i))
self.skip_connection_1 = BottleneckV2(channels, 1)
self.up_res_unit_1 = nn.HybridSequential()
for i in range(r):
self.up_res_unit_1.add(BottleneckV2(channels, 1, prefix="up_res1_%d_" % i))
self.up_sample_1 = _UpSampleBlock(out_size)
out_size = out_size // 2
if stage <= 2:
self.down_sample_2 = nn.MaxPool2D(3, 2, 1)
self.down_res_unit_2 = nn.HybridSequential()
for i in range(r):
self.down_res_unit_2.add(BottleneckV2(channels, 1, prefix="down_res2_%d_" % i))
self.skip_connection_2 = BottleneckV2(channels, 1)
self.up_res_unit_2 = nn.HybridSequential()
for i in range(r):
self.up_res_unit_2.add(BottleneckV2(channels, 1, prefix="up_res2_%d_" % i))
self.up_sample_2 = _UpSampleBlock(out_size)
out_size = out_size // 2
if stage <= 3:
self.down_sample_3 = nn.MaxPool2D(3, 2, 1)
self.down_res_unit_3 = nn.HybridSequential()
for i in range(r):
self.down_res_unit_3.add(BottleneckV2(channels, 1, prefix="down_res3_%d_" % i))
self.up_res_unit_3 = nn.HybridSequential()
for i in range(r):
self.up_res_unit_3.add(BottleneckV2(channels, 1, prefix="up_res3_%d_" % i))
self.up_sample_3 = _UpSampleBlock(out_size)
self.output = nn.HybridSequential()
self.output.add(nn.BatchNorm(),
nn.Activation('relu'),
nn.Conv2D(channels, kernel_size=1, strides=1, use_bias=False),
nn.BatchNorm(),
nn.Activation('relu'),
nn.Conv2D(channels, kernel_size=1, strides=1, use_bias=False),
nn.Activation('sigmoid')
)

    def hybrid_forward(self, F, x, *args, **kwargs):
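        # Bottom-up, top-down structure: pool down a stage-dependent number of
        # times (3 for stage 1, 2 for stage 2, 1 for stage 3), then bilinearly
        # upsample back, adding skip connections at matching resolutions, and
        # finish with two 1x1 convolutions and a sigmoid to produce the mask.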
if self._stage <= 1:
x_down1 = self.down_sample_1(x)
x_down1 = self.down_res_unit_1(x_down1)
residual_1 = self.skip_connection_1(x_down1)
else:
x_down1 = x
residual_1 = 0
if self._stage <= 2:
x_down2 = self.down_sample_2(x_down1)
x_down2 = self.down_res_unit_2(x_down2)
residual_2 = self.skip_connection_2(x_down2)
else:
            x_down2 = x_down1
residual_2 = 0
if self._stage <= 3:
x_down3 = self.down_sample_3(x_down2)
x_down3 = self.down_res_unit_3(x_down3)
x_up3 = self.up_res_unit_3(x_down3)
x_up3 = self.up_sample_3(x_up3)
else:
            raise ValueError("stage must be an integer no larger than 3, got %d." % self._stage)
if self._stage <= 2:
x_up2 = x_up3 + residual_2
x_up2 = self.up_res_unit_2(x_up2)
x_up2 = self.up_sample_2(x_up2)
else:
x_up2 = x_up3
if self._stage <= 1:
x_up1 = x_up2 + residual_1
x_up1 = self.up_res_unit_1(x_up1)
x_up1 = self.up_sample_1(x_up1)
else:
x_up1 = x_up2
out = self.output(x_up1)
return out


class AttentionNet(nn.HybridBlock):
r"""AttentionNet Model from
`"Residual Attention Network for Image Classification"
<https://arxiv.org/abs/1704.06904>`_ paper.
Parameters
----------
    classes : int
        Number of classification classes.
    modules : list
        Number of Attention Modules in each stage.
    p : int
        Number of pre-processing Residual Units before the split into the
        trunk and mask branches.
    t : int
        Number of Residual Units in the trunk branch.
    r : int
        Number of Residual Units between adjacent pooling layers in the
        mask branch.
    **kwargs
        Additional keyword arguments passed to ``nn.HybridBlock``.
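
    Examples
    --------
    A quick shape check (a sketch; assumes a 224x224 ImageNet-style input,
    which the spatial-size comments in ``__init__`` are written for):

    >>> import mxnet as mx
    >>> net = AttentionNet(classes=1000, modules=[1, 1, 1], p=1, t=2, r=1)
    >>> net.initialize()
    >>> x = mx.nd.random.uniform(shape=(1, 3, 224, 224))
    >>> net(x).shape
    (1, 1000)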
"""

    def __init__(self, classes, modules, p, t, r, **kwargs):
super().__init__(**kwargs)
assert len(modules) == 3
with self.name_scope():
self.features = nn.HybridSequential()
# 112x112
self.features.add(nn.Conv2D(64, 3, 2, 1, use_bias=False))
self.features.add(nn.BatchNorm())
self.features.add(nn.Activation('relu'))
# 56x56
self.features.add(nn.MaxPool2D(3, 2, 1))
self.features.add(BottleneckV2(256, 1, True, 64))
for _ in range(modules[0]):
self.features.add(AttentionBlock(256, 56, 1, p, t, r))
# 28x28
self.features.add(BottleneckV2(512, 2, True, 256))
for _ in range(modules[1]):
self.features.add(AttentionBlock(512, 28, 2, p, t, r))
# 14x14
self.features.add(BottleneckV2(1024, 2, True, 512))
for _ in range(modules[2]):
self.features.add(AttentionBlock(1024, 14, 3, p, t, r))
# 7x7
self.features.add(BottleneckV2(2048, 2, True, 1024),
BottleneckV2(2048, 1),
BottleneckV2(2048, 1))
# 2048
self.features.add(nn.BatchNorm(),
nn.Activation('relu'),
nn.GlobalAvgPool2D(),
nn.Flatten())
# classes
self.output = nn.Dense(classes)

    def hybrid_forward(self, F, x, *args, **kwargs):
x = self.features(x)
x = self.output(x)
return x


class AttentionNetFace(FrBase):
r"""AttentionNet Model for input 112x112.
Parameters
----------
    classes : int
        Number of classification classes.
    modules : list
        Number of Attention Modules in each stage.
    p : int
        Number of pre-processing Residual Units before the split into the
        trunk and mask branches.
    t : int
        Number of Residual Units in the trunk branch.
    r : int
        Number of Residual Units between adjacent pooling layers in the
        mask branch.
    embedding_size : int
        Number of units in the embedding layer.
    weight_norm : bool, default False
        Whether to use weight normalization in the NormDense layer.
    feature_norm : bool, default False
        Whether to use feature normalization in the NormDense layer.
    need_cls_layer : bool, default True
        Whether to use the NormDense layer; this normally depends on your
        loss function. When you use Softmax, ArcLoss, or another
        Softmax-based loss, set it to True. When you only need the
        embedding output, e.g. for prediction or training with triplet
        loss, set it to False.
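
    Examples
    --------
    A sketch of embedding extraction (assumes ``FrBase`` returns the
    ``embedding_size``-dimensional feature when ``need_cls_layer=False``,
    as the parameter description above indicates):

    >>> import mxnet as mx
    >>> net = AttentionNetFace(classes=-1, modules=[1, 1, 1], p=1, t=2, r=1,
    ...                        need_cls_layer=False)
    >>> net.initialize()
    >>> x = mx.nd.random.uniform(shape=(1, 3, 112, 112))
    >>> net(x).shape
    (1, 512)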
"""

    def __init__(self, classes, modules, p, t, r,
weight_norm=False, feature_norm=False, embedding_size=512,
need_cls_layer=True, **kwargs):
super().__init__(classes, embedding_size, weight_norm, feature_norm, need_cls_layer, **kwargs)
assert len(modules) == 3
with self.name_scope():
self.features = nn.HybridSequential()
# 112x112
self.features.add(nn.Conv2D(64, 3, 1, 1, use_bias=False))
self.features.add(nn.BatchNorm())
self.features.add(nn.Activation('relu'))
# 56x56
self.features.add(BottleneckV2(256, 2, True, 64))
for _ in range(modules[0]):
self.features.add(AttentionBlock(256, 56, 1, p, t, r))
# 28x28
self.features.add(BottleneckV2(512, 2, True, 256))
for _ in range(modules[1]):
self.features.add(AttentionBlock(512, 28, 2, p, t, r))
# 14x14
self.features.add(BottleneckV2(1024, 2, True, 512))
for _ in range(modules[2]):
self.features.add(AttentionBlock(1024, 14, 3, p, t, r))
            # 7x7
self.features.add(BottleneckV2(2048, 2, True, 1024),
BottleneckV2(2048, 1),
BottleneckV2(2048, 1))
# 2048
self.features.add(nn.BatchNorm(),
nn.Activation('relu'),
nn.GlobalAvgPool2D(),
nn.Flatten())
# embedding
self.features.add(nn.Dense(embedding_size, use_bias=False),
nn.BatchNorm(scale=False, center=False),
nn.PReLU())


# Specification: ([p, t, r], [stage1, stage2, stage3]).
# The hyper-parameters follow section 4.1 of the paper. When `p, t, r = 1, 2, 1`,
# the total number of layers is 36m + 20, where m is the number of Attention
# Modules in each stage.
attention_net_spec = {56: ([1, 2, 1], [1, 1, 1]),
92: ([1, 2, 1], [1, 2, 3]),
128: ([1, 2, 1], [3, 3, 3]),
164: ([1, 2, 1], [4, 4, 4]),
236: ([1, 2, 1], [6, 6, 6]),
452: ([2, 4, 3], [6, 6, 6])}
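# Worked examples of the 36m + 20 count: m = 1 gives 36 * 1 + 20 = 56
# (attention_net56) and m = 3 gives 36 * 3 + 20 = 128 (attention_net128).
# attention_net452 keeps m = 6 but deepens each module with `p, t, r = 2, 4, 3`,
# so the formula does not apply to it.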


# Constructors
def get_attention_net(classes, num_layers, **kwargs):
r"""AttentionNet Model from
`"Residual Attention Network for Image Classification"
<https://arxiv.org/abs/1704.06904>`_ paper.
Parameters
----------
    classes : int
        Number of classification classes.
    num_layers : int
        Number of layers. Options are 56, 92, 128, 164, 236, 452.
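
    Examples
    --------
    A usage sketch (assumes a 224x224 input, matching the stride layout of
    ``AttentionNet``):

    >>> import mxnet as mx
    >>> net = get_attention_net(1000, 92)
    >>> net.initialize()
    >>> net(mx.nd.random.uniform(shape=(1, 3, 224, 224))).shape
    (1, 1000)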
"""
assert num_layers in attention_net_spec, \
"Invalid number of layers: %d. Options are %s" % (
num_layers, str(attention_net_spec.keys()))
ptr, modules = attention_net_spec[num_layers]
assert len(ptr) == len(modules) == 3
p, t, r = ptr
net = AttentionNet(classes, modules, p, t, r, **kwargs)
return net


def get_attention_face(classes=-1, num_layers=128, embedding_size=512, need_cls_layer=True, **kwargs):
r"""AttentionNet Model for 112x112 face images from
`"Residual Attention Network for Image Classification"
<https://arxiv.org/abs/1704.06904>`_ paper.
Parameters
----------
    classes : int, default -1
        Number of classification classes.
    num_layers : int, default 128
        Number of layers. Options are 56, 92, 128, 164, 236, 452.
    embedding_size : int, default 512
        Feature dimension of the embedding layer.
need_cls_layer : bool, default True
Whether to use NormDense output layer.
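
    Examples
    --------
    A sketch of building an embedding extractor (hedged: assumes the network
    outputs the ``embedding_size``-dimensional feature when
    ``need_cls_layer=False``):

    >>> import mxnet as mx
    >>> net = get_attention_face(num_layers=56, need_cls_layer=False)
    >>> net.initialize()
    >>> net(mx.nd.random.uniform(shape=(1, 3, 112, 112))).shape
    (1, 512)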
"""
assert num_layers in attention_net_spec, \
"Invalid number of layers: %d. Options are %s" % (
num_layers, str(attention_net_spec.keys()))
ptr, modules = attention_net_spec[num_layers]
assert len(ptr) == len(modules) == 3
p, t, r = ptr
net = AttentionNetFace(classes, modules, p, t, r, embedding_size=embedding_size, need_cls_layer=need_cls_layer, **kwargs)
return net


def attention_net56(classes=-1, need_cls_layer=True, **kwargs):
r"""AttentionNet 56 Model for face recognition.
Parameters
----------
    classes : int, default -1
Number of classification classes.
need_cls_layer : bool, default True
Whether to use NormDense output layer.
"""
return get_attention_face(classes, 56, need_cls_layer=need_cls_layer, **kwargs)


def attention_net92(classes=-1, need_cls_layer=True, **kwargs):
r"""AttentionNet 92 Model for face recognition.
Parameters
----------
    classes : int, default -1
Number of classification classes.
need_cls_layer : bool, default True
Whether to use NormDense output layer.
"""
return get_attention_face(classes, 92, need_cls_layer=need_cls_layer, **kwargs)


def attention_net128(classes=-1, need_cls_layer=True, **kwargs):
r"""AttentionNet 128 Model for face recognition.
Parameters
----------
    classes : int, default -1
Number of classification classes.
need_cls_layer : bool, default True
Whether to use NormDense output layer.
"""
return get_attention_face(classes, 128, need_cls_layer=need_cls_layer, **kwargs)


def attention_net164(classes=-1, need_cls_layer=True, **kwargs):
r"""AttentionNet 164 Model for face recognition.
Parameters
----------
    classes : int, default -1
Number of classification classes.
need_cls_layer : bool, default True
Whether to use NormDense output layer.
"""
return get_attention_face(classes, 164, need_cls_layer=need_cls_layer, **kwargs)


def attention_net236(classes=-1, need_cls_layer=True, **kwargs):
r"""AttentionNet 236 Model for face recognition.
Parameters
----------
    classes : int, default -1
Number of classification classes.
need_cls_layer : bool, default True
Whether to use NormDense output layer.
"""
return get_attention_face(classes, 236, need_cls_layer=need_cls_layer, **kwargs)


def attention_net452(classes=-1, need_cls_layer=True, **kwargs):
r"""AttentionNet 452 Model for face recognition.
Parameters
----------
    classes : int, default -1
Number of classification classes.
need_cls_layer : bool, default True
Whether to use NormDense output layer.
"""
return get_attention_face(classes, 452, need_cls_layer=need_cls_layer, **kwargs)