Source code for gluonfr.loss

# MIT License
#
# Copyright (c) 2018 Haoxintong
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Custom losses"""
import math
import numpy as np
from mxnet import nd, init
from mxnet.gluon.loss import Loss, SoftmaxCrossEntropyLoss

__all__ = ["get_loss", "get_loss_list", "ArcLoss", "TripletLoss", "RingLoss",
           "CosLoss", "L2Softmax", "ASoftmax", "CenterLoss", "ContrastiveLoss", "LGMLoss",
           "MPSLoss", "GitLoss", "COCOLoss", "SVXSoftmax"]
numeric_types = (float, int, np.generic)


def _apply_weighting(F, loss, weight=None, sample_weight=None):
    """Apply weighting to loss.

    Parameters
    ----------
    loss : Symbol
        The loss to be weighted.
    weight : float or None
        Global scalar weight for loss.
    sample_weight : Symbol or None
        Per sample weighting. Must be broadcastable to
        the same shape as loss. For example, if loss has
        shape (64, 10) and you want to weight each sample
        in the batch separately, `sample_weight` should have
        shape (64, 1).

    Returns
    -------
    loss : Symbol
        Weighted loss
    """
    if sample_weight is not None:
        loss = F.broadcast_mul(loss, sample_weight)

    if weight is not None:
        assert isinstance(weight, numeric_types), "weight must be a number"
        loss = loss * weight

    return loss


def _reshape_like(F, x, y):
    """Reshapes x to the same shape as y."""
    return x.reshape(y.shape) if F is nd else F.reshape_like(x, y)


class L2Softmax(SoftmaxCrossEntropyLoss):
    r"""L2Softmax from
    `"L2-constrained Softmax Loss for Discriminative Face Verification"
    <https://arxiv.org/abs/1703.09507>`_ paper.

    Parameters
    ----------
    classes: int.
        Number of classes.
    alpha: float.
        The scaling parameter; a hypersphere with a small alpha
        limits the surface area available for the embedding features.
    p: float, default is 0.9.
        The expected average softmax probability for correctly
        classifying a feature.
    from_normx: bool, default is False.
        Whether the input has already been normalized.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
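
    Example:
        A minimal usage sketch; shapes and hyper-parameters are illustrative, and the
        input ``x`` is expected to have shape (batch_size, classes)::

            >>> import mxnet as mx
            >>> loss = L2Softmax(classes=10, alpha=16)
            >>> x = mx.nd.random.uniform(-1, 1, shape=(4, 10))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(x, label).shape
            (4,)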
    """

    def __init__(self, classes, alpha, p=0.9, from_normx=False,
                 axis=-1, sparse_label=True, weight=None, batch_axis=0, **kwargs):
        super().__init__(axis=axis, sparse_label=sparse_label, weight=weight, batch_axis=batch_axis, **kwargs)
        alpha_low = math.log(p * (classes - 2) / (1 - p))
        assert alpha > alpha_low, "For the given probability p={}, alpha should be higher than {}.".format(p, alpha_low)
        self.alpha = alpha
        self._from_normx = from_normx

    def hybrid_forward(self, F, x, label, sample_weight=None):
        if not self._from_normx:
            x = F.L2Normalization(x, mode='instance', name='fc1n')
        fc7 = x * self.alpha
        return super().hybrid_forward(F, pred=fc7, label=label, sample_weight=sample_weight)


class CosLoss(SoftmaxCrossEntropyLoss):
    r"""CosLoss from
       `"CosFace: Large Margin Cosine Loss for Deep Face Recognition"
       <https://arxiv.org/abs/1801.09414>`_ paper.

       It is also AM-Softmax from
       `"Additive Margin Softmax for Face Verification"
       <https://arxiv.org/abs/1801.05599>`_ paper.

    Parameters
    ----------
    classes: int.
        Number of classes.
    m: float, default 0.4
        Margin parameter for loss.
    s: int, default 64
        Scale parameter for loss.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
    """

    def __init__(self, classes, m, s, dtype="float32", **kwargs):
        super().__init__(**kwargs)
        self._classes = classes
        self._scale = s
        self._margin = m
        self._dtype = dtype

    def hybrid_forward(self, F, x, label, sample_weight=None):
        if self._sparse_label:
            one_hot_label = F.one_hot(label, depth=self._classes, on_value=1.0, off_value=0.0, dtype=self._dtype)
        else:
            one_hot_label = label

        body = one_hot_label * self._margin
        fc7 = (x - body) * self._scale

        return super().hybrid_forward(F, pred=fc7, label=label, sample_weight=sample_weight)


class ArcLoss(SoftmaxCrossEntropyLoss):
    r"""ArcLoss from
    `"ArcFace: Additive Angular Margin Loss for Deep Face Recognition"
    <https://arxiv.org/abs/1801.07698>`_ paper.

    Parameters
    ----------
    classes: int.
        Number of classes.
    m: float.
        Margin parameter for loss.
    s: int.
        Scale parameter for loss.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
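
    Example:
        A minimal usage sketch; ``pred`` is assumed to hold the cosine similarities
        (one column per class) produced by a normalized fully-connected layer::

            >>> import mxnet as mx
            >>> loss = ArcLoss(classes=10, m=0.5, s=64)
            >>> pred = mx.nd.random.uniform(-1, 1, shape=(4, 10))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(pred, label).shape
            (4,)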
    """

    def __init__(self, classes, m=0.5, s=64, easy_margin=True, dtype="float32", **kwargs):
        super().__init__(**kwargs)
        assert s > 0.
        assert 0 <= m < (math.pi / 2)
        self.s = s
        self.m = m
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.mm = math.sin(math.pi - m) * m
        self.threshold = math.cos(math.pi - m)
        self._classes = classes
        self.easy_margin = easy_margin
        self._dtype = dtype

    def hybrid_forward(self, F, pred, label, sample_weight=None, *args, **kwargs):
        cos_t = F.pick(pred, label, axis=1)  # cos(theta_yi)
        if self.easy_margin:
            cond = F.Activation(data=cos_t, act_type='relu')
        else:
            cond_v = cos_t - self.threshold
            cond = F.Activation(data=cond_v, act_type='relu')

        # sin_t = F.sqrt(1.0 - cos_t * cos_t)  # sin(theta)
        # new_zy = cos_t * self.cos_m - sin_t * self.sin_m  # cos(theta_yi + m)

        new_zy = F.cos(F.arccos(cos_t) + self.m)  # cos(theta_yi + m)
        if self.easy_margin:
            zy_keep = cos_t
        else:
            zy_keep = cos_t - self.mm  # (cos(theta_yi) - sin(pi - m)*m)
        new_zy = F.where(cond, new_zy, zy_keep)
        diff = new_zy - cos_t  # cos(theta_yi + m) - cos(theta_yi)
        diff = F.expand_dims(diff, 1)  # shape=(b, 1)
        gt_one_hot = F.one_hot(label, depth=self._classes, on_value=1.0, off_value=0.0, dtype=self._dtype)
        body = F.broadcast_mul(gt_one_hot, diff)
        pred = pred + body
        pred = pred * self.s

        return super().hybrid_forward(F, pred=pred, label=label, sample_weight=sample_weight)


class TripletLoss(Loss):
    r"""Calculates triplet loss given three input tensors and a positive margin.
    Triplet loss measures the relative similarity between a prediction, a positive
    example, and a negative example:

    .. math::
        L = \sum_i \max(\Vert {pred}_i - {pos_i} \Vert_2^2 -
                        \Vert {pred}_i - {neg_i} \Vert_2^2 + {margin}, 0)

    `pred`, `positive` and `negative` can have arbitrary shape as long as they
    have the same number of elements.

    Parameters
    ----------
    margin: float
        Margin of separation between correct and incorrect pair.
    weight: float or None
        Global scalar weight for loss.
    batch_axis: int, default 0
        The axis that represents mini-batch.


    Inputs:
        - **pred**: prediction tensor with arbitrary shape
        - **positive**: positive example tensor with arbitrary shape. Must have
          the same size as pred.
        - **negative**: negative example tensor with arbitrary shape. Must have
          the same size as pred.

    Outputs:
        - **loss**: loss tensor with shape (batch_size,).
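
    Example:
        A minimal usage sketch with random embeddings (shapes are illustrative)::

            >>> import mxnet as mx
            >>> loss = TripletLoss(margin=1)
            >>> pred = mx.nd.random.normal(shape=(4, 128))
            >>> positive = mx.nd.random.normal(shape=(4, 128))
            >>> negative = mx.nd.random.normal(shape=(4, 128))
            >>> loss(pred, positive, negative).shape
            (4,)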
    """

    def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs):
        super(TripletLoss, self).__init__(weight, batch_axis, **kwargs)
        self._margin = margin

    def hybrid_forward(self, F, pred, positive, negative):
        positive = _reshape_like(F, positive, pred)
        negative = _reshape_like(F, negative, pred)
        loss = F.sum(F.square(pred - positive) - F.square(pred - negative),
                     axis=self._batch_axis, exclude=True)
        loss = F.relu(loss + self._margin)
        return _apply_weighting(F, loss, self._weight, None)


class ContrastiveLoss(Loss):
    r"""Computes the contrastive loss.
    See `"Dimensionality Reduction by Learning an Invariant Mapping"
    <http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf>`_ paper.
    This loss encourages embeddings of samples with the same label to be close
    to each other, and embeddings of samples with different labels to be at
    least the margin constant apart.


    Parameters
    ----------
    margin: float, default is 1.
        Margin term in the loss definition.


    Inputs:
        - **anchor**: prediction tensor. Embeddings should be l2 normalized.
        - **positive**: positive example tensor with arbitrary shape. Must have
          the same size as anchor. Embeddings should be l2 normalized.
        - **labels**: array with shape (batch_size,) of
          binary labels indicating positive vs negative pair.

    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
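
    Example:
        A minimal usage sketch; the pair label is 1 for a genuine pair and 0 for an
        impostor pair (shapes are illustrative)::

            >>> import mxnet as mx
            >>> loss = ContrastiveLoss(margin=1)
            >>> anchor = mx.nd.L2Normalization(mx.nd.random.normal(shape=(4, 128)))
            >>> positive = mx.nd.L2Normalization(mx.nd.random.normal(shape=(4, 128)))
            >>> pair_label = mx.nd.array([1, 0, 1, 0])
            >>> loss(anchor, positive, pair_label).shape
            (4,)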
      """

    def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs):
        super().__init__(weight, batch_axis, **kwargs)
        self._margin = margin

    def hybrid_forward(self, F, anchor, positive, labels):
        positive = _reshape_like(F, positive, anchor)
        dists = F.norm(anchor - positive, axis=1)  # Euclidean distance between the pair
        loss = labels * F.square(dists) + (1 - labels) * F.square(F.maximum(self._margin - dists, 0))
        return _apply_weighting(F, loss, self._weight, None)


class RingLoss(SoftmaxCrossEntropyLoss):
    r"""Computes the Ring Loss from
    `"Ring loss: Convex Feature Normalization for Face Recognition"
    <https://arxiv.org/abs/1803.00130>`_ paper.

    .. math::
        L = -\sum_i \log \text{softmax}({pred})_{i,{label}_i} + \frac{\lambda}{2m} \sum_{i=1}^{m}
         (\Vert \mathcal{F}({x}_i)\Vert_2 - R )^2

    Parameters
    ----------

    lamda: float.
        The loss weight enforcing a trade-off between the softmax loss and ring loss.
    r_init: float.
        The initial value of the hyperparameter R.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
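
    Example:
        A minimal usage sketch; ``pred`` holds the class logits and ``embedding`` the
        features whose norm is regularized (shapes are illustrative)::

            >>> import mxnet as mx
            >>> loss = RingLoss(lamda=0.01)
            >>> loss.initialize()  # creates the learnable radius R
            >>> pred = mx.nd.random.normal(shape=(4, 10))
            >>> embedding = mx.nd.random.normal(shape=(4, 128))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(pred, label, embedding).shape
            (4,)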

    """

    def __init__(self, lamda, r_init=1.0, dtype='float32', **kwargs):
        super().__init__(**kwargs)

        self._lamda = lamda
        self.R = self.params.get('R', shape=(1,), init=init.Constant(r_init),
                                 dtype=dtype, allow_deferred_init=True)

    def hybrid_forward(self, F, pred, label, embedding, R, sample_weight=None):
        # RingLoss
        emb_norm = F.norm(embedding, axis=1)
        loss_r = F.square(F.broadcast_sub(emb_norm, R)) * 0.5
        loss_r = _apply_weighting(F, loss_r, self._weight, sample_weight)

        # Softmax
        loss_sm = super().hybrid_forward(F, pred, label, sample_weight)

        return loss_sm + self._lamda * loss_r


class ASoftmax(SoftmaxCrossEntropyLoss):
    r"""ASoftmax from
    `"SphereFace: Deep Hypersphere Embedding for Face Recognition"
    <https://arxiv.org/pdf/1704.08063.pdf>`_ paper.
    The weight and the input x are both expected to be already normalized.

    Parameters
    ----------
    classes: int.
        Number of classes.
    m: float.
        Margin parameter for loss.
    s: int.
        Scale parameter for loss.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
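
    Example:
        A minimal usage sketch; with ``phiflag=True`` the margin ``m`` is used as an
        integer index in [0, 5], and ``x`` holds the cosine similarities::

            >>> import mxnet as mx
            >>> loss = ASoftmax(classes=10, m=4, s=30)
            >>> x = mx.nd.random.uniform(-1, 1, shape=(4, 10))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(x, label).shape
            (4,)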
    """

    def __init__(self, classes, m, s, phiflag=True, dtype="float32", **kwargs):
        super().__init__(**kwargs)
        self._classes = classes
        self._scale = s
        self._margin = m
        self._phiflag = phiflag
        self._dtype = dtype
        self.it = 0
        self.LambdaMin = 5.0
        self.LambdaMax = 1500.0
        self.lamb = 1500.0
        self.mlambda = [
            lambda x: x ** 0,
            lambda x: x ** 1,
            lambda x: 2 * x ** 2 - 1,
            lambda x: 4 * x ** 3 - 3 * x,
            lambda x: 8 * x ** 4 - 8 * x ** 2 + 1,
            lambda x: 16 * x ** 5 - 20 * x ** 3 + 5 * x
        ]

    @staticmethod
    def _myphi(x, m):
        x = x * m
        return 1 - x ** 2 / math.factorial(2) + x ** 4 / math.factorial(4) - x ** 6 / math.factorial(6) + \
               x ** 8 / math.factorial(8) - x ** 10 / math.factorial(10)

    def hybrid_forward(self, F, x, label, sample_weight=None):
        cos_theta = F.clip(x, -1, 1)

        if self._phiflag:
            cos_m_theta = self.mlambda[int(self._margin)](cos_theta)
            theta = cos_theta.arccos()
            k = (self._margin * theta / math.pi).floor()
            n_one = k * 0.0 - 1
            phi_theta = (n_one ** k) * cos_m_theta - 2 * k
        else:
            theta = cos_theta.arccos()
            phi_theta = self._myphi(theta, self._margin)
            phi_theta = phi_theta.clip(-1 * self._margin, 1)

        if self._sparse_label:
            one_hot_label = F.one_hot(label, depth=self._classes, on_value=1.0, off_value=0.0, dtype=self._dtype)
        else:
            one_hot_label = label

        self.it += 1
        self.lamb = max(self.LambdaMin, self.LambdaMax / (1 + 0.1 * self.it))
        diff = (phi_theta - x) * 1.0 / (1 + self.lamb)

        body = one_hot_label * diff
        fc7 = (x + body) * self._scale

        return super().hybrid_forward(F, pred=fc7, label=label, sample_weight=sample_weight)


class CenterLoss(SoftmaxCrossEntropyLoss):
    r"""Computes the Center Loss from
    `"A Discriminative Feature Learning Approach for Deep Face Recognition"
    <http://ydwen.github.io/papers/WenECCV16.pdf>`_ paper.

    The implementation refers to
    https://github.com/ShownX/mxnet-center-loss/blob/master/center_loss.py

    Parameters
    ----------
    classes: int.
        Number of classes.
    embedding_size: int.
        Size of feature.
    lamda: float.
        The loss weight enforcing a trade-off between the softmax loss and center loss.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
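
    Example:
        A minimal usage sketch; ``x`` holds the class logits and ``embeddings`` the
        features compared against the learnable class centers::

            >>> import mxnet as mx
            >>> loss = CenterLoss(classes=10, embedding_size=128, lamda=0.1)
            >>> loss.initialize()  # creates the (classes, embedding_size) centers
            >>> x = mx.nd.random.normal(shape=(4, 10))
            >>> embeddings = mx.nd.random.normal(shape=(4, 128))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(x, label, embeddings).shape
            (4,)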

    """

    def __init__(self, classes, embedding_size, lamda, weight_initializer=init.Xavier(magnitude=2.24),
                 dtype='float32', **kwargs):
        super().__init__(**kwargs)
        self._lamda = lamda
        self._classes = classes
        self._dtype = dtype
        self.centers = self.params.get('centers', shape=(classes, embedding_size), init=weight_initializer,
                                       dtype=dtype, allow_deferred_init=True)

    def hybrid_forward(self, F, x, label, embeddings, centers, sample_weight=None):
        # loss center
        centers_count = F.take(F.sum(F.one_hot(label, depth=self._classes, dtype=self._dtype), axis=0), label)
        centers_selected = F.take(centers, label)
        loss_c = self._lamda * 0.5 * F.sum(F.square(embeddings - centers_selected), 1) / centers_count

        # Softmax
        loss_sm = super().hybrid_forward(F, x, label, sample_weight)
        return loss_sm + loss_c


class LGMLoss(Loss):
    r"""LGM Loss from
    `"Rethinking Feature Distribution for Loss Functions in Image Classification"
    <https://arxiv.org/abs/1803.02988>`_ paper.

    The implementation refers to
    https://github.com/LeeJuly30/L-GM-Loss-For-Gluon/blob/master/L_GM.py


    Parameters
    ----------
    num_classes: int.
        The num of classes.
    embedding_size: int.
        The size of embedding feature.
    alpha: float.
        A non-negative parameter controlling the size of the expected margin between
        two classes on the training set.
    lamda: float.
        A non-negative weighting coefficient.
    lr_mult: float.
        Updating the variance requires a relatively low learning rate compared to the overall learning rate.
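
    Example:
        A minimal usage sketch; the block takes raw features and returns both the
        loss and the classification probabilities (shapes are illustrative)::

            >>> import mxnet as mx
            >>> loss = LGMLoss(num_classes=10, embedding_size=128, alpha=0.3, lamda=0.01, lr_mult=0.1)
            >>> loss.initialize()  # creates the per-class mean and var parameters
            >>> x = mx.nd.random.normal(shape=(4, 128))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> l, prob = loss(x, label)
            >>> l.shape, prob.shape
            ((4,), (4, 10))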
    """

    def __init__(self, num_classes, embedding_size, alpha, lamda, lr_mult, dtype="float32", **kwargs):
        super().__init__(weight=None, batch_axis=0, **kwargs)
        self._num_class = num_classes
        self._feature_dim = embedding_size
        self._alpha = alpha
        self._lamda = lamda
        self._dtype = dtype
        self.mean = self.params.get('mean', shape=(num_classes, embedding_size), init=init.Xavier())
        self.var = self.params.get('var', shape=(num_classes, embedding_size), init=init.Constant(1), lr_mult=lr_mult)

    def _classification_probability(self, F, x, label, mean, var):
        reshape_var = F.reshape(var, (-1, 1, self._feature_dim))
        reshape_mean = F.reshape(mean, (-1, 1, self._feature_dim))
        x = F.expand_dims(x, 0)
        x = F.broadcast_minus(x, reshape_mean)
        d_z = F.elemwise_mul(F.broadcast_div(x, (reshape_var + 1e-8)), x)
        d_z = F.transpose(F.sum(d_z, axis=2) / 2)

        mask = F.one_hot(label, self._num_class, dtype=self._dtype) * self._alpha + 1
        margin_d_z = d_z * mask
        probability = F.broadcast_div(F.exp(-margin_d_z), (F.sqrt(F.prod(var, 1)) + 1e-8))
        return probability, d_z

    def hybrid_forward(self, F, x, label, mean, var):
        probability, m_distance = self._classification_probability(F, x, label, mean, var)

        # classification loss
        class_probability = F.pick(probability, label, axis=1)
        loss_cls = -F.log(class_probability / (F.sum(probability, 1) + 1e-8) + 1e-8)

        # likelihood loss
        loss_lkd = F.pick(m_distance, label, axis=1)
        l_gm_loss = loss_cls + self._lamda * loss_lkd
        return l_gm_loss, probability


class RangeLoss(Loss):
    r"""Range Loss from
    `"Range Loss for Deep Face Recognition with Long-tail"
    <https://arxiv.org/abs/1611.08976>`_ paper.

    The implementation refers to
    https://github.com/LeeJuly30/RangeLoss-For-Gluno/blob/master/RangeLossForGluon.py

    .. math::
        L = L_{SM} + \lambda (\alpha L_{R_{intra}} + \beta L_{R_{inter}})

    Parameters
    ----------
    alpha: float.
        Weight of L_Rintra.
    beta: float.
        Weight of L_Rinter.
    top_k: int.
        Compute the k largest ranges within each class; k=2 is recommended in the paper.
    num_class: int.
        Number of classes in each batch.
    num_in_class: int.
        Number of samples per class in each batch.
    margin: float.
        A hyperparameter giving the max optimization margin; center distances (DCenter) greater
        than this margin are excluded from the computation of the inter-class loss.

    TODO: this loss has not been tested and does not support hybridization yet.
    """

    def __init__(self, alpha, beta, top_k, num_class, num_in_class, feature_dim, margin, **kwargs):
        super(RangeLoss, self).__init__(weight=None, batch_axis=0, **kwargs)
        self._alpha = alpha
        self._beta = beta
        self._top_k = top_k
        self._num_class = num_class
        self._num_in_class = num_in_class
        self._margin = margin

    def _pair_distance(self, F, features):
        dot_product = F.dot(features, features.T)
        square_norm = F.sum(F.square(features), axis=1)
        distances = F.expand_dims(square_norm, 0) - 2.0 * dot_product + F.expand_dims(square_norm, 1)
        distances = F.maximum(distances, 0.0)
        mask = F.equal(distances, 0.0)
        distances = distances + mask * 1e-16
        distances = F.sqrt(distances)
        distances = distances * (1.0 - mask)
        return distances

    def _inter_class_loss(self, F, x, y):
        reshape_out = x.reshape((self._num_class, self._num_in_class, -1))
        centers = F.mean(reshape_out, axis=1)
        center_distance = self._pair_distance(F, centers)
        mask = F.array(
            1. - np.greater_equal.outer(np.arange(self._num_class), np.arange(self._num_class)).astype(np.float32))
        center_distance = center_distance * mask + (1. - mask) * 1e4
        center_distance = center_distance.reshape((-1,))
        inter_class_loss = F.maximum(self._margin - F.min(center_distance), 0)
        return inter_class_loss

    def _intra_class_loss(self, F, x, y):
        intra_class_loss = F.array([0.])
        for i in range(self._num_class):
            same_label_feature = x[i * self._num_in_class:(i + 1) * self._num_in_class, :]
            same_label_distance = self._pair_distance(F, same_label_feature)
            mask = F.array(
                1. - np.greater_equal.outer(np.arange(self._num_in_class), np.arange(self._num_in_class)).astype(
                    np.float32))
            same_label_distance = same_label_distance * mask
            same_label_distance = same_label_distance.reshape((-1,))
            top_k_distance = F.topk(same_label_distance, k=self._top_k, ret_typ='value', is_ascend=False)
            harmonic_mean = self._top_k / F.sum(1 / (top_k_distance + 1e-8))
            intra_class_loss = intra_class_loss + harmonic_mean
        return intra_class_loss

    def hybrid_forward(self, F, x, y):
        inter_class_loss = self._inter_class_loss(F, x, y)
        intra_class_loss = self._intra_class_loss(F, x, y)
        range_loss = self._alpha * inter_class_loss + self._beta * intra_class_loss
        return range_loss


class MPSLoss(Loss):
    r"""Computes the MPS Loss from
    `"DocFace: Matching ID Document Photos to Selfies"
    <https://arxiv.org/abs/1805.02283>`_ paper.


    Parameters
    ----------
    m: float
        Margin parameter for loss.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
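
    Example:
        A minimal usage sketch; row ``i`` of the two inputs is assumed to be a genuine
        pair (e.g. an ID photo and a selfie of the same person)::

            >>> import mxnet as mx
            >>> loss = MPSLoss(m=1.0)
            >>> feat_a = mx.nd.random.normal(shape=(4, 128))
            >>> feat_b = mx.nd.random.normal(shape=(4, 128))
            >>> loss(feat_a, feat_b).shape
            (4,)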
    """

    def __init__(self, m=1.0, **kwargs):
        super().__init__(weight=None, batch_axis=0, **kwargs)
        self.m = m

    @staticmethod
    def euclidean_distance(F, X, Y, sqrt=False):
        """Compute the distance between each X and Y.

        Args:
            X: a (m x d) tensor
            Y: a (d x n) tensor
            sqrt:

        Returns:
            diffs: an m x n distance matrix.
        """
        XX = F.sum(F.square(X), 1, keepdims=True)
        YY = F.sum(F.square(Y), 0, keepdims=True)
        XY = F.dot(X, Y)

        diffs = XX + YY - 2 * XY
        diffs = F.relu(diffs)
        if sqrt:
            diffs = F.sqrt(diffs)
        return diffs

    def hybrid_forward(self, F, pred1, pred2):
        pred1_norm = F.L2Normalization(pred1, mode="instance")
        pred2_norm = F.L2Normalization(pred2, mode="instance")

        # convert the squared Euclidean distance between normalized embeddings to a cosine similarity
        dist = -0.5 * self.euclidean_distance(F, pred1_norm, pred2_norm.transpose(), sqrt=False) + 1
        dist_pos = F.diag(dist)
        dist_neg = dist - F.diag(dist_pos)

        # get max dist between one image and others in a batch
        dist_neg_1 = F.expand_dims(F.max(dist_neg, axis=1), axis=1)
        dist_neg_2 = F.expand_dims(F.max(dist_neg, axis=0), axis=1)
        logits_neg = F.maximum(dist_neg_1, dist_neg_2)

        loss = (self.m + logits_neg - dist_pos) * 0.5
        return F.relu(loss)


class GitLoss(SoftmaxCrossEntropyLoss):
    r"""Computes the Git Loss from
    `"Git Loss for Deep Face Recognition"
    <https://arxiv.org/abs/1807.08512>`_ paper.

    This implementation requires the batch size to stay constant during training and
    validation. This is usually acceptable, since the last incomplete batch is commonly
    discarded during training and the loss does not need to be computed during validation.

    Parameters
    ----------
    classes: int.
        Number of classes.
    embedding_size: int.
        Size of feature.
    lamda_c: float.
        The loss weight enforcing a trade-off between the softmax loss and center loss.
    lamda_g: float.
        The loss weight enforcing a trade-off between the softmax loss and git loss.
    batch_size_per_gpu: int.
        The number of samples on each GPU or device, not the total batch size.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
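
    Example:
        A minimal usage sketch; the actual batch size on the device must equal
        ``batch_size_per_gpu`` (4 in this illustration)::

            >>> import mxnet as mx
            >>> loss = GitLoss(classes=10, embedding_size=128, lamda_c=0.1, lamda_g=0.1,
            ...                batch_size_per_gpu=4)
            >>> loss.initialize()
            >>> x = mx.nd.random.normal(shape=(4, 10))
            >>> embeddings = mx.nd.random.normal(shape=(4, 128))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(x, label, embeddings).shape
            (4,)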

    """

    def __init__(self, classes, embedding_size, lamda_c, lamda_g, batch_size_per_gpu,
                 weight_initializer=init.Xavier(magnitude=2.24), dtype='float32', **kwargs):
        super().__init__(**kwargs)
        self._lamda_c = lamda_c
        self._lamda_g = lamda_g
        self._classes = classes
        self._dtype = dtype
        self.centers = self.params.get('centers', shape=(classes, embedding_size), init=weight_initializer,
                                       dtype=dtype, allow_deferred_init=True)
        self.mask = self.params.get_constant('mask', np.expand_dims(1 - np.eye(int(batch_size_per_gpu)), axis=2))

    def hybrid_forward(self, F, x, label, embeddings, centers, mask, sample_weight=None):
        centers_selected = F.take(centers, label)

        # Softmax
        loss_sm = super().hybrid_forward(F, x, label, sample_weight)
        onehot_label = F.one_hot(label, depth=self._classes, dtype=self._dtype)

        # loss center
        label_hist = F.sum(onehot_label, axis=0)
        centers_count = F.take(label_hist, label)
        loss_c = F.sum(F.square(embeddings - centers_selected), 1) / centers_count

        # loss git
        diffs = F.broadcast_sub(F.expand_dims(embeddings, axis=1), F.expand_dims(centers_selected, 0))
        diffs = F.broadcast_mul(diffs, mask)
        loss_g = F.mean(1 / (1 + F.sum(F.square(diffs), axis=2)), axis=1)

        return loss_sm + self._lamda_c * 0.5 * loss_c + self._lamda_g * loss_g


class COCOLoss(SoftmaxCrossEntropyLoss):
    r"""Computes the COCO Loss from
    `"Rethinking Feature Discrimination and Polymerization for Large-scale Recognition"
    <https://arxiv.org/abs/1710.00870>`_ paper.

    This loss can be replaced by a NormDense layer with softmax, so using it is not recommended.


    Parameters
    ----------
    classes: int.
        Number of classes.
    embedding_size: int.
        Size of feature.
    alpha: float.
        The scaling parameter, a hypersphere with small alpha
        will limit surface area for embedding features.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
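
    Example:
        A minimal usage sketch; the block takes the embeddings directly and keeps its
        own learnable class centers::

            >>> import mxnet as mx
            >>> loss = COCOLoss(classes=10, embedding_size=128, alpha=16)
            >>> loss.initialize()
            >>> embeddings = mx.nd.random.normal(shape=(4, 128))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(embeddings, label).shape
            (4,)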

    """

    def __init__(self, classes, embedding_size, alpha,
                 weight_initializer=init.Xavier(magnitude=2.24), dtype='float32', **kwargs):
        super().__init__(**kwargs)
        self._alpha = alpha
        self._classes = classes
        self.centers = self.params.get('centers', shape=(classes, embedding_size), init=weight_initializer,
                                       dtype=dtype, allow_deferred_init=True)

    def hybrid_forward(self, F, embeddings, label, centers, sample_weight=None):
        norm_embs = self._alpha * F.L2Normalization(embeddings, mode='instance', name='fc1n')
        norm_centers = F.L2Normalization(centers, mode='instance', name='center_norm')
        outputs = F.dot(norm_embs, norm_centers, transpose_b=True)
        return super().hybrid_forward(F, outputs, label, sample_weight)


class SVXSoftmax(SoftmaxCrossEntropyLoss):
    r"""SVXSoftmax from
    `"Support Vector Guided Softmax Loss for Face Recognition"
    <https://arxiv.org/abs/1812.11317>`_ paper.

    With the default parameters, the SV-X-Softmax loss reduces to the original softmax loss.

    Parameters
    ----------
    classes: int.
        Number of classes.
    s: int.
        Scale parameter for loss.
    t: float.
        Indicator parameter of SV.
    m1: float.
        Margin parameter for sphere softmax.
    m2: float.
        Margin parameter for cos/am softmax.
    m3: float.
        Margin parameter for arc softmax.


    Outputs:
        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
          batch_axis are averaged out.
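
    Example:
        A minimal usage sketch; ``pred`` holds the cosine similarities, and the margins
        here are illustrative (m2 gives an AM-Softmax-style margin)::

            >>> import mxnet as mx
            >>> loss = SVXSoftmax(classes=10, s=64, t=1.2, m2=0.35)
            >>> pred = mx.nd.random.uniform(-1, 1, shape=(4, 10))
            >>> label = mx.nd.array([1, 0, 3, 2])
            >>> loss(pred, label).shape
            (4,)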
    """

    def __init__(self, classes, s, t=1, m1=1, m2=0, m3=0, dtype="float32", **kwargs):
        super().__init__(**kwargs)

        self._classes = classes
        self._scale = s
        self._m1, self._m2, self._m3 = m1, m2, m3
        self._t = t

        self._dtype = dtype

    def hybrid_forward(self, F, pred, label, sample_weight=None):
        cos_ty = F.pick(pred, label, axis=1, keepdims=True)
        cos_ty_m = F.cos(self._m1 * F.arccos(cos_ty) + self._m3) - self._m2

        indicator = F.Activation(data=F.broadcast_sub(pred, cos_ty_m), act_type='relu')
        fc = F.where(indicator, self._t * pred + self._t - 1, pred)

        diff = cos_ty_m - F.pick(fc, label, axis=1, keepdims=True)
        oh_label = F.one_hot(label, depth=self._classes, on_value=1.0, off_value=0.0, dtype=self._dtype)
        diff = F.broadcast_mul(oh_label, diff)

        fc = fc + diff
        fc = fc * self._scale
        return super().hybrid_forward(F, pred=fc, label=label, sample_weight=sample_weight)


_losses = {
    'softmax': SoftmaxCrossEntropyLoss,
    'arcface': ArcLoss,
    'triplet': TripletLoss,
    'ringloss': RingLoss,
    'cosloss': CosLoss,
    'l2softmax': L2Softmax,
    'asoftmax': ASoftmax,
    'centerloss': CenterLoss,
    'contrastiveloss': ContrastiveLoss,
    'lgmloss': LGMLoss,
    'mpsloss': MPSLoss,
    'gitloss': GitLoss,
    'cocoloss': COCOLoss,
    "svxsoftmax": SVXSoftmax,
}


def get_loss(name, **kwargs):
    """Return the loss by name.

    Parameters
    ----------
    name : str.
        Available loss name in gluon face.
    kwargs : str.
        Check the docs for details.

    Returns
    -------
    HybridBlock
        The loss.
    """
    name = name.lower()
    if name not in _losses:
        err_str = '"%s" is not among the following losses list:\n\t' % (name)
        err_str += '%s' % ('\n\t'.join(sorted(_losses.keys())))
        raise ValueError(err_str)
    loss = _losses[name](**kwargs)
    return loss


def get_loss_list():
    """Get the entire list of loss names in losses.

    Returns
    -------
    list of str.
        Entire list of loss names in losses.
    """
    return list(_losses.keys())
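

# A minimal usage sketch for the two helpers above (hyper-parameters are illustrative;
# each loss class documents its own constructor arguments):
#
#     from gluonfr.loss import get_loss, get_loss_list
#
#     print(get_loss_list())                      # names accepted by get_loss
#     arc_loss = get_loss("arcface", classes=10)  # equivalent to ArcLoss(classes=10)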