/*******************************************************************************
 * Copyright 2016 Intel Corporation.
 *
 *
 * This software and the related documents are Intel copyrighted materials, and your use of them is governed by
 * the express license under which they were provided to you ('License'). Unless the License provides otherwise,
 * you may not use, modify, copy, publish, distribute, disclose or transmit this software or the related
 * documents without Intel's prior written permission.
 * This software and the related documents are provided as is, with no express or implied warranties, other than
 * those that are expressly stated in the License.
 *******************************************************************************/

/* Intel(R) Integrated Performance Primitives (Intel(R) IPP) */

#include "pifilbilbrd_t.h"
/*
 * Chooses how the destination ROI is cut into tiles for threaded processing.
 *
 *   dstRoiSize   size of the full destination ROI
 *   maskSize     not used by this routine
 *   numThreads   number of threads; used as a divisor, so it must be non-zero
 *   pTileSize    receives the size of a regular tile
 *   pLastSize    receives the size of the last tile
 *   splitImage   receives the tile grid; reset to 1 x 1 before any splitting
 *
 * The ROI is kept as a single tile when its height is non-zero and either
 * only one thread is available or the per-thread share of rows is below
 * TYLE_S. Otherwise full-width strips of TYLE_S rows are requested from
 * ippiSplitToTiles_T, which fills the three outputs.
 */
static void ownGetBilateralSliceSize(IppiSize dstRoiSize, IppiSize maskSize, Ipp32u numThreads, IppiSize *pTileSize, IppiSize *pLastSize,
                                     IppiPoint *splitImage)
{
    const int rowsPerThread = dstRoiSize.height / (int)numThreads;
    const int keepWhole = (numThreads == 1) || (rowsPerThread < TYLE_S);

    splitImage->x = 1;
    splitImage->y = 1;

    if (keepWhole && dstRoiSize.height) {
        /* One tile covering the whole ROI. */
        pTileSize->width = dstRoiSize.width;
        pLastSize->width = dstRoiSize.width;
        pTileSize->height = dstRoiSize.height;
        pLastSize->height = dstRoiSize.height;
        return;
    }

    /* Full-width horizontal strips of TYLE_S rows. */
    IppiSize strip;
    strip.width = dstRoiSize.width;
    strip.height = TYLE_S;
    ippiSplitToTiles_T(dstRoiSize, strip, splitImage, pTileSize, pLastSize);
}
/* /////////////////////////////////////////////////////////////////////////////
//                     Bilateral filter functions with Border
// /////////////////////////////////////////////////////////////////////////////
//  Name:       ippiFilterBilateralGetBufferSize_T
//  Purpose:    to define buffer size for bilateral filter
//  Parameters:
//   filter        Type of bilateral filter. Possible value is ippiFilterBilateralGauss.
//   dstRoiSize    ROI size (in pixels) of the destination image that will be
//                 processed.
//   kernelWidthHeight
//                 Width and height of the square filter kernel.
//   dataType      Data type of the source and destination images. Possible values
//                 are Ipp8u and Ipp32f.
//   numChannels   Number of channels in the images. Possible values are 1 and 3.
//   distMethodType
//                 The method used to measure the distance between pixel intensities.
//                 Possible value is ippDistNormL1.
//   pSpecSizeL    Pointer to the size (in bytes) of the spec.
//   pBufferSize   Pointer to the size (in bytes) of the external work buffer.
//  Return:
//    ippStsNoErr               OK
//    ippStsNullPtrErr          any pointer is NULL
//    ippStsSizeErr             size of dstRoiSize is less or equal 0
//    ippStsMaskSizeErr         kernelWidthHeight is less or equal 0
//    ippStsNotSupportedModeErr filter or distMethod is not supported
//    ippStsDataTypeErr         Indicates an error when dataType has an illegal value.
//    ippStsNumChannelsErr      Indicates an error when numChannels has an illegal value.
*/
IPPFUN(IppStatus, ippiFilterBilateralGetBufferSize_T,
       (IppiFilterBilateralType filter, IppiSize dstRoiSize, int kernelWidthHeight, IppDataType dataType, int numChannels,
        IppiDistanceMethodType distMethodType, int *pSpecSizeL, int *pBufferSize))
{
    IppiSize tile = {0, 0};
    IppiSize lastTile = {0, 0};
    IppiPoint grid = {0, 0};
    IppiSize kernel;
    Ipp32s threads;
    int specBytes;
    int scratchBytes;
    IppStatus sts;

    if (pSpecSizeL == 0 || pBufferSize == 0) {
        return ippStsNullPtrErr;
    }
    if (dstRoiSize.width <= 0 || dstRoiSize.height <= 0) {
        return ippStsSizeErr;
    }

    kernel.width = kernelWidthHeight;
    kernel.height = kernelWidthHeight;

    ippGetNumThreads_T(&threads);
    ownGetBilateralSliceSize(dstRoiSize, kernel, threads, &tile, &lastTile, &grid);

    /* Size the per-tile requirements for the larger of the regular and last tile. */
    if (lastTile.width < tile.width) {
        lastTile.width = tile.width;
    }
    if (lastTile.height < tile.height) {
        lastTile.height = tile.height;
    }

    sts = ippiFilterBilateralGetBufferSize(filter, lastTile, kernelWidthHeight, dataType, numChannels, distMethodType, &specBytes, &scratchBytes);
    if (sts < 0) {
        return sts;
    }

    /* The threaded spec is a BilateralInfo header followed by the single-threaded spec. */
    *pSpecSizeL = specBytes + sizeof(BilateralInfo);
    /* One scratch area per tile of the grid.
     * NOTE(review): the tile workers in this file address scratch by thread index,
     * not tile index - confirm a thread index can never reach the tile count. */
    *pBufferSize = scratchBytes * grid.y * grid.x;
    return sts;
}

/* /////////////////////////////////////////////////////////////////////////////
//  Name:       ippiFilterBilateralInit_T
//  Purpose:    initialization of Spec for bilateral filter with border
//  Parameters:
//   filter           Type of bilateral filter. Possible value is ippiFilterBilateralGauss.
//   dstRoiSize       Size of the destination ROI
//   kernelWidthHeight
//                    Width and height of the square filter kernel.
//   dataType         Data type of the source and destination images. Possible values
//                    are Ipp8u and Ipp32f.
//   numChannels      Number of channels in the images. Possible values are 1 and 3.
//   distMethod       The method used to measure the distance between pixel intensities.
//                    Possible value is ippDistNormL1.
//   valSquareSigma   square of Sigma for factor function for pixel intensity
//   posSquareSigma   square of Sigma for factor function for pixel position
//    pSpec           pointer to Spec
//  Return:
//    ippStsNoErr               OK
//    ippStsNullPtrErr          pointer to Spec is NULL
//    ippStsSizeErr             size of dstRoiSize is less than or equal to 0
//    ippStsMaskSizeErr         kernelWidthHeight is less than or equal to 0
//    ippStsNotSupportedModeErr filter or distMethod is not supported
//    ippStsDataTypeErr         Indicates an error when dataType has an illegal value.
//    ippStsNumChannelsErr      Indicates an error when numChannels has an illegal value.
//    ippStsBadArgErr           valSquareSigma or posSquareSigma is less or equal 0
*/
IPPFUN(IppStatus, ippiFilterBilateralInit_T,
       (IppiFilterBilateralType filter, IppiSize dstRoiSize, int kernelWidthHeight, IppDataType dataType, int numChannels,
        IppiDistanceMethodType distMethod, Ipp64f valSquareSigma, Ipp64f posSquareSigma, IppiFilterBilateralSpec_T *pSpecL))
{
    IppStatus status = ippStsNoErr;
    Ipp32u numThreads;
    IppiSize dstRoiSize32;
    BilateralInfo *pBilateralInfo = 0;
    IppiSize pTileSize = {0, 0}, pLastSize = {0, 0};
    IppiPoint splitImage;
    IppiSize maskSize;
    /* Zero-initialised so nothing indeterminate can ever reach the spec. */
    int pSpecSize = 0;
    int pBufSize = 0;
    int width = dstRoiSize.width;
    int height = dstRoiSize.height;

    if (pSpecL == 0)
        return ippStsNullPtrErr;
    if (width <= 0 || height <= 0)
        return ippStsSizeErr;

    maskSize.height = maskSize.width = kernelWidthHeight;
    /* The single-threaded spec is initialised with a fixed 21 x 21 ROI instead of dstRoiSize.
     * NOTE(review): presumably the spec contents do not depend on the ROI - confirm. */
    dstRoiSize32.height = dstRoiSize32.width = 21;
    splitImage.x = splitImage.y = 0;

    ippGetNumThreads_T((int *)&numThreads);

    /* Same tiling decision as ippiFilterBilateralGetBufferSize_T. */
    ownGetBilateralSliceSize(dstRoiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    /* Per-tile scratch size, remembered in the spec header for the processing step. */
    status = ippiFilterBilateralGetBufferSize(ippiFilterBilateralGauss, pLastSize, kernelWidthHeight, dataType, numChannels, ippDistNormL1,
                                              &pSpecSize, &pBufSize);
    /* Previously this status was overwritten unchecked and an unset pBufSize was stored. */
    if (status < 0)
        return status;

    /* Layout of the threaded spec: BilateralInfo header, then the single-threaded spec. */
    pBilateralInfo = (BilateralInfo *)pSpecL;
    pSpecL = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpecL + sizeof(BilateralInfo));
    status = ippiFilterBilateralInit(filter, dstRoiSize32, kernelWidthHeight, dataType, numChannels, distMethod, valSquareSigma, posSquareSigma,
                                     (IppiFilterBilateralSpec *)pSpecL);

    /* Record the tiling so the tile workers can reconstruct it. */
    pBilateralInfo->bufsize = pBufSize;
    pBilateralInfo->lastTile.width = pLastSize.width;
    pBilateralInfo->lastTile.height = pLastSize.height;
    pBilateralInfo->tileSize.width = pTileSize.width;
    pBilateralInfo->tileSize.height = pTileSize.height;
    pBilateralInfo->split.x = splitImage.x;
    pBilateralInfo->split.y = splitImage.y;

    pBilateralInfo->radius = kernelWidthHeight;
    return status;
}

/* /////////////////////////////////////////////////////////////////////////////
//  Name:       ippiFilterBilateralBorder_8u_C1R_T
//              ippiFilterBilateralBorder_8u_C3R_T
//              ippiFilterBilateralBorder_32f_C1R_T
//              ippiFilterBilateralBorder_32f_C3R_T
//  Purpose:    bilateral filter
//  Parameters:
//    pSrc         Pointer to the source image
//    srcStep      Step through the source image
//    pDst         Pointer to the destination image
//    dstStep      Step through the destination image
//    dstRoiSize   Size of the destination ROI
//    radius       Radius of circular neighborhood what defines pixels for calculation.
//    borderType   Type of border.
//    borderValue  Pointer to constant value to assign to pixels of the constant border. This parameter is applicable
//                 only to the ippBorderConst border type. If this pointer is NULL then the constant value is equal to 0.
//    pSpec        Pointer to filter spec
//    pBuffer      Pointer to work buffer
//  Return:
//    ippStsNoErr           OK
//    ippStsNullPtrErr      pointer to Src, Dst, Spec or Buffer is NULL
//    ippStsSizeErr         size of dstRoiSize is less or equal 0
//    ippStsContextMatchErr filter Spec does not match
//    ippStsNotEvenStepErr  Indicates an error when one of the step values is not divisible by 4
//                          for floating-point images.
//    ippStsBorderErr       Indicates an error when borderType has illegal value.
*/

static IppStatus ippiFilterBilateral_8u_C1R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_T_Str *ts = (ippiFilterBilateral_T_Str *)arg;
    const Ipp8u *pSrc = (const Ipp8u *)ts->pSrc; // const
    int srcStep = ts->srcStep;
    Ipp8u *pDst = ts->pDst;
    int dstStep = ts->dstStep;
    IppiBorderType border = ts->border;
    Ipp8u *borderValue = ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp8u *pSrcRoi;
    Ipp8u *pDstRoi;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc = 0;
    int tileOffsetDst = 0;

    h = t / splitImage.x;
    w = t % splitImage.x;
    pSrcRoi = (Ipp8u *)((Ipp8u *)(pSrc + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep);
    pDstRoi = (Ipp8u *)((Ipp8u *)(pDst + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep);
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            tileOffsetSrc = (firstGreaterIndex + h - splitImage.y) * srcStep;
            tileOffsetDst = (firstGreaterIndex + h - splitImage.y) * dstStep;
        }
    } else {
        if (pLastSize.height && (h == (splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if ((splitImage.y > 1)) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if ((splitImage.x > 1)) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    pSrcRoi += tileOffsetSrc;
    pDstRoi += tileOffsetDst;

    return ippiFilterBilateral_8u_C1R(pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_8u_C3R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_T_Str *ts = (ippiFilterBilateral_T_Str *)arg;
    const Ipp8u *pSrc = (const Ipp8u *)ts->pSrc; // const
    int srcStep = ts->srcStep;
    Ipp8u *pDst = ts->pDst;
    int dstStep = ts->dstStep;
    IppiBorderType border = ts->border;
    Ipp8u *borderValue = ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc = 0;
    int tileOffsetDst = 0;

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp8u *pSrcRoi;
    Ipp8u *pDstRoi;

    h = t / splitImage.x;
    w = t % splitImage.x;
    pSrcRoi = (Ipp8u *)((Ipp8u *)(pSrc + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep);
    pDstRoi = (Ipp8u *)((Ipp8u *)(pDst + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep);
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (int)(splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            tileOffsetSrc = (firstGreaterIndex + h - splitImage.y) * srcStep;
            tileOffsetDst = (firstGreaterIndex + h - splitImage.y) * dstStep;
        }
    } else {
        if (pLastSize.height && (h == (int)(splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (int)(splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if ((splitImage.y > 1)) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (int)(splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if ((splitImage.x > 1)) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (int)(splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    pSrcRoi += tileOffsetSrc;
    pDstRoi += tileOffsetDst;

    return ippiFilterBilateral_8u_C3R(pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_32f_C1R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_T_Str *ts = (ippiFilterBilateral_T_Str *)arg;
    const Ipp32f *pSrc = (const Ipp32f *)ts->pSrc; // const
    int srcStep = ts->srcStep;
    Ipp32f *pDst = (Ipp32f *)ts->pDst;
    int dstStep = ts->dstStep;
    IppiBorderType border = ts->border;
    Ipp32f *borderValue = (Ipp32f *)ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp32f *pSrcRoi;
    Ipp32f *pDstRoi;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc = 0;
    int tileOffsetDst = 0;

    h = t / splitImage.x;
    w = t % splitImage.x;
    pSrcRoi = (Ipp32f *)((Ipp8u *)(pSrc + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep);
    pDstRoi = (Ipp32f *)((Ipp8u *)(pDst + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep);
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (int)(splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            tileOffsetSrc = (firstGreaterIndex + h - splitImage.y) * srcStep;
            tileOffsetDst = (firstGreaterIndex + h - splitImage.y) * dstStep;
        }
    } else {
        if (pLastSize.height && (h == (int)(splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (int)(splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if ((splitImage.y > 1)) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (int)(splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if ((splitImage.x > 1)) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (int)(splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    pSrcRoi += tileOffsetSrc / sizeof(Ipp32f);
    pDstRoi += tileOffsetDst / sizeof(Ipp32f);

    return ippiFilterBilateral_32f_C1R(pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_32f_C3R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_T_Str *ts = (ippiFilterBilateral_T_Str *)arg;
    const Ipp32f *pSrc = (const Ipp32f *)ts->pSrc; // const
    int srcStep = ts->srcStep;
    Ipp32f *pDst = (Ipp32f *)ts->pDst;
    int dstStep = ts->dstStep;
    IppiBorderType border = ts->border;
    Ipp32f *borderValue = (Ipp32f *)ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc = 0;
    int tileOffsetDst = 0;

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp32f *pSrcRoi;
    Ipp32f *pDstRoi;

    h = t / splitImage.x;
    w = t % splitImage.x;
    pSrcRoi = (Ipp32f *)((Ipp8u *)(pSrc + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep);
    pDstRoi = (Ipp32f *)((Ipp8u *)(pDst + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep);
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (int)(splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            tileOffsetSrc = (firstGreaterIndex + h - splitImage.y) * srcStep;
            tileOffsetDst = (firstGreaterIndex + h - splitImage.y) * dstStep;
        }
    } else {
        if (pLastSize.height && (h == (int)(splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (int)(splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if ((splitImage.y > 1)) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (int)(splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if ((splitImage.x > 1)) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (int)(splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    pSrcRoi += tileOffsetSrc / sizeof(Ipp32f);
    pDstRoi += tileOffsetDst / sizeof(Ipp32f);

    return ippiFilterBilateral_32f_C3R(pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_64f_C1R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_T_Str *ts = (ippiFilterBilateral_T_Str *)arg;
    const Ipp64f *pSrc = (const Ipp64f *)ts->pSrc; // const
    int srcStep = ts->srcStep;
    Ipp64f *pDst = (Ipp64f *)ts->pDst;
    int dstStep = ts->dstStep;
    IppiBorderType border = ts->border;
    Ipp64f *borderValue = (Ipp64f *)ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp64f *pSrcRoi;
    Ipp64f *pDstRoi;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc = 0;
    int tileOffsetDst = 0;

    h = t / splitImage.x;
    w = t % splitImage.x;
    pSrcRoi = (Ipp64f *)((Ipp8u *)(pSrc + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep);
    pDstRoi = (Ipp64f *)((Ipp8u *)(pDst + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep);
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (int)(splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            tileOffsetSrc = (firstGreaterIndex + h - splitImage.y) * srcStep;
            tileOffsetDst = (firstGreaterIndex + h - splitImage.y) * dstStep;
        }
    } else {
        if (pLastSize.height && (h == (int)(splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (int)(splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if ((splitImage.y > 1)) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (int)(splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if ((splitImage.x > 1)) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (int)(splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    pSrcRoi += tileOffsetSrc / sizeof(Ipp64f);
    pDstRoi += tileOffsetDst / sizeof(Ipp64f);

    return ippiFilterBilateral_64f_C1R(pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_64f_C3R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_T_Str *ts = (ippiFilterBilateral_T_Str *)arg;
    const Ipp64f *pSrc = (const Ipp64f *)ts->pSrc; // const
    int srcStep = ts->srcStep;
    Ipp64f *pDst = (Ipp64f *)ts->pDst;
    int dstStep = ts->dstStep;
    IppiBorderType border = ts->border;
    Ipp64f *borderValue = (Ipp64f *)ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc = 0;
    int tileOffsetDst = 0;

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp64f *pSrcRoi;
    Ipp64f *pDstRoi;

    h = t / splitImage.x;
    w = t % splitImage.x;
    pSrcRoi = (Ipp64f *)((Ipp8u *)(pSrc + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep);
    pDstRoi = (Ipp64f *)((Ipp8u *)(pDst + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep);
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (int)(splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            tileOffsetSrc = (firstGreaterIndex + h - splitImage.y) * srcStep;
            tileOffsetDst = (firstGreaterIndex + h - splitImage.y) * dstStep;
        }
    } else {
        if (pLastSize.height && (h == (int)(splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (int)(splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if ((splitImage.y > 1)) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (int)(splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if ((splitImage.x > 1)) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (int)(splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    pSrcRoi += tileOffsetSrc / sizeof(Ipp64f);
    pDstRoi += tileOffsetDst / sizeof(Ipp64f);

    return ippiFilterBilateral_64f_C3R(pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_8u_P3R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_P3_T_Str *ts = (ippiFilterBilateral_P3_T_Str *)arg;
    const Ipp8u *pSrc[3] /*= (const Ipp8u *)ts->pSrc*/; // const
    int srcStep[3] /* = ts->srcStep*/;
    Ipp8u *pDst[3] /* = ts->pDst*/;
    int dstStep[3] /* = ts->dstStep*/;
    IppiBorderType border = ts->border;
    Ipp8u *borderValue = ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc[3] = {0};
    int tileOffsetDst[3] = {0};

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp8u *pSrcRoi[3];
    Ipp8u *pDstRoi[3];
    int n;
    h = t / splitImage.x;
    w = t % splitImage.x;
    for (n = 0; n < 3; n++) {
        pSrc[n] = (const Ipp8u *)ts->pSrc[n]; // const
        srcStep[n] = ts->srcStep[n];
        pDst[n] = ts->pDst[n];
        dstStep[n] = ts->dstStep[n];
        pSrcRoi[n] = (Ipp8u *)((Ipp8u *)(pSrc[n] + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep[n]);
        pDstRoi[n] = (Ipp8u *)((Ipp8u *)(pDst[n] + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep[n]);
    }
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            for (n = 0; n < 3; n++) {
                tileOffsetSrc[n] = (firstGreaterIndex + h - splitImage.y) * srcStep[n];
                tileOffsetDst[n] = (firstGreaterIndex + h - splitImage.y) * dstStep[n];
            }
        }
    } else {
        if (pLastSize.height && (h == (splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if (splitImage.y > 1) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if (splitImage.x > 1) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    for (n = 0; n < 3; n++) {
        pSrcRoi[n] += tileOffsetSrc[n];
        pDstRoi[n] += tileOffsetDst[n];
    }

    return ippiFilterBilateral_8u_P3R((const Ipp8u **)pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue,
                                      (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_32f_P3R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_P3_T_Str *ts = (ippiFilterBilateral_P3_T_Str *)arg;
    const Ipp32f *pSrc[3] /*= (const Ipp8u *)ts->pSrc*/; // const
    int srcStep[3] /* = ts->srcStep*/;
    Ipp32f *pDst[3] /* = ts->pDst*/;
    int dstStep[3] /* = ts->dstStep*/;
    IppiBorderType border = ts->border;
    Ipp32f *borderValue = (Ipp32f *)ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc[3] = {0};
    int tileOffsetDst[3] = {0};

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp32f *pSrcRoi[3];
    Ipp32f *pDstRoi[3];
    int n;
    h = t / splitImage.x;
    w = t % splitImage.x;
    for (n = 0; n < 3; n++) {
        pSrc[n] = (const Ipp32f *)ts->pSrc[n]; // const
        srcStep[n] = ts->srcStep[n];
        pDst[n] = (Ipp32f *)ts->pDst[n];
        dstStep[n] = ts->dstStep[n];
        pSrcRoi[n] = (Ipp32f *)((Ipp8u *)(pSrc[n] + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep[n]);
        pDstRoi[n] = (Ipp32f *)((Ipp8u *)(pDst[n] + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep[n]);
    }
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            for (n = 0; n < 3; n++) {
                tileOffsetSrc[n] = (firstGreaterIndex + h - splitImage.y) * srcStep[n];
                tileOffsetDst[n] = (firstGreaterIndex + h - splitImage.y) * dstStep[n];
            }
        }
    } else {
        if (pLastSize.height && (h == (splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if (splitImage.y > 1) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if (splitImage.x > 1) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    for (n = 0; n < 3; n++) {
        pSrcRoi[n] += tileOffsetSrc[n] / sizeof(Ipp32f);
        pDstRoi[n] += tileOffsetDst[n] / sizeof(Ipp32f);
    }

    return ippiFilterBilateral_32f_P3R((const Ipp32f **)pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue,
                                       (IppiFilterBilateralSpec *)pSpec, pBuf);
}
static IppStatus ippiFilterBilateral_64f_P3R_T_Fun(int t, void *arg)
{
    ippiFilterBilateral_P3_T_Str *ts = (ippiFilterBilateral_P3_T_Str *)arg;
    const Ipp64f *pSrc[3] /*= (const Ipp8u *)ts->pSrc*/; // const
    int srcStep[3] /* = ts->srcStep*/;
    Ipp64f *pDst[3] /* = ts->pDst*/;
    int dstStep[3] /* = ts->dstStep*/;
    IppiBorderType border = ts->border;
    Ipp64f *borderValue = (Ipp64f *)ts->borderValue;
    const IppiFilterBilateralSpec_T *pSpec = (const IppiFilterBilateralSpec_T *)ts->pSpec; // const
    Ipp8u *pBuffer = ts->pBuffer;
    int bufSize = ts->bufSize;
    int numChannels = ts->numChannels;
    IppiPoint splitImage = ts->splitImage;
    IppiSize pTileSize = ts->pTileSize;
    IppiSize pLastSize = ts->pLastSize;
    int threadIdx = 0;
    int firstGreaterIndex;
    int tileOffsetSrc[3] = {0};
    int tileOffsetDst[3] = {0};

    IppiSize roiSizeS;
    roiSizeS.height = pTileSize.height;
    int w, h;
    IppiBorderType borderTrd = border;
    IppiBorderType borderTrdW = borderTrd;
    Ipp64f *pSrcRoi[3];
    Ipp64f *pDstRoi[3];
    int n;
    h = t / splitImage.x;
    w = t % splitImage.x;
    for (n = 0; n < 3; n++) {
        pSrc[n] = (const Ipp64f *)ts->pSrc[n]; // const
        srcStep[n] = ts->srcStep[n];
        pDst[n] = (Ipp64f *)ts->pDst[n];
        dstStep[n] = ts->dstStep[n];
        pSrcRoi[n] = (Ipp64f *)((Ipp8u *)(pSrc[n] + w * pTileSize.width * numChannels) + h * pTileSize.height * srcStep[n]);
        pDstRoi[n] = (Ipp64f *)((Ipp8u *)(pDst[n] + w * pTileSize.width * numChannels) + h * pTileSize.height * dstStep[n]);
    }
    roiSizeS.height = pTileSize.height;
    firstGreaterIndex = pLastSize.height - pTileSize.height;
    if ((firstGreaterIndex < splitImage.y) && (pLastSize.height > pTileSize.height) && (firstGreaterIndex > 0)) {
        if (h >= (splitImage.y - firstGreaterIndex)) {
            roiSizeS.height = pTileSize.height + 1;
            for (n = 0; n < 3; n++) {
                tileOffsetSrc[n] = (firstGreaterIndex + h - splitImage.y) * srcStep[n];
                tileOffsetDst[n] = (firstGreaterIndex + h - splitImage.y) * dstStep[n];
            }
        }
    } else {
        if (pLastSize.height && (h == (splitImage.y - 1))) {
            roiSizeS.height = pLastSize.height;
        }
    }
    roiSizeS.width = pTileSize.width;
    if (pLastSize.width && (w == (splitImage.x - 1)))
        roiSizeS.width = pLastSize.width;
    if (splitImage.y > 1) {
        if (h == 0)
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom);
        else if (h == (splitImage.y - 1))
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemTop);
        else
            borderTrd = (IppiBorderType)((int)border | (int)ippBorderInMemBottom | (int)ippBorderInMemTop);
    }
    borderTrdW = borderTrd;
    if (splitImage.x > 1) {
        if (w == 0)
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight);
        else if (w == (splitImage.x - 1))
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemLeft);
        else
            borderTrdW = (IppiBorderType)((int)borderTrd | (int)ippBorderInMemRight | (int)ippBorderInMemLeft);
    }
    /* Intel IPP function call */

    ippGetThreadIdx_T(&threadIdx);
    Ipp8u *pBuf = pBuffer + bufSize * threadIdx;

    for (n = 0; n < 3; n++) {
        pSrcRoi[n] += tileOffsetSrc[n] / sizeof(Ipp64f);
        pDstRoi[n] += tileOffsetDst[n] / sizeof(Ipp64f);
    }

    return ippiFilterBilateral_64f_P3R((const Ipp64f **)pSrcRoi, srcStep, pDstRoi, dstStep, roiSizeS, borderTrdW, borderValue,
                                       (IppiFilterBilateralSpec *)pSpec, pBuf);
}

/*
 * ippiFilterBilateral_8u_C1R_T
 * Threaded bilateral filter with border for an 8u, one-channel image.
 *
 *  pSrc, srcStep  - source image and its step
 *  pDst, dstStep  - destination image and its step
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value, forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_8u_C1R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null pSrc/pDst/pSpec/pBuffer, ippStsSizeErr for
 * a non-positive ROI dimension, otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_8u_C1R_T,
       (const Ipp8u *pSrc, int srcStep, Ipp8u *pDst, int dstStep, IppiSize roiSize, IppiBorderType border, const Ipp8u borderValue[1],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 1;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_8u_C1R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, border, (Ipp8u *)borderValue,
                                          (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                          &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_8u_C1R_T_Fun);
}
/*
 * ippiFilterBilateral_8u_C3R_T
 * Threaded bilateral filter with border for an 8u, three-channel image.
 *
 *  pSrc, srcStep  - source image and its step
 *  pDst, dstStep  - destination image and its step
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value (three entries), forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_8u_C3R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null pSrc/pDst/pSpec/pBuffer, ippStsSizeErr for
 * a non-positive ROI dimension, otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_8u_C3R_T,
       (const Ipp8u *pSrc, int srcStep, Ipp8u *pDst, int dstStep, IppiSize roiSize, IppiBorderType border, const Ipp8u borderValue[3],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 3;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_8u_C3R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, border, (Ipp8u *)borderValue,
                                          (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                          &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_8u_C3R_T_Fun);
}
/*
 * ippiFilterBilateral_32f_C1R_T
 * Threaded bilateral filter with border for a 32f, one-channel image.
 *
 *  pSrc, srcStep  - source image and its step
 *  pDst, dstStep  - destination image and its step
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value, forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_32f_C1R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null pSrc/pDst/pSpec/pBuffer, ippStsSizeErr for
 * a non-positive ROI dimension, otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_32f_C1R_T,
       (const Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize, IppiBorderType border, const Ipp32f borderValue[1],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 1;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_32f_C1R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec,
                                           pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, border, (Ipp8u *)borderValue,
                                          (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                          &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_32f_C1R_T_Fun);
}
/*
 * ippiFilterBilateral_32f_C3R_T
 * Threaded bilateral filter with border for a 32f, three-channel image.
 *
 *  pSrc, srcStep  - source image and its step
 *  pDst, dstStep  - destination image and its step
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value (three entries), forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_32f_C3R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null pSrc/pDst/pSpec/pBuffer, ippStsSizeErr for
 * a non-positive ROI dimension, otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_32f_C3R_T,
       (const Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize, IppiBorderType border, const Ipp32f borderValue[3],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 3;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_32f_C3R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec,
                                           pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, border, (Ipp8u *)borderValue,
                                          (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                          &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_32f_C3R_T_Fun);
}
/*
 * ippiFilterBilateral_64f_C1R_T
 * Threaded bilateral filter with border for a 64f, one-channel image.
 *
 *  pSrc, srcStep  - source image and its step
 *  pDst, dstStep  - destination image and its step
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value, forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_64f_C1R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null pSrc/pDst/pSpec/pBuffer, ippStsSizeErr for
 * a non-positive ROI dimension, otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_64f_C1R_T,
       (const Ipp64f *pSrc, int srcStep, Ipp64f *pDst, int dstStep, IppiSize roiSize, IppiBorderType border, const Ipp64f borderValue[1],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 1;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_64f_C1R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec,
                                           pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, border, (Ipp8u *)borderValue,
                                          (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                          &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_64f_C1R_T_Fun);
}
/*
 * ippiFilterBilateral_64f_C3R_T
 * Threaded bilateral filter with border for a 64f, three-channel image.
 *
 *  pSrc, srcStep  - source image and its step
 *  pDst, dstStep  - destination image and its step
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value (three entries), forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_64f_C3R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null pSrc/pDst/pSpec/pBuffer, ippStsSizeErr for
 * a non-positive ROI dimension, otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_64f_C3R_T,
       (const Ipp64f *pSrc, int srcStep, Ipp64f *pDst, int dstStep, IppiSize roiSize, IppiBorderType border, const Ipp64f borderValue[3],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 3;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_64f_C3R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec,
                                           pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, border, (Ipp8u *)borderValue,
                                          (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                          &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_64f_C3R_T_Fun);
}
/*
 * ippiFilterBilateral_8u_P3R_T
 * Threaded bilateral filter with border for an 8u, three-plane image.
 *
 *  pSrc, srcStep  - three source plane pointers and their steps
 *  pDst, dstStep  - three destination plane pointers and their steps
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value (three entries), forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_8u_P3R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null array, plane pointer, pSpec or pBuffer and
 * also for a zero step, ippStsSizeErr for a non-positive ROI dimension,
 * otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_8u_P3R_T,
       (const Ipp8u *pSrc[3], int srcStep[3], Ipp8u *pDst[3], int dstStep[3], IppiSize roiSize, IppiBorderType border, const Ipp8u borderValue[3],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 3;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_P3_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (pSrc[0] == 0 || pSrc[1] == 0 || pSrc[2] == 0)
        return ippStsNullPtrErr;
    if (pDst[0] == 0 || pDst[1] == 0 || pDst[2] == 0)
        return ippStsNullPtrErr;
    if (srcStep == 0 || dstStep == 0)
        return ippStsNullPtrErr;
    /* NOTE(review): a zero step is reported with the null-pointer status;
     * kept as is so that callers see the same codes as before. */
    if (srcStep[0] == 0 || srcStep[1] == 0 || srcStep[2] == 0)
        return ippStsNullPtrErr;
    if (dstStep[0] == 0 || dstStep[1] == 0 || dstStep[2] == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_8u_P3R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec, pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode_P3((Ipp8u **)pSrc, srcStep, (Ipp8u **)pDst, dstStep, border, (Ipp8u *)borderValue,
                                             (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                             &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_8u_P3R_T_Fun);
}
/*
 * ippiFilterBilateral_32f_P3R_T
 * Threaded bilateral filter with border for a 32f, three-plane image.
 *
 *  pSrc, srcStep  - three source plane pointers and their steps
 *  pDst, dstStep  - three destination plane pointers and their steps
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value (three entries), forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_32f_P3R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null array, plane pointer, pSpec or pBuffer and
 * also for a zero step, ippStsSizeErr for a non-positive ROI dimension,
 * otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_32f_P3R_T,
       (const Ipp32f *pSrc[3], int srcStep[3], Ipp32f *pDst[3], int dstStep[3], IppiSize roiSize, IppiBorderType border, const Ipp32f borderValue[3],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 3;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_P3_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (pSrc[0] == 0 || pSrc[1] == 0 || pSrc[2] == 0)
        return ippStsNullPtrErr;
    if (pDst[0] == 0 || pDst[1] == 0 || pDst[2] == 0)
        return ippStsNullPtrErr;
    if (srcStep == 0 || dstStep == 0)
        return ippStsNullPtrErr;
    /* NOTE(review): a zero step is reported with the null-pointer status;
     * kept as is so that callers see the same codes as before. */
    if (srcStep[0] == 0 || srcStep[1] == 0 || srcStep[2] == 0)
        return ippStsNullPtrErr;
    if (dstStep[0] == 0 || dstStep[1] == 0 || dstStep[2] == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_32f_P3R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec,
                                           pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode_P3((Ipp8u **)pSrc, srcStep, (Ipp8u **)pDst, dstStep, border, (Ipp8u *)borderValue,
                                             (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                             &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_32f_P3R_T_Fun);
}
/*
 * ippiFilterBilateral_64f_P3R_T
 * Threaded bilateral filter with border for a 64f, three-plane image.
 *
 *  pSrc, srcStep  - three source plane pointers and their steps
 *  pDst, dstStep  - three destination plane pointers and their steps
 *  roiSize        - size of the processed region, both dimensions must be > 0
 *  border         - border type, forwarded to the filter
 *  borderValue    - border value (three entries), forwarded to the filter
 *  pSpec          - threaded spec: a BilateralInfo header followed by the spec
 *                   that is passed to ippiFilterBilateral_64f_P3R
 *  pBuffer        - work buffer
 *
 * Returns ippStsNullPtrErr for a null array, plane pointer, pSpec or pBuffer and
 * also for a zero step, ippStsSizeErr for a non-positive ROI dimension,
 * otherwise the status of the filtering itself.
 */
IPPFUN(IppStatus, ippiFilterBilateral_64f_P3R_T,
       (const Ipp64f *pSrc[3], int srcStep[3], Ipp64f *pDst[3], int dstStep[3], IppiSize roiSize, IppiBorderType border, const Ipp64f borderValue[3],
        const IppiFilterBilateralSpec_T *pSpec, Ipp8u *pBuffer))
{
    const int numChannels = 3;
    int numThreads = 1; /* stays 1 if the query below does not update it */
    const BilateralInfo *pInfo;
    ippiFilterBilateral_P3_T_Str ts;
    IppiPoint splitImage = {1, 1};
    IppiSize tileSize, lastSize;
    int bufSize;
    int numTiles;

    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (pSrc[0] == 0 || pSrc[1] == 0 || pSrc[2] == 0)
        return ippStsNullPtrErr;
    if (pDst[0] == 0 || pDst[1] == 0 || pDst[2] == 0)
        return ippStsNullPtrErr;
    if (srcStep == 0 || dstStep == 0)
        return ippStsNullPtrErr;
    /* NOTE(review): a zero step is reported with the null-pointer status;
     * kept as is so that callers see the same codes as before. */
    if (srcStep[0] == 0 || srcStep[1] == 0 || srcStep[2] == 0)
        return ippStsNullPtrErr;
    if (dstStep[0] == 0 || dstStep[1] == 0 || dstStep[2] == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    /* The tiling information sits in front of the spec used by the core function. */
    pInfo = (const BilateralInfo *)pSpec;
    pSpec = (IppiFilterBilateralSpec_T *)((Ipp8u *)pSpec + sizeof(BilateralInfo));

    ippGetNumThreads_T(&numThreads);

    /* One thread, or too few rows per thread: filter the whole ROI in one call.
     * "<= 1" also keeps a zero thread count away from the division. */
    if ((numThreads <= 1) || ((roiSize.height / numThreads) < TYLE_S)) {
        /* Intel IPP function call */
        return ippiFilterBilateral_64f_P3R(pSrc, srcStep, pDst, dstStep, roiSize, border, borderValue, (IppiFilterBilateralSpec *)pSpec,
                                           pBuffer);
    }

    bufSize = pInfo->bufsize;
    splitImage.x = pInfo->split.x;
    splitImage.y = pInfo->split.y;
    tileSize.width = pInfo->tileSize.width;
    tileSize.height = pInfo->tileSize.height;
    lastSize.width = pInfo->lastTile.width;
    lastSize.height = pInfo->lastTile.height;

    numTiles = splitImage.x * splitImage.y;
    fBilateralBrdThreadingStructureEncode_P3((Ipp8u **)pSrc, srcStep, (Ipp8u **)pDst, dstStep, border, (Ipp8u *)borderValue,
                                             (IppiFilterBilateralSpec_T *)pSpec, pBuffer, bufSize, numChannels, splitImage, tileSize, lastSize,
                                             &ts);
    return ippParallelFor_T(numTiles, (void *)&ts, ippiFilterBilateral_64f_P3R_T_Fun);
}
