/*******************************************************************************
 * Copyright 2020 Intel Corporation.
 *
 *
 * This software and the related documents are Intel copyrighted materials, and your use of them is governed by
 * the express license under which they were provided to you ('License'). Unless the License provides otherwise,
 * you may not use, modify, copy, publish, distribute, disclose or transmit this software or the related
 * documents without Intel's prior written permission.
 * This software and the related documents are provided as is, with no express or implied warranties, other than
 * those that are expressly stated in the License.
 *******************************************************************************/

/* Intel(R) Integrated Performance Primitives (Intel(R) IPP) */

#include "pcvfilterseparable_tl.h"

/*
 * Decides how the destination ROI is cut into tiles for threaded processing.
 *
 *   dstRoiSize  size of the destination ROI
 *   maskSize    kernel size; not used by the current computation
 *   numThreads  number of threads available; 0 is treated as 1
 *   pTileSize   [out] size of a regular tile
 *   pLastSize   [out] size reported for the last tile
 *   splitImage  [out] number of tiles along x and y
 *
 * The whole ROI is kept as a single tile when only one thread is available or
 * when an even split would give each thread fewer than TILE_S rows. Otherwise
 * the ROI is handed to ippiSplitToTiles_LT with full-width tiles that are
 * TILE_S rows high.
 */
static void ownGetSeparableSliceSize(IppiSizeL dstRoiSize, IppiSize maskSize, Ipp32u numThreads, IppiSizeL *pTileSize, IppiSizeL *pLastSize,
                                     IppiPointL *splitImage)
{
    IppiSizeL tileSize;
    IppSizeL rowsPerThread;

    (void)maskSize; /* kept only for interface compatibility */

    /* A zero thread count would make the division below undefined. */
    if (numThreads == 0)
        numThreads = 1;

    rowsPerThread = dstRoiSize.height / (IppSizeL)numThreads;
    splitImage->x = 1;
    splitImage->y = 1;

    if (((numThreads == 1) || (rowsPerThread < TILE_S)) && (dstRoiSize.height)) {
        /* Not worth splitting: one tile covering the whole ROI. */
        pTileSize->width = dstRoiSize.width;
        pTileSize->height = dstRoiSize.height;
        pLastSize->width = dstRoiSize.width;
        pLastSize->height = dstRoiSize.height;
    } else {
        /* split the image to tiles */
        tileSize.width = dstRoiSize.width;
        tileSize.height = TILE_S;
        ippiSplitToTiles_LT(dstRoiSize, tileSize, splitImage, pTileSize, pLastSize);
    }
}

/****************************************************************************************\
*                                   Separable Filters                                    *
\****************************************************************************************/

/*///////////////////////////////////////////////////////////////////////////////////////////
//  Name:      ippiFilterSeparableGetBufferSize_LT,
//             ippiFilterSeparableGetSpecSize_LT,
//             ippiFilterSeparableInit_16s_LT,
//             ippiFilterSeparableInit_32f_LT,
//             ippiFilterSeparable_8u_C1R_LT,      ippiFilterSeparable_8u_C3R_LT,      ippiFilterSeparable_8u_C4R_LT
//             ippiFilterSeparable_8u16s_C1R_LT,   ippiFilterSeparable_8u16s_C3R_LT,   ippiFilterSeparable_8u16s_C4R_LT
//             ippiFilterSeparable_16s_C1R_LT,     ippiFilterSeparable_16s_C3R_LT,     ippiFilterSeparable_16s_C4R_LT
//             ippiFilterSeparable_16u_C1R_LT,     ippiFilterSeparable_16u_C3R_LT,     ippiFilterSeparable_16u_C4R_LT
//             ippiFilterSeparable_32f_C1R_LT,     ippiFilterSeparable_32f_C3R_LT,     ippiFilterSeparable_32f_C4R_LT
//  Purpose:   Convolves source image rows and columns with the row and column kernels
//
//  Return:
//    ippStsNoErr              Ok
//    ippStsNullPtrErr         One of pointers is NULL
//    ippStsSizeErr            The width or height of the image is less than or equal to zero
//    ippStsStepErr            The steps in images are too small
//    ippStsNotEvenStepErr     Step is not a multiple of the element size
//    ippStsBadArgErr          Zero divisor
//
//  Parameters:
//    pSrc                     The pointer to the source image
//    pDst                     The pointer to the destination image
//    srcStep                  The step in the source image
//    dstStep                  The step in the destination image
//    roiSize                  The image ROI size
//    borderType               Type of the border
//    borderValue              Pointer to the constant value(s) if border type equals ippBorderConstant
//    pSpec                    Pointer to the allocated and initialized context structure
//    pBuffer                  The pointer to the working buffer
//    kernelSize               Sizes of row and column kernels
//    dataType                 Data type of source image {ipp8u|ipp16s|ipp16u|ipp32f}
//    kernelType               Kernel type {ipp16s|ipp32f}
//    numChannels              Number of channels, possible values are 1, 3 or 4
//    pBufferSize              Pointer to the size (in bytes) of the external buffer
//    pSpecSize                Pointer to the size (in bytes) of the spec structure
//    pRowKernel               Pointer to row kernel
//    pColumnKernel            Pointer to column kernel
//    divisor                  The integer value by which the computed result is divided
//    scaleFactor              The integer value by which the computed result is scaled
*/

/*
 * Computes the size of the work buffer needed by the threaded separable filter.
 *
 * The ROI is tiled with ownGetSeparableSliceSize, the buffer size for the
 * largest tile is obtained from ippiFilterSeparableGetBufferSize_L, and that
 * size is multiplied by the number of tiles.
 *
 * Returns ippStsNullPtrErr if pBufferSize is NULL, ippStsSizeErr if the ROI is
 * empty, otherwise the status of ippiFilterSeparableGetBufferSize_L.
 * *pBufferSize is written only when that status is not an error.
 */
IPPFUN(IppStatus, ippiFilterSeparableGetBufferSize_LT,
       (IppiSizeL roiSize, IppiSize kernelSize, IppDataType dataType, IppDataType kernelType, int numChannels, IppSizeL *pBufferSize))
{
    /* Start from one thread, as the processing functions do, so the value is
       defined even if ippGetNumThreads_LT leaves it untouched. */
    Ipp32s numThreads = 1;
    IppiSizeL tileSize = {0, 0}, lastSize = {0, 0};
    IppiPointL splitImage = {1, 1};
    IppStatus status = ippStsNoErr;
    IppSizeL tileBufSize = 0;

    if (pBufferSize == NULL)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;

    ippGetNumThreads_LT(&numThreads);
    if (numThreads < 1)
        numThreads = 1; /* a non-positive count means no threading */

    ownGetSeparableSliceSize(roiSize, kernelSize, (Ipp32u)numThreads, &tileSize, &lastSize, &splitImage);

    /* Size the per-tile buffer for the larger of the regular and the last tile. */
    if (lastSize.width < tileSize.width)
        lastSize.width = tileSize.width;
    if (lastSize.height < tileSize.height)
        lastSize.height = tileSize.height;

    status = ippiFilterSeparableGetBufferSize_L(lastSize, kernelSize, dataType, kernelType, numChannels, &tileBufSize);
    if (status >= 0) {
        *pBufferSize = tileBufSize * ((IppSizeL)splitImage.y * splitImage.x);
    }
    return status;
}

/*
 * Reports the spec size for the threaded separable filter: the size returned
 * by ippiFilterSeparableGetSpecSize_L plus room for a SeparableInfo header.
 * *pSpecSize is written only when the underlying call does not return an error.
 */
IPPFUN(IppStatus, ippiFilterSeparableGetSpecSize_LT, (IppiSize kernelSize, IppDataType dataType, int numChannels, int *pSpecSize))
{
    int innerSpecSize;
    IppStatus status;

    if (pSpecSize == NULL)
        return ippStsNullPtrErr;

    status = ippiFilterSeparableGetSpecSize_L(kernelSize, dataType, numChannels, &innerSpecSize);
    if (status < 0)
        return status;

    *pSpecSize = innerSpecSize + sizeof(SeparableInfo);
    return status;
}

/*
 * Initializes a threaded separable-filter spec with 16s kernels.
 *
 * The spec memory starts with a SeparableInfo header; the spec initialized by
 * ippiFilterSeparableInit_16s_L follows directly after it. The kernel size is
 * recorded in the header regardless of the status returned by that call.
 */
IPPFUN(IppStatus, ippiFilterSeparableInit_16s_LT,
       (const Ipp16s *pRowKernel, const Ipp16s *pColumnKernel, IppiSize kernelSize, int divisor, int scaleFactor, IppDataType dataType,
        int numChannels, IppiFilterSeparableSpec_LT *pSpec))
{
    SeparableInfo *pHeader;
    IppiFilterSeparableSpec *pInnerSpec;
    IppStatus status;

    if (pSpec == 0)
        return ippStsNullPtrErr;

    pHeader = (SeparableInfo *)pSpec;
    pInnerSpec = (IppiFilterSeparableSpec *)((Ipp8u *)pSpec + sizeof(SeparableInfo));

    status = ippiFilterSeparableInit_16s_L(pRowKernel, pColumnKernel, kernelSize, divisor, scaleFactor, dataType, numChannels, pInnerSpec);

    /* Remember the kernel size so the processing functions can size their buffers. */
    pHeader->kernelSize.width = kernelSize.width;
    pHeader->kernelSize.height = kernelSize.height;
    return status;
}
/*
 * Initializes a threaded separable-filter spec with 32f kernels.
 *
 * The spec memory starts with a SeparableInfo header; the spec initialized by
 * ippiFilterSeparableInit_32f_L follows directly after it. The kernel size is
 * recorded in the header regardless of the status returned by that call.
 */
IPPFUN(IppStatus, ippiFilterSeparableInit_32f_LT,
       (const Ipp32f *pRowKernel, const Ipp32f *pColumnKernel, IppiSize kernelSize, IppDataType dataType, int numChannels,
        IppiFilterSeparableSpec_LT *pSpec))
{
    SeparableInfo *pHeader;
    IppiFilterSeparableSpec *pInnerSpec;
    IppStatus status;

    if (pSpec == 0)
        return ippStsNullPtrErr;

    pHeader = (SeparableInfo *)pSpec;
    pInnerSpec = (IppiFilterSeparableSpec *)((Ipp8u *)pSpec + sizeof(SeparableInfo));

    status = ippiFilterSeparableInit_32f_L(pRowKernel, pColumnKernel, kernelSize, dataType, numChannels, pInnerSpec);

    /* Remember the kernel size so the processing functions can size their buffers. */
    pHeader->kernelSize.width = kernelSize.width;
    pHeader->kernelSize.height = kernelSize.height;
    return status;
}

/*
 * Tile worker for the 8u -> 8u separable filter; the 8u entry points pass it
 * to ippParallelFor_LT.
 *
 *   t    linear tile index; tiles are numbered row by row over the
 *        splitImage.x by splitImage.y grid
 *   arg  the ippiFilterSeparable_LT_Str task descriptor
 *
 * Returns the status of the ippiFilterSeparable_8u_C{1,3,4}R_L call made for
 * the tile.
 */
static IppStatus ippiFilterSeparable_8u_LT_Fun(IppSizeL t, void *arg)
{
    ippiFilterSeparable_LT_Str *task = (ippiFilterSeparable_LT_Str *)arg;
    const Ipp8u *srcBase = (const Ipp8u *)task->pSrc;
    Ipp8u *dstBase = task->pDst;
    Ipp8u *border = task->borderValue.borderValue_8u;
    const IppiFilterSeparableSpec_LT *spec = (const IppiFilterSeparableSpec_LT *)task->pSpec;
    const int channels = task->numChannels;
    const IppiPointL grid = task->splitImage;
    const IppiSizeL tile = task->pTileSize;
    const IppiSizeL last = task->pLastSize;

    /* Position of this tile inside the grid. */
    const IppSizeL row = t / grid.x;
    const IppSizeL col = t % grid.x;
    const IppSizeL lastRow = (int)(grid.y - 1);
    const IppSizeL lastCol = (int)(grid.x - 1);

    /* Top-left pixel of the tile: column offset in elements, row offset in bytes. */
    const IppSizeL xOffset = col * tile.width * channels;
    const IppSizeL yOffset = row * tile.height;
    Ipp8u *srcRoi = (Ipp8u *)((Ipp8u *)(srcBase + xOffset) + yOffset * task->srcStep);
    Ipp8u *dstRoi = (Ipp8u *)((Ipp8u *)(dstBase + xOffset) + yOffset * task->dstStep);

    /* Tile size: regular by default, the "last" size for the trailing tile(s). */
    const IppSizeL extraRows = last.height - tile.height;
    IppiSizeL roi;
    roi.width = tile.width;
    roi.height = tile.height;
    if ((extraRows > 0) && (extraRows < grid.y) && (last.height > tile.height)) {
        if (row >= (int)(grid.y - extraRows))
            roi.height = last.height;
    } else if (last.height && (row == lastRow)) {
        roi.height = last.height;
    }
    if (last.width && (col == lastCol))
        roi.width = last.width;

    /* Add an in-memory border flag for every side that touches a neighbouring tile. */
    int borderFlags = (int)task->borderType;
    if (grid.y > 1) {
        const int topFlag = (row == 0) ? 0 : (int)ippBorderInMemTop;
        const int bottomFlag = ((row != 0) && (row == lastRow)) ? 0 : (int)ippBorderInMemBottom;
        borderFlags |= topFlag | bottomFlag;
    }
    if (grid.x > 1) {
        const int leftFlag = (col == 0) ? 0 : (int)ippBorderInMemLeft;
        const int rightFlag = ((col != 0) && (col == lastCol)) ? 0 : (int)ippBorderInMemRight;
        borderFlags |= leftFlag | rightFlag;
    }

    /* Each thread works in its own bufSize-byte slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = task->pBuffer + task->bufSize * threadIdx;

    /* Intel IPP function call */
    if (channels == 1) {
        return ippiFilterSeparable_8u_C1R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, *border,
                                            (IppiFilterSeparableSpec *)spec, scratch);
    }
    if (channels == 3) {
        return ippiFilterSeparable_8u_C3R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                            (IppiFilterSeparableSpec *)spec, scratch);
    }
    /* any other channel count is handled as 4 channels */
    return ippiFilterSeparable_8u_C4R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                        (IppiFilterSeparableSpec *)spec, scratch);
}

/*
 * Tile worker for the 8u -> 16s separable filter. It has the (index, argument)
 * callback shape; presumably it is dispatched through ippParallelFor_LT by the
 * 8u16s entry points.
 *
 *   t    linear tile index; tiles are numbered row by row over the
 *        splitImage.x by splitImage.y grid
 *   arg  the ippiFilterSeparable_LT_Str task descriptor
 *
 * Returns the status of the ippiFilterSeparable_8u16s_C{1,3,4}R_L call made
 * for the tile.
 */
static IppStatus ippiFilterSeparable_8u16s_LT_Fun(IppSizeL t, void *arg)
{
    ippiFilterSeparable_LT_Str *task = (ippiFilterSeparable_LT_Str *)arg;
    const Ipp8u *srcBase = (const Ipp8u *)task->pSrc;
    Ipp16s *dstBase = (Ipp16s *)task->pDst;
    Ipp8u *border = task->borderValue.borderValue_8u;
    const IppiFilterSeparableSpec_LT *spec = (const IppiFilterSeparableSpec_LT *)task->pSpec;
    const int channels = task->numChannels;
    const IppiPointL grid = task->splitImage;
    const IppiSizeL tile = task->pTileSize;
    const IppiSizeL last = task->pLastSize;

    /* Position of this tile inside the grid. */
    const IppSizeL row = t / grid.x;
    const IppSizeL col = t % grid.x;
    const IppSizeL lastRow = (int)(grid.y - 1);
    const IppSizeL lastCol = (int)(grid.x - 1);

    /* Top-left pixel of the tile: column offset in elements, row offset in bytes. */
    const IppSizeL xOffset = col * tile.width * channels;
    const IppSizeL yOffset = row * tile.height;
    Ipp8u *srcRoi = (Ipp8u *)((Ipp8u *)(srcBase + xOffset) + yOffset * task->srcStep);
    Ipp16s *dstRoi = (Ipp16s *)((Ipp8u *)(dstBase + xOffset) + yOffset * task->dstStep);

    /* Tile size: regular by default, the "last" size for the trailing tile(s). */
    const IppSizeL extraRows = last.height - tile.height;
    IppiSizeL roi;
    roi.width = tile.width;
    roi.height = tile.height;
    if ((extraRows > 0) && (extraRows < grid.y) && (last.height > tile.height)) {
        if (row >= (int)(grid.y - extraRows))
            roi.height = last.height;
    } else if (last.height && (row == lastRow)) {
        roi.height = last.height;
    }
    if (last.width && (col == lastCol))
        roi.width = last.width;

    /* Add an in-memory border flag for every side that touches a neighbouring tile. */
    int borderFlags = (int)task->borderType;
    if (grid.y > 1) {
        const int topFlag = (row == 0) ? 0 : (int)ippBorderInMemTop;
        const int bottomFlag = ((row != 0) && (row == lastRow)) ? 0 : (int)ippBorderInMemBottom;
        borderFlags |= topFlag | bottomFlag;
    }
    if (grid.x > 1) {
        const int leftFlag = (col == 0) ? 0 : (int)ippBorderInMemLeft;
        const int rightFlag = ((col != 0) && (col == lastCol)) ? 0 : (int)ippBorderInMemRight;
        borderFlags |= leftFlag | rightFlag;
    }

    /* Each thread works in its own bufSize-byte slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = task->pBuffer + task->bufSize * threadIdx;

    /* Intel IPP function call */
    if (channels == 1) {
        return ippiFilterSeparable_8u16s_C1R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, *border,
                                               (IppiFilterSeparableSpec *)spec, scratch);
    }
    if (channels == 3) {
        return ippiFilterSeparable_8u16s_C3R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                               (IppiFilterSeparableSpec *)spec, scratch);
    }
    /* any other channel count is handled as 4 channels */
    return ippiFilterSeparable_8u16s_C4R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                           (IppiFilterSeparableSpec *)spec, scratch);
}

/*
 * Tile worker for the 16s -> 16s separable filter. It has the (index, argument)
 * callback shape; presumably it is dispatched through ippParallelFor_LT by the
 * 16s entry points.
 *
 *   t    linear tile index; tiles are numbered row by row over the
 *        splitImage.x by splitImage.y grid
 *   arg  the ippiFilterSeparable_LT_Str task descriptor
 *
 * Returns the status of the ippiFilterSeparable_16s_C{1,3,4}R_L call made for
 * the tile.
 */
static IppStatus ippiFilterSeparable_16s_LT_Fun(IppSizeL t, void *arg)
{
    ippiFilterSeparable_LT_Str *task = (ippiFilterSeparable_LT_Str *)arg;
    const Ipp16s *srcBase = (const Ipp16s *)task->pSrc;
    Ipp16s *dstBase = (Ipp16s *)task->pDst;
    Ipp16s *border = task->borderValue.borderValue_16s;
    const IppiFilterSeparableSpec_LT *spec = (const IppiFilterSeparableSpec_LT *)task->pSpec;
    const int channels = task->numChannels;
    const IppiPointL grid = task->splitImage;
    const IppiSizeL tile = task->pTileSize;
    const IppiSizeL last = task->pLastSize;

    /* Position of this tile inside the grid. */
    const IppSizeL row = t / grid.x;
    const IppSizeL col = t % grid.x;
    const IppSizeL lastRow = (int)(grid.y - 1);
    const IppSizeL lastCol = (int)(grid.x - 1);

    /* Top-left pixel of the tile: column offset in elements, row offset in bytes. */
    const IppSizeL xOffset = col * tile.width * channels;
    const IppSizeL yOffset = row * tile.height;
    Ipp16s *srcRoi = (Ipp16s *)((Ipp8u *)(srcBase + xOffset) + yOffset * task->srcStep);
    Ipp16s *dstRoi = (Ipp16s *)((Ipp8u *)(dstBase + xOffset) + yOffset * task->dstStep);

    /* Tile size: regular by default, the "last" size for the trailing tile(s). */
    const IppSizeL extraRows = last.height - tile.height;
    IppiSizeL roi;
    roi.width = tile.width;
    roi.height = tile.height;
    if ((extraRows > 0) && (extraRows < grid.y) && (last.height > tile.height)) {
        if (row >= (int)(grid.y - extraRows))
            roi.height = last.height;
    } else if (last.height && (row == lastRow)) {
        roi.height = last.height;
    }
    if (last.width && (col == lastCol))
        roi.width = last.width;

    /* Add an in-memory border flag for every side that touches a neighbouring tile. */
    int borderFlags = (int)task->borderType;
    if (grid.y > 1) {
        const int topFlag = (row == 0) ? 0 : (int)ippBorderInMemTop;
        const int bottomFlag = ((row != 0) && (row == lastRow)) ? 0 : (int)ippBorderInMemBottom;
        borderFlags |= topFlag | bottomFlag;
    }
    if (grid.x > 1) {
        const int leftFlag = (col == 0) ? 0 : (int)ippBorderInMemLeft;
        const int rightFlag = ((col != 0) && (col == lastCol)) ? 0 : (int)ippBorderInMemRight;
        borderFlags |= leftFlag | rightFlag;
    }

    /* Each thread works in its own bufSize-byte slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = task->pBuffer + task->bufSize * threadIdx;

    /* Intel IPP function call */
    if (channels == 1) {
        return ippiFilterSeparable_16s_C1R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, *border,
                                             (IppiFilterSeparableSpec *)spec, scratch);
    }
    if (channels == 3) {
        return ippiFilterSeparable_16s_C3R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                             (IppiFilterSeparableSpec *)spec, scratch);
    }
    /* any other channel count is handled as 4 channels */
    return ippiFilterSeparable_16s_C4R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                         (IppiFilterSeparableSpec *)spec, scratch);
}

/*
 * Tile worker for the 16u -> 16u separable filter. It has the (index, argument)
 * callback shape; presumably it is dispatched through ippParallelFor_LT by the
 * 16u entry points.
 *
 *   t    linear tile index; tiles are numbered row by row over the
 *        splitImage.x by splitImage.y grid
 *   arg  the ippiFilterSeparable_LT_Str task descriptor
 *
 * Returns the status of the ippiFilterSeparable_16u_C{1,3,4}R_L call made for
 * the tile.
 */
static IppStatus ippiFilterSeparable_16u_LT_Fun(IppSizeL t, void *arg)
{
    ippiFilterSeparable_LT_Str *task = (ippiFilterSeparable_LT_Str *)arg;
    const Ipp16u *srcBase = (const Ipp16u *)task->pSrc;
    Ipp16u *dstBase = (Ipp16u *)task->pDst;
    Ipp16u *border = task->borderValue.borderValue_16u;
    const IppiFilterSeparableSpec_LT *spec = (const IppiFilterSeparableSpec_LT *)task->pSpec;
    const int channels = task->numChannels;
    const IppiPointL grid = task->splitImage;
    const IppiSizeL tile = task->pTileSize;
    const IppiSizeL last = task->pLastSize;

    /* Position of this tile inside the grid. */
    const IppSizeL row = t / grid.x;
    const IppSizeL col = t % grid.x;
    const IppSizeL lastRow = (int)(grid.y - 1);
    const IppSizeL lastCol = (int)(grid.x - 1);

    /* Top-left pixel of the tile: column offset in elements, row offset in bytes. */
    const IppSizeL xOffset = col * tile.width * channels;
    const IppSizeL yOffset = row * tile.height;
    Ipp16u *srcRoi = (Ipp16u *)((Ipp8u *)(srcBase + xOffset) + yOffset * task->srcStep);
    Ipp16u *dstRoi = (Ipp16u *)((Ipp8u *)(dstBase + xOffset) + yOffset * task->dstStep);

    /* Tile size: regular by default, the "last" size for the trailing tile(s). */
    const IppSizeL extraRows = last.height - tile.height;
    IppiSizeL roi;
    roi.width = tile.width;
    roi.height = tile.height;
    if ((extraRows > 0) && (extraRows < grid.y) && (last.height > tile.height)) {
        if (row >= (int)(grid.y - extraRows))
            roi.height = last.height;
    } else if (last.height && (row == lastRow)) {
        roi.height = last.height;
    }
    if (last.width && (col == lastCol))
        roi.width = last.width;

    /* Add an in-memory border flag for every side that touches a neighbouring tile. */
    int borderFlags = (int)task->borderType;
    if (grid.y > 1) {
        const int topFlag = (row == 0) ? 0 : (int)ippBorderInMemTop;
        const int bottomFlag = ((row != 0) && (row == lastRow)) ? 0 : (int)ippBorderInMemBottom;
        borderFlags |= topFlag | bottomFlag;
    }
    if (grid.x > 1) {
        const int leftFlag = (col == 0) ? 0 : (int)ippBorderInMemLeft;
        const int rightFlag = ((col != 0) && (col == lastCol)) ? 0 : (int)ippBorderInMemRight;
        borderFlags |= leftFlag | rightFlag;
    }

    /* Each thread works in its own bufSize-byte slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = task->pBuffer + task->bufSize * threadIdx;

    /* Intel IPP function call */
    if (channels == 1) {
        return ippiFilterSeparable_16u_C1R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, *border,
                                             (IppiFilterSeparableSpec *)spec, scratch);
    }
    if (channels == 3) {
        return ippiFilterSeparable_16u_C3R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                             (IppiFilterSeparableSpec *)spec, scratch);
    }
    /* any other channel count is handled as 4 channels */
    return ippiFilterSeparable_16u_C4R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                         (IppiFilterSeparableSpec *)spec, scratch);
}

/*
 * Tile worker for the 32f -> 32f separable filter. It has the (index, argument)
 * callback shape; presumably it is dispatched through ippParallelFor_LT by the
 * 32f entry points.
 *
 *   t    linear tile index; tiles are numbered row by row over the
 *        splitImage.x by splitImage.y grid
 *   arg  the ippiFilterSeparable_LT_Str task descriptor
 *
 * Returns the status of the ippiFilterSeparable_32f_C{1,3,4}R_L call made for
 * the tile.
 */
static IppStatus ippiFilterSeparable_32f_LT_Fun(IppSizeL t, void *arg)
{
    ippiFilterSeparable_LT_Str *task = (ippiFilterSeparable_LT_Str *)arg;
    const Ipp32f *srcBase = (const Ipp32f *)task->pSrc;
    Ipp32f *dstBase = (Ipp32f *)task->pDst;
    Ipp32f *border = task->borderValue.borderValue_32f;
    const IppiFilterSeparableSpec_LT *spec = (const IppiFilterSeparableSpec_LT *)task->pSpec;
    const int channels = task->numChannels;
    const IppiPointL grid = task->splitImage;
    const IppiSizeL tile = task->pTileSize;
    const IppiSizeL last = task->pLastSize;

    /* Position of this tile inside the grid. */
    const IppSizeL row = t / grid.x;
    const IppSizeL col = t % grid.x;
    const IppSizeL lastRow = (int)(grid.y - 1);
    const IppSizeL lastCol = (int)(grid.x - 1);

    /* Top-left pixel of the tile: column offset in elements, row offset in bytes. */
    const IppSizeL xOffset = col * tile.width * channels;
    const IppSizeL yOffset = row * tile.height;
    Ipp32f *srcRoi = (Ipp32f *)((Ipp8u *)(srcBase + xOffset) + yOffset * task->srcStep);
    Ipp32f *dstRoi = (Ipp32f *)((Ipp8u *)(dstBase + xOffset) + yOffset * task->dstStep);

    /* Tile size: regular by default, the "last" size for the trailing tile(s). */
    const IppSizeL extraRows = last.height - tile.height;
    IppiSizeL roi;
    roi.width = tile.width;
    roi.height = tile.height;
    if ((extraRows > 0) && (extraRows < grid.y) && (last.height > tile.height)) {
        if (row >= (int)(grid.y - extraRows))
            roi.height = last.height;
    } else if (last.height && (row == lastRow)) {
        roi.height = last.height;
    }
    if (last.width && (col == lastCol))
        roi.width = last.width;

    /* Add an in-memory border flag for every side that touches a neighbouring tile. */
    int borderFlags = (int)task->borderType;
    if (grid.y > 1) {
        const int topFlag = (row == 0) ? 0 : (int)ippBorderInMemTop;
        const int bottomFlag = ((row != 0) && (row == lastRow)) ? 0 : (int)ippBorderInMemBottom;
        borderFlags |= topFlag | bottomFlag;
    }
    if (grid.x > 1) {
        const int leftFlag = (col == 0) ? 0 : (int)ippBorderInMemLeft;
        const int rightFlag = ((col != 0) && (col == lastCol)) ? 0 : (int)ippBorderInMemRight;
        borderFlags |= leftFlag | rightFlag;
    }

    /* Each thread works in its own bufSize-byte slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = task->pBuffer + task->bufSize * threadIdx;

    /* Intel IPP function call */
    if (channels == 1) {
        return ippiFilterSeparable_32f_C1R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, *border,
                                             (IppiFilterSeparableSpec *)spec, scratch);
    }
    if (channels == 3) {
        return ippiFilterSeparable_32f_C3R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                             (IppiFilterSeparableSpec *)spec, scratch);
    }
    /* any other channel count is handled as 4 channels */
    return ippiFilterSeparable_32f_C4R_L(srcRoi, task->srcStep, dstRoi, task->dstStep, roi, (IppiBorderType)borderFlags, border,
                                         (IppiFilterSeparableSpec *)spec, scratch);
}

/*
 * Threaded separable filter for 8u, 1-channel images.
 *
 * Reads the kernel size from the SeparableInfo header stored in front of the
 * inner spec, tiles the ROI with ownGetSeparableSliceSize and then either
 * calls ippiFilterSeparable_8u_C1R_L on the whole ROI (one thread, or fewer
 * than TILE_S rows per thread) or runs ippiFilterSeparable_8u_LT_Fun over all
 * tiles with ippParallelFor_LT.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments. On the
 * threaded path an error from ippiFilterSeparableGetBufferSize_L is returned
 * before any tile is processed. Otherwise the status of the filter call or of
 * ippParallelFor_LT is returned.
 */
IPPFUN(IppStatus, ippiFilterSeparable_8u_C1R_LT,
       (const Ipp8u *pSrc, IppSizeL srcStep, Ipp8u *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp8u borderValue,
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppStatus bufStatus;
    int numChannels = 1;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);

    /* The header sits at the start of the spec; the inner spec follows it. */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);

    /* The per-thread buffer stride must fit the larger of the regular and the last tile. */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;
    bufStatus = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp8u, ipp16s, numChannels, &bufSize);

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_8u_C1R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* bufSize is the stride between per-thread buffers; without a valid
           value the workers would index pBuffer with garbage. */
        if (bufStatus < 0)
            return bufStatus;

        unionBorderValue.borderValue_8u = &borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_8u_LT_Fun);
    }
    return statusAll;
}

/*
 * Threaded separable filter for 8u, 3-channel images.
 *
 * Reads the kernel size from the SeparableInfo header stored in front of the
 * inner spec, tiles the ROI with ownGetSeparableSliceSize and then either
 * calls ippiFilterSeparable_8u_C3R_L on the whole ROI (one thread, or fewer
 * than TILE_S rows per thread) or runs ippiFilterSeparable_8u_LT_Fun over all
 * tiles with ippParallelFor_LT.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments. On the
 * threaded path an error from ippiFilterSeparableGetBufferSize_L is returned
 * before any tile is processed. Otherwise the status of the filter call or of
 * ippParallelFor_LT is returned.
 */
IPPFUN(IppStatus, ippiFilterSeparable_8u_C3R_LT,
       (const Ipp8u *pSrc, IppSizeL srcStep, Ipp8u *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp8u borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppStatus bufStatus;
    int numChannels = 3;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);

    /* The header sits at the start of the spec; the inner spec follows it. */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);

    /* The per-thread buffer stride must fit the larger of the regular and the last tile. */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;
    bufStatus = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp8u, ipp16s, numChannels, &bufSize);

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_8u_C3R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* bufSize is the stride between per-thread buffers; without a valid
           value the workers would index pBuffer with garbage. */
        if (bufStatus < 0)
            return bufStatus;

        unionBorderValue.borderValue_8u = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_8u_LT_Fun);
    }
    return statusAll;
}

/*
 * Threaded separable filter for 8u, 4-channel images.
 *
 * Reads the kernel size from the SeparableInfo header stored in front of the
 * inner spec, tiles the ROI with ownGetSeparableSliceSize and then either
 * calls ippiFilterSeparable_8u_C4R_L on the whole ROI (one thread, or fewer
 * than TILE_S rows per thread) or runs ippiFilterSeparable_8u_LT_Fun over all
 * tiles with ippParallelFor_LT.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments. On the
 * threaded path an error from ippiFilterSeparableGetBufferSize_L is returned
 * before any tile is processed. Otherwise the status of the filter call or of
 * ippParallelFor_LT is returned.
 *
 * NOTE(review): borderValue is declared as [3] although this is the 4-channel
 * variant. The bound is not enforced for an array parameter, but it looks
 * copied from the C3R signature - confirm against the public header.
 */
IPPFUN(IppStatus, ippiFilterSeparable_8u_C4R_LT,
       (const Ipp8u *pSrc, IppSizeL srcStep, Ipp8u *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp8u borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppStatus bufStatus;
    int numChannels = 4;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);

    /* The header sits at the start of the spec; the inner spec follows it. */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);

    /* The per-thread buffer stride must fit the larger of the regular and the last tile. */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;
    bufStatus = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp8u, ipp16s, numChannels, &bufSize);

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_8u_C4R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* bufSize is the stride between per-thread buffers; without a valid
           value the workers would index pBuffer with garbage. */
        if (bufStatus < 0)
            return bufStatus;

        unionBorderValue.borderValue_8u = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_8u_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 8u source,
 * 16s destination, 1 channel.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_8u16s_C1R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_8u16s_LT_Fun for tileGrid.x * tileGrid.y tiles.
 *
 * Returns ippStsNullPtrErr when pSrc, pDst, pSpec or pBuffer is NULL,
 * ippStsSizeErr when a ROI dimension is not positive, otherwise the status
 * of the filtering call.
 */
IPPFUN(IppStatus, ippiFilterSeparable_8u16s_C1R_LT,
       (const Ipp8u *pSrc, IppSizeL srcStep, Ipp16s *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp8u borderValue,
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    int numChannels = 1;
    Ipp32u numThreads = 1;
    IppiPointL tileGrid = {1, 1};
    IppiSizeL tileSize, lastSize;
    IppiSize kernelSize;
    IppSizeL workBufSize;
    const SeparableInfo *pInfo;
    const IppiFilterSeparableSpec_LT *pInnerSpec;
    ippiFilterSeparable_LT_Str threadArgs;
    bValue border;
    int tileCount;

    /* argument validation (order of the checks defines which error wins) */
    if (!pSrc || !pDst)
        return ippStsNullPtrErr;
    if (roiSize.width < 1 || roiSize.height < 1)
        return ippStsSizeErr;
    if (!pSpec || !pBuffer)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);

    /* the SeparableInfo header sits in front of the spec used by the filter */
    pInfo = (const SeparableInfo *)pSpec;
    pInnerSpec = (const IppiFilterSeparableSpec_LT *)((const Ipp8u *)pSpec + sizeof(SeparableInfo));
    kernelSize.width = pInfo->kernelSize.width;
    kernelSize.height = pInfo->kernelSize.height;

    ownGetSeparableSliceSize(roiSize, kernelSize, numThreads, &tileSize, &lastSize, &tileGrid);

    /* the last tile is never treated as smaller than a regular tile */
    if (tileSize.width > lastSize.width)
        lastSize.width = tileSize.width;
    if (tileSize.height > lastSize.height)
        lastSize.height = tileSize.height;
    ippiFilterSeparableGetBufferSize_L(lastSize, kernelSize, ipp8u, ipp16s, numChannels, &workBufSize);

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* not enough rows per thread: plain Intel IPP function call */
        return ippiFilterSeparable_8u16s_C1R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue,
                                               (IppiFilterSeparableSpec *)pInnerSpec, pBuffer);
    }

    /* threaded path: the scalar border value is passed by address */
    tileCount = tileGrid.x * tileGrid.y;
    border.borderValue_8u = &borderValue;
    fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, border,
                                       (IppiFilterSeparableSpec_LT *)pInnerSpec, pBuffer, workBufSize, numChannels, tileGrid, tileSize,
                                       lastSize, &threadArgs);
    return ippParallelFor_LT(tileCount, (void *)&threadArgs, ippiFilterSeparable_8u16s_LT_Fun);
}

/*
 * Threading-layer (LT) entry point of the separable filter, 8u source,
 * 16s destination, 3 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_8u16s_C3R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_8u16s_LT_Fun for tileGrid.x * tileGrid.y tiles.
 *
 * Returns ippStsNullPtrErr when pSrc, pDst, pSpec or pBuffer is NULL,
 * ippStsSizeErr when a ROI dimension is not positive, otherwise the status
 * of the filtering call.
 */
IPPFUN(IppStatus, ippiFilterSeparable_8u16s_C3R_LT,
       (const Ipp8u *pSrc, IppSizeL srcStep, Ipp16s *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp8u borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    int numChannels = 3;
    Ipp32u numThreads = 1;
    IppiPointL tileGrid = {1, 1};
    IppiSizeL tileSize, lastSize;
    IppiSize kernelSize;
    IppSizeL workBufSize;
    const SeparableInfo *pInfo;
    const IppiFilterSeparableSpec_LT *pInnerSpec;
    ippiFilterSeparable_LT_Str threadArgs;
    bValue border;
    int tileCount;

    /* argument validation (order of the checks defines which error wins) */
    if (!pSrc || !pDst)
        return ippStsNullPtrErr;
    if (roiSize.width < 1 || roiSize.height < 1)
        return ippStsSizeErr;
    if (!pSpec || !pBuffer)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);

    /* the SeparableInfo header sits in front of the spec used by the filter */
    pInfo = (const SeparableInfo *)pSpec;
    pInnerSpec = (const IppiFilterSeparableSpec_LT *)((const Ipp8u *)pSpec + sizeof(SeparableInfo));
    kernelSize.width = pInfo->kernelSize.width;
    kernelSize.height = pInfo->kernelSize.height;

    ownGetSeparableSliceSize(roiSize, kernelSize, numThreads, &tileSize, &lastSize, &tileGrid);

    /* the last tile is never treated as smaller than a regular tile */
    if (tileSize.width > lastSize.width)
        lastSize.width = tileSize.width;
    if (tileSize.height > lastSize.height)
        lastSize.height = tileSize.height;
    ippiFilterSeparableGetBufferSize_L(lastSize, kernelSize, ipp8u, ipp16s, numChannels, &workBufSize);

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* not enough rows per thread: plain Intel IPP function call */
        return ippiFilterSeparable_8u16s_C3R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue,
                                               (IppiFilterSeparableSpec *)pInnerSpec, pBuffer);
    }

    /* threaded path: one callback invocation per tile */
    tileCount = tileGrid.x * tileGrid.y;
    border.borderValue_8u = borderValue;
    fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, border,
                                       (IppiFilterSeparableSpec_LT *)pInnerSpec, pBuffer, workBufSize, numChannels, tileGrid, tileSize,
                                       lastSize, &threadArgs);
    return ippParallelFor_LT(tileCount, (void *)&threadArgs, ippiFilterSeparable_8u16s_LT_Fun);
}

/*
 * Threading-layer (LT) entry point of the separable filter, 8u source,
 * 16s destination, 4 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_8u16s_C4R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_8u16s_LT_Fun for tileGrid.x * tileGrid.y tiles.
 *
 * Returns ippStsNullPtrErr when pSrc, pDst, pSpec or pBuffer is NULL,
 * ippStsSizeErr when a ROI dimension is not positive, otherwise the status
 * of the filtering call.
 *
 * NOTE(review): borderValue is declared with 3 elements although numChannels
 * is 4 - confirm how many values callers are expected to supply.
 */
IPPFUN(IppStatus, ippiFilterSeparable_8u16s_C4R_LT,
       (const Ipp8u *pSrc, IppSizeL srcStep, Ipp16s *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp8u borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    int numChannels = 4;
    Ipp32u numThreads = 1;
    IppiPointL tileGrid = {1, 1};
    IppiSizeL tileSize, lastSize;
    IppiSize kernelSize;
    IppSizeL workBufSize;
    const SeparableInfo *pInfo;
    const IppiFilterSeparableSpec_LT *pInnerSpec;
    ippiFilterSeparable_LT_Str threadArgs;
    bValue border;
    int tileCount;

    /* argument validation (order of the checks defines which error wins) */
    if (!pSrc || !pDst)
        return ippStsNullPtrErr;
    if (roiSize.width < 1 || roiSize.height < 1)
        return ippStsSizeErr;
    if (!pSpec || !pBuffer)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);

    /* the SeparableInfo header sits in front of the spec used by the filter */
    pInfo = (const SeparableInfo *)pSpec;
    pInnerSpec = (const IppiFilterSeparableSpec_LT *)((const Ipp8u *)pSpec + sizeof(SeparableInfo));
    kernelSize.width = pInfo->kernelSize.width;
    kernelSize.height = pInfo->kernelSize.height;

    ownGetSeparableSliceSize(roiSize, kernelSize, numThreads, &tileSize, &lastSize, &tileGrid);

    /* the last tile is never treated as smaller than a regular tile */
    if (tileSize.width > lastSize.width)
        lastSize.width = tileSize.width;
    if (tileSize.height > lastSize.height)
        lastSize.height = tileSize.height;
    ippiFilterSeparableGetBufferSize_L(lastSize, kernelSize, ipp8u, ipp16s, numChannels, &workBufSize);

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* not enough rows per thread: plain Intel IPP function call */
        return ippiFilterSeparable_8u16s_C4R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue,
                                               (IppiFilterSeparableSpec *)pInnerSpec, pBuffer);
    }

    /* threaded path: one callback invocation per tile */
    tileCount = tileGrid.x * tileGrid.y;
    border.borderValue_8u = borderValue;
    fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, border,
                                       (IppiFilterSeparableSpec_LT *)pInnerSpec, pBuffer, workBufSize, numChannels, tileGrid, tileSize,
                                       lastSize, &threadArgs);
    return ippParallelFor_LT(tileCount, (void *)&threadArgs, ippiFilterSeparable_8u16s_LT_Fun);
}

/*
 * Threading-layer (LT) entry point of the separable filter, 16s data, 1 channel.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_16s_C1R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_16s_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 */
IPPFUN(IppStatus, ippiFilterSeparable_16s_C1R_LT,
       (const Ipp16s *pSrc, IppSizeL srcStep, Ipp16s *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp16s borderValue,
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 1;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_16s_C1R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 16s elements (not ipp8u) so
         * the value stored in the threading structure matches the data type.
         * NOTE(review): kernel type assumed to be ipp16s for 16s data - confirm. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp16s, ipp16s, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_16s = &borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_16s_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 16s data, 3 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_16s_C3R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_16s_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 */
IPPFUN(IppStatus, ippiFilterSeparable_16s_C3R_LT,
       (const Ipp16s *pSrc, IppSizeL srcStep, Ipp16s *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp16s borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 3;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_16s_C3R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 16s elements (not ipp8u) so
         * the value stored in the threading structure matches the data type.
         * NOTE(review): kernel type assumed to be ipp16s for 16s data - confirm. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp16s, ipp16s, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_16s = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_16s_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 16s data, 4 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_16s_C4R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_16s_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 *
 * NOTE(review): borderValue is declared with 3 elements although numChannels
 * is 4 - confirm how many values callers are expected to supply.
 */
IPPFUN(IppStatus, ippiFilterSeparable_16s_C4R_LT,
       (const Ipp16s *pSrc, IppSizeL srcStep, Ipp16s *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp16s borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 4;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_16s_C4R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 16s elements (not ipp8u) so
         * the value stored in the threading structure matches the data type.
         * NOTE(review): kernel type assumed to be ipp16s for 16s data - confirm. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp16s, ipp16s, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_16s = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_16s_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 16u data, 1 channel.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_16u_C1R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_16u_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 */
IPPFUN(IppStatus, ippiFilterSeparable_16u_C1R_LT,
       (const Ipp16u *pSrc, IppSizeL srcStep, Ipp16u *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp16u borderValue,
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 1;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_16u_C1R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 16u elements (not ipp8u) so
         * the value stored in the threading structure matches the data type.
         * NOTE(review): kernel type assumed to be ipp32f for 16u data - confirm
         * against the Init flavour used to build the spec. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp16u, ipp32f, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_16u = &borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_16u_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 16u data, 3 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_16u_C3R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_16u_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 */
IPPFUN(IppStatus, ippiFilterSeparable_16u_C3R_LT,
       (const Ipp16u *pSrc, IppSizeL srcStep, Ipp16u *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp16u borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 3;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_16u_C3R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 16u elements (not ipp8u) so
         * the value stored in the threading structure matches the data type.
         * NOTE(review): kernel type assumed to be ipp32f for 16u data - confirm
         * against the Init flavour used to build the spec. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp16u, ipp32f, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_16u = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_16u_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 16u data, 4 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_16u_C4R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_16u_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 *
 * NOTE(review): borderValue is declared with 3 elements although numChannels
 * is 4 - confirm how many values callers are expected to supply.
 */
IPPFUN(IppStatus, ippiFilterSeparable_16u_C4R_LT,
       (const Ipp16u *pSrc, IppSizeL srcStep, Ipp16u *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp16u borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 4;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_16u_C4R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 16u elements (not ipp8u) so
         * the value stored in the threading structure matches the data type.
         * NOTE(review): kernel type assumed to be ipp32f for 16u data - confirm
         * against the Init flavour used to build the spec. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp16u, ipp32f, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_16u = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_16u_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 32f data, 1 channel.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_32f_C1R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_32f_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 */
IPPFUN(IppStatus, ippiFilterSeparable_32f_C1R_LT,
       (const Ipp32f *pSrc, IppSizeL srcStep, Ipp32f *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp32f borderValue,
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 1;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_32f_C1R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 32f elements and 32f kernel
         * (not ipp8u / ipp16s) so the value stored in the threading structure
         * matches the data type that is actually filtered. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp32f, ipp32f, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_32f = &borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_32f_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 32f data, 3 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_32f_C3R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_32f_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 */
IPPFUN(IppStatus, ippiFilterSeparable_32f_C3R_LT,
       (const Ipp32f *pSrc, IppSizeL srcStep, Ipp32f *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp32f borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 3;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_32f_C3R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 32f elements and 32f kernel
         * (not ipp8u / ipp16s) so the value stored in the threading structure
         * matches the data type that is actually filtered. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp32f, ipp32f, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_32f = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_32f_LT_Fun);
    }
    return statusAll;
}

/*
 * Threading-layer (LT) entry point of the separable filter, 32f data, 4 channels.
 *
 * pSpec points to a SeparableInfo header (the kernel size is read from it)
 * immediately followed by the spec that is forwarded to the filtering calls.
 *
 * When one thread is reported, or roiSize.height / numThreads is below TILE_S,
 * the whole ROI goes to ippiFilterSeparable_32f_C4R_L. Otherwise the slice
 * layout is packed into an ippiFilterSeparable_LT_Str and ippParallelFor_LT is
 * invoked with ippiFilterSeparable_32f_LT_Fun for splitImage.x * splitImage.y tiles.
 *
 * Returns:
 *   ippStsNullPtrErr - pSrc, pDst, pSpec or pBuffer is NULL
 *   ippStsSizeErr    - roiSize.width or roiSize.height is not positive
 *   a negative status of ippiFilterSeparableGetBufferSize_L (threaded path only)
 *   otherwise the status of the filtering call
 *
 * NOTE(review): borderValue is declared with 3 elements although numChannels
 * is 4 - confirm how many values callers are expected to supply.
 */
IPPFUN(IppStatus, ippiFilterSeparable_32f_C4R_LT,
       (const Ipp32f *pSrc, IppSizeL srcStep, Ipp32f *pDst, IppSizeL dstStep, IppiSizeL roiSize, IppiBorderType borderType, Ipp32f borderValue[3],
        const IppiFilterSeparableSpec_LT *pSpec, Ipp8u *pBuffer))
{
    IppStatus statusAll;
    int numChannels = 4;
    Ipp32u numThreads = 1;
    IppiSize maskSize;
    SeparableInfo *pSeparableInfo; /* Separable Info structure */
    IppSizeL bufSize = 0;
    IppiPointL splitImage = {1, 1};
    IppiSizeL pTileSize, pLastSize;
    if (pSrc == 0 || pDst == 0)
        return ippStsNullPtrErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (pSpec == 0 || pBuffer == 0)
        return ippStsNullPtrErr;

    ippGetNumThreads_LT((int *)&numThreads);
    /* the SeparableInfo header precedes the spec handed to the filter */
    pSeparableInfo = (SeparableInfo *)pSpec;
    pSpec = (IppiFilterSeparableSpec_LT *)((Ipp8u *)pSpec + sizeof(SeparableInfo));
    maskSize.height = pSeparableInfo->kernelSize.height;
    maskSize.width = pSeparableInfo->kernelSize.width;

    ownGetSeparableSliceSize(roiSize, maskSize, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* the last tile is never treated as smaller than a regular tile */
    if (pLastSize.width < pTileSize.width)
        pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)
        pLastSize.height = pTileSize.height;

    if ((numThreads == 1) || ((roiSize.height / (int)numThreads) < TILE_S)) {
        /* Intel IPP function call */
        statusAll =
            ippiFilterSeparable_32f_C4R_L(pSrc, srcStep, pDst, dstStep, roiSize, borderType, borderValue, (IppiFilterSeparableSpec *)pSpec, pBuffer);
    } else {
        int numTiles = splitImage.x * splitImage.y;
        ippiFilterSeparable_LT_Str ts;
        bValue unionBorderValue;

        /* Size the work buffer for this flavour's 32f elements and 32f kernel
         * (not ipp8u / ipp16s) so the value stored in the threading structure
         * matches the data type that is actually filtered. */
        statusAll = ippiFilterSeparableGetBufferSize_L(pLastSize, maskSize, ipp32f, ipp32f, numChannels, &bufSize);
        if (statusAll < ippStsNoErr)
            return statusAll; /* bufSize is not valid, do not start the tiles */

        unionBorderValue.borderValue_32f = borderValue;
        fSeparableThreadingStructureEncode((Ipp8u *)pSrc, srcStep, (Ipp8u *)pDst, dstStep, borderType, unionBorderValue,
                                           (IppiFilterSeparableSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void *)&ts, ippiFilterSeparable_32f_LT_Fun);
    }
    return statusAll;
}
