/*******************************************************************************
* Copyright (C) 2023 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

// Expands to `_Pragma("unroll")`; place it immediately before a loop to
// request that the compiler unroll that loop.
#define ESIMD_UNROLL _Pragma("unroll")

//
// Esimd kernel for SpMV using ESB format blockptr, colind, values arrays
//
// Each work-item handles one block of BLOCK_SIZE consecutive rows:
//   * the block id comes from item.get_id(0) and, when applyReorder is set,
//     is remapped through reorder[];
//   * for every j in [blockptr[block], blockptr[block + 1]) it loads
//     BLOCK_SIZE column indices and BLOCK_SIZE values at offset
//     j * BLOCK_SIZE, gathers the matching entries of x and accumulates
//     x * value into a per-row accumulator;
//   * the accumulator is stored to y starting at row block * BLOCK_SIZE;
//   * when withDot is set, the block's y values are additionally multiplied
//     by x[start_row .. start_row + BLOCK_SIZE), summed, and atomically added
//     to xAx[0].
//
// All of nBlocks, applyReorder, reorder, blockptr, colind, values, x, y, xAx,
// withDot and BLOCK_SIZE are captured from the enclosing scope.
// NOTE(review): the exact offset semantics (element vs. byte) of the
// esimd_lsc_* helpers are defined elsewhere -- confirm against their
// definitions before changing any offsets here.
//
auto mv_esb3_esimd_kernel = [=](sycl::item<1> item) SYCL_ESIMD_KERNEL
{
    local_int_t block = item.get_id(0);

    // Valid block ids are assumed to be [0, nBlocks). The guard must be `>=`:
    // with `>` a work-item whose id equals nBlocks would fall through and read
    // reorder[nBlocks] / blockptr[nBlocks + 1] and store past the last block
    // of y.
    if (block >= nBlocks) return;

    if (applyReorder) block = reorder[block];

    // First row covered by this block.
    const local_int_t start_row  = block * BLOCK_SIZE;

    // One accumulator lane per row of the block.
    esimd::simd<double, BLOCK_SIZE> y_vec(0.0);

    // prefetches seem to make it so the loop does not unroll, and
    // the unrolling is better than prefetching for performance

    // esimd_lsc_prefetch<local_int_t, local_int_t, 2*BLOCK_SIZE, ca, uc>(colind, blockptr[block]);
    // esimd_lsc_prefetch<double, local_int_t, 2*BLOCK_SIZE, ca, uc>(values, blockptr[block]);

    ESIMD_UNROLL
    for (local_int_t j = blockptr[block]; j < blockptr[block+1]; ++j) {
        // esimd_lsc_prefetch<local_int_t, local_int_t, BLOCK_SIZE, ca, uc>(colind, j+BLOCK_SIZE);
        // esimd_lsc_prefetch<double, local_int_t, BLOCK_SIZE, ca, uc>(values, j+BLOCK_SIZE);

        // Column indices and matrix values for slice j of the block
        // (one entry per row).
        esimd::simd<local_int_t, BLOCK_SIZE> indices =
            esimd_lsc_block_load<local_int_t, local_int_t, BLOCK_SIZE, ca, uc>(
                colind, j * BLOCK_SIZE);
        esimd::simd<double, BLOCK_SIZE> vals =
            esimd_lsc_block_load<double, local_int_t, BLOCK_SIZE, ca, uc>(
                values, j * BLOCK_SIZE);

        // Gather the x entries selected by the column indices and accumulate.
        esimd::simd<double, BLOCK_SIZE> x_vec =
            esimd_lsc_gather<double, local_int_t, BLOCK_SIZE, nc, nc>(x, indices);
        y_vec += x_vec * vals;
    }

    // Store the BLOCK_SIZE results for this block.
    y_vec.copy_to(y + start_row);

    if constexpr (withDot) {
        // Fused dot product: sum over the block of y[i] * x[i], accumulated
        // atomically into xAx at offset 0 so all work-items contribute to a
        // single scalar.
        auto x_vec = esimd_lsc_block_load<double, local_int_t, BLOCK_SIZE, ca, ca>(x, start_row);
        y_vec = y_vec * x_vec;
        auto res = esimd::reduce<double>(y_vec, std::plus<>());
        sycl::ext::intel::esimd::atomic_update<LSCAtomicOp::fadd, double, 1>(xAx, 0, res);
    }

};
