/*******************************************************************************
* Copyright (C) 2024 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file main_test_base.cxx

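 Common initialization code for main(), shared by main_bench_kernels and
 main_test_kernel. The file consists of bare statements that refer to argc and
 argv, so it is meant to be textually included inside the body of main().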

 */

#ifndef HPCG_NO_MPI
  // MPI builds only: start MPI before anything else in this fragment runs.
  MPI_Init(&argc, &argv);
#endif

  // Run parameters; the comm_size, comm_rank and affinity members are read
  // further down in this fragment.
  HPCG_Params params;

  // Hands the command line and params to HPCG_Init (presumably this is what
  // populates params -- confirm against HPCG_Init's definition).
  HPCG_Init(&argc, &argv, params);

  // Catch asynchronous exceptions
  auto exception_handler = [](sycl::exception_list exceptions) {
      for (std::exception_ptr const &e : exceptions) {
          try {
              std::rethrow_exception(e);
          }
          catch (sycl::exception const &e) {
              std::cout << "Caught asynchronous SYCL "
                           "exception while running HPCG benchmark:\n"
                        << e.what() << std::endl;
          }
      }
  };

  int size = params.comm_size; // Number of MPI processes
  int rank = params.comm_rank; // My process ID

  // Four blanks in debug builds, an empty string otherwise.
#ifndef HPCG_DEBUG
  const std::string spaces{};
#else
  const std::string spaces{"    "};
#endif


  //
  // Setup SYCL GPU platform and distribute MPI rank to hardware
  //
  sycl::platform plat = sycl::platform{sycl::gpu_selector_v};
  auto platform_cards = plat.get_devices(sycl::info::device_type::gpu);

  // check if cards on platform can be partitioned or not (assumes all cards are same)
  auto part_prop = platform_cards[0].get_info<sycl::info::device::partition_properties>();

#ifndef HPCG_NO_MPI
  // Name of the node this process runs on, as reported by MPI; it is printed
  // in the device placement report later in this fragment.
  char node_name[MPI_MAX_PROCESSOR_NAME];
  int name_len = 0;
  MPI_Get_processor_name(node_name, &name_len);
#else
  char node_name[2] = "0"; // default name for non-mpi case
#endif

  // Position of this process among the processes sharing its node. Builds
  // without MPI keep the defaults: one process, numbered 0.
  int nranks_on_node = 1;
  int rank_on_node = 0;
#ifndef HPCG_NO_MPI
  { // extract a split communicator representing this node to get rank on node and nranks on node
      MPI_Comm shmcomm = MPI_COMM_NULL;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
      MPI_Comm_rank(shmcomm, &rank_on_node);
      MPI_Comm_size(shmcomm, &nranks_on_node);
      // The communicator is only needed for the two queries above; release it
      // here so it is not leaked for the rest of the run.
      MPI_Comm_free(&shmcomm);
  }
#endif

  // we make the simplifying assumption that all cards attached to a node look the same, 
  // so we can determine sizes based on a single sample of card 0
  int tiles_per_card = 1;
  if (part_prop.empty()) {
    tiles_per_card = 1;
  } else {
      const auto affinity_partition = sycl::info::partition_property::partition_by_affinity_domain;
      for (int i = 0; i < part_prop.size(); i++ ) {
        if (part_prop[i] == affinity_partition) {
            sycl::device sample_card = platform_cards[0];
            std::vector<sycl::device> sample_tiles = sample_card.create_sub_devices<affinity_partition>(
                                                            sycl::info::partition_affinity_domain::numa);
            tiles_per_card = sample_tiles.size();
            break;
        }
        else {
            tiles_per_card = 1;
        }
      }
  }
 
  const int cards_per_node = platform_cards.size();
  const int tiles_per_node = cards_per_node * tiles_per_card;

  int card_number = 0;
  int tile_number = 0;
  if (params.affinity == AffinityPerNode::compact) {
      card_number = floor_div(rank_on_node, tiles_per_card) % cards_per_node; // wrap around cards per node
      tile_number = rank_on_node % tiles_per_card; // wrap around tiles per card
  }
  else if (params.affinity == AffinityPerNode::roundRobin) {
      card_number = rank_on_node % cards_per_node; // wrap around cards per node
      tile_number = floor_div(rank_on_node, cards_per_node) % tiles_per_card; // wrap around tiles per card
  }
  else {
    throw std::runtime_error("unexpected affinity per node");
  }

  // Resolve the (card, tile) pair chosen above into the SYCL device this
  // process will use, then build the main queue on it.
  sycl::device card = platform_cards[card_number];
  sycl::device dev;

  try {
      int detected_tiles = 1;
      if (tiles_per_card == 1) {
          // no tiles to choose from: the whole card is the device
          dev = card;
      }
      else {
          const std::vector<sycl::device> sub_devices =
              card.create_sub_devices<sycl::info::partition_property::partition_by_affinity_domain>(
                  sycl::info::partition_affinity_domain::numa);

          detected_tiles = static_cast<int>(sub_devices.size());
          if (detected_tiles != tiles_per_card) {
            throw std::runtime_error("Unexpected number of tiles on this card.");
          }
          dev = sub_devices[tile_number];
      }

      //printf("rank %d, #ranks %d, tiles_per_card %d, cards_per_node %d, tiles_per_node %d, ranks_per_node %d, rank_on_node %d, card_number %d, tile_number %d\n",
      //        rank, size, tiles_per_card, cards_per_node, tiles_per_node, nranks_on_node, rank_on_node, card_number, tile_number); fflush(0);

      // one line describing where this rank was placed
      const auto report_placement = [&]() {
          std::cout << "[" << rank << " / " << size << "] node " << node_name << ", card " << card_number << " / " << cards_per_node
                    << ", tile " << tile_number << " / " << detected_tiles << std::endl;
      };

#ifndef HPCG_NO_MPI
      // print out serially: one barrier-separated turn per rank, and only the
      // first and the last rank write their line
      for (int turn = 0; turn < size; ++turn) {
          MPI_Barrier(MPI_COMM_WORLD);
          const bool first_or_last = (turn == 0) || (turn == size - 1);
          if (first_or_last && rank == turn) {
              report_placement();
          }
      }
      MPI_Barrier(MPI_COMM_WORLD);
#else
      report_placement();
#endif
  }
  catch (const sycl::exception &sycl_error) {
      // anything other than "feature not supported" is passed on unchanged
      if (sycl_error.code() != sycl::errc::feature_not_supported) {
          throw;
      }
      // running on hardware unsupported, or ZE_AFFINITY_MASK specifies a particular tile
      if (size > 1) {
          throw std::runtime_error("Can't allocate tiles to MPI ranks.");
      }
      // single process: fall back to the whole card
      dev = card;
      std::cout << "[" << rank << " / " << size << "] using GPU device" << std::endl;
  }
  sycl::queue main_queue(dev, exception_handler);


