/**
 * Example external extension module using CuVec.
 *
 * Copyright (2021) Casper da Costa-Luis
 */
#include "Python.h"
#include "pycuvec.cuh" // PyCuVec
#ifdef CUVEC_DISABLE_CUDA
#include <chrono> // std::chrono
#else
/** functions */
/**
 * dst = src + 1, element-wise over a row-major array of `Y` rows by `X` columns.
 *
 * Expects a 2D launch whose threads cover at least `X` along x and `Y` along y;
 * threads falling outside `[0, X) x [0, Y)` return without touching memory.
 * Each thread reads and writes the same flat index, so `dst` may be the same
 * buffer as `src`. No shared memory is used.
 *
 * @param dst  output buffer holding `X * Y` floats.
 * @param src  input buffer holding `X * Y` floats.
 * @param X  extent of the fastest-varying dimension (columns).
 * @param Y  extent of the slowest-varying dimension (rows).
 */
__global__ void _d_incr(float *dst, float *src, int X, int Y) {
  int x = threadIdx.x + blockDim.x * blockIdx.x;
  if (x >= X) return;
  int y = threadIdx.y + blockDim.y * blockIdx.y;
  if (y >= Y) return;
  // widen before multiplying: `y * X + x` evaluated in `int` overflows once the
  // array holds 2^31 or more elements
  const size_t i = (size_t)y * (size_t)X + (size_t)x;
  dst[i] = src[i] + 1.0f;
}
#endif // CUVEC_DISABLE_CUDA
/**
 * Python-callable `increment2d_f(src, output=None, timing=False)`: computes `src + 1`.
 *
 * @param self  unused.
 * @param args, kwargs  hold `src` (converted via `asPyCuVec_f`, must be 2D), an optional
 *   `output` (converted via `asPyCuVec`, must have the same shape as `src`) and an optional
 *   integer flag `timing`.
 * @return `output` (with its reference count incremented) when given, otherwise the array
 *   obtained from `PyCuVec_zeros_like(src)`. When `timing` is non-zero the first two elements
 *   are overwritten with the elapsed milliseconds of (output preparation, increment).
 *   On failure returns NULL with a Python exception set: `IndexError` for shape problems
 *   (including `timing` with fewer than 2 output elements) and, in CUDA builds,
 *   `OverflowError` for extents that do not fit in `int` or `RuntimeError` for CUDA failures.
 */
static PyObject *increment2d_f(PyObject *self, PyObject *args, PyObject *kwargs) {
  PyCuVec<float> *src = NULL;
  PyCuVec<float> *dst = NULL;
  // format unit `b` stores an `unsigned char`, so parse into one rather than into a `bool`
  unsigned char timing = 0;
  static const char *kwds[] = {"src", "output", "timing", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&|Ob", (char **)kwds, &asPyCuVec_f, &src, &dst,
                                   &timing))
    return NULL;
  dst = asPyCuVec(dst);
  // never return a result while an exception (e.g. from converting an argument) is pending
  if (!src || PyErr_Occurred()) return NULL;
  std::vector<Py_ssize_t> &N = src->shape;
  if (N.size() != 2) {
    PyErr_SetString(PyExc_IndexError, "`src` must be 2D");
    return NULL;
  }

#ifndef CUVEC_DISABLE_CUDA
  // the kernel takes `int` extents: refuse shapes that would be truncated
  const int X = static_cast<int>(N[1]), Y = static_cast<int>(N[0]);
  if (X != N[1] || Y != N[0]) {
    PyErr_SetString(PyExc_OverflowError, "`src` dimensions too large");
    return NULL;
  }
  // timing events: {start, output ready, kernel finished}
  cudaEvent_t events[3];
  int nEvents = 0;
  // destroys exactly the events created so far; must run on every exit path
  auto freeEvents = [&]() {
    while (nEvents > 0) cudaEventDestroy(events[--nEvents]);
  };
  cudaError_t err = cudaSuccess;
  while (nEvents < 3 && (err = cudaEventCreate(&events[nEvents])) == cudaSuccess) nEvents++;
  if (err == cudaSuccess) err = cudaEventRecord(events[0]);
  if (err != cudaSuccess) {
    freeEvents();
    PyErr_SetString(PyExc_RuntimeError, cudaGetErrorString(err));
    return NULL;
  }
  cudaEvent_t eStart = events[0], eAlloc = events[1], eKern = events[2];
#else
  auto freeEvents = []() {}; // nothing to release on the CPU path
  auto eStart = std::chrono::steady_clock::now();
#endif

  if (dst) {
    if (N != dst->shape) {
      freeEvents();
      PyErr_SetString(PyExc_IndexError, "`output` must be same shape as `src`");
      return NULL;
    }
    Py_INCREF((PyObject *)dst); // anticipating returning
  } else {
    dst = PyCuVec_zeros_like(src);
    if (!dst) {
      freeEvents();
      return NULL;
    }
  }

#ifndef CUVEC_DISABLE_CUDA
  err = cudaEventRecord(eAlloc);
  // an empty array needs no work, and a zero-sized grid is an invalid launch configuration
  if (err == cudaSuccess && X > 0 && Y > 0) {
    const dim3 block(32, 32);                            // threads per block
    const dim3 grid((N[1] + 31) / 32, (N[0] + 31) / 32); // blocks, rounded up to cover src
    _d_incr<<<grid, block>>>(dst->vec.data(), src->vec.data(), X, Y);
    err = cudaGetLastError(); // launch failures are only reported here
  }
  if (err == cudaSuccess) err = cudaEventRecord(eKern);
  if (err == cudaSuccess) err = cudaEventSynchronize(eKern); // wait for the kernel to finish
  float alloc_ms = 0, kernel_ms = 0;
  if (err == cudaSuccess) err = cudaEventElapsedTime(&alloc_ms, eStart, eAlloc);
  if (err == cudaSuccess) err = cudaEventElapsedTime(&kernel_ms, eAlloc, eKern);
  if (err != cudaSuccess) cudaDeviceSynchronize(); // ensure nothing still uses `dst` before release
  freeEvents();
  if (err != cudaSuccess) {
    Py_DECREF((PyObject *)dst);
    PyErr_SetString(PyExc_RuntimeError, cudaGetErrorString(err));
    return NULL;
  }
#else
  auto eAlloc = std::chrono::steady_clock::now();
  for (size_t i = 0; i < src->vec.size(); i++) dst->vec[i] = src->vec[i] + 1;
  auto eKern = std::chrono::steady_clock::now();
  double alloc_ms = std::chrono::duration<double, std::milli>(eAlloc - eStart).count();
  double kernel_ms = std::chrono::duration<double, std::milli>(eKern - eAlloc).count();
#endif
  if (timing) {
    // hack: store times in first two elements of output (which must therefore exist)
    if (dst->vec.size() < 2) {
      Py_DECREF((PyObject *)dst);
      PyErr_SetString(PyExc_IndexError, "`timing` requires an output of at least 2 elements");
      return NULL;
    }
    dst->vec[0] = static_cast<float>(alloc_ms);
    dst->vec[1] = static_cast<float>(kernel_ms);
  }
  return (PyObject *)dst;
}
/// Method table of the module: one entry per Python-visible function, then a sentinel.
static PyMethodDef example_methods[] = {
    // `increment2d_f` takes (self, args, kwargs), which is not the two-argument `PyCFunction`
    // signature, so the cast goes through `void (*)(void)`
    {"increment2d_f", (PyCFunction)(void (*)(void))increment2d_f, METH_VARARGS | METH_KEYWORDS,
     "Args: src, output (optional), timing (optional). Returns: src + 1"
     " (if timing: first two elements are overwritten with alloc_ms, kernel_ms)."},
    {NULL, NULL, 0, NULL} // Sentinel
};

/** module */
/// Definition record handed to `PyModule_Create` by `PyInit_example_mod`.
static struct PyModuleDef example_mod = {
    PyModuleDef_HEAD_INIT,
    "example_mod",              // module name
    "Example external module.", // module docstring
    -1,                         // module keeps state in global variables
    example_methods,            // method table
};
/// Module initialisation function: builds the module object from `example_mod`.
PyMODINIT_FUNC PyInit_example_mod(void) {
  // NOTE(review): presumably redundant when the interpreter is already running at import
  // time -- confirm before removing
  Py_Initialize();
  PyObject *module = PyModule_Create(&example_mod);
  return module;
}
