#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void udislo(__global double *random1,
                     __global double *random2,
                     __global double3 *rd0,
                     __global double2 *r1,
                     __global double3 *u1,
                     const double be_len,
                     const double bs_len,
                     const double size,
                     const double nu,
                     const int Np,
                     const int Nd,
                     const int Flag_Square,
                     __local double3 *shared)
{
  int k = get_global_id(0);
  u1[k] = (double3)(0.0f, 0.0f, 0.0f);
  r1[k] = (double2)(0.0f, 0.0f);

  double uaux = 1.0f/(4.0f*(1.0f-nu));
  double uaux1 = 1.0f/(2.0f*(1-nu));
  double uaux2 = 1.0f - 2.0f*nu;
  double pi = 3.1415926535897f;

  if (k < Np) {
    double2 r11 = (double2)(0.0f, 0.0f);
    if (Flag_Square == 1) {
      r11.x = random1[k] * size;
      r11.y = random2[k] * size;
    } else {
      double r = random1[k]*size;
      double phi = random2[k] * 2.0f * pi;
      r11.x = r * cos(phi);
      r11.y = r * sin(phi);
    }

    // compute displacement field generated by Nd dislocations
    double3 u2 = (double3)(0.0f);
    double3 r0 = (double3)(0.0f);
    double zzz = 0.0f;
    double ax2x1;
    double x1;
    double x2;
    double x1x2;
    double signe;
    int kk;

    // get the parameters of the thread
    int lsize = get_local_size(0);
    int lid = get_local_id(0);

    // loop over all dislocations
    for (kk=0; kk<Nd; kk=kk+lsize) {
      // size of the last batch of dislocations coordinates to be transfered from main memory to shared one
      if ((kk+lsize) > Nd) {
        lsize = Nd - kk;
      }

      // grab the coordinates and store them in the shared vector
      if ((kk+lid) < Nd) {
        shared[lid] = rd0[kk+lid];
      }

      // synchronization barrier
      barrier(CLK_LOCAL_MEM_FENCE);

      // compute the contribution of the dislocations located in the shared memory
      for( int dislo=0; dislo<lsize; dislo++) {
        x1 = r11.x - shared[dislo].x;
        x2 = r11.y - shared[dislo].y;
        signe = shared[dislo].z;

        x1x2 = x1*x1 + x2*x2;
        ax2x1 = atan2(x2, x1);

        // components of the displacement field
        u2.x = u2.x + be_len*signe*(ax2x1+x1*x2/x1x2*uaux1);
        u2.y = u2.y - be_len*signe*(uaux2*log(x1x2)+(x1*x1-x2*x2)/x1x2)*uaux;
        u2.z = u2.z + bs_len*signe*ax2x1;
      }
      // synchronization barrier
      barrier(CLK_LOCAL_MEM_FENCE);
    }
    u1[k] = u2;
    r1[k] = r11;
  }
}

__kernel void comptf(__global double16 *Vect16FC,
                     __global double3 *rd0,
                     __global double2 *r1,
                     __global double3 *u1,
                     const double be_len,
                     const double bs_len,
                     double size,
                     double nu,
                     const int Np,
                     const int IndexFourier,
                     const double gs,
                     const double3 gd_vec,
                     const double2 a3vd,
                     const double a3,
                     const int Nd,
                     const int Flag_Square,
                     __global int *inout,
                     __local double3 *shared)
{

  double uaux =1.0f/(4.0f*(1.0f-nu));
  double uaux1=1.0f/(2.0f*(1-nu));
  double uaux2=1.0f-2.0f*nu;
  double eps2l= 0.0f;

  double c1AL = 0.0f;
  double s1AL = 0.0f;

  double c2AL = 0.0f;
  double s2AL = 0.0f;

  double c3AL = 0.0f;
  double s3AL = 0.0f;

  double c4AL = 0.0f;
  double s4AL = 0.0f;

  double c5AL = 0.0f;
  double s5AL = 0.0f;

  // variable for the computation of translations
  double3 u2 = (double3)(0.0f, 0.0f, 0.0f);
  double2 r2 = (double2)(0.0f, 0.0f);
  double2 r0 = (double2)(0.0f, 0.0f);
  double x1 = 0.0f;
  double x2 = 0.0f;
  double x1x2 = 0.0f;
  double ax2x1 = 0.0f;

  double sz_gu = 0.0f;
  double3 du = (double3)(0.0f);
  double2 bb = (double2)(0.0f);
  int flaginside = 0;
  int inside = 0;
  int kk;
  double signe = 0.0;

  // Get k index of the random point i.e. 0 <= k < Np
  size_t k = get_global_id(0);

  // get the parameters of the opencl work-item
  int lsize = get_local_size(0);
  int lid = get_local_id(0);

  // initialize the variables used in the computation of the Fourier Transform
  inout[k] = 1;
  Vect16FC[k] = (double16)(0.0f);

  int flag_outside = 0;

  if ((int)k < Np) {
    // step 1: compute the translated position
    bb = (double)(IndexFourier)*a3vd;
    r2 = r1[k] + bb;
    if (Flag_Square == 0) {
      if (length(r2) > size) {
        r2 = r1[k] - bb;
        if (length(r2) > size) {
          inout[k] = 0; // translated point outside the region of interest
        } else {
          flag_outside = 1;
        }
      }
    } else {
      if ((r2.x<0) || (r2.x>size) || (r2.y<0) || (r2.y>size)) {
        r2 = r1[k] - bb;
        if ((r2.x<0) || (r2.x>size) || (r2.y<0) || (r2.y>size)) {
          inout[k] = 0; // translated point outside the region of interest
        } else {
          flag_outside = 1;
        }
      }
    }

    // step 2: compute the displacement field generated by all the dislocations on the translated point
    u2 = (double3)(0.0f, 0.0f, 0.0f);

    // loop over all dislocation points by batch of lsize points
    for (kk=0; kk<Nd; kk=kk+lsize) {

      // size of the last batch of dislocations coordinates to be transfered from main memory to shared one
      if ((kk+lsize) > Nd) {
        lsize = Nd - kk;
      }

      // grab the coordinates
      if ((kk+lid) < Nd) {
        shared[lid] = rd0[kk+lid];
      }

      // synchronization
      barrier(CLK_LOCAL_MEM_FENCE);

      // compute the contribution of the dislocations
      for (int dislo=0; dislo<lsize; dislo++) {
        x1 = r2.x - shared[dislo].x;
        x2 = r2.y - shared[dislo].y;
        signe = shared[dislo].z;

        x1x2 = x1*x1 + x2*x2;
        ax2x1 = atan2(x2, x1);

        u2.x = u2.x + be_len*signe*(ax2x1+x1*x2/x1x2*uaux1);
        u2.y = u2.y - be_len*signe*(uaux2*log(x1x2)+(x1*x1-x2*x2)/x1x2)*uaux;
        u2.z = u2.z + bs_len*signe*ax2x1;
      }

      // synchronization
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    // compute increment between the cumulated displacement field and the given displacement field
    du = (flag_outside==1) ? u2-u1[k] : u1[k]-u2;

    // compute data related to the Fourier Coefficients
    sz_gu = dot(gd_vec, du);
    eps2l = (sz_gu/gs/(a3*(double)(IndexFourier)));
    eps2l *= eps2l;

    c1AL = cos(1.0f*sz_gu);
    c2AL = cos(2.0f*sz_gu);
    c3AL = cos(3.0f*sz_gu);
    c4AL = cos(4.0f*sz_gu);
    c5AL = cos(5.0f*sz_gu);

    s1AL = sin(1.0f*sz_gu);
    s2AL = sin(2.0f*sz_gu);
    s3AL = sin(3.0f*sz_gu);
    s4AL = sin(4.0f*sz_gu);
    s5AL = sin(5.0f*sz_gu);

    Vect16FC[k] = inout[k]*(double16)(c1AL, s1AL, c2AL, s2AL, c3AL, s3AL, c4AL, s4AL, c5AL, s5AL, eps2l, 0.0, 0.0, 0.0, 0.0, 0.0);
  }
}
