#include <thrust/device_vector.h>
#include <algorithm>

#include "eval_lap.cuh"
#include "basis_to_gpu.cuh"
#include "cuda_basis_utils.cuh"
#include "eval.cuh"

using namespace chemtools;

/// Accumulate, for one evaluation point, the Laplacian (sum of the three unmixed second
/// derivatives) of every contracted atomic orbital described in `g_constant_basis`.
///
/// The closed-form expressions were originally generated by
/// ./generate/generate_sec_derivs_cont.py.  Factors that the generator repeated verbatim are
/// hoisted into named locals here; the order of floating-point operations is left as generated.
///
/// Layout read from `g_constant_basis` (as implied by the indexing below):
///   [n_cshells], then for every contracted shell
///   [center x, center y, center z, n_seg_shells, n_prims, n_prims exponents,
///    n_seg_shells x (angular-momentum code, n_prims contraction coefficients)].
///
/// @param d_lap       Output buffer. The value of orbital `i` for this point is stored at
///                    `d_lap[idx + i * n_pts]`. Contributions are added with `+=`, so the buffer
///                    must already hold the desired starting values (presumably zeros - confirm
///                    against the caller).
/// @param pt          Cartesian coordinates of the evaluation point.
/// @param n_pts       Stride (in elements) between consecutive orbitals in `d_lap`.
/// @param idx         Offset of this point inside each orbital's row of `d_lap`.
/// @param iorb_start  Index of the first orbital row written by this call.
///
/// Segmented shells whose angular-momentum code is not one of the eight types handled below
/// contribute nothing and do not advance the orbital row.
__device__ __forceinline__ void chemtools::eval_AOs_lap(
    double *d_lap,
    const double3& pt,
    const int &n_pts,
    uint &idx,
    const int &iorb_start)
{
    // 64-bit row stride: `iorb * n_pts` evaluated in 32-bit arithmetic wraps around once the
    // output holds 2^32 or more entries, which would silently write to the wrong orbital row.
    const size_t stride = (size_t) n_pts;

    uint ibasis          = 0;                                   // Index to go over constant memory.
    uint iorb            = iorb_start;                          // Orbital row of d_lap being written.
    const uint n_cshells = (uint) g_constant_basis[ibasis++];   // Number of contracted shells.

    #pragma unroll 1
    for (uint ishell = 0; ishell < n_cshells; ishell++) {
        // Displacement of the point from the shell center (the three reads must stay in order).
        const double x = pt.x - g_constant_basis[ibasis++];
        const double y = pt.y - g_constant_basis[ibasis++];
        const double z = pt.z - g_constant_basis[ibasis++];
        const uint n_seg_shells = (uint) g_constant_basis[ibasis++];
        const uint n_prims      = (uint) g_constant_basis[ibasis++];

        // Quantities that depend only on the displacement.
        const double r2 = (x * x) + (y * y) + (z * z);
        const double x4 = x * x * x * x;
        const double y4 = y * y * y * y;
        const double z4 = z * z * z * z;

        #pragma unroll 1
        for (uint iseg = 0; iseg < n_seg_shells; iseg++) {
            // Each segmented shell is stored as [L, c_0, ..., c_{n_prims - 1}] after the exponents.
            const uint seg_base = ibasis + n_prims + (n_prims + 1) * iseg;
            const int  L        = (int) g_constant_basis[seg_base];
            // First output entry of this segmented shell for the current point.
            double *out = d_lap + idx + (size_t) iorb * stride;

            if (L == S_TYPE) {
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    out[0] +=
                        normalization_primitive_s(a) *
                            2 * a * (2 * a * x * x + 2 * a * y * y + 2 * a * z * z - 3) *
                            ce;
                }
                iorb += 1;
            }
            else if (L == P_TYPE) {
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    const auto   norm   = normalization_primitive_p(a);
                    const double radial = 2 * a * x * x + 2 * a * y * y + 2 * a * z * z - 5;
                    // The ordering is ['x', 'y', 'z']
                    out[0]          += norm * 2 * a * x * radial * ce;
                    out[stride]     += norm * 2 * a * y * radial * ce;
                    out[2 * stride] += norm * 2 * a * z * radial * ce;
                }
                iorb += 3;
            }
            else if (L == D_TYPE) {
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    const double radial = 2 * a * x * x + 2 * a * y * y + 2 * a * z * z - 7;
                    // The ordering is ['xx', 'yy', 'zz', 'xy', 'xz', 'yz']
                    out[0] +=
                        normalization_primitive_d(a, 2, 0, 0) *
                            (4 * a * a * x4 + 4 * a * a * x * x * y * y
                                + 4 * a * a * x * x * z * z - 14 * a * x * x + 2) *
                            ce;
                    out[stride] +=
                        normalization_primitive_d(a, 0, 2, 0) *
                            (4 * a * a * x * x * y * y + 4 * a * a * y4
                                + 4 * a * a * y * y * z * z - 14 * a * y * y + 2) *
                            ce;
                    out[2 * stride] +=
                        normalization_primitive_d(a, 0, 0, 2) *
                            (4 * a * a * x * x * z * z + 4 * a * a * y * y * z * z
                                + 4 * a * a * z4 - 14 * a * z * z + 2) *
                            ce;
                    out[3 * stride] +=
                        normalization_primitive_d(a, 1, 1, 0) * 2 * a * x * y * radial * ce;
                    out[4 * stride] +=
                        normalization_primitive_d(a, 1, 0, 1) * 2 * a * x * z * radial * ce;
                    out[5 * stride] +=
                        normalization_primitive_d(a, 0, 1, 1) * 2 * a * y * z * radial * ce;
                }
                iorb += 6;
            }
            else if (L == DP_TYPE) {
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    const double norm_const = normalization_primitive_pure_d(a);
                    const double radial     = 2 * a * x * x + 2 * a * y * y + 2 * a * z * z - 7;
                    // 's' denotes sine and 'c' denotes cosine.
                    // Fchk ordering is  ['c0', 'c1', 's1', 'c2', 's2']
                    out[0] +=
                        norm_const *
                            a * (-2 * a * x4 - 4 * a * x * x * y * y
                            + 2 * a * x * x * z * z - 2 * a * y4
                            + 2 * a * y * y * z * z + 4 * a * z4
                            + 7 * x * x + 7 * y * y - 14 * z * z) *
                            ce;
                    out[stride] +=
                        norm_const * sqrt(3.) * 2 * a * x * z * radial * ce;
                    out[2 * stride] +=
                        norm_const * sqrt(3.) * 2 * a * y * z * radial * ce;
                    out[3 * stride] +=
                        norm_const *
                            sqrt(3.) *
                            a * (2 * a * x4 + 2 * a * x * x * z * z
                            - 2 * a * y4 - 2 * a * y * y * z * z
                            - 7 * x * x + 7 * y * y) *
                            ce;
                    out[4 * stride] +=
                        norm_const * sqrt(3.) * 2 * a * x * y * radial * ce;
                }
                iorb += 5;
            }
            else if (L == F_TYPE) {
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    // Shared polynomial factors; only the trailing constant (+3 or +1) differs.
                    const double poly_x = 2 * a * a * x4 + 2 * a * a * x * x * y * y
                                            + 2 * a * a * x * x * z * z - 9 * a * x * x;
                    const double poly_y = 2 * a * a * x * x * y * y + 2 * a * a * y4
                                            + 2 * a * a * y * y * z * z - 9 * a * y * y;
                    const double poly_z = 2 * a * a * x * x * z * z + 2 * a * a * y * y * z * z
                                            + 2 * a * a * z4 - 9 * a * z * z;
                    // The ordering is ['xxx', 'yyy', 'zzz', 'xyy', 'xxy', 'xxz', 'xzz', 'yzz', 'yyz', 'xyz']
                    out[0] +=
                        normalization_primitive_f(a, 3, 0, 0) * 2 * x * (poly_x + 3) * ce;
                    out[stride] +=
                        normalization_primitive_f(a, 0, 3, 0) * 2 * y * (poly_y + 3) * ce;
                    out[2 * stride] +=
                        normalization_primitive_f(a, 0, 0, 3) * 2 * z * (poly_z + 3) * ce;
                    out[3 * stride] +=
                        normalization_primitive_f(a, 1, 2, 0) * 2 * x * (poly_y + 1) * ce;
                    out[4 * stride] +=
                        normalization_primitive_f(a, 2, 1, 0) * 2 * y * (poly_x + 1) * ce;
                    out[5 * stride] +=
                        normalization_primitive_f(a, 2, 0, 1) * 2 * z * (poly_x + 1) * ce;
                    out[6 * stride] +=
                        normalization_primitive_f(a, 1, 0, 2) * 2 * x * (poly_z + 1) * ce;
                    out[7 * stride] +=
                        normalization_primitive_f(a, 0, 1, 2) * 2 * y * (poly_z + 1) * ce;
                    out[8 * stride] +=
                        normalization_primitive_f(a, 0, 2, 1) * 2 * z * (poly_y + 1) * ce;
                    out[9 * stride] +=
                        normalization_primitive_f(a, 1, 1, 1) *
                            2 * a * x * y * z
                            * (2 * a * x * x + 2 * a * y * y + 2 * a * z * z - 9) *
                            ce;
                }
                iorb += 10;
            }
            else if (L == SF_TYPE) {
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    const double norm_const = normalization_primitive_pure_f(a);
                    // Polynomial shared by the 'c1' and 's1' components.
                    const double m1 = -2 * a * x4 - 4 * a * x * x * y * y
                                        + 6 * a * x * x * z * z - 2 * a * y4
                                        + 6 * a * y * y * z * z + 8 * a * z4
                                        + 9 * x * x + 9 * y * y - 36 * z * z;
                    // ['c0', 'c1', 's1', 'c2', 's2', 'c3', 's3']
                    out[0] +=
                        norm_const *
                            a * z
                            * (-6 * a * x4 - 12 * a * x * x * y * y
                                - 2 * a * x * x * z * z - 6 * a * y4
                                - 2 * a * y * y * z * z + 4 * a * z4
                                + 27 * x * x + 27 * y * y - 18 * z * z) *
                            ce;
                    out[stride] +=
                        norm_const * sqrt(1.5) * a * x * m1 * ce;
                    out[2 * stride] +=
                        norm_const * sqrt(1.5) * a * y * m1 * ce;
                    out[3 * stride] +=
                        norm_const *
                            sqrt(15.0) *
                            a * z * (2 * a * x4 + 2 * a * x * x * z * z
                            - 2 * a * y4 - 2 * a * y * y * z * z
                            - 9 * x * x + 9 * y * y) *
                            ce;
                    out[4 * stride] +=
                        norm_const *
                            sqrt(15.0) *
                            2 * a * x * y * z
                            * (2 * a * x * x + 2 * a * y * y + 2 * a * z * z - 9) *
                            ce;
                    out[5 * stride] +=
                        norm_const *
                            sqrt(2.5) *
                            a * x * (2 * a * x4 - 4 * a * x * x * y * y
                            + 2 * a * x * x * z * z - 6 * a * y4
                            - 6 * a * y * y * z * z - 9 * x * x + 27 * y * y) *
                            ce;
                    out[6 * stride] +=
                        norm_const *
                            sqrt(2.5) *
                            a * y * (6 * a * x4 + 4 * a * x * x * y * y
                            + 6 * a * x * x * z * z - 2 * a * y4
                            - 2 * a * y * y * z * z - 27 * x * x + 9 * y * y) *
                            ce;
                }
                iorb += 7;
            }
            else if (L == G_TYPE) {
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    // Shared polynomial factors; only the trailing constant (+3 or +1) differs.
                    const double poly_x = 2 * a * a * x4 + 2 * a * a * x * x * y * y
                                            + 2 * a * a * x * x * z * z - 11 * a * x * x;
                    const double poly_y = 2 * a * a * x * x * y * y + 2 * a * a * y4
                                            + 2 * a * a * y * y * z * z - 11 * a * y * y;
                    const double poly_z = 2 * a * a * x * x * z * z + 2 * a * a * y * y * z * z
                                            + 2 * a * a * z4 - 11 * a * z * z;
                    // The ordering is ['zzzz', 'yzzz', 'yyzz', 'yyyz', 'yyyy', 'xzzz', 'xyzz', 'xyyz', 'xyyy', 'xxzz',
                    //                                                                'xxyz', 'xxyy', 'xxxz', 'xxxy', 'xxxx']
                    out[0] +=
                        normalization_primitive_g(a, 0, 0, 4) *
                            z * z
                            * (4 * a * a * x * x * z * z + 4 * a * a * y * y * z * z
                                + 4 * a * a * z4 - 22 * a * z * z + 12) *
                            ce;
                    out[stride] +=
                        normalization_primitive_g(a, 0, 1, 3) * 2 * y * z * (poly_z + 3) * ce;
                    out[2 * stride] +=
                        normalization_primitive_g(a, 0, 2, 2) *
                            (4 * a * a * x * x * y * y * z * z
                                + 4 * a * a * y4 * z * z
                                + 4 * a * a * y * y * z4
                                - 22 * a * y * y * z * z + 2 * y * y + 2 * z * z) *
                            ce;
                    out[3 * stride] +=
                        normalization_primitive_g(a, 0, 3, 1) * 2 * y * z * (poly_y + 3) * ce;
                    out[4 * stride] +=
                        normalization_primitive_g(a, 0, 4, 0) *
                            y * y
                            * (4 * a * a * x * x * y * y + 4 * a * a * y4
                                + 4 * a * a * y * y * z * z - 22 * a * y * y + 12) *
                            ce;
                    out[5 * stride] +=
                        normalization_primitive_g(a, 1, 0, 3) * 2 * x * z * (poly_z + 3) * ce;
                    out[6 * stride] +=
                        normalization_primitive_g(a, 1, 1, 2) * 2 * x * y * (poly_z + 1) * ce;
                    out[7 * stride] +=
                        normalization_primitive_g(a, 1, 2, 1) * 2 * x * z * (poly_y + 1) * ce;
                    out[8 * stride] +=
                        normalization_primitive_g(a, 1, 3, 0) * 2 * x * y * (poly_y + 3) * ce;
                    out[9 * stride] +=
                        normalization_primitive_g(a, 2, 0, 2) *
                            (4 * a * a * x4 * z * z
                                + 4 * a * a * x * x * y * y * z * z
                                + 4 * a * a * x * x * z4
                                - 22 * a * x * x * z * z + 2 * x * x + 2 * z * z) *
                            ce;
                    out[10 * stride] +=
                        normalization_primitive_g(a, 2, 1, 1) * 2 * y * z * (poly_x + 1) * ce;
                    out[11 * stride] +=
                        normalization_primitive_g(a, 2, 2, 0) *
                            (4 * a * a * x4 * y * y
                                + 4 * a * a * x * x * y4
                                + 4 * a * a * x * x * y * y * z * z
                                - 22 * a * x * x * y * y + 2 * x * x + 2 * y * y) *
                            ce;
                    out[12 * stride] +=
                        normalization_primitive_g(a, 3, 0, 1) * 2 * x * z * (poly_x + 3) * ce;
                    out[13 * stride] +=
                        normalization_primitive_g(a, 3, 1, 0) * 2 * x * y * (poly_x + 3) * ce;
                    out[14 * stride] +=
                        normalization_primitive_g(a, 4, 0, 0) *
                            x * x
                            * (4 * a * a * x4 + 4 * a * a * x * x * y * y
                                + 4 * a * a * x * x * z * z - 22 * a * x * x + 12) *
                            ce;
                }
                iorb += 15;
            }
            else if (L == SG_TYPE) {
                const double x6 = x * x * x * x * x * x;
                const double y6 = y * y * y * y * y * y;
                const double z6 = z * z * z * z * z * z;
                for (uint i_prim = 0; i_prim < n_prims; i_prim++) {
                    const double a  = g_constant_basis[ibasis + i_prim];
                    const double ce = g_constant_basis[seg_base + 1 + i_prim] * exp(-a * r2);
                    const double norm_const = normalization_primitive_pure_g(a);
                    // Polynomial shared by the 'c1' and 's1' components.
                    const double m1 = -6 * a * x4 - 12 * a * x * x * y * y
                                        + 2 * a * x * x * z * z - 6 * a * y4
                                        + 2 * a * y * y * z * z + 8 * a * z4
                                        + 33 * x * x + 33 * y * y - 44 * z * z;
                    // ['c0', 'c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4']
                    out[0] +=
                        norm_const *
                            a * (6 * a * x6
                            + 18 * a * x4 * y * y
                            - 42 * a * x4 * z * z
                            + 18 * a * x * x * y4
                            - 84 * a * x * x * y * y * z * z
                            - 32 * a * x * x * z4
                            + 6 * a * y6
                            - 42 * a * y4 * z * z
                            - 32 * a * y * y * z4
                            + 16 * a * z6
                            - 33 * x4 - 66 * x * x * y * y
                            + 264 * x * x * z * z - 33 * y4
                            + 264 * y * y * z * z - 88 * z4) / 4 *
                            ce;
                    out[stride] +=
                        norm_const * sqrt(2.5) * a * x * z * m1 * ce;
                    out[2 * stride] +=
                        norm_const * sqrt(2.5) * a * y * z * m1 * ce;
                    out[3 * stride] +=
                        norm_const *
                            sqrt(5.0) *
                            a * (-2 * a * x6
                            - 2 * a * x4 * y * y
                            + 10 * a * x4 * z * z
                            + 2 * a * x * x * y4
                            + 12 * a * x * x * z4
                            + 2 * a * y6
                            - 10 * a * y4 * z * z
                            - 12 * a * y * y * z4
                            + 11 * x4 - 66 * x * x * z * z
                            - 11 * y4 + 66 * y * y * z * z) / 2 *
                            ce;
                    out[4 * stride] +=
                        norm_const *
                            sqrt(5.0) *
                            a * x * y
                            * (-2 * a * x4 - 4 * a * x * x * y * y
                                + 10 * a * x * x * z * z - 2 * a * y4
                                + 10 * a * y * y * z * z + 12 * a * z4
                                + 11 * x * x + 11 * y * y - 66 * z * z) *
                            ce;
                    out[5 * stride] +=
                        norm_const *
                            sqrt(35.0 / 2.0) *
                            a * x * z
                            * (2 * a * x4 - 4 * a * x * x * y * y
                                + 2 * a * x * x * z * z - 6 * a * y4
                                - 6 * a * y * y * z * z - 11 * x * x + 33 * y * y) *
                            ce;
                    out[6 * stride] +=
                        norm_const *
                            sqrt(35.0 / 2.0) *
                            a * y * z
                            * (6 * a * x4 + 4 * a * x * x * y * y
                                + 6 * a * x * x * z * z - 2 * a * y4
                                - 2 * a * y * y * z * z - 33 * x * x + 11 * y * y) *
                            ce;
                    out[7 * stride] +=
                        norm_const *
                            sqrt(35.0) *
                            a * (2 * a * x6
                            - 10 * a * x4 * y * y
                            - 10 * a * x * x * y4
                            + 2 * a * y6
                            - 10 * x4 + 60 * x * x * y * y
                            - 10 * y4 + (2 * a * z * z - 1)
                            * (x4 - 6 * x * x * y * y
                                + y4)) / 4 *
                            ce;
                    out[8 * stride] +=
                        norm_const *
                            sqrt(35.0) *
                            a * x * y
                            * (2 * a * x4 + 2 * a * x * x * z * z
                                - 2 * a * y4 - 2 * a * y * y * z * z
                                - 11 * x * x + 11 * y * y) *
                            ce;
                }
                iorb += 9;
            }
        } // End going over the segmented shells of one contracted shell.
        // Skip the exponents plus every (angular-momentum code, coefficients) record.
        ibasis += n_prims + n_seg_shells + n_seg_shells * n_prims;
    } // End going over contracted shells.
}


/**
 * Kernel: one thread per grid point. Each in-range thread loads its point's
 * coordinates and forwards them to `eval_AOs_lap`, which accumulates into `d_lap`.
 *
 * @param d_lap       Output buffer handed to `eval_AOs_lap`; it is indexed there as
 *                    `d_lap[idx + iorb * n_pts]` and updated with `+=`.
 * @param d_points    Grid points stored as all x, then all y, then all z
 *                    (coordinate c of point i is at `d_points[i + c * n_pts]`).
 * @param n_pts       Number of grid points; threads with a global index >= n_pts do nothing.
 * @param n_cshells   Not used by this kernel body.
 * @param iorb_start  Starting orbital row offset forwarded to `eval_AOs_lap`.
 *
 * Launch layout: 1-D grid and 1-D blocks, with at least `n_pts` threads in total.
 */
__global__ void chemtools::eval_AOs_lap_from_constant_memory_on_any_grid(
          double* __restrict__ d_lap,
    const double* __restrict__ d_points,
    const int     n_pts,
    const int     n_cshells,
    const int     iorb_start
) {
    // Flat global thread index == index of the grid point handled by this thread.
    uint i_pt = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in the tail of the last block have no point to process.
    if (i_pt >= n_pts) {
        return;
    }

    // Gather (x, y, z) of this point from the coordinate-blocked layout.
    const double3 pt = {
        d_points[i_pt],
        d_points[i_pt + n_pts],
        d_points[i_pt + n_pts * 2]
    };

    // Accumulate the per-orbital values for this point into d_lap.
    eval_AOs_lap(d_lap, pt, n_pts, i_pt, iorb_start);
}


/**
 * Evaluate, for every basis function, the "rho_lap" quantity (sum of second
 * derivatives) on a set of grid points and return it on the host.
 *
 * @param iodata        Source of the molecular orbital basis (`GetOrbitalBasis()`).
 * @param h_points      Host pointer to 3 * knumb_points doubles. The AO kernels in this
 *                      file read coordinate c of point i at `[i + c * knumb_points]`
 *                      (all x, then all y, then all z).
 * @param knumb_points  Number of grid points N.
 * @return Host vector of M * N doubles, where M is the number of basis functions.
 *         The AO kernels in this file write entry (orbital m, point i) at `[i + m * N]`.
 *         Returns an empty vector when M * N == 0.
 */
__host__ std::vector<double> chemtools::evaluate_sum_of_second_derivative_contractions(
    IOData& iodata, const double* h_points, const int knumb_points
){
  // Get the molecular basis from iodata; it is handed to evaluate_scalar_quantity_density below.
  MolecularBasis molecular_basis = iodata.GetOrbitalBasis();
  int knbasisfuncs = molecular_basis.numb_basis_functions();

  // Do the size arithmetic in size_t so M * N cannot overflow a 32-bit int.
  const size_t n_values = static_cast<size_t>(knbasisfuncs) * static_cast<size_t>(knumb_points);

  // Output buffer with M * N entries (one row of N points per basis function).
  std::vector<double> h_contractions(n_values);
  if (n_values == 0) {
    // Nothing to evaluate; avoids a zero-block launch and touching an empty vector.
    return h_contractions;
  }

  // Transfer grid points to GPU (3 * N doubles, same layout as h_points).
  const size_t n_coords = static_cast<size_t>(3) * static_cast<size_t>(knumb_points);
  double* d_points;
  CUDA_CHECK(cudaMalloc((double **) &d_points, sizeof(double) * n_coords));
  CUDA_CHECK(cudaMemcpy(d_points, h_points, sizeof(double) * n_coords, cudaMemcpyHostToDevice));

  // Device output buffer with M * N entries.
  double* d_sum_second_derivs;
  CUDA_CHECK(cudaMalloc((double **) &d_sum_second_derivs, sizeof(double) * n_values));
  // cudaMalloc does not initialise memory and the AO Laplacian code accumulates with `+=`,
  // so the buffer must start at zero (the chunked Laplacian routine in this file does the
  // same before its "rho_lap" call).
  CUDA_CHECK(cudaMemset(d_sum_second_derivs, 0, sizeof(double) * n_values));

  dim3 threadsPerBlock(128);
  dim3 grid((knumb_points + threadsPerBlock.x - 1) / (threadsPerBlock.x));
  evaluate_scalar_quantity_density(
      molecular_basis,
      false,
      false,
      "rho_lap",
      d_sum_second_derivs,
      d_points,
      knumb_points,
      knbasisfuncs,
      threadsPerBlock,
      grid
  );

  // Transfer from device memory to host memory (blocking copy).
  CUDA_CHECK(cudaMemcpy(h_contractions.data(),
                        d_sum_second_derivs,
                        sizeof(double) * n_values, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFree(d_points));
  CUDA_CHECK(cudaFree(d_sum_second_derivs));

  return h_contractions;
}


/**
 * Evaluate the Laplacian of the electron density on a set of grid points,
 * processing the grid in chunks sized by GpuMemoryPartitioner.
 *
 * Per chunk, two contributions are computed on the device and summed on the host:
 *   first term : 2 \sum_{i, j} c_{i, j} [\sum_k \partial^2 \phi_i / \partial x_k^2] \phi_j
 *   second term: 2 \sum_{k in {x,y,z}} \sum_{i, j} c_{i, j} [d\phi_i / dx_k] [d\phi_j / dx_k]
 *
 * @param handle    cuBLAS handle used for the GEMM / GEMV calls.
 * @param iodata    Provides the orbital basis and the one-RDM for the requested spin.
 * @param h_points  Host pointer to 3 * n_pts doubles; coordinate c of point i is read
 *                  from `h_points[c * n_pts + i]`.
 * @param n_pts     Number of grid points.
 * @param spin      Forwarded to `iodata.GetMOOneRDM(spin)`.
 * @return Host vector of length n_pts holding the Laplacian at each point.
 */
__host__ std::vector<double> chemtools::evaluate_laplacian_on_any_grid_handle(
    cublasHandle_t& handle, IOData &iodata, const double* h_points, const int n_pts, const std::string& spin
) {
    const MolecularBasis molbasis = iodata.GetOrbitalBasis();
    const int            nbasis   = molbasis.numb_basis_functions();
    
    // Calculate Optimal Memory Chunks.
    // The lambda budgets, in doubles: nbasis^2 (one-RDM) + nbasis (vector of ones) as fixed
    // cost, and per point 5 * nbasis (AOs + sum of second derivatives + 3 first derivatives)
    // + 3 (coordinates).
    const size_t MAX_PTS_PER_ITER = 64 * 64 * 32;
    auto         chunks           = GpuMemoryPartitioner::compute(
        nbasis,
        [](size_t mem, size_t numb_basis) {
          return ((mem / sizeof(double)) - numb_basis * numb_basis - numb_basis) /
          (5 * numb_basis + 3);
        },
        n_pts,
        MAX_PTS_PER_ITER
    );
    
    // Resulting Laplacian and host staging buffer for the per-derivative contributions
    std::vector<double> h_laplacian(n_pts);
    std::vector<double> h_second_term(chunks.pts_per_iter);
    
    // Allocate all device buffers once up front and reuse them for every chunk.
    double *d_one_rdm = nullptr;
    CUDA_CHECK(cudaMalloc((double **) &d_one_rdm, sizeof(double) * nbasis * nbasis));
    CUBLAS_CHECK(cublasSetMatrix(
        iodata.GetOneRdmShape(),
        iodata.GetOneRdmShape(),
        sizeof(double),
        iodata.GetMOOneRDM(spin),
        iodata.GetOneRdmShape(),
        d_one_rdm,
        iodata.GetOneRdmShape()
    ));
    // Vector of ones used to sum over the basis index via GEMV. The constructor takes an
    // element count (not a byte count), so exactly nbasis entries are needed.
    thrust::device_vector<double> all_ones(nbasis, 1.0);
    double *d_all_ones          = thrust::raw_pointer_cast(all_ones.data());
    double *d_AOs_all           = nullptr;  // AO row-major (M, N)
    double *d_sec_deriv_AOs_all = nullptr;  // sum of second derivatives of AOs, row-major (M, N)
    double *d_AOs_deriv_all     = nullptr;  // AO derivatives row-major (3, M, N)
    double *d_pts               = nullptr;  // grid points of the current chunk, (x..., y..., z...)
    CUDA_CHECK(cudaMalloc((double **) &d_AOs_all, sizeof(double) * nbasis * chunks.pts_per_iter));
    CUDA_CHECK(cudaMalloc((double **) &d_sec_deriv_AOs_all, sizeof(double) * nbasis * chunks.pts_per_iter));
    CUDA_CHECK(cudaMalloc((double **) &d_AOs_deriv_all, sizeof(double) * 3 * chunks.pts_per_iter * nbasis));
    // The point buffer is sized for the largest chunk and reused; previously it was
    // re-allocated on every iteration and never freed.
    CUDA_CHECK(cudaMalloc((double **) &d_pts, sizeof(double) * 3 * chunks.pts_per_iter));
    
    // Working pointers, so the last (shorter) chunk can use an offset into the allocations
    double *d_AOs       = d_AOs_all, *d_AOs_sec_deriv = d_sec_deriv_AOs_all;
    double *d_AOs_deriv = d_AOs_deriv_all;
    
    // GEMM / GEMV scalars: plain product, overwrite the output.
    const double alpha = 1.0, beta = 0.0;
    
    // Iterate through each chunk of the data set.
    size_t index_to_copy = 0;
    size_t i_iter        = 0;
    while (index_to_copy < n_pts) {
        // Calculate number of points to do
        size_t npts_iter = std::min(
            n_pts - i_iter * chunks.pts_per_iter,
            chunks.pts_per_iter
        );
        
        // Only a final, shorter chunk takes this branch: advance the working pointers so
        // that exactly npts_iter-sized sub-buffers remain (still inside the allocations).
        if (npts_iter != chunks.pts_per_iter) {
            d_AOs           = d_AOs + nbasis * (chunks.pts_per_iter - npts_iter);
            d_AOs_deriv     = d_AOs_deriv + 3 * nbasis * (chunks.pts_per_iter - npts_iter);
            d_AOs_sec_deriv = d_AOs_sec_deriv + nbasis * (chunks.pts_per_iter - npts_iter);
        }
        
        // Copy this chunk's grid points, one coordinate at a time, packed with stride npts_iter
        for (int coord = 0; coord < 3; coord++) {
            CUDA_CHECK(cudaMemcpyAsync(
                &d_pts[coord * npts_iter],
                &h_points[coord * n_pts + index_to_copy],
                sizeof(double) * npts_iter,
                cudaMemcpyHostToDevice)
            );
        }
        //*******************************************************************************
        // Compute the first term of the Laplacian:
        //    2 \sum_{i, j}  c_{i, j}  [\sum_k \partial \phi_i^2 \ \partial x_k^2] \phi_j .
        // Evaluate Sum Second Derivatives Of Atomic Orbitals
        constexpr int THREADS_PER_BLOCK = 128;
        dim3          threads(THREADS_PER_BLOCK);
        dim3          blocks((npts_iter + THREADS_PER_BLOCK - 1) / (THREADS_PER_BLOCK));
        CUDA_CHECK(cudaMemset(d_AOs_sec_deriv, 0, sizeof(double) * npts_iter * nbasis));
        evaluate_scalar_quantity_density(
            molbasis,
            false,
            false,
            "rho_lap",
            d_AOs_sec_deriv,
            d_pts,
            npts_iter,
            nbasis,
            threads,
            blocks
        );
        
        // Evaluate Atomic Orbitals
        CUDA_CHECK(cudaMemset(d_AOs, 0, sizeof(double) * npts_iter * nbasis));
        evaluate_scalar_quantity_density(
            molbasis,
            false,
            false,
            "rho",
            d_AOs,
            d_pts,
            npts_iter,
            nbasis,
            threads,
            blocks
        );
        
        // Matrix-Multiplication of the One-RDM with the sum of second derivatives.
        // The last third of the derivative buffer is free at this point and is used as scratch.
        double *d_MO_sum_deriv = &d_AOs_deriv[2 * npts_iter * nbasis];  // (M, N)
        CUBLAS_CHECK(cublasDgemm(
            handle, CUBLAS_OP_N, CUBLAS_OP_N,
            npts_iter, nbasis, nbasis,
            &alpha,
            d_AOs_sec_deriv, npts_iter,
            d_one_rdm, nbasis,
            &beta,
            d_MO_sum_deriv, npts_iter
        ));
        
        // Hadamard Product with the Contractions Array
        // NOTE(review): presumably multiplies the first argument in place by the second — confirm.
        dim3 threadsPerBlock2(320);
        dim3 grid2((npts_iter * nbasis + threadsPerBlock2.x - 1) / (threadsPerBlock2.x));
        hadamard_product<<<grid2, threadsPerBlock2>>>(d_MO_sum_deriv, d_AOs, nbasis, npts_iter);
        CUDA_CHECK(cudaGetLastError());
        
        // Sum over the basis set (GEMV with a vector of ones). The result is written into the
        // last row of d_AOs_sec_deriv, which is no longer needed as GEMM input.
        double *d_first_term = &d_AOs_sec_deriv[(nbasis - 1) * npts_iter];
        CUBLAS_CHECK(cublasDgemv(
            handle, CUBLAS_OP_N, npts_iter, nbasis,
            &alpha, d_MO_sum_deriv, npts_iter, d_all_ones, 1, &beta,
            d_first_term, 1
        ));
        
        // Multiply by two
        dim3 threadsPerBlock3(320);
        dim3 grid3((npts_iter + threadsPerBlock3.x - 1) / (threadsPerBlock3.x));
        multiply_scalar<<< grid3, threadsPerBlock3>>>(d_first_term, 2.0, npts_iter);
        CUDA_CHECK(cudaGetLastError());
        
        // Transfer first term from device memory to host memory.
        CUDA_CHECK(cudaMemcpy(
            &h_laplacian[index_to_copy],
            d_first_term,
            sizeof(double) * npts_iter,
            cudaMemcpyDeviceToHost
        ));
        
        //*******************************************************************************
        // Compute the second term:
        //        2 \sum_{k in {x,y,z}} \sum_{i, j}  c_{i, j}  [d \phi_i \ dx_k] [d\phi_j \ dx_k]
        
        // Evaluate derivatives of AOs (3, M, N) row-order
        CUDA_CHECK(cudaMemset(d_AOs_deriv, 0, sizeof(double) * 3 * npts_iter * nbasis));
        evaluate_scalar_quantity_density(
            molbasis,
            false,
            false,
            "rho_deriv",
            d_AOs_deriv,
            d_pts,
            npts_iter,
            nbasis,
            threads,
            blocks
        );
        // The AO buffer is no longer needed; reuse it as the GEMM output for each derivative.
        CUDA_CHECK(cudaMemset(d_AOs, 0, sizeof(double) * npts_iter * nbasis));
        double *d_MOs_derivi = d_AOs;
        
        for (int i_deriv = 0; i_deriv < 3; i_deriv++) {
            // Ith Deriv AO (M, N) row-major
            double *d_ith_deriv = &d_AOs_deriv[i_deriv * npts_iter * nbasis];
            
            // Matrix multiply one-rdm with the ith derivative of contractions
            CUBLAS_CHECK(cublasDgemm(
                handle, CUBLAS_OP_N, CUBLAS_OP_N,
                npts_iter, nbasis, nbasis,
                &alpha,
                d_ith_deriv, npts_iter,
                d_one_rdm, nbasis,
                &beta,
                d_MOs_derivi, npts_iter
            ));
            
            // Do a hadamard product with MO_ith_deriv and ith_deriv
            dim3 threadsPerBlockHad(320);
            dim3 gridHad((npts_iter * nbasis + threadsPerBlockHad.x - 1) / (threadsPerBlockHad.x));
            hadamard_product<<<gridHad, threadsPerBlockHad>>>(
                d_MOs_derivi, d_ith_deriv, nbasis, npts_iter
            );
            CUDA_CHECK(cudaGetLastError());
            
            // Take the sum. This is done via matrix-vector multiplication of ones.
            // d_first_term is reused as scratch: its earlier content is already on the host.
            CUBLAS_CHECK(cublasDgemv(
                handle, CUBLAS_OP_N, npts_iter, nbasis,
                &alpha, d_MOs_derivi, npts_iter, d_all_ones, 1, &beta,
                d_first_term, 1
            ));
            
            // Multiply by two
            dim3 threadsPerBlockScale(320);
            dim3 gridScale((npts_iter + threadsPerBlockScale.x - 1) / (threadsPerBlockScale.x));
            multiply_scalar<<<gridScale, threadsPerBlockScale>>>(d_first_term, 2.0, npts_iter);
            CUDA_CHECK(cudaGetLastError());
            
            // Copy to host (blocking)
            CUDA_CHECK(cudaMemcpy(h_second_term.data(),
                                  d_first_term,
                                  sizeof(double) * npts_iter,
                                  cudaMemcpyDeviceToHost));
            
            // Add to h_laplacian
            for (size_t i = index_to_copy; i < index_to_copy + npts_iter; i++) {
                h_laplacian[i] += h_second_term[i - index_to_copy];
            }
        }
        
        // Update lower-bound of the grid for the next iteration
        index_to_copy += npts_iter;
        i_iter++;
    }
    CUDA_CHECK(cudaFree(d_pts));
    CUDA_CHECK(cudaFree(d_AOs_all));
    CUDA_CHECK(cudaFree(d_sec_deriv_AOs_all));
    CUDA_CHECK(cudaFree(d_AOs_deriv_all));
    CUDA_CHECK(cudaFree(d_one_rdm));
    all_ones.clear();
    all_ones.shrink_to_fit();
    
    return h_laplacian;
}

/**
 * Convenience wrapper: creates a temporary cuBLAS handle, evaluates the Laplacian
 * with `evaluate_laplacian_on_any_grid_handle`, destroys the handle and returns the result.
 *
 * @param iodata        Forwarded unchanged to the handle-based routine.
 * @param h_points      Host pointer to the grid points, forwarded unchanged.
 * @param knumb_points  Number of grid points.
 * @param spin          Spin selector, forwarded unchanged.
 * @return The vector produced by `evaluate_laplacian_on_any_grid_handle`.
 */
__host__ std::vector<double> chemtools::evaluate_laplacian(
    IOData& iodata, const double* h_points, const int knumb_points, const std::string& spin)
{
  // The handle lives only for the duration of this call.
  cublasHandle_t cublas_handle;
  CUBLAS_CHECK(cublasCreate(&cublas_handle));

  std::vector<double> h_result = evaluate_laplacian_on_any_grid_handle(
      cublas_handle, iodata, h_points, knumb_points, spin
  );

  CUBLAS_CHECK(cublasDestroy(cublas_handle));
  return h_result;
}
