/* Reciprocal space particle mesh Ewald routines for molecular
   simulation programs (see Essmann et al., J. Chem. Phys. 103,
   8577 (1995)). */

#include "build.h"
#include "pme_recip.h"

/* Compute reciprocal-space contributions to electrostatic energy,
   virial, and forces, using the PME method. */
void pme_recip(int n_atoms, int n_grid_x, int n_grid_y, int n_grid_z,
   double **scaled_atom_coords, double **gridded_atom_coords,
   int B_spline_order, double **M_x, double **M_y, double **M_z,
   double **dM_du_x, double **dM_du_y, double **dM_du_z,
   double *atom_charges, fft elec,
   double volume, double alpha,
   double *b_1_un, double *b_2_un, double *b_3_un,
   double *b_mod2_x, double *b_mod2_y, double *b_mod2_z,
   double *pe_coul_recip, double **vir_coul_recip,
   double **atom_forces_coul_recip)
{
#if defined(COMPAQ)
   int ldx, ldy, sx, sy, sz, status;
#endif

int n_grid, i, j;

#if defined(COMPAQ)
   /* Initialize temporary variables for fft routines. */
   ldx = n_grid_x;
   ldy = n_grid_y;
   sx = sy = sz = 1;
#endif

   /* Zero long-range Coulomb forces. */
   *pe_coul_recip = 0.0;
   for (i = 0; i < 3; ++i)
      for (j = i; j < 3; ++j)
         vir_coul_recip[i][j] = 0.0;

   for (i = 0; i < n_atoms; ++i) {
      atom_forces_coul_recip[i][0] = 0.0;
      atom_forces_coul_recip[i][1] = 0.0;
      atom_forces_coul_recip[i][2] = 0.0;
   }

   /* Calculate gridded atomic coordinates. */
   gridded_atomic_coords(n_atoms, atom_charges, n_grid_x, n_grid_y, n_grid_z,
      scaled_atom_coords, gridded_atom_coords);

   /* Calculate B-spline coefficients and their first derivatives
      for all atoms. */
   B_spline_coefficients(n_atoms, gridded_atom_coords, atom_charges,
      B_spline_order, M_x, M_y, M_z, dM_du_x, dM_du_y, dM_du_z);

   /* Map atomic charges to grid. */
   grid_atomic_charges(n_atoms, gridded_atom_coords,
      atom_charges, n_grid_x, n_grid_y, n_grid_z,
      B_spline_order, M_x, M_y, M_z, elec);

   /* Compute FFT of charge array. */
#if defined(COMPAQ)
   status = zfft_apply_3d_("R", "R", "F",
      (&elec)->Q_linear_re, (&elec)->Q_linear_im, 
      (&elec)->Q_linear_re, (&elec)->Q_linear_im, &ldx, &ldy, &(elec.coeff), 
      &sx, &sy, &sz);
#elif defined(SGI)
   zfft3d(1, n_grid_x, n_grid_y, n_grid_z, (&elec)->Q_linear,
      n_grid_x, n_grid_y, (&elec)->coeff);
#elif defined(FFTW)
   fftwnd_one((&elec)->pfw, (&elec)->Q_linear, NULL);
#endif

   /* Compute reciprocal-space contribution to electrostatic energy
      and virial. */
/* pme_recip_energy_virial(volume, alpha,
      n_grid_x, n_grid_y, n_grid_z,
      b_1_un, b_2_un, b_3_un, B_spline_order, elec,
      b_mod2_x, b_mod2_y, b_mod2_z,
      pe_coul_recip, vir_coul_recip);
*/

   pme_recip_energy_virial_half(volume, alpha,
      n_grid_x, n_grid_y, n_grid_z,
      b_1_un, b_2_un, b_3_un, B_spline_order, elec,
      b_mod2_x, b_mod2_y, b_mod2_z,
      pe_coul_recip, vir_coul_recip);

   /* Compute inverse FFT of charge array. */
#if defined(COMPAQ)
   status = zfft_apply_3d_("R", "R", "B",
      (&elec)->Q_linear_re, (&elec)->Q_linear_im, 
      (&elec)->Q_linear_re, (&elec)->Q_linear_im,
      &ldx, &ldy, &(elec.coeff), &sx, &sy, &sz);
#elif defined(SGI)
   zfft3d(-1, n_grid_x, n_grid_y, n_grid_z, (&elec)->Q_linear,
      n_grid_x, n_grid_y, (&elec)->coeff);
#elif defined(FFTW)
   fftwnd_one((&elec)->pbw, (&elec)->Q_linear, NULL);
#endif

   /* Compute reciprocal-space contribution to electrostatic forces. */
   pme_recip_forces(n_atoms, atom_charges,
      gridded_atom_coords,
      n_grid_x, n_grid_y, n_grid_z,
      b_1_un, b_2_un, b_3_un, B_spline_order, M_x, M_y, M_z,
      dM_du_x, dM_du_y, dM_du_z,
      elec, atom_forces_coul_recip);
}

/* Calculate squared moduli of 1d discrete Fourier transforms of
   B-spline coefficient array. */
void B_spline_dft_moduli(int B_spline_order,
   int n_grid_x, int n_grid_y, int n_grid_z,
   double *b_mod2_x, double *b_mod2_y, double *b_mod2_z)
{
   double *M_x_tmp, *M_y_tmp, *M_z_tmp, *u, *dM_du_x_tmp, *dM_du_y_tmp, 
     *dM_du_z_tmp;

   /* Allocate memory for temporary B-spline arrays. */
   M_x_tmp = allocate_1d_array(B_spline_order, sizeof(double));
   M_y_tmp = allocate_1d_array(B_spline_order, sizeof(double));
   M_z_tmp = allocate_1d_array(B_spline_order, sizeof(double));
   dM_du_x_tmp = allocate_1d_array(B_spline_order, sizeof(double));
   dM_du_y_tmp = allocate_1d_array(B_spline_order, sizeof(double));
   dM_du_z_tmp = allocate_1d_array(B_spline_order, sizeof(double));

   /* Set distance to nearest (lower) grid point to zero. */
   u = allocate_1d_array(3, sizeof(double));
   u[0] = u[1] = u[2] = 0.0;

   /* Calculate B-spline coefficients of order B_spline_order. */
   B_spline_single(u, B_spline_order, M_x_tmp, M_y_tmp, M_z_tmp, dM_du_x_tmp,
                   dM_du_y_tmp, dM_du_z_tmp);

   /* Calculate squared moduli of 1d discrete Fourier transforms of
      B-spline coefficient array. */
   B_spline_dft_modulus(B_spline_order, n_grid_x, M_x_tmp, b_mod2_x);
   B_spline_dft_modulus(B_spline_order, n_grid_y, M_x_tmp, b_mod2_y);
   B_spline_dft_modulus(B_spline_order, n_grid_z, M_x_tmp, b_mod2_z);
   /* Free up memory. */
   free_1d_array(M_x_tmp);
   free_1d_array(M_y_tmp);
   free_1d_array(M_z_tmp);
   free_1d_array(u);
   free_1d_array(dM_du_x_tmp);
   free_1d_array(dM_du_y_tmp);
   free_1d_array(dM_du_z_tmp);
}

#define TINY 1.0e-7

/* Calculate squared modulus of 1d discrete Fourier transform of
   B-spline coefficient array. */
void B_spline_dft_modulus(int B_spline_order, int n_grid, double *M_tmp, 
                          double *b_mod2)
{
   int m, k, m_minus, m_plus;
#if defined(COMPAQ)
   double sum_re, sum_im;
#elif defined(SGI)
   zomplex sum;
#elif defined(FFTW)
   fftw_complex sum;
#endif
   double two_pi_over_n_grid, argument;

   /* Calculate squared modulus of discrete Fourier transform
      of M_tmp. */
   two_pi_over_n_grid = TWO_PI / n_grid;
   for (m = 0; m < n_grid; ++m) {
#if defined(COMPAQ)
      sum_re = 0.0;
      sum_im = 0.0;
      for (k = 0; k < B_spline_order; ++k) {
         argument = two_pi_over_n_grid * m * k;
         sum_re += M_tmp[k] * cos(argument);
         sum_im += M_tmp[k] * sin(argument);
      }
      b_mod2[m] = sum_re * sum_re + sum_im * sum_im;
#elif defined(SGI)
      sum.re = 0.0;
      sum.im = 0.0;
      for (k = 0; k < B_spline_order; ++k) {
         argument = two_pi_over_n_grid * m * k;
         sum.re += M_tmp[k] * cos(argument);
         sum.im += M_tmp[k] * sin(argument);
      }
      b_mod2[m] = sum.re * sum.re + sum.im * sum.im;
#elif defined(FFTW)
      sum.re = 0.0;
      sum.im = 0.0;
      for (k = 0; k < B_spline_order; ++k) {
         argument = two_pi_over_n_grid * m * k;
         sum.re += M_tmp[k] * cos(argument);
         sum.im += M_tmp[k] * sin(argument);
      }
      b_mod2[m] = sum.re * sum.re + sum.im * sum.im;
#endif
   }
   for (m = 0; m < n_grid; ++m)
      if (b_mod2[m] < TINY) {
         m_minus = m - 1;
         m_minus = (m_minus + n_grid) % n_grid;
         m_plus = m + 1;
         m_plus = (m_plus + n_grid) % n_grid;
         b_mod2[m] = 0.5 * (b_mod2[m_minus] + b_mod2[m_plus]);
      }
}

#undef TINY

/* Calculate B-spline coefficients of order B_spline_order 
   for a single atom. */
void B_spline_single(double *u, int B_spline_order,
   double *M_x_ptr, double *M_y_ptr, double *M_z_ptr,
   double *dM_du_x_ptr, double *dM_du_y_ptr, double *dM_du_z_ptr)
{
   int j;

   /* Calculate second-order B-spline coefficients (in general, we
      calculate M_j(u - k), where j is the order of the B spline
      (2 in this case); the first index of the array M is k + j - 1,
      where k is the offset from the grid interval containing
      the atom). */
   M_x_ptr[0] = 1.0 - u[0];
   M_y_ptr[0] = 1.0 - u[1];
   M_z_ptr[0] = 1.0 - u[2];
   M_x_ptr[1] = u[0];
   M_y_ptr[1] = u[1];
   M_z_ptr[1] = u[2];

   /* Generate B-spline coefficients up to order B_spline_order - 1
      by recursion. */
   for (j = 3; j < B_spline_order; ++j)
      B_spline_recursion(u, j, M_x_ptr, M_y_ptr, M_z_ptr);

   /* Generate first derivatives of B-spline coefficients of
      order B_spline_order from B-spline coefficients of order
      B_spline_order - 1. */
   B_spline_derivative(B_spline_order, M_x_ptr, M_y_ptr, M_z_ptr,
      dM_du_x_ptr, dM_du_y_ptr, dM_du_z_ptr);

   /* Generate B-spline coefficients of order B_spline_order
      by recursion. */
   B_spline_recursion(u, B_spline_order, M_x_ptr, M_y_ptr, M_z_ptr);
}

/* Use recursion to generate B-spline coefficients of order j from
   those of order j - 1 (see Eq. 4.1 of Essmann et al.). */
void B_spline_recursion(double *u, int j, double *M_x_ptr, double *M_y_ptr,
double *M_z_ptr)
{
   int k;
   double norm;

   /* Compute normalization factor. */
   norm = 1.0 / (j - 1);

   /* Generate M_j(u) from M_{j-1}(u). */
   M_x_ptr[j-1] = norm * u[0] * M_x_ptr[j-2];
   M_y_ptr[j-1] = norm * u[1] * M_y_ptr[j-2];
   M_z_ptr[j-1] = norm * u[2] * M_z_ptr[j-2];

   /* Loop over grid points, generating M_j(u - k) from M_{j-1}(u - k)
      and M_{j-1}(u - k - 1). */
   for (k = - 1; k > - (j - 1); --k) {
      M_x_ptr[j+k-1] = norm * ((u[0] - k) * M_x_ptr[j+k-2]
         + (j + k - u[0]) * M_x_ptr[j+k-1]);
      M_y_ptr[j+k-1] = norm * ((u[1] - k) * M_y_ptr[j+k-2]
         + (j + k - u[1]) * M_y_ptr[j+k-1]);
      M_z_ptr[j+k-1] = norm * ((u[2] - k) * M_z_ptr[j+k-2]
         + (j + k - u[2]) * M_z_ptr[j+k-1]);
   }

   /* Generate M_j(u + j - 1) from M_{j-1}(u + j - 2). */
   M_x_ptr[0] = norm * (1 - u[0]) * M_x_ptr[0];
   M_y_ptr[0] = norm * (1 - u[1]) * M_y_ptr[0];
   M_z_ptr[0] = norm * (1 - u[2]) * M_z_ptr[0];
}

/* Calculate gridded atomic coordinates, which range from 0 to n_grid_x
   in x, 0 to n_grid_y in y, etc. */
void gridded_atomic_coords(int n_atoms, double *atom_charges,
   int n_grid_x, int n_grid_y, int n_grid_z,
   double **scaled_atom_coords, double **gridded_atom_coords)
{
   int i;

   /* Calculate gridded atomic coordinates for all atoms. */
   for (i = 0; i < n_atoms; ++i) 
      if (atom_charges[i] != 0.0){
      gridded_atom_coords[i][0] = n_grid_x * (scaled_atom_coords[i][0] + 0.5);
      gridded_atom_coords[i][1] = n_grid_y * (scaled_atom_coords[i][1] + 0.5);
      gridded_atom_coords[i][2] = n_grid_z * (scaled_atom_coords[i][2] + 0.5);
      }
}

/* Calculate B-spline coefficients of order B_spline_order and their
   first derivatives for all atoms. */
void B_spline_coefficients(int n_atoms, double **gridded_atom_coords,
   double *atom_charges, int B_spline_order, double **M_x, double **M_y, 
   double **M_z, double **dM_du_x, double **dM_du_y, double **dM_du_z)
{
   int i;
   double u[3], *M_x_ptr, *M_y_ptr, *M_z_ptr, *dM_du_x_ptr, *dM_du_y_ptr,
      *dM_du_z_ptr;

   /* Loop over atoms. */
   for (i = 0; i < n_atoms; ++i) {

     if (atom_charges[i] != 0.0){
      /* Calculate distance from atom to nearest (lower) grid point. */
      u[0] = gridded_atom_coords[i][0] - (int) gridded_atom_coords[i][0];
      u[1] = gridded_atom_coords[i][1] - (int) gridded_atom_coords[i][1];
      u[2] = gridded_atom_coords[i][2] - (int) gridded_atom_coords[i][2];

      /* Set pointers to B-spline coefficient array and first
         derivative of B-spline coefficient array for atom i. */
      M_x_ptr = M_x[i];
      M_y_ptr = M_y[i];
      M_z_ptr = M_z[i];
      dM_du_x_ptr = dM_du_x[i];
      dM_du_y_ptr = dM_du_y[i];
      dM_du_z_ptr = dM_du_z[i];

      /* Calculate B-spline coefficients of order B_spline_order and
         their first derivatives for atom i. */
      B_spline_single(u, B_spline_order, M_x_ptr, M_y_ptr, M_z_ptr, 
                      dM_du_x_ptr, dM_du_y_ptr, dM_du_z_ptr);
    }
   }
}

/* Generate derivatives of B-spline coefficients of order j from B-spline
   coefficients of order j - 1 (see Eq. 4.2 of Essmann et al.). */
void B_spline_derivative(int j, double *M_x_ptr, double *M_y_ptr,
    double *M_z_ptr, double *dM_du_x_ptr, double *dM_du_y_ptr,
    double *dM_du_z_ptr)
{
   int k;

   /* Generate dM_j/du(u) from M_{j-1}(u). */
   dM_du_x_ptr[j-1] = M_x_ptr[j-2];
   dM_du_y_ptr[j-1] = M_y_ptr[j-2];
   dM_du_z_ptr[j-1] = M_z_ptr[j-2];

   /* Loop over grid points, generating dM_j/du(u - k)
      from M_{j-1}(u - k) and M_{j-1}(u - k - 1). */
   for (k = - 1; k > - (j - 1); --k) {
      dM_du_x_ptr[j+k-1] = M_x_ptr[j+k-2] - M_x_ptr[j+k-1];
      dM_du_y_ptr[j+k-1] = M_y_ptr[j+k-2] - M_y_ptr[j+k-1];
      dM_du_z_ptr[j+k-1] = M_z_ptr[j+k-2] - M_z_ptr[j+k-1];
   }

   /* Generate dM_j/du(u + j - 1) from M_{j-1}(u + j - 2). */
   dM_du_x_ptr[0] = - M_x_ptr[0];
   dM_du_y_ptr[0] = - M_y_ptr[0];
   dM_du_z_ptr[0] = - M_z_ptr[0];
}

/* Map atomic charges to grid, using B-spline interpolation. */
void grid_atomic_charges(int n_atoms, double **gridded_atom_coords,
   double *atom_charges, int n_grid_x, int n_grid_y, int n_grid_z,
   int B_spline_order, double **M_x, double **M_y, double **M_z, 
   fft elec)
{
   double *M_x_ptr, *M_y_ptr, *M_z_ptr;
   int n_grid_xyz, index_z, index_yz, index_xyz,
      ix, iy, iz, i, ix_0, iy_0, iz_0, kx, ky, kz;
   double charge, prod_z, prod_yz, prod_xyz;

   /* Zero charge array. */
   n_grid_xyz = n_grid_x * n_grid_y * n_grid_z;

   for (index_xyz = 0; index_xyz < n_grid_xyz; ++index_xyz)
#if defined(COMPAQ)
      elec.Q_linear_re[index_xyz] = elec.Q_linear_im[index_xyz] = 0.0;
#elif defined(SGI)
      (elec.Q_linear[index_xyz]).re = (elec.Q_linear[index_xyz]).im = 0.0;
#elif defined(FFTW)
      (elec.Q_linear[index_xyz]).re = (elec.Q_linear[index_xyz]).im = 0.0;
#endif

   /* Loop over atoms. */
   for (i = 0; i < n_atoms; ++i) {

     if (atom_charges[i] != 0.0) {

      /* Compute origin of grid domain to which to map charge of atom i. */
      ix_0 = (int) (gridded_atom_coords[i][0]) - B_spline_order + 1;
      iy_0 = (int) (gridded_atom_coords[i][1]) - B_spline_order + 1;
      iz_0 = (int) (gridded_atom_coords[i][2]) - B_spline_order + 1;

      /* Set pointer to B-spline coefficient array for atom i. */
      M_x_ptr = M_x[i];
      M_y_ptr = M_y[i];
      M_z_ptr = M_z[i];

      /* Get charge of atom i. */
      charge = atom_charges[i];
      /* Map charge of atom i to grid domain and add contributions to
         charge array (see Eq. 3.7 of Essmann et al.). */
      for (kz = 0; kz < B_spline_order; ++kz) {
         iz = iz_0 + kz;
         iz = (iz + n_grid_z) % n_grid_z;
         index_z = n_grid_y * iz;
         prod_z = M_z_ptr[kz] * charge;
         for (ky = 0; ky < B_spline_order; ++ky) {
            iy = iy_0 + ky;
            iy = (iy + n_grid_y) % n_grid_y;
            index_yz = n_grid_x * (iy + index_z);
            prod_yz = M_y_ptr[ky] * prod_z;
            for (kx = 0; kx < B_spline_order; ++kx) {
               ix = ix_0 + kx;
               ix = (ix + n_grid_x) % n_grid_x;
               index_xyz = ix + index_yz;
               prod_xyz = M_x_ptr[kx] * prod_yz;
#if defined(COMPAQ)
               elec.Q_linear_re[index_xyz] += prod_xyz;
#elif defined(SGI)
               (elec.Q_linear[index_xyz]).re += prod_xyz;
#elif defined(FFTW)
            (elec.Q_linear[index_xyz]).re += prod_xyz;
#endif

            }
         }
      }
    }
   }
}

/* Compute reciprocal-space contribution to electrostatic energy
   and virial, using the PME method. */
void pme_recip_energy_virial(double volume, double alpha,
   int n_grid_x, int n_grid_y, int n_grid_z,
   double *b_1_un, double *b_2_un, double *b_3_un,
   int B_spline_order, fft elec,
   double *b_mod2_x, double *b_mod2_y, double *b_mod2_z,
   double *pe_coul_recip, double **vir_coul_recip)
{
   int n_nyq_x, n_nyq_y, n_nyq_z,
      i, j, mx, my, mz, mx_fold, my_fold, mz_fold,
      index_z, index_yz, index_xyz;
   double two_pi_volume, pi_over_alpha, pi_over_alpha2,
      denom_z, denom_yz, denom_xyz, G_x, G_y, G_z, G2,
      argument, kernel, v_coul, struct2, u_coul;

   /* Compute Nyquist "frequencies". */
   n_nyq_x = n_grid_x / 2;
   n_nyq_y = n_grid_y / 2;
   n_nyq_z = n_grid_z / 2;

   /* Compute various constants. */
   two_pi_volume = 2.0 * PI * volume;
   pi_over_alpha = PI / alpha;
   pi_over_alpha2 = SQR(pi_over_alpha);

   /* Zero accumulators. */
   *pe_coul_recip = 0.0;
   for (i = 0; i < 3; ++i)
      for (j = i; j < 3; ++j)
         vir_coul_recip[i][j] = 0.0;

   /* Loop over elements in Fourier transformed charge array,
      folding reciprocal lattice vectors into a domain centered
      on the origin. */
   for (mz = 0; mz < n_grid_z; ++mz) {
      index_z = n_grid_y * mz;
      denom_z = b_mod2_z[mz] * two_pi_volume;
      mz_fold = (mz > n_nyq_z) ? mz - n_grid_z : mz;
      for (my = 0; my < n_grid_y; ++my) {
         index_yz = n_grid_x * (my + index_z);
         denom_yz = b_mod2_y[my] * denom_z;
         my_fold = (my > n_nyq_y) ? my - n_grid_y : my;
         for (mx = 0; mx < n_grid_x; ++mx) {
            index_xyz = mx + index_yz;
            if (index_xyz > 0) {
               denom_xyz = b_mod2_x[mx] * denom_yz;
               mx_fold = (mx > n_nyq_x) ? mx - n_grid_x : mx;

               /* Compute components and squared magnitude of
                  reciprocal lattice vector. */
               G_x = mx_fold * b_1_un[0];
               G_y = mx_fold * b_1_un[1] + my_fold * b_2_un[1];
               G_z = mx_fold * b_1_un[2] + my_fold * b_2_un[2] 
                     + mz_fold * b_3_un[2];
               G2 = SQR(G_x) + SQR(G_y) + SQR(G_z);

               /* Compute contributions to energy and virial. */
               denom_xyz *= G2;
               argument = pi_over_alpha2 * G2;
               kernel = exp(- argument) / denom_xyz;
               v_coul = 2.0 * (1.0 + argument) / G2;
#if defined(COMPAQ)
                  struct2 = SQR(elec.Q_linear_re[index_xyz])
                     + SQR(elec.Q_linear_im[index_xyz]);
#elif defined(SGI)
                  struct2 = SQR((elec.Q_linear[index_xyz]).re)
                     + SQR((elec.Q_linear[index_xyz]).im);
#elif defined(FFTW)
                  struct2 = SQR((elec.Q_linear[index_xyz]).re)
                     + SQR((elec.Q_linear[index_xyz]).im);
#endif
               u_coul = kernel * struct2;
               *pe_coul_recip += u_coul;
               vir_coul_recip[0][0] += u_coul * (v_coul * SQR(G_x) - 1.0);
               vir_coul_recip[1][1] += u_coul * (v_coul * SQR(G_y) - 1.0);
               vir_coul_recip[2][2] += u_coul * (v_coul * SQR(G_z) - 1.0);
               vir_coul_recip[0][1] += u_coul * (v_coul * G_x * G_y);
               vir_coul_recip[0][2] += u_coul * (v_coul * G_x * G_z);
               vir_coul_recip[1][2] += u_coul * (v_coul * G_y * G_z);

               /* Prepare charge grid for force calculation. */
#if defined(COMPAQ)
               elec.Q_linear_re[index_xyz] *= 2.0 * kernel;
               elec.Q_linear_im[index_xyz] *= 2.0 * kernel;
#elif defined(SGI)
               (elec.Q_linear[index_xyz]).re *= 2.0 * kernel;
               (elec.Q_linear[index_xyz]).im *= 2.0 * kernel;
#elif defined(FFTW)
               (elec.Q_linear[index_xyz]).re *= 2.0 * kernel;
               (elec.Q_linear[index_xyz]).im *= 2.0 * kernel;
#endif

            }
         }
      }
   }
}

/* Compute reciprocal-space contribution to electrostatic energy
   and virial, using the PME method. Only half of reciprocal space
   is summed over (this routine takes advantage of the inversion
   symmetry of the structure factor). */
void pme_recip_energy_virial_half(double volume, double alpha,
   int n_grid_x, int n_grid_y, int n_grid_z,
   double *b_1_un, double *b_2_un, double *b_3_un,
   int B_spline_order, fft elec,
   double *b_mod2_x, double *b_mod2_y, double *b_mod2_z,
   double *pe_coul_recip, double **vir_coul_recip)
{
   int n_nyq_x, n_nyq_y, n_nyq_z,
      i, j, mx, my, mz, mx_min, my_min, mx_min_tmp, my_min_tmp,
      mx_fold, my_fold, mx_minus, my_minus, mz_minus,
      index_z, index_yz, index_xyz,
      index_z_minus, index_yz_minus, index_xyz_minus;
   double pi_volume, pi_over_alpha, pi_over_alpha2, factor,
      denom_z, denom_yz, denom_xyz, G_x, G_y, G_z, G2,
      argument, kernel, v_coul, struct2, u_coul;

   /* Compute Nyquist "frequencies". */
   n_nyq_x = n_grid_x / 2;
   n_nyq_y = n_grid_y / 2;
   n_nyq_z = n_grid_z / 2;

   /* Compute minimum frequencies for reciprocal-space sum. */
   if (n_grid_x % 2 == 0)
      mx_min = - n_nyq_x + 1;
   else
      mx_min = - n_nyq_x;
   if (n_grid_y % 2 == 0)
      my_min = - n_nyq_y + 1;
   else
      my_min = - n_nyq_y;

   /* Compute various constants. */
   pi_volume = PI * volume;
   pi_over_alpha = PI / alpha;
   pi_over_alpha2 = SQR(pi_over_alpha);

   /* Zero accumulators. */
   *pe_coul_recip = 0.0;
   for (i = 0; i < 3; ++i)
      for (j = i; j < 3; ++j)
         vir_coul_recip[i][j] = 0.0;

   /* Loop over elements in Fourier transformed charge array,
      folding reciprocal lattice vectors into a domain centered
      on the origin. */
/*   factor = pi_volume / COUL_COUPLING; */
   my_min_tmp = 0;
   mx_min_tmp = 1;
   for (mz = 0; mz <= n_nyq_z; ++mz) {
      mz_minus = - mz;
      mz_minus = (mz_minus < 0) ? mz_minus + n_grid_z : mz_minus;
      index_z = n_grid_y * mz;
      index_z_minus = n_grid_y * mz_minus;
/*      denom_z = b_mod2_z[mz] * factor; */
      denom_z = b_mod2_z[mz] * pi_volume; 
      for (my = my_min_tmp; my <= n_nyq_y; ++my) {
         my_fold = (my < 0) ? my + n_grid_y : my;
         my_minus = - my;
         my_minus = (my_minus < 0) ? my_minus + n_grid_y : my_minus;
         index_yz = n_grid_x * (my_fold + index_z);
         index_yz_minus = n_grid_x * (my_minus + index_z_minus);
         denom_yz = b_mod2_y[my_fold] * denom_z;
         for (mx = mx_min_tmp; mx <= n_nyq_x; ++mx) {
            mx_fold = (mx < 0) ? mx + n_grid_x : mx;
            mx_minus = - mx;
            mx_minus = (mx_minus < 0) ? mx_minus + n_grid_x : mx_minus;
            index_xyz = mx_fold + index_yz;
            index_xyz_minus = mx_minus + index_yz_minus;
            denom_xyz = b_mod2_x[mx_fold] * denom_yz;

            /* Compute components and squared magnitude of
               reciprocal lattice vector. */
            G_x = mx * b_1_un[0];
            G_y = mx * b_1_un[1] + my * b_2_un[1];
            G_z = mx * b_1_un[2] + my * b_2_un[2] + mz * b_3_un[2];
            G2 = SQR(G_x) + SQR(G_y) + SQR(G_z);

            /* Compute contributions to energy and virial. */
            denom_xyz *= G2;
            argument = pi_over_alpha2 * G2;
            kernel = exp(- argument) / denom_xyz;
            v_coul = 2.0 * (1.0 + argument) / G2;
#if defined(COMPAQ)
               struct2 = SQR(elec.Q_linear_re[index_xyz])
                  + SQR(elec.Q_linear_im[index_xyz]);
#elif defined(SGI)
               struct2 = SQR((elec.Q_linear[index_xyz]).re)
                  + SQR((elec.Q_linear[index_xyz]).im);
#elif defined(FFTW)
               struct2 = SQR((elec.Q_linear[index_xyz]).re)
                  + SQR((elec.Q_linear[index_xyz]).im);
#endif
               u_coul = kernel * struct2;
               *pe_coul_recip += u_coul;
               vir_coul_recip[0][0] += u_coul * (v_coul * SQR(G_x) - 1.0);
               vir_coul_recip[1][1] += u_coul * (v_coul * SQR(G_y) - 1.0);
               vir_coul_recip[2][2] += u_coul * (v_coul * SQR(G_z) - 1.0);
               vir_coul_recip[0][1] += u_coul * (v_coul * G_x * G_y);
               vir_coul_recip[0][2] += u_coul * (v_coul * G_x * G_z);
               vir_coul_recip[1][2] += u_coul * (v_coul * G_y * G_z);

            /* Prepare charge grid for force calculation. */
#if defined(COMPAQ)
            elec.Q_linear_re[index_xyz] *= kernel;
            elec.Q_linear_im[index_xyz] *= kernel;
            elec.Q_linear_re[index_xyz_minus] *= kernel;
            elec.Q_linear_im[index_xyz_minus] *= kernel;
#elif defined(SGI)
            (elec.Q_linear[index_xyz]).re *= kernel;
            (elec.Q_linear[index_xyz]).im *= kernel;
            (elec.Q_linear[index_xyz_minus]).re *= kernel;
            (elec.Q_linear[index_xyz_minus]).im *= kernel;
#elif defined(FFTW)
           (elec.Q_linear[index_xyz]).re *= kernel;
            (elec.Q_linear[index_xyz]).im *= kernel;
            (elec.Q_linear[index_xyz_minus]).re *= kernel;
            (elec.Q_linear[index_xyz_minus]).im *= kernel;
#endif
         }
         mx_min_tmp = mx_min;
      }
      my_min_tmp = my_min;
   }
}

/* Compute reciprocal-space contribution to electrostatic forces,
   using the PME method. */
void pme_recip_forces(int n_atoms, double *atom_charges,
   double **gridded_atom_coords,
   int n_grid_x, int n_grid_y, int n_grid_z,
   double *b_1_un, double *b_2_un, double *b_3_un,
   int B_spline_order, double **M_x, double **M_y, double **M_z,
   double **dM_du_x, double **dM_du_y, double **dM_du_z,
   fft elec, double **atom_forces_coul_recip)
{
   double *M_x_ptr, *M_y_ptr, *M_z_ptr, *dM_du_x_ptr, *dM_du_y_ptr, 
      *dM_du_z_ptr;
   int i, ix_0, iy_0, iz_0, kx, ky, kz, ix, iy, iz,
      index_z, index_yz, index_xyz, n_grid_xyz;
   double f_1, f_2, f_3, charge, prod, prod_1, prod_2, prod_3,
      prod_1_z, prod_2_z, prod_3_z, prod_1_yz, prod_2_yz, prod_3_yz,
      prod_1_xyz, prod_2_xyz, prod_3_xyz,
      f_x, f_y, f_z, f_tot_x, f_tot_y, f_tot_z;


   /* Zero total force accumulators. */
   f_tot_x = f_tot_y = f_tot_z = 0.0;

   /* Loop over atoms. */
   n_grid_xyz = n_grid_x *  n_grid_y *  n_grid_z;
   for (i = 0; i < n_atoms; ++i) {

     if (atom_charges[i] != 0.0){
 
      /* Zero force accumulators. */
      f_1 = f_2 = f_3 = 0.0;

      /* Compute origin of grid domain over which to sum contributions to
         force on atom i. */
      ix_0 = (int) (gridded_atom_coords[i][0]) - B_spline_order + 1;
      iy_0 = (int) (gridded_atom_coords[i][1]) - B_spline_order + 1;
      iz_0 = (int) (gridded_atom_coords[i][2]) - B_spline_order + 1;

      /* Set pointers to arrays of B-spline coefficients and their
         first derivatives for atom i. */
      M_x_ptr = M_x[i];
      M_y_ptr = M_y[i];
      M_z_ptr = M_z[i];
      dM_du_x_ptr = dM_du_x[i];
      dM_du_y_ptr = dM_du_y[i];
      dM_du_z_ptr = dM_du_z[i];

      /* Get charge of atom i. */
      charge = atom_charges[i];
#if defined(COMPAQ)
      prod = n_grid_xyz * charge;
#elif defined(SGI)
      prod = charge;
#elif defined(FFTW)
      prod = charge;
#endif

      prod_1 = - n_grid_x * prod;
      prod_2 = - n_grid_y * prod;
      prod_3 = - n_grid_z * prod;

      /* Loop over grid domain, summing contributions to force on atom i. */
      for (kz = 0; kz < B_spline_order; ++kz) {
         iz = iz_0 + kz;
         iz = (iz + n_grid_z) % n_grid_z;
         index_z = n_grid_y * iz;
         prod_1_z = M_z_ptr[kz] * prod_1;
         prod_2_z = M_z_ptr[kz] * prod_2;
         prod_3_z = dM_du_z_ptr[kz] * prod_3;
         for (ky = 0; ky < B_spline_order; ++ky) {
            iy = iy_0 + ky;
            iy = (iy + n_grid_y) % n_grid_y;
            index_yz = n_grid_x * (iy + index_z);
            prod_1_yz = M_y_ptr[ky] * prod_1_z;
            prod_2_yz = dM_du_y_ptr[ky] * prod_2_z;
            prod_3_yz = M_y_ptr[ky] * prod_3_z;
            for (kx = 0; kx < B_spline_order; ++kx) {
               ix = ix_0 + kx;
               ix = (ix + n_grid_x) % n_grid_x;
               index_xyz = ix + index_yz;
               prod_1_xyz = dM_du_x_ptr[kx] * prod_1_yz;
               prod_2_xyz = M_x_ptr[kx] * prod_2_yz;
               prod_3_xyz = M_x_ptr[kx] * prod_3_yz;
#if defined(COMPAQ)
               f_1 += prod_1_xyz * elec.Q_linear_re[index_xyz];
               f_2 += prod_2_xyz * elec.Q_linear_re[index_xyz];
               f_3 += prod_3_xyz * elec.Q_linear_re[index_xyz];
#elif defined(SGI)
               f_1 += prod_1_xyz * (elec.Q_linear[index_xyz]).re;
               f_2 += prod_2_xyz * (elec.Q_linear[index_xyz]).re;
               f_3 += prod_3_xyz * (elec.Q_linear[index_xyz]).re;
#elif defined(FFTW)
               f_1 += prod_1_xyz * (elec.Q_linear[index_xyz]).re;
               f_2 += prod_2_xyz * (elec.Q_linear[index_xyz]).re;
               f_3 += prod_3_xyz * (elec.Q_linear[index_xyz]).re;
#endif

            }
         }
      }
      f_x = b_1_un[0] * f_1;
      f_y = b_1_un[1] * f_1 + b_2_un[1] * f_2;
      f_z = b_1_un[2] * f_1 + b_2_un[2] * f_2 + b_3_un[2] * f_3;
      atom_forces_coul_recip[i][0] += f_x;
      atom_forces_coul_recip[i][1] += f_y;
      atom_forces_coul_recip[i][2] += f_z;
      f_tot_x += f_x;
      f_tot_y += f_y;
      f_tot_z += f_z;

     }
   }

   /* Subtract average net force from force on each atom to remove
      center of mass drift. */
   f_tot_x /= n_atoms;
   f_tot_y /= n_atoms;
   f_tot_z /= n_atoms;
   for (i = 0; i < n_atoms; ++i) {
    if (atom_charges[i] != 0.0){
      atom_forces_coul_recip[i][0] -= f_tot_x;
      atom_forces_coul_recip[i][1] -= f_tot_y;
      atom_forces_coul_recip[i][2] -= f_tot_z;
    }
   }

}
