#define CODENAME "NBI"
#define VERSION	   "1"

/* dimension of physical space */
#define DIM	     		3
/* max number of bodies, obviously */
#define MAXNBODIES   		2100

/* TOLERANCES */

/* the extrapolation scheme stops when
	Tj,k - T(j-1),k becomes smaller than this */
#define GBSTOL	      		5.0e-32

/* The solution of Kepler's equation is accepted when
   when iterates change by an amount smaller than this  */
#define KEPLERSOLVERTOLERANCE	5.0e-16

/* what method and order to use to initialize the CSN integrator, GBS or HWH  */
#define CSNINITGBS		1
#define CSNINITGBSORDER		4	
#define CSNINITHWH		2
#define CSNINITHWHORDER		4
int csninit = CSNINITGBS;


/* which extrapolation sequence to use in the GBS method */
#define EXSEQHARM int extrapolseq[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
#define EXSEQBUL  int extrapolseq[] = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32};

/* cange this line accordingly */
EXSEQBUL


/* MACROS TO DISTINGUISH BETWEEN TEST PARTICLES AND
   PARTICLES WITH FINITE MASS
*/

#define NOTTESTPARTICLE		1
#define TESTPARTICLE		0

/* the header files we need, only standard stuff */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "omp.h"

/* Macro definitions */
#define NUMMETHODS	4
#define SKP 1
#define HWH 2
#define CSN 3
#define GBS 4

/* it is really the number of stages for GBS */
char *methodnames[] = { "   ", "SKP", "HWH", "CSN", "GBS" };
#define MAXCSN  15
#define MAXGBS  10
int MAXORDERS[] = { 0, 4, 4, MAXCSN, MAXGBS };

/* 
   MACROS WHICH DEFINE WHAT IS STORED FOR THE PARTICLES
	from 1 to 6: x y z vx vy vz
	          7: G*mass
	from 8 to 10:  accelerations in x, y and z
   e.g., particle i's y velocity is
	 parts[i][5]
*/

#define TMPSIZE      (2*DIM+1) 
/* Note: we need only the G*mass for the actual integrations */ 
#define GMASSCO	     (7)
#define ACC          (7)
/* you can add more data for particles here
   but keep NPARTDATA to specify the total number of
   coordinates and other components  */
#define NPARTDATA    (ACC+3)
/* this is used from index 1 in both indeces */
#define DECLARR [MAXNBODIES+1][NPARTDATA+1]

/* Cowell-Stormer-Newman array */
#define DECLSTORMER [MAXNBODIES+1][DIM+1][MAXCSN+2]
/* Gragg-Bulirsch-Stoer array */
#define DECLBS [MAXNBODIES+1][2*DIM+1][2][MAXGBS+2]

#define DIM2	     (2*DIM)

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		GLOBAL DATA
 	ARRAYS THAT HOLD THE PARTICLES AND AUXILIARY DATA 
*/

/* ARRAYS THAT HOLD THE PARTICLES' DATA.
	This can be moved into main() without
	any adverse effect.  */
double parts DECLARR;

/* This holds test particle indices.
   It seems easier to keep this global. It is used, e.g.,
   by PlainAccel to avoid the computation of force between
   test particles.  */
int tpstore[MAXNBODIES+1];

/* this is needed for error messages from deep within the integrator */
FILE *FLOG;

/* Local error monitoring for the Cowell-Stormer integrator */
double CSNerror = 0;


/* FUNCTION AND STRUCT DECLARATIONS */

/* the tree to store Jacobi coordinates */
struct tree {
	double tmass; /* total G*mass */
	double rm1;   /* total mass / mass1 */
	double rm2;   /* total mass / mass2 */ 
	double bc[DIM2+1]; /* barycenter: position and velocity
				for actual particle these  are pos. and vel. */
	double acc[DIM+1];
/* nonzero particle index indicates that this is one with finite mass
 or it is an array of test particles which can be processed in a simple loop,
   zero means that this is a node in the tree
*/
	int particleindex;  /* nonzero indicates actual particle(s) */
	struct tree *p1;
	struct tree *p2;
	int *testparticles; /* nonzero indicates array of test particles */
	};


int CStep(int ord, double h, int nump, double parts DECLARR, 
		struct tree *treein);

int GBSStep(int ord, double h, int nump, double parts DECLARR);

/* makes a simple chain, usual planetary motion case */
struct tree *MakeSimpleTree(int nump);
/* makes an arbitrary tree from a textual description, i.e.,
  parses the text and builds the tree */
struct tree *MakeTree(FILE *f);

/* Fill tree with barycenters, used to initialize the tree */
int FillCMinTree(struct tree *tree, double p DECLARR);
/* copy particle coordinates from tree;
   since the tree is the primary storage for the coordinates
   of particles with finite mass, one needs to call this
   before accessing data in the "parts" array
 */
int ParticlesFromTree(struct tree *tree, double p DECLARR);

/* The actual HWH integrator */
int HWHStep(int order, struct tree *tree, double stepsize, double p DECLARR);

/* Kinetic-potential energy split */
int SKPStep(int order, double stepsize, int nump, double p DECLARR);


/* access integer arrays in trees that hold the indeces
	of test particles attached to that point,
	returns pointer to testparticles,
	included for the sake of completeness, not actually used */
int **GetTestParticles(struct tree *tree, int ind);


/*    VERY SIMPLE IO, you can write your own if you need something more */
int readdata(FILE *f, int num, double p DECLARR) {
	int i, j; double tmp;
	for ( i = 1; i <= num; ++i ) {
			fscanf(f, "%le", &tmp); p[i][GMASSCO] = tmp; }
	for ( i = 1; i <= num; ++i ) {
		for ( j = 1; j <= 6; ++j ) {
			fscanf(f, "%le", &tmp); p[i][j] = tmp; } }
	return num; }

// X
int printdataA(int num, FILE *f, double p DECLARR) {
	int i, j;
	for ( i = 1; i <= num; ++i ) {
		for ( j = 1; j <= DIM; ++j ) { 
			fprintf(f, "%.14le  ", p[i][j]); 
		} 
		fprintf(f, "%c", '\n');
		for ( j = DIM+1; j <= 2*DIM; ++j ) { 
			fprintf(f, "%.14le  ", p[i][j]); 
		} 
		fprintf(f, "%c", '\n');
	}
	return num; 
}

// X
int printdata(int num, FILE *f, double p DECLARR, int c1, int c2) {
	int i, j;
	for ( i = 1; i <= num; ++i ) {
		for ( j = c1; j <= c2; ++j ) {
			fprintf(f, "%.14le  ", p[i][j]); } 
		    fprintf(f, "%c", '\n');
		}
	return num; 
}

//X total energy for particles with finite mass 
double energy(int nump, int dim, double p DECLARR, int *tps) {
	double h = 0, tmp, tmp2;
	int i,j,k;
	
	for ( i = 1; i <= nump; ++i ) {
		if ( tps[i] == 0 ) 
			continue;
		tmp = 0.0;
		
		#pragma omp parallel for reduction(+:tmp) schedule(runtime)
		for ( k = 1; k <= dim; ++k ) {
			tmp = tmp + p[i][k+dim]*p[i][k+dim]; 
		}
		tmp = tmp*p[i][GMASSCO]/2.0;
		
		h = h + tmp;
		for ( j = i+1; j <= nump; ++j ) {
			tmp = 0.0;
			#pragma omp parallel for reduction(+:tmp) schedule(runtime)
			for ( k = 1; k <= dim; ++k ) {
				tmp2 = p[i][k]-p[j][k];
				tmp = tmp+tmp2*tmp2; 
			}
			tmp = 1/sqrt(tmp);
			h = h - tmp*p[i][GMASSCO]*p[j][GMASSCO]; 
		}
	}
	
	return h; 
}

// X z component of angular momentum 
double angmomz(int nump, int dim, double p DECLARR, int *tps) {
	double az = 0.0;
	int i;
	
	for ( i = 1; i <= nump; ++i ) {
		if ( tps[i] == 0 ) 
			continue;
		// x*vy - y*vx	
		az = az + (p[i][1]*p[i][2+dim] - p[i][2]*p[i][1+dim]) * p[i][GMASSCO]; 
	}
	return az; 
}

// X transforms all particles to coordinates relative to barycenter 
int tobarycenter(int nump, int dim, double p DECLARR) {
	double bc[10];
	int i, j;
	
	#pragma omp parallel private(j)
	#pragma omp for schedule(runtime)
	for (j = 0; j <= 2*dim; ++j ) 
		bc[j] = 0.0;
	
	for ( i = nump; i >= 1; --i ) {
		bc[0] += p[i][GMASSCO];
		#pragma omp parallel private(j)
		#pragma omp for schedule(runtime)
		for (j = 1; j <= 2*dim; ++j ) 
			bc[j] += p[i][j]*p[i][GMASSCO]; 
	}
	
	#pragma omp parallel private(j)
	#pragma omp for schedule(runtime)
	for (j = 1; j <= 2*dim; ++j ) 
		bc[j] = bc[j]/bc[0];
		
    
	for ( i = nump; i >= 1; --i ) {
		#pragma omp parallel private(j)
		#pragma omp for schedule(runtime)
		for (j = 1; j <= 2*dim; ++j ) 
			p[i][j] -= bc[j]; 
	}
	return 0; 
}


/* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""
	MAIN 
*/

int main(int argc, char **argv) {
	FILE *f, *fin, *flog, *flocerr;
	char sicode[100];
	int icode; /* SKP = 1 HWH = 2 CSN = 3 GBS = 4 */
	struct tree *tree; int num, num2;
	int ii, i, order = 1, nin, nps;
	double stepsize;
	double time = 0.0, ftime, ptime;
	char fnamin[1000]; char id[1000];
	double az, az0, toten, toten0;

	// get command-line argument
	if ( argc < 2 ) { exit(1); }
	sscanf(argv[1], "%s", fnamin); 

	// read input file
	fin = fopen(fnamin, "r");
	fscanf(fin, "%s", id); strcpy(fnamin, id);
	
	// create output and log file
	// id - name of output file	
	f = fopen(id, "w"); strcat(fnamin, ".log"); flog = fopen(fnamin, "w"); 
	FLOG = flog;

	#define FAILEDIN { fclose(fin); fclose(f); fclose(flog); exit(1); }

	fprintf(flog, "ID = %s\n", id);
	fscanf(fin, "%s", sicode); 
	fscanf(fin, "%d", &order); fprintf(flog, "ORDER = %2d\n", order);
	icode = 0;
	for ( icode = 1; icode <= NUMMETHODS; ++icode ) {
		if ( strcmp(sicode, methodnames[icode] ) == 0 ) { 
			if ( order < 1 || order > MAXORDERS[icode] ) {
				fprintf(flog, " Order %2d for method %s is not implemented \n", order, sicode);
				FAILEDIN 
			}
			break; 
		} 
	}
	
	if ( icode > NUMMETHODS ) {
		fprintf(flog, " Invalid method code %s\n", sicode);
		FAILEDIN 
	}


	fscanf(fin, "%le", &stepsize); fprintf(flog, "STEPSIZE = %.12f\n", stepsize);
	fscanf(fin, "%le", &ftime); fprintf(flog, "FINAL TIME = %.12f\n", ftime);
	fscanf(fin, "%le", &ptime); fprintf(flog, "PRINTING TIME = %.12f\n", ptime);
	fscanf(fin, "%d", &num); fprintf(flog, "NUMBER OF PARTS = %2d\n", num);
	
	if ( num > MAXNBODIES ) {
		fprintf(flog, " Too many particles, change MAXNBODIES macro in code ");
		FAILEDIN 
	}

	/* initialize tpstore */
	
	for ( i = 1; i <= num+2; ++i ) { 
		tpstore[i] = NOTTESTPARTICLE; 
	}

	/* this will TESTPARTICLE for test particles in tpstore */
	tree = MakeTree(fin);

	/* For CSN create error file */
	if ( icode == CSN ) {
		strcpy(fnamin, id); strcat(fnamin, ".lerr"); flocerr = fopen(fnamin, "w"); 
	}

	num2 = readdata(fin, num, parts);
	/* DONE READING ALL INPUTS */
	fclose(fin);

	/* zero out test particles' tpstore, again, just in case */
	for ( i = 1; i <= num; ++i ) { 
		if ( parts[i][GMASSCO] == 0.0 ) { 
			tpstore[i] = TESTPARTICLE; 
		} 
	}

	if ( num != num2 ) exit(1);
	fprintf(flog, "GMs\n");
	printdata(num, flog, parts, GMASSCO, GMASSCO); 
	
	fprintf(flog, "Inits \n");
	printdataA(num, flog, parts);
	
	fflush(flog);

	/* WATCH OUT FOR THIS 
	   The transformation to barycenter
	   tends to improve things. 
	*/
	tobarycenter(num, DIM, parts);
	fprintf(flog, "Actual Barycentric Inits \n"); 
	printdataA(num, flog, parts);

	fprintf(flog, "Actual integrator is %s \n", sicode); 
	fprintf(flog, "CODE: %s \n", CODENAME); 
	fprintf(flog, "VERSION: %s \n", VERSION); 
	fflush(flog);


	// z component of total angular momentum, it is used to monitor accumulation of roundoff
	az0 = angmomz(num, DIM, parts, tpstore);
	toten0 = energy(num, DIM, parts, tpstore);


	/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
		ACTUAL INTEGRATION
	*/
	
	// the length of the inner loop, i.e.,  between printing 
	nin = (int) (fabs(ptime)/fabs(stepsize));  
	
	if ( nin == 0 ) {
		fprintf(flog, "%s\n", " Stepsize is larger than printing time, aborted");
		exit(1); 
	}
	
	// the length of the outer loop, i.e., printings from beginning to end 
	nps = (int) (fabs(ftime)/fabs(ptime));  

	fprintf(flog, "nin = %d nps = %d", nin, nps);
	
	/* WARNING: VERY LONG INTEGRATIONS CAN PRODUCE VERY LARGE
		INTEGERS AND ONE HAS TO MAKE SURE THAT THEY ARE STILL
		SMALLER THAN THE LARGEST REPRESENTABLE INTEGER 
	  ANOTHER WARNING: ROUNDOFF EFFECTS MAY CAUSE nin AND/OR nps TO BE
	  DIFFERENT FROM WHAT THE USER EXPECTS
	*/


	/* print the first data point, i.e., the initial data */ 
	#define PRINTDATA fprintf(f, "%.12f   ", time);\
		az = angmomz(num, DIM, parts, tpstore); \
		toten = energy(num, DIM, parts, tpstore); \
		fprintf(f, "%.16e  ", (toten-toten0)/toten0); \
		fprintf(f, "%.16e\n", (az-az0)/az0); \
		printdataA(num, f, parts);

	#define PRINTERROR \
		if ( icode == CSN ) { \
			fprintf(flocerr, "%.12f    %.12e\n", time, CSNerror); }

	PRINTDATA
	PRINTERROR


	/* MAIN LOOP */
	for ( ii = 1; ii <= nps; ++ii ) {
		for ( i = 1; i <= nin; ++i ) {
			switch(icode) {
			case SKP:
			  SKPStep(order, stepsize, num, parts); break;
			case HWH:
			  HWHStep(order, tree, stepsize, parts); break;
			case CSN:
			  CStep(order, stepsize, num, parts, tree); break;
			case GBS:
			  GBSStep(order, stepsize, num, parts); break;
			default: 
				fprintf(flog, "Not implemented\n");
				fclose(flog); fclose(f); exit(1); break;
			} 
		
		/* DONE ONE TIME STEP
		ADD MORE CODE HERE IF NECESSARY 
		variables you can use:
		double time;
		double parts DECLARR;
		Note that actual time should be computed as
		actualtime = time+stepsize*i
		*/

		}

		time += stepsize*nin;
		fprintf(f, "end of step, time = %f\n", time);
		
		if ( icode == HWH ) { ParticlesFromTree(tree, parts); }

		/* CHANGE THIS PART OR ADD YOUR CODE IF NECESSARY */
	
		PRINTDATA
		PRINTERROR

		if ( stepsize > 0.0 && time > ftime ) break;
		if ( stepsize < 0.0 && time < ftime ) break;

	}


	fclose(f);
	fprintf(flog, "\n Finished \n");
	fclose(flog);
	if ( icode == CSN ) fclose(flocerr);
	
	return 0;
}


/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
	FOR TEST: THE SIMPLE SPLIT, I.E., Kinetic - Potential
	It does not provide special treatment for test particles.
*/
int PlainSplitKineticStep(int num, double stepsize, double p DECLARR) {
	int i, k;
	for ( i = 1; i <= num; ++i ) {
		for ( k = 1; k <= DIM; ++k ) {
			p[i][k] += stepsize*p[i][k+DIM]; } }
	return 0; }

int PlainSplitPotentialStep(int num, double stepsize, double p DECLARR) {
	int i, j, k;
	double tmp, tmp2, tmp3;
	for ( i = 1; i <= num; ++i ) {
		tmp = 0.0;
		for ( j = i+1; j <= num; ++j ) {
			tmp2 = 0.0;
			for ( k = 1; k <= DIM; ++k ) {
				tmp3 = p[i][k] - p[j][k];
				tmp2 = tmp2 + tmp3*tmp3; }
			tmp3 = 1.0/sqrt(tmp2);
			tmp2 = tmp3/tmp2;
			for ( k = 1; k <= DIM; ++k ) {
				tmp3 = p[i][k] - p[j][k];
				tmp = -tmp3*tmp2*p[j][GMASSCO]*stepsize;
				p[i][k+DIM] += tmp;
				tmp = tmp3*tmp2*p[i][GMASSCO]*stepsize;
				p[j][k+DIM] += tmp; }
			}
	}
	return 0; }


/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
	COWELL-STORMER INTEGRATOR */


/* this computes accelerations, used both for CSN and GBS,
	not to be confused with Accel used for HWH */

int PlainAccel(int num, double p DECLARR) {
	int i, j, k; int tpi, tpj;
	double tmp, tmp2, tmp3;
	double massi, massj;
	for ( i = 1; i <= num; ++i ) {
		for ( k = 1; k <= DIM; ++k ) { p[i][ACC+k] = 0.0; } }
/* for testing things
for ( i = 1; i <= num; ++i ) {
	for ( k = 1; k <= DIM; ++k ) { p[i][ACC+k] = 1.0; } }
return 0;
*/
	for ( i = num; i >= 1; --i ) { 
		tpi = tpstore[i];
		massi = p[i][GMASSCO];
		for ( j = i+1; j <= num; ++j ) {
			tpj = tpstore[j];
			if ( tpi == TESTPARTICLE && tpj == TESTPARTICLE ) continue;
			tmp2 = 0.0;
			for ( k = 1; k <= DIM; ++k ) {
				tmp3 = p[i][k] - p[j][k];
				tmp2 = tmp2 + tmp3*tmp3; }
			tmp3 = 1.0/sqrt(tmp2);
			tmp2 = tmp3/tmp2;
			massj = p[j][GMASSCO];
			for ( k = 1; k <= DIM; ++k ) {
				tmp3 = p[i][k] - p[j][k];
				tmp = tmp3*tmp2;
			if ( tpj == NOTTESTPARTICLE ) {
				p[i][ACC+k] -= (tmp*massj); }
			if ( tpi == NOTTESTPARTICLE ) {
				p[j][ACC+k] += (tmp*massi); }
				}
			}
	}
	return 0; }

/* Cowell-Stormer-Newman 14(6)th order */


/* Newman's 14(6)th order Stormer's coeffs */

double CSalpha[] =  {   0.0,
			1.000000000000000000000,
                        0.000000000000000000000,
                        0.083333333333333333333,
                        0.083333333333333333333,
                        0.079166666666666666666,
                        0.075000000000000000000,
                        0.071345899470899470899,
                        0.068204365079365079365,
                        0.065495756172839506172,
                        0.063140432098765432098,
                        0.061072649861712361712,
                        0.059240564123376623376,
                        0.057603625837453135733,
                        0.056129980884507174190,
                        0.054794379107071147139
                                                 };
 
 
double CSbeta[] =  {    0.0,
			1.50000000000000000000,
                        0.33333333333333333333,
                        0.37500000000000000000,
                        0.35277777777777777777,
                        0.33402777777777777777,
                        0.31924603174603174603,
                        0.30736607142857142857,
                        0.29757660934744268077,
                        0.28933077050264550264,
                        0.28225737868098979210,
                        0.27609762576993479771,
                        0.27066578505957226195,
                        0.26582499331955247133,
                        0.26147199790503706399,
                        0.25752728193649391773
                                               };
 
	
double  sumfrombottom(double *a, int nr, double *coef) {
	int i;
	double sum = 0.0;
	for ( i = nr; i >= 1;  --i ) {
		sum += a[i]*coef[i]; }
	return sum; }

int differencing(double *a, int nr, double in) {
	int i, j, p;
	double tmp1, tmp2;
	tmp1 = a[1]; a[1] = in;
	for ( i = 2; i <= nr; ++i ) {
		tmp2 = a[i]; a[i] = a[i-1]-tmp1; tmp1 = tmp2; } 
	return 0; }


static int CStepCount = 0;
static int CStepInit = 0;
static struct tree *initCStree;

int CStep(int ord, double h, int nump, 
		double parts DECLARR, struct tree *treein) {
	double *a, asav, vx, sum, tmp; int p,j, i;
static double df DECLSTORMER;
/* actual local order to determine how terms to sum */
if ( CStepInit == 0 ) {
	initCStree = treein;
	CStepInit = 1;
	for ( i = 1; i <= ord+1; ++i ) {
	for ( p = 1; p <= nump; ++p ) {
	for ( j = 1; j <= DIM; ++j ) {
		df[p][j][i] = 0.0; } } } 	}
if ( CStepInit == 1 ) {
/* initialization stage */
		for ( p = 1; p <= nump; ++p ) {
		for ( j = 1; j <= DIM; ++j ) {
			df[p][j][0] = -parts[p][j]; } }

		if ( csninit == CSNINITHWH ) {
 			HWHStep(CSNINITHWHORDER, initCStree, h, parts); 
 			ParticlesFromTree(initCStree, parts);  }
		else { GBSStep(CSNINITGBSORDER, h, nump, parts); }
		if ( CStepCount > ord ) CStepInit = 2;
		for ( p = 1; p <= nump; ++p ) {
		for ( j = 1; j <= DIM; ++j ) {
			df[p][j][0] += parts[p][j]; } }
		}
else {
/* actual run */
CSNerror = 0;
for ( p = 1; p <= nump; ++p ) {
for ( j = 1; j <= DIM; ++j ) {
/* formula: x(n+1) - x(n) = (x(n) - x(n-1)) + h^2*sum
	with X(n) = x(n)-x(n-1)
 	X(n+1) = X(n) + h^2*sum
	x(n+1) = X(N+1) + x(n)
X(n) is in a[0] 
*/
	a = df[p][j];
	asav = a[0];
	sum = sumfrombottom(a, ord, CSalpha);
	sum *= h*h;
	a[0] += sum;
	parts[p][j] += a[0];
/* vx = (1.0/h)*(x(n)-x(n-1)) + h*sum */
	sum = h*sumfrombottom(a, ord, CSbeta);
	vx = asav/h + sum;
	parts[p][DIM+j] = vx;
/* record error */
	tmp = fabs(CSalpha[ord]*a[ord]*h*h);
	if ( CSNerror < tmp ) CSNerror = tmp;
	} } }
PlainAccel(nump, parts);
for ( p = 1; p <= nump; ++p ) {
for ( j = 1; j <= DIM; ++j ) {
	differencing(df[p][j], ord, parts[p][ACC+j]);
	parts[p][ACC+j] = 0.0;
	} }
++CStepCount;
return 0; }


/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	BULIRSCH-STOER
	actually, GRAGG-BULIRSCH-STOER, following Hairer et al., 1993
*/

/* Gragg's symmetric integrator, this is used to generate Tj,1  */
int GStep(int nump, double parts DECLARR, double s DECLARR, double h, int len) {
int k, j, p, d, knext, kprev, m;
double dif, difsum, tmp1, astep;
static double y0 DECLARR;
static double y1 DECLARR;
/* save originial in y0 */
for ( p = 1; p <= nump; ++p ) {
		y0[p][GMASSCO] = parts[p][GMASSCO];
for ( d = 1; d <= DIM; ++d ) { 
		y0[p][d] = parts[p][d]; 
		y0[p][d+DIM] = parts[p][d+DIM]; } }
/* consult Hairer et al. why we need this
	in short: we need a method which has only even powers
	of h in the expansion of its error, i.e., a symmetric method
 */
h = h/2.0;
PlainAccel(nump, y0);
for ( p = 1; p <= nump; ++p ) {
/* load masses into y1 */
		y1[p][GMASSCO] = y0[p][GMASSCO];
for ( d = 1; d <= DIM; ++d ) { 
		y1[p][d] = y0[p][d]+h*y0[p][DIM+d]; 
		y1[p][d+DIM] = y0[p][d+DIM]+h*y0[p][ACC+d]; } }
/* compute y2 (m=1), y3' (m=2), y4 (m=3), y5' (m=4) y(2n+1)' (m=2*n) 
	n = len
	*/
for ( m = 1; m <= 2*len; ++m ) {
	if ( (m%2) ) {
	for ( p = 1; p <= nump; ++p ) {
	for ( d = 1; d <= DIM; ++d ) { 
		y1[p][d] = y0[p][d]+2.0*h*y1[p][d+DIM]; 
		s[p][d] = y1[p][d]; y0[p][d] = y1[p][d]; 
		}  } }
	else {
	PlainAccel(nump, y1);
	for ( p = 1; p <= nump; ++p ) {
	for ( d = 1; d <= DIM; ++d ) { 
		y0[p][d+DIM] = y1[p][d+DIM]; 
		y1[p][d+DIM] = y0[p][d+DIM]+2.0*h*y1[p][ACC+d]; 
		s[p][d+DIM] = (y0[p][d+DIM]+y1[p][d+DIM])/2.0; 
		} } 
		}
	}
return 0; }


int GBSStep(int bsord, double h, int nump, double parts DECLARR) {
int k, j, jfinal, p, d, jnext, jprev, len;
static double gbsarr DECLBS;
double dif, difsum, tmp1, astep;
int *n = extrapolseq;
static double s DECLARR;
/* for testing GStep
	astep = h; len = 1;
	GStep(nump, parts, s, astep, len);
for ( p = 1; p <= nump; ++p ) {
for ( d = 1; d <= 2*DIM; ++d ) { parts[p][d] = s[p][d]; } } 
return 0;
*/
jfinal = bsord;
for ( j = 1; j <= bsord; ++j ) {
	difsum = 0.0;
/* map j and j-1 to storage index */
	jnext = ((j)%2); 
	jprev = ((j-1)%2);
/* compute Tj,1 */
	len = n[j];
	astep = h/len;
	GStep(nump, parts, s, astep, len);
	for ( p = 1; p <= nump; ++p ) {
	for ( d = 1; d <= 2*DIM; ++d ) { gbsarr[p][d][jnext][1] = s[p][d]; } }
/* compute Tj,k */
for ( k = 1; k <= j-1; ++k ) {
/* Aitken-Neville for nonsymmetric underlying method */
	tmp1 = ((double) n[j])/((double) n[j-k]); 
/* Aitken-Neville for symmetric underlying method (square of previous) */
	tmp1 = tmp1*tmp1;
for ( p = 1; p <= nump; ++p ) {
for ( d = 1; d <= 2*DIM; ++d ) {
	dif = (gbsarr[p][d][jnext][k] - gbsarr[p][d][jprev][k]);
	if ( k == j-1 ) difsum += fabs(dif);
	gbsarr[p][d][jnext][k+1] = gbsarr[p][d][jnext][k] + dif/(tmp1-1.0);
	} }
	} 
/* stop computing more is reached roundoff level */
	if ( j > 1 && difsum <= GBSTOL ) { jfinal = j; break; }
	}
j = jfinal; k = j; jnext = ((j)%2); 
for ( p = 1; p <= nump; ++p ) {
for ( d = 1; d <= 2*DIM; ++d ) { parts[p][d] = gbsarr[p][d][jnext][k]; } } 
return 0; }
	

/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
	Actual  HWH integration */
int MoveCMandKepler(struct tree *tree, double *cmove, double stepsize, 
		double p DECLARR);
int KeplerOnTree(struct tree *tree, double stepsize, double p DECLARR);
int RecursivePerturbations(struct tree *tree, double stepsize, double p DECLARR);
int RecPertAcc(struct tree *tree, double stepsize, double p DECLARR);

/* fourth order coeffs */
static double Coeff4x0 = 0.0;
static double Coeff4x1 = 0.0;

static int SympCalled = 0;

void initSympCoefs() {
	double cbrt2 = cbrt(2.0);
	Coeff4x0 = -cbrt2/(2.0-cbrt2);
	Coeff4x1 = 1.0/(2.0-cbrt2);
	SympCalled = 1; }
	

static int HWHCalled = 0;

// X
int HWHStep(int order, struct tree *tree, double stepsize, double p DECLARR) {
	if ( SympCalled == 0 ) { 
		initSympCoefs(); 
	}
	if ( HWHCalled == 0) {
		// initialize tree 
		FillCMinTree(tree, p); 
		HWHCalled = 1; 
	}
	if ( order == 1 ) {
		KeplerOnTree(tree, stepsize, p);
		RecursivePerturbations(tree, stepsize, p);
		return 0; 
	}
	if ( order == 2 ) {
		KeplerOnTree(tree, stepsize/2.0, p);
		RecursivePerturbations(tree, stepsize, p);
		KeplerOnTree(tree, stepsize/2.0, p); 
		return 0; 
	}
	if ( order == 4 ) {
		HWHStep(2, tree, stepsize*Coeff4x1, p);
		HWHStep(2, tree, stepsize*Coeff4x0, p);
		HWHStep(2, tree, stepsize*Coeff4x1, p);
		return 0; 
	}
	printf(" Order %2d  is not implemented, Aborted", order);
	exit(1); 
}


int SKPStep(int order, double stepsize, int nump, double p DECLARR) {
	if ( SympCalled == 0 ) { initSympCoefs(); }
	if ( order == 1 ) {
		PlainSplitKineticStep(nump, stepsize, p);
		PlainSplitPotentialStep(nump, stepsize, p);
		return 0; }
	if ( order == 2 ) {
		PlainSplitKineticStep(nump, stepsize/2.0, p);
		PlainSplitPotentialStep(nump, stepsize, p);
		PlainSplitKineticStep(nump, stepsize/2.0, p);
		return 0; }
	if ( order == 4 ) {
		SKPStep(2, stepsize*Coeff4x1, nump, p);
		SKPStep(2, stepsize*Coeff4x0, nump, p);
		SKPStep(2, stepsize*Coeff4x1, nump, p);
		return 0; }
	printf(" Order %2d  is not implemented, Aborted", order);
	exit(1); }


/* +++++++++++++++++++++++++++++++++++++++++++++++++++++ 
/	TREE
/
/
*/


int inittree(struct tree *tmp) {
	int i; tmp->tmass = 0.0; tmp->particleindex = 0;
	tmp->p1 = 0; tmp->p2 = 0;
	tmp->testparticles = 0; tmp->rm1 = 0; tmp->rm2 = 0;
	for ( i = 1; i <= DIM2; ++i ) { tmp->bc[i] = 0; }
	for ( i = 1; i <= DIM; ++i ) { tmp->acc[i] = 0; }
	return 0; }


struct tree *newtree() {
	struct tree *tmp = (struct tree*) malloc(sizeof(struct tree)); 
	inittree(tmp); return tmp; }

void deletetree(struct tree *p) { 
	if ( p->p1 ) deletetree(p->p1);
	if ( p->p2 ) deletetree(p->p2);
	free((void *) p); }


struct tree *MakeSimpleTree(int nump) {
	int i;
	struct tree *root;
	struct tree *t1, *p1, *p2;
	root = newtree();
	t1 = root;
	for ( i = nump; i > 1; --i ) {
		p1 = newtree();
		p2 = newtree();
		p1->particleindex = 0;
		p2->particleindex = i;
		t1->p1 = p1; t1->p2 = p2;
		t1 = p1; }
	t1->particleindex = 1;
	return root; }

int eatwhite(FILE *f) {
	int c; 
	for ( ; ; ) {
		c = getc(f);
		if ( !isspace(c) ) break; 
	}
	ungetc(c, f); 
	return c;
}


/* just an array to hold indeces read in,
	portions of this are assigned to
	tpa's below */
int tmpReadList[MAXNBODIES+1000];
/* bookkeeping */
int freelist = 0;

int *ReadList(FILE *f) {
	int i1, i2; int c; int i; int *arr;
	c = eatwhite(f);
	if ( c != '[' ) {
		printf(" Error: bad list, aborted \n"); exit(1); 
	}
	c = getc(f);
	fscanf(f, "%d - %d", &i1, &i2);
	c = eatwhite(f);
	if ( c != ']' ) {
		printf(" Error: bad list, aborted \n"); exit(1); 
	}
	c = getc(f); 
	arr = tmpReadList+freelist;
	
	/* put zero in tpstore */
	for ( i = i1; i <= i2; ++i ) tpstore[i] = TESTPARTICLE;
	for ( i = i1; i <= i2; ++i ) arr[i-i1+1] = i;
	
	arr[0] = i2-i1+1;
	freelist += i2-i1+2;
	return arr; 
}
	
		
struct tree *MakeTree(FILE *f) {
	int i; int c; int pind; int *tpa;
	struct tree *root;
	struct tree *p1, *p2;
	c = eatwhite(f);
	if ( c != '(' ) {
		if ( isdigit(c) ) {
			// single particle 
			fscanf(f,"%d", &pind);
			root = newtree();
			root->particleindex = pind;
			return root;
		} 
		// set of test particles
		tpa = ReadList(f);
		root = newtree();
		root->particleindex = tpa[1];
		root->testparticles = tpa;
		return root;
	}
	c = getc(f);
	root = newtree();
	p1 = MakeTree(f);
	p2 = MakeTree(f);
	root->p1 = p1; root->p2 = p2;
	root->particleindex = 0;
	c = eatwhite(f);
	
	if ( c != ')' ) {
		printf(" Error: bad tree, aborted \n"); exit(1); 
	}
	c = getc(f);
	return root; 
}


/* not used now but might come handy */
struct tree *getparticleintree(struct tree *tree, int ind) {
	struct tree *tmp;
	if ( tree->particleindex ) { 
		if ( tree->particleindex == ind ) { return tree; }
		return 0; }
	tmp = getparticleintree(tree->p1, ind);
	if ( tmp != 0 ) return tmp;
	tmp = getparticleintree(tree->p2, ind);
	if ( tmp != 0 ) return tmp;
	return 0; }
	

/* not used now but might come handy */
int **GetTestParticles(struct tree *tree, int ind) {
	struct tree *tmp = getparticleintree(tree, ind);
	if ( tmp == 0 ) return 0;
	return &(tmp->testparticles); }
	

/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	ACTUAL NUMERICAL STUFF ON TREE
*/

// X fills in initial data: masses, mass ratios, barycenters 
int FillCMinTree(struct tree *tree, double p DECLARR) {
	int j; double *bc = tree->bc;
	int i, ind, tp, num, *tpa = tree->testparticles;
	
	#pragma omp parallel for private(j) schedule(runtime)
	for ( j = 1; j <= DIM; ++j ) 
		tree->acc[j] = 0.0;
	
	tree->tmass = 0.0; tree->rm1 = 0.0; tree->rm2 = 0.0;
	#pragma omp parallel for private(j) schedule(runtime)
	for ( j = 1; j <= DIM2; ++j ) 
		bc[j] = 0.0;
		
	ind = tree->particleindex;
	if ( ind ) {
	    if ( tpa != 0 ) {  
			num = tpa[0];
			for ( tp = 1; tp <= num; ++tp ) { 
				ind = tpa[tp]; 
				if ( ind < 1 ) 
					continue;
				#pragma omp parallel for private(i) schedule(runtime)
				for ( i = 1; i <= DIM; ++i ) 
					p[ind][ACC+i] = 0.0;
				} 
			}  
	    else { 
			tree->tmass = p[ind][GMASSCO];
			#pragma omp parallel for private(j) schedule(runtime)
			for ( j = 1; j <= 2*DIM; ++j ) 
				bc[j] = p[ind][j]; 
		}
	    return 0; 
	} 
	FillCMinTree(tree->p1, p);
	FillCMinTree(tree->p2, p);
	tree->tmass = tree->p1->tmass + tree->p2->tmass;
	tree->rm1 = (tree->p1->tmass)/(tree->tmass); 
	tree->rm2 = (tree->p2->tmass)/(tree->tmass);
	
	#pragma omp parallel for private(j) schedule(runtime)
	for ( j = 1; j <= DIM2; ++j ) {
		bc[j] = tree->p1->bc[j] * tree->rm1 + tree->p2->bc[j] * tree->rm2; 
	} 
	return 0; 
}

/* copy from tree into p */
int ParticlesFromTree(struct tree *tree, double p DECLARR) {
	int j; double *bc = tree->bc;
	int i, ind, *tpa = tree->testparticles;
	ind = tree->particleindex;
	if ( ind ) {
	   if ( tpa == 0 ) {
		tree->tmass = p[ind][GMASSCO];
		for ( j = 1; j <= DIM2; ++j ) p[ind][j] = bc[j]; }
	   return 0; } 
	ParticlesFromTree(tree->p1, p);
	ParticlesFromTree(tree->p2, p);
	return 0; }


/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	KEPLER STEP ON TREE
*/

/* computes Stumpff functions several ways, depending on x */
int StumpffFunctions(double *vals, double x);
/* advances the Kepler motion 1/2*v^2 - mu/|x| for time deltat */
double KeplerStumpffStep(int dim, double mu, double *xv, double deltat);
/* return only the increment in xv */
double IncrementKeplerStumpffStep(int dim, double mu, double *xv, double deltat);

// X NOTE: this does not CHANGE centers of masses 
int KeplerOnTree(struct tree *tree, double stepsize, double p DECLARR) { 
	// move the center of mass 
	double cmove[10]; 
	int i;
	#pragma omp parallel for private(i) schedule(runtime)
	for ( i = 1; i <= 2*DIM; ++i ) 
		cmove[i] = 0.0;

	// move by 1/2*(Tmass)*v^2 */
	#pragma omp parallel for private(i) schedule(runtime)
	for ( i = 1; i <= DIM; ++i ) 
		cmove[i] = (tree->bc[DIM+i])*stepsize; 

	MoveCMandKepler(tree, cmove, stepsize, p);
	return 0; 
}


// X Assume that test particles are attached to 2nd 
int DoTestPartKepler(struct tree *tree, double *cmove, double stepsize, double p DECLARR) {
	int i, tp, num, *tpa, ind;
	double mu, tmpkep[TMPSIZE], *bc;
	tpa = tree->p2->testparticles; 
	num = tpa[0];
	if ( num > 0 ) {
		bc = tree->p1->bc; 
		mu = tree->tmass;
		for ( tp = 1; tp <= num; ++tp ) { 
			ind = tpa[tp]; 
			if ( ind < 1 ) continue;
			#pragma omp parallel for private(i) schedule(runtime)
			for ( i = 1; i <= DIM2; ++i )   
				tmpkep[i] = p[ind][i] - bc[i];
			IncrementKeplerStumpffStep(DIM, mu, tmpkep, stepsize); 
			for ( i = 1; i <= DIM2; ++i ) 
				p[ind][i] += (cmove[i]+tmpkep[i]);
		} 
	}
	return 0; 
}

// X move the center of mass and do a Kepler step on relative coordinates, transmit the new "moves" down 
int MoveCMandKepler(struct tree *tree, double *cmove, double stepsize, double p DECLARR) {
	int i, *tpa, ind;
	double tmp[TMPSIZE], tmpkep[TMPSIZE], *xv1, *xv2, mu;
	double *bc = tree->bc;
	
	for ( i = 1; i <= DIM2; ++i ) { 
		bc[i] += cmove[i]; 
	}
	// do nothing else for actual particles 
	if ( tree->particleindex ) { 
		return 0; 
	}
  	if ( tree->p2->testparticles ) {	
		// this is a pair where p2 is an array of test particles 
		DoTestPartKepler(tree, cmove, stepsize, p); 
		MoveCMandKepler(tree->p1, cmove, stepsize, p); 
		return 0; 
	}
	mu = tree->tmass;
	xv1 = tree->p1->bc; 
	xv2 = tree->p2->bc;
	#pragma omp parallel for private(i) schedule(runtime)
	for ( i = 1; i <= DIM2; ++i ) { 
		tmpkep[i] = xv2[i]-xv1[i]; 
	}
	// do Kepler step on tmpkep, we get back the increment 
	IncrementKeplerStumpffStep(DIM, mu, tmpkep, stepsize); 
	#pragma omp parallel for private(i) schedule(runtime)
	// use only the relative change 
	for ( i = 1; i <= DIM2; ++i ) { 
		tmp[i] = cmove[i] - tmpkep[i]*(tree->rm2); 
	}
	MoveCMandKepler(tree->p1, tmp, stepsize, p);
	#pragma omp parallel for private(i) schedule(runtime)
	for ( i = 1; i <= DIM2; ++i ) { 
		tmp[i] = cmove[i] + tmpkep[i]*(tree->rm1); 
	}
	MoveCMandKepler(tree->p2, tmp, stepsize, p);
	return 0; 
}
	
/*  ++++++++++++++++++++++++++++++++++++++++++++++++++++++
	PERTURBATION ACCELERATIONS
*/

/* X simple acceleration */
int Accel(double *x1, double *x2, double *acc) {
	double tmp[10], r = 0.0; int i; 
	for ( i = 1; i <= DIM; ++i ) {
		tmp[i] = x1[i]-x2[i];
		r = r + tmp[i]*tmp[i]; 
	}
	r = sqrt(r);
	r = r*r*r;
	for ( i = 1; i <= DIM; ++i ) { 
		acc[i] = -tmp[i]/r; 
	}
	return 0; 
}

int AdvanceTestParticleVel(struct tree *tree, double stepsize, double p DECLARR) {
		int i, ind, tp, num, *tpa = tree->testparticles;
		if ( tpa == 0 ) return 0; num = tpa[0];
	for ( tp = 1; tp <= num; ++tp ) {
		ind = tpa[tp]; if ( ind < 1 ) continue;
		for ( i = 1; i <= DIM; ++i ) {
			p[ind][DIM+i] += stepsize*p[ind][ACC+i]; 
			p[ind][ACC+i] = 0.0; }
			} 
	return 0; }
			
	
/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	PERTURBATIONS ON TREE
*/
// X 
int RecursivePerturbations(struct tree *tree, double stepsize, double p DECLARR) {
	int j, ind; double *bc = tree->bc;
	/* advance attached test particles 
	if ( tree->particleindex ) {
	/* if particle we are ready to advance v */
	if ( tree->testparticles ) {
		AdvanceTestParticleVel(tree, stepsize, p); 
		return 0; 
	}
	// this is a nonzero mass actual particle, advance v 
		for ( j = 1; j <= DIM; ++j ) {
			bc[DIM+j] += stepsize*tree->acc[j]; 
			tree->acc[j] = 0.0; 
		}
		return 0; 
	
	/* if not, first we take care of accs. for pairs with the two parts  
		IMPORTANT TO DO THIS FIRST
	*/
	RecPertAcc(tree, stepsize, p);
	// now descend and do the same for both parts 
	RecursivePerturbations(tree->p2, stepsize, p);
	RecursivePerturbations(tree->p1, stepsize, p);
	/* now regenerate center of mass etc. */
	if ( tree->p2->testparticles ) {
		#pragma omp parallel private(j)
		#pragma omp for schedule(runtime)
		for ( j = 1; j <= DIM2; ++j ) { 
			bc[j] = tree->p1->bc[j]; 
		}
		return 0; 
	}
	
	#pragma omp parallel private(j)
	#pragma omp for schedule(runtime)
	for ( j = 1; j <= DIM2; ++j ) {
		bc[j] = tree->p1->bc[j] * tree->rm1 + tree->p2->bc[j] * tree->rm2; 
	}
	return 0; 
}

/* assume test particles in tree2 for dir > 1, reverse signs for dir < 0 */
int TestParticleAcc(int dir, double *cacc, struct tree *tree1, struct tree *tree2, 
				double p DECLARR) {
		int i, ind, tp, num, *tpa = tree2->testparticles;
		double acc[TMPSIZE];
		double *bc = tree1->bc;
		double tmp, gm1 = tree1->tmass;
		num = tpa[0];
	for ( tp = 1; tp <= num; ++tp ) {
		ind = tpa[tp]; if ( ind < 1 ) continue;
/* rewrite this to avoid bad numerical attitude */
			Accel(bc, &p[ind][0], acc);
		for ( i = 1; i <= DIM; ++i ) {
/* since test particles are in 2nd */
			tmp = acc[i]-cacc[i]*dir;
			p[ind][ACC+i] += -gm1*tmp; }
			} 
	return 0; }
			

/* X c1 and c2 are for possible future use */
int RecPertAccSub(struct tree *tree1, 
	struct tree *tree2, double *cacc, double *c1, double *c2, double p DECLARR) {
	/* do the work if both are particles */
	int i, ind1, ind2; double acc[TMPSIZE];
	double tmp, gm1 = tree1->tmass, gm2 = tree2->tmass;
	ind1 = tree1->particleindex;
	ind2 = tree2->particleindex;
	if ( ind1 && ind2 ) {
		/* do nothing if both are test particle arrays */
		if ( tree1->testparticles  && tree2->testparticles  ) 
			return 0;  
		/* test particles first */
		if ( tree2->testparticles ) {
			TestParticleAcc(1, cacc, tree1, tree2, p); 
			return 0; 
		}
		if ( tree1->testparticles ) {
			TestParticleAcc( -1, cacc, tree2, tree1, p); 
			return 0; 
		}
		/* if actual particles with nonzero mass */
		Accel(tree1->bc, tree2->bc, acc);
		for ( i = 1; i <= DIM; ++i ) {
			/* rewrite this to avoid bad numerical attitude */
			tmp = acc[i]-cacc[i];
			tree1->acc[i] += gm2*tmp; 
			tree2->acc[i] += -gm1*tmp; 
		}
		return 0; 
	}
	/* if not, descend but only for one branch, the other is taken care in turn */
	if ( !ind1 ) {
		RecPertAccSub(tree1->p2, tree2, cacc, c1, c2, p);
		RecPertAccSub(tree1->p1, tree2, cacc, c1, c2, p);
		return 0; 
	}
	RecPertAccSub(tree1, tree2->p1, cacc, c1, c2, p);
	RecPertAccSub(tree1, tree2->p2, cacc, c1, c2, p);
	return 0; 
}

/* assume test particles in tree2 */
int TestParticleAccSpec(double *cm, struct tree *tree1, struct tree *tree2, double p DECLARR) {
	int i, ind, tp, num, *tpa = tree2->testparticles;
	double acc[TMPSIZE], cacc[TMPSIZE];
	double *bc = tree1->bc;
	double tmp, gm1 = tree1->tmass;
	num = tpa[0];
	for ( tp = 1; tp <= num; ++tp ) {
		ind = tpa[tp]; if ( ind < 1 ) continue;
		/* rewrite this to avoid bad numerical attitude */
		Accel(bc, &p[ind][0], acc);
		Accel(cm, &p[ind][0], cacc);
		for ( i = 1; i <= DIM; ++i ) {
			/* since test particles are in 2nd */
			tmp = acc[i]-cacc[i];
			p[ind][ACC+i] += -gm1*tmp; 
		}
	} 
	return 0; 
}
			

/* c1 and c2 are for possible future use */
int RecPertAccSubTest(double *cm,  struct tree *tree1, struct tree *tree2, double p DECLARR) {
	int ind1;
	ind1 = tree1->particleindex;
	if ( ind1 ) {
		/* do nothing if tree1 has only test particles */
		if ( tree1->testparticles ) 
			return 0;
		TestParticleAccSpec(cm, tree1, tree2, p); 
		return 0; 
	}
	/* if not, descend for both branches */
	RecPertAccSubTest(cm, tree1->p1, tree2, p);
	RecPertAccSubTest(cm, tree1->p2, tree2, p);
	return 0; 
}


int RecPertAcc(struct tree *tree, double stepsize, double p DECLARR) {
	double cacc[TMPSIZE];
	/* do nothing if all in p2 are attached to p1: pure Kepler motion */
	if ( tree->p1->particleindex &&  tree->p2->particleindex  )  
		return 0; 
	if ( tree->p2->testparticles == 0 ) {
		/* actual nonzero masses in p2 */
		Accel(tree->p1->bc, tree->p2->bc, cacc);
		
		RecPertAccSub(tree->p1, tree->p2, cacc, tree->p1->bc, tree->p2->bc, p); 
		return 0;
	}
	/* test particles in p2 */
	RecPertAccSubTest(tree->p1->bc, tree->p1, tree->p2, p); 
	return 0; 
}


/*
// ===============================================================================
//
//   KEPLER MOTION USING STUMPFF FUCNTIONS
//	FOLLOWING DANBY, 1992
//	(quartic Newtonian solver, Stumpff function definitions,
//	 f and g functions)
//	see also: Stiefel and Sheifele, Linear and Regular
//	Celestial Mechanics
//
// ===============================================================================

// Stumpff functions (c0, c1, c2, c3)
*/

/* CHANGE these numbers to tune accuracy */
/* this one can be changed runtime */
static double StumpffQSTol = KEPLERSOLVERTOLERANCE;

/* the limit for which trig and hyp. trig functions are used */
#define STUMPFFPLAINLIM (5.0)
/* reduction limit */
#define STUMPFFRED (1.0/32.0)
#define STUMPFFSHORTTOL (1.0e-6)

/* this one is not used now */
static double StumpffSerTol = 1.0e-25;

static int StumpffErrorCount = 0;
static int StumpffErrorCountLimit = 100;

CheckStumpffErrorCount() {
	++StumpffErrorCount;
	if ( StumpffErrorCount > StumpffErrorCountLimit ) {
		fprintf(FLOG, "\n Too many errors, aborted \n");
		fflush(FLOG);
		exit(1); 
	}
	return 0; 
}

double util_fact(int k) {
	int i; double fact = 1;
	for ( i = 1; i <= k; ++i ) {
		fact *= i; 
	}
	return fact; 
}

/*
In case we need the coefficients, produced by Mathematica 
static double c2_0 = 0.5;
static double c2_1 = -1.0/24;
static double c2_2 = 1.0/720;
static double c2_3 = -1.0/40320;
static double c2_4 = 1.0/3628800;
static double c2_5 = -1.0/479001600;
static double c2_6 = 1.0/87178291200;
static double c2_7 = -1.0/20922789888000;
static double c2_8 = 1.0/6402373705728000;
static double c2_9 = -1.0/2432902008176640000;                          
static double c2_10 = 1.0/1124000727777607680000;

static double c3_0 = 1.0/6;
static double c3_1 = -1.0/120;
static double c3_2 = 1.0/5040;
static double c3_3 = -1.0/362880;
static double c3_4 = 1.0/39916800;
static double c3_5 = -1.0/6227020800;
static double c3_6 = 1.0/1307674368000;
static double c3_7 = -1.0/355687428096000;
static double c3_8 = 1.0/121645100408832000;
static double c3_9 = -1.0/51090942171709440000;
static double c3_10 = 1.0/25852016738884976640000;
*/


/* This is what is actually used, generated by factoring the coefficients */
double HornerStumpff2(double x) {
	return
(1.0 - x*(1 - x*(1 - x*(1 - x*(1 - x *(1 - x
/* up to degree 10, if you need it 
*(1 - x*(1 - x*(1 - (1 - x/462)*x/380)/306)/240)        */
/182)/132)/90)/56)/30)/12)/2;
	}

double HornerStumpff3(double x) {
	return
 (1.0 - x*(1 - x*(1 - x*(1 - x*(1 - x*(1 - x
/* up to degree 10, if you need it 
*(1 - x*(1 - x*(1 - (1 - x/506)*x/420)/342)/272) 	*/
/210)/156)/110)/72)/42)/20)/6;
	}


/* first 4 terms, ie., up to x^3 */
double Stumpff2Short(double x) {
	return (1.0 - x*(1 - x*(1 - x/56)/30)/12)/2;
	}

double Stumpff3Short(double x) {
	return (1.0 - x*(1 - x*(1 - x/72)/42)/20)/6;
	}


// Original for reference 
double StumpffSeriesCk(int k, double x) {
	int i;
	double term, sum = 0.0;
	double terms[100], num;
	term = 1.0;
	for ( i = 1; i <= 50; ++i ) {
		num = (k+2*i-1)*(k+2*i);
		term = term*(-x/num);
		terms[i] = term;
		if ( fabs(term) < StumpffSerTol ) break; 
		}
	for ( ; i > 0; --i )  sum += terms[i];
	sum = (1.0+sum)/util_fact(k);
	return sum; }

// X
int StumpffFunctions(double *c, double x) {
	double x2, tmp;
	int i;
/* What this is supposed to be
c[0] = StumpffSeriesCk(0, x);
c[1] = StumpffSeriesCk(1, x);
c[2] = StumpffSeriesCk(2, x);
c[3] = StumpffSeriesCk(3, x);
return 0;
*/
/* FOR REFERENCE, THIS COULD BE USED FOR LARGE VALUES, MAYBE 
	if ( x > STUMPFFPLAINLIM ) {
		x2 = sqrt(x);
		c[0] = cos(x2);
		c[1] = sin(x2)/x2;
		c[2] = (1.0-cos(x2))/x;
		c[3] = (x2-sin(x2))/(x*x2); 
		return 0; }
	if ( x < -STUMPFFPLAINLIM ) {
		x2 = sqrt(-x);
		c[0] = cosh(x2);
		c[1] = sinh(x2)/x2;
		c[2] = (cosh(x2)-1.0)/x;
		c[3] = (sinh(x2)-x2)/(x*x2); 
		return 0; }
*/
	if ( fabs(x) < STUMPFFSHORTTOL ) {
		c[2] = Stumpff2Short(x); 
		c[3] = Stumpff3Short(x);
		c[1] = 1.0 - x*c[3]; 
		c[0] = 1.0 - x*c[2];
		return 0;
	}
	if ( fabs(x) < STUMPFFRED ) {
		c[2] = HornerStumpff2(x); 
		c[3] = HornerStumpff3(x); 
		c[1] = 1.0 - x*c[3]; 
		c[0] = 1.0 - x*c[2];
		return 0;
	}
	// reduce value to small 
	x2 = x;
	for ( i = 1; i <= 20; ++i ) {
		x2 = x2/4.0;
		if ( fabs(x2) < STUMPFFRED ) 
			break; 
	}
	if ( i > 20 ) {
		fprintf(FLOG, "%s"," StumpfReduction No Good \n"); 
		CheckStumpffErrorCount();
		fflush(FLOG);
		return 0; 
	} 
	/* original 
		c[2] = StumpffSeriesCk(2, x2);
		c[3] = StumpffSeriesCk(3, x2);
	*/
	c[2] = HornerStumpff2(x2);
	c[3] = HornerStumpff3(x2);
	c[1] = 1.0 - x2*c[3];
	c[0] = 1.0 - x2*c[2];
	for ( ; i > 0; --i ) {
		tmp = c[2];
		c[2] = (c[1]*c[1])/2.0; 
		c[3] = (tmp+c[0]*c[3])/4.0; 
		c[1] = c[0]*c[1];
		c[0] = (2.0*c[0]*c[0])-1.0;
	}
	return i; 
}


/* ===============================================================================
*/

/* NOTE:
It seems that there might be a systematic error here, not sure yet.
The Newton solvers tend to end up systematically on a preferred side 
of the solution.  E.g., for x^2: if starting from above the solution,
it ends at a slightly larger value than the solution.  Starting from
 below, in the first step it will get to the previous case.  Since 
the solution cannot be computed to machine precision, we could have 
a systematic error.  Since the orbit spends more time near apocenter
than near pericenter, this seems to lead an increase in energy and 
decrease in angular momentum.  Perhaps this could be randomized with 
an additional bisection at the end?
*/
// X
double QuarticStumpffSolverIterator(double mu, double alpha, double s, 
		double r0, double u, double deltat) {
	double c[5];
	double d0, d1, d2, d3, s2, ssav = s, tmp1, tmp2, tmp3;
	double delta1, delta2, delta3;
	int i;
	for ( i = 1; i <= 20; ++i ) {
		s2 = s*s;
		StumpffFunctions(c, alpha*s2);
		tmp1 = s*c[1];
		tmp2 = s2*c[2];
		tmp3 = -r0*alpha+mu;
		d0 = u*tmp2+mu*s2*s*c[3];
		d0 += r0*tmp1;
		d0 -= deltat; 
		d1 = u*tmp1+mu*tmp2; 
		d1 += r0*c[0];
		d2 = u*c[0]+tmp3*tmp1; 
		d3 = tmp3*c[0]-u*alpha*c[1]; 
		delta1 = -d0/d1;
		delta2 = -d0/(d1+delta1*d2/2);
		delta3 = -d0/(d1+delta1*d2/2+delta2*delta2*d3/6);
		ssav= s;
		s = s + delta3;
		if ( fabs(delta3/s) < StumpffQSTol ) return s; 
	}
	fprintf(FLOG, "%s"," QUARTIC STUMPFF SOLVER NOT CONVERGENT? \n"); 
	fprintf(FLOG, "    s = %.16e \n", s); 
	fprintf(FLOG, "sprev = %.16e \n", ssav); 
	fprintf(FLOG, "delta3/s = %.16e \n", delta3/s); 
	fprintf(FLOG, "Stumpff x = %.16e \n", alpha*s2); 
	fprintf(FLOG, "deltas = %.16e ", delta1); fprintf(FLOG, "%.16e ", delta2); 
	fprintf(FLOG, "%.16e \n", delta3);
	CheckStumpffErrorCount();
	fflush(FLOG);
	return s; 
}

// X
double QuarticStumpffSolver(double mu, double alpha, double s, double r0, double u, double deltat) {
	/* initial guess */
	double tmp = deltat/r0;
	double s0 = tmp;
	double r0dot = u/r0;
	tmp = tmp*tmp;
	s0 -= 0.5*r0dot*tmp;
	/* third order in deltat */
	tmp = tmp*deltat/r0;
	s0 += (0.5*r0dot*r0dot-1/6*(mu-alpha*r0)/r0)*tmp;
	s = QuarticStumpffSolverIterator( mu, alpha, s0, r0, u, deltat);
	return s; 
}

/* ===============================================================================
*/
// X
void IncfgAndDerivsStumpff(double *vals, double mu, double alpha, double s, 
		double r0, double u, double deltat) {
	double r, s2, tmp1, tmp2, tmp3;
	double c[5];
	s = QuarticStumpffSolver( mu, alpha, s, r0, u, deltat);
	s2 = s*s;
	StumpffFunctions(c, alpha*s2);
	//recompute c[1] from angular mom. 
	tmp1 = s*c[1];
	tmp2 = s2*c[2];
	tmp3 = u*tmp1+mu*tmp2;
	/* r */
	r = tmp3+(r0*c[0]);
	vals[0] = r;
	tmp3 = -mu*tmp2;
	/* f */
	vals[1] = tmp3/r0;
	/* g, this is usually bad in terms of roundoff */
	vals[2] = r0*tmp1+u*tmp2;
	/* derivs */
	vals[3] = -(mu*tmp1)/(r0*r);
	vals[4] =  tmp3/r;
	return; 
}

void fgAndDerivsStumpff(double *vals, double mu, double alpha, double s, 
		double r0, double u, double deltat) {
	IncfgAndDerivsStumpff(vals, mu, alpha, s, r0, u, deltat);
	vals[1] += 1.0;
	vals[4] += 1.0;
	return; }

double KeplerStumpffStep(int dim, double mu, double *xv, double deltat) {
	double fgvals[5], tmp;
	double *x = xv;
	double *v = xv+dim;
	double v02 = 0.0, u = 0.0, r0 = 0.0;
	double r0dot, alpha;
	int i;
	for ( i = 1; i <= dim; ++i ) {
		v02 += v[i]*v[i];
		u += x[i]*v[i];
		r0 += x[i]*x[i]; }
	r0 = sqrt(r0);
	r0dot = u/r0;
/* a = 1.0/(2.0/r0 - v02/mu); alpha = mu/a; */
	alpha = 2.0*mu/r0 - v02;
	fgAndDerivsStumpff(fgvals, mu, alpha, 0.0, r0, u, deltat);
/* now update */
	for ( i = 1; i <= dim; ++i ) {
		tmp = x[i];
		x[i] = fgvals[1]*x[i]+fgvals[2]*v[i];
		v[i] = fgvals[3]*tmp+fgvals[4]*v[i]; }
	return 0.0; }


// X return only the increment 
double IncrementKeplerStumpffStep(int dim, double mu, double *xv, double deltat) {
	double fg[5], tmp, tmp2;
	double *x = xv;
	double *v = xv+dim;
	double alpha, v02 = 0.0, u = 0.0, r02 = 0.0, r0;
	int i, ii;
	double tmp1, r, delta1;
	for ( i = 1; i <= dim; ++i ) {
		v02 += v[i]*v[i];
		u += x[i]*v[i];
		r02 += x[i]*x[i]; 
	}
	r0 = sqrt(r02);
	/* a = 1.0/(2.0/r0 - v02/mu); alpha = mu/a; */
	alpha = 2.0*mu/r0 - v02;
	IncfgAndDerivsStumpff(fg, mu, alpha, 0.0, r0, u, deltat);

	// using fg'-gf' = 1 ( ' = dot) or total energy do not help much 
		
	// now update 
	for ( i = 1; i <= dim; ++i ) {
		tmp = x[i];
		x[i] = fg[1]*x[i]+fg[2]*v[i]; 
		v[i] = fg[3]*tmp+fg[4]*v[i];  
	}
	return 0.0; 
}

/* FOR TEST 
double IncrementKeplerStumpffStep(int dim, double mu, double *xv, double deltat) {
	double tmp[7], tmp2[7];
	int i;
	for ( i = 1; i <= 2*dim; ++i ) { tmp[i] = tmp2[i] = xv[i]; }
IncrementKeplerStumpffStepSub(dim, mu, tmp, deltat);
	for ( i = 1; i <= 2*dim; ++i ) tmp2[i] += tmp[i];
IncrementKeplerStumpffStepSub(dim, mu, tmp2, -deltat);
	for ( i = 1; i <= 2*dim; ++i ) xv[i] = (tmp[i]-tmp2[i])/2.0;
	return 0.0; }
*/