__constant float VERY_LARGE_NUMBER = 99999999; // A big number used fo scalling time steps;


__kernel void leapfrog_integrator(	__global float4* pos ,

									__global float4* vel,

									int numBodies,

								    float deltaTime,
		
								    float epsSqr,

								    __local float4* localPos,
								    
								    __local float4* localVel,
								    
								    __global float4* acc ,
								    
								    int initialAccComputation,
								   
								    __global float* collTime)

{
	//---------------------------------------
    unsigned int tid = get_local_id(0);		

    unsigned int gid = get_global_id(0);		// Index of the particle that it must update;

    unsigned int localSize = get_local_size(0); // This can be thought of as the number of threads
												// executing this kernel, for different work-items, 
												// concurrently and synchronously;

	//---------------------------------------
    // Number of tiles we need to iterate

    unsigned int numTiles = numBodies / localSize;


	//---------------------------------------
	// Reads the particle position (and mass) and velocity 
	// of particle "i" for which this kernel invocation 
	// is tasked to update. 
	
    float4 oldPos = pos[gid];
    float4 oldVel = vel[gid];

	//---------------------------------------
	// Initializes the float4 we will use to accumulate
	// the acceleration on particle "i".
	
    float4 oldAcc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

	float time_scale_sq = VERY_LARGE_NUMBER;	// Init the collision time [read theory
												// for more information;
												
    // Compute the partial v_(n+1) and x_(n+1):
	//oldVel += acc[gid]*deltaTime/2;
	//oldPos += oldVel*deltaTime;
	
    for(int i = 0; i < numTiles; ++i)

    {
        // load one tile into local memory

        int idx = i * localSize + tid;

        localPos[tid] = pos[idx];
        localVel[tid] = vel[idx];

        // Synchronize to make sure data is available for processing

        barrier(CLK_LOCAL_MEM_FENCE);

        for(int j = 0; j < localSize; ++j)

        {
			float4 rji, vji;
            float r2= 0.0f, r3= 0.0f, v2=0.0f;	// r^2, r^3, v^2;
           	
			rji = localPos[j] - oldPos;
			vji = localVel[j] - oldVel;
           
			r2 = pow(rji.x, 2) + pow(rji.y, 2) + pow(rji.z, 2);
			v2 = pow(vji.x, 2) + pow(vji.y, 2) + pow(vji.z, 2);

			if (r2 == 0)
				r2 = 0.000000000001;
           
			r3= r2*sqrt(r2);
		
        	//----------------------
			// The computation of acc
			
			oldAcc += oldPos.w*rji/r3;
			
			
			//----------------------
			// The computation of time_scale_sq:
			
			float estimate_sq= r2/v2;			// [distance]^2/[velocity]^2 = [time]^2
        	if (time_scale_sq > estimate_sq){
          		time_scale_sq = estimate_sq;
			}
			
			float a = (oldPos.w + localPos[j].w)/r2;
		    estimate_sq = sqrt(r2)/a;         	// [distance]/[acceleration] = [time]^2
			
			if (time_scale_sq > estimate_sq){
          		time_scale_sq = estimate_sq;
			}
	    }

        // Synchronize so that next tile can be loaded

        barrier(CLK_LOCAL_MEM_FENCE);
		
    }
    
    collTime[gid] = sqrt(time_scale_sq);
    
    // Now we compute the rest of the v_(n+1):
    //oldVel += oldAcc*deltaTime/2;
    
//	pos[gid] = oldPos;
//	vel[gid] = oldVel;
	acc[gid] = oldAcc;

}