__constant float VERY_LARGE_NUMBER = 99999999; // A big number used fo scalling time steps; __kernel void leapfrog_integrator( __global float4* pos , __global float4* vel, int numBodies, float deltaTime, float epsSqr, __local float4* localPos, __local float4* localVel, __global float4* acc , int initialAccComputation, __global float* collTime) { //--------------------------------------- unsigned int tid = get_local_id(0); unsigned int gid = get_global_id(0); // Index of the particle that it must update; unsigned int localSize = get_local_size(0); // This can be thought of as the number of threads // executing this kernel, for different work-items, // concurrently and synchronously; //--------------------------------------- // Number of tiles we need to iterate unsigned int numTiles = numBodies / localSize; //--------------------------------------- // Reads the particle position (and mass) and velocity // of particle "i" for which this kernel invocation // is tasked to update. float4 oldPos = pos[gid]; float4 oldVel = vel[gid]; //--------------------------------------- // Initializes the float4 we will use to accumulate // the acceleration on particle "i". float4 oldAcc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); float time_scale_sq = VERY_LARGE_NUMBER; // Init the collision time [read theory // for more information; // Compute the partial v_(n+1) and x_(n+1): //oldVel += acc[gid]*deltaTime/2; //oldPos += oldVel*deltaTime; for(int i = 0; i < numTiles; ++i) { // load one tile into local memory int idx = i * localSize + tid; localPos[tid] = pos[idx]; localVel[tid] = vel[idx]; // Synchronize to make sure data is available for processing barrier(CLK_LOCAL_MEM_FENCE); for(int j = 0; j < localSize; ++j) { float4 rji, vji; float r2= 0.0f, r3= 0.0f, v2=0.0f; // r^2, r^3, v^2; rji = localPos[j] - oldPos; vji = localVel[j] - oldVel; r2 = pow(rji.x, 2) + pow(rji.y, 2) + pow(rji.z, 2); v2 = pow(vji.x, 2) + pow(vji.y, 2) + pow(vji.z, 2); if (r2 == 0) r2 = 0.000000000001; r3= r2*sqrt(r2); //---------------------- // The computation of acc oldAcc += oldPos.w*rji/r3; //---------------------- // The computation of time_scale_sq: float estimate_sq= r2/v2; // [distance]^2/[velocity]^2 = [time]^2 if (time_scale_sq > estimate_sq){ time_scale_sq = estimate_sq; } float a = (oldPos.w + localPos[j].w)/r2; estimate_sq = sqrt(r2)/a; // [distance]/[acceleration] = [time]^2 if (time_scale_sq > estimate_sq){ time_scale_sq = estimate_sq; } } // Synchronize so that next tile can be loaded barrier(CLK_LOCAL_MEM_FENCE); } collTime[gid] = sqrt(time_scale_sq); // Now we compute the rest of the v_(n+1): //oldVel += oldAcc*deltaTime/2; // pos[gid] = oldPos; // vel[gid] = oldVel; acc[gid] = oldAcc; }