/*
 * N-body simulation step.
 *
 * Each work-item computes the new position and velocity of one particle.
 * Positions are staged through local memory one tile (one work-group's worth
 * of bodies) at a time, so every global position is fetched once per
 * work-group instead of once per work-item.
 *
 * Parameters:
 *   pos        - per-body float4; xyz is the position, w is used as the mass
 *                weight in the acceleration sum. Updated in place.
 *   vel        - per-body float4 velocity. Updated in place.
 *   numBodies  - number of bodies in pos/vel. It does not have to be a
 *                multiple of the work-group size; values <= 0 are treated
 *                as an empty system.
 *   deltaTime  - integration time step.
 *   epsSqr     - value added to the squared distance before the square root
 *                (softening term).
 *   localPos   - local scratch buffer, indexed by local id; must hold at
 *                least get_local_size(0) float4 elements.
 *
 * Work-items whose global id is >= numBodies still take part in tile loading
 * and in the barriers, but never touch pos/vel at their own index.
 *
 * NOTE(review): pos is read by every work-group for every tile and is also
 * written in place at the end of the kernel. The barriers used here are
 * work-group barriers, so one work-group may overwrite positions that another
 * work-group has not read yet. Removing that hazard needs separate output
 * buffers, which would change the kernel signature - confirm with the host
 * code before changing.
 */
__kernel void nbody_sim(__global float4* pos,
                        __global float4* vel,
                        int numBodies,
                        float deltaTime,
                        float epsSqr,
                        __local float4* localPos)
{
    const unsigned int tid = get_local_id(0);
    const unsigned int gid = get_global_id(0);
    const unsigned int localSize = get_local_size(0);

    // Clamp a negative count to zero so the unsigned arithmetic below is safe.
    const unsigned int bodyCount = (numBodies > 0) ? (unsigned int)numBodies : 0u;

    // Number of tiles to iterate, rounded up so a partial last tile is
    // included instead of being silently dropped.
    const unsigned int numTiles =
        bodyCount / localSize + ((bodyCount % localSize != 0u) ? 1u : 0u);

    // Position of this work-item's body; left at zero for padding work-items.
    float4 myPos = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
    if (gid < bodyCount) {
        myPos = pos[gid];
    }

    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    for (unsigned int i = 0; i < numTiles; ++i) {
        const unsigned int tileBase = i * localSize;

        // Load one tile into local memory; skip slots past the last body.
        const unsigned int idx = tileBase + tid;
        if (idx < bodyCount) {
            localPos[tid] = pos[idx];
        }

        // Synchronize to make sure the tile is available for processing.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Bodies actually present in this tile (fewer than localSize only in
        // the last tile). The value is identical for all work-items in the
        // group, so the loop stays uniform.
        const unsigned int remaining = bodyCount - tileBase;
        const unsigned int tileCount = (remaining < localSize) ? remaining : localSize;

        // Accumulate the acceleration caused by each body j of the tile:
        //   a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
        for (unsigned int j = 0; j < tileCount; ++j) {
            float4 r = localPos[j] - myPos;
            float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
            float invDist = 1.0f / sqrt(distSqr + epsSqr);
            float invDistCube = invDist * invDist * invDist;
            float s = localPos[j].w * invDistCube;

            // NOTE(review): r.w is the difference of the two w components,
            // so acc.w (and therefore newVel.w below) is not a meaningful
            // quantity; kept as-is to preserve the existing output.
            acc += s * r;
        }

        // Synchronize so that the next tile can be loaded safely.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Padding work-items have no body to update. Returning here is safe
    // because all barriers are behind us.
    if (gid >= bodyCount) {
        return;
    }

    float4 oldVel = vel[gid];

    // Updated position and velocity:
    //   x' = x + v*dt + 0.5*a*dt^2,  v' = v + a*dt
    float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
    newPos.w = myPos.w; // keep the body's w component unchanged

    float4 newVel = oldVel + acc * deltaTime;

    // Write back to global memory.
    pos[gid] = newPos;
    vel[gid] = newVel;
}