[152] | 1 | /* compute.c */ |
---|
| 2 | #include "SolarSim.h" |
---|
| 3 | |
---|
| 4 | inline void |
---|
| 5 | v_init( vector *v, real x, real y, real z ) { |
---|
| 6 | v->x = x; |
---|
| 7 | v->y = y; |
---|
| 8 | v->z = z; |
---|
| 9 | } |
---|
| 10 | |
---|
| 11 | #if __INTEL_COMPILER |
---|
| 12 | #if defined _DOUBLE_PRECISION_ |
---|
| 13 | /* double precision using SSE instructions */ |
---|
| 14 | inline void |
---|
| 15 | compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { |
---|
| 16 | __m128d r11, r12, r21, r22, d1, d2, t1, t2, a11, a12, a21, a22; |
---|
| 17 | double tmp; |
---|
| 18 | |
---|
| 19 | // load position vectors into SSE registers |
---|
| 20 | r11 = _mm_loadu_pd((double*)rv1); // (x1, y1) |
---|
| 21 | r12 = _mm_load1_pd((double*)rv1+2); // (z1, z1) |
---|
| 22 | r21 = _mm_loadu_pd((double*)rv2); // (x2, y2) |
---|
| 23 | r22 = _mm_load1_pd((double*)rv2+2); // (z2, z2) |
---|
| 24 | |
---|
| 25 | // load acceleration vectors |
---|
| 26 | a11 = _mm_loadu_pd((double*)av1); |
---|
| 27 | a12 = _mm_load1_pd((double*)av1+2); |
---|
| 28 | a21 = _mm_loadu_pd((double*)av2); |
---|
| 29 | a22 = _mm_load1_pd((double*)av2+2); |
---|
| 30 | // d = r2 - r1 |
---|
| 31 | d1= _mm_sub_pd(r21, r11); // (dx dy) |
---|
| 32 | d2= _mm_sub_pd(r22, r12); // (dz dz) |
---|
| 33 | |
---|
| 34 | t1 = _mm_mul_pd(d1, d1); // (dx^2, dy^2) |
---|
| 35 | t2 = _mm_mul_pd(d2, d2); // (dz^2, dz^2) |
---|
| 36 | t1 = _mm_hadd_pd(t1, t1); // (dx^2+dy^2, dx^2+dy^2) |
---|
| 37 | t1 = _mm_add_pd(t1, t2); // (dx^2+dy^2+dz^2, dx^2+dy^2+dz^2) |
---|
| 38 | |
---|
| 39 | t2 = _mm_sqrt_pd(t1); // t2 = (norm(d), norm(d)) |
---|
| 40 | d1 = _mm_div_pd(d1, t1); // d = d * (norm(d) ^(-3)) |
---|
| 41 | d1 = _mm_div_pd(d1, t2); // d = d * (norm(d) ^(-3)) |
---|
| 42 | d2 = _mm_div_pd(d2, t1); // d = d * (norm(d) ^(-3)) |
---|
| 43 | d2 = _mm_div_pd(d2, t2); // d = d * (norm(d) ^(-3)) |
---|
| 44 | |
---|
| 45 | tmp = G * m2; |
---|
| 46 | t1 = _mm_load1_pd(&tmp); |
---|
| 47 | t2 = _mm_mul_pd(t1, d1); |
---|
| 48 | a11 = _mm_add_pd(a11, t2); |
---|
| 49 | t2 = _mm_mul_pd(t1, d2); |
---|
| 50 | a12 = _mm_add_pd(a12, t2); |
---|
| 51 | |
---|
| 52 | tmp = - G * m1; |
---|
| 53 | t1 = _mm_load1_pd(&tmp); |
---|
| 54 | t2 = _mm_mul_pd(t1, d1); |
---|
| 55 | a21 = _mm_add_pd(a21, t2); |
---|
| 56 | t2 = _mm_mul_pd(t1, d2); |
---|
| 57 | a22 = _mm_add_pd(a22, t2); |
---|
| 58 | |
---|
| 59 | _mm_storeu_pd((double*)av1, a11); |
---|
| 60 | _mm_store_sd((double*)av1+2, a12); |
---|
| 61 | _mm_storeu_pd((double*)av2, a21); |
---|
| 62 | _mm_store_sd((double*)av2+2, a22); |
---|
| 63 | } |
---|
| 64 | |
---|
| 65 | inline void |
---|
| 66 | update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { |
---|
| 67 | __m128d r1, r2, v1, v2, a1, a2, t1, t2; |
---|
| 68 | double tmp; |
---|
| 69 | |
---|
| 70 | r1 = _mm_loadu_pd((double*)rv); |
---|
| 71 | r2 = _mm_load1_pd((double*)rv+2); |
---|
| 72 | v1 = _mm_loadu_pd((double*)vv); |
---|
| 73 | v2 = _mm_load1_pd((double*)vv+2); |
---|
| 74 | a1 = _mm_loadu_pd((double*)av); |
---|
| 75 | a2 = _mm_load1_pd((double*)av+2); |
---|
| 76 | |
---|
| 77 | tmp = DT; |
---|
| 78 | t1 = _mm_load1_pd(&tmp); // t1 = (DT,DT) |
---|
| 79 | |
---|
| 80 | t2 = _mm_mul_pd(v1, t1); // t2 = (vx*DT,vy*DT) |
---|
| 81 | r1 = _mm_add_pd(r1, t2); // r1 = (rx+vx*DT, ry+vy*DT) |
---|
| 82 | t2 = _mm_mul_pd(v2, t1); // t2 = (vz*DT,vz*DT) |
---|
| 83 | r2 = _mm_add_pd(r2, t2); // r2 = (rz+vz*DT, rz+vz*DT) |
---|
| 84 | |
---|
| 85 | t2 = _mm_mul_pd(a1, t1); // t2 = (ax*DT, ay*DT) |
---|
| 86 | v1 = _mm_add_pd(v1, t2); // v1 = (vx+ax*DT, vy+ay*DT) |
---|
| 87 | t2 = _mm_mul_pd(a2, t1); // t2 = (az*DT, az*DT) |
---|
| 88 | v2 = _mm_add_pd(v2, t2); // v1 = (vz+az*DT, vz+az*DT) |
---|
| 89 | |
---|
| 90 | tmp = DT*DT/2; |
---|
| 91 | t1 = _mm_load1_pd(&tmp); // t1 = (DT*DT/2, DT*DT/2) |
---|
| 92 | |
---|
| 93 | t2 = _mm_mul_pd(a1, t1); // t2 = (ax*DT*DT/2,ay*DT*DT/2) |
---|
| 94 | r1 = _mm_add_pd(r1, t2); // r1 = (rx+vx*DT+ax*DT*DT/2, ry+vy*DT+ay*DT*DT/2) |
---|
| 95 | t2 = _mm_mul_pd(a2, t1); // t2 = (az*DT*DT/2,az*DT*DT/2) |
---|
| 96 | r2 = _mm_add_pd(r2, t2); // r2 = (rz+vz*DT+az*DT*DT/2, rz+vz*DT+az*DT*DT/2) |
---|
| 97 | |
---|
| 98 | _mm_storeu_pd((double*)rv, r1); |
---|
| 99 | _mm_store_sd((double*)rv+2, r2); |
---|
| 100 | _mm_storeu_pd((double*)vv, v1); |
---|
| 101 | _mm_store_sd((double*)vv+2, v2); |
---|
| 102 | } |
---|
| 103 | |
---|
| 104 | #elif defined _SINGLE_PRECISION_ |
---|
| 105 | /* single precision using SSE instructions */ |
---|
| 106 | inline void |
---|
| 107 | compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { |
---|
| 108 | __m128 r1, r2, d, t1, t2, a1, a2; |
---|
| 109 | float tmp; |
---|
| 110 | |
---|
| 111 | // load position and acceleration vectors into SSE registers |
---|
| 112 | r1 = _mm_loadu_ps((float*)rv1); |
---|
| 113 | r2 = _mm_loadu_ps((float*)rv2); |
---|
| 114 | a1 = _mm_loadu_ps((float*)av1); |
---|
| 115 | a2 = _mm_loadu_ps((float*)av2); |
---|
| 116 | |
---|
| 117 | d = _mm_sub_ps(r2, r1); // d = r2 - r1 |
---|
| 118 | t1 = _mm_mul_ps(d, d); |
---|
| 119 | t1 = _mm_hadd_ps(t1, t1); |
---|
| 120 | t1 = _mm_hadd_ps(t1, t1); // t1 = norm(d) ^ 2 |
---|
| 121 | t2 = _mm_rcp_ps(t1); // t2 = norm(d) ^ (-2) |
---|
| 122 | t1 = _mm_rsqrt_ps(t1); // t1 = norm(d) ^ (-1) |
---|
| 123 | d = _mm_mul_ps(d, t1); // d = d * (norm(d) ^(-1)) |
---|
| 124 | d = _mm_mul_ps(d, t2); // d = d * (norm(d) ^(-3)) |
---|
| 125 | |
---|
| 126 | tmp = G * m2; |
---|
| 127 | t1 = _mm_load1_ps(&tmp); |
---|
| 128 | t1 = _mm_mul_ps(t1, d); |
---|
| 129 | a1 = _mm_add_ps(a1, t1); |
---|
| 130 | |
---|
| 131 | tmp = - G * m1; |
---|
| 132 | t2 = _mm_load1_ps(&tmp); |
---|
| 133 | t2 = _mm_mul_ps(t2, d); |
---|
| 134 | a2 = _mm_add_ps(a2, t2); |
---|
| 135 | |
---|
| 136 | _mm_storeu_ps((float*)av1, a1); |
---|
| 137 | _mm_storeu_ps((float*)av2, a2); |
---|
| 138 | } |
---|
| 139 | |
---|
| 140 | inline void |
---|
| 141 | update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { |
---|
| 142 | __m128 r, v, a, t1, t2; |
---|
| 143 | float tmp; |
---|
| 144 | |
---|
| 145 | r = _mm_loadu_ps((float*)rv); |
---|
| 146 | v = _mm_loadu_ps((float*)vv); |
---|
| 147 | a = _mm_loadu_ps((float*)av); |
---|
| 148 | |
---|
| 149 | tmp = DT; |
---|
| 150 | t1 = _mm_load1_ps(&tmp); // t1 = (DT,DT,DT) |
---|
| 151 | t2 = _mm_mul_ps(v, t1); // t2 = v*DT |
---|
| 152 | r = _mm_add_ps(r, t2); // r = r + v*DT |
---|
| 153 | |
---|
| 154 | t2 = _mm_mul_ps(a, t1); // t2 = a*DT |
---|
| 155 | v = _mm_add_ps(v, t2); // v = v + a*DT |
---|
| 156 | |
---|
| 157 | tmp = DT*DT/2; |
---|
| 158 | t1 = _mm_load1_ps(&tmp); |
---|
| 159 | t2 = _mm_mul_ps(a, t1); // t2 = a * (DT*DT/2) |
---|
| 160 | r = _mm_add_ps(r, t2); // r = r + v*DT + a*(DT*DT/2) |
---|
| 161 | |
---|
| 162 | _mm_storeu_ps((float*)rv, r); |
---|
| 163 | _mm_storeu_ps((float*)vv, v); |
---|
| 164 | } |
---|
| 165 | #else |
---|
| 166 | #error "Either _SINGLE_PRECISION_ or _DOUBLE_PRECISION_ must be defined" |
---|
| 167 | #endif |
---|
| 168 | |
---|
| 169 | #else |
---|
| 170 | /* Compatibility mode without SSE instructions */ |
---|
| 171 | inline void |
---|
| 172 | compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { |
---|
| 173 | // varianta fara SSE |
---|
| 174 | vector d; |
---|
| 175 | real w, norm3; |
---|
| 176 | |
---|
| 177 | d.x = rv2->x - rv1->x; |
---|
| 178 | d.y = rv2->y - rv1->y; |
---|
| 179 | d.z = rv2->z - rv1->z; |
---|
| 180 | |
---|
| 181 | w = d.x*d.x + d.y*d.y + d.z*d.z; // norm^2 |
---|
| 182 | norm3 = w * sqrt(w); // norm^3 |
---|
| 183 | |
---|
| 184 | av1->x += G * m2 * d.x / norm3; |
---|
| 185 | av1->y += G * m2 * d.y / norm3; |
---|
| 186 | av1->z += G * m2 * d.z / norm3; |
---|
| 187 | |
---|
| 188 | av2->x -= G * m1 * d.x / norm3; |
---|
| 189 | av2->y -= G * m1 * d.y / norm3; |
---|
| 190 | av2->z -= G * m1 * d.z / norm3; |
---|
| 191 | } |
---|
| 192 | |
---|
| 193 | inline void |
---|
| 194 | update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { |
---|
| 195 | rv->x = rv->x + vv->x * DT + av->x * (DT*DT)/2; |
---|
| 196 | rv->y = rv->y + vv->y * DT + av->y * (DT*DT)/2; |
---|
| 197 | rv->z = rv->z + vv->z * DT + av->z * (DT*DT)/2; |
---|
| 198 | |
---|
| 199 | vv->x = vv->x + av->x * DT; |
---|
| 200 | vv->y = vv->y + av->y * DT; |
---|
| 201 | vv->z = vv->z + av->z * DT; |
---|
| 202 | } |
---|
| 203 | #endif |
---|
| 204 | |
---|