/* compute.c */ #include "SolarSim.h" inline void v_init( vector *v, real x, real y, real z ) { v->x = x; v->y = y; v->z = z; } #if __INTEL_COMPILER #if defined _DOUBLE_PRECISION_ /* double precision using SSE instructions */ inline void compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { __m128d r11, r12, r21, r22, d1, d2, t1, t2, a11, a12, a21, a22; double tmp; // load position vectors into SSE registers r11 = _mm_loadu_pd((double*)rv1); // (x1, y1) r12 = _mm_load1_pd((double*)rv1+2); // (z1, z1) r21 = _mm_loadu_pd((double*)rv2); // (x2, y2) r22 = _mm_load1_pd((double*)rv2+2); // (z2, z2) // load acceleration vectors a11 = _mm_loadu_pd((double*)av1); a12 = _mm_load1_pd((double*)av1+2); a21 = _mm_loadu_pd((double*)av2); a22 = _mm_load1_pd((double*)av2+2); // d = r2 - r1 d1= _mm_sub_pd(r21, r11); // (dx dy) d2= _mm_sub_pd(r22, r12); // (dz dz) t1 = _mm_mul_pd(d1, d1); // (dx^2, dy^2) t2 = _mm_mul_pd(d2, d2); // (dz^2, dz^2) t1 = _mm_hadd_pd(t1, t1); // (dx^2+dy^2, dx^2+dy^2) t1 = _mm_add_pd(t1, t2); // (dx^2+dy^2+dz^2, dx^2+dy^2+dz^2) t2 = _mm_sqrt_pd(t1); // t2 = (norm(d), norm(d)) d1 = _mm_div_pd(d1, t1); // d = d * (norm(d) ^(-3)) d1 = _mm_div_pd(d1, t2); // d = d * (norm(d) ^(-3)) d2 = _mm_div_pd(d2, t1); // d = d * (norm(d) ^(-3)) d2 = _mm_div_pd(d2, t2); // d = d * (norm(d) ^(-3)) tmp = G * m2; t1 = _mm_load1_pd(&tmp); t2 = _mm_mul_pd(t1, d1); a11 = _mm_add_pd(a11, t2); t2 = _mm_mul_pd(t1, d2); a12 = _mm_add_pd(a12, t2); tmp = - G * m1; t1 = _mm_load1_pd(&tmp); t2 = _mm_mul_pd(t1, d1); a21 = _mm_add_pd(a21, t2); t2 = _mm_mul_pd(t1, d2); a22 = _mm_add_pd(a22, t2); _mm_storeu_pd((double*)av1, a11); _mm_store_sd((double*)av1+2, a12); _mm_storeu_pd((double*)av2, a21); _mm_store_sd((double*)av2+2, a22); } inline void update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { __m128d r1, r2, v1, v2, a1, a2, t1, t2; double tmp; r1 = _mm_loadu_pd((double*)rv); r2 = _mm_load1_pd((double*)rv+2); v1 = _mm_loadu_pd((double*)vv); v2 = _mm_load1_pd((double*)vv+2); a1 = _mm_loadu_pd((double*)av); a2 = _mm_load1_pd((double*)av+2); tmp = DT; t1 = _mm_load1_pd(&tmp); // t1 = (DT,DT) t2 = _mm_mul_pd(v1, t1); // t2 = (vx*DT,vy*DT) r1 = _mm_add_pd(r1, t2); // r1 = (rx+vx*DT, ry+vy*DT) t2 = _mm_mul_pd(v2, t1); // t2 = (vz*DT,vz*DT) r2 = _mm_add_pd(r2, t2); // r2 = (rz+vz*DT, rz+vz*DT) t2 = _mm_mul_pd(a1, t1); // t2 = (ax*DT, ay*DT) v1 = _mm_add_pd(v1, t2); // v1 = (vx+ax*DT, vy+ay*DT) t2 = _mm_mul_pd(a2, t1); // t2 = (az*DT, az*DT) v2 = _mm_add_pd(v2, t2); // v1 = (vz+az*DT, vz+az*DT) tmp = DT*DT/2; t1 = _mm_load1_pd(&tmp); // t1 = (DT*DT/2, DT*DT/2) t2 = _mm_mul_pd(a1, t1); // t2 = (ax*DT*DT/2,ay*DT*DT/2) r1 = _mm_add_pd(r1, t2); // r1 = (rx+vx*DT+ax*DT*DT/2, ry+vy*DT+ay*DT*DT/2) t2 = _mm_mul_pd(a2, t1); // t2 = (az*DT*DT/2,az*DT*DT/2) r2 = _mm_add_pd(r2, t2); // r2 = (rz+vz*DT+az*DT*DT/2, rz+vz*DT+az*DT*DT/2) _mm_storeu_pd((double*)rv, r1); _mm_store_sd((double*)rv+2, r2); _mm_storeu_pd((double*)vv, v1); _mm_store_sd((double*)vv+2, v2); } #elif defined _SINGLE_PRECISION_ /* single precision using SSE instructions */ inline void compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { __m128 r1, r2, d, t1, t2, a1, a2; float tmp; // load position and acceleration vectors into SSE registers r1 = _mm_loadu_ps((float*)rv1); r2 = _mm_loadu_ps((float*)rv2); a1 = _mm_loadu_ps((float*)av1); a2 = _mm_loadu_ps((float*)av2); d = _mm_sub_ps(r2, r1); // d = r2 - r1 t1 = _mm_mul_ps(d, d); t1 = _mm_hadd_ps(t1, t1); t1 = _mm_hadd_ps(t1, t1); // t1 = norm(d) ^ 2 t2 = _mm_rcp_ps(t1); // t2 = norm(d) ^ (-2) t1 = _mm_rsqrt_ps(t1); // t1 = norm(d) ^ (-1) d = _mm_mul_ps(d, t1); // d = d * (norm(d) ^(-1)) d = _mm_mul_ps(d, t2); // d = d * (norm(d) ^(-3)) tmp = G * m2; t1 = _mm_load1_ps(&tmp); t1 = _mm_mul_ps(t1, d); a1 = _mm_add_ps(a1, t1); tmp = - G * m1; t2 = _mm_load1_ps(&tmp); t2 = _mm_mul_ps(t2, d); a2 = _mm_add_ps(a2, t2); _mm_storeu_ps((float*)av1, a1); _mm_storeu_ps((float*)av2, a2); } inline void update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { __m128 r, v, a, t1, t2; float tmp; r = _mm_loadu_ps((float*)rv); v = _mm_loadu_ps((float*)vv); a = _mm_loadu_ps((float*)av); tmp = DT; t1 = _mm_load1_ps(&tmp); // t1 = (DT,DT,DT) t2 = _mm_mul_ps(v, t1); // t2 = v*DT r = _mm_add_ps(r, t2); // r = r + v*DT t2 = _mm_mul_ps(a, t1); // t2 = a*DT v = _mm_add_ps(v, t2); // v = v + a*DT tmp = DT*DT/2; t1 = _mm_load1_ps(&tmp); t2 = _mm_mul_ps(a, t1); // t2 = a * (DT*DT/2) r = _mm_add_ps(r, t2); // r = r + v*DT + a*(DT*DT/2) _mm_storeu_ps((float*)rv, r); _mm_storeu_ps((float*)vv, v); } #else #error "Either _SINGLE_PRECISION_ or _DOUBLE_PRECISION_ must be defined" #endif #else /* Compatibility mode without SSE instructions */ inline void compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { // varianta fara SSE vector d; real w, norm3; d.x = rv2->x - rv1->x; d.y = rv2->y - rv1->y; d.z = rv2->z - rv1->z; w = d.x*d.x + d.y*d.y + d.z*d.z; // norm^2 norm3 = w * sqrt(w); // norm^3 av1->x += G * m2 * d.x / norm3; av1->y += G * m2 * d.y / norm3; av1->z += G * m2 * d.z / norm3; av2->x -= G * m1 * d.x / norm3; av2->y -= G * m1 * d.y / norm3; av2->z -= G * m1 * d.z / norm3; } inline void update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { rv->x = rv->x + vv->x * DT + av->x * (DT*DT)/2; rv->y = rv->y + vv->y * DT + av->y * (DT*DT)/2; rv->z = rv->z + vv->z * DT + av->z * (DT*DT)/2; vv->x = vv->x + av->x * DT; vv->y = vv->y + av->y * DT; vv->z = vv->z + av->z * DT; } #endif