1 | /* compute.c */ |
---|
2 | #include "SolarSim.h" |
---|
3 | |
---|
4 | inline void |
---|
5 | v_init( vector *v, real x, real y, real z ) { |
---|
6 | v->x = x; |
---|
7 | v->y = y; |
---|
8 | v->z = z; |
---|
9 | } |
---|
10 | |
---|
11 | #if __INTEL_COMPILER |
---|
12 | #if defined _DOUBLE_PRECISION_ |
---|
13 | /* double precision using SSE instructions */ |
---|
14 | inline void |
---|
15 | compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { |
---|
16 | __m128d r11, r12, r21, r22, d1, d2, t1, t2, a11, a12, a21, a22; |
---|
17 | double tmp; |
---|
18 | |
---|
19 | // load position vectors into SSE registers |
---|
20 | r11 = _mm_loadu_pd((double*)rv1); // (x1, y1) |
---|
21 | r12 = _mm_load1_pd((double*)rv1+2); // (z1, z1) |
---|
22 | r21 = _mm_loadu_pd((double*)rv2); // (x2, y2) |
---|
23 | r22 = _mm_load1_pd((double*)rv2+2); // (z2, z2) |
---|
24 | |
---|
25 | // load acceleration vectors |
---|
26 | a11 = _mm_loadu_pd((double*)av1); |
---|
27 | a12 = _mm_load1_pd((double*)av1+2); |
---|
28 | a21 = _mm_loadu_pd((double*)av2); |
---|
29 | a22 = _mm_load1_pd((double*)av2+2); |
---|
30 | // d = r2 - r1 |
---|
31 | d1= _mm_sub_pd(r21, r11); // (dx dy) |
---|
32 | d2= _mm_sub_pd(r22, r12); // (dz dz) |
---|
33 | |
---|
34 | t1 = _mm_mul_pd(d1, d1); // (dx^2, dy^2) |
---|
35 | t2 = _mm_mul_pd(d2, d2); // (dz^2, dz^2) |
---|
36 | t1 = _mm_hadd_pd(t1, t1); // (dx^2+dy^2, dx^2+dy^2) |
---|
37 | t1 = _mm_add_pd(t1, t2); // (dx^2+dy^2+dz^2, dx^2+dy^2+dz^2) |
---|
38 | |
---|
39 | t2 = _mm_sqrt_pd(t1); // t2 = (norm(d), norm(d)) |
---|
40 | d1 = _mm_div_pd(d1, t1); // d = d * (norm(d) ^(-3)) |
---|
41 | d1 = _mm_div_pd(d1, t2); // d = d * (norm(d) ^(-3)) |
---|
42 | d2 = _mm_div_pd(d2, t1); // d = d * (norm(d) ^(-3)) |
---|
43 | d2 = _mm_div_pd(d2, t2); // d = d * (norm(d) ^(-3)) |
---|
44 | |
---|
45 | tmp = G * m2; |
---|
46 | t1 = _mm_load1_pd(&tmp); |
---|
47 | t2 = _mm_mul_pd(t1, d1); |
---|
48 | a11 = _mm_add_pd(a11, t2); |
---|
49 | t2 = _mm_mul_pd(t1, d2); |
---|
50 | a12 = _mm_add_pd(a12, t2); |
---|
51 | |
---|
52 | tmp = - G * m1; |
---|
53 | t1 = _mm_load1_pd(&tmp); |
---|
54 | t2 = _mm_mul_pd(t1, d1); |
---|
55 | a21 = _mm_add_pd(a21, t2); |
---|
56 | t2 = _mm_mul_pd(t1, d2); |
---|
57 | a22 = _mm_add_pd(a22, t2); |
---|
58 | |
---|
59 | _mm_storeu_pd((double*)av1, a11); |
---|
60 | _mm_store_sd((double*)av1+2, a12); |
---|
61 | _mm_storeu_pd((double*)av2, a21); |
---|
62 | _mm_store_sd((double*)av2+2, a22); |
---|
63 | } |
---|
64 | |
---|
65 | inline void |
---|
66 | update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { |
---|
67 | __m128d r1, r2, v1, v2, a1, a2, t1, t2; |
---|
68 | double tmp; |
---|
69 | |
---|
70 | r1 = _mm_loadu_pd((double*)rv); |
---|
71 | r2 = _mm_load1_pd((double*)rv+2); |
---|
72 | v1 = _mm_loadu_pd((double*)vv); |
---|
73 | v2 = _mm_load1_pd((double*)vv+2); |
---|
74 | a1 = _mm_loadu_pd((double*)av); |
---|
75 | a2 = _mm_load1_pd((double*)av+2); |
---|
76 | |
---|
77 | tmp = DT; |
---|
78 | t1 = _mm_load1_pd(&tmp); // t1 = (DT,DT) |
---|
79 | |
---|
80 | t2 = _mm_mul_pd(v1, t1); // t2 = (vx*DT,vy*DT) |
---|
81 | r1 = _mm_add_pd(r1, t2); // r1 = (rx+vx*DT, ry+vy*DT) |
---|
82 | t2 = _mm_mul_pd(v2, t1); // t2 = (vz*DT,vz*DT) |
---|
83 | r2 = _mm_add_pd(r2, t2); // r2 = (rz+vz*DT, rz+vz*DT) |
---|
84 | |
---|
85 | t2 = _mm_mul_pd(a1, t1); // t2 = (ax*DT, ay*DT) |
---|
86 | v1 = _mm_add_pd(v1, t2); // v1 = (vx+ax*DT, vy+ay*DT) |
---|
87 | t2 = _mm_mul_pd(a2, t1); // t2 = (az*DT, az*DT) |
---|
88 | v2 = _mm_add_pd(v2, t2); // v1 = (vz+az*DT, vz+az*DT) |
---|
89 | |
---|
90 | tmp = DT*DT/2; |
---|
91 | t1 = _mm_load1_pd(&tmp); // t1 = (DT*DT/2, DT*DT/2) |
---|
92 | |
---|
93 | t2 = _mm_mul_pd(a1, t1); // t2 = (ax*DT*DT/2,ay*DT*DT/2) |
---|
94 | r1 = _mm_add_pd(r1, t2); // r1 = (rx+vx*DT+ax*DT*DT/2, ry+vy*DT+ay*DT*DT/2) |
---|
95 | t2 = _mm_mul_pd(a2, t1); // t2 = (az*DT*DT/2,az*DT*DT/2) |
---|
96 | r2 = _mm_add_pd(r2, t2); // r2 = (rz+vz*DT+az*DT*DT/2, rz+vz*DT+az*DT*DT/2) |
---|
97 | |
---|
98 | _mm_storeu_pd((double*)rv, r1); |
---|
99 | _mm_store_sd((double*)rv+2, r2); |
---|
100 | _mm_storeu_pd((double*)vv, v1); |
---|
101 | _mm_store_sd((double*)vv+2, v2); |
---|
102 | } |
---|
103 | |
---|
104 | #elif defined _SINGLE_PRECISION_ |
---|
105 | /* single precision using SSE instructions */ |
---|
106 | inline void |
---|
107 | compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { |
---|
108 | __m128 r1, r2, d, t1, t2, a1, a2; |
---|
109 | float tmp; |
---|
110 | |
---|
111 | // load position and acceleration vectors into SSE registers |
---|
112 | r1 = _mm_loadu_ps((float*)rv1); |
---|
113 | r2 = _mm_loadu_ps((float*)rv2); |
---|
114 | a1 = _mm_loadu_ps((float*)av1); |
---|
115 | a2 = _mm_loadu_ps((float*)av2); |
---|
116 | |
---|
117 | d = _mm_sub_ps(r2, r1); // d = r2 - r1 |
---|
118 | t1 = _mm_mul_ps(d, d); |
---|
119 | t1 = _mm_hadd_ps(t1, t1); |
---|
120 | t1 = _mm_hadd_ps(t1, t1); // t1 = norm(d) ^ 2 |
---|
121 | t2 = _mm_rcp_ps(t1); // t2 = norm(d) ^ (-2) |
---|
122 | t1 = _mm_rsqrt_ps(t1); // t1 = norm(d) ^ (-1) |
---|
123 | d = _mm_mul_ps(d, t1); // d = d * (norm(d) ^(-1)) |
---|
124 | d = _mm_mul_ps(d, t2); // d = d * (norm(d) ^(-3)) |
---|
125 | |
---|
126 | tmp = G * m2; |
---|
127 | t1 = _mm_load1_ps(&tmp); |
---|
128 | t1 = _mm_mul_ps(t1, d); |
---|
129 | a1 = _mm_add_ps(a1, t1); |
---|
130 | |
---|
131 | tmp = - G * m1; |
---|
132 | t2 = _mm_load1_ps(&tmp); |
---|
133 | t2 = _mm_mul_ps(t2, d); |
---|
134 | a2 = _mm_add_ps(a2, t2); |
---|
135 | |
---|
136 | _mm_storeu_ps((float*)av1, a1); |
---|
137 | _mm_storeu_ps((float*)av2, a2); |
---|
138 | } |
---|
139 | |
---|
140 | inline void |
---|
141 | update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { |
---|
142 | __m128 r, v, a, t1, t2; |
---|
143 | float tmp; |
---|
144 | |
---|
145 | r = _mm_loadu_ps((float*)rv); |
---|
146 | v = _mm_loadu_ps((float*)vv); |
---|
147 | a = _mm_loadu_ps((float*)av); |
---|
148 | |
---|
149 | tmp = DT; |
---|
150 | t1 = _mm_load1_ps(&tmp); // t1 = (DT,DT,DT) |
---|
151 | t2 = _mm_mul_ps(v, t1); // t2 = v*DT |
---|
152 | r = _mm_add_ps(r, t2); // r = r + v*DT |
---|
153 | |
---|
154 | t2 = _mm_mul_ps(a, t1); // t2 = a*DT |
---|
155 | v = _mm_add_ps(v, t2); // v = v + a*DT |
---|
156 | |
---|
157 | tmp = DT*DT/2; |
---|
158 | t1 = _mm_load1_ps(&tmp); |
---|
159 | t2 = _mm_mul_ps(a, t1); // t2 = a * (DT*DT/2) |
---|
160 | r = _mm_add_ps(r, t2); // r = r + v*DT + a*(DT*DT/2) |
---|
161 | |
---|
162 | _mm_storeu_ps((float*)rv, r); |
---|
163 | _mm_storeu_ps((float*)vv, v); |
---|
164 | } |
---|
165 | #else |
---|
166 | #error "Either _SINGLE_PRECISION_ or _DOUBLE_PRECISION_ must be defined" |
---|
167 | #endif |
---|
168 | |
---|
169 | #else |
---|
170 | /* Compatibility mode without SSE instructions */ |
---|
171 | inline void |
---|
172 | compute_acc( vector *rv1, vector *rv2, vector *vv1, vector *vv2, vector *av1, vector *av2, real m1, real m2 ) { |
---|
173 | // varianta fara SSE |
---|
174 | vector d; |
---|
175 | real w, norm3; |
---|
176 | |
---|
177 | d.x = rv2->x - rv1->x; |
---|
178 | d.y = rv2->y - rv1->y; |
---|
179 | d.z = rv2->z - rv1->z; |
---|
180 | |
---|
181 | w = d.x*d.x + d.y*d.y + d.z*d.z; // norm^2 |
---|
182 | norm3 = w * sqrt(w); // norm^3 |
---|
183 | |
---|
184 | av1->x += G * m2 * d.x / norm3; |
---|
185 | av1->y += G * m2 * d.y / norm3; |
---|
186 | av1->z += G * m2 * d.z / norm3; |
---|
187 | |
---|
188 | av2->x -= G * m1 * d.x / norm3; |
---|
189 | av2->y -= G * m1 * d.y / norm3; |
---|
190 | av2->z -= G * m1 * d.z / norm3; |
---|
191 | } |
---|
192 | |
---|
193 | inline void |
---|
194 | update_pos_vel( vector *rv, vector *vv, vector *av, long DT ) { |
---|
195 | rv->x = rv->x + vv->x * DT + av->x * (DT*DT)/2; |
---|
196 | rv->y = rv->y + vv->y * DT + av->y * (DT*DT)/2; |
---|
197 | rv->z = rv->z + vv->z * DT + av->z * (DT*DT)/2; |
---|
198 | |
---|
199 | vv->x = vv->x + av->x * DT; |
---|
200 | vv->y = vv->y + av->y * DT; |
---|
201 | vv->z = vv->z + av->z * DT; |
---|
202 | } |
---|
203 | #endif |
---|
204 | |
---|