[77] | 1 | // example1.cpp : Defines the entry point for the console application. |
---|
| 2 | // |
---|
| 3 | |
---|
| 4 | //#include "stdafx.h" |
---|
| 5 | |
---|
| 6 | #include <vector> |
---|
| 7 | #include <stdio.h> |
---|
| 8 | #include <stdlib.h> |
---|
| 9 | #include <float.h> |
---|
| 10 | #include <cuda.h> |
---|
| 11 | #include <cudpp.h> |
---|
| 12 | #include <cutil.h> |
---|
| 13 | #include <math_functions.h> |
---|
| 14 | #include <image.h> |
---|
| 15 | #include <misc.h> |
---|
| 16 | #include <pnmfile.h> |
---|
| 17 | #include <cutil_math.h> |
---|
| 18 | #include <sys/time.h> |
---|
| 19 | #include <unistd.h> |
---|
| 20 | |
---|
| 21 | |
---|
| 22 | #include "segment-image.h" |
---|
| 23 | |
---|
| 24 | |
---|
| 25 | #define BLOCK_DIM_X 12 |
---|
| 26 | #define BLOCK_DIM_Y 8 |
---|
| 27 | |
---|
| 28 | #define max FLT_MAX |
---|
| 29 | |
---|
| 30 | |
---|
| 31 | |
---|
| 32 | __global__ void convolve_even_gpu(imagef *src, imagef *dst, float *mask, int dim_mask){ |
---|
| 33 | int w = src->w; |
---|
| 34 | int h = src->h; |
---|
| 35 | int x = blockIdx.x * blockDim.x + threadIdx.x; |
---|
| 36 | int y = blockIdx.y * blockDim.y + threadIdx.y; |
---|
| 37 | int idx = x + y*w; |
---|
| 38 | int i; |
---|
| 39 | float sum; |
---|
| 40 | int mmax,mmin; |
---|
| 41 | |
---|
| 42 | if (x>=w || y>=h) |
---|
| 43 | return; |
---|
| 44 | |
---|
| 45 | sum = mask[0] * src->data[idx]; |
---|
| 46 | |
---|
| 47 | for (i = 1; i < dim_mask; i++) { |
---|
| 48 | mmax = (x-i > 0 ? x-i : 0); |
---|
| 49 | //mmax = max(x-i,0);// > 0 ? x-i : 0) |
---|
| 50 | mmin = (x+i > w-1 ? x+i : w-1); |
---|
| 51 | sum += mask[i] * (src->data[mmax + y * w]) + src->data[mmin + y * w]; |
---|
| 52 | } |
---|
| 53 | dst->data[y + h * x] = sum; |
---|
| 54 | |
---|
| 55 | } |
---|
| 56 | |
---|
| 57 | |
---|
| 58 | __global__ void create_rgb_smooth(imagef *r, imagef *g, imagef *b, image *input) |
---|
| 59 | { |
---|
| 60 | int w = input->w; |
---|
| 61 | int h = input->h; |
---|
| 62 | int x = blockIdx.x * blockDim.x + threadIdx.x; |
---|
| 63 | int y = blockIdx.y * blockDim.y + threadIdx.y; |
---|
| 64 | int idx = x + y*w; |
---|
| 65 | |
---|
| 66 | if (x>=w || y>=h) |
---|
| 67 | return; |
---|
| 68 | |
---|
| 69 | r->data[idx] = input->data[idx].r; |
---|
| 70 | g->data[idx] = input->data[idx].g; |
---|
| 71 | b->data[idx] = input->data[idx].b; |
---|
| 72 | |
---|
| 73 | } |
---|
| 74 | |
---|
| 75 | #define diffgpu(r, g, b, x1, y1, x2, y2) \ |
---|
| 76 | sqrtf(powf(r->data[x1+w*y1]-r->data[x2+w*y2],2) + \ |
---|
| 77 | powf(g->data[x1+w*y1]-g->data[x2+w*y2],2) + \ |
---|
| 78 | powf(b->data[x1+w*y1]-b->data[x2+w*y2],2)); \ |
---|
| 79 | |
---|
| 80 | |
---|
| 81 | // Kernel that executes on the CUDA device |
---|
| 82 | __global__ void compute_edges(image *input, edge *edges, edge **edges_location,float *weights,imagef *smooth_r, |
---|
| 83 | imagef *smooth_g,imagef *smooth_b ) |
---|
| 84 | { |
---|
| 85 | //int idx = blockIdx.x * blockDim.x + threadIdx.x; |
---|
| 86 | int w = input->w; |
---|
| 87 | int h = input->h; |
---|
| 88 | int x = blockIdx.x * blockDim.x + threadIdx.x; |
---|
| 89 | int y = blockIdx.y * blockDim.y + threadIdx.y; |
---|
| 90 | int idx = x + y*w; |
---|
| 91 | |
---|
| 92 | // indx[idx] = idx; |
---|
| 93 | |
---|
| 94 | /* if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0) |
---|
| 95 | input->w = input->w + 1; |
---|
| 96 | |
---|
| 97 | edges[0].a = 23; |
---|
| 98 | edges[0].b = 24; |
---|
| 99 | */ |
---|
| 100 | |
---|
| 101 | if (x>=w || y>=h) |
---|
| 102 | return; |
---|
| 103 | |
---|
| 104 | if (x < w-1) |
---|
| 105 | { |
---|
| 106 | edges[idx*4].a = y * w + x; |
---|
| 107 | edges[idx*4].b = y * w + (x+1); |
---|
| 108 | weights[idx*4] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y); |
---|
| 109 | } |
---|
| 110 | else |
---|
| 111 | { |
---|
| 112 | edges[idx*4].a = -1; |
---|
| 113 | edges[idx*4].b = -1; |
---|
| 114 | weights[idx*4] = max; |
---|
| 115 | } |
---|
| 116 | |
---|
| 117 | |
---|
| 118 | if (y < h-1) |
---|
| 119 | { |
---|
| 120 | edges[idx*4+1].a = y * w + x; |
---|
| 121 | edges[idx*4+1].b = (y+1) * w + x; |
---|
| 122 | weights[idx*4+1] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x, y+1); |
---|
| 123 | } |
---|
| 124 | else |
---|
| 125 | { |
---|
| 126 | edges[idx*4+1].a = -1; |
---|
| 127 | edges[idx*4+1].b = -1; |
---|
| 128 | weights[idx*4+1] = max; |
---|
| 129 | } |
---|
| 130 | |
---|
| 131 | if ((x < w-1) && (y < h-1)) |
---|
| 132 | { |
---|
| 133 | edges[idx*4+2].a = y * w + x; |
---|
| 134 | edges[idx*4+2].b = (y+1) * w + (x+1); |
---|
| 135 | weights[idx*4+2] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y+1); |
---|
| 136 | } |
---|
| 137 | else |
---|
| 138 | { |
---|
| 139 | edges[idx*4+2].a = -1; |
---|
| 140 | edges[idx*4+2].b = -1; |
---|
| 141 | weights[idx*4+2] = max; |
---|
| 142 | |
---|
| 143 | } |
---|
| 144 | |
---|
| 145 | if ((x < w-1) && (y > 0)) |
---|
| 146 | { |
---|
| 147 | edges[idx*4+3].a = y * w + x; |
---|
| 148 | edges[idx*4+3].b = (y-1) * w + (x+1); |
---|
| 149 | weights[idx*4+3] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y-1); |
---|
| 150 | } |
---|
| 151 | else |
---|
| 152 | { |
---|
| 153 | edges[idx*4+3].a = -1; |
---|
| 154 | edges[idx*4+3].b = -1; |
---|
| 155 | weights[idx*4+3] = max; |
---|
| 156 | } |
---|
| 157 | |
---|
| 158 | edges_location[idx*4] = &(edges[idx*4]); |
---|
| 159 | edges_location[idx*4+1] = &(edges[idx*4+1]); |
---|
| 160 | edges_location[idx*4+2] = &(edges[idx*4+2]); |
---|
| 161 | edges_location[idx*4+3] = &(edges[idx*4+3]); |
---|
| 162 | |
---|
| 163 | |
---|
| 164 | /* if (idx == 0) |
---|
| 165 | input->h = input->h + 1; |
---|
| 166 | */ |
---|
| 167 | |
---|
| 168 | } |
---|
| 169 | |
---|
| 170 | // main routine that executes on the host |
---|
| 171 | int main(int argc, char **argv) |
---|
| 172 | { |
---|
| 173 | if (argc != 6) |
---|
| 174 | { |
---|
| 175 | fprintf(stderr, "usage: %s sigma k min input(ppm) output(ppm)\n", argv[0]); |
---|
| 176 | return 1; |
---|
| 177 | } |
---|
| 178 | |
---|
| 179 | |
---|
| 180 | dim3 n_blocks, block_size; |
---|
| 181 | //int n_blocks, block_size; |
---|
| 182 | float sigma = atof(argv[1]); |
---|
| 183 | float k = atof(argv[2]); |
---|
| 184 | int min_size = atoi(argv[3]); |
---|
| 185 | rgb *array; |
---|
| 186 | edge *edges; |
---|
| 187 | edge *ed_temp; |
---|
| 188 | |
---|
| 189 | float* weights; |
---|
| 190 | edge** edges_location; |
---|
| 191 | |
---|
| 192 | |
---|
| 193 | struct timeval start, end; |
---|
| 194 | long mtime, seconds, useconds; |
---|
| 195 | |
---|
| 196 | gettimeofday(&start, NULL); |
---|
| 197 | |
---|
| 198 | printf("loading input image.\n"); |
---|
| 199 | image *input = loadPPM(argv[4]); |
---|
| 200 | |
---|
| 201 | /* Alloc image on device*/ |
---|
| 202 | image *input_cuda; |
---|
| 203 | |
---|
| 204 | image in; |
---|
| 205 | |
---|
| 206 | cudaMalloc((void **) &array, input->h * input->w * sizeof(rgb)); |
---|
| 207 | cudaMemcpy((void *) array, (void *) input->data, input->h * input->w * sizeof(rgb), cudaMemcpyHostToDevice); |
---|
| 208 | |
---|
| 209 | |
---|
| 210 | in.h = input->h; |
---|
| 211 | in.w = input->w; |
---|
| 212 | in.data = (rgb*)array; |
---|
| 213 | |
---|
| 214 | cudaMalloc((void **) &input_cuda, sizeof(image)); |
---|
| 215 | cudaMemcpy((void *) input_cuda, (void *) &in, sizeof(image), cudaMemcpyHostToDevice); |
---|
| 216 | |
---|
| 217 | |
---|
| 218 | /* Alocare matrci r,g,b de image_float*/ |
---|
| 219 | imagef *r; |
---|
| 220 | imagef *g; |
---|
| 221 | imagef *b; |
---|
| 222 | // imagef *tmp; |
---|
| 223 | imagef tmp; |
---|
| 224 | |
---|
| 225 | imagef *tmp_img; |
---|
| 226 | imagef tmp_img_cpu; |
---|
| 227 | |
---|
| 228 | float *arrayftmp; |
---|
| 229 | |
---|
| 230 | |
---|
| 231 | float *arrayf1, *arrayf2, *arrayf3; |
---|
| 232 | std::vector<float> vmask = make_fgauss(sigma); |
---|
| 233 | float *mask = (float *)malloc(vmask.size()*sizeof(float)) + 3 * sizeof(imagef); |
---|
| 234 | for (int i=0; i<vmask.size(); i++) |
---|
| 235 | mask[i]=vmask[i]; |
---|
| 236 | float *maski; |
---|
| 237 | // tmp = (imagef *)(((char *)mask) + vmask.size()*sizeof(float)); |
---|
| 238 | |
---|
| 239 | |
---|
| 240 | |
---|
| 241 | unsigned int rgb_size = 4 * input->h * input->w * sizeof(float) + 4 * sizeof(imagef) + sizeof(float) * vmask.size(); |
---|
| 242 | void *bigMem; |
---|
| 243 | if (cudaMalloc(&bigMem, rgb_size) != cudaSuccess) |
---|
| 244 | printf("ERROR big_malloc 1\n"); |
---|
| 245 | char *initial = (char *)bigMem; |
---|
| 246 | maski = (float *)initial; |
---|
| 247 | initial += sizeof(float) * vmask.size(); |
---|
| 248 | r = (imagef *)initial; |
---|
| 249 | initial+=sizeof(imagef); |
---|
| 250 | g = (imagef *)initial; |
---|
| 251 | initial+=sizeof(imagef); |
---|
| 252 | b = (imagef *)initial; |
---|
| 253 | initial+=sizeof(imagef); |
---|
| 254 | tmp_img = (imagef *)initial; |
---|
| 255 | initial+=sizeof(imagef); |
---|
| 256 | arrayf1 = (float *)initial; |
---|
| 257 | initial += input->h * input->w * sizeof(float); |
---|
| 258 | arrayf2 = (float *)initial; |
---|
| 259 | initial += input->h * input->w * sizeof(float); |
---|
| 260 | arrayf3 = (float *)initial; |
---|
| 261 | initial += input->h * input->w * sizeof(float); |
---|
| 262 | arrayftmp = (float *)initial; |
---|
| 263 | /* |
---|
| 264 | for (int i=0; i<3; i++){ |
---|
| 265 | tmp[i].w=input->w; |
---|
| 266 | tmp[i].h=input->h; |
---|
| 267 | } |
---|
| 268 | tmp[0].data = arrayf1; |
---|
| 269 | tmp[1].data = arrayf2; |
---|
| 270 | tmp[2].data = arrayf3; |
---|
| 271 | CUDA_SAFE_CALL(cudaMemcpy((void *)maski, (void *)mask, 3*sizeof(imagef) + vmask.size()*sizeof(float), cudaMemcpyHostToDevice)); |
---|
| 272 | */ |
---|
| 273 | // exit(0); |
---|
| 274 | /* |
---|
| 275 | if (cudaMalloc((void **) &r, sizeof(imagef)) != cudaSuccess) |
---|
| 276 | printf("ERROR11\n"); |
---|
| 277 | if (cudaMalloc((void **) &g, sizeof(imagef)) != cudaSuccess) |
---|
| 278 | printf("ERROR12\n"); |
---|
| 279 | |
---|
| 280 | if (cudaMalloc((void **) &b, sizeof(imagef)) != cudaSuccess) |
---|
| 281 | printf("ERROR13\n"); |
---|
| 282 | |
---|
| 283 | cudaMalloc((void **) &arrayf1, input->h * input->w * sizeof(float)); |
---|
| 284 | */ |
---|
| 285 | |
---|
| 286 | tmp.w = input->w; |
---|
| 287 | tmp.h = input->h; |
---|
| 288 | tmp.data = arrayf1; |
---|
| 289 | |
---|
| 290 | CUDA_SAFE_CALL(cudaMemcpy((void *) r, (void *) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)); |
---|
| 291 | |
---|
| 292 | // cudaMalloc((void **) &arrayf2, input->h * input->w * sizeof(float)); |
---|
| 293 | tmp.data = arrayf2; |
---|
| 294 | if (cudaMemcpy((void *) g, (void *) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) |
---|
| 295 | printf("Error memcpy g\n"); |
---|
| 296 | |
---|
| 297 | // cudaMalloc((void **) &arrayf3, input->h * input->w * sizeof(float)); |
---|
| 298 | tmp.data = arrayf3; |
---|
| 299 | if (cudaMemcpy((void *) b, (void *) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) |
---|
| 300 | printf("Error memcpy b\n"); |
---|
| 301 | |
---|
| 302 | if (cudaMemcpy((void *) maski, (void *) mask, vmask.size() * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) |
---|
| 303 | printf("Error memcpy maski\n"); |
---|
| 304 | |
---|
| 305 | |
---|
| 306 | //smooth alloc |
---|
| 307 | |
---|
| 308 | imagef *smooth_r; |
---|
| 309 | imagef *smooth_g; |
---|
| 310 | imagef *smooth_b; |
---|
| 311 | imagef smooth_tmp; |
---|
| 312 | |
---|
| 313 | |
---|
| 314 | float *arrayfs1, *arrayfs2, *arrayfs3; |
---|
| 315 | |
---|
| 316 | |
---|
| 317 | unsigned int smooth_size = 3 * input->h * input->w * sizeof(float) + 3 * sizeof(imagef); |
---|
| 318 | void *bigMemSmooth; |
---|
| 319 | CUDA_SAFE_CALL(cudaMalloc(&bigMemSmooth, smooth_size)); |
---|
| 320 | |
---|
| 321 | initial = (char *)bigMemSmooth; |
---|
| 322 | smooth_r = (imagef *)initial; |
---|
| 323 | initial+=sizeof(imagef); |
---|
| 324 | smooth_g = (imagef *)initial; |
---|
| 325 | initial+=sizeof(imagef); |
---|
| 326 | smooth_b = (imagef *)initial; |
---|
| 327 | initial+=sizeof(imagef); |
---|
| 328 | arrayfs1 = (float *)initial; |
---|
| 329 | initial += input->h * input->w * sizeof(float); |
---|
| 330 | arrayfs2 = (float *)initial; |
---|
| 331 | initial += input->h * input->w * sizeof(float); |
---|
| 332 | arrayfs3 = (float *)initial; |
---|
| 333 | |
---|
| 334 | /* |
---|
| 335 | if (cudaMalloc((void **) &smooth_r, sizeof(imagef)) != cudaSuccess) |
---|
| 336 | printf("ERROR14\n"); |
---|
| 337 | if (cudaMalloc((void **) &smooth_g, sizeof(imagef)) != cudaSuccess) |
---|
| 338 | printf("ERROR15\n"); |
---|
| 339 | |
---|
| 340 | if (cudaMalloc((void **) &smooth_b, sizeof(imagef)) != cudaSuccess) |
---|
| 341 | printf("ERROR16\n"); |
---|
| 342 | |
---|
| 343 | if (cudaMalloc((void **) &arrayfs1, input->h * input->w * sizeof(float))!=cudaSuccess) |
---|
| 344 | printf("Error malloc arrayfs1\n"); |
---|
| 345 | */ |
---|
| 346 | smooth_tmp.w = input->w; |
---|
| 347 | smooth_tmp.h = input->h; |
---|
| 348 | smooth_tmp.data = arrayfs1; |
---|
| 349 | if (cudaMemcpy((void *) smooth_r, (void *) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) |
---|
| 350 | printf("Error memcpy smoothr\n"); |
---|
| 351 | |
---|
| 352 | // if (cudaMalloc((void **) &arrayfs2, input->h * input->w * sizeof(float))!=cudaSuccess) |
---|
| 353 | // printf("Error malloc arrayfs2\n"); |
---|
| 354 | smooth_tmp.data = arrayfs2; |
---|
| 355 | if (cudaMemcpy((void *) smooth_g, (void *) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) |
---|
| 356 | printf("Error memcpy smoothg\n"); |
---|
| 357 | |
---|
| 358 | // if (cudaMalloc((void **) &arrayfs3, input->h * input->w * sizeof(float))!=cudaSuccess) |
---|
| 359 | // printf("Error malloc arrayfs3\n"); |
---|
| 360 | smooth_tmp.data = arrayfs3; |
---|
| 361 | if (cudaMemcpy((void *) smooth_b, (void *) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) |
---|
| 362 | printf("Error memcpy smoothb\n"); |
---|
| 363 | |
---|
| 364 | //alloc tmp matrix for smooth |
---|
| 365 | // if (cudaMalloc((void **) &tmp_img, sizeof(imagef)) != cudaSuccess) |
---|
| 366 | // printf("ERROR17\n"); |
---|
| 367 | // cudaMalloc((void **) &arrayftmp, input->w * input->h * sizeof(float)); |
---|
| 368 | tmp_img_cpu.w = input->h; |
---|
| 369 | tmp_img_cpu.h = input->w; |
---|
| 370 | tmp_img_cpu.data = arrayftmp; |
---|
| 371 | if (cudaMemcpy((void *) tmp_img, (void *) &tmp_img_cpu, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) |
---|
| 372 | printf("Error memcpy tmp_img\n"); |
---|
| 373 | |
---|
| 374 | // if (cudaMalloc((void **) &maski, sizeof(float)*vmask.size()) != cudaSuccess) |
---|
| 375 | // printf("ERROR18\n"); |
---|
| 376 | |
---|
| 377 | |
---|
| 378 | create_rgb_smooth <<< n_blocks, block_size >>> (r, g, b, input_cuda); |
---|
| 379 | |
---|
| 380 | convolve_even_gpu <<< n_blocks, block_size >>> (r, tmp_img, maski, vmask.size()); |
---|
| 381 | convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_r, maski, vmask.size()); |
---|
| 382 | convolve_even_gpu <<< n_blocks, block_size >>> (g, tmp_img, maski, vmask.size()); |
---|
| 383 | convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_g, maski, vmask.size()); |
---|
| 384 | convolve_even_gpu <<< n_blocks, block_size >>> (b, tmp_img, maski, vmask.size()); |
---|
| 385 | convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_b, maski, vmask.size()); |
---|
| 386 | |
---|
| 387 | |
---|
| 388 | //free(mask); |
---|
| 389 | /* |
---|
| 390 | cudaFree(maski); |
---|
| 391 | cudaFree(tmp_img); |
---|
| 392 | cudaFree(r); |
---|
| 393 | cudaFree(g); |
---|
| 394 | cudaFree(b); |
---|
| 395 | cudaFree(arrayf1); |
---|
| 396 | cudaFree(arrayf2); |
---|
| 397 | cudaFree(arrayf3); |
---|
| 398 | cudaFree(arrayftmp); |
---|
| 399 | */ |
---|
| 400 | CUDA_SAFE_CALL(cudaFree(input_cuda)); |
---|
| 401 | CUDA_SAFE_CALL(cudaFree(array)); |
---|
| 402 | |
---|
| 403 | CUDA_SAFE_CALL(cudaFree(bigMem)); |
---|
| 404 | |
---|
| 405 | |
---|
| 406 | /* Alloc edges' vector on device */ |
---|
| 407 | //CUDA_SAFE_CALL(cudaMalloc((void **) &edges, (4 * input->h * input->w )* sizeof(edge))); |
---|
| 408 | //CUDA_SAFE_CALL(cudaMalloc((void **) &weights, (4 * input->h * input->w )* sizeof(float))); |
---|
| 409 | unsigned int size_edges = 4 * input->h * input->w * sizeof(edge) + 4 * input->h * input->w * sizeof(edge*) + 4 * input->h * input->w * sizeof(float); |
---|
| 410 | CUDA_SAFE_CALL(cudaMalloc((void **) &bigMem, size_edges)); |
---|
| 411 | |
---|
| 412 | //exit(0); |
---|
| 413 | |
---|
| 414 | /* |
---|
| 415 | if (cudaMalloc((void **) &edges, 4 * input->h * input->w * sizeof(edge)) != cudaSuccess) |
---|
| 416 | printf("ERROR19\n"); |
---|
| 417 | if (cudaMalloc((void **) &edges_location, (4 * input->h * input->w )* sizeof(edge*)) != cudaSuccess) |
---|
| 418 | printf("ERROR1A\n"); |
---|
| 419 | |
---|
| 420 | if (cudaMalloc((void **) &weights, (4 * input->h * input->w )* sizeof(float)) != cudaSuccess) |
---|
| 421 | printf("ERROR1B\n"); |
---|
| 422 | */ |
---|
| 423 | initial = (char *)bigMem; |
---|
| 424 | edges = (edge*)initial; |
---|
| 425 | initial += 4 * input->h * input->w * sizeof(edge); |
---|
| 426 | edges_location = (edge **)initial; |
---|
| 427 | initial += 4 * input->h * input->w * sizeof(edge*); |
---|
| 428 | weights = (float *)initial; |
---|
| 429 | |
---|
| 430 | /* Define the grid */ |
---|
| 431 | n_blocks.x = input->w / BLOCK_DIM_X + (input->w % BLOCK_DIM_X == 0?0:1); |
---|
| 432 | n_blocks.y = input->h / BLOCK_DIM_Y + (input->h % BLOCK_DIM_Y == 0?0:1); |
---|
| 433 | n_blocks.z = 1; |
---|
| 434 | |
---|
| 435 | printf("n_blocks.x = %d\n", n_blocks.x); |
---|
| 436 | printf("n_blocks.y = %d\n", n_blocks.y); |
---|
| 437 | |
---|
| 438 | |
---|
| 439 | block_size.x = BLOCK_DIM_X; |
---|
| 440 | block_size.y = BLOCK_DIM_Y; |
---|
| 441 | block_size.z = 1; |
---|
| 442 | |
---|
| 443 | /* Compute edges */ |
---|
| 444 | compute_edges <<< n_blocks, block_size >>> (input_cuda, edges, edges_location, weights,smooth_r,smooth_g,smooth_b); |
---|
| 445 | /* |
---|
| 446 | cudaFree(smooth_r); |
---|
| 447 | cudaFree(smooth_g); |
---|
| 448 | cudaFree(smooth_b); |
---|
| 449 | cudaFree(arrayfs1); |
---|
| 450 | cudaFree(arrayfs2); |
---|
| 451 | cudaFree(arrayfs3); |
---|
| 452 | */ |
---|
| 453 | CUDA_SAFE_CALL(cudaFree(bigMemSmooth)); |
---|
| 454 | |
---|
| 455 | ed_temp = (edge *)malloc((4 * input->h * input->w )* sizeof(edge)); |
---|
| 456 | if (cudaMemcpy((void *) ed_temp, (void *) edges, 4 * input->h * input->w * sizeof(edge), cudaMemcpyDeviceToHost) != cudaSuccess) |
---|
| 457 | printf("ERROR2\n"); |
---|
| 458 | |
---|
| 459 | /* if (cudaMemcpy((void *) indx_temp, (void *) indx, input->h * input->w * sizeof(int), cudaMemcpyDeviceToHost)) |
---|
| 460 | printf("ERROR3\n"); |
---|
| 461 | */ |
---|
| 462 | |
---|
| 463 | printf("a0 = %d\n", ed_temp[0].a); |
---|
| 464 | printf("b0 = %d\n", ed_temp[0].b); |
---|
| 465 | |
---|
| 466 | //sort edges |
---|
| 467 | /* |
---|
| 468 | CUDPPConfiguration config; |
---|
| 469 | config.op = CUDPP_MIN; |
---|
| 470 | config.datatype = CUDPP_FLOAT; |
---|
| 471 | config.algorithm = CUDPP_SORT_RADIX; |
---|
| 472 | config.options = CUDPP_OPTION_KEY_VALUE_PAIRS; |
---|
| 473 | |
---|
| 474 | CUDPPHandle scanplan = 0; |
---|
| 475 | CUDPPResult result = cudppPlan(&scanplan, config, 4 * input->h * input->w, 1, 0); |
---|
| 476 | |
---|
| 477 | result = cudppSort ( scanplan, |
---|
| 478 | weights, |
---|
| 479 | edges_location, |
---|
| 480 | 32, |
---|
| 481 | 4 * input->h * input->w |
---|
| 482 | ); |
---|
| 483 | |
---|
| 484 | result = cudppDestroyPlan(scanplan); |
---|
| 485 | if (CUDPP_SUCCESS != result) |
---|
| 486 | { |
---|
| 487 | printf("Error destroying CUDPPPlan\n"); |
---|
| 488 | exit(-1); |
---|
| 489 | } |
---|
| 490 | |
---|
| 491 | |
---|
| 492 | if (CUDPP_SUCCESS != result) |
---|
| 493 | { |
---|
| 494 | printf("Error creating CUDPPPlan\n"); |
---|
| 495 | exit(-1); |
---|
| 496 | } |
---|
| 497 | */ |
---|
| 498 | gettimeofday(&end, NULL); |
---|
| 499 | |
---|
| 500 | seconds = end.tv_sec - start.tv_sec; |
---|
| 501 | useconds = end.tv_usec - start.tv_usec; |
---|
| 502 | |
---|
| 503 | mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; |
---|
| 504 | |
---|
| 505 | printf("Elapsed time: %ld milliseconds\n", mtime); |
---|
| 506 | |
---|
| 507 | // Cleanup |
---|
| 508 | //exit(0); |
---|
| 509 | //cudaFree(input_cuda); |
---|
| 510 | CUDA_SAFE_CALL(cudaFree(bigMem)); |
---|
| 511 | //cudaFree(array); |
---|
| 512 | //cudaFree(edges); |
---|
| 513 | //cudaFree(edges_location); |
---|
| 514 | //cudaFree(weights); |
---|
| 515 | |
---|
| 516 | |
---|
| 517 | return 0; |
---|
| 518 | |
---|
| 519 | } |
---|