// example1.cpp : Defines the entry point for the console application. // //#include "stdafx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "segment-image.h" #define BLOCK_DIM_X 12 #define BLOCK_DIM_Y 8 #define max FLT_MAX __global__ void convolve_even_gpu(imagef *src, imagef *dst, float *mask, int dim_mask){ int w = src->w; int h = src->h; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; int idx = x + y*w; int i; float sum; int mmax,mmin; if (x>=w || y>=h) return; sum = mask[0] * src->data[idx]; for (i = 1; i < dim_mask; i++) { mmax = (x-i > 0 ? x-i : 0); //mmax = max(x-i,0);// > 0 ? x-i : 0) mmin = (x+i > w-1 ? x+i : w-1); sum += mask[i] * (src->data[mmax + y * w]) + src->data[mmin + y * w]; } dst->data[y + h * x] = sum; } __global__ void create_rgb_smooth(imagef *r, imagef *g, imagef *b, image *input) { int w = input->w; int h = input->h; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; int idx = x + y*w; if (x>=w || y>=h) return; r->data[idx] = input->data[idx].r; g->data[idx] = input->data[idx].g; b->data[idx] = input->data[idx].b; } #define diffgpu(r, g, b, x1, y1, x2, y2) \ sqrtf(powf(r->data[x1+w*y1]-r->data[x2+w*y2],2) + \ powf(g->data[x1+w*y1]-g->data[x2+w*y2],2) + \ powf(b->data[x1+w*y1]-b->data[x2+w*y2],2)); \ // Kernel that executes on the CUDA device __global__ void compute_edges(image *input, edge *edges, edge **edges_location,float *weights,imagef *smooth_r, imagef *smooth_g,imagef *smooth_b ) { //int idx = blockIdx.x * blockDim.x + threadIdx.x; int w = input->w; int h = input->h; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; int idx = x + y*w; // indx[idx] = idx; /* if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0) input->w = input->w + 1; edges[0].a = 23; edges[0].b = 24; */ if (x>=w || y>=h) return; if (x < w-1) { edges[idx*4].a = y * w + x; edges[idx*4].b = y * w + (x+1); weights[idx*4] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y); } else { edges[idx*4].a = -1; edges[idx*4].b = -1; weights[idx*4] = max; } if (y < h-1) { edges[idx*4+1].a = y * w + x; edges[idx*4+1].b = (y+1) * w + x; weights[idx*4+1] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x, y+1); } else { edges[idx*4+1].a = -1; edges[idx*4+1].b = -1; weights[idx*4+1] = max; } if ((x < w-1) && (y < h-1)) { edges[idx*4+2].a = y * w + x; edges[idx*4+2].b = (y+1) * w + (x+1); weights[idx*4+2] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y+1); } else { edges[idx*4+2].a = -1; edges[idx*4+2].b = -1; weights[idx*4+2] = max; } if ((x < w-1) && (y > 0)) { edges[idx*4+3].a = y * w + x; edges[idx*4+3].b = (y-1) * w + (x+1); weights[idx*4+3] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y-1); } else { edges[idx*4+3].a = -1; edges[idx*4+3].b = -1; weights[idx*4+3] = max; } edges_location[idx*4] = &(edges[idx*4]); edges_location[idx*4+1] = &(edges[idx*4+1]); edges_location[idx*4+2] = &(edges[idx*4+2]); edges_location[idx*4+3] = &(edges[idx*4+3]); /* if (idx == 0) input->h = input->h + 1; */ } // main routine that executes on the host int main(int argc, char **argv) { if (argc != 6) { fprintf(stderr, "usage: %s sigma k min input(ppm) output(ppm)\n", argv[0]); return 1; } dim3 n_blocks, block_size; //int n_blocks, block_size; float sigma = atof(argv[1]); float k = atof(argv[2]); int min_size = atoi(argv[3]); rgb *array; edge *edges; edge *ed_temp; float* weights; edge** edges_location; struct timeval start, end; long mtime, seconds, useconds; gettimeofday(&start, NULL); printf("loading input image.\n"); image *input = loadPPM(argv[4]); /* Alloc image on device*/ image *input_cuda; image in; cudaMalloc((void **) &array, input->h * input->w * sizeof(rgb)); cudaMemcpy((void *) array, (void *) input->data, input->h * input->w * sizeof(rgb), cudaMemcpyHostToDevice); in.h = input->h; in.w = input->w; in.data = (rgb*)array; cudaMalloc((void **) &input_cuda, sizeof(image)); cudaMemcpy((void *) input_cuda, (void *) &in, sizeof(image), cudaMemcpyHostToDevice); /* Alocare matrci r,g,b de image_float*/ imagef *r; imagef *g; imagef *b; // imagef *tmp; imagef tmp; imagef *tmp_img; imagef tmp_img_cpu; float *arrayftmp; float *arrayf1, *arrayf2, *arrayf3; std::vector vmask = make_fgauss(sigma); float *mask = (float *)malloc(vmask.size()*sizeof(float)) + 3 * sizeof(imagef); for (int i=0; ih * input->w * sizeof(float) + 4 * sizeof(imagef) + sizeof(float) * vmask.size(); void *bigMem; if (cudaMalloc(&bigMem, rgb_size) != cudaSuccess) printf("ERROR big_malloc 1\n"); char *initial = (char *)bigMem; maski = (float *)initial; initial += sizeof(float) * vmask.size(); r = (imagef *)initial; initial+=sizeof(imagef); g = (imagef *)initial; initial+=sizeof(imagef); b = (imagef *)initial; initial+=sizeof(imagef); tmp_img = (imagef *)initial; initial+=sizeof(imagef); arrayf1 = (float *)initial; initial += input->h * input->w * sizeof(float); arrayf2 = (float *)initial; initial += input->h * input->w * sizeof(float); arrayf3 = (float *)initial; initial += input->h * input->w * sizeof(float); arrayftmp = (float *)initial; /* for (int i=0; i<3; i++){ tmp[i].w=input->w; tmp[i].h=input->h; } tmp[0].data = arrayf1; tmp[1].data = arrayf2; tmp[2].data = arrayf3; CUDA_SAFE_CALL(cudaMemcpy((void *)maski, (void *)mask, 3*sizeof(imagef) + vmask.size()*sizeof(float), cudaMemcpyHostToDevice)); */ // exit(0); /* if (cudaMalloc((void **) &r, sizeof(imagef)) != cudaSuccess) printf("ERROR11\n"); if (cudaMalloc((void **) &g, sizeof(imagef)) != cudaSuccess) printf("ERROR12\n"); if (cudaMalloc((void **) &b, sizeof(imagef)) != cudaSuccess) printf("ERROR13\n"); cudaMalloc((void **) &arrayf1, input->h * input->w * sizeof(float)); */ tmp.w = input->w; tmp.h = input->h; tmp.data = arrayf1; CUDA_SAFE_CALL(cudaMemcpy((void *) r, (void *) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)); // cudaMalloc((void **) &arrayf2, input->h * input->w * sizeof(float)); tmp.data = arrayf2; if (cudaMemcpy((void *) g, (void *) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) printf("Error memcpy g\n"); // cudaMalloc((void **) &arrayf3, input->h * input->w * sizeof(float)); tmp.data = arrayf3; if (cudaMemcpy((void *) b, (void *) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) printf("Error memcpy b\n"); if (cudaMemcpy((void *) maski, (void *) mask, vmask.size() * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) printf("Error memcpy maski\n"); //smooth alloc imagef *smooth_r; imagef *smooth_g; imagef *smooth_b; imagef smooth_tmp; float *arrayfs1, *arrayfs2, *arrayfs3; unsigned int smooth_size = 3 * input->h * input->w * sizeof(float) + 3 * sizeof(imagef); void *bigMemSmooth; CUDA_SAFE_CALL(cudaMalloc(&bigMemSmooth, smooth_size)); initial = (char *)bigMemSmooth; smooth_r = (imagef *)initial; initial+=sizeof(imagef); smooth_g = (imagef *)initial; initial+=sizeof(imagef); smooth_b = (imagef *)initial; initial+=sizeof(imagef); arrayfs1 = (float *)initial; initial += input->h * input->w * sizeof(float); arrayfs2 = (float *)initial; initial += input->h * input->w * sizeof(float); arrayfs3 = (float *)initial; /* if (cudaMalloc((void **) &smooth_r, sizeof(imagef)) != cudaSuccess) printf("ERROR14\n"); if (cudaMalloc((void **) &smooth_g, sizeof(imagef)) != cudaSuccess) printf("ERROR15\n"); if (cudaMalloc((void **) &smooth_b, sizeof(imagef)) != cudaSuccess) printf("ERROR16\n"); if (cudaMalloc((void **) &arrayfs1, input->h * input->w * sizeof(float))!=cudaSuccess) printf("Error malloc arrayfs1\n"); */ smooth_tmp.w = input->w; smooth_tmp.h = input->h; smooth_tmp.data = arrayfs1; if (cudaMemcpy((void *) smooth_r, (void *) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) printf("Error memcpy smoothr\n"); // if (cudaMalloc((void **) &arrayfs2, input->h * input->w * sizeof(float))!=cudaSuccess) // printf("Error malloc arrayfs2\n"); smooth_tmp.data = arrayfs2; if (cudaMemcpy((void *) smooth_g, (void *) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) printf("Error memcpy smoothg\n"); // if (cudaMalloc((void **) &arrayfs3, input->h * input->w * sizeof(float))!=cudaSuccess) // printf("Error malloc arrayfs3\n"); smooth_tmp.data = arrayfs3; if (cudaMemcpy((void *) smooth_b, (void *) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) printf("Error memcpy smoothb\n"); //alloc tmp matrix for smooth // if (cudaMalloc((void **) &tmp_img, sizeof(imagef)) != cudaSuccess) // printf("ERROR17\n"); // cudaMalloc((void **) &arrayftmp, input->w * input->h * sizeof(float)); tmp_img_cpu.w = input->h; tmp_img_cpu.h = input->w; tmp_img_cpu.data = arrayftmp; if (cudaMemcpy((void *) tmp_img, (void *) &tmp_img_cpu, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess) printf("Error memcpy tmp_img\n"); // if (cudaMalloc((void **) &maski, sizeof(float)*vmask.size()) != cudaSuccess) // printf("ERROR18\n"); create_rgb_smooth <<< n_blocks, block_size >>> (r, g, b, input_cuda); convolve_even_gpu <<< n_blocks, block_size >>> (r, tmp_img, maski, vmask.size()); convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_r, maski, vmask.size()); convolve_even_gpu <<< n_blocks, block_size >>> (g, tmp_img, maski, vmask.size()); convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_g, maski, vmask.size()); convolve_even_gpu <<< n_blocks, block_size >>> (b, tmp_img, maski, vmask.size()); convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_b, maski, vmask.size()); //free(mask); /* cudaFree(maski); cudaFree(tmp_img); cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(arrayf1); cudaFree(arrayf2); cudaFree(arrayf3); cudaFree(arrayftmp); */ CUDA_SAFE_CALL(cudaFree(input_cuda)); CUDA_SAFE_CALL(cudaFree(array)); CUDA_SAFE_CALL(cudaFree(bigMem)); /* Alloc edges' vector on device */ //CUDA_SAFE_CALL(cudaMalloc((void **) &edges, (4 * input->h * input->w )* sizeof(edge))); //CUDA_SAFE_CALL(cudaMalloc((void **) &weights, (4 * input->h * input->w )* sizeof(float))); unsigned int size_edges = 4 * input->h * input->w * sizeof(edge) + 4 * input->h * input->w * sizeof(edge*) + 4 * input->h * input->w * sizeof(float); CUDA_SAFE_CALL(cudaMalloc((void **) &bigMem, size_edges)); //exit(0); /* if (cudaMalloc((void **) &edges, 4 * input->h * input->w * sizeof(edge)) != cudaSuccess) printf("ERROR19\n"); if (cudaMalloc((void **) &edges_location, (4 * input->h * input->w )* sizeof(edge*)) != cudaSuccess) printf("ERROR1A\n"); if (cudaMalloc((void **) &weights, (4 * input->h * input->w )* sizeof(float)) != cudaSuccess) printf("ERROR1B\n"); */ initial = (char *)bigMem; edges = (edge*)initial; initial += 4 * input->h * input->w * sizeof(edge); edges_location = (edge **)initial; initial += 4 * input->h * input->w * sizeof(edge*); weights = (float *)initial; /* Define the grid */ n_blocks.x = input->w / BLOCK_DIM_X + (input->w % BLOCK_DIM_X == 0?0:1); n_blocks.y = input->h / BLOCK_DIM_Y + (input->h % BLOCK_DIM_Y == 0?0:1); n_blocks.z = 1; printf("n_blocks.x = %d\n", n_blocks.x); printf("n_blocks.y = %d\n", n_blocks.y); block_size.x = BLOCK_DIM_X; block_size.y = BLOCK_DIM_Y; block_size.z = 1; /* Compute edges */ compute_edges <<< n_blocks, block_size >>> (input_cuda, edges, edges_location, weights,smooth_r,smooth_g,smooth_b); /* cudaFree(smooth_r); cudaFree(smooth_g); cudaFree(smooth_b); cudaFree(arrayfs1); cudaFree(arrayfs2); cudaFree(arrayfs3); */ CUDA_SAFE_CALL(cudaFree(bigMemSmooth)); ed_temp = (edge *)malloc((4 * input->h * input->w )* sizeof(edge)); if (cudaMemcpy((void *) ed_temp, (void *) edges, 4 * input->h * input->w * sizeof(edge), cudaMemcpyDeviceToHost) != cudaSuccess) printf("ERROR2\n"); /* if (cudaMemcpy((void *) indx_temp, (void *) indx, input->h * input->w * sizeof(int), cudaMemcpyDeviceToHost)) printf("ERROR3\n"); */ printf("a0 = %d\n", ed_temp[0].a); printf("b0 = %d\n", ed_temp[0].b); //sort edges /* CUDPPConfiguration config; config.op = CUDPP_MIN; config.datatype = CUDPP_FLOAT; config.algorithm = CUDPP_SORT_RADIX; config.options = CUDPP_OPTION_KEY_VALUE_PAIRS; CUDPPHandle scanplan = 0; CUDPPResult result = cudppPlan(&scanplan, config, 4 * input->h * input->w, 1, 0); result = cudppSort ( scanplan, weights, edges_location, 32, 4 * input->h * input->w ); result = cudppDestroyPlan(scanplan); if (CUDPP_SUCCESS != result) { printf("Error destroying CUDPPPlan\n"); exit(-1); } if (CUDPP_SUCCESS != result) { printf("Error creating CUDPPPlan\n"); exit(-1); } */ gettimeofday(&end, NULL); seconds = end.tv_sec - start.tv_sec; useconds = end.tv_usec - start.tv_usec; mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; printf("Elapsed time: %ld milliseconds\n", mtime); // Cleanup //exit(0); //cudaFree(input_cuda); CUDA_SAFE_CALL(cudaFree(bigMem)); //cudaFree(array); //cudaFree(edges); //cudaFree(edges_location); //cudaFree(weights); return 0; }