Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

segment.cu @ 77

Last change on this file since 77 was 77, checked in by (none), 14 years ago
Added parallelized code
File size: 13.8 KB

Line
1	// segment.cu : CUDA front end of the image segmenter (smooths the RGB planes and builds the pixel edge list). Defines the entry point for the console application.
2	//
3
4	//#include "stdafx.h"
5
6	#include <vector>
7	#include <stdio.h>
8	#include <stdlib.h>
9	#include <float.h>
10	#include <cuda.h>
11	#include <cudpp.h>
12	#include <cutil.h>
13	#include <math_functions.h>
14	#include <image.h>
15	#include <misc.h>
16	#include <pnmfile.h>
17	#include <cutil_math.h>
18	#include <sys/time.h>
19	#include <unistd.h>
20
21
22	#include "segment-image.h"
23
24
25	#define BLOCK_DIM_X 12
26	#define BLOCK_DIM_Y 8
27
28	#define max FLT_MAX
29
30
31
/*
 * One pass of a separable, symmetric ("even") 1-D convolution along x,
 * written out transposed.
 *
 * Each thread handles one pixel (x, y) of src. mask[0] is the centre tap and
 * mask[i] weights the SUM of the two samples at distance i to the left and to
 * the right; neighbours that would fall outside the row are clamped to the
 * nearest valid column (0 or w-1).
 *
 * The result is stored at dst->data[y + h * x], i.e. dst is addressed as an
 * image that is h wide. Running the kernel twice (src -> tmp -> dst)
 * therefore filters both axes and restores the original orientation.
 *
 * Launch: 2-D grid/block covering at least src->w threads in x and src->h
 * threads in y; surplus threads return immediately.
 *
 *   src      - input image header; w, h and data are read
 *   dst      - output image header; only data is used, it must hold
 *              src->w * src->h floats and must not alias src->data
 *   mask     - dim_mask filter taps, mask[0] is the centre tap
 *   dim_mask - number of taps in mask (>= 1)
 */
__global__ void convolve_even_gpu(imagef *src, imagef *dst, float *mask, int dim_mask)
{
    const int w = src->w;
    const int h = src->h;
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= w || y >= h)
        return;

    const float *row = src->data + y * w;
    float sum = mask[0] * row[x];

    for (int i = 1; i < dim_mask; i++) {
        /* Plain ternaries on purpose: this file #defines `max` as FLT_MAX,
         * so the max()/min() helpers cannot be used here. */
        const int left  = (x - i > 0     ? x - i : 0);
        /* Clamp to the last column. The previous `>` comparison selected the
         * larger of the two values and so read past the end of the row. */
        const int right = (x + i < w - 1 ? x + i : w - 1);
        /* The tap weight applies to both symmetric samples. */
        sum += mask[i] * (row[left] + row[right]);
    }

    /* Transposed store, see the header comment. */
    dst->data[y + h * x] = sum;
}
56
57
/*
 * Splits the interleaved RGB input image into three float planes.
 *
 * Each thread converts one pixel: the r, g and b members of
 * input->data[x + y*w] are stored (converted to float) at the same index of
 * r->data, g->data and b->data respectively.
 *
 * Launch: 2-D grid/block covering at least input->w threads in x and
 * input->h threads in y; surplus threads return immediately.
 *
 *   r, g, b - output plane headers; only their data pointers are used, each
 *             must hold input->w * input->h floats
 *   input   - source image header; w, h and data are read
 */
__global__ void create_rgb_smooth(imagef *r, imagef *g, imagef *b, image *input)
{
    const int w = input->w;
    const int h = input->h;
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= w || y >= h)
        return;

    const int idx = x + y * w;

    r->data[idx] = input->data[idx].r;
    g->data[idx] = input->data[idx].g;
    b->data[idx] = input->data[idx].b;
}
74
/*
 * Euclidean distance in RGB space between pixel (x1, y1) and pixel (x2, y2)
 * of the three float planes r, g, b (pointers to image headers with a `data`
 * array).
 *
 * - Expands to a plain expression: no trailing semicolon and no dangling line
 *   continuation, so it can be used anywhere a float expression is allowed.
 * - Relies on a variable `w` (image width, used as the row stride) being in
 *   scope at the point of use.
 * - Every argument is parenthesised, so `y+1` indexes row (y+1) rather than
 *   expanding to `w*y+1`.
 * - Arguments are evaluated more than once; do not pass expressions with
 *   side effects.
 */
#define diffgpu(r, g, b, x1, y1, x2, y2) \
    sqrtf(powf((r)->data[(x1) + w * (y1)] - (r)->data[(x2) + w * (y2)], 2.0f) + \
          powf((g)->data[(x1) + w * (y1)] - (g)->data[(x2) + w * (y2)], 2.0f) + \
          powf((b)->data[(x1) + w * (y1)] - (b)->data[(x2) + w * (y2)], 2.0f))
79
80
/*
 * Builds the pixel-adjacency edge list of the smoothed image.
 *
 * Each thread handles one pixel (x, y) and fills the four consecutive slots
 * 4*idx .. 4*idx+3 (idx = x + y*w), one per neighbour:
 *
 *   slot 0: (x+1, y)      slot 1: (x,   y+1)
 *   slot 2: (x+1, y+1)    slot 3: (x+1, y-1)
 *
 * For a neighbour inside the image, edges[slot].a / .b receive the linear
 * indices of the pixel and of the neighbour, and weights[slot] receives their
 * colour distance (diffgpu). For a neighbour outside the image the slot is
 * marked unused: a = b = -1 and weight = FLT_MAX.
 * edges_location[slot] is always set to the address of edges[slot].
 *
 * Launch: 2-D grid/block covering at least input->w threads in x and
 * input->h threads in y; surplus threads return immediately.
 *
 *   input          - image header; only w and h are read
 *   edges          - output, 4 * w * h entries
 *   edges_location - output, 4 * w * h pointers into edges
 *   weights        - output, 4 * w * h floats
 *   smooth_r/g/b   - smoothed colour planes read through diffgpu
 */
__global__ void compute_edges(image *input, edge *edges, edge **edges_location, float *weights,
                              imagef *smooth_r, imagef *smooth_g, imagef *smooth_b)
{
    const int w = input->w;     /* also used implicitly by diffgpu */
    const int h = input->h;
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= w || y >= h)
        return;

    /* Neighbour offsets, in slot order. */
    const int dx[4] = { 1, 0, 1,  1 };
    const int dy[4] = { 0, 1, 1, -1 };

    const int idx = x + y * w;

    for (int n = 0; n < 4; n++) {
        const int slot = idx * 4 + n;
        /* Plain identifiers are passed to diffgpu so that the expansion is
         * correct even if the macro does not parenthesise its arguments. */
        const int nx = x + dx[n];
        const int ny = y + dy[n];

        /* nx >= 0 always holds because every dx is non-negative. */
        if (nx < w && ny >= 0 && ny < h) {
            edges[slot].a = idx;
            edges[slot].b = ny * w + nx;
            weights[slot] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, nx, ny);
        } else {
            edges[slot].a = -1;
            edges[slot].b = -1;
            weights[slot] = FLT_MAX;
        }

        edges_location[slot] = &edges[slot];
    }
}
169
/* Aborts the program with a descriptive message if a CUDA runtime call failed. */
static void check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

/* Writes an imagef header {w, h, d_data} into device memory at d_header. */
static void upload_imagef_header(imagef *d_header, int w, int h, float *d_data, const char *what)
{
    imagef host_header;
    host_header.w = w;
    host_header.h = h;
    host_header.data = d_data;
    check_cuda(cudaMemcpy((void *)d_header, (void *)&host_header, sizeof(imagef),
                          cudaMemcpyHostToDevice), what);
}

/* Number of blocks of `block` threads needed to cover `n` items (ceil division). */
static unsigned int blocks_for(int n, int block)
{
    return (unsigned int)((n + block - 1) / block);
}

/*
 * Main routine that executes on the host.
 *
 * usage: <prog> sigma k min input(ppm) output(ppm)
 *
 * Steps:
 *   1. load the PPM image and copy it to the device;
 *   2. split it into float r/g/b planes and smooth each plane with a
 *      separable filter built by make_fgauss(sigma) (two transposing passes
 *      of convolve_even_gpu per plane);
 *   3. build the 4-neighbour edge list with compute_edges;
 *   4. copy the edges back, print the first one and the elapsed time.
 *
 * k, min and the output path are parsed/accepted but not used: the sorting
 * and segmentation stages are not implemented in this file (see the TODO
 * below).
 *
 * Returns 0 on success, 1 on a usage or input error; any CUDA failure
 * terminates the program through check_cuda().
 */
int main(int argc, char **argv)
{
    if (argc != 6)
    {
        fprintf(stderr, "usage: %s sigma k min input(ppm) output(ppm)\n", argv[0]);
        return 1;
    }

    const float sigma = atof(argv[1]);
    const float k = atof(argv[2]);
    const int min_size = atoi(argv[3]);
    (void)k;        /* reserved for the segmentation stage */
    (void)min_size; /* reserved for the segmentation stage */

    struct timeval start, end;
    long mtime, seconds, useconds;

    gettimeofday(&start, NULL);

    printf("loading input image.\n");
    image *input = loadPPM(argv[4]);
    if (input == NULL || input->w <= 0 || input->h <= 0)
    {
        fprintf(stderr, "could not load a non-empty image from %s\n", argv[4]);
        return 1;
    }

    const int width = input->w;
    const int height = input->h;
    const size_t n_pixels = (size_t)width * (size_t)height;
    const size_t n_edges = 4 * n_pixels;
    const size_t plane_bytes = n_pixels * sizeof(float);

    /* NOTE(review): the taps are used exactly as make_fgauss returns them;
     * confirm whether they are expected to be normalised first. */
    std::vector<float> vmask = make_fgauss(sigma);
    if (vmask.empty())
    {
        fprintf(stderr, "empty smoothing mask for sigma = %f\n", sigma);
        return 1;
    }
    const int mask_len = (int)vmask.size();

    /* Define the grid BEFORE any kernel launch: a default-constructed dim3
     * is 1x1x1, which would process a single pixel only. */
    dim3 block_size(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
    dim3 n_blocks(blocks_for(width, BLOCK_DIM_X), blocks_for(height, BLOCK_DIM_Y), 1);
    /* The second convolution pass runs on the transposed intermediate image
     * (width and height swapped), so it needs its own grid. */
    dim3 n_blocks_transposed(blocks_for(height, BLOCK_DIM_X), blocks_for(width, BLOCK_DIM_Y), 1);

    /* ---- input image on the device -------------------------------------- */
    rgb *array = NULL;
    check_cuda(cudaMalloc((void **)&array, n_pixels * sizeof(rgb)), "cudaMalloc input pixels");
    check_cuda(cudaMemcpy((void *)array, (void *)input->data, n_pixels * sizeof(rgb),
                          cudaMemcpyHostToDevice), "cudaMemcpy input pixels");

    image in;
    in.h = height;
    in.w = width;
    in.data = (rgb *)array;

    image *input_cuda = NULL;
    check_cuda(cudaMalloc((void **)&input_cuda, sizeof(image)), "cudaMalloc input header");
    check_cuda(cudaMemcpy((void *)input_cuda, (void *)&in, sizeof(image),
                          cudaMemcpyHostToDevice), "cudaMemcpy input header");

    /* ---- scratch buffer: r/g/b planes, transposed temporary, mask --------
     * Layout: 4 imagef headers | mask taps | 4 float planes.
     * The headers come first because they contain a pointer and need
     * stricter alignment than the float data that follows. */
    const size_t scratch_bytes = 4 * sizeof(imagef) + (size_t)mask_len * sizeof(float) + 4 * plane_bytes;
    void *bigMem = NULL;
    check_cuda(cudaMalloc(&bigMem, scratch_bytes), "cudaMalloc smoothing scratch");

    char *cursor = (char *)bigMem;
    imagef *r = (imagef *)cursor;        cursor += sizeof(imagef);
    imagef *g = (imagef *)cursor;        cursor += sizeof(imagef);
    imagef *b = (imagef *)cursor;        cursor += sizeof(imagef);
    imagef *tmp_img = (imagef *)cursor;  cursor += sizeof(imagef);
    float *maski = (float *)cursor;      cursor += (size_t)mask_len * sizeof(float);
    float *arrayf1 = (float *)cursor;    cursor += plane_bytes;
    float *arrayf2 = (float *)cursor;    cursor += plane_bytes;
    float *arrayf3 = (float *)cursor;    cursor += plane_bytes;
    float *arrayftmp = (float *)cursor;

    upload_imagef_header(r, width, height, arrayf1, "memcpy r");
    upload_imagef_header(g, width, height, arrayf2, "memcpy g");
    upload_imagef_header(b, width, height, arrayf3, "memcpy b");
    /* The temporary holds the transposed result of the first pass. */
    upload_imagef_header(tmp_img, height, width, arrayftmp, "memcpy tmp_img");

    /* Copy the taps straight from the vector; no host staging buffer needed. */
    check_cuda(cudaMemcpy((void *)maski, (void *)&vmask[0], (size_t)mask_len * sizeof(float),
                          cudaMemcpyHostToDevice), "memcpy maski");

    /* ---- smoothed planes ------------------------------------------------
     * Layout: 3 imagef headers | 3 float planes. */
    const size_t smooth_bytes = 3 * sizeof(imagef) + 3 * plane_bytes;
    void *bigMemSmooth = NULL;
    check_cuda(cudaMalloc(&bigMemSmooth, smooth_bytes), "cudaMalloc smoothed planes");

    cursor = (char *)bigMemSmooth;
    imagef *smooth_r = (imagef *)cursor; cursor += sizeof(imagef);
    imagef *smooth_g = (imagef *)cursor; cursor += sizeof(imagef);
    imagef *smooth_b = (imagef *)cursor; cursor += sizeof(imagef);
    float *arrayfs1 = (float *)cursor;   cursor += plane_bytes;
    float *arrayfs2 = (float *)cursor;   cursor += plane_bytes;
    float *arrayfs3 = (float *)cursor;

    upload_imagef_header(smooth_r, width, height, arrayfs1, "memcpy smoothr");
    upload_imagef_header(smooth_g, width, height, arrayfs2, "memcpy smoothg");
    upload_imagef_header(smooth_b, width, height, arrayfs3, "memcpy smoothb");

    /* ---- split and smooth -----------------------------------------------
     * All launches go to the default stream and therefore run in order, so
     * tmp_img can be reused for every channel. */
    create_rgb_smooth <<< n_blocks, block_size >>> (r, g, b, input_cuda);

    convolve_even_gpu <<< n_blocks, block_size >>> (r, tmp_img, maski, mask_len);
    convolve_even_gpu <<< n_blocks_transposed, block_size >>> (tmp_img, smooth_r, maski, mask_len);
    convolve_even_gpu <<< n_blocks, block_size >>> (g, tmp_img, maski, mask_len);
    convolve_even_gpu <<< n_blocks_transposed, block_size >>> (tmp_img, smooth_g, maski, mask_len);
    convolve_even_gpu <<< n_blocks, block_size >>> (b, tmp_img, maski, mask_len);
    convolve_even_gpu <<< n_blocks_transposed, block_size >>> (tmp_img, smooth_b, maski, mask_len);

    check_cuda(cudaGetLastError(), "smoothing kernel launch");
    /* cudaThreadSynchronize matches the cutil-era toolkit this file targets. */
    check_cuda(cudaThreadSynchronize(), "smoothing kernels");

    /* The unsmoothed planes are no longer needed; release them before the
     * (much larger) edge buffer is allocated. input_cuda must stay alive:
     * compute_edges still reads the image size from it. */
    check_cuda(cudaFree(bigMem), "cudaFree smoothing scratch");
    bigMem = NULL;

    /* ---- edge list ------------------------------------------------------
     * Layout: edge pointers | edges | weights (strictest alignment first). */
    const size_t edges_bytes = n_edges * (sizeof(edge *) + sizeof(edge) + sizeof(float));
    void *bigMemEdges = NULL;
    check_cuda(cudaMalloc(&bigMemEdges, edges_bytes), "cudaMalloc edges");

    cursor = (char *)bigMemEdges;
    edge **edges_location = (edge **)cursor; cursor += n_edges * sizeof(edge *);
    edge *edges = (edge *)cursor;            cursor += n_edges * sizeof(edge);
    float *weights = (float *)cursor;

    printf("n_blocks.x = %d\n", (int)n_blocks.x);
    printf("n_blocks.y = %d\n", (int)n_blocks.y);

    /* Compute edges */
    compute_edges <<< n_blocks, block_size >>> (input_cuda, edges, edges_location, weights,
                                                smooth_r, smooth_g, smooth_b);
    check_cuda(cudaGetLastError(), "compute_edges launch");
    check_cuda(cudaThreadSynchronize(), "compute_edges");

    check_cuda(cudaFree(bigMemSmooth), "cudaFree smoothed planes");
    check_cuda(cudaFree(input_cuda), "cudaFree input header");
    check_cuda(cudaFree(array), "cudaFree input pixels");

    edge *ed_temp = (edge *)malloc(n_edges * sizeof(edge));
    if (ed_temp == NULL)
    {
        fprintf(stderr, "out of host memory for %lu edges\n", (unsigned long)n_edges);
        cudaFree(bigMemEdges);
        return 1;
    }
    check_cuda(cudaMemcpy((void *)ed_temp, (void *)edges, n_edges * sizeof(edge),
                          cudaMemcpyDeviceToHost), "memcpy edges to host");

    printf("a0 = %d\n", ed_temp[0].a);
    printf("b0 = %d\n", ed_temp[0].b);

    /* TODO: sort the edges by weight. Disabled sketch using CUDPP, with
     * weights as keys and edges_location as values:
     *
     *   CUDPPConfiguration config;
     *   config.op = CUDPP_MIN;
     *   config.datatype = CUDPP_FLOAT;
     *   config.algorithm = CUDPP_SORT_RADIX;
     *   config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
     *
     *   CUDPPHandle scanplan = 0;
     *   CUDPPResult result = cudppPlan(&scanplan, config, 4 * input->h * input->w, 1, 0);
     *   if (CUDPP_SUCCESS != result) { printf("Error creating CUDPPPlan\n"); exit(-1); }
     *
     *   result = cudppSort(scanplan, weights, edges_location, 32, 4 * input->h * input->w);
     *
     *   result = cudppDestroyPlan(scanplan);
     *   if (CUDPP_SUCCESS != result) { printf("Error destroying CUDPPPlan\n"); exit(-1); }
     */

    gettimeofday(&end, NULL);

    seconds = end.tv_sec - start.tv_sec;
    useconds = end.tv_usec - start.tv_usec;

    /* Milliseconds, rounded to nearest. */
    mtime = ((seconds) * 1000 + useconds / 1000.0) + 0.5;

    printf("Elapsed time: %ld milliseconds\n", mtime);

    // Cleanup
    free(ed_temp);
    check_cuda(cudaFree(bigMemEdges), "cudaFree edges");
    /* `input` is left to process teardown: how loadPPM allocates it is not
     * visible from this file. */

    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: proiecte/pmake3d/segment/segment.cu @ 77

Download in other formats: