Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

segment.cu @ 77

Last change on this file since 77 was 77, checked in by (none), 14 years ago
Added parallelized code
File size: 13.8 KB

Rev	Line
[77]	1	// example1.cpp : Defines the entry point for the console application.
	2	//
	3
	4	//#include "stdafx.h"
	5
	6	#include <vector>
	7	#include <stdio.h>
	8	#include <stdlib.h>
	9	#include <float.h>
	10	#include <cuda.h>
	11	#include <cudpp.h>
	12	#include <cutil.h>
	13	#include <math_functions.h>
	14	#include <image.h>
	15	#include <misc.h>
	16	#include <pnmfile.h>
	17	#include <cutil_math.h>
	18	#include <sys/time.h>
	19	#include <unistd.h>
	20
	21
	22	#include "segment-image.h"
	23
	24
	25	#define BLOCK_DIM_X 12
	26	#define BLOCK_DIM_Y 8
	27
	28	#define max FLT_MAX
	29
	30
	31
	32	__global__ void convolve_even_gpu(imagef src, imagef dst, float *mask, int dim_mask){
	33	int w = src->w;
	34	int h = src->h;
	35	int x = blockIdx.x * blockDim.x + threadIdx.x;
	36	int y = blockIdx.y * blockDim.y + threadIdx.y;
	37	int idx = x + y*w;
	38	int i;
	39	float sum;
	40	int mmax,mmin;
	41
	42	if (x>=w \|\| y>=h)
	43	return;
	44
	45	sum = mask[0] * src->data[idx];
	46
	47	for (i = 1; i < dim_mask; i++) {
	48	mmax = (x-i > 0 ? x-i : 0);
	49	//mmax = max(x-i,0);// > 0 ? x-i : 0)
	50	mmin = (x+i > w-1 ? x+i : w-1);
	51	sum += mask[i] * (src->data[mmax + y * w]) + src->data[mmin + y * w];
	52	}
	53	dst->data[y + h * x] = sum;
	54
	55	}
	56
	57
	58	__global__ void create_rgb_smooth(imagef r, imagef g, imagef b, image input)
	59	{
	60	int w = input->w;
	61	int h = input->h;
	62	int x = blockIdx.x * blockDim.x + threadIdx.x;
	63	int y = blockIdx.y * blockDim.y + threadIdx.y;
	64	int idx = x + y*w;
	65
	66	if (x>=w \|\| y>=h)
	67	return;
	68
	69	r->data[idx] = input->data[idx].r;
	70	g->data[idx] = input->data[idx].g;
	71	b->data[idx] = input->data[idx].b;
	72
	73	}
	74
	75	#define diffgpu(r, g, b, x1, y1, x2, y2) \
	76	sqrtf(powf(r->data[x1+wy1]-r->data[x2+wy2],2) + \
	77	powf(g->data[x1+wy1]-g->data[x2+wy2],2) + \
	78	powf(b->data[x1+wy1]-b->data[x2+wy2],2)); \
	79
	80
	81	// Kernel that executes on the CUDA device
	82	__global__ void compute_edges(image input, edge edges, edge *edges_location,float weights,imagef *smooth_r,
	83	imagef smooth_g,imagef smooth_b )
	84	{
	85	//int idx = blockIdx.x * blockDim.x + threadIdx.x;
	86	int w = input->w;
	87	int h = input->h;
	88	int x = blockIdx.x * blockDim.x + threadIdx.x;
	89	int y = blockIdx.y * blockDim.y + threadIdx.y;
	90	int idx = x + y*w;
	91
	92	// indx[idx] = idx;
	93
	94	/* if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0)
	95	input->w = input->w + 1;
	96
	97	edges[0].a = 23;
	98	edges[0].b = 24;
	99	*/
	100
	101	if (x>=w \|\| y>=h)
	102	return;
	103
	104	if (x < w-1)
	105	{
	106	edges[idx4].a = y w + x;
	107	edges[idx4].b = y w + (x+1);
	108	weights[idx*4] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y);
	109	}
	110	else
	111	{
	112	edges[idx*4].a = -1;
	113	edges[idx*4].b = -1;
	114	weights[idx*4] = max;
	115	}
	116
	117
	118	if (y < h-1)
	119	{
	120	edges[idx4+1].a = y w + x;
	121	edges[idx4+1].b = (y+1) w + x;
	122	weights[idx*4+1] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x, y+1);
	123	}
	124	else
	125	{
	126	edges[idx*4+1].a = -1;
	127	edges[idx*4+1].b = -1;
	128	weights[idx*4+1] = max;
	129	}
	130
	131	if ((x < w-1) && (y < h-1))
	132	{
	133	edges[idx4+2].a = y w + x;
	134	edges[idx4+2].b = (y+1) w + (x+1);
	135	weights[idx*4+2] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y+1);
	136	}
	137	else
	138	{
	139	edges[idx*4+2].a = -1;
	140	edges[idx*4+2].b = -1;
	141	weights[idx*4+2] = max;
	142
	143	}
	144
	145	if ((x < w-1) && (y > 0))
	146	{
	147	edges[idx4+3].a = y w + x;
	148	edges[idx4+3].b = (y-1) w + (x+1);
	149	weights[idx*4+3] = diffgpu(smooth_r, smooth_g, smooth_b, x, y, x+1, y-1);
	150	}
	151	else
	152	{
	153	edges[idx*4+3].a = -1;
	154	edges[idx*4+3].b = -1;
	155	weights[idx*4+3] = max;
	156	}
	157
	158	edges_location[idx4] = &(edges[idx4]);
	159	edges_location[idx4+1] = &(edges[idx4+1]);
	160	edges_location[idx4+2] = &(edges[idx4+2]);
	161	edges_location[idx4+3] = &(edges[idx4+3]);
	162
	163
	164	/* if (idx == 0)
	165	input->h = input->h + 1;
	166	*/
	167
	168	}
	169
	170	// main routine that executes on the host
	171	int main(int argc, char **argv)
	172	{
	173	if (argc != 6)
	174	{
	175	fprintf(stderr, "usage: %s sigma k min input(ppm) output(ppm)\n", argv[0]);
	176	return 1;
	177	}
	178
	179
	180	dim3 n_blocks, block_size;
	181	//int n_blocks, block_size;
	182	float sigma = atof(argv[1]);
	183	float k = atof(argv[2]);
	184	int min_size = atoi(argv[3]);
	185	rgb *array;
	186	edge *edges;
	187	edge *ed_temp;
	188
	189	float* weights;
	190	edge** edges_location;
	191
	192
	193	struct timeval start, end;
	194	long mtime, seconds, useconds;
	195
	196	gettimeofday(&start, NULL);
	197
	198	printf("loading input image.\n");
	199	image *input = loadPPM(argv[4]);
	200
	201	/* Alloc image on device*/
	202	image *input_cuda;
	203
	204	image in;
	205
	206	cudaMalloc((void *) &array, input->h input->w * sizeof(rgb));
	207	cudaMemcpy((void ) array, (void ) input->data, input->h * input->w * sizeof(rgb), cudaMemcpyHostToDevice);
	208
	209
	210	in.h = input->h;
	211	in.w = input->w;
	212	in.data = (rgb*)array;
	213
	214	cudaMalloc((void **) &input_cuda, sizeof(image));
	215	cudaMemcpy((void ) input_cuda, (void ) &in, sizeof(image), cudaMemcpyHostToDevice);
	216
	217
	218	/* Alocare matrci r,g,b de image_float*/
	219	imagef *r;
	220	imagef *g;
	221	imagef *b;
	222	// imagef *tmp;
	223	imagef tmp;
	224
	225	imagef *tmp_img;
	226	imagef tmp_img_cpu;
	227
	228	float *arrayftmp;
	229
	230
	231	float arrayf1, arrayf2, *arrayf3;
	232	std::vector<float> vmask = make_fgauss(sigma);
	233	float mask = (float )malloc(vmask.size()sizeof(float)) + 3 sizeof(imagef);
	234	for (int i=0; i<vmask.size(); i++)
	235	mask[i]=vmask[i];
	236	float *maski;
	237	// tmp = (imagef )(((char )mask) + vmask.size()*sizeof(float));
	238
	239
	240
	241	unsigned int rgb_size = 4 * input->h * input->w * sizeof(float) + 4 * sizeof(imagef) + sizeof(float) * vmask.size();
	242	void *bigMem;
	243	if (cudaMalloc(&bigMem, rgb_size) != cudaSuccess)
	244	printf("ERROR big_malloc 1\n");
	245	char initial = (char )bigMem;
	246	maski = (float *)initial;
	247	initial += sizeof(float) * vmask.size();
	248	r = (imagef *)initial;
	249	initial+=sizeof(imagef);
	250	g = (imagef *)initial;
	251	initial+=sizeof(imagef);
	252	b = (imagef *)initial;
	253	initial+=sizeof(imagef);
	254	tmp_img = (imagef *)initial;
	255	initial+=sizeof(imagef);
	256	arrayf1 = (float *)initial;
	257	initial += input->h * input->w * sizeof(float);
	258	arrayf2 = (float *)initial;
	259	initial += input->h * input->w * sizeof(float);
	260	arrayf3 = (float *)initial;
	261	initial += input->h * input->w * sizeof(float);
	262	arrayftmp = (float *)initial;
	263	/*
	264	for (int i=0; i<3; i++){
	265	tmp[i].w=input->w;
	266	tmp[i].h=input->h;
	267	}
	268	tmp[0].data = arrayf1;
	269	tmp[1].data = arrayf2;
	270	tmp[2].data = arrayf3;
	271	CUDA_SAFE_CALL(cudaMemcpy((void )maski, (void )mask, 3sizeof(imagef) + vmask.size()sizeof(float), cudaMemcpyHostToDevice));
	272	*/
	273	// exit(0);
	274	/*
	275	if (cudaMalloc((void **) &r, sizeof(imagef)) != cudaSuccess)
	276	printf("ERROR11\n");
	277	if (cudaMalloc((void **) &g, sizeof(imagef)) != cudaSuccess)
	278	printf("ERROR12\n");
	279
	280	if (cudaMalloc((void **) &b, sizeof(imagef)) != cudaSuccess)
	281	printf("ERROR13\n");
	282
	283	cudaMalloc((void *) &arrayf1, input->h input->w * sizeof(float));
	284	*/
	285
	286	tmp.w = input->w;
	287	tmp.h = input->h;
	288	tmp.data = arrayf1;
	289
	290	CUDA_SAFE_CALL(cudaMemcpy((void ) r, (void ) &tmp, sizeof(imagef), cudaMemcpyHostToDevice));
	291
	292	// cudaMalloc((void *) &arrayf2, input->h input->w * sizeof(float));
	293	tmp.data = arrayf2;
	294	if (cudaMemcpy((void ) g, (void ) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess)
	295	printf("Error memcpy g\n");
	296
	297	// cudaMalloc((void *) &arrayf3, input->h input->w * sizeof(float));
	298	tmp.data = arrayf3;
	299	if (cudaMemcpy((void ) b, (void ) &tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess)
	300	printf("Error memcpy b\n");
	301
	302	if (cudaMemcpy((void ) maski, (void ) mask, vmask.size() * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess)
	303	printf("Error memcpy maski\n");
	304
	305
	306	//smooth alloc
	307
	308	imagef *smooth_r;
	309	imagef *smooth_g;
	310	imagef *smooth_b;
	311	imagef smooth_tmp;
	312
	313
	314	float arrayfs1, arrayfs2, *arrayfs3;
	315
	316
	317	unsigned int smooth_size = 3 * input->h * input->w * sizeof(float) + 3 * sizeof(imagef);
	318	void *bigMemSmooth;
	319	CUDA_SAFE_CALL(cudaMalloc(&bigMemSmooth, smooth_size));
	320
	321	initial = (char *)bigMemSmooth;
	322	smooth_r = (imagef *)initial;
	323	initial+=sizeof(imagef);
	324	smooth_g = (imagef *)initial;
	325	initial+=sizeof(imagef);
	326	smooth_b = (imagef *)initial;
	327	initial+=sizeof(imagef);
	328	arrayfs1 = (float *)initial;
	329	initial += input->h * input->w * sizeof(float);
	330	arrayfs2 = (float *)initial;
	331	initial += input->h * input->w * sizeof(float);
	332	arrayfs3 = (float *)initial;
	333
	334	/*
	335	if (cudaMalloc((void **) &smooth_r, sizeof(imagef)) != cudaSuccess)
	336	printf("ERROR14\n");
	337	if (cudaMalloc((void **) &smooth_g, sizeof(imagef)) != cudaSuccess)
	338	printf("ERROR15\n");
	339
	340	if (cudaMalloc((void **) &smooth_b, sizeof(imagef)) != cudaSuccess)
	341	printf("ERROR16\n");
	342
	343	if (cudaMalloc((void *) &arrayfs1, input->h input->w * sizeof(float))!=cudaSuccess)
	344	printf("Error malloc arrayfs1\n");
	345	*/
	346	smooth_tmp.w = input->w;
	347	smooth_tmp.h = input->h;
	348	smooth_tmp.data = arrayfs1;
	349	if (cudaMemcpy((void ) smooth_r, (void ) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess)
	350	printf("Error memcpy smoothr\n");
	351
	352	// if (cudaMalloc((void *) &arrayfs2, input->h input->w * sizeof(float))!=cudaSuccess)
	353	// printf("Error malloc arrayfs2\n");
	354	smooth_tmp.data = arrayfs2;
	355	if (cudaMemcpy((void ) smooth_g, (void ) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess)
	356	printf("Error memcpy smoothg\n");
	357
	358	// if (cudaMalloc((void *) &arrayfs3, input->h input->w * sizeof(float))!=cudaSuccess)
	359	// printf("Error malloc arrayfs3\n");
	360	smooth_tmp.data = arrayfs3;
	361	if (cudaMemcpy((void ) smooth_b, (void ) &smooth_tmp, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess)
	362	printf("Error memcpy smoothb\n");
	363
	364	//alloc tmp matrix for smooth
	365	// if (cudaMalloc((void **) &tmp_img, sizeof(imagef)) != cudaSuccess)
	366	// printf("ERROR17\n");
	367	// cudaMalloc((void *) &arrayftmp, input->w input->h * sizeof(float));
	368	tmp_img_cpu.w = input->h;
	369	tmp_img_cpu.h = input->w;
	370	tmp_img_cpu.data = arrayftmp;
	371	if (cudaMemcpy((void ) tmp_img, (void ) &tmp_img_cpu, sizeof(imagef), cudaMemcpyHostToDevice)!=cudaSuccess)
	372	printf("Error memcpy tmp_img\n");
	373
	374	// if (cudaMalloc((void *) &maski, sizeof(float)vmask.size()) != cudaSuccess)
	375	// printf("ERROR18\n");
	376
	377
	378	create_rgb_smooth <<< n_blocks, block_size >>> (r, g, b, input_cuda);
	379
	380	convolve_even_gpu <<< n_blocks, block_size >>> (r, tmp_img, maski, vmask.size());
	381	convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_r, maski, vmask.size());
	382	convolve_even_gpu <<< n_blocks, block_size >>> (g, tmp_img, maski, vmask.size());
	383	convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_g, maski, vmask.size());
	384	convolve_even_gpu <<< n_blocks, block_size >>> (b, tmp_img, maski, vmask.size());
	385	convolve_even_gpu <<< n_blocks, block_size >>> (tmp_img, smooth_b, maski, vmask.size());
	386
	387
	388	//free(mask);
	389	/*
	390	cudaFree(maski);
	391	cudaFree(tmp_img);
	392	cudaFree(r);
	393	cudaFree(g);
	394	cudaFree(b);
	395	cudaFree(arrayf1);
	396	cudaFree(arrayf2);
	397	cudaFree(arrayf3);
	398	cudaFree(arrayftmp);
	399	*/
	400	CUDA_SAFE_CALL(cudaFree(input_cuda));
	401	CUDA_SAFE_CALL(cudaFree(array));
	402
	403	CUDA_SAFE_CALL(cudaFree(bigMem));
	404
	405
	406	/* Alloc edges' vector on device */
	407	//CUDA_SAFE_CALL(cudaMalloc((void *) &edges, (4 input->h * input->w )* sizeof(edge)));
	408	//CUDA_SAFE_CALL(cudaMalloc((void *) &weights, (4 input->h * input->w )* sizeof(float)));
	409	unsigned int size_edges = 4 * input->h * input->w * sizeof(edge) + 4 * input->h * input->w * sizeof(edge) + 4 input->h * input->w * sizeof(float);
	410	CUDA_SAFE_CALL(cudaMalloc((void **) &bigMem, size_edges));
	411
	412	//exit(0);
	413
	414	/*
	415	if (cudaMalloc((void *) &edges, 4 input->h * input->w * sizeof(edge)) != cudaSuccess)
	416	printf("ERROR19\n");
	417	if (cudaMalloc((void *) &edges_location, (4 input->h * input->w )* sizeof(edge*)) != cudaSuccess)
	418	printf("ERROR1A\n");
	419
	420	if (cudaMalloc((void *) &weights, (4 input->h * input->w )* sizeof(float)) != cudaSuccess)
	421	printf("ERROR1B\n");
	422	*/
	423	initial = (char *)bigMem;
	424	edges = (edge*)initial;
	425	initial += 4 * input->h * input->w * sizeof(edge);
	426	edges_location = (edge **)initial;
	427	initial += 4 * input->h * input->w * sizeof(edge*);
	428	weights = (float *)initial;
	429
	430	/* Define the grid */
	431	n_blocks.x = input->w / BLOCK_DIM_X + (input->w % BLOCK_DIM_X == 0?0:1);
	432	n_blocks.y = input->h / BLOCK_DIM_Y + (input->h % BLOCK_DIM_Y == 0?0:1);
	433	n_blocks.z = 1;
	434
	435	printf("n_blocks.x = %d\n", n_blocks.x);
	436	printf("n_blocks.y = %d\n", n_blocks.y);
	437
	438
	439	block_size.x = BLOCK_DIM_X;
	440	block_size.y = BLOCK_DIM_Y;
	441	block_size.z = 1;
	442
	443	/* Compute edges */
	444	compute_edges <<< n_blocks, block_size >>> (input_cuda, edges, edges_location, weights,smooth_r,smooth_g,smooth_b);
	445	/*
	446	cudaFree(smooth_r);
	447	cudaFree(smooth_g);
	448	cudaFree(smooth_b);
	449	cudaFree(arrayfs1);
	450	cudaFree(arrayfs2);
	451	cudaFree(arrayfs3);
	452	*/
	453	CUDA_SAFE_CALL(cudaFree(bigMemSmooth));
	454
	455	ed_temp = (edge )malloc((4 input->h * input->w )* sizeof(edge));
	456	if (cudaMemcpy((void ) ed_temp, (void ) edges, 4 * input->h * input->w * sizeof(edge), cudaMemcpyDeviceToHost) != cudaSuccess)
	457	printf("ERROR2\n");
	458
	459	/* if (cudaMemcpy((void ) indx_temp, (void ) indx, input->h * input->w * sizeof(int), cudaMemcpyDeviceToHost))
	460	printf("ERROR3\n");
	461	*/
	462
	463	printf("a0 = %d\n", ed_temp[0].a);
	464	printf("b0 = %d\n", ed_temp[0].b);
	465
	466	//sort edges
	467	/*
	468	CUDPPConfiguration config;
	469	config.op = CUDPP_MIN;
	470	config.datatype = CUDPP_FLOAT;
	471	config.algorithm = CUDPP_SORT_RADIX;
	472	config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
	473
	474	CUDPPHandle scanplan = 0;
	475	CUDPPResult result = cudppPlan(&scanplan, config, 4 * input->h * input->w, 1, 0);
	476
	477	result = cudppSort ( scanplan,
	478	weights,
	479	edges_location,
	480	32,
	481	4 * input->h * input->w
	482	);
	483
	484	result = cudppDestroyPlan(scanplan);
	485	if (CUDPP_SUCCESS != result)
	486	{
	487	printf("Error destroying CUDPPPlan\n");
	488	exit(-1);
	489	}
	490
	491
	492	if (CUDPP_SUCCESS != result)
	493	{
	494	printf("Error creating CUDPPPlan\n");
	495	exit(-1);
	496	}
	497	*/
	498	gettimeofday(&end, NULL);
	499
	500	seconds = end.tv_sec - start.tv_sec;
	501	useconds = end.tv_usec - start.tv_usec;
	502
	503	mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;
	504
	505	printf("Elapsed time: %ld milliseconds\n", mtime);
	506
	507	// Cleanup
	508	//exit(0);
	509	//cudaFree(input_cuda);
	510	CUDA_SAFE_CALL(cudaFree(bigMem));
	511	//cudaFree(array);
	512	//cudaFree(edges);
	513	//cudaFree(edges_location);
	514	//cudaFree(weights);
	515
	516
	517	return 0;
	518
	519	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: proiecte/pmake3d/segment/segment.cu @ 77

Download in other formats: