#include "NBody.hpp"

/*
 * File-scope state shared between the NBody member functions and the free
 * GLUT callbacks defined further down in this file.
 */
int numBodies;                      // No. of particles
cl_float* pos;                      // Output position (read by displayfunc when running on "gpu")
void* me;                           // Points to the NBody instance created in main()
cl_bool display;                    // If it is true then the OpenGL display is used
long curr_step = 0;                 // Retains the current step in the simulation
long n_steps = 110;                 // Number of steps of the simulation
cl_bool verify = false;             // Compares the final position vectors resulting from running
                                    // the code on CPU and GPU (not read anywhere in this file)
std::string deviceType("cpu");      // Says on which device we want to do the computations
time_t rawtime;                     // Variables used for measuring the duration of the run
struct tm * timeinfo;

/**
 * @brief Allocate a 16-byte aligned buffer of the given size.
 *
 * Uses _aligned_malloc on Windows and memalign elsewhere; the matching
 * release calls live in NBody::~NBody().
 *
 * @param bytes size of the buffer in bytes
 * @return pointer to the buffer, or NULL when the allocation fails
 */
static cl_float* allocAligned16(size_t bytes)
{
#if defined (_WIN32)
    return static_cast<cl_float*>(_aligned_malloc(bytes, 16));
#else
    return static_cast<cl_float*>(memalign(16, bytes));
#endif
}

/**
 * @brief Pseudo-random float obtained by linear interpolation between the two bounds.
 *
 * rand() is scaled to [0, 1] and used as the blend factor, so the result lies
 * between randMin and randMax no matter which of the two is larger.
 *
 * @param randMax bound returned when the scaled rand() value is 1
 * @param randMin bound returned when the scaled rand() value is 0
 * @return a value between randMin and randMax
 */
float NBody::random(float randMax, float randMin)
{
    const float result = (float)rand() / (float)RAND_MAX;

    return ((1.0f - result) * randMin + result * randMax);
}

/**
 * @brief Allocate and initialise the host-side particle arrays.
 *
 * Rounds numParticles to a non-zero multiple of GROUP_SIZE, fills initPos /
 * initVel with the start configuration and copies it into the arrays used by
 * the selected device type ("gpu": pos / vel, "cpu": refPos / refVel).
 *
 * @return 0 on success, 1 when a host allocation fails
 */
int NBody::setupNBody()
{
    //--------------------------------------------
    // Make sure numParticles is a multiple of the group size
    numParticles = (numParticles < GROUP_SIZE) ? GROUP_SIZE : numParticles;
    numParticles = (numParticles / GROUP_SIZE) * GROUP_SIZE;

    numBodies = numParticles;

    // Every array holds four floats per body (cl_float4 layout).
    const size_t bufferBytes = numBodies * sizeof(cl_float4);

    //--------------------------------------------
    // First we will use the initPos and initVel vectors to generate the
    // input data. They are used only in this function to
    // initialize the other vectors: the pos and vel vectors
    // used in the simulation on GPU only, and refPos and refVel used in
    // the simulation on CPU only
    initPos = static_cast<cl_float*>(malloc(bufferBytes));
    if(initPos == NULL)
    {
        std::cout << "Failed to allocate host memory. (initPos)" << std::endl;
        return 1;
    }

    initVel = static_cast<cl_float*>(malloc(bufferBytes));
    if(initVel == NULL)
    {
        std::cout << "Failed to allocate host memory. (initVel)" << std::endl;
        return 1;
    }

    // Initialization of inputs
    for(int i = 0; i < numBodies; ++i)
    {
        const int index = 4 * i;

        // First 3 values are the position in x, y and z direction
        for(int j = 0; j < 3; ++j)
        {
            initPos[index + j] = random(3, 50);
        }

        // Mass value
        initPos[index + 3] = random(1, 1000);

        // First 3 values are the velocity in x, y and z direction
        for(int j = 0; j < 3; ++j)
        {
            initVel[index + j] = 0.0f;
        }

        // Unused 4th component. It must be indexed per body: writing a fixed
        // element would leave the other bodies' 4th component uninitialised.
        initVel[index + 3] = 0.0f;
    }

    //--------------------------------------------
    // Variables used when running on GPU only:
    if(deviceType.compare("gpu") == 0)
    {
        pos = allocAligned16(bufferBytes);
        if(pos == NULL)
        {
            std::cout << "Failed to allocate host memory. (pos)" << std::endl;
            return 1;
        }

        vel = allocAligned16(bufferBytes);
        if(vel == NULL)
        {
            std::cout << "Failed to allocate host memory. (vel)" << std::endl;
            return 1;
        }

        // Copy the auxiliary vectors into the pos and vel ones:
        memcpy(pos, initPos, 4 * numBodies * sizeof(cl_float));
        memcpy(vel, initVel, 4 * numBodies * sizeof(cl_float));
    }

    //--------------------------------------------
    // Variables used when running on CPU only:
    if(deviceType.compare("cpu") == 0)
    {
        refPos = static_cast<cl_float*>(malloc(bufferBytes));
        if(refPos == NULL)
        {
            std::cout << "Failed to allocate host memory. (refPos)" << std::endl;
            return 1;
        }

        refVel = static_cast<cl_float*>(malloc(bufferBytes));
        if(refVel == NULL)
        {
            std::cout << "Failed to allocate host memory. (refVel)" << std::endl;
            return 1;
        }

        // Copy the auxiliary vectors into the refPos and refVel ones:
        memcpy(refPos, initPos, 4 * numBodies * sizeof(cl_float));
        memcpy(refVel, initVel, 4 * numBodies * sizeof(cl_float));
    }

    return 0;
}

/**
 * @brief Create the OpenCL context, queue, buffers, program and kernel.
 *
 * Must run after setupNBody(), because the buffers are created with
 * CL_MEM_USE_HOST_PTR on the host arrays allocated there.
 *
 * @return 0 on success, 1 on any OpenCL or allocation failure
 */
int NBody::setupCL()
{
    cl_int status = CL_SUCCESS;

    cl_device_type dType;
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else // deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
    }

    /* Create context from given device type */
    context = clCreateContextFromType(0, dType, NULL, NULL, &status);

    /*
     * If OpenCL fails to open a context on the GPU
     * then it falls back to CPU
     */
    if(status != CL_SUCCESS && dType == CL_DEVICE_TYPE_GPU)
    {
        std::cout << "Unsupported GPU device; falling back to CPU ..." << std::endl;
        context = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &status);
    }
    if(status != CL_SUCCESS)
    {
        std::cout << "clCreateContextFromType failed." << std::endl;
        return 1;
    }

    /* First, get the size of device list data */
    size_t deviceListSize;
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetContextInfo failed." << std::endl;
        return 1;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = static_cast<cl_device_id*>(malloc(deviceListSize));
    if(devices == NULL)
    {
        std::cout << "Failed to allocate memory (devices)." << std::endl;
        return 1;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetContextInfo failed." << std::endl;
        return 1;
    }

    /* Create command queue on the first device of the context */
    commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
    if(status != CL_SUCCESS)
    {
        std::cout << "clCreateCommandQueue failed." << std::endl;
        return 1;
    }

    /* Get device specific information */
    status = clGetDeviceInfo(devices[0],
                             CL_DEVICE_MAX_WORK_GROUP_SIZE,
                             sizeof(size_t),
                             (void*)&maxWorkGroupSize,
                             NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed." << std::endl;
        return 1;
    }

    //----------------------------------
    // Device infos:
    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};

    status = clGetDeviceInfo(devices[0],
                             CL_DEVICE_VENDOR,
                             sizeof(vendor_name),
                             vendor_name,
                             NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetDeviceInfo CL_DEVICE_VENDOR failed." << std::endl;
        return 1;
    }

    status = clGetDeviceInfo(devices[0],
                             CL_DEVICE_NAME,
                             sizeof(device_name),
                             device_name,
                             NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetDeviceInfo CL_DEVICE_NAME failed." << std::endl;
        return 1;
    }

    std::cout << "Connecting to " << vendor_name << ", " << device_name << " ... " << std::endl;

    status = clGetDeviceInfo(devices[0],
                             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                             sizeof(cl_uint),
                             (void*)&maxDimensions,
                             NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed." << std::endl;
        return 1;
    }

    maxWorkItemSizes = static_cast<size_t*>(malloc(maxDimensions * sizeof(size_t)));
    if(maxWorkItemSizes == NULL)
    {
        std::cout << "Failed to allocate host memory. (maxWorkItemSizes)" << std::endl;
        return 1;
    }

    status = clGetDeviceInfo(devices[0],
                             CL_DEVICE_MAX_WORK_ITEM_SIZES,
                             sizeof(size_t) * maxDimensions,
                             (void*)maxWorkItemSizes,
                             NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed." << std::endl;
        return 1;
    }

    status = clGetDeviceInfo(devices[0],
                             CL_DEVICE_LOCAL_MEM_SIZE,
                             sizeof(cl_ulong),
                             (void*)&totalLocalMemory,
                             NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed." << std::endl;
        return 1;
    }

    /*
     * Create and initialize memory objects
     */
    const bool useGpuArrays = (deviceType.compare("gpu") == 0);
    const bool useCpuArrays = (deviceType.compare("cpu") == 0);
    if(!useGpuArrays && !useCpuArrays)
    {
        // setupNBody() allocates host arrays only for "gpu" and "cpu", so
        // there is nothing to back the buffers with for any other value.
        std::cout << "Unsupported device type: " << deviceType << std::endl;
        return 1;
    }

    /* Create memory object for position */
    updatedPos = clCreateBuffer(context,
                                CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                numBodies * sizeof(cl_float4),
                                useGpuArrays ? pos : refPos,
                                &status);
    if(status != CL_SUCCESS)
    {
        std::cout << "clCreateBuffer failed. (updatePos)" << std::endl;
        return 1;
    }

    /* Create memory object for velocity */
    updatedVel = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                numBodies * sizeof(cl_float4),
                                useGpuArrays ? vel : refVel,
                                &status);
    if(status != CL_SUCCESS)
    {
        std::cout << "clCreateBuffer failed. (updatedVel)" << std::endl;
        return 1;
    }

    /* Create a CL program using the kernel source */
    char *program_source = load_program_source(filename);
    size_t sourceSize[] = { strlen(program_source) };
    program = clCreateProgramWithSource(context,
                                        1,
                                        (const char**)&program_source,
                                        sourceSize,
                                        &status);
    // The program object keeps its own copy of the source text, so the
    // host buffer is released right away (also on the failure path).
    free(program_source);
    program_source = NULL;
    if(status != CL_SUCCESS)
    {
        std::cout << "clCreateProgramWithSource failed." << std::endl;
        return 1;
    }

    /* Create a CL program executable for the device in use */
    status = clBuildProgram(program, 1, &devices[0], NULL, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char * buildLog = NULL;
            size_t buildLogSize = 0;

            // First call only queries the size of the log.
            logStatus = clGetProgramBuildInfo(program,
                                              devices[0],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog,
                                              &buildLogSize);
            if(logStatus != CL_SUCCESS)
            {
                std::cout << "clGetProgramBuildInfo failed." << std::endl;
                return 1;
            }

            buildLog = static_cast<char*>(malloc(buildLogSize));
            if(buildLog == NULL)
            {
                std::cout << "Failed to allocate host memory. (buildLog)" << std::endl;
                return 1;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo(program,
                                              devices[0],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog,
                                              NULL);
            if(logStatus != CL_SUCCESS)
            {
                std::cout << "clGetProgramBuildInfo failed." << std::endl;
                free(buildLog);
                return 1;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        std::cout << "clBuildProgram failed." << std::endl;
        return 1;
    }

    /* Get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program, "nbody_sim", &status);
    if(status != CL_SUCCESS)
    {
        std::cout << "clCreateKernel failed." << std::endl;
        return 1;
    }

    return 0;
}

/**
 * @brief Read a whole file into a freshly malloc'ed, NUL-terminated buffer.
 *
 * Terminates the process with exit(1) when the file cannot be opened, its
 * size cannot be determined or the buffer cannot be allocated, so the
 * function never returns NULL.
 *
 * @param filename path of the kernel source file
 * @return buffer owned by the caller; release it with free()
 */
char* NBody::load_program_source(const char *filename)
{
    FILE *fh = fopen(filename, "r");
    if(fh == NULL)
    {
        std::cout << "Reading the source cod for kernel failed." << std::endl;
        exit(1);
    }

    // Determine the file size by seeking to the end.
    long size = -1;
    if(fseek(fh, 0, SEEK_END) == 0)
    {
        size = ftell(fh);
    }
    rewind(fh);
    if(size < 0)
    {
        std::cout << "Failed to determine the size of the kernel source." << std::endl;
        fclose(fh);
        exit(1);
    }

    char *source = static_cast<char*>(malloc(size + 1));
    if(source == NULL)
    {
        std::cout << "Failed to allocate host memory. (source)" << std::endl;
        fclose(fh);
        exit(1);
    }

    // A short read is tolerated on purpose (the file is opened in text mode,
    // so fewer bytes than the reported size may be delivered); the string is
    // terminated at the number of bytes actually read.
    const size_t bytesRead = fread(source, 1, size, fh);
    source[bytesRead] = '\0';

    fclose(fh);
    return source;
}

/**
 * @brief Bind the six kernel arguments and check the local memory budget.
 *
 * @return 0 on success, 1 when an argument cannot be set or the kernel needs
 *         more local memory than the device reports
 */
int NBody::setupCLKernels()
{
    cl_int status;

    /*** Set appropriate arguments to the kernel ***/

    /* Particle positions */
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&updatedPos);
    if(status != CL_SUCCESS)
    {
        std::cout << "clSetKernelArg failed. (updatedPos)" << std::endl;
        return 1;
    }

    /* Particle velocity */
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&updatedVel);
    if(status != CL_SUCCESS)
    {
        std::cout << "clSetKernelArg failed. (updatedVel)" << std::endl;
        return 1;
    }

    /* numBodies */
    status = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&numBodies);
    if(status != CL_SUCCESS)
    {
        std::cout << "clSetKernelArg failed. (numBodies)" << std::endl;
        return 1;
    }

    /* Time step */
    status = clSetKernelArg(kernel, 3, sizeof(cl_float), (void *)&delT);
    if(status != CL_SUCCESS)
    {
        std::cout << "clSetKernelArg failed. (delT)" << std::endl;
        return 1;
    }

    /* espSqr: the term nBodyCPUReference() adds to the squared distance */
    status = clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&espSqr);
    if(status != CL_SUCCESS)
    {
        std::cout << "clSetKernelArg failed. (espSqr)" << std::endl;
        return 1;
    }

    /* Local memory: four floats per work-item of a group */
    status = clSetKernelArg(kernel, 5, GROUP_SIZE * 4 * sizeof(float), NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clSetKernelArg failed. (localPos)" << std::endl;
        return 1;
    }

    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[0],
                                      CL_KERNEL_LOCAL_MEM_SIZE,
                                      sizeof(cl_ulong),
                                      &usedLocalMemory,
                                      NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed." << std::endl;
        return 1;
    }

    if(usedLocalMemory > totalLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return 1;
    }

    return 0;
}

/**
 * @brief Run one simulation step on the OpenCL device and read back the positions.
 *
 * Enqueues the kernel over numBodies work-items in groups of GROUP_SIZE,
 * waits for it to finish and copies updatedPos into the global pos array.
 *
 * @return 0 on success, 1 on any OpenCL failure or unsupported group size
 */
int NBody::runCLKernels()
{
    cl_int status;
    cl_event events[1];

    /*
     * Enqueue a kernel run call.
     */
    size_t globalThreads[] = { static_cast<size_t>(numBodies) };
    size_t localThreads[] = { static_cast<size_t>(GROUP_SIZE) };

    if(localThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support requested number of work items.";
        return 1;
    }

    status = clEnqueueNDRangeKernel(commandQueue,
                                    kernel,
                                    1,
                                    NULL,
                                    globalThreads,
                                    localThreads,
                                    0,
                                    NULL,
                                    NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "clEnqueueNDRangeKernel failed." << std::endl;
        return 1;
    }

    status = clFinish(commandQueue);
    if(status != CL_SUCCESS)
    {
        std::cout << "clFinish failed." << std::endl;
        return 1;
    }

    /* Enqueue readBuffer (blocking read) */
    status = clEnqueueReadBuffer(commandQueue,
                                 updatedPos,
                                 CL_TRUE,
                                 0,
                                 numBodies * sizeof(cl_float4),
                                 pos,
                                 0,
                                 NULL,
                                 &events[0]);
    if(status != CL_SUCCESS)
    {
        std::cout << "clEnqueueReadBuffer failed." << std::endl;
        return 1;
    }

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(status != CL_SUCCESS)
    {
        std::cout << "clWaitForEvents failed." << std::endl;
        return 1;
    }

    clReleaseEvent(events[0]);

    return 0;
}

/*
 * n-body simulation on cpu
 *
 * Advances refPos / refVel by one time step delT. The arrays are updated in
 * place body by body, so bodies processed later in the outer loop already
 * see the new positions of the bodies processed before them.
 */
void NBody::nBodyCPUReference()
{
    // Iterate over all bodies
    for(int i = 0; i < numBodies; ++i)
    {
        const int myIndex = 4 * i;
        float acc[3] = {0.0f, 0.0f, 0.0f};

        // Accumulate the acceleration contributed by every body
        // (including body i itself, whose distance vector is zero).
        for(int j = 0; j < numBodies; ++j)
        {
            float r[3];
            const int index = 4 * j;

            float distSqr = 0.0f;
            for(int k = 0; k < 3; ++k)
            {
                r[k] = refPos[index + k] - refPos[myIndex + k];
                distSqr += r[k] * r[k];
            }

            // espSqr keeps the denominator away from zero when distSqr is 0.
            float invDist = 1.0f / sqrt(distSqr + espSqr);
            float invDistCube = invDist * invDist * invDist;

            // The 4th component of a position entry is the mass of the body.
            float s = refPos[index + 3] * invDistCube;

            for(int k = 0; k < 3; ++k)
            {
                acc[k] += s * r[k];
            }
        }

        // Position uses the velocity from before this step, then the
        // velocity itself is advanced.
        for(int k = 0; k < 3; ++k)
        {
            refPos[myIndex + k] += refVel[myIndex + k] * delT + 0.5f * acc[k] * delT * delT;
            refVel[myIndex + k] += acc[k] * delT;
        }
    }
}

/**
 * @brief Prepare host data and OpenCL objects, then enable the OpenGL display.
 *
 * @return 0 on success, 1 when setupNBody() or setupCL() fails
 */
int NBody::setup()
{
    if(setupNBody() != 0) // SDK_SUCCESS == 0
        return 1;

    /*
    int timer = sampleCommon->createTimer();
    sampleCommon->resetTimer(timer);
    sampleCommon->startTimer(timer);
    */

    if(setupCL() != 0)
        return 1;

    /*
    sampleCommon->stopTimer(timer);
    // Compute setup time
    setupTime = (double)(sampleCommon->readTimer(timer));
    */

    display = true;

    return 0;
}

/**
 * @brief Initialize GL
 */
void GLInit()
{
    glClearColor(0.0, 0.0, 0.0, 0.0);
    glClear(GL_COLOR_BUFFER_BIT);
    glClear(GL_DEPTH_BUFFER_BIT);
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
}

/**
 * @brief Glut Idle function
 *
 * Advances the step counter; once n_steps is reached it prints the end time
 * and terminates the process.
 */
void idle()
{
    if(curr_step < n_steps)
    {
        curr_step++;
    }
    else
    {
        time(&rawtime);
        timeinfo = localtime(&rawtime);
        std::cout << "End time and date: " << asctime(timeinfo) << std::endl;
        // NOTE(review): the regular end of the run exits with status 1 and
        // without calling cleanup() -- confirm whether that is intended.
        exit(1);
    }

    glutPostRedisplay();
}

/**
 * @brief Glut reshape func
 *
 * @param w width of OpenGL window
 * @param h height of OpenGL window
 */
void reShape(int w, int h)
{
    // Floating-point aspect ratio; a zero height (e.g. a collapsed window)
    // is treated as 1 to avoid a division by zero.
    const double aspect = static_cast<double>(w) / static_cast<double>(h != 0 ? h : 1);

    glViewport(0, 0, w, h);

    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    gluPerspective(45.0f, aspect, 1.0f, 1000.0f);
    gluLookAt(0.0, 0.0, -2.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0);
}

/**
 * @brief OpenGL display function
 *
 * Computes the next simulation step (while curr_step < n_steps) on the
 * selected device and draws every body as a point.
 */
void displayfunc()
{
    NBody* const body = static_cast<NBody*>(me);
    const bool onGpu = (deviceType.compare("gpu") == 0);
    const bool onCpu = (deviceType.compare("cpu") == 0);

    glClearColor(0.0, 0.0, 0.0, 0.0);
    glClear(GL_COLOR_BUFFER_BIT);
    glClear(GL_DEPTH_BUFFER_BIT);

    glPointSize(1.0);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE);
    glEnable(GL_BLEND);
    glDepthMask(GL_FALSE);

    glColor3f(1.0f, 0.6f, 0.0f);

    if(curr_step < n_steps)
    {
        if(onGpu)
        {
            // Calling kernel for calculating subsequent positions
            body->runCLKernels();
        }
        else if(onCpu)
        {
            body->nBodyCPUReference();
        }
    }

    // Pick the position array once instead of comparing strings per body.
    const cl_float* points = NULL;
    if(onGpu)
    {
        points = pos;
    }
    else if(onCpu)
    {
        points = body->refPos;
    }

    glBegin(GL_POINTS);
    if(points != NULL)
    {
        for(int i = 0; i < numBodies; ++i)
        {
            // Divided by 300 just for scaling
            glVertex3d(points[i*4 + 0] / 300, points[i*4 + 1] / 300, points[i*4 + 2] / 300);
        }
    }
    glEnd();

    glFlush();
    glutSwapBuffers();
}

/* keyboard function */
void keyboardFunc(unsigned char key, int mouseX, int mouseY)
{
    switch(key)
    {
        /* If the user hits escape or Q, then exit */
        /* ESCAPE_KEY = 27 */
        case 27:
        case 'q':
        case 'Q':
        {
            if(static_cast<NBody*>(me)->cleanup() != 0) // SDK_SUCCESS == 0
                exit(1);
            else
                exit(0);
        }
        default:
            break;
    }
}

/**
 * @brief Set the kernel arguments; the simulation steps themselves are
 *        driven from displayfunc().
 *
 * @return 0 on success, 1 when setupCLKernels() fails
 */
int NBody::run()
{
    /* Arguments are set and execution call is enqueued on command buffer */
    if(setupCLKernels() != 0) // SDK_SUCCESS == 0
    {
        return 1;
    }

    /*
    if(!quiet)
    {
        // Print the positions of the bodies at the start:
        sampleCommon->printArray("Output", pos, numBodies, 1);
    }
    */

    return 0;
}

/* Statistics output is currently disabled; the body is intentionally empty. */
void NBody::printStats()
{
    /*
    std::string strArray[3] = {"Particles", "Iterations", "Time(sec)"};
    std::string stats[3];

    totalTime = setupTime + kernelTime;

    stats[0] = sampleCommon->toString(numParticles, std::dec);
    stats[1] = sampleCommon->toString(ITER, std::dec);
    stats[2] = sampleCommon->toString(totalTime, std::dec);

    this->SDKSample::printStats(strArray, stats, 3);
    */
}

/**
 * @brief Release the OpenCL objects created by setupCL().
 *
 * Stops at the first release call that fails.
 *
 * @return 0 on success, 1 when a release call fails
 */
int NBody::cleanup()
{
    /* Releases OpenCL resources (Context, Memory etc.) */
    cl_int status;

    status = clReleaseKernel(kernel);
    if(status != CL_SUCCESS)
    {
        std::cout << "clReleaseKernel failed." << std::endl;
        return 1;
    }

    status = clReleaseProgram(program);
    if(status != CL_SUCCESS)
    {
        std::cout << "clReleaseProgram failed." << std::endl;
        return 1;
    }

    status = clReleaseMemObject(updatedPos);
    if(status != CL_SUCCESS)
    {
        std::cout << "clReleaseMemObject failed." << std::endl;
        return 1;
    }

    status = clReleaseMemObject(updatedVel);
    if(status != CL_SUCCESS)
    {
        std::cout << "clReleaseMemObject failed." << std::endl;
        return 1;
    }

    status = clReleaseCommandQueue(commandQueue);
    if(status != CL_SUCCESS)
    {
        std::cout << "clReleaseCommandQueue failed." << std::endl;
        return 1;
    }

    status = clReleaseContext(context);
    if(status != CL_SUCCESS)
    {
        std::cout << "clReleaseContext failed." << std::endl;
        return 1;
    }

    return 0;
}

/* Releases the host-side buffers allocated by setupNBody() and setupCL(). */
NBody::~NBody()
{
    /* release program resources */
    if(initPos)
    {
        free(initPos);
        initPos = NULL;
    }

    if(initVel)
    {
        free(initVel);
        initVel = NULL;
    }

    // pos and vel come from the aligned allocator, which needs its own
    // release call on Windows.
    if(pos)
    {
#if defined (_WIN32)
        _aligned_free(pos);
#else
        free(pos);
#endif
        pos = NULL;
    }

    if(vel)
    {
#if defined (_WIN32)
        _aligned_free(vel);
#else
        free(vel);
#endif
        vel = NULL;
    }

    if(devices)
    {
        free(devices);
        devices = NULL;
    }

    if(refPos)
    {
        free(refPos);
        refPos = NULL;
    }

    if(refVel)
    {
        free(refVel);
        refVel = NULL;
    }

    if(maxWorkItemSizes)
    {
        free(maxWorkItemSizes);
        maxWorkItemSizes = NULL;
    }
}

/*
 * Program entry point: sets up the simulation, prints the start time and
 * hands control to the GLUT main loop when the display is enabled.
 */
int main(int argc, char * argv[])
{
    NBody clNBody("OpenCL NBody");
    me = &clNBody;

    if(clNBody.setup() != 0)
        return 1;
    if(clNBody.run() != 0)
        return 1;

    time(&rawtime);
    timeinfo = localtime(&rawtime);
    std::cout << "Start time and date: " << asctime(timeinfo) << std::endl;

    if(display)
    {
        // Run in graphical window if requested
        glutInit(&argc, argv);
        glutInitWindowPosition(100, 10);
        glutInitWindowSize(600, 600);
        glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
        glutCreateWindow("NBody simulation");
        GLInit();
        glutDisplayFunc(displayfunc);
        glutReshapeFunc(reShape);
        glutIdleFunc(idle);
        glutKeyboardFunc(keyboardFunc);
        glutMainLoop();
    }

    if(clNBody.cleanup() != 0)
        return 1;

    clNBody.printStats();

    return 0;
}