#ifndef NBODY_H_
#define NBODY_H_

// standard utility and system includes
#include <oclUtils.h>

// GLEW and GLUT includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
    #include <GLUT/glut.h>
#else
    #include <GL/glut.h>
#endif

// Extra CL/GL include
#include <CL/cl_gl.h>


#include <iostream>
#include <iomanip>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <cmath>
#include <malloc.h>
#include <time.h>

#include <GL/glut.h>

// Compile-time simulation parameters.
#define GROUP_SIZE 64			// NOTE(review): presumably the OpenCL work-group size; not used in this header - confirm
#define NUM_PARTICLES 30000		// Particle count that NBody(const char*) assigns to numParticles
#define END_TIME 2				// End time that NBody(const char*) assigns to end_time

//----------------------
// Values for the takeDiagnostics / takeSnapshots switches below.
#define TAKE_DIAGNOSTICS		0
#define DO_NOT_TAKE_DIAGNOSTICS 1
#define TAKE_SNAPSHOTS			2
#define DO_NOT_TAKE_SNAPSHOTS	3

// Both switches start in their "do not take" state. They are `static`, so every
// translation unit that includes this header gets its own private copy.
static int takeDiagnostics = DO_NOT_TAKE_DIAGNOSTICS;
static int takeSnapshots = DO_NOT_TAKE_SNAPSHOTS;

//----------------------
// Integrator selection; NBody(const char*) reads `integrator` to pick its kernel.
#define SIMPLE		0
#define LEAPFROG	1

static int integrator = SIMPLE;

//----------------------

// Runtime configuration. Every variable here is `static`, so each translation
// unit that includes this header works on its own private copy.

static bool displayOpenGL = true;			// If true, the OpenGL display is used;
static bool readInputData = false;			// If true, the particle information is read from an
											// input file. If false, the information is generated
											// via random functions;

static std::string deviceType("gpu");		// Names the device on which the computations should run;

static time_t rawtime;						// Variables used for measuring the duration of the run;
static struct tm * timeinfo;

// The three file names below point at string literals, so they are declared
// `const char*`: binding a literal to a plain `char*` is deprecated in C++03 and
// ill-formed since C++11. The pointers themselves can still be re-assigned.
// NOTE(review): the paths use '\\' as directory separator, which is Windows-specific.

static const char* inputDataFileName = "Input-Output\\input_64.txt";
											// Name of the file which contains the information
											// about the particles (mass, position, velocity);

static const char* snapshotFileName = "Input-Output\\snapshot.txt";
											// Name of the file into which snapshots of the
											// current system are written when required;

static const char* diagnosticFileName = "Input-Output\\diagnostic.txt";
											// Name of the file into which information about the
											// energy conservation is written from time to time;

//----------------------


/**
 * NBody 
 * Class implements OpenCL  NBody sample
 *
 */

class NBody 
{
public:

	//================================================
	// General variables used for all integrators

    cl_double  setupTimeCL;			// Time taken to setup OpenCL resources and building kernel;
    cl_double  kernelTime;			// Time taken to run kernel and read result back;
    
	cl_context context;             // CL context; 
    cl_device_id *devices;          // CL device list; 

    size_t maxWorkGroupSize;        // Max allowed work-items in a group;
    cl_uint maxDimensions;          // Max group dimensions allowed;
    size_t* maxWorkItemSizes;       // Max work-items sizes in each dimensions;
    cl_ulong totalLocalMemory;      // Max local memory allowed;
    cl_ulong usedLocalMemory;       // Used local memory;

	cl_float* initPos;              // Initial position. Used to hold the initial particle position;
    cl_float* initVel;              // Initial velocity. Used to hold the initial particle velocity;
	
	cl_float* pos;					// This is the buffer for positions that resides on the host
									// side of the application. It is linked with the memory
									// buffer for positions "updatedPos";
    
	cl_float* vel;                  // This is the buffer for velocities that resides on the host
									// side of the application. It is linked with the memory
									// buffer for velocities "updatedVel";


    cl_mem   updatedPos;            // This is a memory buffer representing the position of 
									// partciles. It is linked with the application buffer 
									// for positions "pos". updatedPos is set as an argument to
									// the kernel, and also used to read data from the kernel by
									// enqueueing a read command. It will represent the updated
									// values of positions after running once the kernel;
   
	cl_mem   updatedVel;            // This is a memory buffer representing the poelocity of 
									// partciles. It is linked with the application buffer 
									// for velocities "vel". updatedVel is set as an argument to
									// the kernel, and also used to read data from the kernel by
									// enqueueing a read command. It will represent the updated
									// values of velocities after running once the kernel; 
    
	cl_command_queue commandQueue;  // CL command queue; 
    cl_program program;             // CL program; 
    cl_kernel kernel;               // CL kernel; 

    cl_int  numParticles;			// Number of particles in the system;  
		    
    cl_float espSqr;                // Softening Factor;
	cl_float delT;                  // dT (timestep);
	cl_double curr_time_step;		// Reatins the current time step in the simulation;
	cl_double end_time;				// The total time (end time) of the simulation; 
	cl_long taken_steps;			// Tne number of steps taken by now (at the curr_time_step);
	cl_double dt_snap;				// The interval between two snapshots;
	cl_double dt_diag;				// The interval between two diagnostics;
	cl_double curr_snap_time;		// This is the current time that must be reached by 
									// curr_time_step in order to take a snapshot. After this,
									// curr_snap_time will be incremented by dt_snap;
	cl_double curr_diag_time;		// This is the current time that must be reached by 
									// curr_time_step in order to make a diagnostication of the
									// system. After this, curr_diag_time will be incremented by dt_diag;
	
	cl_double Etot_init;			// Will retain the total energy of the system at the beggining
									// of the simulation. It is used to measure the convergence
									// (error) of the simulation;

	bool initFlag;					// This flag variable is used to make some initialisations of
									// variables when the simulation starts. For example if 
									// initFlag is true, than we are starting the simulation and
									// compute "Etot_init" energy. After this first step of simulation
									// it becomes false;

	const char * kernelFileName;	// The name of the file that contains the kernel code;
	const char * kernelFunctionName;// The name of the __kernel function in the kernel file;

	//================================================
	// Variables specific only to Leapfrog integrator:

	cl_float* acc;                  // This is the buffer for accelerations that resides on the host
									// side of the application. It is linked with the memory
									// buffer for accelerations "updatedAcc";

    cl_mem   updatedAcc;            // This is a memory buffer representing the acceleration of 
									// partciles. It is linked with the application buffer 
									// for acc "acc". updatedAcc is set as an argument to
									// the kernel, and also used to read data from the kernel by
									// enqueueing a read command. It will represent the updated
									// values of accelerations after running once the kernel;
	cl_mem   updatedCollTime;
	cl_float* collTime;

	cl_int initialAccComputation;

	

	
private:
	float random(float randMax, float randMin);
   
public:
    /** 
     * Constructor 
     * Initialize member variables
     * @param name name of sample (string)
     */
    explicit NBody(std::string name)
    {
		setupTimeCL = 0;
		kernelTime = 0;
		delT = 0.005f;
		espSqr = 50.0f;
		initPos = NULL;
		initVel = NULL;
		pos = NULL;
		vel = NULL;
		devices = NULL;
		maxWorkItemSizes = NULL;
		kernelFileName = "Kernels\\simpleIntegratorKernel.cl";
		kernelFunctionName = "simple_integrator";
		numParticles = 30;
		curr_time_step = 0;
		end_time = 10;	
		initFlag = true;
		dt_snap = 0.1;
		dt_diag = 0.1;
		curr_snap_time = dt_snap;
		curr_diag_time = dt_diag;
		taken_steps = 0;
    }

    /** 
     * Constructor 
     * Initialize member variables
     * @param name name of sample (const char*)
     */
    explicit NBody(const char* name)
    {
		setupTimeCL = 0;
		kernelTime = 0;
		delT = 0.01f;
		espSqr = 50.0f;
		initPos = NULL;
		initVel = NULL;
		pos = NULL;
		vel = NULL;
		devices = NULL;
		maxWorkItemSizes = NULL;
		if (integrator == LEAPFROG){
			kernelFileName = "Kernels\\leapfrogIntegratorKernel.cl";
			kernelFunctionName = "leapfrog_integrator";
		}else{
			kernelFileName = "Kernels\\simpleIntegratorKernel.cl";
			kernelFunctionName = "simple_integrator";
		}
		numParticles = NUM_PARTICLES;
		curr_time_step = 0;
		end_time = END_TIME;
		initFlag = true;
		dt_snap = 1.0;
		dt_diag = 1.0;
		curr_snap_time = dt_snap;
		curr_diag_time = dt_diag;
		taken_steps = 0;
    }

    ~NBody();

    /**
     * Returns information about the device on  which the
	 * simulation will run. Also some variables will be init
	 * here based on the device information
     * @return 1 on success and 0 on failure
     */
	int getDeviceInfo();

    /**
     * Allocate and initialize host memory array with random values
     * @return 1 on success and 0 on failure
     */
    int setupNBody();

    /**
     * OpenCL related initialisations. 
     * Set up Context, Device list, Command Queue, Memory buffers
     * @return 1 on success and 0 on failure
     */
    int setupCL();

	/**
     * Build CL kernel program executable
     * @return 1 on success and 0 on failure
	 */
	int setupCLProgram();

    /**
     * Build kernels and set values for kernels' arguments
     * @return 1 on success and 0 on failure
     */
    int setupCLKernels();

    /**
     * Enqueue calls to the kernels
     * on to the command queue, wait till end of kernel execution.
     * Get kernel start and end time if timing is enabled
     * @return 1 on success and 0 on failure
     */
    int runCLKernels();

    /**
     * Load a .cl source file as a char* and it will be used
	 * for as a parameter for creating a program
     * @return the char vector
     */
	char * load_program_source(const char *filename);

    /**
     * Override from SDKSample
     * Run OpenCL NBody
     */
    int run();

    /**
     * Override from SDKSample
     * Cleanup memory allocations
     */
    int cleanup();
	
	/**
     * Writes a single snapshot on the output snapshot file.
	 */
	void put_snapshot();
	
	/**
	 * Writes diagnostics on the diagnostics file:
	 * current time; number of integration steps so far;
	 * kinetic, potential, and total energy; absolute and
	 * relative energy errors since the start of the run.
	 */
	void write_diagnostics();
	
	/**
	 * Computes the potential energy of a single particle.
	 */ 
	float epot_particle(int currPart);

	int runCLKernelLeapfrog();

	int enqueueKernel();

};

#endif // NBODY_H_