#ifndef _GPGPU_OPENCL_H
#define _GPGPU_OPENCL_H

#include "gpgpu_frame.h"
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <map>

// Wraps an OpenCL memory object (cl_mem) used as a texture, together with
// its size information and the CL context / command queue it was created with.
// This class is the base of openCLTexture3D and instances are held through
// openCLTexture* (see openCLController::d_texture), hence the virtual
// destructor below.
class openCLTexture {
	public:
	// Presumably wraps an existing OpenGL texture identified by its handle
	// (cf. d_oglHandle) -- TODO confirm against the implementation.
	openCLTexture(unsigned int, cl_context, cl_command_queue);
	// Texture of w x h elements; floatData is recorded to tell float storage
	// from non-float storage (exact formats are chosen by the implementation).
	openCLTexture(int w, int h, bool floatData, cl_context, cl_command_queue);
	// Texture of w x h elements with `chans` channels; fName presumably names
	// the file the contents are read from -- TODO confirm.
	openCLTexture(int w, int h, int chans, std::string fName, cl_context, cl_command_queue);
	// Virtual so that a derived texture (e.g. openCLTexture3D) is destroyed
	// correctly when deleted through an openCLTexture*. Deliberately empty:
	// it does not release any OpenCL or OpenGL resources, so existing
	// ownership of d_clMem is unchanged.
	virtual ~openCLTexture() {}
	// Pointer to the underlying cl_mem handle (presumably &d_clMem), suitable
	// for passing as a kernel argument.
	cl_mem *getMemPtr();
	// Simple accessors; presumably return the matching d_* members below.
	int dims();
	int channels();
	int width();
	int height();
	bool floatData();
	bool needRemapping();
	// NOTE(review): map()/unMap() look like the acquire/release pair for
	// textures shared with OpenGL (see needRemapping()) -- confirm.
	void map();
	void unMap();


	protected:
	cl_context d_clCtx;            // Context the memory object belongs to
	cl_command_queue d_clCmdQueue; // Queue used for operations on this texture
	cl_mem d_clMem;                // The wrapped OpenCL memory object
	unsigned int d_oglHandle;      // OpenGL texture handle, when created from one
	int d_dims;                    // Dimensionality reported by dims()
	int d_width, d_height, d_channels;
	bool d_floatData;
	bool d_needRemapping;
};

// Three-dimensional variant of openCLTexture; adds a depth to the inherited
// width/height.
class openCLTexture3D : public openCLTexture {
	public:
	// NOTE(review): the argument order presumably mirrors
	// openCLController::create3DTexture(w, h, d, data, floatData), followed by
	// the CL context and command queue -- confirm against the implementation.
	openCLTexture3D(int, int, int, void*, bool, cl_context, cl_command_queue);

	private:
	int d_depth; // Size along the third axis
};

// A single OpenCL kernel: holds its source (as a raw string and/or a
// KernelSrc), the resulting cl_program / cl_kernel, the queued parameters,
// the textures bound to it and its launch configuration.
class openCLKernel {
	public:
	openCLKernel(cl_context, cl_command_queue, cl_device_id);
	// Two ways of supplying kernel source: a plain string or a KernelSrc object.
	void addString(std::string);
	void addSource(KernelSrc*);
	KernelSrc *getSource(); // Calling this will result in processing of the input source
	// Queue a kernel argument. The (size_t, void*) overload takes the argument
	// size and a pointer to its value, matching the d_params element type.
	// NOTE(review): the template overload has no definition in this header, so
	// instantiations must be provided where the definition is visible.
	template<class T> void pushParam(T);
	void pushParam(size_t, void*);
	void clearParams();
	void compile();
	// NOTE(review): the four ints presumably map to d_totalThreadsX,
	// d_threadBlockX, d_totalThreadsY, d_threadBlockY, in the order used by
	// openCLController::setKernelExecConf -- confirm.
	void setExecConfig(int, int, int, int);
	void execute();
	// Presumably: events to wait for, and an optional event signalled on
	// completion -- TODO confirm.
	void execute(std::vector<cl_event>, cl_event*);
	// Presumably runs the kernel the given number of times and returns a
	// timing figure; the unit is not visible from this header.
	float timedExecute(const int);
	void setVar(std::string, size_t, void*);
	// Associates a texture with a name; stored in d_boundTextures.
	void bindTexture(openCLTexture*, std::string);
	std::string name();
	cl_device_id getDev();

	// Static so it can be handed to a C API; the signature has the shape of
	// the clBuildProgram notification callback (program, user data).
	static void buildNotify(cl_program, void*);

	private:
	std::string d_kernelName; // Also the func name
	std::string d_kernelSrcString;
	KernelSrc *d_kernelSrc;
	cl_program d_clProgram;
	cl_kernel d_clKernel;
	
	// Unfortunately, we also need these here
	cl_device_id d_clDev;
	cl_context d_clCtx;
	cl_command_queue d_clCmdQueue;
	// Launch configuration: total thread count and block size per axis.
	int d_totalThreadsX, d_threadBlockX;
	int d_totalThreadsY, d_threadBlockY;

	// Queued kernel arguments as (size in bytes, pointer to value) pairs.
	std::vector<std::pair<size_t, void*> > d_params;
	// Textures bound to the kernel, paired with the name they were bound under.
	std::vector<std::pair<std::string, openCLTexture*> > d_boundTextures;

	// State flags for source generation, compilation and argument setup.
	bool d_srcGenerated, d_compiled, d_paramsSet;
	
	// Internal helpers used while preparing the source for compilation.
	std::string findKernel(bool*);
	void addTextureDecls(int);
	void genSources();
};

// OpenCL implementation of the gpgpuController interface. Owns the device,
// context and command queue, and hands out integer handles for the buffers,
// kernels and textures it tracks in the maps below.
class openCLController : public gpgpuController {
	public:
	// NOTE(review): reading of the original comment -- ctx = 0 (the default)
	// is for when there is no existing CUDA/OGL environment, ctx = -1 requests
	// OGL interop. Confirm against the implementation.
	openCLController(int ctx = 0);
	~openCLController();
	// Buffer management. The int arguments/returns are presumably handles
	// (keys into d_mem) -- TODO confirm.
	int newBuffer(size_t, const void *data = NULL);
	void fillTexFromBuffer(int, int);
	void nearestSampling(int t);
	// fourB: presumably a pointer to the four-byte pattern to clear with.
	void clearBuffer(int, void *fourB);
	void uploadToBuffer(int, void*, size_t);
	void *getBufferData(int, size_t&);
	void *getBufferPtr(int);
	// Kernel creation, from either a source string or a KernelSrc; the
	// returned int is presumably the key into d_kernel.
	int newKernel(std::string);
	int newKernel(KernelSrc*);
	void writeKernelSrc(int, std::string);
	// Texture creation; the returned int is presumably the key into d_texture.
	int makeGLTexture(unsigned int);
	int create3DTexture(int w, int h, int d, void*, bool floatData = false);
	int create2DTextureFromFile(int w, int h, int chans, std::string fName);
	// Kernel argument setup; the first int presumably selects the kernel.
	void setBufferParam(int, int);
	void setDataParam(int, size_t, void*);
	void setVar(int, std::string, size_t, void*);
	void clearKernelParams(int);
	void setTex(int, int, std::string);
	//void setTex3D(int, int, std::string);
	// Launch configuration per axis; the Y axis defaults to a single thread.
	void setKernelExecConf(int, int totalX, int blockX, int totalY = 1, int blockY = 1);
	void preferSM(int);
	void executeKernel(int);
	void reportTiming(int);

	// Takes an integer status code and an optional description; how a failure
	// is reported is decided by the implementation.
	static void checkError(int, std::string desc = "");

	private:
	cl_device_id d_clDev;
	cl_context d_clCtx;
	cl_command_queue d_clCmdQueue;
	
	// Buffers by integer handle; d_memCounter is presumably the next handle.
	std::map<int, cl_mem> d_mem;
	int d_memCounter;
	std::vector<float> d_clearBuf; // When CL 1.2 not supported and falling back

	// Kernels and textures by integer handle, with their handle counters.
	std::map<int, openCLKernel*> d_kernel;
	int d_kernelCounter;
	std::map<int, openCLTexture*> d_texture;
	int d_textureCounter;
};

// OpenCL flavour of the gpgpuKeywords interface. Every method returns a
// std::string; presumably each is the OpenCL C source text for the named
// construct, built from the given operand strings -- confirm against the
// implementation and the gpgpuKeywords base class.
class openCLKeywords : public gpgpuKeywords {
	public:
	openCLKeywords();
	// Thread / block indexing.
	virtual std::string blockX();
	virtual std::string blockY();
	virtual std::string threadX();
	virtual std::string threadY();
	virtual std::string blockDimX();
	virtual std::string blockDimY();
	virtual std::string globalThreadX();
	virtual std::string globalThreadY();
	// Memory qualifiers and synchronisation.
	virtual std::string sharedMem();
	virtual std::string constMem();
	virtual std::string localSync();
	// Declarations.
	virtual std::string kernelDecl(std::string name, std::string params);
	virtual std::string funcDecl(std::string name, std::string ret, std::string params, std::vector<std::string> tex);
	virtual std::string float2Operators(); // The standard "+=" etc stuff
	virtual std::string float3Operators();
	virtual std::string vectorOperators(); // Normalizations, cross products etc
	virtual std::string float2Ctor(std::string valX, std::string valY);
	// Type names.
	virtual std::string halfType();
	virtual std::string regType();
	virtual std::string sharedType();
	virtual std::string globalType();
	// Half-precision loads and stores.
	//virtual std::string readHalf2(std::string dest, std::string src);
	virtual std::string readHalf2(std::string dest, std::string src, std::string ptr = "", std::string offset = "0");
	virtual std::string writeHalf2(std::string dest, std::string src, std::string ptr = "", std::string offset = "0");
	virtual std::string writeHalf(std::string dest, std::string src);
	virtual std::string float3Ctor(std::string valX, std::string valY, std::string valZ);
	virtual std::string float4Ctor(std::string valX, std::string valY, std::string valZ, std::string valW);
	// Math functions; arguments are the operand expressions as strings.
	virtual std::string div(std::string, std::string);
	virtual std::string rcp(std::string);
	virtual std::string floorf(std::string);
	virtual std::string sqrt(std::string);
	virtual std::string rsqrt(std::string);
	virtual std::string pow(std::string, std::string);
	virtual std::string sin(std::string);
	virtual std::string cos(std::string);
	virtual std::string sincos(std::string angle, std::string sintarget, std::string costarget);
	virtual std::string exp2(std::string);
	virtual std::string maxf(std::string, std::string);
	virtual std::string minf(std::string, std::string);
	virtual std::string absf(std::string);
	// Texture declarations and sampling.
	virtual std::string tex2DDeclFloat(std::string);
	virtual std::string tex2DDeclFloat4(std::string);
	virtual std::string tex3DDeclFloat(std::string);
	virtual std::string tex2DSample1(std::string id, std::string coordX, std::string coordY);
	virtual std::string tex2DSample4(std::string id, std::string coordX, std::string coordY);
	virtual std::string tex3DSample(std::string id, std::string coordX, std::string coordY, std::string coordZ);
	
	// Atomics and file-level items.
	virtual std::string atomicMin(std::string dest, std::string value);
	virtual std::string atomicMax(std::string dest, std::string value);
	virtual std::string header(bool h = true); // Params:  use halfs
	virtual std::string fileExt();

	private:
	// NOTE(review): presumably prepended to texture identifiers -- confirm.
	std::string d_texPrefix;

	// Helper taking and returning a source string; judging by the name it
	// rewrites pointer declarations to use the global address space.
	std::string globalizePointers(std::string);
};

#endif // _GPGPU_OPENCL_H
