#ifndef _GPGPU_CUDA_H
#define _GPGPU_CUDA_H

#include "gpgpu_frame.h"
#include <cuda.h>
#include <map>

// Host-side wrapper around a CUDA driver-API texture: it keeps a texture
// reference (CUtexref), the backing CUarray and, for the OpenGL-backed
// variant, a graphics-interop resource handle.
//
// cudaTexture3D derives from this class and cudaController stores textures
// only as cudaTexture*, so the destructor is virtual: deleting a derived
// object through a base pointer is undefined behaviour otherwise.
//
// NOTE(review): the class has a user-declared destructor but no copy control.
// If the destructor releases d_cudaArray / d_cudaHandle, copying an instance
// would release them twice - confirm that instances are only handled by
// pointer.
class cudaTexture {
	public:
	// Construct from an unsigned handle; presumably an OpenGL texture name
	// (cf. d_oglHandle and cudaController::makeGLTexture) - TODO confirm.
	cudaTexture(unsigned int);
	cudaTexture(int w, int h); // Used from cudaTexture3D
	cudaTexture(int w, int h, int chans, std::string fName); // Read data from file
	cudaTexture(int w, int h, int chans, void *data); // Data from pointer
	// Virtual so that destruction through a cudaTexture* is well-defined for
	// derived classes (cudaTexture3D).
	virtual ~cudaTexture();
	void pushData(CUdeviceptr);
	void createFromData(void *data); // From host data
	void map();
	void unMap();
	void initRef(CUtexref);
	//CUdeviceptr getPtr();
	void copyToBuffer(CUdeviceptr);
	bool needRemapping();
	int dims();
	int channels();
	bool floatData();
	void setLinearInterpolation(bool);

	protected:
	unsigned int d_oglHandle;
	int d_dims;
	int d_width, d_height, d_channels;
	CUgraphicsResource d_cudaHandle; // Graphics-interop resource handle
	CUtexref d_cudaRef;              // Texture reference handed in via initRef() - presumably; confirm
	CUarray d_cudaArray;             // Backing CUDA array
	bool d_needRemapping;
	bool d_floatData;
	bool d_linearInterpolation;
};

// Wrapper pairing a CUarray with a surface reference (CUsurfref).
// For the moment the element type is fixed to float.
//
// NOTE(review): no destructor is declared here; if the constructor allocates
// d_cudaArray it is never released by this class - confirm in the
// implementation file.
class cudaSurface {
	public:
	cudaSurface(int w, int h, int chans);
	void initRef(CUsurfref);

	// Read-only queries; the definitions live outside this header.
	int width();
	int height();
	int channels();
	int dims();

	protected:
	// Extents and channel count, one member per line (same order as before,
	// so object layout and initialisation order are unchanged).
	int d_width;
	int d_height;
	int d_channels;
	int d_dims;
	CUarray d_cudaArray;
	CUsurfref d_cudaRef;
};

// Three-dimensional texture for statically uploaded data (tabulations).
// Extends cudaTexture with a depth extent; everything else (texture
// reference, array, flags) is inherited from the base class.
class cudaTexture3D : public cudaTexture {
	public:
	// w, h, d: extents; data: source pointer; floatData: element-type flag.
	// NOTE(review): element layout expected behind `data` is not visible
	// from this header - confirm against the implementation.
	cudaTexture3D(int w, int h, int d, void *data, bool floatData);
	//void initRef(CUtexref);

	private:
	int d_depth;
};

// A single kernel managed through the CUDA driver API. The object keeps the
// kernel source (as a string and/or a KernelSrc), the CUmodule/CUfunction
// handles, the launch configuration, the textures/surfaces bound by name and
// the list of launch parameters.
class cudaKernel {
	public:
	cudaKernel(int computeCap);
	~cudaKernel();
	//cudaKernel(const cudaKernel&);

	// --- Source assembly ---
	std::string name();
	void addString(std::string);
	void addSource(KernelSrc*);
	KernelSrc *getSource();
	void addLine(std::string);
	//std::string getSources();

	// --- Build and launch ---
	void compile(bool keepSources = false);
	void execute();
	// NOTE(review): meaning of the int argument and the unit of the returned
	// float are not visible from this header - confirm in the implementation.
	float timedExecute(const int);
	void setExecConfig(int, int, int, int);
	void preferSM(bool);

	// --- Named device-side variables, textures and surfaces ---
	void setVar(std::string, size_t, void*);
	void bindTexture(cudaTexture*, std::string);
	void bindSurface(cudaSurface*, std::string);
	//void bindTexture3D(cudaTexture3D*, std::string);

	// --- Launch parameters ---
	template <typename T> void pushParam(T);
	void pushParam(size_t, void*);
	void clearParams();

	private:
	// Data members are declared one per line in their original order, so
	// layout and constructor initialisation order are unchanged.
	std::string d_kernelSrcString;
	std::string d_kernelName;
	KernelSrc *d_kernelSrc;
	int d_totalThreadsX;
	int d_threadBlockX;
	int d_totalThreadsY;
	int d_threadBlockY;
	int d_computeCap;
	CUmodule d_module;
	CUfunction d_function;
	// (name, object) pairs recorded by bindTexture()/bindSurface() - presumably; confirm
	std::vector<std::pair<std::string, cudaTexture*> > d_boundTextures;
	std::vector<std::pair<std::string, cudaSurface*> > d_boundSurfaces;
	bool d_preferSM;
	bool d_srcGenerated;
	bool d_compiled;
	bool d_paramsSet;

	// (size, pointer) pairs, one per pushed parameter - presumably; confirm
	std::vector<std::pair<size_t, void*> > d_params;
	//size_t d_paramSize;

	void genSources();
};

// CUDA flavour of the gpgpuKeywords interface. Every method returns a
// std::string; presumably each one yields the CUDA C source text for the
// construct it is named after, for use by the kernel source generator -
// confirm against gpgpuKeywords and the implementation file.
//
// NOTE(review): the signatures (including default arguments) are expected to
// mirror the virtuals declared in gpgpuKeywords, which is not visible here;
// keep them in sync when either side changes.
class cudaKeywords : public gpgpuKeywords {
	public:
	cudaKeywords();
	// Thread / block indexing
	virtual std::string blockX();
	virtual std::string blockY();
	virtual std::string threadX();
	virtual std::string threadY();
	virtual std::string blockDimX();
	virtual std::string blockDimY();
	virtual std::string globalThreadX();
	virtual std::string globalThreadY();
	// Memory spaces, synchronisation and type names
	virtual std::string sharedMem();
	virtual std::string constMem();
	virtual std::string localSync();
	//virtual std::string half2Type();
	virtual std::string halfType();
	virtual std::string regType();
	virtual std::string sharedType();
	virtual std::string globalType();
	// Declarations
	virtual std::string kernelDecl(std::string, std::string);
	virtual std::string funcDecl(std::string name, std::string ret, std::string params, std::vector<std::string> tex);
	// Vector helpers and half-precision access
	virtual std::string float2Operators(); // The standard "+=" etc stuff
	virtual std::string float3Operators();
	virtual std::string vectorOperators(); // Normalizations, cross products etc
	virtual std::string float2Ctor(std::string valX, std::string valY);
	//virtual std::string readHalf2(std::string dest, std::string src);
	virtual std::string readHalf2(std::string dest, std::string src, std::string ptr = "", std::string offset = "0");
	virtual std::string writeHalf2(std::string dest, std::string src, std::string ptr = "", std::string offset = "0");
	virtual std::string writeHalf(std::string dest, std::string src);
	virtual std::string float3Ctor(std::string valX, std::string valY, std::string valZ);
	virtual std::string float4Ctor(std::string valX, std::string valY, std::string valZ, std::string valW);
	// Math functions
	virtual std::string div(std::string, std::string);
	virtual std::string rcp(std::string);
	virtual std::string floorf(std::string);
	virtual std::string sqrt(std::string);
	virtual std::string rsqrt(std::string);
	virtual std::string pow(std::string, std::string);
	virtual std::string sin(std::string);
	virtual std::string cos(std::string);
	virtual std::string sincos(std::string angle, std::string sintarget, std::string costarget);
	virtual std::string exp2(std::string);
	virtual std::string maxf(std::string, std::string);
	virtual std::string minf(std::string, std::string);
	virtual std::string absf(std::string);
	// Texture and surface declaration / access
	virtual std::string tex2DDeclFloat(std::string);
	virtual std::string tex2DDeclFloat4(std::string);
	virtual std::string tex3DDeclFloat(std::string);
	virtual std::string tex2DSample1(std::string id, std::string coordX, std::string coordY);
	virtual std::string tex2DSample4(std::string id, std::string coordX, std::string coordY);
	virtual std::string tex3DSample(std::string id, std::string coordX, std::string coordY, std::string coordZ);
	virtual std::string surf2DWrite(std::string id, std::string coordX, std::string coordY, std::string type, std::string value);
	virtual std::string surf2DRead(std::string id, std::string coordX, std::string coordY, std::string type);
	
	// Atomics
	virtual std::string atomicMin(std::string dest, std::string value);
	virtual std::string atomicMax(std::string dest, std::string value);
	virtual std::string atomicAdd(std::string dest, std::string value);
	// Generated-file boilerplate
	virtual std::string header(bool h = true);
	virtual std::string fileExt();
};

// CUDA implementation of gpgpuController. Holds a CUDA device and context
// plus integer-keyed tables of device buffers (CUdeviceptr), kernels,
// textures and surfaces, each with its own counter.
//
// Most methods take or return plain ints; presumably these are the keys into
// the d_mem / d_kernel / d_texture / d_surface maps below - confirm in the
// implementation file.
//
// NOTE(review): 3D textures have no map of their own (the d_texture3D map is
// commented out), so they presumably live in d_texture as cudaTexture* -
// confirm.
class cudaController : public gpgpuController {
	public:
	// ctx defaults to 0. NOTE(review): the original comment suggests 0 means
	// "no existing CUDA/OGL environment" and -1 means "use OGL interop" -
	// confirm against the constructor's implementation.
	cudaController(int ctx = 0);
	~cudaController();
	// Buffers
	int newBuffer(size_t, const void *data = NULL);
	void fillTexFromBuffer(int, int);
	void fillBufferFromTex(int, int);
	void clearBuffer(int, void *fourB);
	void uploadToBuffer(int, void*, size_t);
	void *getBufferData(int, size_t&);
	void *getBufferPtr(int);
	// Kernels
	int newKernel(std::string);
	int newKernel(KernelSrc*);
	void writeKernelSrc(int, std::string);
	// Textures and surfaces
	int makeGLTexture(unsigned int);
	int create3DTexture(int w, int h, int d, void*, bool floatData = false);
	int create2DTextureFromFile(int w, int h, int chans, std::string fName);
	int create2DTextureFromData(int w, int h, int chans, void *data);
	int create2DSurface(int w, int h, int chans);
	void nearestSampling(int t);
	//int getBufferFromTex(int);
	// Kernel parameters, bindings and launch
	void setBufferParam(int, int);
	void setDataParam(int, size_t, void*);
	void setVar(int, std::string, size_t, void*);
	void clearKernelParams(int);
	void setTex(int, int, std::string);
	void setSurf(int, int, std::string);
	//void setTex3D(int, int, std::string);
	void setKernelExecConf(int, int totalX, int blockX, int totalY = 1, int blockY = 1);
	void preferSM(int);
	//void setKernelExecConf(int, int, int);
	void executeKernel(int);
	void reportTiming(int);
	int getComputeCap();

	// Static error helper taking an int code and an optional description.
	// NOTE(review): presumably the int is a CUresult and a failure is
	// reported/aborted - behaviour is not visible from this header.
	static void checkError(int, std::string desc = "");

	private:
	CUdevice d_cuDev;
	CUcontext d_cuCtx;
	std::map<int, CUdeviceptr> d_mem;
	int d_memCounter;
	std::map<int, cudaKernel*> d_kernel;
	int d_kernelCounter;
	std::map<int, cudaTexture*> d_texture;
	std::map<int, cudaSurface*> d_surface;
	int d_textureCounter;
	int d_surfaceCounter;
	/*std::map<int, cudaTexture3D*> d_texture3D;
	int d_texture3DCounter;*/
};

#endif // _GPGPU_CUDA_H
