#include "config.h" // For NVCC_PATH + PTXAS_PATH
#include "gpgpu_cuda.h"
#include <GL/gl.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#include <cudaGL.h>
#include <cstdio>
#include <string>
#include <map>
#include <cstring>
#include <unistd.h>

//#define AVOID_NATIVE

// Default constructor; the body is intentionally empty.
cudaKeywords::cudaKeywords()
{
}

// CUDA spelling of the block index along x.
std::string cudaKeywords::blockX() {
	const std::string keyword("blockIdx.x");
	return keyword;
}

// CUDA spelling of the block index along y.
std::string cudaKeywords::blockY() {
	const std::string keyword("blockIdx.y");
	return keyword;
}

// CUDA spelling of the block size along x.
std::string cudaKeywords::blockDimX() {
	const std::string keyword("blockDim.x");
	return keyword;
}

// CUDA spelling of the block size along y.
std::string cudaKeywords::blockDimY() {
	const std::string keyword("blockDim.y");
	return keyword;
}

// Builds the declaration line of a kernel called `name` taking `params`.
// The result starts with a newline and declares the kernel as
// extern "C" __global__ void; no body or trailing semicolon is emitted.
std::string cudaKeywords::kernelDecl(std::string name, std::string params) {
	std::string decl("\nextern \"C\" __global__ void ");
	decl += name;
	decl += "(";
	decl += params;
	decl += ")";
	return decl;
}

// Builds the declaration of a __device__ function `name` returning `ret`
// and taking `params`.  The texture list `tex` is not used by this
// implementation.
std::string cudaKeywords::funcDecl(std::string name, std::string ret, std::string params, std::vector<std::string> tex) {
	std::string decl("__device__ ");
	decl += ret;
	decl += " ";
	decl += name;
	decl += "(";
	decl += params;
	decl += ")";
	return decl;
}

// CUDA spelling of the thread index along x within its block.
std::string cudaKeywords::threadX() {
	const std::string keyword("threadIdx.x");
	return keyword;
}

// CUDA spelling of the thread index along y within its block.
std::string cudaKeywords::threadY() {
	const std::string keyword("threadIdx.y");
	return keyword;
}

// Expression for the grid-wide thread index along x.  The expression is
// returned without surrounding parentheses.
std::string cudaKeywords::globalThreadX() {
	const std::string expr("blockIdx.x*blockDim.x + threadIdx.x");
	return expr;
}

// Expression for the grid-wide thread index along y.  The expression is
// returned without surrounding parentheses.
std::string cudaKeywords::globalThreadY() {
	const std::string expr("blockIdx.y*blockDim.y + threadIdx.y");
	return expr;
}

// Emits a make_float2(...) call built from the two component expressions.
std::string cudaKeywords::float2Ctor(std::string valX, std::string valY) {
	std::string call("make_float2(");
	call += valX + ", ";
	call += valY + ")";
	return call;
}

// Emits a make_float3(...) call built from the three component expressions.
std::string cudaKeywords::float3Ctor(std::string valX, std::string valY, std::string valZ) {
	std::string call("make_float3(");
	call += valX + ", ";
	call += valY + ", ";
	call += valZ + ")";
	return call;
}

// Emits a make_float4(...) call built from the four component expressions.
std::string cudaKeywords::float4Ctor(std::string valX, std::string valY, std::string valZ, std::string valW) {
	std::string call("make_float4(");
	call += valX + ", ";
	call += valY + ", ";
	call += valZ + ", ";
	call += valW + ")";
	return call;
}

/*std::string cudaKeywords::float2Half(std::string val) {
	return std::string("__float2half_rn(") + val + ")";
}

std::string cudaKeywords::half2Float(std::string val) {
	return std::string("__half2float(") + val + ")";
}*/

// Type name used in generated code for storing a half-precision value.
std::string cudaKeywords::halfType() {
	return std::string("unsigned short");
}

// Emits an assignment that packs src.x and src.y, each converted with
// __float2half_rn, into one value: .x in the low 16 bits, .y shifted up by 16.
// When `ptr` is non-empty the destination is dereferenced ("*dest = ...").
// `offset` is not used by this implementation.
std::string cudaKeywords::writeHalf2(std::string dest, std::string src, std::string ptr, std::string offset) {
	const std::string deref = (ptr != "") ? "*" : "";
	const std::string lowHalf = "__float2half_rn(" + src + ".x)";
	const std::string highHalf = "(__float2half_rn(" + src + ".y)<<16)";
	return deref + dest + " = " + lowHalf + " | " + highHalf;
}

// Emits "dest = __float2half_rn(src)".
std::string cudaKeywords::writeHalf(std::string dest, std::string src) {
	std::string stmt(dest);
	stmt += " = __float2half_rn(";
	stmt += src;
	stmt += ")";
	return stmt;
}

// Emits a braced block that loads a packed 32-bit value into a local
// `inValue` and unpacks it into `dest` as a float2: the low 16 bits become
// .x, the high 16 bits .y (both through __half2float).
// With an empty `ptr` the value is `src` itself; otherwise `src` is cast to
// unsigned int*, advanced by `offset` and dereferenced.
std::string cudaKeywords::readHalf2(std::string dest, std::string src, std::string ptr, std::string offset) {
	const std::string packed = (ptr == "")
		? src
		: "*((unsigned int*)" + src + " + " + offset + ")";

	std::string code("\n  {\n    unsigned int inValue = ");
	code += packed;
	code += ";\n    ";
	code += dest;
	code += "  = make_float2(__half2float(inValue&0xffff), __half2float(inValue>>16));\n  }\n";
	return code;
}

// Qualifier for register variables; this implementation emits none.
std::string cudaKeywords::regType() {
	return std::string();
}

// Qualifier for shared-memory pointer types; this implementation emits none.
std::string cudaKeywords::sharedType() {
	return std::string();
}

// Qualifier for global-memory pointer types; this implementation emits none.
std::string cudaKeywords::globalType() {
	return std::string();
}

// Emits "atomicMin(dest, value)".  `dest` is passed through as written; no
// address-of operator is added here.
std::string cudaKeywords::atomicMin(std::string dest, std::string value) {
	std::string call("atomicMin(");
	call += dest + ", ";
	call += value + ")";
	return call;
}

// Emits "atomicMax(dest, value)".  `dest` is passed through as written; no
// address-of operator is added here.
std::string cudaKeywords::atomicMax(std::string dest, std::string value) {
	std::string call("atomicMax(");
	call += dest + ", ";
	call += value + ")";
	return call;
}

// Emits "atomicAdd(&dest, value)".  Unlike atomicMin/atomicMax in this file,
// the address of `dest` is taken in the generated code.
std::string cudaKeywords::atomicAdd(std::string dest, std::string value) {
	std::string call("atomicAdd(&");
	call += dest + ", ";
	call += value + ")";
	return call;
}

// Block-level barrier call (without a trailing semicolon).
std::string cudaKeywords::localSync() {
	const std::string barrier("__syncthreads()");
	return barrier;
}

// Declaration qualifier for shared memory.
std::string cudaKeywords::sharedMem() {
	const std::string qualifier("__shared__");
	return qualifier;
}

// Declaration qualifier for constant memory.
std::string cudaKeywords::constMem() {
	const std::string qualifier("__constant__");
	return qualifier;
}

// Emits a float division of a by b: the fast-math intrinsic __fdividef by
// default, or a plain parenthesised "/" when AVOID_NATIVE is defined.
std::string cudaKeywords::div(std::string a, std::string b) {
#ifdef AVOID_NATIVE
	const std::string expr = "(" + a + "/" + b + ")";
#else
	const std::string expr = "__fdividef(" + a + ", " + b + ")";
#endif
	return expr;
}

// Emits "floorf(v)".
std::string cudaKeywords::floorf(std::string v) {
	return "floorf(" + v + ")";
}

// Emits the reciprocal of v as "(1.0f/(v))"; v is parenthesised so that a
// compound expression is divided as a whole.
std::string cudaKeywords::rcp(std::string v) {
	return "(1.0f/(" + v + "))";
}

// Emits "sqrtf(v)".
std::string cudaKeywords::sqrt(std::string v) {
	return "sqrtf(" + v + ")";
}

// Emits a reciprocal square root of v: "rsqrtf(v)" by default, or
// "(1.0f/sqrtf(v))" when AVOID_NATIVE is defined.
std::string cudaKeywords::rsqrt(std::string v) {
#ifdef AVOID_NATIVE
	const std::string expr = "(1.0f/sqrtf(" + v + "))";
#else
	const std::string expr = "rsqrtf(" + v + ")";
#endif
	return expr;
}

// Emits "powf(a, b)".
std::string cudaKeywords::pow(std::string a, std::string b) {
	return "powf(" + a + ", " + b + ")";
}

// Emits a sine call: the fast-math intrinsic __sinf by default, or sinf when
// AVOID_NATIVE is defined.
std::string cudaKeywords::sin(std::string v) {
#ifdef AVOID_NATIVE
	const std::string expr = "sinf(" + v + ")";
#else
	const std::string expr = "__sinf(" + v + ")";
#endif
	return expr;
}

// Emits a cosine call: __cosf by default, or cosf when AVOID_NATIVE is
// defined.
std::string cudaKeywords::cos(std::string v) {
#ifdef AVOID_NATIVE
	const std::string expr = "cosf(" + v + ")";
#else
	const std::string expr = "__cosf(" + v + ")";
#endif
	return expr;
}

// Emits "exp2f(v)".
std::string cudaKeywords::exp2(std::string v) {
	return "exp2f(" + v + ")";
}

// Emits code that stores sin(angle) in `sintarget` and cos(angle) in
// `costarget`.  By default this is a single __sincosf call taking the
// addresses of both targets (no trailing semicolon); with AVOID_NATIVE it is
// two separate assignments, each terminated by a semicolon.
std::string cudaKeywords::sincos(std::string angle, std::string sintarget, std::string costarget) {
#ifdef AVOID_NATIVE
	const std::string sinStmt = sintarget + " = sinf(" + angle + "); ";
	const std::string cosStmt = costarget + " = cosf(" + angle + ");";
	return sinStmt + cosStmt;
#else
	std::string call("__sincosf(");
	call += angle;
	call += ", &" + sintarget;
	call += ", &" + costarget;
	call += ")";
	return call;
#endif
}

// Emits "fmaxf(a, b)".
std::string cudaKeywords::maxf(std::string a, std::string b) {
	return "fmaxf(" + a + ", " + b + ")";
}

// Emits "fminf(a, b)".
std::string cudaKeywords::minf(std::string a, std::string b) {
	return "fminf(" + a + ", " + b + ")";
}

// Emits "fabsf(a)".
std::string cudaKeywords::absf(std::string a) {
	return "fabsf(" + a + ")";
}

// Declares a 2D single-channel float texture reference named `id`
// (no trailing semicolon).
std::string cudaKeywords::tex2DDeclFloat(std::string id) {
	std::string decl("texture<float, 2, cudaReadModeElementType> ");
	decl += id;
	return decl;
}

// Declares a 3D single-channel float texture reference named `id`
// (no trailing semicolon).
std::string cudaKeywords::tex3DDeclFloat(std::string id) {
	std::string decl("texture<float, 3, cudaReadModeElementType> ");
	decl += id;
	return decl;
}

// Declares a 2D float4 texture reference named `id` (no trailing semicolon).
std::string cudaKeywords::tex2DDeclFloat4(std::string id) {
	std::string decl("texture<float4, 2, cudaReadModeElementType> ");
	decl += id;
	return decl;
}

// Emits "tex2D(id, coordX, coordY)" for sampling a 2D texture.
std::string cudaKeywords::tex2DSample4(std::string id, std::string coordX, std::string coordY) {
	std::string call("tex2D(");
	call += id + ", ";
	call += coordX + ", ";
	call += coordY + ")";
	return call;
}

// Emits "tex2D(id, coordX, coordY)".  The output is the same as for
// tex2DSample4: no ".x" component selector is appended here.
std::string cudaKeywords::tex2DSample1(std::string id, std::string coordX, std::string coordY) {
	std::string call("tex2D(");
	call += id + ", ";
	call += coordX + ", ";
	call += coordY + ")";
	return call;
}

// Emits "tex3D(id, coordX, coordY, coordZ)" for sampling a 3D texture.
std::string cudaKeywords::tex3DSample(std::string id, std::string coordX, std::string coordY, std::string coordZ) {
	std::string call("tex3D(");
	call += id + ", ";
	call += coordX + ", ";
	call += coordY + ", ";
	call += coordZ + ")";
	return call;
}

// Emits "surf2Dwrite<type>(value, id, coordX, coordY)".  Note the argument
// order in the generated call: the value comes before the surface id.
std::string cudaKeywords::surf2DWrite(std::string id, std::string coordX, std::string coordY, std::string type, std::string value) {
	std::string call("surf2Dwrite<");
	call += type + ">(";
	call += value + ", ";
	call += id + ", ";
	call += coordX + ", ";
	call += coordY + ")";
	return call;
}

// Emits "surf2Dread<type>(id, coordX, coordY)".
std::string cudaKeywords::surf2DRead(std::string id, std::string coordX, std::string coordY, std::string type) {
	std::string call("surf2Dread<");
	call += type + ">(";
	call += id + ", ";
	call += coordX + ", ";
	call += coordY + ")";
	return call;
}


// Returns CUDA source text defining __inline__ __device__ operators for
// float2: ==, != (exact component comparison), binary + and -, unary -,
// *= with a float and with a float2 (component-wise), * with a float on
// either side, += and -=.
// Everything between the quotes is emitted verbatim, including the
// "//return fabs(...)" lines, which become comments in the generated code.
std::string cudaKeywords::float2Operators() {
	return std::string("\n\
__inline__ __device__ bool operator==(const float2 a, const float2 b) {\n\
	//return fabs(a.x - b.x) < 1e-6f && fabs(a.y - b.y) < 1e-6f;\n\
	return a.x == b.x && a.y == b.y;\n\
}\n\
__inline__ __device__ bool operator!=(const float2 a, const float2 b) {\n\
	//return fabs(a.x - b.x) > 1e-6f || fabs(a.y - b.y) > 1e-6f;\n\
	return a.x != b.x || a.y != b.y;\n\
}\n\
__inline__ __device__ float2 operator+(const float2 a, const float2 b) {\n\
	return make_float2(a.x+b.x, a.y+b.y);\n\
}\n\
\n\
__inline__ __device__ float2 operator-(const float2 a, const float2 b) {\n\
	return make_float2(a.x-b.x, a.y-b.y);\n\
}\n\
\n\
__inline__ __device__ float2 operator-(const float2 a) {\n\
	return make_float2(-a.x, -a.y);\n\
}\n\
\n\
__inline__ __device__ void operator*=(float2 &a, const float c) {\n\
	a.x *= c;\n\
	a.y *= c;\n\
}\n\
__inline__ __device__ void operator*=(float2 &a, const float2 c) {\n\
	a.x *= c.x;\n\
	a.y *= c.y;\n\
}\n\
\n\
__inline__ __device__ float2 operator*(const float2 a, const float c) {\n\
	return make_float2(a.x*c, a.y*c);\n\
}\n\
\n\
__inline__ __device__ float2 operator*(const float c, const float2 a) {\n\
	return make_float2(a.x*c, a.y*c);\n\
}\n\
\n\
__inline__ __device__ void operator+=(float2 &a, const float2 b) {\n\
	a.x += b.x;\n\
	a.y += b.y;\n\
}\n\n\
__inline__ __device__ void operator-=(float2 &a, const float2 b) {\n\
	a.x -= b.x;\n\
	a.y -= b.y;\n\
}\n");
}

// Returns CUDA source text defining __inline__ __device__ operators for
// float3: binary and unary -, * with a float on either side, binary +,
// += and *= with a float.  The text is emitted verbatim.
// Unlike the float2 set, no ==, !=, -= or component-wise *= is defined here.
std::string cudaKeywords::float3Operators() {
	return std::string("\n\
__inline__ __device__ float3 operator-(const float3 a, const float3 b) {\n\
	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);\n\
}\n\
\n\
__inline__ __device__ float3 operator-(const float3 a) {\n\
	return make_float3(-a.x, -a.y, -a.z);\n\
}\n\
\n\
__inline__ __device__ float3 operator*(const float3 a, const float c) {\n\
	return make_float3(a.x*c, a.y*c, a.z*c);\n\
}\n\
\n\
__inline__ __device__ float3 operator*(const float c, const float3 a) {\n\
	return make_float3(c*a.x, c*a.y, c*a.z);\n\
}\n\
\n\
__inline__ __device__ float3 operator+(const float3 a, const float3 b) {\n\
	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);\n\
}\n\
\n\
__inline__ __device__ void operator+=(float3 &a, const float3 b) {\n\
	a.x += b.x;\n\
	a.y += b.y;\n\
	a.z += b.z;\n\
}\n\
\n\
__inline__ __device__ void operator*=(float3 &a, const float c) {\n\
	a.x *= c;\n\
	a.y *= c;\n\
	a.z *= c;\n\
}\n\n");
}

// Returns CUDA source text defining __inline__ __device__ vector helpers:
// SSEOdot3/SSEOdot2 (dot products), SSEOlength3/SSEOlength2 (sqrtf of the
// squared length), SSEOnormalize3/SSEOnormalize2 (return a scaled copy),
// two normalizeThis overloads (scale a float3 or float2 in place) and
// SSEOcross3 (cross product).  The text is emitted verbatim.
// The normalize helpers multiply by rsqrtf of the squared length and contain
// no guard for zero-length input.
std::string cudaKeywords::vectorOperators() {
	return std::string("\n\
__inline__ __device__ float SSEOdot3(const float3 a, const float3 b) {\n\
	return a.x*b.x + a.y*b.y + a.z*b.z;\n\
}\n\
\n\
__inline__ __device__ float SSEOdot2(const float2 a, const float2 b) {\n\
	return a.x*b.x + a.y*b.y;\n\
}\n\
\n\
__inline__ __device__ float SSEOlength3(const float3 a) {\n\
	return sqrtf(a.x*a.x + a.y*a.y + a.z*a.z);\n\
}\n\
\n\
__inline__ __device__ float SSEOlength2(const float2 a) {\n\
	return sqrtf(a.x*a.x + a.y*a.y);\n\
}\n\
\n\
__inline__ __device__ float3 SSEOnormalize3(const float3 a) {\n\
	float coef = rsqrtf(a.x*a.x + a.y*a.y + a.z*a.z);\n\
	return make_float3(a.x*coef, a.y*coef, a.z*coef);\n\
}\n\
\n\
__inline__ __device__ void normalizeThis(float3 &a) {\n\
	float coef = rsqrtf(a.x*a.x + a.y*a.y + a.z*a.z);\n\
	a.x *= coef;\n\
	a.y *= coef;\n\
	a.z *= coef;\n\
}\n\
\n\
__inline__ __device__ float2 SSEOnormalize2(const float2 a) {\n\
	float coef = rsqrtf(a.x*a.x + a.y*a.y);\n\
	return make_float2(a.x*coef, a.y*coef);\n\
}\n\
\n\
__inline__ __device__ void normalizeThis(float2 &v) { \n\
	const float coef = rsqrtf(v.x*v.x + v.y*v.y);\n\
	v.x *= coef;\n\
	v.y *= coef;\n\
}\n\
\n\
__inline__ __device__ float3 SSEOcross3(const float3 a, const float3 b) {\n\
	return make_float3(\n\
			a.y*b.z - a.z*b.y,\n\
			a.z*b.x - a.x*b.z,\n\
			a.x*b.y - a.y*b.x);\n\
}\n\n"); 
}

// Text placed at the top of generated sources: defines LANG_CUDA and leaves
// the "#TEXDECL#" anchor that cudaKernel::genSources() later replaces with
// the texture/surface declarations.  The flag `h` is not used here.
std::string cudaKeywords::header(bool h) {
	std::string text("#define LANG_CUDA\n");
	text += "#TEXDECL#\n";
	return text;
}

// File name extension used for generated source files.
std::string cudaKeywords::fileExt() {
	const std::string extension(".cu");
	return extension;
}



// Initializes the CUDA driver API, picks device 0 and creates a context.
//   ctx ==  0 : a plain context is created (cuCtxCreate)
//   ctx == -1 : an OpenGL-interop context is created (cuGLCtxCreate)
//   otherwise : not supported, initialization fails
// Any failure is reported on stderr and rethrown as
// std::string("GPGPU initialization failed\n").
cudaController::cudaController(int ctx) {
	try {
		checkError(cuInit(0));
		int deviceCount = 0;
		checkError(cuDeviceGetCount(&deviceCount));

		// This must be a std::string: a thrown string literal would not be
		// caught by the handler below and would escape the constructor
		// without the diagnostic.
		if (!deviceCount)
			throw std::string("No GPGPU devices");

		// Picking the first device
		checkError(cuDeviceGet(&d_cuDev, 0));
		if (!ctx)
			checkError(cuCtxCreate(&d_cuCtx, 0, d_cuDev), "Context creation");
		else if (ctx == -1)
			checkError(cuGLCtxCreate(&d_cuCtx, 0, d_cuDev), "Context creation");
		else
			throw std::string("CUDA context sharing not yet implemented");

		d_memCounter = 1; // We reserve 0 for something
		d_kernelCounter = 1;
		d_textureCounter = 1;
		d_surfaceCounter = 1;

		char devName[1024];
		checkError(cuDeviceGetName(devName, 1023, d_cuDev), "Get device name");
		printf("Initialized CUDA device \"%s\"\n", devName);

		// sizeof yields a size_t; cast so that the argument matches %d
		printf("Will be compiling using compute capability %d and %d bit code\n", 
				getComputeCap(),
				(int)(sizeof(void*)*8));

		// Instancing the keyword dictionary
		d_keywords = new cudaKeywords();

	} catch (const std::string &e) {
		fprintf(stderr, "CUDA initialization failed: %s\n", e.c_str());
		throw std::string("GPGPU initialization failed\n");
	}
}

// Releases everything the controller handed out handles for (device buffers,
// kernels, textures and surfaces) and then destroys the context.
// Errors are deliberately not checked while shutting down.
cudaController::~cudaController() {
	for (std::map<int, CUdeviceptr>::iterator i = d_mem.begin(); i != d_mem.end(); ++i)
		cuMemFree(i->second);
	
	for (std::map<int, cudaKernel*>::iterator i = d_kernel.begin(); i != d_kernel.end(); ++i)
		delete i->second;

	for (std::map<int, cudaTexture*>::iterator i = d_texture.begin(); i != d_texture.end(); ++i)
		delete i->second;

	// Surfaces are allocated with new in create2DSurface() and were
	// previously never released.
	for (std::map<int, cudaSurface*>::iterator i = d_surface.begin(); i != d_surface.end(); ++i)
		delete i->second;

	cuCtxDestroy(d_cuCtx);
	printf("CUDA uninitted\n");
}

/*int cudaController::getBufferFromTex(int tex) {
	d_mem.insert(std::pair<int, CUdeviceptr>(d_memCounter, d_texture[tex]->getPtr()));
	return d_memCounter++;
}*/

int cudaController::newBuffer(size_t size, const void *data) {
	CUdeviceptr devPtr;
	checkError(cuMemAlloc(&devPtr, size), "Mem alloc");

	if (data)
		checkError(cuMemcpyHtoD(devPtr, data, size), "Mem transfer H->D");

	d_mem.insert(std::pair<int, CUdeviceptr>(d_memCounter, devPtr));
	return d_memCounter++;
}

void cudaController::checkError(int rCode, std::string desc) {
	static std::map<int, std::string> g_errorStrings;
	if (!g_errorStrings.size()) {
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_VALUE, "CUDA_ERROR_INVALID_VALUE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_OUT_OF_MEMORY, "CUDA_ERROR_OUT_OF_MEMORY"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_INITIALIZED, "CUDA_ERROR_NOT_INITIALIZED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_DEINITIALIZED, "CUDA_ERROR_DEINITIALIZED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NO_DEVICE, "CUDA_ERROR_NO_DEVICE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_DEVICE, "CUDA_ERROR_INVALID_DEVICE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_IMAGE, "CUDA_ERROR_INVALID_IMAGE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_CONTEXT, "CUDA_ERROR_INVALID_CONTEXT"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_MAP_FAILED, "CUDA_ERROR_MAP_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNMAP_FAILED, "CUDA_ERROR_UNMAP_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ARRAY_IS_MAPPED, "CUDA_ERROR_ARRAY_IS_MAPPED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ALREADY_MAPPED, "CUDA_ERROR_ALREADY_MAPPED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NO_BINARY_FOR_GPU, "CUDA_ERROR_NO_BINARY_FOR_GPU"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ALREADY_ACQUIRED, "CUDA_ERROR_ALREADY_ACQUIRED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED, "CUDA_ERROR_NOT_MAPPED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED_AS_POINTER, "CUDA_ERROR_NOT_MAPPED_AS_POINTER"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNSUPPORTED_LIMIT, "CUDA_ERROR_UNSUPPORTED_LIMIT"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_SOURCE, "CUDA_ERROR_INVALID_SOURCE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_FILE_NOT_FOUND, "CUDA_ERROR_FILE_NOT_FOUND"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_OPERATING_SYSTEM, "CUDA_ERROR_OPERATING_SYSTEM"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_HANDLE, "CUDA_ERROR_INVALID_HANDLE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_FOUND, "CUDA_ERROR_NOT_FOUND"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_READY, "CUDA_ERROR_NOT_READY"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_FAILED, "CUDA_ERROR_LAUNCH_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_TIMEOUT, "CUDA_ERROR_LAUNCH_TIMEOUT"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_CONTEXT_IS_DESTROYED, "CUDA_ERROR_CONTEXT_IS_DESTROYED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNKNOWN, "CUDA_ERROR_UNKNOWN"));
	}

	if (rCode != CUDA_SUCCESS)
		throw ((desc == "") ? 
				std::string("Error: ") : 
				(std::string("Error in \"") + desc + std::string("\": "))) + 
			g_errorStrings[rCode];
}

// Wraps `kernSrc` in a new cudaKernel targeting this device's compute
// capability, registers it and returns its handle.  Nothing is compiled
// here.
int cudaController::newKernel(KernelSrc *kernSrc) {
	const int computeCap = getComputeCap();
	cudaKernel *created = new cudaKernel(computeCap);
	created->addSource(kernSrc);

	const int handle = d_kernelCounter;
	d_kernel.insert(std::make_pair(handle, created));
	++d_kernelCounter;
	return handle;
}

void cudaController::writeKernelSrc(int kernel, std::string fname) {
	// We tell the kernel to update the sources, and then we write them out
	d_kernel[kernel]->getSource()->write(fname);
}

void cudaController::clearKernelParams(int kernel) {
	d_kernel[kernel]->clearParams();
}

void cudaController::setBufferParam(int kernel, int buffer) {
	// FIXME:  This is for 32-bit only
	/*printf("buffer ptr %ld\n", d_mem[buffer]);
	d_kernel[kernel]->pushParam((unsigned int)d_mem[buffer]);*/
	d_kernel[kernel]->pushParam(d_mem[buffer]);
}

void cudaController::setDataParam(int kernel, size_t paramSize, void *paramData) {
	d_kernel[kernel]->pushParam(paramSize, paramData);
}

void cudaController::setVar(int kernel, std::string varName, size_t varSize, void *varData) {
	d_kernel[kernel]->setVar(varName, varSize, varData);
}

// Fills the whole device buffer with the 32-bit pattern stored at `fourB`.
// The buffer size is queried from the driver and divided by sizeof(float)
// to get the number of 32-bit words to set.
void cudaController::clearBuffer(int buffer, void *fourB) {
	size_t bytes;
	cudaController::checkError(cuMemGetAddressRange(NULL, &bytes, d_mem[buffer]), "Get mem address range");

	const unsigned int pattern = *((unsigned int*)fourB);
	const size_t words = bytes/sizeof(float);
	cudaController::checkError(cuMemsetD32(d_mem[buffer], pattern, words), "Memset");
}

// Copies `size` bytes from host memory at `data` into device buffer `bufId`.
// `size` has to equal the size the driver reports for the buffer; otherwise
// a std::string is thrown and nothing is copied.
void cudaController::uploadToBuffer(int bufId, void *data, size_t size) {
	size_t allocated;
	checkError(cuMemGetAddressRange(NULL, &allocated, d_mem[bufId]), "Get mem address range");

	const bool sizesMatch = (size == allocated);
	if (!sizesMatch)
		throw std::string("Trying to upload a different amount of data than the buffer is");

	checkError(cuMemcpyHtoD(d_mem[bufId], data, size), "Memcpy H -> D");
}

void cudaController::setTex(int kernel, int tex, std::string refName) {
	d_kernel[kernel]->bindTexture(d_texture[tex], refName);
}

void cudaController::setSurf(int kernel, int surf, std::string refName) {
	d_kernel[kernel]->bindSurface(d_surface[surf], refName);
}

/*void cudaController::setTex3D(int kernel, int tex, std::string refName) {
	d_kernel[kernel]->bindTexture3D(d_texture3D[tex], refName);
}*/

void cudaController::setKernelExecConf(int kernel, int totalX, int blockX, int totalY, int blockY) {
	d_kernel[kernel]->setExecConfig(totalX, blockX, totalY, blockY);
}

void cudaController::preferSM(int kernel) {
	d_kernel[kernel]->preferSM(true);
}

// Returns the device's compute capability as major*10 + minor
// (e.g. 2.1 is returned as 21).  Throws std::string if either attribute
// cannot be read.
int cudaController::getComputeCap() {
	int major;
	cudaController::checkError(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, d_cuDev), "Read major capability");

	int minor;
	cudaController::checkError(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, d_cuDev), "Read minor capability");

	const int combined = major*10 + minor;
	return combined;
}


float cudaKernel::timedExecute(const int iters) {
	CUevent start, stop;
	cudaController::checkError(cuEventCreate(&start, CU_EVENT_DEFAULT), "Event creation");
	cudaController::checkError(cuEventCreate(&stop, CU_EVENT_DEFAULT), "Event creation");

	execute();

	cudaController::checkError(cuEventRecord(start, 0), "Event record");

	for (int i = 0; i < iters; ++i)
		execute();

	cudaController::checkError(cuEventRecord(stop, 0), "Event record");
	cudaController::checkError(cuEventSynchronize(stop), "Event synchronize");

	float elapsedTime;
	cudaController::checkError(cuEventElapsedTime(&elapsedTime, start, stop), "Event elapsed");
	elapsedTime = elapsedTime/(float)iters;

	cudaController::checkError(cuEventDestroy(start), "Event destroy");
	cudaController::checkError(cuEventDestroy(stop), "Event destroy");

	return elapsedTime;
}

// Copies `varSize` bytes from `varData` into the module-level variable
// `varName`.  The kernel is compiled first if that has not happened yet,
// because the variable is looked up from the loaded module.
// Throws std::string if the lookup fails, if `varSize` differs from the
// variable's real size, or if the copy fails.
void cudaKernel::setVar(std::string varName, size_t varSize, void *varData) {
	// We have to have the kernel compiled
	compile();

	// Making sure the variable size matches
	size_t realVarSize;
	CUdeviceptr dest;
	cudaController::checkError(cuModuleGetGlobal(&dest, &realVarSize, d_module, varName.c_str()), "Get global var info from module");
	if (realVarSize != varSize)
		throw std::string("Var sizes don't match");

	// The description used to lack its closing parenthesis
	cudaController::checkError(cuMemcpyHtoD(dest, varData, varSize), "H -> D memcpy (variable)");
}

void cudaController::reportTiming(int kernel) {
	// First we run it once to know how it runs..  Then we run it for
	// <burnTime> milliseconds to get the accurate result (unless one iteration would take more than that).
	cudaKernel *benchKernel = d_kernel[kernel];
	if (!benchKernel)
		throw std::string("No kernel for the handle");

	float crudeTiming = benchKernel->timedExecute(1);
	const float burnTime = 1000.0f;
	float reportTime;

	if (crudeTiming*2.0f < burnTime) {
		int loops = (int)(burnTime/crudeTiming);
		//printf("Looping the kernel %d times..\n", loops); 
		reportTime = benchKernel->timedExecute(loops);
	} else
		reportTime = crudeTiming;

	printf("Kernel %s runs in %.3f ms (%.2f fps)\n",
			benchKernel->name().c_str(), reportTime, 1000.0f/reportTime);
}

int cudaController::makeGLTexture(unsigned int oglTex) {
	cudaTexture *tex = new cudaTexture(oglTex);

	d_texture.insert(std::pair<int, cudaTexture*>(d_textureCounter, tex));
	return d_textureCounter++;
}

int cudaController::create3DTexture(int w, int h, int d, void *data, bool floatData) {
	cudaTexture3D *tex = new cudaTexture3D(w, h, d, data, floatData);

	d_texture.insert(std::pair<int, cudaTexture3D*>(d_textureCounter, tex));
	return d_textureCounter++;
	/*d_texture3D.insert(std::pair<int, cudaTexture3D*>(d_texture3DCounter, tex));
	return d_texture3DCounter++;*/
}

int cudaController::create2DTextureFromFile(int w, int h, int chans, std::string fName) {
	cudaTexture *tex = new cudaTexture(w, h, chans, fName);

	d_texture.insert(std::pair<int, cudaTexture*>(d_textureCounter, tex));
	return d_textureCounter++;
}

int cudaController::create2DTextureFromData(int w, int h, int chans, void *data) {
	cudaTexture *tex = new cudaTexture(w, h, chans, data);

	d_texture.insert(std::pair<int, cudaTexture*>(d_textureCounter, tex));
	return d_textureCounter++;
}

int cudaController::create2DSurface(int w, int h, int chans) {
	cudaSurface *surf = new cudaSurface(w, h, chans);

	d_surface.insert(std::pair<int, cudaSurface*>(d_surfaceCounter, surf));
	return d_surfaceCounter++;
}

void cudaController::nearestSampling(int t) {
	d_texture[t]->setLinearInterpolation(false);
}


void *cudaController::getBufferData(int buffer, size_t &size) {
	CUdeviceptr devPtr = d_mem[buffer];
	checkError(cuMemGetAddressRange(NULL, &size, devPtr), "Get mem address range");
	printf("Returning %d bytes of data\n", (int)size);
	void *data = (void*) malloc(size);
	checkError(cuMemcpyDtoH(data, devPtr, size), "Memcpy D -> H");
	return data;
}

// Returns the device pointer of `buffer` cast to void*.  This is only done
// when the device reports unified addressing; otherwise a std::string is
// thrown.
void *cudaController::getBufferPtr(int buffer) {
	int hasUnifiedAddressing;
	checkError(cuDeviceGetAttribute(&hasUnifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, d_cuDev), "Get attrib");

	if (hasUnifiedAddressing) {
		// The device pointer value itself is handed out
		return (void*)d_mem[buffer];
	}

	throw std::string("Unified addressing needed for getBufferPtr");
}

void cudaController::executeKernel(int kernel) {
	d_kernel[kernel]->execute();
}

void cudaController::fillTexFromBuffer(int tex, int buffer) {
	d_texture[tex]->map(); // Refreshes the array pointer

	d_texture[tex]->pushData(d_mem[buffer]);

	d_texture[tex]->unMap();
}

void cudaController::fillBufferFromTex(int buffer, int tex) {
	d_texture[tex]->map(); // Refreshes the array pointer

	d_texture[tex]->copyToBuffer(d_mem[buffer]);

	d_texture[tex]->unMap();
}


// Creates an empty kernel object targeting compute capability `computeCap`
// (encoded as major*10 + minor).  No source is attached, nothing is
// compiled and the execution configuration is unset.
cudaKernel::cudaKernel(int computeCap) : d_computeCap(computeCap) {
	// No source yet
	d_kernelSrc = NULL;

	// Nothing loaded into the driver yet
	d_module = 0;
	d_function = 0;

	// Life-cycle flags
	d_srcGenerated = false;
	d_compiled = false;
	d_paramsSet = false;

	// Execution configuration: zero means "not set"
	d_totalThreadsX = 0;
	d_threadBlockX = 0;
	d_totalThreadsY = 0;
	d_threadBlockY = 0;
	d_preferSM = false;
}

/*cudaKernel::cudaKernel(const cudaKernel &c) {
	printf("copy ctor\n");
	*this = c;
	d_isACopy = true;
}*/

// Unloads the module (if one was loaded) and releases the parameter copies
// that pushParam() allocated with malloc and that were still stored.
// NOTE(review): assumes nothing else frees the pointers kept in d_params
// (execute() is not visible from here) -- confirm.
cudaKernel::~cudaKernel() {
	if (d_module)
		cuModuleUnload(d_module);

	while (!d_params.empty()) {
		free(d_params.back().second);
		d_params.pop_back();
	}
}

// Appends `add` verbatim to the kernel source string.
void cudaKernel::addString(std::string add) {
	d_kernelSrcString.append(add);
}

// Returns the attached source object after making sure the sources have
// been generated (genSources() does nothing when that already happened).
KernelSrc *cudaKernel::getSource() {
	genSources();

	KernelSrc *attached = d_kernelSrc;
	return attached;
}

// Attaches the source object `s` to this kernel.  Only the pointer is
// stored; a previously attached source is simply replaced.
void cudaKernel::addSource(KernelSrc *s)
{
	d_kernelSrc = s;
}

// Appends `line` followed by a newline to the kernel source string.
void cudaKernel::addLine(std::string line) {
	d_kernelSrcString.append(line);
	d_kernelSrcString.push_back('\n');
}

// Forgets every parameter pushed so far and marks the parameters as not set.
// The copies made by pushParam() are malloc'ed, so they are freed here
// rather than just dropped from the container.
// NOTE(review): assumes nothing else frees or keeps the pointers stored in
// d_params (execute() is not visible from here) -- confirm.
void cudaKernel::clearParams() {
	d_paramsSet = false;

	while (!d_params.empty()) {
		free(d_params.back().second);
		d_params.pop_back();
	}
}

template<class T> void cudaKernel::pushParam(T value) {
	pushParam(sizeof(T), &value);
	/*void *data = malloc(sizeof(T));
	memcpy(data, &value, sizeof(T));
	d_params.push_back(std::pair<size_t, void*>(sizeof(T), data));*/

	/*if (!d_function)
		throw std::string("Setting a param before compiling a kernel -- not possible");

	cudaController::checkError(cuParamSetSize(d_function, d_paramSize + __alignof(T)), "cuParamSetSize");
	cudaController::checkError(cuParamSetv(d_function, d_paramSize, &value, sizeof(T)), "cuParamSetv");
	d_paramSize += __alignof(T);*/
}

// Stores a private copy of `paramSize` bytes from `paramData` as the next
// kernel parameter.  The copy is allocated with malloc.
// Throws std::string when a non-empty parameter has no data or when the
// allocation fails.
void cudaKernel::pushParam(size_t paramSize, void *paramData) {
	if (paramSize && !paramData)
		throw std::string("Trying to push a kernel parameter without data");

	void *data = malloc(paramSize);
	// malloc(0) is allowed to return NULL, so only a sized request can fail
	if (!data && paramSize)
		throw std::string("Couldn't allocate memory for a kernel parameter");

	if (paramSize)
		memcpy(data, paramData, paramSize);
	d_params.push_back(std::pair<size_t, void*>(paramSize, data));
}

// Reads everything that is left in `fd` (from its current offset) into `out`.
// Returns false if read() reports an error.
static bool readAllFromFd(int fd, std::string &out) {
	char chunk[64*1024];
	out.clear();
	for (;;) {
		const ssize_t got = read(fd, chunk, sizeof(chunk));
		if (got < 0)
			return false;
		if (got == 0)
			return true;
		out.append(chunk, (size_t)got);
	}
}

// Compiles the kernel and loads it into the driver; does nothing when that
// has already been done.
//
// The PTX comes either straight from the source object (getAsm()) or from
// running nvcc on the generated .cu text.  The PTX is then assembled into a
// cubin with ptxas, the cubin is loaded as a module, and the first ".entry"
// found in the PTX is used as the kernel function (and kernel name).
//
// keepSources: adds "-lineinfo -src-in-ptx" to the nvcc call and keeps the
//              generated PTX as "<kernel name>.ptx" in the working directory.
//
// Throws std::string on any failure.  Temporary files are created in the
// current working directory.
void cudaKernel::compile(bool keepSources) {
	if (d_compiled)
		return;

	if (!d_kernelSrc)
		throw std::string("Can't compile a kernel that has no source attached");

	const bool asmGiven = (d_kernelSrc->getAsm() != "");
	const char *bitness = (sizeof(void*) == 4) ? "-m32" : "-m64";
	std::string ptxSrc, funcName;
	char cmd[10240];

	// Fixing the sources if not yet done.  This happens before any temp file
	// is created so that an empty source leaves nothing behind.
	if (!asmGiven) {
		genSources();

		if (!d_kernelSrcString.size())
			throw std::string("Can't compile an empty kernel source");
	}

	char ptxFname[] = "SSEO_kernelXXXXXX.ptx";
	int ptxFd = mkstemps(ptxFname, 4);
	// mkstemps() reports failure with -1; 0 is a valid descriptor
	if (ptxFd == -1)
		throw std::string("Couldn't create temp kernel files for nvcc");

	if (!asmGiven) {
		char cuFname[] = "SSEO_kernelXXXXXX.cu";
		int cuFd = mkstemps(cuFname, 3);
		if (cuFd == -1) {
			close(ptxFd);
			remove(ptxFname);
			throw std::string("Couldn't create temp kernel files for nvcc");
		}

		// Writing the .cu sources into the temp file; a short write is a
		// failure as well
		const ssize_t written = write(cuFd, d_kernelSrcString.c_str(), d_kernelSrcString.size());
		if (written < 0 || (size_t)written != d_kernelSrcString.size()) {
			close(cuFd);
			remove(cuFname);
			close(ptxFd);
			remove(ptxFname);
			throw std::string("Couldn't write temp kernel file ") + std::string(cuFname);
		}

		// Using system() is the simplest way to do this.
		// Compiling for the same architecture and bitness as ours.
		const char *leaveFlags = keepSources ? "-lineinfo -src-in-ptx" : "";
		const int cmdLen = snprintf(cmd, sizeof(cmd),
				"%s -gencode arch=compute_%d,code=compute_%d -ptx %s %s -use_fast_math -restrict -o %s %s",
				NVCC_PATH, d_computeCap, d_computeCap, bitness, leaveFlags, ptxFname, cuFname);
		if (cmdLen < 0 || (size_t)cmdLen >= sizeof(cmd)) {
			close(cuFd);
			remove(cuFname);
			close(ptxFd);
			remove(ptxFname);
			throw std::string("The nvcc command line doesn't fit the command buffer");
		}
		printf("Trying to execute command\n\t%s\n", cmd);
		int rCode = system(cmd);

		close(cuFd);
		if (remove(cuFname)) {
			close(ptxFd);
			throw std::string("Couldn't remove temp file ") + std::string(cuFname);
		}

		if (rCode) {
			fprintf(stderr, "Returned with %d!\n", rCode);
			close(ptxFd);
			remove(ptxFname);
			throw std::string("Failed in executing nvcc");
		}

		// nvcc wrote into the file we still hold open; read all of it
		const bool ptxRead = readAllFromFd(ptxFd, ptxSrc);
		close(ptxFd);
		if (!ptxRead || ptxSrc.empty()) {
			fprintf(stderr, "Read from %s returned %d bytes\n", ptxFname, ptxRead ? 0 : -1);
			throw std::string("Couldn't read the compiled ptx file");
		}
	} else {
		// The PTX was handed to us; the temp file only serves as ptxas input
		// and is written by name, so the descriptor isn't needed
		close(ptxFd);
		ptxSrc = d_kernelSrc->getAsm();
		d_kernelSrc->write(ptxFname, ptxSrc);
	}

	char cubinFname[] = "SSEO_kernelXXXXXX.cubin";
	int cubinFd = mkstemps(cubinFname, 6);
	if (cubinFd == -1)
		throw std::string("Couldn't create temp kernel files for ptxas");

	// DEF-LOAD-CACHE
	// Modes: ca (cache at all level) DEFAULT
	//        cg (cache at L2 and below, do not cache at L1)
	//        cs (do not cache)
	// DEF-STORE-CACHE
	// Modes: cg (cache at L2 and below)
	//        wb (write back to all levels) DEFAULT
	//        cs (do not cache (write with evict-first policy))
	// Also remember -maxrregcount <regs>
	const int asLen = snprintf(cmd, sizeof(cmd),
			"%s --allow-expensive-optimizations true -arch sm_%d %s -v -o %s %s",
			PTXAS_PATH, d_computeCap, bitness, cubinFname, ptxFname);
	if (asLen < 0 || (size_t)asLen >= sizeof(cmd)) {
		close(cubinFd);
		remove(cubinFname);
		throw std::string("The ptxas command line doesn't fit the command buffer");
	}
	printf("Trying to execute command\n\t%s\n", cmd);
	int rCode = system(cmd);

	if (rCode) {
		fprintf(stderr, "Returned with %d!\n", rCode);
		close(cubinFd);
		// If this failed, and we were given the PTX code, we need to clean the PTX copy
		if (asmGiven)
			remove(ptxFname);
		remove(cubinFname);
		throw std::string("Failed in executing ptxas");
	}

	// We remove the ptx file if we had asm and it is a copy, or if keepSources is not set
	if ((!keepSources || asmGiven) && remove(ptxFname)) {
		close(cubinFd);
		remove(cubinFname);
		throw std::string("Couldn't remove temp file ") + std::string(ptxFname);
	}

	// The cubin is binary; std::string is used purely as a byte buffer
	std::string cubin;
	const bool cubinRead = readAllFromFd(cubinFd, cubin);
	close(cubinFd);
	if (!cubinRead || cubin.empty()) {
		fprintf(stderr, "Read from %s returned %d bytes\n", cubinFname, cubinRead ? 0 : -1);
		remove(cubinFname);
		throw std::string("Couldn't read the compiled cubin file");
	}

	if (remove(cubinFname))
		throw std::string("Couldn't remove temp file ") + std::string(cubinFname);

	// Finding the function name: it follows the first ".entry " and ends at
	// a space or an opening parenthesis
	const std::string entryTag(".entry ");
	size_t match = ptxSrc.find(entryTag);
	if (match == std::string::npos)
		throw std::string("Couldn't find .entry from the ptx source");

	match += entryTag.size();
	while (match < ptxSrc.size() && ptxSrc[match] != ' ' && ptxSrc[match] != '(') {
		funcName.push_back(ptxSrc[match]);
		match++;
	}
	if (funcName.empty())
		throw std::string("Couldn't find the kernel name after .entry in the ptx source");

	printf("Found function \"%s\"\n", funcName.c_str());
	// We'll name the kernel with this
	d_kernelName = funcName;

	// We're "keeping" the PTX source if keepSources was asked and the PTX source wasn't provided already
	if ((keepSources && !asmGiven) && rename(ptxFname, (d_kernelName+".ptx").c_str()))
		fprintf(stderr, "Couldn't rename PTX source to %s\n", (d_kernelName+".ptx").c_str());

	// Uploading the cubin image to the driver
	cudaController::checkError(cuModuleLoadDataEx(&d_module, cubin.data(), 0, NULL, NULL), "Load module data");
	cudaController::checkError(cuModuleGetFunction(&d_function, d_module, funcName.c_str()), "Get function");
	
	// FIXME: Make this configurable
	if (d_preferSM)
		cudaController::checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_SHARED), "Set L1 config");
	else
		cudaController::checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_L1), "Set L1 config");

	d_compiled = true;
}

void cudaKernel::genSources() {
	if (!d_srcGenerated) {
		// Find the tags and fill in the code
		// First the textures:  We first generate a string of the tex decls
		std::string texDecls;
		for (int i = 0; i < d_boundTextures.size(); ++i) {
			//printf("texref %p\n", d_boundTextures.at(i).second);
			int channels = d_boundTextures.at(i).second->channels();

			std::string mode = d_boundTextures.at(i).second->floatData() ? 
				"cudaReadModeElementType" : "cudaReadModeNormalizedFloat";

			char type[1024];
			sprintf(type, (channels > 1 || !d_boundTextures.at(i).second->floatData()) ? 
					"%s%d" : "%s",
				d_boundTextures.at(i).second->floatData() ? "float" : "uchar",
				channels);

			char decl[1024];
			sprintf(decl, "texture<%s, %d, %s> %s;\n",
					type, d_boundTextures.at(i).second->dims(),
					mode.c_str(), d_boundTextures.at(i).first.c_str());
			texDecls += decl;
		}
		// We also fill in surface references here
		for (int i = 0; i < d_boundSurfaces.size(); ++i) {
			int channels = d_boundSurfaces.at(i).second->dims();
			char decl[1024];
			sprintf(decl, "surface<void, %s> %s;\n",
					channels == 1 ? "cudaSurfaceType1D" : (channels == 2 ? "cudaSurfaceType2D" : "cudaSurfaceType3D"),
					d_boundSurfaces.at(i).first.c_str());

			texDecls += decl;
		}

		// And then replace the anchor with it
		d_kernelSrc->replace("#TEXDECL#", texDecls);

		// And write out the string
		d_kernelSrcString = d_kernelSrc->getSrc();
		
		d_srcGenerated = true;
	}

}

void cudaKernel::preferSM(bool b) {
	d_preferSM = b;

	// Setting if it has already been compiled
	if (d_function) {
		if (d_preferSM)
			cudaController::checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_SHARED), "Set L1 config");
		else
			cudaController::checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_L1), "Set L1 config");
	}
}

void cudaKernel::execute() {
	if (!d_totalThreadsX || !d_threadBlockX || !d_totalThreadsY || !d_threadBlockY)
		throw std::string("Trying to execute a kernel with unset execution configuration");

	//if (!d_compiled) {
	compile(true);
	/*	d_compiled = true;
	}*/

	if (!d_paramsSet) {
		//if (!d_newParamMethod) {
			// Setting the parameters
			size_t parPos = 0;
			for (int i = 0; i < d_params.size(); ++i) 
				parPos += __alignof(d_params.at(i).first);
			cudaController::checkError(cuParamSetSize(d_function, parPos), "cuParamSetSize");
			parPos = 0;
			for (int i = 0; i < d_params.size(); ++i) {
				cudaController::checkError(cuParamSetv(d_function, parPos, d_params.at(i).second, d_params.at(i).first), "cuParamSetv");
				parPos += __alignof(d_params.at(i).first);
			}
		//}

		// And binding the textures
		for (int i = 0; i < d_boundTextures.size(); ++i) {
			CUtexref tempRef;
			cudaController::checkError(
					cuModuleGetTexRef(&tempRef, d_module, d_boundTextures.at(i).first.c_str()), "Get tex ref from module");
			d_boundTextures.at(i).second->initRef(tempRef);
		}

		// And the surfaces
		for (int i = 0; i < d_boundSurfaces.size(); ++i) {
			CUsurfref tempRef;
			cudaController::checkError(
					cuModuleGetSurfRef(&tempRef, d_module, d_boundSurfaces.at(i).first.c_str()), "Get surf ref from module");
			d_boundSurfaces.at(i).second->initRef(tempRef);
		}


		cudaController::checkError(cuFuncSetBlockShape(d_function, d_threadBlockX, d_threadBlockY, 1), "Set block shape");

		d_paramsSet = true;
	}

	
	const bool reMap = true;

	if (reMap)
		for (int i = 0; i < (int)d_boundTextures.size(); ++i)
			if (d_boundTextures[i].second->needRemapping())
				d_boundTextures[i].second->map();

	/*if (d_kernelName == "occlusion")
		cudaController::checkError(cuLaunchGrid(d_function, 432*3/8, d_totalThreadsY/d_threadBlockY), "Launch grid");
	else
		cudaController::checkError(cuLaunchGrid(d_function, d_totalThreadsX/d_threadBlockX, d_totalThreadsY/d_threadBlockY), "Launch grid");*/

	//cudaController::checkError(cuLaunchGridAsync(d_function, d_totalThreadsX/d_threadBlockX, d_totalThreadsY/d_threadBlockY, 0), "Launch grid");
	cudaController::checkError(cuLaunchGrid(d_function, d_totalThreadsX/d_threadBlockX, d_totalThreadsY/d_threadBlockY), "Launch grid");

	if (reMap)
		for (int i = 0; i < (int)d_boundTextures.size(); ++i)
			if (d_boundTextures[i].second->needRemapping())
				d_boundTextures[i].second->unMap();
}

// Stores the execution configuration: total thread counts and thread block
// sizes in X and Y.
//
// Throws std::string if a block size is not positive or if a total thread
// count is not a multiple of the matching block size.  As before, the values
// are stored (and logged) before they are validated.
void cudaKernel::setExecConfig(int totalThreadsX, int threadBlockX, int totalThreadsY, int threadBlockY) {
	d_totalThreadsX = totalThreadsX;
	d_threadBlockX = threadBlockX;
	d_totalThreadsY = totalThreadsY;
	d_threadBlockY = threadBlockY;

	printf("Kernel %s set to execute %dx%d threads in %dx%d blocks\n", 
			d_kernelName.c_str(), d_totalThreadsX, d_totalThreadsY, d_threadBlockX, d_threadBlockY);

	// A zero block size would make the modulo checks below divide by zero
	if (d_threadBlockX <= 0)
		throw std::string("Thread block size must be positive (X), \"") + name() + "\"";
	if (d_threadBlockY <= 0)
		throw std::string("Thread block size must be positive (Y), \"") + name() + "\"";

	if (d_totalThreadsX%d_threadBlockX)
		throw std::string("Total threads not divisible by thread block size (X), \"") + name() + "\"";
	if (d_totalThreadsY%d_threadBlockY)
		throw std::string("Total threads not divisible by thread block size (Y), \"") + name() + "\"";
}

std::string cudaKernel::name() {
	return d_kernelName;
}

/*CUmodule cudaKernel::getModule() {
	return d_module;
}*/

// Remembers tex under the reference name refName.  Only the (name, texture)
// pair is recorded here; the module's texture reference is looked up later,
// in execute().
void cudaKernel::bindTexture(cudaTexture *tex, std::string refName) {
	std::pair<std::string, cudaTexture*> binding(refName, tex);
	d_boundTextures.push_back(binding);
}

// Remembers surf under the reference name refName by appending the
// (name, surface) pair to d_boundSurfaces.
void cudaKernel::bindSurface(cudaSurface *surf, std::string refName) {
	std::pair<std::string, cudaSurface*> binding(refName, surf);
	d_boundSurfaces.push_back(binding);
}

/*void cudaKernel::bindTexture3D(cudaTexture3D *tex, std::string refName) {
	d_boundTextures.push_back(std::pair<std::string, cudaTexture*>(refName, tex));
	/CUtexref tempRef;
	cudaController::checkError(cuModuleGetTexRef(&tempRef, d_module, refName.c_str()), "Get tex ref from module");
	tex->initRef(tempRef);/
	// We're skipping this cause we won't be using unmap/map
	//d_boundTextures.push_back(tex); 
}*/

// Creates a single-channel w x h x d CUDA array and fills it from the host
// buffer data.  floatData selects 32-bit float elements, otherwise unsigned
// 8-bit elements are used.
cudaTexture3D::cudaTexture3D(int w, int h, int d, void *data, bool floatData) : 
	cudaTexture(w, h), 
	d_depth(d)
{
	// Fixed properties of a 3D texture: one channel, three dimensions and
	// no remapping by default
	d_channels = 1;
	d_dims = 3;
	d_floatData = floatData;
	d_needRemapping = false;

	// Describe and allocate the array
	CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
	arrayDesc.Flags = 0;
	arrayDesc.NumChannels = d_channels;
	if (floatData)
		arrayDesc.Format = CU_AD_FORMAT_FLOAT;
	else
		arrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
	arrayDesc.Width = d_width;
	arrayDesc.Height = d_height;
	arrayDesc.Depth = d_depth;

	cudaController::checkError(cuArray3DCreate(&d_cudaArray, &arrayDesc), "Create 3D array");

	// Bytes in one row of the tightly packed host data
	int rowBytes = sizeof(unsigned char)*d_channels*d_width*(floatData ? 4 : 1);

	// Host -> array copy of the whole volume
	CUDA_MEMCPY3D copyDesc;
	memset(&copyDesc, 0, sizeof(CUDA_MEMCPY3D));
	copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
	copyDesc.dstArray = d_cudaArray;
	copyDesc.srcMemoryType = CU_MEMORYTYPE_HOST;
	copyDesc.srcHost = data;
	copyDesc.srcPitch = rowBytes;
	copyDesc.srcHeight = d_height;
	copyDesc.WidthInBytes = rowBytes;
	copyDesc.Height = d_height;
	copyDesc.Depth = d_depth;

	cudaController::checkError(cuMemcpy3D(&copyDesc), "Memcpy3D H -> A");
}

int cudaTexture::dims() {
	return d_dims;
}

int cudaTexture::channels() {
	return d_channels;
}

bool cudaTexture::floatData() {
	return d_floatData;
}

/*void cudaTexture3D::initRef(CUtexref texRef) {
	d_cudaRef = texRef;

	cudaController::checkError(cuTexRefSetAddressMode(d_cudaRef, 0, CU_TR_ADDRESS_MODE_CLAMP), "addmode");
	cudaController::checkError(cuTexRefSetAddressMode(d_cudaRef, 1, CU_TR_ADDRESS_MODE_CLAMP), "addmode");
	cudaController::checkError(cuTexRefSetAddressMode(d_cudaRef, 2, CU_TR_ADDRESS_MODE_CLAMP), "addmode");
	cudaController::checkError(cuTexRefSetFilterMode(d_cudaRef, CU_TR_FILTER_MODE_LINEAR), "filtermode");
	cudaController::checkError(cuTexRefSetFlags(d_cudaRef, CU_TRSF_NORMALIZED_COORDINATES), "Flags");
	//cudaController::checkError(cuTexRefSetFormat(d_cudaRef, CU_AD_FORMAT_UNSIGNED_INT8, 1), "format");
	cudaController::checkError(cuTexRefSetFormat(d_cudaRef, CU_AD_FORMAT_FLOAT, 1), "format");

	cudaController::checkError(cuTexRefSetArray(d_cudaRef, d_cudaArray, CU_TRSA_OVERRIDE_FORMAT), "setarray");
}*/

// Sets up a w x h texture object without creating any CUDA array; used as the
// base constructor by cudaTexture3D, which overrides several of the defaults.
cudaTexture::cudaTexture(int w, int h) : 
	d_dims(2),
	d_width(w), 
	d_height(h), 
	d_needRemapping(true), 
	d_floatData(true),
	d_cudaArray(0),
	d_linearInterpolation(true)
{
	// The destructor unregisters d_cudaHandle whenever it is non-zero, and
	// map() tests d_cudaRef, so neither may be left uninitialized.  They are
	// assigned here rather than in the initializer list because the member
	// declaration order is not visible from this file.
	d_cudaHandle = 0;
	d_cudaRef = 0;
	// No channel count is known yet; cudaTexture3D overwrites this.
	d_channels = 0;
}

// Creates a w x h texture with chans channels and uploads the host buffer
// data into a newly created CUDA array (see createFromData()).
cudaTexture::cudaTexture(int w, int h, int chans, void *data) :
	d_width(w),
	d_height(h),
	d_channels(chans),
	d_cudaArray(0),
	d_linearInterpolation(true)
{
	// This texture has no graphics resource and no reference yet.  The
	// destructor unregisters d_cudaHandle whenever it is non-zero, so it must
	// not be left uninitialized.
	d_cudaHandle = 0;
	d_cudaRef = 0;

	createFromData(data);
}

void cudaTexture::setLinearInterpolation(bool v) {
	d_linearInterpolation = v;
}

void cudaTexture::createFromData(void *data) {
	if (d_cudaArray)
		throw std::string("Already created!");
	
	d_dims = 2;
	d_needRemapping = false;
	// FIXME:  Only float supported right now
	d_floatData = true;

	// We create a fixed type 2D texture..
	CUDA_ARRAY_DESCRIPTOR desc;
	desc.Width = d_width;
	desc.Height = d_height;
	desc.Format = d_floatData ? CU_AD_FORMAT_FLOAT : CU_AD_FORMAT_UNSIGNED_INT8;
	desc.NumChannels = d_channels;
	//desc.Flags = 0;

	cudaController::checkError(cuArrayCreate(&d_cudaArray, &desc), "Create 2D array");

	// Populating it with data
	//int pitch = sizeof(unsigned char)*1*d_width;
	int pitch = sizeof(unsigned char)*d_channels*d_width*(d_floatData ? 4 : 1);
	CUDA_MEMCPY2D pCopy;
	memset(&pCopy, 0, sizeof(CUDA_MEMCPY2D));
	pCopy.srcMemoryType = CU_MEMORYTYPE_HOST;
	pCopy.srcHost = data;
	pCopy.srcPitch = pitch;
	//pCopy.srcHeight = d_height;
	pCopy.dstMemoryType = CU_MEMORYTYPE_ARRAY;
	pCopy.dstArray = d_cudaArray;
	pCopy.WidthInBytes = pitch;
	pCopy.Height = d_height;
	//pCopy.Depth = d_depth;

	cudaController::checkError(cuMemcpy2D(&pCopy), "Memcpy2D H -> A");
}

// Creates a w x h texture with chans float channels from the raw contents of
// the file fName.  The file must contain exactly w*h*chans floats.
//
// Throws std::string if the file cannot be opened, inspected or mapped, or if
// its size does not match the texture dimensions.
cudaTexture::cudaTexture(int w, int h, int chans, std::string fName) :
	d_width(w),
	d_height(h),
	d_channels(chans),
	d_cudaHandle(0)
{
	// createFromData() refuses to run when d_cudaArray is non-zero and
	// initRef() reads d_linearInterpolation, so these must be initialized
	// before use.  Linear interpolation matches the other constructors.
	d_cudaArray = 0;
	d_cudaRef = 0;
	d_linearInterpolation = true;

	size_t size = (size_t)d_width * d_height * d_channels * sizeof(float);
	int fd = open(fName.c_str(), O_RDONLY);
	if (fd == -1)
		throw std::string("Can't open texture file");

	struct stat filestat;
	if (fstat(fd, &filestat) == -1) {
		// Report the failure like every other error instead of exiting
		close(fd);
		throw std::string("Can't stat texture file");
	}

	if (filestat.st_size < 0 || size != (size_t)filestat.st_size) {
		close(fd);
		throw std::string("Odd amount of texture data");
	}

	void *data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (data == MAP_FAILED) {
		close(fd);
		throw std::string("Can't map texture file");
	}

	// Release the mapping and the descriptor even if the upload fails
	try {
		createFromData(data);
	} catch (...) {
		munmap(data, size);
		close(fd);
		throw;
	}

	munmap(data, size);
	close(fd);
}


// Copies the contents of the texture's CUDA array into the device buffer
// dest, starting at array offset 0.  The byte count is
// width * height * channels * element size.
// NOTE(review): the depth of a 3D texture is not known to this class, so only
// width*height elements are covered -- confirm whether 3D textures are ever
// passed through here.
void cudaTexture::copyToBuffer(CUdeviceptr dest) {
	// The dimension count (d_dims) is not a storage factor; multiplying by it,
	// as the code used to, requested twice the data a 2D array holds.
	size_t texSize = (d_floatData ? sizeof(float) : sizeof(unsigned char))
		*d_width*d_height*d_channels;
	cudaController::checkError(cuMemcpyAtoD(dest, d_cudaArray, 0, texSize), "Memcopy A -> D");
}

/*CUdeviceptr cudaTexture::getPtr() {
	if (!d_cudaRef)
		throw std::string("No reference initialized, can't return data pointer from texture");

	CUdeviceptr ptr;
	cudaController::checkError(cuTexRefGetAddress(&ptr, d_cudaRef), "cuTexRefGetAddress");

	return ptr;
}*/

// Wraps the OpenGL 2D texture oglTex: registers it with CUDA as a read-only
// graphics resource, then maps it once to read width, height and channel
// count from the array descriptor.
//
// Throws std::string if the array format is not 32-bit float.
cudaTexture::cudaTexture(unsigned int oglTex) :
	d_dims(2),
	d_cudaHandle(0),
	d_linearInterpolation(true)
{
	d_oglHandle = oglTex;
	d_cudaRef = 0;
	d_cudaArray = 0;
	// The array of a registered resource is only obtained through map(), so
	// this texture has to be remapped around launches.  The flag is read by
	// cudaKernel::execute() and initRef() and was previously left
	// uninitialized by this constructor.
	d_needRemapping = true;

	cudaController::checkError(cuGraphicsGLRegisterImage(&d_cudaHandle, 
				oglTex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY),
			"Registering the OGL texture in CUDA");

	map();
	CUDA_ARRAY_DESCRIPTOR arrayDesc;
	cudaController::checkError(cuArrayGetDescriptor(&arrayDesc, d_cudaArray), "Get array descriptor");
	d_width = arrayDesc.Width;
	d_height = arrayDesc.Height;
	d_channels = arrayDesc.NumChannels;

	// FIXME The array description returned by cuda doesn't work properly
	if (arrayDesc.Format != CU_AD_FORMAT_FLOAT) {
		fprintf(stderr, "Array format %d\n", arrayDesc.Format);

		// The destructor does not run when a constructor throws, so the
		// resource is released here (best effort, errors ignored).
		cuGraphicsUnmapResources(1, &d_cudaHandle, 0);
		cuGraphicsUnregisterResource(d_cudaHandle);
		d_cudaHandle = 0;

		throw std::string("We're only handling 32b float channels..");
	}

	d_floatData = true;
	unMap();
}

// Unregisters the graphics resource, if this texture holds one.  The result
// of the unregister call is not checked.
cudaTexture::~cudaTexture() {
	if (!d_cudaHandle)
		return;

	cuGraphicsUnregisterResource(d_cudaHandle);
}

bool cudaTexture::needRemapping() {
	return d_needRemapping;
}

// Adopts texRef as this texture's reference and configures it: clamp
// addressing on every dimension, linear or point filtering depending on
// d_linearInterpolation, normalized coordinates and the element format.
// Textures that are not remapped get their array attached right here.
void cudaTexture::initRef(CUtexref texRef) {
	d_cudaRef = texRef;

	int dim = 0;
	while (dim < d_dims) {
		cudaController::checkError(cuTexRefSetAddressMode(d_cudaRef, dim, CU_TR_ADDRESS_MODE_CLAMP), "addmode");
		++dim;
	}

	if (!d_linearInterpolation) {
		printf("Setting point interpolation (dims %d)\n", d_dims);
		cudaController::checkError(cuTexRefSetFilterMode(d_cudaRef, CU_TR_FILTER_MODE_POINT), "filtermode");
	} else {
		printf("Setting linear interpolation (dims %d)\n", d_dims);
		cudaController::checkError(cuTexRefSetFilterMode(d_cudaRef, CU_TR_FILTER_MODE_LINEAR), "filtermode");
	}

	cudaController::checkError(cuTexRefSetFlags(d_cudaRef, CU_TRSF_NORMALIZED_COORDINATES), "Flags");

	const CUarray_format elemFormat = d_floatData ? CU_AD_FORMAT_FLOAT : CU_AD_FORMAT_UNSIGNED_INT8;
	cudaController::checkError(cuTexRefSetFormat(d_cudaRef, elemFormat, 1), "format");

	// Remapped textures get their array attached in map() instead
	if (d_needRemapping)
		return;

	cudaController::checkError(cuTexRefSetArray(d_cudaRef, d_cudaArray, CU_TRSA_OVERRIDE_FORMAT), "setarray");
}

// Maps the registered graphics resource, fetches the CUDA array behind it
// into d_cudaArray and, when a texture reference has already been assigned,
// attaches that array to the reference.
void cudaTexture::map() {
	// An earlier note in this function observed that changes made from OpenGL
	// appeared to propagate even without continuous remapping.

	cudaController::checkError(cuGraphicsMapResources(1, &d_cudaHandle, 0), "Map resources (cudaTexture)");

	cudaController::checkError(
			cuGraphicsSubResourceGetMappedArray(&d_cudaArray, d_cudaHandle, 0, 0),
			"Get cuda resource -> cuda array");

	// Nothing to attach to until initRef() has supplied a reference
	if (!d_cudaRef)
		return;

	cudaController::checkError(cuTexRefSetArray(d_cudaRef, d_cudaArray, CU_TRSA_OVERRIDE_FORMAT), "setarray");
}

// Unmaps the graphics resource that map() mapped.
void cudaTexture::unMap() {
	// The label used to read "Map resources", which made unmap failures look
	// like map failures in the error output.
	cudaController::checkError(cuGraphicsUnmapResources(1, &d_cudaHandle, 0), "Unmap resources (cudaTexture)");
}

// Copies the device buffer devPtr into the texture's CUDA array.  The size of
// the allocation containing devPtr must equal width * height * channels *
// sizeof(float); otherwise the mismatch is logged to stderr and a std::string
// is thrown.
void cudaTexture::pushData(CUdeviceptr devPtr) {
	size_t bufferBytes;
	cudaController::checkError(cuMemGetAddressRange(NULL, &bufferBytes, devPtr), "Get mem address range");

	// Bytes in one tightly packed row of float texels
	int rowBytes = d_width*d_channels*sizeof(float);
	const bool sizeMatches = ((int)bufferBytes == d_height*rowBytes);
	if (!sizeMatches) {
		fprintf(stderr, "%d bytes in the buffer, %d bytes (%dx%d@%d channels @%d bytes each) in texture\n",
				(int)bufferBytes, (int)(d_width*d_height*d_channels*sizeof(float)), d_width, d_height, d_channels, (int)sizeof(float));
		throw std::string("Buffer size doesn't match tex dimensions");
	}

	// Device -> array copy of all rows
	CUDA_MEMCPY2D copyDesc;
	memset(&copyDesc, 0, sizeof(CUDA_MEMCPY2D));
	copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
	copyDesc.dstArray = d_cudaArray;
	copyDesc.srcMemoryType = CU_MEMORYTYPE_DEVICE;
	copyDesc.srcDevice = devPtr;
	copyDesc.srcPitch = rowBytes;
	copyDesc.WidthInBytes = rowBytes;
	copyDesc.Height = d_height;

	cudaController::checkError(cuMemcpy2D(&copyDesc), "Memcpy D -> A");
}

int cudaSurface::width() {
	return d_width;
}

int cudaSurface::height() {
	return d_height;
}

int cudaSurface::channels() {
	return d_channels;
}

int cudaSurface::dims() {
	return d_dims;
}

// Creates a w x h float surface with chans channels.  The backing array is
// created through the 3D array API with depth 0 and the surface load/store
// flag set.
cudaSurface::cudaSurface(int w, int h, int chans) :
	d_width(w),
	d_height(h),
	d_channels(chans),
	d_dims(2)
{
	printf("Surface width %d, height %d\n", d_width, d_height);

	// The array part goes the same as with textures
	CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
	arrayDesc.Flags = CUDA_ARRAY3D_SURFACE_LDST;
	arrayDesc.Format = CU_AD_FORMAT_FLOAT;
	arrayDesc.NumChannels = d_channels;
	arrayDesc.Width = d_width;
	arrayDesc.Height = d_height;
	arrayDesc.Depth = 0;

	cudaController::checkError(cuArray3DCreate(&d_cudaArray, &arrayDesc), "Create 3D array (for surface)");
}

// Adopts surfRef as this surface's reference and attaches the surface's array
// to it.  No addressing, filtering or format setup is done here.
void cudaSurface::initRef(CUsurfref surfRef) {
	d_cudaRef = surfRef;

	cudaController::checkError(
			cuSurfRefSetArray(d_cudaRef, d_cudaArray, 0),
			"Setarray for surf");
}
