#include "gpgpu_opencl.h"
#include <sys/time.h>
#include <cstring>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <GL/gl.h>
#include <GL/glx.h>
#include <unistd.h>


/**
 * Picks an OpenCL platform and GPU device, then creates the context and the
 * command queue used by every other call on this controller.
 *
 * @param ctx  When -1, the first GPU device advertising "cl_khr_gl_sharing"
 *             is chosen and the context is created against the current
 *             GLX context/display. For any other value a plain (unshared)
 *             context is created on device index 0.
 * @throws std::string  when no platform/device is usable, or (through
 *                      checkError) when any OpenCL call fails.
 *
 * NOTE(review): d_memCounter / d_kernelCounter / d_textureCounter are not
 * initialised here — confirm they are initialised in the class declaration.
 */
openCLController::openCLController(int ctx) {

	// We probably need a platform/device chooser at one point
	const cl_uint usePlatform = 0; // The first available
	const cl_uint useDevice = 0; // This is used if interop is NOT to be used
	const cl_uint maxEntries = 10; // Capacity of the id arrays below

	cl_platform_id clPlatforms[maxEntries];
	cl_uint numPlatforms = 0;
	checkError(clGetPlatformIDs(maxEntries, clPlatforms, &numPlatforms), "Get platforms");

	if (!numPlatforms)
		throw std::string("No OpenCL platforms");

	// The reported count is the number of platforms available, which may be
	// larger than what fitted in the array: never index past what was filled.
	if (numPlatforms > maxEntries)
		numPlatforms = maxEntries;

	for (cl_uint i = 0; i < numPlatforms; ++i) {
		char platformName[1024];
		checkError(clGetPlatformInfo(clPlatforms[i], CL_PLATFORM_NAME, sizeof(platformName) - 1, platformName, NULL), "Get plat info");
		printf("Found OpenCL platform (%u/%u): %s", i+1, numPlatforms, platformName);
		if (usePlatform == i)
			printf(" (USING THIS)\n");
		else
			printf("\n");
	}

	cl_device_id clDevices[maxEntries];
	cl_uint numDevices = 0;
	checkError(clGetDeviceIDs(clPlatforms[usePlatform], CL_DEVICE_TYPE_GPU, maxEntries, clDevices, &numDevices), "Get devices");

	if (!numDevices)
		throw std::string("No OpenCL devices for this platform");

	// Same clamping as for the platforms
	if (numDevices > maxEntries)
		numDevices = maxEntries;

	const bool wantSharing = (ctx == -1);
	if (wantSharing)
		printf("Choosing the first device supporting GL sharing\n");

	d_clDev = NULL;
	for (cl_uint i = 0; i < numDevices; ++i) {
		char deviceName[1024];
		checkError(clGetDeviceInfo(clDevices[i], CL_DEVICE_NAME, sizeof(deviceName) - 1, deviceName, NULL), "Get device info");
		printf("Found GPU device (%u/%u): %s", i+1, numDevices, deviceName);

		// We consider picking this dev only if none is already chosen
		if (!d_clDev) {
			if (wantSharing) {
				// Checking for the sharing extension
				char extStrC[10240];
				checkError(clGetDeviceInfo(clDevices[i], CL_DEVICE_EXTENSIONS, sizeof(extStrC) - 1, extStrC, NULL), "Get device extensions");
				const std::string extStr(extStrC);
				if (extStr.find("cl_khr_gl_sharing") != std::string::npos) {
					d_clDev = clDevices[i];
					// We need to set some properties for sharing
					cl_context_properties clProps[] = {
						CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(),
						CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(),
						CL_CONTEXT_PLATFORM, (cl_context_properties)clPlatforms[usePlatform],
						0
					};
					cl_int rCode;
					d_clCtx = clCreateContext(clProps, 1, &d_clDev, NULL, NULL, &rCode);
					checkError(rCode, "Create shared context");

					printf(" (OpenGL shared)");
				}
				// A device without the extension is skipped, not fatal:
				// a later device may still support sharing.
			} else if (useDevice == i) {
				d_clDev = clDevices[i];
				// There's not much magic in creating an unshared context
				cl_int rCode;
				d_clCtx = clCreateContext(NULL, 1, &d_clDev, NULL, NULL, &rCode);
				checkError(rCode, "Create context");
			}
		}

		if (d_clDev == clDevices[i])
			printf(" (USING THIS)\n");
		else
			printf("\n");
	}

	if (!d_clDev)
		throw std::string(wantSharing ?
				"Couldn't find a shareable context" :
				"Couldn't find usable CL device");

	// Creating the command queue
	cl_int rCode;
	// FIXME: You should go to OUT-OF-ORDER command queues at some point,
	// but it requires either dependency definition from the application
	// or tweaks to the API where everything that the application asks
	// is done in-order but implemented out-of-order
	d_clCmdQueue = clCreateCommandQueue(d_clCtx, d_clDev, 0, &rCode);
	if (rCode != CL_SUCCESS)
		clReleaseContext(d_clCtx); // the destructor does not run when a constructor throws
	checkError(rCode, "Create cmd queue");

	// Instancing the keyword dictionary
	d_keywords = new openCLKeywords();
}

void openCLController::checkError(int rCode, std::string desc) {
	std::string errName;
	switch (rCode) {
		case CL_DEVICE_NOT_FOUND:
			errName = "CL_DEVICE_NOT_FOUND"; break;
		case CL_DEVICE_NOT_AVAILABLE:
			errName = "CL_DEVICE_NOT_AVAILABLE"; break;
		case CL_COMPILER_NOT_AVAILABLE:
			errName = "CL_COMPILER_NOT_AVAILABLE"; break;
		case CL_MEM_OBJECT_ALLOCATION_FAILURE:
			errName = "CL_MEM_OBJECT_ALLOCATION_FAILURE"; break;
		case CL_OUT_OF_RESOURCES:
			errName = "CL_OUT_OF_RESOURCES"; break;
		case CL_OUT_OF_HOST_MEMORY:
			errName = "CL_OUT_OF_HOST_MEMORY"; break;
		case CL_PROFILING_INFO_NOT_AVAILABLE:
			errName = "CL_PROFILING_INFO_NOT_AVAILABLE"; break;
		case CL_MEM_COPY_OVERLAP:
			errName = "CL_MEM_COPY_OVERLAP"; break;
		case CL_IMAGE_FORMAT_MISMATCH:
			errName = "CL_IMAGE_FORMAT_MISMATCH"; break;
		case CL_IMAGE_FORMAT_NOT_SUPPORTED:
			errName = "CL_IMAGE_FORMAT_NOT_SUPPORTED"; break;
		case CL_BUILD_PROGRAM_FAILURE:
			errName = "CL_BUILD_PROGRAM_FAILURE"; break;
		case CL_MAP_FAILURE:
			errName = "CL_MAP_FAILURE"; break;
		case CL_MISALIGNED_SUB_BUFFER_OFFSET:
			errName = "CL_MISALIGNED_SUB_BUFFER_OFFSET"; break;
		case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
			errName = "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; break;
		case CL_INVALID_VALUE:
			errName = "CL_INVALID_VALUE"; break;
		case CL_INVALID_DEVICE_TYPE:
			errName = "CL_INVALID_DEVICE_TYPE"; break;
		case CL_INVALID_PLATFORM:
			errName = "CL_INVALID_PLATFORM"; break;
		case CL_INVALID_DEVICE:
			errName = "CL_INVALID_DEVICE"; break;
		case CL_INVALID_CONTEXT:
			errName = "CL_INVALID_CONTEXT"; break;
		case CL_INVALID_QUEUE_PROPERTIES:
			errName = "CL_INVALID_QUEUE_PROPERTIES"; break;
		case CL_INVALID_COMMAND_QUEUE:
			errName = "CL_INVALID_COMMAND_QUEUE"; break;
		case CL_INVALID_HOST_PTR:
			errName = "CL_INVALID_HOST_PTR"; break;
		case CL_INVALID_MEM_OBJECT:
			errName = "CL_INVALID_MEM_OBJECT"; break;
		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
			errName = "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; break;
		case CL_INVALID_IMAGE_SIZE:
			errName = "CL_INVALID_IMAGE_SIZE"; break;
		case CL_INVALID_SAMPLER:
			errName = "CL_INVALID_SAMPLER"; break;
		case CL_INVALID_BINARY:
			errName = "CL_INVALID_BINARY"; break;
		case CL_INVALID_BUILD_OPTIONS:
			errName = "CL_INVALID_BUILD_OPTIONS"; break;
		case CL_INVALID_PROGRAM:
			errName = "CL_INVALID_PROGRAM"; break;
		case CL_INVALID_PROGRAM_EXECUTABLE:
			errName = "CL_INVALID_PROGRAM_EXECUTABLE"; break;
		case CL_INVALID_KERNEL_NAME:
			errName = "CL_INVALID_KERNEL_NAME"; break;
		case CL_INVALID_KERNEL_DEFINITION:
			errName = "CL_INVALID_KERNEL_DEFINITION"; break;
		case CL_INVALID_KERNEL:
			errName = "CL_INVALID_KERNEL"; break;
		case CL_INVALID_ARG_INDEX:
			errName = "CL_INVALID_ARG_INDEX"; break;
		case CL_INVALID_ARG_VALUE:
			errName = "CL_INVALID_ARG_VALUE"; break;
		case CL_INVALID_ARG_SIZE:
			errName = "CL_INVALID_ARG_SIZE"; break;
		case CL_INVALID_KERNEL_ARGS:
			errName = "CL_INVALID_KERNEL_ARGS"; break;
		case CL_INVALID_WORK_DIMENSION:
			errName = "CL_INVALID_WORK_DIMENSION"; break;
		case CL_INVALID_WORK_GROUP_SIZE:
			errName = "CL_INVALID_WORK_GROUP_SIZE"; break;
		case CL_INVALID_WORK_ITEM_SIZE:
			errName = "CL_INVALID_WORK_ITEM_SIZE"; break;
		case CL_INVALID_GLOBAL_OFFSET:
			errName = "CL_INVALID_GLOBAL_OFFSET"; break;
		case CL_INVALID_EVENT_WAIT_LIST:
			errName = "CL_INVALID_EVENT_WAIT_LIST"; break;
		case CL_INVALID_EVENT:
			errName = "CL_INVALID_EVENT"; break;
		case CL_INVALID_OPERATION:
			errName = "CL_INVALID_OPERATION"; break;
		case CL_INVALID_GL_OBJECT:
			errName = "CL_INVALID_GL_OBJECT"; break;
		case CL_INVALID_BUFFER_SIZE:
			errName = "CL_INVALID_BUFFER_SIZE"; break;
		case CL_INVALID_MIP_LEVEL:
			errName = "CL_INVALID_MIP_LEVEL"; break;
		case CL_INVALID_GLOBAL_WORK_SIZE:
			errName = "CL_INVALID_GLOBAL_WORK_SIZE"; break;
		//case CL_INVALID_PROPERTY_EXT:
		//    errName = "CL_INVALID_PROPERTY_EXT"; break;
		default:
			errName = "unknown error code"; break;
	}

	if (rCode != CL_SUCCESS)
		throw ((desc != "") ? std::string("Error in \"") + desc + "\": " : std::string("Error: ")) +
			errName;
}

// Releases the command queue and then the context created by the constructor.
openCLController::~openCLController() {
	// We should probably release all kernels, programs etc first..
	// NOTE(review): nothing stored in d_mem / d_kernel / d_texture, nor the
	// d_keywords object allocated in the constructor, is released here —
	// confirm whether they are cleaned up elsewhere.
	clReleaseCommandQueue(d_clCmdQueue);
	clReleaseContext(d_clCtx);
}

int openCLController::newBuffer(size_t size, const void *data) {
	int rCode;
	cl_mem devPtr;
	if (data) {
		devPtr = clCreateBuffer(d_clCtx, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, size, (void*)data, &rCode);
		checkError(rCode, "Create buffer from host data");
	} else {
		devPtr = clCreateBuffer(d_clCtx, CL_MEM_READ_WRITE, size, NULL, &rCode);
		checkError(rCode, "Create buffer from null data");
	}

	d_mem.insert(std::pair<int, cl_mem>(d_memCounter, devPtr));
	return d_memCounter++;
}

int openCLController::newKernel(std::string s) {
	openCLKernel *kernel = new openCLKernel(d_clCtx, d_clCmdQueue, d_clDev);

	kernel->addString(s);

	d_kernel.insert(std::pair<int, openCLKernel*>(d_kernelCounter, kernel));
	return d_kernelCounter++;
}

/**
 * Creates a kernel object backed by a KernelSrc and registers it.
 *
 * @param s  Source object attached to the new kernel (the pointer is stored).
 * @return Handle identifying the kernel in later calls.
 */
int openCLController::newKernel(KernelSrc *s) {
	openCLKernel *created = new openCLKernel(d_clCtx, d_clCmdQueue, d_clDev);
	created->addSource(s);

	// Register under the next free handle, then advance the counter
	const int handle = d_kernelCounter;
	d_kernel.insert(std::pair<int, openCLKernel*>(handle, created));
	d_kernelCounter++;
	return handle;
}

void openCLController::writeKernelSrc(int kernel, std::string fname) {
	// We tell the kernel to update the sources, and then we write them out
	d_kernel[kernel]->getSource()->write(fname);
}

// Nearest-neighbour sampling is not implemented for this back-end: the
// request is only reported on stderr and the argument is ignored.
void openCLController::nearestSampling(int t) {
	fputs("WARNING!  Nearest sampling not yet supported\n", stderr);
}

void openCLController::clearKernelParams(int kernel) {
	d_kernel[kernel]->clearParams();
}

void openCLController::setBufferParam(int kernel, int buffer) {
	d_kernel[kernel]->pushParam(d_mem[buffer]);
}

void openCLController::setDataParam(int kernel, size_t paramSize, void *paramData) {
	d_kernel[kernel]->pushParam(paramSize, paramData);
}

// Setting named kernel variables is not available in the OpenCL back-end;
// every call throws std::string and the arguments are ignored.
void openCLController::setVar(int kernel, std::string varName, size_t varSize, void *varData) {
	const std::string unsupported("Setvar not supported in OpenCL");
	throw unsupported;
}

void openCLController::clearBuffer(int buffer, void *fourB) {
	cl_event e;
	//float pattern = v;
	size_t size;
	// Reading the buffer size
	checkError(clGetMemObjectInfo(d_mem[buffer], CL_MEM_SIZE,
				sizeof(size_t), &size, NULL), "Mem obj. info");

	// FIXME:  Only supported in OpenCL 1.2
	#ifdef CL_VERSION_1_2
	// Filling it with "pattern"
	checkError(clEnqueueFillBuffer(d_clCmdQueue, d_mem[buffer],
				//&pattern, sizeof(float), 0, size, 0, NULL, &e), "Fill buffer");
				fourB, 4, 0, size, 0, NULL, &e), "Fill buffer");

	#else
	size_t prevSize = d_clearBuf.size()/sizeof(float);
	float pattern = *((float*)fourB);
	if (prevSize < size)
		while (prevSize++ < size)
			d_clearBuf.push_back(pattern);

	checkError(clEnqueueWriteBuffer(d_clCmdQueue,
				d_mem[buffer], true, 0, size, &d_clearBuf.at(0), 0, NULL, &e), "Enqueue write");
	#endif

	checkError(clWaitForEvents(1, &e), "Wait for events");
	checkError(clReleaseEvent(e), "Release event");
}

// Uploading into an existing buffer is not implemented for this back-end;
// every call throws std::string and the arguments are ignored.
void openCLController::uploadToBuffer(int bufId, void *data, size_t size) {
	const std::string notImplemented("Unimplemented 2");
	throw notImplemented;
}

void openCLController::setTex(int kernel, int tex, std::string refName) {
	d_kernel[kernel]->bindTexture(d_texture[tex], refName);
}

/*void openCLController::setTex3D(int kernel, int tex, std::string refName) {
	throw std::string("Unimplemented");
}*/

void openCLController::setKernelExecConf(int kernel, int totalX, int blockX, int totalY, int blockY) {
	d_kernel[kernel]->setExecConfig(totalX, blockX, totalY, blockY);
}

// Intentionally does nothing in this back-end: the body is empty and the
// kernel handle is ignored. It used to throw "Unimplemented" (kept below).
void openCLController::preferSM(int kernel) {
	//throw std::string("Unimplemented");
}

float openCLKernel::timedExecute(const int iters) {
	cl_event events[iters];

	execute(); // A normal blocking execution just to "flush"
	clFinish(d_clCmdQueue);

	struct timeval t_start, t_end;
	gettimeofday(&t_start, NULL);

	for (int i = 0; i < iters; ++i) {
		std::vector<cl_event> waitEvents;
		if (i)
			waitEvents.push_back(events[i-1]);

		execute(waitEvents, events + i); // Use the first event
	}

	openCLController::checkError(clWaitForEvents(1, events + iters - 1), "Wait for the last event");
	gettimeofday(&t_end, NULL);

	// Freeing the events
	for (int i = 0; i < iters; ++i)
		openCLController::checkError(clReleaseEvent(events[i]), "Free event");

	long int timedif = (t_end.tv_sec*1e6 + t_end.tv_usec) - (t_start.tv_sec*1e6 + t_start.tv_usec);
	return (double)timedif/1e3/(double)iters;
}

// Named kernel variables are not supported by the OpenCL back-end; every
// call throws std::string and the arguments are ignored.
void openCLKernel::setVar(std::string varName, size_t varSize, void *varData) {
	const std::string unsupported("Setvar not supported in OpenCL");
	throw unsupported;
}

/**
 * Benchmarks a kernel and prints its average run time and the matching fps.
 *
 * The kernel is first timed once. Unless that single run already takes at
 * least half of the burn time, it is then repeated for roughly the burn time
 * (1000 ms) to get a more accurate average.
 *
 * @param kernel  Kernel handle.
 * @throws std::string  if the handle is unknown or execution fails.
 */
void openCLController::reportTiming(int kernel) {
	openCLKernel *benchKernel = d_kernel[kernel];
	if (!benchKernel)
		throw std::string("No kernel for the handle");

	const float crudeTiming = benchKernel->timedExecute(1);
	const float burnTime = 1000.0f;
	// Upper bound for the repeat count: a zero or tiny crude timing (timer
	// resolution) would otherwise divide by zero or overflow the int below.
	const int maxLoops = 100000;
	float reportTime = crudeTiming;

	if (crudeTiming*2.0f < burnTime) {
		int loops = maxLoops;
		if (crudeTiming > 0.0f && burnTime/crudeTiming < (float)maxLoops)
			loops = (int)(burnTime/crudeTiming);
		//printf("Looping the kernel %d times..\n", loops);
		reportTime = benchKernel->timedExecute(loops);
	}

	printf("Kernel %s runs in %.3f ms (%.2f fps)\n",
			benchKernel->name().c_str(), reportTime, 1000.0f/reportTime);
}


int openCLController::makeGLTexture(unsigned int oglTex) {
	openCLTexture *tex = new openCLTexture(oglTex, d_clCtx, d_clCmdQueue);

	d_texture.insert(std::pair<int, openCLTexture*>(d_textureCounter, tex));
	return d_textureCounter++;
}

int openCLController::create3DTexture(int w, int h, int d, void *data, bool floatData) {
	openCLTexture3D *tex = new openCLTexture3D(w, h, d, data, floatData, d_clCtx, d_clCmdQueue);

	d_texture.insert(std::pair<int, openCLTexture*>(d_textureCounter, tex));
	return d_textureCounter++;
}

int openCLController::create2DTextureFromFile(int w, int h, int chans, std::string fName) {
	openCLTexture *tex = new openCLTexture(w, h, chans, fName, d_clCtx, d_clCmdQueue);

	d_texture.insert(std::pair<int, openCLTexture*>(d_textureCounter, tex));
	return d_textureCounter++;
}


void *openCLController::getBufferData(int buffer, size_t &size) {
	// First we get the size
	checkError(clGetMemObjectInfo(d_mem[buffer], CL_MEM_SIZE, sizeof(size_t), &size, NULL), "Get mem info");
	void *data = malloc(size);
	cl_event e;
	checkError(clEnqueueReadBuffer(d_clCmdQueue, d_mem[buffer],
				true, 0, size, data, 0, NULL, &e), "Read buffer");
	clWaitForEvents(1, &e);
	checkError(clReleaseEvent(e), "Free event");

	printf("Returning %d bytes of data\n", size);
	return data;
}

// Direct buffer pointers are not implemented for this back-end; every call
// throws std::string and the handle is ignored.
void *openCLController::getBufferPtr(int buffer) {
	const std::string notImplemented("Unimplemented 5");
	throw notImplemented;
}

void openCLController::executeKernel(int kernel) {
	d_kernel[kernel]->execute();
}

void openCLController::fillTexFromBuffer(int tex, int buffer) {
	cl_event e;
	size_t startPos[] = {
		0,
		0,
		0
	};
	size_t readSize[] = {
		d_texture[tex]->width(),
		d_texture[tex]->height(),
		1
	};

	// FIXME:  Chain these together with events
	if (d_texture[tex]->needRemapping())
		d_texture[tex]->map();

	checkError(clEnqueueCopyBufferToImage(d_clCmdQueue,
				d_mem[buffer], *d_texture[tex]->getMemPtr(),
				0, startPos, readSize, 0, NULL,
				&e), "Copy buffer to image");

	checkError(clWaitForEvents(1, &e), "Waiting for events");
	
	if (d_texture[tex]->needRemapping())
		d_texture[tex]->unMap();

	checkError(clReleaseEvent(e), "Free event");
}


/**
 * Creates a read-only 2D float texture from a raw data file.
 *
 * The file must contain exactly w * h * chans floats; it is mapped into
 * memory, copied into the image at creation, and unmapped again.
 *
 * @param w, h        Texture dimensions in pixels (must be positive).
 * @param chans       Floats per pixel (must be positive). One channel selects
 *                    CL_R, anything else CL_RGBA.
 * @param fName       Path of the raw data file.
 * @param clCtx       Context the image is created in.
 * @param clCmdQueue  Command queue stored for later use.
 * @throws std::string  on invalid dimensions, file errors, a size mismatch or
 *                      an OpenCL failure.
 *
 * NOTE(review): for 2 or 3 channels the image format is CL_RGBA while the
 * pixel size is still chans * sizeof(float) — confirm that only 1 and 4
 * channels are meant to be supported.
 */
openCLTexture::openCLTexture(int w, int h, int chans, std::string fName, cl_context clCtx, cl_command_queue clCmdQueue) :
	d_width(w),
	d_height(h),
	d_channels(chans),
	d_clCtx(clCtx),
	d_clCmdQueue(clCmdQueue)
{
	d_dims = 2;
	d_needRemapping = false;
	d_floatData = true;

	if (w <= 0 || h <= 0 || chans <= 0)
		throw std::string("Invalid texture dimensions");

	// Computed in size_t from the start to avoid overflowing int arithmetic
	const size_t pixelSize = static_cast<size_t>(chans)*sizeof(float);
	const size_t size = static_cast<size_t>(w) * static_cast<size_t>(h) * pixelSize;

	const int fd = open(fName.c_str(), O_RDONLY);
	if (fd == -1)
		throw std::string("Can't open texture file");

	struct stat filestat;
	if (fstat(fd, &filestat) == -1) {
		// Reported like every other failure here instead of ending the process
		close(fd);
		throw std::string("Can't stat texture file");
	}

	if (filestat.st_size < 0 || size != static_cast<size_t>(filestat.st_size)) {
		close(fd);
		throw std::string("Odd amount of texture data");
	}

	void *data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
	// The mapping stays valid after the descriptor is closed
	close(fd);
	if (data == MAP_FAILED)
		throw std::string("Can't map texture file");

	// Creating the texture
	cl_image_format clFormat;
	clFormat.image_channel_order = (d_channels == 1) ? CL_R : CL_RGBA;
	clFormat.image_channel_data_type = CL_FLOAT;

	cl_int rCode;

	#ifdef CL_VERSION_1_2
	cl_image_desc imageDesc;
	memset(&imageDesc, 0, sizeof(imageDesc));
	imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
	imageDesc.image_width = d_width;
	imageDesc.image_height = d_height;
	imageDesc.image_depth = 1;
	imageDesc.image_array_size = 1;
	imageDesc.image_row_pitch = pixelSize*d_width;
	imageDesc.image_slice_pitch = 0;
	imageDesc.num_mip_levels = 0;
	imageDesc.num_samples = 0;
	imageDesc.buffer = NULL;

	d_clMem = clCreateImage(d_clCtx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			&clFormat, &imageDesc, data, &rCode);
	#else
	d_clMem = clCreateImage2D(d_clCtx,
				CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
				&clFormat, d_width, d_height,
				d_width*pixelSize,
				data, &rCode);
	#endif

	// CL_MEM_COPY_HOST_PTR copies at creation, so the mapping can go before
	// the status is checked (checkError throws on failure).
	munmap(data, size);

	openCLController::checkError(rCode, "clCreateImage2D");
}

/**
 * Wraps an existing OpenGL 2D texture (mip level 0) as a read-only OpenCL
 * image and reads its size and format back.
 *
 * The texture is marked as needing remapping, i.e. it has to be acquired
 * with map() before OpenCL uses it and released with unMap() afterwards.
 *
 * @param oglTex      OpenGL texture name.
 * @param clCtx       Context (created with GL sharing) the image is created in.
 * @param clCmdQueue  Command queue used by map()/unMap().
 * @throws std::string  on OpenCL failures or an unsupported channel order
 *                      (only CL_R and CL_RGBA are recognised).
 */
openCLTexture::openCLTexture(unsigned int oglTex, cl_context clCtx, cl_command_queue clCmdQueue) :
	d_oglHandle(oglTex),
	d_clCtx(clCtx),
	d_clCmdQueue(clCmdQueue),
	d_clMem(NULL)
{
	cl_int rCode;

	#ifdef CL_VERSION_1_2
	d_clMem = clCreateFromGLTexture(d_clCtx, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, oglTex, &rCode);
	#else
	d_clMem = clCreateFromGLTexture2D(d_clCtx, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, oglTex, &rCode);
	#endif
	openCLController::checkError(rCode, "Create from GL tex");

	// The texture properties can alternatively be read through the Texture
	// helper class; enable with:
	//#define USEGLTEXINFO

	#ifdef USEGLTEXINFO
	Texture texInfo(GL_TEXTURE_2D, oglTex);

	d_width = texInfo.width();
	d_height = texInfo.height();
	d_channels = texInfo.channels();
	d_floatData = texInfo.channelSize() == 32 ? true : false;

	printf("Created an OpenCL texture: %d x %d,  %d channels,  %d b each\n",
			d_width, d_height, d_channels, d_floatData ? 32 : 8);
	#else

	try {
		// clGetImageInfo writes sizeof(size_t) bytes: read into real size_t
		// locals and convert afterwards instead of writing into the members.
		size_t imgWidth = 0;
		size_t imgHeight = 0;
		size_t elemSize = 0;
		openCLController::checkError(clGetImageInfo(d_clMem, CL_IMAGE_WIDTH, sizeof(size_t), &imgWidth, NULL), "Get image info");
		openCLController::checkError(clGetImageInfo(d_clMem, CL_IMAGE_HEIGHT, sizeof(size_t), &imgHeight, NULL), "Get image info");
		openCLController::checkError(clGetImageInfo(d_clMem, CL_IMAGE_ELEMENT_SIZE, sizeof(size_t), &elemSize, NULL), "Get image info");

		cl_image_format imageFormat;
		openCLController::checkError(clGetImageInfo(d_clMem, CL_IMAGE_FORMAT, sizeof(cl_image_format), &imageFormat, NULL), "Get image info");

		int channels;
		if (imageFormat.image_channel_order == CL_R) {
			channels = 1;
		} else if (imageFormat.image_channel_order == CL_RGBA) {
			channels = 4;
		} else {
			printf("Channel order %u\n", (unsigned int)imageFormat.image_channel_order);
			throw std::string("Unknown image channel order");
		}

		d_width = imgWidth;
		d_height = imgHeight;
		d_channels = channels;
		// Four bytes per channel is taken to mean float data
		d_floatData = (elemSize/channels) == 4;

		printf("Created an OpenCL texture: %d x %d,  %d channels,  %d b each\n",
				(int)imgWidth, (int)imgHeight, channels, (int)((elemSize*8)/channels));
	} catch (...) {
		// A throwing constructor never runs the destructor: drop the image here
		clReleaseMemObject(d_clMem);
		d_clMem = NULL;
		throw;
	}
	#endif

	d_dims = 2;
	d_needRemapping = true;
}

void openCLTexture::map() {
	//cl_event e;
	//glFinish();
	openCLController::checkError(clEnqueueAcquireGLObjects(d_clCmdQueue, 1, &d_clMem, 0, NULL, NULL), "Acquire GL objects (map)");
	//openCLController::checkError(clWaitForEvents(1, &e), "Wait for events");
}

void openCLTexture::unMap() {
	//cl_event e;
	//clFinish(d_clCmdQueue);
	openCLController::checkError(clEnqueueReleaseGLObjects(d_clCmdQueue, 1, &d_clMem, 0, NULL, NULL), "Acquire GL objects (unmap)");
	//openCLController::checkError(clWaitForEvents(1, &e), "Wait for events");
}

/**
 * Stores the common texture properties without creating any OpenCL object.
 * Used by openCLTexture3D, which creates the image itself.
 *
 * @param w, h       Texture dimensions.
 * @param floatData  Whether the texture holds float data.
 * @param clCtx      Context to create the image in later.
 * @param cmdQueue   Command queue stored for later use.
 */
openCLTexture::openCLTexture(int w, int h, bool floatData, cl_context clCtx, cl_command_queue cmdQueue) :
	d_width(w),
	d_height(h),
	d_floatData(floatData),
	d_clCtx(clCtx),
	d_clCmdQueue(cmdQueue),
	d_needRemapping(true)
{
	// Give the remaining members defined values until the caller sets the
	// real ones, so the accessors never return indeterminate data.
	d_clMem = NULL;
	d_oglHandle = 0;
	d_channels = 0;
	d_dims = 2;
}

/**
 * Creates a read-only, single-channel 3D image initialised from host data.
 *
 * @param w, h, d     Image dimensions.
 * @param data        Host data copied into the image at creation.
 * @param floatData   true: 4-byte float texels (CL_FLOAT);
 *                    false: 1-byte texels (CL_UNORM_INT8).
 * @param clCtx       Context the image is created in.
 * @param clCmdQueue  Command queue stored by the base class.
 * @throws std::string (through checkError) if image creation fails.
 */
openCLTexture3D::openCLTexture3D(int w, int h, int d, void *data, bool floatData, cl_context clCtx, cl_command_queue clCmdQueue) :
	openCLTexture(w, h, floatData, clCtx, clCmdQueue),
	d_depth(d)
{
	// Fixed properties of this texture type
	d_channels = 1; // always single channel
	d_dims = 3;
	d_needRemapping = false; // This isn't shared ATM..  Should  be configurable eventually

	const size_t bytesPerTexel = d_floatData ? 4 : 1;

	cl_image_format texelFormat;
	texelFormat.image_channel_data_type = d_floatData ? CL_FLOAT : CL_UNORM_INT8;
	texelFormat.image_channel_order = CL_R;

	int status;

	#ifdef CL_VERSION_1_2
	const size_t rowPitch = bytesPerTexel*d_width;

	cl_image_desc desc;
	desc.image_type = CL_MEM_OBJECT_IMAGE3D;
	desc.image_width = d_width;
	desc.image_height = d_height;
	desc.image_depth = d_depth;
	desc.image_array_size = 1;
	desc.image_row_pitch = rowPitch;
	desc.image_slice_pitch = rowPitch*d_height;
	desc.num_mip_levels = 0;
	desc.num_samples = 0;
	desc.buffer = NULL;

	d_clMem = clCreateImage(d_clCtx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			&texelFormat, &desc, data, &status);
	#else
	d_clMem = clCreateImage3D(d_clCtx,
				CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
				&texelFormat, d_width, d_height, d_depth,
				d_width*bytesPerTexel, d_height*d_width*bytesPerTexel,
				data, &status);
	#endif

	openCLController::checkError(status, "clCreateImage3D");
}

bool openCLTexture::needRemapping() {
	return d_needRemapping;
}

// Returns the address of the stored cl_mem handle (not a copy of the handle),
// in the form clSetKernelArg expects for its argument value.
cl_mem *openCLTexture::getMemPtr() {
	cl_mem *handleAddr = &d_clMem;
	return handleAddr;
}

int openCLTexture::dims() {
	return d_dims;
}

int openCLTexture::channels() {
	return d_channels;
}

bool openCLTexture::floatData() {
	return d_floatData;
}

int openCLTexture::width() {
	return d_width;
}

int openCLTexture::height() {
	return d_height;
}



/**
 * Creates an empty kernel: no source, no parameters, nothing compiled and no
 * execution configuration.
 *
 * @param ctx  Context used to create the program, kernel and sampler.
 * @param que  Command queue the kernel is enqueued on.
 * @param dev  Device the program is built for.
 */
openCLKernel::openCLKernel(cl_context ctx, cl_command_queue que, cl_device_id dev) :
	d_clDev(dev),
	d_clCmdQueue(que),
	d_clCtx(ctx),
	d_compiled(false),
	d_srcGenerated(false),
	d_paramsSet(false),
	d_totalThreadsX(0),
	d_threadBlockX(0),
	d_totalThreadsY(0),
	d_threadBlockY(0)
{
	// These are only assigned later (addSource() / compile()); start them out
	// as NULL so that a missing source or an uncompiled kernel is at least a
	// well-defined state instead of an indeterminate pointer.
	d_kernelSrc = NULL;
	d_clProgram = NULL;
	d_clKernel = NULL;
}

// Forgets all pushed parameters, so the next execute() sets the kernel
// arguments again from whatever is pushed afterwards.
void openCLKernel::clearParams() {
	d_paramsSet = false;

	// pushParam() stores a malloc()'ed copy of every value: give them back
	// before dropping the list.
	for (size_t i = 0; i < d_params.size(); ++i)
		free(d_params.at(i).second);
	d_params.clear();
}

/**
 * Stores the 2D execution configuration and validates it.
 *
 * The values are stored before validation, so after a throw the kernel holds
 * the rejected configuration.
 *
 * @param totalX  Total number of threads in X.
 * @param blockX  Thread block size in X; must be positive and divide totalX.
 * @param totalY  Total number of threads in Y.
 * @param blockY  Thread block size in Y; must be positive and divide totalY.
 * @throws std::string  if a block size is not positive or does not divide the
 *                      matching total.
 */
void openCLKernel::setExecConfig(int totalX, int blockX, int totalY, int blockY) {
	d_totalThreadsX = totalX;
	d_threadBlockX = blockX;
	d_totalThreadsY = totalY;
	d_threadBlockY = blockY;

	// Checked first: the divisibility tests below would divide by zero
	if (blockX <= 0 || blockY <= 0)
		throw std::string("Thread block size must be positive");

	if (totalX % blockX)
		throw std::string("Total threads not divisible by thread block size (X)");
	if (totalY % blockY)
		throw std::string("Total threads not divisible by thread block size (Y)");
}

// Appends (refName, tex) to the list of bound textures. The order of binding
// is the order in which the textures are declared and passed to the kernel.
void openCLKernel::bindTexture(openCLTexture *tex, std::string refName) {
	const std::pair<std::string, openCLTexture*> binding(refName, tex);
	d_boundTextures.push_back(binding);
}

// Appends the given text to the kernel's source string.
void openCLKernel::addString(std::string a) {
	d_kernelSrcString.append(a);
}

// Attaches a source object to the kernel. Only the pointer is stored; any
// previously attached source is replaced.
void openCLKernel::addSource(KernelSrc *s) {
	this->d_kernelSrc = s;
}

// Returns the device this kernel was created for.
cl_device_id openCLKernel::getDev() {
	return this->d_clDev;
}

/**
 * Inserts the parameter declarations of all bound textures into the kernel
 * source string, as a comma separated list in binding order:
 * "__read_only image2d_t a, __read_only image3d_t b".
 *
 * @param stringPos  Position in the source string where the list is inserted.
 */
void openCLKernel::addTextureDecls(int stringPos) {
	if (d_boundTextures.empty())
		return;

	// Build the whole list front to back, then insert it in one go
	std::string decls;
	for (size_t idx = 0; idx < d_boundTextures.size(); ++idx) {
		if (idx != 0)
			decls += ", ";

		decls += "__read_only ";
		decls += (d_boundTextures.at(idx).second->dims() == 3) ? "image3d_t " : "image2d_t ";
		decls += d_boundTextures.at(idx).first;
	}

	d_kernelSrcString.insert(stringPos, decls);
}

/**
 * Extracts the kernel name from the attached source: the text between the
 * first "__kernel void " and the following "(", without surrounding
 * whitespace.
 *
 * @param needPrefix  Out (may be NULL): false when the parameter list is
 *                    empty, i.e. only whitespace before ")"; true otherwise.
 * @return The kernel name.
 * @throws std::string  if no source is attached or the declaration cannot be
 *                      found / is malformed.
 */
std::string openCLKernel::findKernel(bool *needPrefix) {
	// NOTE(review): this check only helps if d_kernelSrc is NULL-initialised
	// before addSource() is called — confirm in the constructor/header.
	if (!d_kernelSrc)
		throw std::string("No kernel source attached");

	const std::string source = d_kernelSrc->getSrc();
	const std::string marker("__kernel void ");
	const char *blanks = " \t\r\n";

	// We search for "__kernel void "
	size_t pos = source.find(marker);
	if (pos == std::string::npos)
		throw std::string("Didn't find kernel o_O");
	pos += marker.length();

	// Now we find "("
	const size_t endPos = source.find('(', pos);
	if (endPos == std::string::npos)
		throw std::string("Didn't find the kernel parameter list");

	// The name, trimmed: a name with a stray blank would not match the
	// compiled kernel later on
	const size_t nameBegin = source.find_first_not_of(blanks, pos);
	const size_t nameLast = source.find_last_not_of(blanks, endPos - 1);
	if (nameBegin >= endPos || nameLast < nameBegin)
		throw std::string("Kernel declaration has no name");
	const std::string kernelName = source.substr(nameBegin, nameLast - nameBegin + 1);

	// Seeing if we have any params
	const size_t firstParamChar = source.find_first_not_of(blanks, endPos + 1);
	if (firstParamChar == std::string::npos)
		throw std::string("Unterminated kernel parameter list");
	if (needPrefix)
		*needPrefix = (source[firstParamChar] != ')');

	printf("Found kernel name %s\n", kernelName.c_str());
	return kernelName;
}

/**
 * Generates the final kernel source once.
 *
 * Looks up the kernel name, builds the declarations of all bound textures
 * (followed by a sampler parameter when there is at least one texture),
 * substitutes them for the "#TEXDECL#" anchor of the attached source and
 * caches the resulting text. Later calls do nothing.
 *
 * @throws std::string  from findKernel() when the kernel declaration cannot
 *                      be located.
 */
void openCLKernel::genSources() {
	if (d_srcGenerated)
		return;

	// Find the tags and fill in the code
	// First the textures:  We first generate a string of the tex decls
	bool needPrefix;
	d_kernelName = findKernel(&needPrefix);

	// Nothing to separate from when no texture declarations are added
	if (d_boundTextures.empty())
		needPrefix = false;

	std::string texDecls = needPrefix ? ", " : "";

	for (size_t i = 0; i < d_boundTextures.size(); ++i) {
		// Only the small, fixed-shape type name goes through a char buffer;
		// the caller-supplied reference name is appended as a string so its
		// length cannot overflow anything.
		char type[32];
		snprintf(type, sizeof(type), "image%dd_t", d_boundTextures.at(i).second->dims());

		texDecls += "__read_only ";
		texDecls += type;
		texDecls += " ";
		texDecls += d_boundTextures.at(i).first;

		// Always followed by a comma: the sampler comes last
		texDecls += ", ";
	}

	// Sampler
	if (!d_boundTextures.empty())
		texDecls += std::string("sampler_t TEXSAMPLER_LINEAR");

	// And then replace the anchor with it
	d_kernelSrc->replace("#TEXDECL#", texDecls);

	// And write out the string
	d_kernelSrcString = d_kernelSrc->getSrc();

	d_srcGenerated = true;
}

// Enqueues the kernel with an empty wait list and without requesting a
// completion event. Does not wait for the kernel to finish.
void openCLKernel::execute() {
	execute(std::vector<cl_event>(), NULL);
}

std::string openCLKernel::name() {
	return d_kernelName;
}

void openCLKernel::execute(std::vector<cl_event> waitEvents, cl_event *kernelEvent) {
	if (!d_totalThreadsX || !d_threadBlockX || !d_totalThreadsY || !d_threadBlockY)
		throw std::string("Trying to execute a kernel with unset execution configuration");

	//if (!d_compiled) {
		compile();
		/*d_compiled = true;
	}*/

	if (!d_paramsSet) {
		// First adding the normal parameters
		int par;
		for (par = 0; par < d_params.size(); ++par)
			openCLController::checkError(clSetKernelArg(d_clKernel, par, d_params.at(par).first, d_params.at(par).second), "Set kernel param (param)");

		// Then we add the texture references
		for (int texCount = 0; texCount < d_boundTextures.size(); texCount++, par++) {
			//printf("Setting param %d\n", par+1);
			openCLController::checkError(clSetKernelArg(
						d_clKernel, 
						par, 
						sizeof(cl_mem), 
						d_boundTextures.at(texCount).second->getMemPtr()), 
					"Set kernel param (tex)");
		}

		// If we had textures, we embed a sampler
		if (d_boundTextures.size()) {
			int rCode;
			cl_sampler linearSampler = clCreateSampler(d_clCtx, true, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_LINEAR, &rCode);
			//cl_sampler linearSampler = clCreateSampler(d_clCtx, true, CL_ADDRESS_MIRRORED_REPEAT, CL_FILTER_LINEAR, &rCode);
			openCLController::checkError(rCode, "Create sampler");
			openCLController::checkError(clSetKernelArg(
						d_clKernel, par,
						sizeof(cl_sampler), &linearSampler), "Set kernel arg (sampler)");
		}

		d_paramsSet = true;
	}

	size_t totalThreads[] = {
		d_totalThreadsX, 
		d_totalThreadsY
	};

	size_t threadBlock[] = {
		d_threadBlockX,
		d_threadBlockY
	};

	const bool reMap = true;

	if (reMap)
		for (int i = 0; i < (int)d_boundTextures.size(); ++i)
			if (d_boundTextures[i].second->needRemapping())
				d_boundTextures[i].second->map();

	openCLController::checkError(clEnqueueNDRangeKernel(d_clCmdQueue, d_clKernel,
				2, NULL, totalThreads, threadBlock, waitEvents.size(), waitEvents.empty() ? NULL : &waitEvents.at(0), kernelEvent), "Run kernel");

	if (reMap)
		for (int i = 0; i < (int)d_boundTextures.size(); ++i)
			if (d_boundTextures[i].second->needRemapping())
				d_boundTextures[i].second->unMap();
}

// Convenience overload: pushes a copy of `value` as a kernel parameter of
// sizeof(T) bytes. The bytes are copied by the (size, pointer) overload, so
// the by-value argument only needs to live for this call.
template<class T> void openCLKernel::pushParam(T value) {
	pushParam(sizeof(value), &value);
}

/**
 * Appends a kernel parameter; the bytes at paramData are copied, so the
 * caller's storage may go away after the call.
 *
 * @param paramSize  Size of the parameter in bytes.
 * @param paramData  Pointer to the value. NULL is stored as-is with the given
 *                   size (presumably useful for __local arguments, where
 *                   clSetKernelArg takes a size and a NULL value — confirm).
 * @throws std::string  if host memory for the copy cannot be allocated.
 */
void openCLKernel::pushParam(size_t paramSize, void *paramData) {
	void *copy = NULL;
	if (paramData) {
		// malloc(0) may return NULL: ask for at least one byte so that NULL
		// always means "out of memory" here
		copy = malloc(paramSize ? paramSize : 1);
		if (!copy)
			throw std::string("Out of host memory while storing kernel parameter");
		memcpy(copy, paramData, paramSize);
	}
	d_params.push_back(std::pair<size_t, void*>(paramSize, copy));
}

void openCLKernel::compile() {
	if (!d_compiled) {
		genSources();

		/*if (d_boundTextures.size()) {
			// We add a ", " if there's prior params
			if (d_params.size()) {
				d_kernelSrcString.insert(kernelEnd, ", ");
				kernelEnd += 2;
			}

			addTextureDecls(kernelEnd);
		}*/

		const char *cStr = d_kernelSrcString.c_str();
		size_t strSize = d_kernelSrcString.length();
		int rCode;
		d_clProgram = clCreateProgramWithSource(d_clCtx, 1, &cStr, &strSize, &rCode);
		openCLController::checkError(rCode, "Create program");

		openCLController::checkError(clBuildProgram(d_clProgram, 1, &d_clDev, 
					"-cl-mad-enable -cl-fast-relaxed-math",
					&buildNotify, this), "Build program");

		d_clKernel = clCreateKernel(d_clProgram, d_kernelName.c_str(), &rCode);

		openCLController::checkError(rCode, "Create kernel");

		d_compiled = true;
	}
}

// Makes sure the sources have been generated, then returns the attached
// source object (the stored pointer, not a copy).
KernelSrc *openCLKernel::getSource() {
	this->genSources();
	KernelSrc *current = d_kernelSrc;
	return current;
}

// Fetches the build log of program p for the device of the kernel passed in
// data, and reports it.
//
// p:    the program whose build log is queried.
// data: pointer to the openCLKernel the program belongs to.
//
// A log longer than one character is printed and then reported by throwing
// std::string("Build notify"). Query failures go through
// openCLController::checkError.
void openCLKernel::buildNotify(cl_program p, void *data) {
	openCLKernel *kernel = static_cast<openCLKernel*>(data);

	// Ask for the log size first: a fixed-size buffer makes the query fail
	// for long logs, which hides the actual compiler output
	size_t logSize = 0;
	openCLController::checkError(clGetProgramBuildInfo(p,
				kernel->getDev(),
				CL_PROGRAM_BUILD_LOG,
				0, NULL, &logSize), "Get program build info");

	// Zero-filled and one byte larger than the log, so the buffer is always
	// a valid C string, even if nothing gets written into it
	std::vector<char> msg(logSize + 1, '\0');
	if (logSize)
		openCLController::checkError(clGetProgramBuildInfo(p,
					kernel->getDev(),
					CL_PROGRAM_BUILD_LOG,
					logSize, &msg[0], NULL), "Get program build info");

	if (strlen(&msg[0]) > 1) {
		printf("Program (%s) buildnotify:\n***\n%s\n***\n", kernel->name().c_str(), &msg[0]);
		throw std::string("Build notify");
	}
}

// Empty constructor; performs no work.
openCLKeywords::openCLKeywords() {}

// OpenCL expression for the work-group id along dimension 0, cast to int.
std::string openCLKeywords::blockX() {
	return "(int)get_group_id(0)";
}

// OpenCL expression for the work-group id along dimension 1, cast to int.
std::string openCLKeywords::blockY() {
	return "(int)get_group_id(1)";
}

// OpenCL expression for the work-group size along dimension 0, cast to int.
std::string openCLKeywords::blockDimX() {
	return "(int)get_local_size(0)";
}

// OpenCL expression for the work-group size along dimension 1, cast to int.
std::string openCLKeywords::blockDimY() {
	return "(int)get_local_size(1)";
}

// OpenCL expression for the local work-item id along dimension 0, cast to int.
std::string openCLKeywords::threadX() {
	return "(int)get_local_id(0)";
}

// OpenCL expression for the local work-item id along dimension 1, cast to int.
std::string openCLKeywords::threadY() {
	return "(int)get_local_id(1)";
}

// OpenCL expression for the global work-item id along dimension 0, cast to int.
std::string openCLKeywords::globalThreadX() {
	return "(int)get_global_id(0)";
}

// OpenCL expression for the global work-item id along dimension 1, cast to int.
std::string openCLKeywords::globalThreadY() {
	return "(int)get_global_id(1)";
}

// Prefixes every pointer parameter of a comma-separated parameter list with
// "__global ".
//
// params is split at each ','; a piece containing '*' gets the prefix, all
// other pieces are kept as they are (including their surrounding
// whitespace). The pieces are then joined again with ", ".
std::string openCLKeywords::globalizePointers(std::string params) {
	// Split at the commas; the text after the last comma (or the whole
	// string when there is none) forms the final piece
	std::vector<std::string> pieces;
	size_t pieceStart = 0;
	for (;;) {
		const size_t commaPos = params.find(",", pieceStart);
		if (commaPos == std::string::npos) {
			pieces.push_back(params.substr(pieceStart));
			break;
		}
		pieces.push_back(params.substr(pieceStart, commaPos - pieceStart));
		pieceStart = commaPos + 1;
	}

	// Re-join, qualifying the pointers on the way
	std::string result;
	for (size_t k = 0; k < pieces.size(); ++k) {
		if (k != 0)
			result += ", ";

		const std::string &piece = pieces[k];
		if (piece.find("*") != std::string::npos)
			result += "__global ";
		result += piece;
	}

	return result;
}

// Builds the declaration line of a kernel: "__kernel void name(params",
// with the pointer parameters run through globalizePointers(), followed by
// the "#TEXDECL#" anchor and the closing parenthesis.
std::string openCLKeywords::kernelDecl(std::string name, std::string params) {
	std::string decl = "__kernel void ";
	decl += name;
	decl += "(";
	decl += globalizePointers(params);
	decl += "#TEXDECL#)";
	return decl;
}

// Builds the declaration of a helper function: "ret name(params anchors)".
//
// The pointer parameters are run through globalizePointers(). For every
// entry of tex a "#TEX[<entry>]#" anchor is appended, separated by ", ";
// the anchors are separated from the regular parameters by ", " when both
// are present.
std::string openCLKeywords::funcDecl(std::string name, std::string ret, std::string params, std::vector<std::string> tex) {
	std::string anchors;
	if (!params.empty() && !tex.empty())
		anchors = ", ";

	for (size_t t = 0; t < tex.size(); ++t) {
		if (t != 0)
			anchors += ", ";
		anchors += "#TEX[" + tex[t] + "]#";
	}

	return ret + " " + name + "(" + globalizePointers(params) + anchors + ")";
}

// Emits a float2 vector literal: "(float2)(valX, valY)".
std::string openCLKeywords::float2Ctor(std::string valX, std::string valY) {
	std::string code = "(float2)(";
	code += valX;
	code += ", ";
	code += valY;
	code += ")";
	return code;
}

// Name of the half precision type.
std::string openCLKeywords::halfType() {
	return std::string("half");
}

// Address space qualifier emitted for "reg" data: "__private".
std::string openCLKeywords::regType() {
	return std::string("__private");
}

// Address space qualifier emitted for "shared" data: "__local".
std::string openCLKeywords::sharedType() {
	return std::string("__local");
}

// Address space qualifier emitted for "global" data: "__global".
std::string openCLKeywords::globalType() {
	return std::string("__global");
}

// Emits a vstore_half2() call that stores src into dest.
//
// ptr == ""  : dest is treated as a variable; its address is taken, cast to
//              a __private half pointer and written at offset 0 (the emitted
//              code ends with a newline in this case).
// otherwise  : dest is treated as a pointer, written at the given offset;
//              ptr selects the address space of the cast and has to be
//              "reg", "shared" or "global".
//
// Throws std::string for any other ptr value.
std::string openCLKeywords::writeHalf2(std::string dest, std::string src, std::string ptr, std::string offset) {
	if (ptr.empty())
		return "vstore_half2(" + src + ", 0, (__private half*)&" + dest + ")\n";

	std::string castStart;
	if (ptr == "reg")
		castStart = "(__private";
	else if (ptr == "shared")
		castStart = "(__local";
	else if (ptr == "global")
		castStart = "(__global";
	else
		throw std::string("Ptr type has to be either reg, shared, or global");

	return "vstore_half2(" + src + "," + offset + ", " + castStart + " half*)" + dest + ")";
}
// Emits a vstore_half() call storing src at offset 0 of the address of
// dest, which is cast to a __global half pointer.
std::string openCLKeywords::writeHalf(std::string dest, std::string src) {
	std::string code = "vstore_half(";
	code += src;
	code += ", 0, (__global half*)&";
	code += dest;
	code += ")";
	return code;
}

// Emits an assignment of a vload_half2() result to dest.
//
// ptr == ""  : src is treated as a variable; its address is taken, cast to
//              a __private half pointer and read at offset 0 (the emitted
//              code ends with a newline in this case).
// otherwise  : src is treated as a pointer, read at the given offset; ptr
//              selects the address space of the cast and has to be "reg",
//              "shared" or "global".
//
// Throws std::string for any other ptr value.
std::string openCLKeywords::readHalf2(std::string dest, std::string src, std::string ptr, std::string offset) {
	if (ptr.empty())
		return dest + " = vload_half2(0, (__private half*)&" + src + ")\n";

	std::string castStart;
	if (ptr == "reg")
		castStart = "(__private";
	else if (ptr == "shared")
		castStart = "(__local";
	else if (ptr == "global")
		castStart = "(__global";
	else
		throw std::string("Ptr type has to be either reg, shared, or global");

	return dest + " = vload_half2(" + offset + ", " + castStart + " half*)" + src + ")";
}

// Emits a float3 vector literal: "(float3)(valX, valY, valZ)".
std::string openCLKeywords::float3Ctor(std::string valX, std::string valY, std::string valZ) {
	std::string code = "(float3)(";
	code += valX;
	code += ", ";
	code += valY;
	code += ", ";
	code += valZ;
	code += ")";
	return code;
}

// Emits a float4 vector literal: "(float4)(valX, valY, valZ, valW)".
std::string openCLKeywords::float4Ctor(std::string valX, std::string valY, std::string valZ, std::string valW) {
	std::string code = "(float4)(";
	code += valX;
	code += ", ";
	code += valY;
	code += ", ";
	code += valZ;
	code += ", ";
	code += valW;
	code += ")";
	return code;
}

// Emits "atomic_min(dest, value)".
std::string openCLKeywords::atomicMin(std::string dest, std::string value) {
	std::string call = "atomic_min(";
	call += dest;
	call += ", ";
	call += value;
	call += ")";
	return call;
}

// Emits "atomic_max(dest, value)".
std::string openCLKeywords::atomicMax(std::string dest, std::string value) {
	std::string call = "atomic_max(";
	call += dest;
	call += ", ";
	call += value;
	call += ")";
	return call;
}

// Emits a work-group barrier with a local memory fence.
std::string openCLKeywords::localSync() {
	return "barrier(CLK_LOCAL_MEM_FENCE)";
}

// Qualifier emitted for shared memory: "__local".
std::string openCLKeywords::sharedMem() {
	return "__local";
}

// Qualifier emitted for constant memory: "__constant".
std::string openCLKeywords::constMem() {
	return "__constant";
}

// Emits the division a / b as a native_divide() call rather than as the
// plain "/" operator.
std::string openCLKeywords::div(std::string a, std::string b) {
	std::string call = "native_divide(";
	call += a;
	call += ", ";
	call += b;
	call += ")";
	return call;
}

// Emits "floor(v)".
std::string openCLKeywords::floorf(std::string v) {
	return "floor(" + v + ")";
}

// Emits the reciprocal of v as "native_recip(v)".
std::string openCLKeywords::rcp(std::string v) {
	return "native_recip(" + v + ")";
}

// Emits the square root of v as "native_sqrt(v)".
std::string openCLKeywords::sqrt(std::string v) {
	return "native_sqrt(" + v + ")";
}

// Emits the inverse square root of v as "native_rsqrt(v)".
std::string openCLKeywords::rsqrt(std::string v) {
	return "native_rsqrt(" + v + ")";
}

// Emits a raised to the power b as "native_powr(a, b)".
std::string openCLKeywords::pow(std::string a, std::string b) {
	std::string call = "native_powr(";
	call += a;
	call += ", ";
	call += b;
	call += ")";
	return call;
}

// Emits the sine of v as "native_sin(v)".
std::string openCLKeywords::sin(std::string v) {
	return "native_sin(" + v + ")";
}

// Emits the cosine of v as "native_cos(v)".
std::string openCLKeywords::cos(std::string v) {
	return "native_cos(" + v + ")";
}

// Emits two to the power v as "native_exp2(v)".
std::string openCLKeywords::exp2(std::string v) {
	return "native_exp2(" + v + ")";
}

// Emits a braced block that computes sine and cosine of one angle.
//
// The angle expression is first stored in a local constant (sinCosAngle),
// so it appears only once in the generated code; sintarget and costarget
// are then assigned native_sin() / native_cos() of that constant.
std::string openCLKeywords::sincos(std::string angle, std::string sintarget, std::string costarget) {
	std::string block;
	block += "{\n\t\tconst float sinCosAngle = ";
	block += angle;
	block += ";\n\t\t";
	block += sintarget;
	block += " = native_sin(sinCosAngle);\n\t\t";
	block += costarget;
	block += " = native_cos(sinCosAngle);\n\t}\n";
	return block;
}

// Emits "max(a, b)".
std::string openCLKeywords::maxf(std::string a, std::string b) {
	std::string call = "max(";
	call += a;
	call += ", ";
	call += b;
	call += ")";
	return call;
}

// Emits "min(a, b)".
std::string openCLKeywords::minf(std::string a, std::string b) {
	std::string call = "min(";
	call += a;
	call += ", ";
	call += b;
	call += ")";
	return call;
}

// Emits the absolute value of a as "fabs(a)".
std::string openCLKeywords::absf(std::string a) {
	return "fabs(" + a + ")";
}

// Declares a 2D texture named id as an "image2d_t".
std::string openCLKeywords::tex2DDeclFloat(std::string id) {
	return "image2d_t " + id;
}

// Declares a 3D texture named id as an "image3d_t".
//
// The function used to have no return statement at all, which is undefined
// behaviour for a function returning std::string. image3d_t is the image
// type that can be sampled with the float4 coordinate emitted by
// tex3DSample().
std::string openCLKeywords::tex3DDeclFloat(std::string id) {
	return std::string("image3d_t ") + id;
}

// Declares a four-channel 2D texture named id as an "image2d_t".
//
// The function used to have no return statement at all, which is undefined
// behaviour for a function returning std::string. The declaration is the
// same as in tex2DDeclFloat(): the image type does not encode the channel
// count, and tex2DSample4() reads all four channels with read_imagef().
std::string openCLKeywords::tex2DDeclFloat4(std::string id) {
	return std::string("image2d_t ") + id;
}

// Emits a read_imagef() lookup into the 2D image id at (coordX, coordY)
// using TEXSAMPLER_LINEAR, keeping only the .x component of the result.
std::string openCLKeywords::tex2DSample1(std::string id, std::string coordX, std::string coordY) {
	std::string lookup = "read_imagef(";
	lookup += id;
	lookup += ", TEXSAMPLER_LINEAR, (float2)(";
	lookup += coordX;
	lookup += ", ";
	lookup += coordY;
	lookup += ")).x";
	return lookup;
}

// Emits a read_imagef() lookup into the 2D image id at (coordX, coordY)
// using TEXSAMPLER_LINEAR; the complete result vector is kept.
std::string openCLKeywords::tex2DSample4(std::string id, std::string coordX, std::string coordY) {
	std::string lookup = "read_imagef(";
	lookup += id;
	lookup += ", TEXSAMPLER_LINEAR, (float2)(";
	lookup += coordX;
	lookup += ", ";
	lookup += coordY;
	lookup += "))";
	return lookup;
}

// Emits a read_imagef() lookup into the image id using TEXSAMPLER_LINEAR.
// The coordinate is passed as a float4 (coordX, coordY, coordZ, 0.0f) and
// only the .x component of the result is kept.
std::string openCLKeywords::tex3DSample(std::string id, std::string coordX, std::string coordY, std::string coordZ) {
	std::string lookup = "read_imagef(";
	lookup += id;
	lookup += ", TEXSAMPLER_LINEAR, (float4)(";
	lookup += coordX;
	lookup += ", ";
	lookup += coordY;
	lookup += ", ";
	lookup += coordZ;
	lookup += ", 0.0f)).x";
	return lookup;
}

// Returns an empty string: no float2 operator code is emitted here.
std::string openCLKeywords::float2Operators() {
	return std::string();
}

// Returns an empty string: no float3 operator code is emitted here.
std::string openCLKeywords::float3Operators() {
	return std::string();
}

// Returns the OpenCL C source of the vector helper functions as one string.
//
// The emitted helpers are SSEOnormalize3/2, SSEOdot3/2, SSEOlength3/2 and
// SSEOcross3; normalisation and length are written with native_rsqrt() and
// native_sqrt(). The text is a single string literal continued over several
// lines, so every character between the quotes (including the indentation)
// is part of the generated code.
std::string openCLKeywords::vectorOperators() {
	return std::string("\n\
inline float3 SSEOnormalize3(const float3 a) {\n\
	float coef = native_rsqrt(a.x*a.x + a.y*a.y + a.z*a.z);\n\
	return (float3)(a.x*coef, a.y*coef, a.z*coef);\n\
}\n\
\n\
inline float2 SSEOnormalize2(const float2 a) {\n\
	float coef = native_rsqrt(a.x*a.x + a.y*a.y);\n\
	return (float2)(a.x*coef, a.y*coef);\n\
}\n\
\n\
inline float SSEOdot3(const float3 a, const float3 b) {\n\
	return a.x*b.x + a.y*b.y + a.z*b.z;\n\
}\n\
\n\
inline float SSEOdot2(const float2 a, const float2 b) {\n\
	return a.x*b.x + a.y*b.y;\n\
}\n\
\n\
inline float SSEOlength3(const float3 a) {\n\
	return native_sqrt(a.x*a.x + a.y*a.y + a.z*a.z);\n\
}\n\
\n\
inline float SSEOlength2(const float2 a) {\n\
	return native_sqrt(a.x*a.x + a.y*a.y);\n\
}\n\
\n\
inline float3 SSEOcross3(const float3 a, const float3 b) {\n\
	return (float3)(\n\
			a.y*b.z - a.z*b.y,\n\
			a.z*b.x - a.x*b.z,\n\
			a.x*b.y - a.y*b.x);\n\
}\n\n"); 
}

// Returns the text placed at the top of generated sources.
//
// It always defines LANG_OPENCL; when h is true the pragma enabling the
// cl_khr_fp16 extension is appended.
std::string openCLKeywords::header(bool h) {
	const std::string langDefine = "#define LANG_OPENCL\n\n";

	if (!h)
		return langDefine;

	return langDefine + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n\n";
}

// File extension used for OpenCL sources.
std::string openCLKeywords::fileExt() {
	return ".cl";
}
