#include "sseo.h"

#include "config.h"
#include "control.h"
#ifdef HAVE_CUDA
	#include "gpgpu_cuda.h"
#endif
#include "gpgpu_opencl.h"

#include <cstdio>
#include <sys/time.h>
#include <string.h>
#include <cmath>
#include <algorithm>


using namespace SSEO;


/*
 * Creates the global SSEO controller (g_sseoCtrl).
 *
 * depthNormalTex, normalTex, resultTex: OpenGL texture ids, forwarded to SSEOController
 * projection: four floats copied into projection matrix elements (0,0), (1,1), (2,0) and (2,1)
 * borderW, borderH, K, falloffDecay, backend: forwarded unchanged to SSEOController
 *
 * Returns 0 on success.
 * Throws std::string (prefixed with "Couldn't init SSEO: ") if projection is NULL or
 * if the controller reports a std::string error during construction.
 */
int SSEO::init(unsigned int depthNormalTex, unsigned int normalTex, unsigned int resultTex, float *projection,
		int borderW, int borderH, int K, float falloffDecay, GPGPU_Backend backend) {
	// The four projection coefficients are read unconditionally below
	if (projection == NULL)
		throw std::string("Couldn't init SSEO: ") + "No projection parameters given";

	try {
		Matrix4 projMatrix;
		projMatrix.elem(0,0) = projection[0];
		projMatrix.elem(1,1) = projection[1];
		projMatrix.elem(2,0) = projection[2];
		projMatrix.elem(2,1) = projection[3];
		g_sseoCtrl = new SSEOController(depthNormalTex, normalTex, resultTex, projMatrix, borderW, borderH, K, falloffDecay, backend);
	} catch (const std::string &e) { // By reference: no copy of the message while handling it
		throw std::string("Couldn't init SSEO: ") + e;
	}

	return 0;
}

// Destroys the global controller.  The pointer is reset afterwards so that a
// repeated uninit() is a harmless no-op instead of a double delete.
void SSEO::uninit() {
	delete g_sseoCtrl;
	g_sseoCtrl = NULL;
}

void *SSEO::getResultBuf() {
	return g_sseoCtrl->getResultBuf();
}

void SSEO::computeLight(bool report) {
	g_sseoCtrl->execute(report);
}

// The single controller instance behind the SSEO:: free functions in this file:
// created by init(), destroyed by uninit(), dereferenced by getResultBuf() and computeLight().
SSEOController *SSEO::g_sseoCtrl;

/*
 * Allocates and fills the SSEOConfiguration used by the controller.
 *
 * w, h:             height field dimensions
 * borderW, borderH: subtracted from w/h to get the occlusion dimensions
 * K:                number of directions
 * falloffDecay:     falloff coefficient
 *
 * Everything else is hard-coded below.  Returns a new-allocated configuration.
 * Throws std::string if the settings fail a sanity check; the configuration is
 * released before the exception leaves this function.
 */
SSEOConfiguration *SSEOController::genConfig(int w, int h, int borderW, int borderH, int K, float falloffDecay) {
	SSEOConfiguration *cfg = new SSEOConfiguration();

	try {
		// The meaning of these is explained in kernelCommon.h
		cfg->hfWidth() = w;
		cfg->hfHeight() = h;
		cfg->occWidth() = w - borderW; // Use smaller dimensions here to calculate occlusion inside guard bands
		cfg->occHeight() = h - borderH;

		cfg->mode() = 0; // The main mode of operation:
		// 0  Normal
		// 1  Brute-ground
		// 2  Brute with only the highest occlusion
		// 3  HBAO implementation using OpenGL fragment shaders, max slope occlusion (SIGGRAPH poster version)
		// 4  The above, but incremental slope (SIGGRAPH presentation slides "Per-Sample Attenuation" version)
		cfg->hbaoSteps() = 16; // How many steps to take for each direction

		// (NOT REALLY USED IN THE "EYESTEP" IMPLEMENTATION:)
		cfg->hbaoStepLen() = sqrtf(2.0f); // How long steps (a multiplier for texel width) to take

		cfg->dirs() = K; // K: How many directions to process, this also applies to HBAO
		cfg->dirOffset() = 0.0f; // Where does the first direction start, used for axis alignment
		cfg->usePreset() = true; // Pick directions along a box around receiver..  Accepted K are 8n, n in Z+

		cfg->fallOff() = falloffDecay; // Falloff coefficient, i.e. r in falloffFunction(distance) = r/(r + distance^2)

		cfg->optiPath() = true; // This is an execution path whose "sweep" kernel is specially optimized
		// It locks down the following options:
		// Gatherbuffer = false
		// Matchopposite = true
		// UseV4Buffers = false
		// UseHalfBuffers = true
		// TangentType = 0
		// JitterTangent = false

		// The following are specific to optiPath:
		cfg->unrollAmount() = 4; // How many iterations to manually unroll
		cfg->unrollStorage() = 4; // Where to store unrolled data
		// 0  In registers (disables unrolling)
		// NOT IN USE 1  In linear memory (global thread-specific memory)
		// NOT IN USE 2  Shared memory
		//    Since the above were not an optimization, we introduce storage modes for samples only:
		// 3  Linear memory
		// 4  Shared memory
		// 6  Use #pragma unroll

		cfg->hullStorage() = 2; // Where to store the "convex" hull
		// 0  Linear memory
		// 1  Global memory (each thread allocated linearly)
		// 2  Shared memory [hullIndex][threadIndex]
		// 3  Shared memory [threadIndex][hullIndex]
		cfg->hullSize() = 16; // How many elements to allocate
		cfg->hullType() = 0;
		// 0  Float2
		// 1  Half2 (unsigned int)
		cfg->hullThresholdType() = 0; // 0:  just an offset, 1:  (1.0f + threshold) ratio
		cfg->hullThreshold() = 0.001f; // Within which occlusion threshold hull elements are considered equal
		                               // 0.0015f is okay (tested on dragon)
		cfg->stopAtMax() = true; // Whether or not stop unraveling the hull beyond max horizon

		cfg->gatherBuffer() = false; // Whether to decouple reading/writing during accumulation
		cfg->matchOpposite() = true;
		cfg->useV4Buffers() = false; // Applies to matchOpposite..  Takes 25% more memory but allows vectorized loads
		cfg->useHalfBuffers() = true; // Applies to matchOpposite..  Takes 33% less memory AND allows vectorized load
		cfg->sweepSurface() = false; // Whether to use surfaces ("textures") for sweep data.  Texture cache has 2D locality and shorter cache lines which often make it a better match for gathering during accumulationKernel

		// WARNING: Experimental
		cfg->occlusionScatter() = false; // Instead of gathering the receiver's neighborhood in a separate kernel, splat the contribution during sweeping

		cfg->stepInterpolation() = 1; // No need to do the (slow) hybrid when usePreset() == true
		// 0  Linear interpolation
		// 1  Snap to texel centers
		// 2  Hybrid:  Snap only when @ edge

		cfg->tangentType() = 0; // Use 1 for HBAO
		// 0  No tangent
		// 1  Tilt from previous/next step (min difference)
		// 2  Pick the tilt from previous/current or current/next based on which have a smaller depth difference
		// 3  Read the scene normal from normal map
		cfg->fixedTangent() = -0.85f; // If tangentType 0 is used, this is the fixed tangent.

		cfg->lineSkip() = 1.0f; // Coefficient for D_L (Every D_L:th line is processed)

		// The following are used to "balance" D_L such that K/(D_S*D_L) stays constant:
		if (cfg->usePreset() && cfg->dirs() == 16) cfg->lineSkip() = 0.675f;
		if (cfg->usePreset() && cfg->dirs() == 24) cfg->lineSkip() = 0.475f;
		if (cfg->usePreset() && cfg->dirs() == 32) cfg->lineSkip() = 0.3625f;

		cfg->stepSkip() = 1.0f; // Coefficient for D_S (Every D_S:th step is processed).  Overridden if usePreset() == true

		cfg->jitterTangent() = false; // Not used

		cfg->sweepStripe() = 32;
		cfg->sweepStorageStripe() = 256; // Having this different than sweepStripe is especially useful for sweepSurface() 'cause its limited dimensions
		cfg->sweepBlock() = 32; // 64 for Fermi, 32 for Tahiti

		cfg->accBlockX() = 16;
		cfg->accBlockY() = 16;

		cfg->edgeAwareAcc() = 1; // 0:  Off, 1:  Use heights baked into sweep data, 2:  Sample heights from coords
		cfg->edgeThreshold() = 1.008f; // 1.008f is good
		cfg->includeMin() = false; // Whether to always include the closest match..  This never hurts the quality, but rarely makes any difference either.  You lose a couple dozen microseconds

		// How many nearest values to gather for each final screen pixel..  use cfg->dirs() here to gather one from each K
		cfg->linesPerPixel() = cfg->dirs();


		// Configuration sanity checks..
		if (cfg->unrollStorage() == 0)
			cfg->unrollAmount() = 0;

		if (cfg->occlusionScatter() && cfg->unrollAmount() != 1)
			throw std::string("OcclusionScatter() works only without unrolling");

		if (!cfg->sweepSurface())
			cfg->sweepStorageStripe() = cfg->sweepStripe();

		if (!cfg->edgeAwareAcc())
			cfg->includeMin() = false; // This makes no sense without edge awareness
		if (cfg->matchOpposite())
			cfg->linesPerPixel() /= 2;

		if (cfg->useV4Buffers() && cfg->useHalfBuffers())
			throw std::string("Conflicting buffer modes");

		if (cfg->mode() != 0)
			cfg->optiPath() = false;

		if (cfg->optiPath() && (
					cfg->gatherBuffer() != false ||
					cfg->matchOpposite() != true ||
					cfg->useV4Buffers() != false ||
					cfg->useHalfBuffers() != true ||
					cfg->tangentType() != 0 ||
					cfg->jitterTangent() != 0)) /* ||
					cfg->edgeAwareAcc() != 1))*/
			throw std::string("OptiPath can't be used with this config");

		if (cfg->sweepSurface() && !cfg->useHalfBuffers())
			throw std::string("Surface only implemented for half float buffers");
	} catch (...) {
		// The caller never receives cfg when a check throws, so it has to be released here
		delete cfg;
		throw;
	}

	return cfg;
}


/*
 * Validates the input textures, generates the configuration and sets up the
 * GPGPU backend: input textures, line info buffer, line index buffer, and the
 * sweep and accumulate kernels.  Modes above 2 hand over to initHBAO() instead.
 *
 * depthNormalTex: OpenGL texture; 4 channels when used alone, 1 channel when normalTex is given
 * normalTex:      optional OpenGL texture with 4 channels; 0 selects the combined source
 * resultTex:      OpenGL texture that receives the result
 * projection:     stored and copied into the configuration
 * borderW, borderH, K, falloffDecay: forwarded to genConfig()
 * backend:        SSEO_OPENCL or SSEO_CUDA
 *
 * Throws std::string on unsupported textures or on any setup failure.  Resources
 * created before a setup failure are released, since the destructor does not run
 * for an object whose constructor throws.
 */
SSEOController::SSEOController(unsigned int depthNormalTex, 
		unsigned int normalTex,
		unsigned int resultTex, 
		Matrix4 projection,
		int borderW, int borderH,
		int K, float falloffDecay,
		GPGPU_Backend backend) :
	d_config(NULL),
	d_projection(projection),
	d_backend(backend) {

	// Give the destructor a defined pointer on paths that never create a backend
	// controller in this function (the HBAO early return below)
	d_gpgpuCtrl = NULL;

	const bool separateSources = (normalTex != 0);

	int w = OpenGLController::getTexWidth(depthNormalTex);
	int h = OpenGLController::getTexHeight(depthNormalTex);

	int channels = OpenGLController::getTexChannels(depthNormalTex);
	int componentSize = OpenGLController::getTexComponentSize(depthNormalTex);

	// Checking the first input tex.  All texture checks run before the configuration
	// is allocated, so a rejected texture leaves nothing behind to clean up.
	if (channels != (separateSources ? 1 : 4)) {
		fprintf(stderr, "Got %d channels\n", channels);
		throw std::string("Not the kind of channel conf we were expecting");
	}
	if (componentSize < 32) { 
		fprintf(stderr, "Got %d sized channels\n", componentSize);
		throw std::string("We were only prepared for 32b floating point channels");
	}

	if (!normalTex)
		printf("Using a combined height+normal source texture\n");
	else {
		channels = OpenGLController::getTexChannels(normalTex);
		componentSize = OpenGLController::getTexComponentSize(normalTex);

		if (channels != 4) {
			fprintf(stderr, "Got %d channels\n", channels);
			throw std::string("We need an input normal texture with 4 channels");
		}
		if (componentSize < 32) { 
			fprintf(stderr, "Got %d sized channels\n", componentSize);
			throw std::string("We were only prepared for 4 byte floating point channels in the normal texture");
		}
		
		printf("Using separate normal and height source textures\n");
	}

	d_config = genConfig(w, h, borderW, borderH, K, falloffDecay);
	d_config->projMatrix() = d_projection;
	d_config->separateSources() = separateSources;

	d_glDepthNormalTex = depthNormalTex; 
	d_glNormalTex = normalTex;
	d_glResultTex = resultTex;

	try {
		printf("Preparing to init for (%d+%d)x(%d+%d) height field, %d directions\n",
				w-borderW, borderW, h-borderH, borderH, d_config->dirs());

		// We're taking a different control path for HBAO
		if (d_config->mode() > 2) {
			initHBAO();
			return;
		}

		if (d_backend == SSEO_OPENCL)
			d_gpgpuCtrl = (gpgpuController*) new openCLController(-1);
		else if (d_backend == SSEO_CUDA) {
			#ifdef HAVE_CUDA
			d_gpgpuCtrl = (gpgpuController*) new cudaController(-1);
			#else
			throw std::string("CUDA is not supported by this system");
			#endif
		} else 
			throw std::string("Unimplemented GPGPU backend");


		// Creating the input textures in GPGPU
		// (gpgpuNormal is only assigned and only read when separate sources are in use)
		int gpgpuNormal, gpgpuDepthNormal;
		gpgpuDepthNormal = d_gpgpuCtrl->makeGLTexture(depthNormalTex);
		if (d_config->separateSources())
			gpgpuNormal = d_gpgpuCtrl->makeGLTexture(normalTex);
				
		if (d_config->stepInterpolation() == 1) {
			d_gpgpuCtrl->nearestSampling(gpgpuDepthNormal);
			if (d_config->separateSources())
				d_gpgpuCtrl->nearestSampling(gpgpuNormal);
		}


		/*
		 * LINE PARAMS
		 */
		// Information that the threads use to coordinate work in projection phase
		LineParams *lp = new LineParams(d_config);
		size_t size;
		struct LineInfo *li = lp->getWork(&size);
		d_liMem = d_gpgpuCtrl->newBuffer(size, li);
		// We need the vector later on
		const size_t lineCount = size/sizeof(struct LineInfo);
		std::vector<struct LineInfo> liVec(li, li + lineCount);
		// NOTE(review): li is indexed as an array, so this may need to be delete[] (or free),
		// and lp itself is never released -- confirm the ownership rules in LineParams.
		delete li;

		int totalThreads = static_cast<int>(lineCount);
		printf("Allocated %.0f kB on the device for %d LineInfos\n", (float)size/1024.0f, totalThreads);


		
		/*
		 * LINE INDEXES
		 */
		// Only assigned and only read when occlusionScatter() is off
		size_t sweepBufSize;
		if (!d_config->occlusionScatter()) {
			sweepBufSize = d_config->sweepWidth()*
				d_config->sweepHeight()*
				sizeof(float);

			if (d_config->edgeAwareAcc() == 1) {
				if (d_config->matchOpposite()) {
					if (d_config->useV4Buffers())
						sweepBufSize *= 4;
					else if (d_config->useHalfBuffers())
						sweepBufSize *= 2;
					else
						sweepBufSize *= 3;
					// FIXME:  Kludge alert..  We might be stepping on other threads' toes
					//         and also stepping over the edge, when it comes to oppositedirs.
					//         Therefore we space the buffers apart by a fixed amount, which doesn't take much mem.
					sweepBufSize += 20000;
				} else
					sweepBufSize *= 2;
			} else {
				if (d_config->matchOpposite()) {
					if (!d_config->useHalfBuffers())
						throw std::string("MatchOpposite and no-edgeAwareness matches only with halfBuffers");
				}
			}

			LineIndex indexGen(d_config);
			indexGen.setLI(&liVec);
			indexGen.checkBounds(sweepBufSize/sizeof(float));
			unsigned int *indexData;
			size_t indexSize = indexGen.genData(&indexData);
			d_lineIndexMem = d_gpgpuCtrl->newBuffer(indexSize, indexData);
			printf("Allocated %.0f MB on the device for line index and height data\n", 
					(float)indexSize/1024.0f/1024.0f);
			free(indexData);
		}



		/* 
		 * SWEEP KERNEL
		 */
		// This is not needed until accumulate kernel if occlusionScatter() is not used
		d_resultTex = d_gpgpuCtrl->makeGLTexture(resultTex);
		d_resultMem = d_gpgpuCtrl->newBuffer(sizeof(float)*w*h);

		SweepKernel *sweepKernSrc = new SweepKernel();
		sweepKernSrc->setConfig(d_config);
		sweepKernSrc->setKeywords(d_gpgpuCtrl->getKeywords());

		if (!d_config->occlusionScatter()) {
			if (d_config->sweepSurface()) {
				d_sweepSurface = d_gpgpuCtrl->create2DSurface(d_config->sweepWidth(), d_config->sweepHeight(), 
						d_config->edgeAwareAcc() ? 2 : 1); // float2 or float
			} else {
				d_sweepBuffer = d_gpgpuCtrl->newBuffer(sweepBufSize);
			}
			printf("And %.0f MB for sweep buffers\n",
				   (float)sweepBufSize/1024.0f/1024.0f);
		}

		d_sweepKernel = d_gpgpuCtrl->newKernel(sweepKernSrc);

		// The actual kernel
		d_gpgpuCtrl->setBufferParam(d_sweepKernel, d_liMem);

		if (d_config->occlusionScatter()) {
			d_gpgpuCtrl->setBufferParam(d_sweepKernel, d_resultMem);
		} else {
			if (d_config->sweepSurface()) {
				d_gpgpuCtrl->setSurf(d_sweepKernel, d_sweepSurface, "sweepSurf");
			} else 
				d_gpgpuCtrl->setBufferParam(d_sweepKernel, d_sweepBuffer);
		}

		if (d_config->hullStorage() == 1) { // We're allocating the hull storage from global mem
			int hullStorage = d_gpgpuCtrl->newBuffer(sizeof(int) * 
					(d_config->hullType() ? 1 : 2) * d_config->hullSize() *
					totalThreads);
			d_gpgpuCtrl->setBufferParam(d_sweepKernel, hullStorage);
		}

		if (d_config->separateSources()) {
			d_gpgpuCtrl->setTex(d_sweepKernel, gpgpuDepthNormal, "depthTex");
			d_gpgpuCtrl->setTex(d_sweepKernel, gpgpuNormal, "normalTex");
		} else
			d_gpgpuCtrl->setTex(d_sweepKernel, gpgpuDepthNormal, "depthNormalTex");

		d_gpgpuCtrl->setKernelExecConf(d_sweepKernel, totalThreads, d_config->sweepBlock());
		d_gpgpuCtrl->preferSM(d_sweepKernel);
		
		d_gpgpuCtrl->writeKernelSrc(d_sweepKernel, "sweep");



		/*
		 * OPPOSITE COMBINE KERNEL
		 */
		if (d_config->matchOpposite() && false) { // Not used since it's faster to gather during accumulate
			OppositeKernel oppKernSrc;
			oppKernSrc.setConfig(d_config);
			oppKernSrc.setKeywords(d_gpgpuCtrl->getKeywords());
			d_oppositeKernel = d_gpgpuCtrl->newKernel(&oppKernSrc);

			d_gpgpuCtrl->setBufferParam(d_oppositeKernel, d_sweepBuffer);
			d_gpgpuCtrl->setBufferParam(d_oppositeKernel, d_mirrorBuffer);
			int blockX = 16;
			int blockY = 16;
			int alignX = (d_config->sweepWidth()+blockX-1)/blockX*blockX;
			int alignY = (d_config->sweepHeight()+blockY-1)/blockY*blockY;
			d_gpgpuCtrl->setKernelExecConf(d_oppositeKernel, 
					alignX, blockX,
					alignY, blockY);
			d_gpgpuCtrl->writeKernelSrc(d_oppositeKernel, "opposite");
			d_gpgpuCtrl->reportTiming(d_oppositeKernel);
		}



		/*
		 * ACCUMULATE KERNEL
		 */
		if (!d_config->occlusionScatter()) {
			AccumulateKernel *accKernSrc = new AccumulateKernel();
			accKernSrc->setConfig(d_config);
			accKernSrc->setKeywords(d_gpgpuCtrl->getKeywords());
			d_accKernel = d_gpgpuCtrl->newKernel(accKernSrc);
			//d_gpgpuCtrl->preferSM(d_accKernel);
			// As input we need:
			// The actual visibility/occlusion data

			if (d_config->sweepSurface()) {
				d_gpgpuCtrl->setSurf(d_accKernel, d_sweepSurface, "sweepSurf");
			} else 
				d_gpgpuCtrl->setBufferParam(d_accKernel, d_sweepBuffer);

			// Line indices
			d_gpgpuCtrl->setBufferParam(d_accKernel, d_lineIndexMem);
			// And the result buffer (output)
			d_gpgpuCtrl->setBufferParam(d_accKernel, d_resultMem);

			if (d_config->edgeAwareAcc()) {
				if (d_config->separateSources()) {
					d_gpgpuCtrl->setTex(d_accKernel, gpgpuDepthNormal, "depthTex");
				} else
					d_gpgpuCtrl->setTex(d_accKernel, gpgpuDepthNormal, "depthNormalTex");
			}

			// Round the occlusion area up to whole accumulate blocks
			int alignX = (d_config->occWidth()+d_config->accBlockX()-1)/d_config->accBlockX()*d_config->accBlockX();
			int alignY = (d_config->occHeight()+d_config->accBlockY()-1)/d_config->accBlockY()*d_config->accBlockY();
			d_gpgpuCtrl->setKernelExecConf(d_accKernel, 
					alignX, d_config->accBlockX(),
					alignY, d_config->accBlockY());
			d_gpgpuCtrl->writeKernelSrc(d_accKernel, "accumulate");
		}

	} catch (const std::string &e) {
		// The destructor is not invoked for a constructor that throws,
		// so whatever was created so far has to be released here
		delete d_gpgpuCtrl;
		d_gpgpuCtrl = NULL;
		delete d_config;
		d_config = NULL;
		throw std::string("Couldn't initialize SSEO Controller: ") + e;
	}

	d_frameCounter = 1;
	d_time = getTime();
}

// For debugging purposes, raw sweeps can be brought to OpenGL instead of the light map.
//
// Reads back the sweep buffer and the result buffer, copies the first direction's
// sweep values (halved when matchOpposite() is on) into the result data as grey
// values, and uploads the result data again.
// Throws std::string when edgeAwareAcc() == 1, which is not supported here.
void SSEOController::exportLayerData() {
	if (d_config->edgeAwareAcc() == 1)
		throw std::string("Export layer data not supported with edgeAwareAcc 1");

	size_t layerDataSize;
	float *data = (float*)d_gpgpuCtrl->getBufferData(d_sweepBuffer, layerDataSize);

	size_t resultDataSize;
	float *resultData = (float*)d_gpgpuCtrl->getBufferData(d_resultMem, resultDataSize);

	// We loop over the result data elems, and read in data from the layers..
	// Only the area covered by both the height field and the sweep data is visited.
	int minX = (d_config->hfWidth() < d_config->sweepWidth()) ? d_config->hfWidth() : d_config->sweepWidth();
	int minY = (d_config->hfHeight() < d_config->sweepHeight()) ? d_config->hfHeight() : d_config->sweepHeight();
	for (int x = 0; x < minX; ++x)
		for (int y = 0; y < minY; ++y) {
			const int dir = 0; // Only the first direction's layer is exported
			int index = dir*d_config->sweepHeight() + y;
			index *= d_config->sweepWidth();
			index += x;

			/*r = (data[index] + 10.0f)/20.0f;
			g = (data[index] + 10.0f)/20.0f;*/
			const float grey = (d_config->matchOpposite() ? 0.5f : 1.0f) *
				data[index];

			// NOTE(review): four floats per texel are addressed here, while the constructor
			// creates d_resultMem with sizeof(float)*w*h -- confirm the intended layout.
			resultData[(y*d_config->hfWidth() + x)*4 + 0] = grey;
			resultData[(y*d_config->hfWidth() + x)*4 + 1] = grey;
			resultData[(y*d_config->hfWidth() + x)*4 + 2] = grey;
		}

	d_gpgpuCtrl->uploadToBuffer(d_resultMem, resultData, resultDataSize);
	// Both read-back copies are released the same way; the sweep copy used to be leaked
	free(data);
	free(resultData);
}

/*
 * Runs one update: the sweep kernel, then either the accumulate kernel or the
 * debug layer export, and finally copies the result buffer into the result texture.
 * Modes above 2 are delegated to executeHBAO().
 *
 * report: when true, the frame counter advances and kernel timings are printed
 *         on the one frame where the counter equals reportFrames.
 */
void SSEOController::execute(bool report) {
	if (d_config->mode() > 2) {
		executeHBAO();
		return;
	}

	const int reportFrames = 10;
	const bool showLayerData = false;
	const bool oppositePassEnabled = false; // The opposite-combine kernel is switched off

	// Scattering accumulates into the result buffer, so it starts from zero
	if (d_config->occlusionScatter()) {
		float zero = 0.0f;
		d_gpgpuCtrl->clearBuffer(d_resultMem, &zero);
	}

	d_gpgpuCtrl->executeKernel(d_sweepKernel);

	if (d_config->matchOpposite() && oppositePassEnabled)
		d_gpgpuCtrl->executeKernel(d_oppositeKernel);

	if (showLayerData) {
		exportLayerData();
	} else if (!d_config->occlusionScatter()) {
		d_gpgpuCtrl->executeKernel(d_accKernel);
	}

	d_gpgpuCtrl->fillTexFromBuffer(d_resultTex, d_resultMem);

	// Let's report the overall update frequency here.
	// The counter only moves on frames for which a report was requested.
	if (!report)
		return;

	++d_frameCounter;
	if ((d_frameCounter % reportFrames) != 0)
		return;
	if (!(d_frameCounter < reportFrames + 1))
		return;

	printf("\n");
	d_gpgpuCtrl->reportTiming(d_sweepKernel);
	if (d_config->matchOpposite() && oppositePassEnabled)
		d_gpgpuCtrl->reportTiming(d_oppositeKernel);
	if (!d_config->occlusionScatter())
		d_gpgpuCtrl->reportTiming(d_accKernel);
}

// Returns the pointer the backend controller reports for the result buffer (d_resultMem).
void *SSEOController::getResultBuf() {
	void *resultPtr = d_gpgpuCtrl->getBufferPtr(d_resultMem);
	return resultPtr;
}

// Returns the gettimeofday() timestamp as seconds, with the microsecond part as the fraction.
double SSEOController::getTime() {
	struct timeval now;
	gettimeofday(&now, NULL);

	const double wholeSeconds = (double)now.tv_sec;
	const double microSeconds = (double)now.tv_usec;
	return wholeSeconds + microSeconds/1e6;
}

// Releases the backend controller and the configuration created by genConfig().
// The backend goes first, since the kernel sources registered with it were handed
// the configuration pointer.
SSEOController::~SSEOController() {
	delete d_gpgpuCtrl;
	delete d_config; // Allocated with new in genConfig() and not released anywhere else in this file
}
