#include "accumulate.h"
#include <cmath>
	
//#define SINGLEDIR 1

// The constructor performs no explicit initialisation of its own; bases and
// members are default-initialised.
AccumulateKernel::AccumulateKernel() = default;

/**
 * Emits the kernel source that accumulates one line's contribution.
 *
 * The text is appended via append() and is placed by genSrc() inside the
 * generated per-line `for` loop. The emitted code expects the generated
 * kernel to have `lineIndex` and `occlusion` in scope, and, depending on the
 * configuration, `accepted`, `myHeight`, `myHeightI`, `maxDiff`,
 * `maxDiffContrib`, `heightBuffer`, `gatherBuffer` and `acceptBuffer`.
 *
 * Two top-level modes are generated:
 *  - edgeAwareAcc() == 1: the sweep sample is read, compared against the
 *    pixel's height and only accumulated when the comparison passes.
 *  - otherwise: the sweep value is added to `occlusion` unconditionally
 *    (nothing is emitted when gatherBuffer() is set).
 *
 * @throws std::string when includeMin() is requested without half buffers,
 *         matchOpposite and edge awareness (or together with V4 buffers).
 */
void AccumulateKernel::calcOcclusion() {
	// includeMin is only generated for the half-buffer / matchOpposite /
	// edge-aware layout; reject every other combination up front.
	if (d_config->includeMin()) {
		if (d_config->useV4Buffers() || !d_config->useHalfBuffers() || !d_config->matchOpposite() || !d_config->edgeAwareAcc())
			throw std::string("Include min has to have half buffers, matchopposite and edge awareness");
	}

	// Selects between the atomicAdd and the plain `+=` flavour of the gather
	// buffer update below.
	const bool atomic = true;

	if (d_config->edgeAwareAcc() == 1) {
		// ---- 1. Emit the read of this line's sweep sample -----------------
		if (d_config->sweepSurface()) {
			// Split the linear index into surface coordinates; the stripe
			// width is used both as a shift (log2) and as a mask (width - 1).
			const int shiftAmount = static_cast<int>(log2(static_cast<double>(d_config->sweepStorageStripe())) + 0.5);
			append("int thisY = lineIndex>>%d;\n", shiftAmount);
			append("int thisX = (lineIndex&%d)*8;\n", d_config->sweepStorageStripe()-1);
			append("float2 sweepSample = #surf2DRead{sweepSurf}{thisX}{thisY}{float2}#;\n");
		} else {
			// The declared sample type follows the sweep buffer layout.
			if (d_config->matchOpposite()) {
				if (d_config->useV4Buffers())
					append("float4 sweepSample = sweepData[lineIndex");
				else if (d_config->useHalfBuffers())
					append("float2 sweepSample = sweepData[lineIndex");
				else
					append("float sweepSample = sweepData[lineIndex");
			} else {
				append("float2 sweepSample = sweepData[lineIndex");
			}

			// With a gather buffer lineIndex is a uint2; .x selects the sweep index.
			if (d_config->gatherBuffer())
				append(".x");

			// Plain float layout stores three floats per line entry.
			if (d_config->matchOpposite() && !d_config->useV4Buffers() && !d_config->useHalfBuffers())
				append("*3");

			append("];\n");
		}

		const float threshold = d_config->edgeThreshold(); //1.01f;

		// ---- 2. Emit the height test and the accumulation -----------------
		if (d_config->gatherBuffer()) {
			append("float compHeight = heightBuffer[lineIndex.y];\n");
			append("if (sweepSample.y > compHeight*%Ef && compHeight > sweepSample.y*%Ef) {\n",
					threshold, threshold);
			if (atomic) {
				append("  #atomicAdd{acceptBuffer[lineIndex.y]}{1.0f}#;\n");
				append("  #atomicAdd{gatherBuffer[lineIndex.y]}{sweepSample.x}#;\n");
			} else {
				append("  acceptBuffer[lineIndex.y] += 1.0f;\n");
				append("  gatherBuffer[lineIndex.y] += sweepSample.x;\n");
			}
			append("}\n");
		} else {
			if (d_config->matchOpposite()) {
				if (d_config->useV4Buffers()) {
					append("if (sweepSample.x > myHeight*%Ef && myHeight > sweepSample.x*%Ef) {\n",
							threshold, threshold);
					append("  float tempOcc = sweepSample.y + sweepSample.z;\n");
					append("  occlusion += #minf{2.0f}{%s}#;\n", d_keywords->maxf("0.0f", "tempOcc").c_str());
					append("  accepted += 2.0f;\n");
				} else if (d_config->useHalfBuffers()) {
					// Relative height difference between the sample and this pixel.
					append("float thisDiff = #absf{1.0f - sweepSample.x*myHeightI}#;\n");

					if (d_config->includeMin()) {
						// Decode this line's contribution as `thisContrib`
						// before the test, so that the closest-height sample
						// can be remembered even when it is rejected.
#ifdef SINGLEDIR
						append("unsigned int occData = *((unsigned int*)(&sweepSample.y));\n");
						append("float thisContrib = __half2float(occData&0xffff);\n");
#else
						append("float2 occValues;\n");
						append("#readHalf2{occValues}{&sweepSample}{reg}{1}#;\n");
						// Must be named thisContrib: the code emitted below
						// reads it. (Previously this was emitted as tempOcc,
						// leaving thisContrib undeclared in the kernel.)
						append("float thisContrib = occValues.x + occValues.y;\n");
#endif

						append("if (thisDiff < maxDiff) {\n");
						append("  maxDiff = thisDiff;\n");
						append("  maxDiffContrib = thisContrib;\n");
						append("}\n");
						append("if (thisDiff < %Ef) {\n",
								threshold - 1.0f);
						append("float tempOcc = thisContrib;\n");
					} else {
						append("if (thisDiff < %Ef) {\n",
								threshold - 1.0f);
						// FIXME: This is not finished
#ifdef SINGLEDIR
						append("unsigned int occData = *((unsigned int*)(&sweepSample.y));\n");
						append("  float tempOcc = __half2float(occData&0xffff);\n");
#else
						append("float2 occValues;\n");
						append("#readHalf2{occValues}{&sweepSample}{reg}{1}#;\n");
						append("float tempOcc = occValues.x + occValues.y;\n");
#endif
					}

					// Both branches above leave an `if (...) {` open with
					// `tempOcc` declared inside it.
#if 0
					append("  occlusion += tempOcc;\n");
#else
					append("  occlusion += #minf{2.0f}{%s}#;\n", d_keywords->maxf("0.0f", "tempOcc").c_str());
#endif
					append("  accepted += 2.0f;\n");
				} else {
					append("float thisDiff = #absf{1.0f - sweepSample*myHeightI}#;\n");
					append("if (thisDiff < %Ef) {\n",
							threshold - 1.0f);
					// Entries 1 and 2 of the three-float record hold the two
					// values that are summed.
					append("  float tempOcc = sweepData[lineIndex*3 + 1] + sweepData[lineIndex*3 + 2];\n");
					append("  occlusion += #minf{2.0f}{%s}#;\n", d_keywords->maxf("0.0f", "tempOcc").c_str());
					append("  accepted += 2.0f;\n");
				}
			} else {
				append("if (sweepSample.y > myHeight*%Ef && myHeight > sweepSample.y*%Ef) {\n",
						threshold, threshold);
				append("  occlusion += sweepSample.x;\n");
				append("  accepted += 1.0f;\n");
			}
			// Closes the `if (...) {` opened by whichever branch ran above.
			append("}\n");
		}
	} else {
		// Not edge aware (in the ==1 sense): accumulate unconditionally.
		if (!d_config->gatherBuffer()) {
			if (d_config->useHalfBuffers()) {
				append("float2 thisOcc;\n");
				if (d_config->sweepSurface()) {
					const int shiftAmount = static_cast<int>(log2(static_cast<double>(d_config->sweepStorageStripe())) + 0.5);
					append("int thisY = lineIndex>>%d;\n", shiftAmount);
					append("int thisX = (lineIndex&%d)*4;\n", d_config->sweepStorageStripe()-1);
					append("#readHalf2{thisOcc}{%s}#;\n",
							d_keywords->surf2DRead("sweepSurf", "thisX", "thisY", "unsigned int").c_str());
				} else {
					append("#readHalf2{thisOcc}{sweepData[lineIndex]}#;\n");
				}
				append("occlusion += thisOcc.x + thisOcc.y;\n");
			} else {
				append("occlusion += sweepData[lineIndex];\n");
			}
		}
	}
}

/**
 * Generates the source of the `accumulate` kernel.
 *
 * Does nothing when d_preExpandSrc already holds source text. Otherwise the
 * kernel is emitted through append() in this order:
 *   1. float2 operator helpers and the kernel declaration,
 *   2. thread index computation and bounds check,
 *   3. optional shared gather/height/accept buffers,
 *   4. optional sampling of this pixel's height (edge-aware modes),
 *   5. the per-line loop, whose body comes from calcOcclusion(),
 *   6. normalisation of `occlusion` and the write to `out`.
 */
void AccumulateKernel::genSrc() {
	// If we already have something (loaded), we don't do this
	if (d_preExpandSrc != "")
		return;

	append(d_keywords->float2Operators());

	// Element type of indexData in the generated kernel signature.
	const char *indexType;
	if (d_config->edgeAwareAcc() == 2 || d_config->gatherBuffer())
		indexType = "uint2";
	else
		indexType = "uint";

	// Element type of sweepData; mirrors the sample types declared by
	// calcOcclusion() for each buffer layout.
	const char *sweepType;
	if (d_config->edgeAwareAcc() == 1) {
		if (d_config->matchOpposite()) {
			if (d_config->useV4Buffers())
				sweepType = "float4";
			else if (d_config->useHalfBuffers())
				sweepType = "float2";
			else
				sweepType = "float";
		} else {
			sweepType = "float2";
		}
	} else {
		if (d_config->useHalfBuffers())
			sweepType = "unsigned int";
		else
			sweepType = "float";
	}

	// When sweeps are read from a surface there is no sweepData parameter.
	if (d_config->sweepSurface())
		append("#kernelDecl{accumulate}{const %s * __restrict indexData, float * __restrict out}# {\n", indexType);
	else
		append("#kernelDecl{accumulate}{const %s * __restrict sweepData, const %s * __restrict indexData, float * __restrict out}# {\n", sweepType, indexType);

	incrementIndent();
	append("int tidY = #globalThreadY#;\n");
	append("int tidX = #globalThreadX#;\n");

	// NOTE(review): in gather-buffer mode the emitted kernel later contains
	// #localSync#; if that expands to a work-group barrier, returning early
	// here for out-of-range threads may be unsafe — confirm.
	append("if (tidY >= %d || tidX >= %d) return;\n", d_config->occHeight(), d_config->occWidth());
	append("float occlusion = 0.0f;\n");

	if (d_config->gatherBuffer()) {
		append("const unsigned int localIndex = #threadY#*%d + #threadX#;\n", d_config->accBlockX());
		append("#sharedMem# float gatherBuffer[%d];\n", d_config->accBlockY()*d_config->accBlockX());
		append("gatherBuffer[localIndex] = 0.0f;\n");

		if (d_config->edgeAwareAcc()) {
			append("#sharedMem# float heightBuffer[%d];\n", d_config->accBlockY()*d_config->accBlockX());
			append("#sharedMem# float acceptBuffer[%d];\n", d_config->accBlockY()*d_config->accBlockX());
			append("acceptBuffer[localIndex] = 0.0f;\n");
		}
	}

	append("int thisIndex = tidY*%d + tidX;\n", d_config->occWidth());

	const int linesPerPixel = d_config->linesPerPixel();

	printf("Accumulating %d final buffers\n", linesPerPixel);

	// Offset of the occlusion area inside the (larger) height field.
	const int borderSideW = (d_config->hfWidth() - d_config->occWidth())/2;
	const int borderSideH = (d_config->hfHeight() - d_config->occHeight())/2;

	// If we're performing edge awareness, we read in this pixel's height.
	if (d_config->edgeAwareAcc()) {
		append("float2 myCoord = #float2Ctor{%s}{%s}#;\n",
				genStr("%Ef + (float)tidX", 0.5f + static_cast<float>(borderSideW)).c_str(),
				genStr("%Ef - (float)tidY", static_cast<float>(d_config->hfHeight())-0.5f - static_cast<float>(borderSideH)).c_str());
		append("myCoord *= #float2Ctor{%Ef}{%Ef}#;\n",
				1.0f/static_cast<float>(d_config->hfWidth()),
				1.0f/static_cast<float>(d_config->hfHeight()));

		std::string heightSample;
		if (d_config->separateSources())
			heightSample = d_keywords->tex2DSample1("depthTex",
					"myCoord.x", "myCoord.y");
		else
			heightSample = d_keywords->tex2DSample4("depthNormalTex",
					"myCoord.x", "myCoord.y") + ".w";

		// Emit both the height and its reciprocal: the generated code reads
		// `myHeight` (gather buffer store below, and the V4 / non-matchOpposite
		// tests in calcOcclusion()) as well as `myHeightI`. Previously only
		// `myHeightI` was declared, leaving `myHeight` undefined in the kernel.
		append("float myHeight = %s;\n", heightSample.c_str());
		append("float myHeightI = #rcp{myHeight}#;\n");

		if (d_config->gatherBuffer()) {
			append("heightBuffer[localIndex] = myHeight;\n");
			append("#localSync#;\n");
		} else {
			append("float accepted = 0.0f;\n");
			if (d_config->includeMin()) {
				append("float maxDiff = 10000.0f;\n");
				append("float maxDiffContrib;\n");
			}
		}
	}

	// Per-line loop; with SINGLEDIR defined only that one line is visited.
#ifdef SINGLEDIR
	append("for (int line = %d; line < %d; ++line) {\n", SINGLEDIR, SINGLEDIR+1);
#else
	append("for (int line = 0; line < %d; ++line) {\n", linesPerPixel);
#endif
	incrementIndent();

	// Each line has its own occWidth*occHeight slab in indexData.
	append("int finalIndex = thisIndex + line*%d;\n", d_config->occWidth()*d_config->occHeight());

	if (d_config->edgeAwareAcc() == 2) {
		append("int lineIndex = indexData[finalIndex].x;\n");
	} else {
		if (d_config->gatherBuffer())
			append("uint2 lineIndex = indexData[finalIndex];\n");
		else
			append("unsigned int lineIndex = indexData[finalIndex];\n");
	}

	calcOcclusion();

	decrementIndent();
	append("}\n");

	// Normalise the accumulated value (skipped entirely in SINGLEDIR builds).
#ifndef SINGLEDIR
	if (d_config->edgeAwareAcc()) {
		if (d_config->gatherBuffer()) {
			append("#localSync#;\n");
			append("occlusion = gatherBuffer[localIndex]/acceptBuffer[localIndex];\n");
		} else {
			// NOTE(review): calcOcclusion() only emits `accepted +=` when
			// edgeAwareAcc() == 1; for other non-zero modes `accepted` is
			// still 0.0f at this division — confirm this is intended.
#ifdef CALC_ACCEPTED
			append("if (accepted > 0.0f) { occlusion = 1.0f; } else { occlusion = 0.0f; }\n");
#else
			if (d_config->includeMin()) {
				// No sample passed the test: fall back to the contribution
				// of the sample with the smallest height difference.
				append("if (accepted == 0.0f)\n");
				append("  occlusion = maxDiffContrib*0.5f;\n");
				append("else\n");
				append("  occlusion /= accepted;\n");
			} else {
				append("occlusion /= accepted;\n");
			}
#endif // CALC_ACCEPTED
		}
	} else {
		// With matchOpposite each line contributes two values, hence 0.5.
		if (d_config->matchOpposite())
			append("occlusion *= %Ef;\n", 0.5f/static_cast<float>(d_config->linesPerPixel()));
		else
			append("occlusion *= %Ef;\n", 1.0f/static_cast<float>(d_config->linesPerPixel()));
	}
#endif

	// The output uses the height field's row pitch and is offset by the border.
	append("out[tidY*%d + tidX + %d] = 1.0f - occlusion;\n",
			d_config->hfWidth(),
			borderSideH*d_config->hfWidth() + borderSideW);
	decrementIndent();
	append("}\n");
}
