#include "sweep.h"
#include <cstdarg>
#include <cstdio>
#include <cmath>
#include <cstdlib>


// Default constructor: occlusion comparison and falloff both start enabled.
SweepKernel::SweepKernel()
	: d_occlusionCompare(true)
	, d_useFalloff(true)
{
#if defined(SEPARATE_SOURCES)
	// With separate sources, tangent strategy 3 is rejected up front.
	// NOTE(review): d_tangent is not set in this initializer list; presumably
	// it is initialized elsewhere (base class / declaration) -- confirm.
	if (d_tangent == 3) {
		throw std::string("Tangent strategy 3 (use normals) requires combined sources");
	}
#endif
}

// Emits the body of the sweep kernel for the "opti" path (genSrc() calls this
// when d_config->optiPath() is set).
//
// The generated text consists of, in order:
//   1. declarations of pLocal / upVec and the initial "stopper" values
//      (h2 and h1, plus h3 when REGH3 is defined),
//   2. a loop that consumes li.idleSteps,
//   3. the header and body of the main `while (li.numSteps > 0)` loop.
//
// Parts 2 and 3 come in three shapes selected from the configuration:
//   - manual unroll (unrollAmount() > 1 and unrollStorage() < 3): every
//     unrolled iteration gets its own pLocalN / upVecN variables;
//   - slot unroll (unrollAmount() > 1, unrollStorage() > 2 and != 6): samples
//     are staged in a pLocalS[] array (a plain array for storage 3, a
//     #sharedMem# array indexed by #threadX# otherwise);
//   - plain loop (everything else); unrollStorage() == 6 additionally emits
//     `#pragma unroll N` in front of each loop.
//
// The main loop is deliberately left open, with the indent one level inside
// it: genSrc() emits the closing braces of the loop and of the kernel.
void SweepKernel::genOpti() {
	append("float2 pLocal;\n");
	append("float2 upVec;\n");

	// The first step is a "stopper" -- infinitely far down
	#ifdef REGH3
	append("if (li.idleSteps) {\n");
	incrementIndent();
	append("h3 = #float2Ctor{0.0}{-10000.0f}#;\n");
	sample("li.startPos", "h2");
	stepForward(true, true); // Do not touch index and decrement idle counter
	decrementIndent();
	append("} else\n");
	append("  h2 = h3;\n");

	append("if (li.idleSteps) {\n");
	incrementIndent();
	sample("li.startPos", "h1");
	stepForward(true, true); // Do not touch index and decrement idle counter
	// Balance the incrementIndent() above, exactly like the h2 block does;
	// without it everything generated afterwards is one level too deep.
	decrementIndent();
	append("} else\n");
	append("  h1 = h2;\n");
	#else
	append("h2 = #float2Ctor{0.0}{-10000.0f}#;\n");
	sample("li.startPos", "h1");
	stepForward(true, true); // Do not touch index and decrement idle counter
	#endif

	// OutLoc is used for unrollStorage > 2
	std::string outLoc;
	if (d_config->unrollStorage() == 3)
		outLoc = "pLocalS[slot]";
	else
		outLoc = genStr("pLocalS[slot*%d + #threadX#]", d_config->sweepBlock());

	// The same two predicates select the shape of both the idle loop and the
	// main loop, so they are evaluated once here.
	const bool manualUnroll = d_config->unrollAmount() > 1 && d_config->unrollStorage() < 3;
	const bool slotUnroll = d_config->unrollStorage() > 2 && d_config->unrollStorage() != 6
			&& d_config->unrollAmount() > 1;

	// ---- IDLE STEPS ----
	if (manualUnroll) {
		// Generating the storage
		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			append("float2 pLocal%d;\n", i);
			append("float2 upVec%d;\n", i);
		}

		append("while (li.idleSteps > %d) {\n", d_config->unrollAmount());
		incrementIndent();
		// First all the samples, then all the hull updates.
		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			std::string targetName = genStr("pLocal%d", i);
			std::string upVecName = genStr("upVec%d", i);
			std::string posName = genStr("samplePos%d", i);
			append("float2 %s = li.startPos + %Ef*li.stepDir;\n",
					posName.c_str(), static_cast<float>(i));
			sample(posName, targetName, "NOTREALLYLOL", upVecName);
		}
		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			std::string targetName = genStr("pLocal%d", i);
			std::string upVecName = genStr("upVec%d", i);
			stepConvexUnroll(false, targetName, upVecName);
		}

		append("li.numSteps -= %d;\n", d_config->unrollAmount());
		append("li.idleSteps -= %d;\n", d_config->unrollAmount());
		append("li.startPos += %Ef*li.stepDir;\n", static_cast<float>(d_config->unrollAmount()));

		decrementIndent();
		append("}\n");

		// This gets rid of all the excess iterations
		append("while (li.idleSteps > 0) {\n");
		incrementIndent();
		sample("li.startPos", "pLocal", "NOTREALLYLOL");
		stepConvex(false);
		stepForward(true, true);
		decrementIndent();
		append("}\n");

	} else if (slotUnroll) {
		if (d_config->unrollStorage() == 3)
			append("float2 pLocalS[%d];\n", d_config->unrollAmount());
		else
			append("#sharedMem# float2 pLocalS[%d];\n", d_config->unrollAmount()*d_config->sweepBlock());

		//append("#pragma unroll 4\n");
		append("while (li.idleSteps > %d) {\n", d_config->unrollAmount());
		incrementIndent();
		//append("#pragma unroll 4\n");
		append("for (int slot = 0; slot < %d; ++slot) {\n", d_config->unrollAmount());
		incrementIndent();

		sample("li.startPos", outLoc, "NOTREALLYLOL", "");
		append("li.startPos += li.stepDir;\n");
		decrementIndent();
		append("}\n");

		//append("#pragma unroll 4\n");
		append("for (int slot = 0; slot < %d; ++slot) {\n", d_config->unrollAmount());
		incrementIndent();
		append("pLocal = %s;\n", outLoc.c_str());
		append("upVec = SSEOnormalize2(-pLocal);\n");
		stepConvex(false);
		//stepForward(true, true);
		append("li.numSteps--;\n");
		append("li.idleSteps--;\n");
		decrementIndent();
		append("}\n");
		decrementIndent();
		append("}\n");

		// Excess iterations
		append("while (li.idleSteps > 0) {\n");
		incrementIndent();
		sample("li.startPos", "pLocal", "NOTREALLYLOL");
		stepConvex(false);
		stepForward(true, true);
		decrementIndent();
		append("}\n");

	} else {
		if (d_config->unrollStorage() == 6)
			append("#pragma unroll %d\n", d_config->unrollAmount());

		append("while (li.idleSteps > 0) {\n");
		incrementIndent();
		sample("li.startPos", "pLocal", "NOTREALLYLOL");
		stepConvex(false);
		stepForward(true, true);
		decrementIndent();
		append("}\n");
	}

	// ---- ONTO THE MAIN LOOP ----
	const std::string tangentDest = "tangentSin";

	if (manualUnroll) {
		for (int i = 0; i < d_config->unrollAmount(); ++i)
			append("float occlusion%d;\n", i);
		// NOTE(review): each generated iteration consumes unrollAmount() steps
		// while the loop condition only checks numSteps > 0 (an earlier variant
		// compared against unrollAmount()); presumably numSteps is a multiple
		// of the unroll amount here -- confirm against the caller.
		//append("while (li.numSteps > %d) {\n", d_config->unrollAmount());
		append("while (li.numSteps > 0) {\n");
		incrementIndent();
		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			std::string targetName = genStr("pLocal%d", i);
			std::string upVecName = genStr("upVec%d", i);
			std::string posName = genStr("samplePos%d", i);
			append("float2 %s = li.startPos + %Ef*li.stepDir;\n",
					posName.c_str(), static_cast<float>(i));
			sample(posName, targetName, tangentDest, upVecName);
		}
		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			std::string horName = genStr("horVec%d", i);
			std::string targetName = genStr("pLocal%d", i);
			std::string upVecName = genStr("upVec%d", i);
			stepConvexUnroll(true, targetName, upVecName, horName);
		}

		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			std::string horName = genStr("horVec%d", i);
			std::string occName = genStr("occlusion%d", i);
			std::string upVecName = genStr("upVec%d", i);
			append("%s = %s;\n", occName.c_str(), genOcc(horName, "", "", upVecName).c_str());
		}

		// WRITEOUT
		append("if (li.dirIndex < %d) {\n", d_config->dirs()/2);
		incrementIndent();
		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			append("out[(destIndex + myStripe*%d)*2] = pLocal%d.y;\n", i, i);
		}
		decrementIndent();
		append("}\n");

		for (int i = 0; i < d_config->unrollAmount(); ++i) {
			//append("outOcc[(destIndex + myStripe*%d)*4] = __float2half_rn(occlusion%d);\n", i, i);
			append("#writeHalf{outOcc[(destIndex + myStripe*%d)*4]}{occlusion%d};\n", i, i);
		}

		append("li.numSteps -= %d;\n", d_config->unrollAmount());
		append("li.startPos += %Ef*li.stepDir;\n", static_cast<float>(d_config->unrollAmount()));
		append("destIndex += myStripe*%d;\n", d_config->unrollAmount());

		// No further indent change here: like the other two branches, this one
		// must leave the indent exactly one level inside the open while loop so
		// that the closing braces emitted by genSrc() line up.
	} else if (slotUnroll) {
		append("float occlusion = 0.0f;\n");
		//append("#pragma unroll 4\n");
		append("while (li.numSteps > 0) {\n");
		incrementIndent();
		//append("#pragma unroll 4\n");
		append("for (int slot = 0; slot < %d; ++slot) {\n", d_config->unrollAmount());
		incrementIndent();
		sample("li.startPos", outLoc, tangentDest, "");
		append("li.startPos += li.stepDir;\n");
		decrementIndent();
		append("}\n");

		//append("#pragma unroll 4\n");
		append("for (int slot = 0; slot < %d; ++slot) {\n", d_config->unrollAmount());
		incrementIndent();
		append("pLocal = %s;\n", outLoc.c_str());
		append("upVec = SSEOnormalize2(-pLocal);\n");
		stepConvex();
		//genOcclusion();
		writeOut();
		append("li.numSteps--;\n");

		if (d_config->sweepSurface())
			append("storeY += (li.dirIndex < %d) ? 1 : -1;\n", d_config->dirs()/2);
		else
			append("destIndex += myStripe;\n");

		decrementIndent();
		append("}\n");

	} else {
		append("float occlusion = 0.0f;\n");
		if (d_config->unrollStorage() == 6)
			append("#pragma unroll %d\n", d_config->unrollAmount());

		append("while (li.numSteps > 0) {\n");
		incrementIndent();
		//sample("li.startPos", "pLocal", tangentDest, "");
		sample("li.startPos", "pLocal", tangentDest);

		//stepConvexUnroll(true, targetName, upVecName);
		stepConvex();
		//genOcclusion();

		writeOut();

		#ifdef SKIPWRITES
		decrementIndent();
		append("}\n");
		#endif

		stepForward();
		postTangent();
	}
}

// Generates the complete, not yet expanded source text of the sweep kernel:
// shared includes and operators, helper functions, the kernel declaration and
// finally the kernel body (either via genOpti() or the general path below).
// Does nothing when d_preExpandSrc already holds something.
void SweepKernel::genSrc() {
	// Source that is already present (e.g. loaded) is left alone.
	if (d_preExpandSrc != "") {
		return;
	}

	// Composite flag: look-ahead is on for tangent types 1 and 2.
	d_lookAhead = (d_config->tangentType() == 1) || (d_config->tangentType() == 2);

	// Preamble shared by all kernels.
	append("#include \"sharedStructs.h\"\n");
	append("#include \"sharedConstants.h\"\n");
	append(d_keywords->float2Operators());
	append(d_keywords->vectorOperators());

	genCommonFunctions();

	genFuncs();

	// Element type of the output buffer when opposite matching is off.
	const char *sweepType = "float";
	if (d_config->edgeAwareAcc() == 1)
		sweepType = "float2";
	else if (d_config->useHalfBuffers())
		sweepType = "unsigned int";

	// Optional trailing hull pool parameter (only for hull storage 1).
	std::string hulls;
	if (d_config->hullStorage() == 1) {
		hulls = ", ";
		hulls += d_config->hullType() ? "unsigned int" : "float2";
		hulls += " *hullPool";
	}

	// Kernel declaration.
	if (d_config->matchOpposite()) {
		std::string outParams = d_config->edgeAwareAcc()
				? genStr(", float * __restrict out%s", hulls.c_str())
				: genStr(", unsigned short * __restrict out%s", hulls.c_str());
		// With sweepSurface() the kernel takes no output/hull parameters at all.
		if (d_config->sweepSurface())
			outParams = "";

		append("#kernelDecl{sweep}{const struct LineInfo * __restrict liIn%s}#", outParams.c_str());
	} else if (!d_config->occlusionScatter()) {
		append("#kernelDecl{sweep}{const struct LineInfo * __restrict liIn, %s * __restrict out%s}#", sweepType, hulls.c_str());
	} else {
		append("#kernelDecl{sweep}{const struct LineInfo * __restrict liIn, float * __restrict out%s}#", hulls.c_str());
	}

	append(" {\n");

	incrementIndent();
	initKernel();
	genHullInit();

	if (d_config->optiPath()) {
		// The optimized path never uses look-ahead.
		d_lookAhead = false;
		genOpti();
	} else {
		append("float2 pLocal;\n");
		append("float2 upVec;\n");

		// Mode 0 only: store the first two samples into the hull.
		if (d_config->mode() == 0) {
			append("for (int i = 0; i < 2; ++i) {\n");
			incrementIndent();
			sample("li.startPos", "pLocal");

			append("if (convexIndex < %d)\n", d_config->hullSize()-1);
			incrementIndent();
			append(hullRefCurrent() + " = pLocal;\n");
			decrementIndent();

			hullIndexIncrement();
			stepForward(true, true);
			decrementIndent();
			append("}\n");
		}

		// Loop over the idle steps; the hull is only maintained in mode 0.
		append("while (li.idleSteps > 0) {\n");
		incrementIndent();
		sample("li.startPos", "pLocal", "NOTREALLYLOL");
		if (d_config->mode() == 0)
			stepConvex(false);
		stepForward(true, true); // same flags as in the seeding loop above
		decrementIndent();
		append("}\n");

		initTangent();

		// With look-ahead, one sample is fetched ahead of the main loop.
		if (d_lookAhead) {
			append("float2 prevP, nextP;\n");
			sample("li.startPos", "nextP");
			stepForward(true);
		}

		append("float occlusion = 0.0f;\n");

		if (d_config->tangentType())
			append("float tangentSin;\n");
		const std::string tangentDest = "tangentSin";

		// Main loop; it stays open here and is closed at the end of this function.
		append("while (li.numSteps > 0) {\n");
		incrementIndent();

		if (!d_lookAhead) {
			sample("li.startPos", "pLocal", tangentDest);
		} else {
			append("pLocal = nextP;\n");
			sample("li.startPos", "nextP", tangentDest);
		}

		// Mode 1: brute force only. Mode 2: brute force followed by the
		// occlusion code. Any other mode: hull step followed by the occlusion code.
		if (d_config->mode() == 1) {
			genBrute();
		} else {
			if (d_config->mode() == 2)
				genBrute();
			else
				stepConvex();
			genOcclusion();
		}

		writeOut();

		#ifdef SKIPWRITES
		decrementIndent();
		append("}\n");
		#endif

		stepForward();
		postTangent();
	}

	// Close the main loop (opened above or inside genOpti()) ...
	decrementIndent();
	append("}\n");
	// ... and the kernel itself.
	decrementIndent();
	append("}\n");
}


// Emits the two falloff helper functions of the generated kernel source
// (vecFalloff on a float2, fallOff on a scalar distance), followed by the
// hull comparison and sampling helpers.
void SweepKernel::genFuncs() {
	// vecFalloff(horVec) = rcp(1 + dot(horVec, horVec) / fallOff)
	append("#funcDecl{vecFalloff}{float}{float2 horVec}# {\n");
	#if 1
	// Reciprocal of the configured falloff; kept as double because that is
	// what the variadic append() receives anyway.
	const double invFalloff = 1.0f/d_config->fallOff();
	append("  const float invCoef = %Ef;\n", invFalloff);
	append("  return #rcp{1.0f + invCoef*SSEOdot2(horVec, horVec)}#;\n");
	#else
	// Disabled alternatives, kept for reference.
	//append("  return (SSEOdot2(horVec, horVec) < 1.0f) ? 1.0f : 0.0f;\n");
	//append("  return #maxf{0.0f}{1.0f - 0.8f*SSEOdot2(horVec, horVec)}#;\n");
	append("  return #maxf{0.0f}{1.0f - 0.5f*SSEOlength2(horVec)}#;\n");
	//append("  return #div{1.0f}{1.0f + 0.125f*SSEOdot2(horVec, horVec)}#;\n");
	#endif
	append("}\n");

	// fallOff(distance) = coef / (coef + distance^2) with coef fixed to 1
	append("#funcDecl{fallOff}{float}{const float distance}# {\n");
	#if 1
	append("  const float coef = 1.0f;\n");
	append("  return coef/(coef + distance*distance);\n");
	#else
	// Disabled alternatives, kept for reference.
	//append("  return 0.2f/(distance*distance);\n");
	//append("  return #maxf{0.0f}{1.0f - distance/3.0f}#;\n");
	//append("  return #exp2{-distance}#;\n");

	// StarCraft 2:
	append("  const float coef = 8.0f;\n");
	append("  return coef/(#maxf{coef}{distance*distance}#);\n");
	#endif
	append("}\n");

	genHullCompare();
	genSampleFuncs();
}

#if 0
// NOTE(review): this definition is compiled out, yet genSrc() still calls
// genBrute(); presumably a live definition exists in another file -- confirm.
//
// Emits a brace-delimited block of kernel source that computes the occlusion
// of the current point by brute force: a tangent vector is derived according
// to d_tangent, then a generated for-loop walks backwards along -li.stepDir
// while insideTex(samplePos) holds and compares every candidate point against
// the running horizon.
//   mode() == 1: occlusion accumulates every increase of the horizon dot.
//   otherwise:   the largest single contribution is tracked in maxOcc, which
//                is only declared and added to occlusion when mode() == 2.
void SweepKernel::genBrute() {
	append("{\n");

	incrementIndent();

	// Tangent vector selection.
	if (d_tangent == 2) {
		// The shorter of the backward and forward differences.
		append("float2 tanBack = pLocalPrev - pLocal;\n");
		append("float2 tanForward = pLocal - pLocalNext;\n");
		append("float2 tanVec = (SSEOdot2(tanBack, tanBack) < SSEOdot2(tanForward, tanForward)) ? tanBack : tanForward;\n");
	} else if (d_tangent == 1)
		append("float2 tanVec = pLocalPrev - pLocalNext;\n");
	else {
		append("float2 tanVec = #float2Ctor{-normal.y}{normal.x}#;\n");
	}

	append("float tangentSin = SSEOdot2(upVec, SSEOnormalize2(tanVec));\n");

	// FIXME: An ugly hack.  We save normalNext..
	append("float2 saveNormalNext = normalNext;\n");

	//append("float tangentSin = SSEOdot2(upVec, SSEOnormalize2(pLocalPrev - pLocalNext));\n");
	append("occlusion = tangentSin;\n");

	if (d_config->mode() == 2)
		append("float maxOcc = 0.0f;\n");

	append("float maxDot = tangentSin;\n");

	// The backward walk starts two steps back when a tangent strategy is
	// active and one step back otherwise.
	//append("bool first = true;\n");
	if (d_tangent)
		append("for (float2 samplePos = %s - li.stepDir*2.0f; insideTex(samplePos); samplePos -= li.stepDir) {\n",
				d_snapCoord ? "snapStart": "li.startPos");
	else
		append("for (float2 samplePos = %s - li.stepDir; insideTex(samplePos); samplePos -= li.stepDir) {\n",
				d_snapCoord ? "snapStart": "li.startPos");
	incrementIndent();
	append("float2 candP;\n");

	/*if (d_snapCoord) {
		append("float2 samplePosSnap = snapCoord(samplePos);\n");
		snapCoord("samplePos", "samplePosSnap");
	} else*/

	if (d_hybridSnap)
		getSmartPoint("candP", "samplePos");
	else
		getPoint("candP", "samplePos");

	append("float2 horVec = candP - pLocal;\n");
	if (d_useFalloff)
		append("float distance = SSEOlength2(horVec);\n");

	//append("float horizonSin = SSEOdot2(upVec, SSEOnormalize2(horVec));\n");

	append("float thisDot = SSEOdot2(upVec, SSEOnormalize2(horVec));\n");

	if (d_config->mode() == 1) {
		append("if (thisDot > maxDot) {\n");
		//append("  if (first) { first = false; distance = 0.0f; }\n");
		append("  occlusion += (thisDot - maxDot)");
		if (d_useFalloff)
			append("*fallOff(distance)");
		append(";\n");
		append("  maxDot = thisDot;\n");
		append("}\n");
	} else {
		append("float tempOcc = (thisDot - maxDot)");
		if (d_useFalloff)
			append("*fallOff(distance)");
		append(";\n");
		append("if (tempOcc > maxOcc) maxOcc = tempOcc;\n");
	}

	// Leave the for-loop body for every mode. This used to happen only in the
	// mode() == 1 branch, which left the indent one level too deep otherwise.
	decrementIndent();
	append("}\n");

	if (d_config->mode() == 2)
		append("occlusion += maxOcc;\n");

	append("normalNext = saveNormalNext;\n");

	decrementIndent();
	append("}\n");
}
#endif
