#include "sweep.h"

#include <cmath>
#include <cstdio>
#include <algorithm>
#include <cstring>

using namespace SSEO;


// Constructs the line parameter set for one configuration and immediately
// builds the work pool via initPlain().
//
// c: configuration that is read from, and written back to, by initPlain().
//    Must not be null.
// Throws std::string when c is null; anything initPlain() throws propagates.
LineParams::LineParams(SSEOConfiguration *c) : d_config(c) {
	// initPlain() dereferences the configuration on its very first statement,
	// so report a null pointer as a catchable error instead of crashing there.
	if (!c)
		throw std::string("LineParams requires a non-null configuration");

	initPlain();
}

// Builds d_workPool: one LineInfoD per sweep line, for every processed direction.
//
// Effects visible in this function:
//  - sets d_debug to 1, resets d_totalSteps and d_maxDistance;
//  - recomputes d_corners / d_cornersOcc from the configured HF and occlusion sizes;
//  - appends the generated lines to d_workPool;
//  - when presets are used, rescales d_config->lineSkip() and overwrites
//    d_config->stepSkip() with the average preset step length;
//  - stores sweepHeight() into d_config->sweepHeight().
//
// Throws std::string when "match opposite" is requested with an odd number of
// dirs, when the preset is requested with a dir count not divisible by 4, and
// (unconditionally, after genTangents()) when tangent jitter is enabled.
void LineParams::initPlain() {
	// Creating a pool of work..  One struct per thread.
	// First, we iterate over all the directions.
	// For each direction, we find the outermost corner of the FB that the direction's normal intersects.
	// (Actually we find the center of the corner texel.) We refer to this position as the origo from now on.
	// From the origo, we iterate along the edges of the FB such that each step along either
	// direction is one texel width apart along the direction's normal.
	// We create vectors that start from these positions and extend towards the direction: these accord to the
	// threads' starting positions and directions in which they process the height field.
	// From the origo, distance along the direction is called "layer distance" and distance along the
	// direction's normal is "layer offset".  This notation will be needed in the occlusion phase (3rd).
	double angleOffset = 2.0*PId/(double)d_config->dirs();
	double angle0 = (double)d_config->dirOffset()*angleOffset; // We avoid axis-aligned directions, and start w/ 0.5*offset

	d_debug = 1; // 1 == print statistics, 2 == print too much

	d_totalSteps = 0;
	//d_maxOffsetWidth = 0;
	d_maxDistance = -1;

	// With "match opposite" only the first half of the directions is generated here
	int maxDirs = d_config->dirs();
	if (d_config->matchOpposite()) {
		if (maxDirs%2)
			throw std::string("Config parameter \"match opposite\" requires an even number of dirs");
		maxDirs /= 2;
	}

	// These "preset" directions are chosen as follows:
	// Create a box of radius (K/8) around origo, and for each texel center that the box
	// intersects, draw a "direction"..  This produces K directions that hit texel centers nicely

	std::vector<double> presetAngles;
	std::vector<double> presetSkips;

	if (d_config->usePreset()) {
		if (maxDirs%4)
			throw std::string("Preset has to have K = 8n, n e Z");

		int boxRadius = maxDirs/4;

		#if 1
		// We start from directly up, and go all the way just one step short of down..
		// We do this in 3 segments..  The top row first
		int x = 0;
		int y = boxRadius;

		// Because axis-aligned directions can be stretched by any amount,
		// and since they will always be the shortest..  We will give them
		// a length that is the average of all the other dirs.
		double avgStep = 0.0;
		double avgCounter = 0.0;
		std::vector<int> replaceAvg; // Indices of the axis-aligned presets

		for (int step = 0; step < boxRadius+1; ++step) {
			double stepSize = sqrt((double)(x*x + y*y));
			stepSize *= d_config->stepSkip();
			double dirAngle;
			if (x)
				dirAngle = atan((double)x/(double)y);
			else
				dirAngle = 0.0;
			presetAngles.push_back(dirAngle);
			presetSkips.push_back(stepSize);

			if (x && y) {
				avgStep += stepSize;
				avgCounter += 1.0;
			} else
				replaceAvg.push_back((int)presetAngles.size()-1);

			x++; // Y stays the same
		}
		// The side is next
		x = boxRadius;
		y = boxRadius-1;
		for (int step = 0; step < 2*boxRadius; ++step) {
			double stepSize = sqrt((double)(x*x + y*y));
			stepSize *= d_config->stepSkip();
			// y passes through 0 and below here; negative results are shifted by PI
			double dirAngle = atan((double)x/(double)y);
			if (dirAngle < 0.0)
				dirAngle += PI;
			presetAngles.push_back(dirAngle);
			presetSkips.push_back(stepSize);

			if (x && y) {
				avgStep += stepSize;
				avgCounter += 1.0;
			} else
				replaceAvg.push_back((int)presetAngles.size()-1);

			y--; // X stays the same
		}
		// And the bottom
		x = boxRadius-1;
		y = -boxRadius;
		for (int step = 0; step < boxRadius-1; ++step) {
			double stepSize = sqrt((double)(x*x + y*y));
			stepSize *= d_config->stepSkip();
			double dirAngle = atan((double)x/(double)y) + PI;
			presetAngles.push_back(dirAngle);
			presetSkips.push_back(stepSize);

			if (x && y) {
				avgStep += stepSize;
				avgCounter += 1.0;
			} else
				replaceAvg.push_back((int)presetAngles.size()-1);

			x--; // Y stays the same
		}

		// Now we replace the lengths..
		double replaceStep = avgStep/avgCounter;
		for (size_t i = 0; i < replaceAvg.size(); ++i)
			presetSkips.at(replaceAvg.at(i)) = replaceStep;

		for (int i = 0; i < (int)presetAngles.size(); ++i)
			printf("Preset dir %d+%d: stepsize %.3f, dir angle (-)%.1f degrees\n", 
					i, i + d_config->dirs()/2, presetSkips.at(i), presetAngles.at(i)/(2.0*PI)*360); //stepSize, dirAngle/(2.0*PI)*360.0);

		// We set the lineStep to match the stepSkip, because that's usually good.
		// It can be adjusted by the conf though, by a coefficient
		d_config->lineSkip() *= replaceStep/d_config->stepSkip();

		// We set the new stepSkip because it will be used in lineIndex.cpp to 
		// choose a contribution range.
		d_config->stepSkip() = replaceStep;

		printf("D_S = %.2f, D_L = %.2f, K = %d  <<>>  Average number of iters per pixel %.2f\n",
				d_config->stepSkip(), d_config->lineSkip(), d_config->dirs(), 
				(float)d_config->dirs()/d_config->stepSkip()/d_config->lineSkip());

		#else
	
		if (maxDirs == 4) { // K=8
			double offset = 0.5*PI/2.0;
			double step = sqrt(2.0);
			presetAngles.push_back(0.0*offset);
			presetSkips.push_back(step);
			presetAngles.push_back(1.0*offset);
			presetSkips.push_back(step);
			presetAngles.push_back(2.0*offset);
			presetSkips.push_back(step);
			presetAngles.push_back(3.0*offset);
			presetSkips.push_back(step);
		} else if (maxDirs == 8) { // K=16
			double middleStep = 2.0*sqrt(2.0);
			double quarterStep = sqrt(5.0);
			double avgStep = (middleStep + quarterStep)*0.5;
			printf("Steps: %f %f %f\n", middleStep, quarterStep, avgStep);

			// First one is straight up
			presetAngles.push_back(0.0);
			presetSkips.push_back(avgStep);
			// Fifth is straight to the side
			presetAngles.push_back(PI*0.5);
			presetSkips.push_back(avgStep);

			// middle
			presetAngles.push_back(0.25*PI);
			presetSkips.push_back(middleStep);
			presetAngles.push_back(0.75*PI);
			presetSkips.push_back(middleStep);

			// The last four are the quarters
			presetAngles.push_back(atan(0.5));
			presetSkips.push_back(quarterStep);
			presetAngles.push_back(0.5*PI - atan(0.5));
			presetSkips.push_back(quarterStep);

			// And another just rotated 90 degrees
			presetAngles.push_back(0.5*PI + atan(0.5));
			presetSkips.push_back(quarterStep);
			presetAngles.push_back(PI - atan(0.5));
			presetSkips.push_back(quarterStep);
		} else
			throw std::string("Asked for preset angles/lineskips for K we don't have");
		#endif
	}

	for (int dir = 0; dir < maxDirs; ++dir) {
		int dirSteps = 0;	
		double angle = angle0 + (double)dir*angleOffset;

		if (d_config->usePreset())
			angle = presetAngles.at(dir);
	
		// Angle == 0 means pointing upwards (0, 1)
		double2 dirVector(sin(angle), cos(angle));
		double2 dirNormals[] = {
			double2(dirVector.y, -dirVector.x),
			double2(-dirVector.y, dirVector.x)
		};

		// Scale the unit vectors to the per-step and per-line spacing
		dirVector *= d_config->usePreset() ? presetSkips.at(dir) : d_config->stepSkip();
		dirNormals[0] *= d_config->lineSkip();
		dirNormals[1] *= d_config->lineSkip();

		// The texture in normalized clamp addressing mode, using linear filtering, has
		// its corner texel centers at (0.5/width, 0.5/height) and 
		// ((width-1+0.5)/width, (height-1+0.5)/height), while the other texels are exactly
		// 1/width, 1/height apart.  We use unnormalized coordinates until the very end.
		d_corners[0] = double2(0.5, 0.5);
		d_corners[1] = double2(0.5, (double)(d_config->hfHeight()-1)+0.5);
		d_corners[2] = double2((double)(d_config->hfWidth()-1)+0.5, (double)(d_config->hfHeight()-1)+0.5);
		d_corners[3] = double2((double)(d_config->hfWidth()-1)+0.5, 0.5);

		// The occlusion rectangle is the HF rectangle shrunk by half the size difference per side
		double borderW = (double)(d_config->hfWidth()-d_config->occWidth())/2.0;
		double borderH = (double)(d_config->hfHeight()-d_config->occHeight())/2.0;

		d_cornersOcc[0] = d_corners[0] + double2(borderW, borderH);
		d_cornersOcc[1] = d_corners[1] + double2(borderW, -borderH);
		d_cornersOcc[2] = d_corners[2] + double2(-borderW, -borderH);
		d_cornersOcc[3] = d_corners[3] + double2(-borderW, borderH);

		// We know that by choosing the angle0 like we did we're not going to hit 
		// |x| < epsilon || |y| < epsilon cases, when the "dirs" is a power of 2.
		// Therefore we can safely determine the corner by testing the signs of x and y.
		int corner;
		if (dirVector.y > 0.0)
			corner = (dirVector.x > 0.0) ? 0 : 3;
		else
			corner = (dirVector.x > 0.0) ? 1 : 2;

		// Generating the lineinfos.  The first one starts from the corner.
		struct LineInfoD origo;
		origo.startPos = d_corners[corner];
		origo.stepDir = dirVector;
		origo.dirIndex = dir;
		origo.numSteps = getSteps(origo.startPos, origo.stepDir);
		origo.layerDistance = 0;
		origo.layerOffset = 0;

		if (origo.numSteps)
			d_workPool.push_back(origo);

		dirSteps += origo.numSteps;

		if (d_debug == 2)
			printf("Layer %d, offset %d, steps %d (forward 0) ORIGO\n", 
				origo.layerDistance, origo.layerOffset, origo.numSteps);

		int maxSteps = 0;

		// The 2 normal directions
		for (int i = 0; i < 2; ++i) {
			struct LineInfoD li = origo;
			double2 dirNormal = dirNormals[i];

			// We iterate as long as we can step inside the HF
			while (true) {
				li.startPos += dirNormal;
				li.layerOffset += (i == 1) ? 1 : -1;

				int forward = stepInside(li.startPos, li.stepDir);
				if (forward == -1) {
					if (d_debug == 2)
						printf("ABORTING\n");
					break; // This direction is of no use
				}

				// Move the start onto the first position inside the HF and
				// account for the skipped steps in the layer distance
				li.startPos += li.stepDir*(double)forward;
				li.layerDistance += forward;

				li.numSteps = getSteps(li.startPos, li.stepDir);

				if (!li.numSteps) // If we don't hit the occlusion area
					continue;

				dirSteps += li.numSteps;
			
				if (li.numSteps > maxSteps)
					maxSteps = li.numSteps;
				d_workPool.push_back(li);

				// Updating the minima and maxima
				/*if (li.layerDistance + li.numSteps > maxDistance)
					maxDistance = li.layerDistance + li.numSteps;
				if (li.layerOffset < minOffset)
					minOffset = li.layerOffset;
				if (li.layerOffset > maxOffset)
					maxOffset = li.layerOffset;*/

				if (d_debug == 2)
					printf("Layer %d, offset %d, steps %d (forward %d), dirindex %d\n", 
						li.layerDistance, li.layerOffset, li.numSteps, forward, li.dirIndex);
			}
		}

		//d_maxDistance = -1; //maxSteps; // EQUALS TO SWEEPHEIGHT

		/*if (maxOffset-minOffset+1 > d_maxOffsetWidth)
			d_maxOffsetWidth = maxOffset-minOffset+1;
		if (maxDistance > d_maxDistance)
			d_maxDistance = maxDistance;*/

		/*if (d_debug == 2) {
			printf("Current max dist %d, offset width %d\n",
				d_maxDistance, d_maxOffsetWidth);
			printf("Steps for dir %d: %d\n", dir, dirSteps);
		}*/

		// Only half of the directions were generated when matching opposites,
		// so each generated direction counts twice
		if (d_config->matchOpposite())
			d_totalSteps += dirSteps*2;
		else
			d_totalSteps += dirSteps;

		//d_dirOffsets.push_back(minOffset);
	}

	if (d_debug) {
		printf("Average number of steps per dir %.2f, texels in the HF %d\n",
				(float)((double)d_totalSteps/(double)d_config->dirs()), d_config->hfWidth()*d_config->hfHeight());
		// size() is a size_t: cast explicitly so the argument matches %lu on every platform
		printf("Total number of threads %lu, average length %.2f steps\n",
				(unsigned long)d_workPool.size(), (float)((double)d_totalSteps/(double)d_workPool.size()));
		printf("Number of lines %lu\n", (unsigned long)d_workPool.size());
	}

	if (d_debug == 2) {
		for (int i = 0; i < (int)d_workPool.size(); ++i)
			printf("Thread %d: dir %d, layerstart %d, offset %d, length %d\n",
				i, 
				d_workPool.at(i).dirIndex,
				d_workPool.at(i).layerDistance,
				d_workPool.at(i).layerOffset,
				d_workPool.at(i).numSteps);
	}

	// Forwarding offsets already so we know alignments for memory addresses
	/*for (int i = 0; i < (int)d_workPool.size(); ++i)
		d_workPool.at(i).layerOffset -= d_dirOffsets.at(d_workPool.at(i).dirIndex);*/
	
	d_config->sweepHeight() = sweepHeight(); //d_maxDistance;
	//d_config->sweepWidth() = d_config->sweepStripe(); //d_workPool.size();

	/*if (d_config->matchOpposite())
		genOpposite();*/

	if (d_config->jitterTangent()) {
		genTangents();
		throw std::string("Not good right now..");
	}
}

// Gives every line in d_workPool a pseudo-random tangent value: rand() is
// mapped linearly from [0, RAND_MAX] onto [0.5, 1.0] and stored as a float.
void LineParams::genTangents() {
	for (size_t idx = 0; idx < d_workPool.size(); ++idx) {
		// Uniform sample in [0, 1]
		const double unit = (double)(rand())/(double)RAND_MAX;
		// Squeeze into the upper half of the unit interval
		const float jitter = (float)(unit*0.5 + 0.5);
		d_workPool.at(idx).tangent = jitter;
	}
}

void LineParams::genOpposite() {
	//throw std::string("Not implemented genopposite");

	// For each dir..
	for (int dir = 0; dir < d_config->dirs()/2; ++dir) {
		//std::vector<struct LineInfo> oppositePool;

		// We step each line through until the last step within the texture
		// Then we use the last step as the new startPos, and record the furthest point..
		for (int inLine = 0; inLine < d_workPool.size(); ++inLine)
			if (d_workPool.at(inLine).dirIndex == dir) {
				struct LineInfoD li = d_workPool.at(inLine);

				// We start from the current position, and step until we go out of the entire HF.
				// Then we continue backwards until we hit occ box, and continue again until we no longer hit.
				while (insideHF(li.startPos)) {
					li.startPos += li.stepDir;
					li.layerDistance++;
				}
				// We went too far, so we step once back
				li.startPos -= li.stepDir;
				li.layerDistance--;

				// This is where we start, now we calculate how far we go
				li.stepDir = -li.stepDir;
				li.numSteps = getSteps(li.startPos, li.stepDir);
				//li.layerDistance -= li.numSteps;
				//li.layerDistance = d_config->sweepHeight() - 1 - li.layerDistance;
				li.dirIndex += d_config->dirs()/2;

				//li.referenceLine = &(d_workPool.at(inLine));

				//oppositePool.push_back(li);
				d_workPool.push_back(li);
			}
	}
}

/*bool LineParams::insideTex(const float2 pos) {
	float xCoef = 1.0f/(float)d_config->hfWidth();
	float yCoef = 1.0f/(float)d_config->hfHeight();

	return (pos.x > 0.5f*xCoef && pos.x < ((float)d_config->hfWidth() - 0.5f)*xCoef &&
			pos.y > 0.5f*yCoef && pos.y < ((float)d_config->hfHeight() - 0.5f)*yCoef);
}

bool LineParams::insideTexNonNorm(const float2 pos) {
	return (pos.x > 0.5f && pos.x < ((float)d_config->hfWidth() - 0.5f) &&
			pos.y > 0.5f && pos.y < ((float)d_config->hfHeight() - 0.5f));
}*/

// Counts how many whole steps of `step`, starting from `start`, are needed
// before the position lies inside the height field (see insideHF()).
//
// Returns 0 when `start` is already inside, the step count otherwise, and -1
// when the field is not reached within 2*sqrt(width^2 + height^2) iterations.
int LineParams::stepInside(double2 start, double2 step) {
	// Once again, brute force is acceptable once per application invocation.
	// The bound is computed on every call.  It used to be a function-local
	// static, which froze the value derived from whichever configuration
	// happened to make the very first call.
	const int width = d_config->hfWidth();
	const int height = d_config->hfHeight();
	const int maxSteps = (int)(sqrtf((float)(width*width + height*height))*2); // This should be plenty

	for (int steps = 0; steps < maxSteps; steps++, start += step)
		if (insideHF(start))
			return steps;

	return -1;
}

// Counts how many whole steps of `step`, starting from `start`, are needed
// before the position lies inside the occlusion rectangle (see insideOcc()).
// Both arguments are first scaled by the height-field width (x) and height (y).
// NOTE(review): the scaling suggests callers pass normalized coordinates --
// confirm against the call sites.
//
// Returns 0 when the scaled `start` is already inside, the step count
// otherwise, and -1 when the rectangle is not reached within
// 2*sqrt(width^2 + height^2) iterations.
int LineParams::stepInsideOccNorm(double2 start, double2 step) {
	// Once again, brute force is acceptable once per application invocation.
	// The bound is computed on every call.  It used to be a function-local
	// static, which froze the value derived from whichever configuration
	// happened to make the very first call.
	const int width = d_config->hfWidth();
	const int height = d_config->hfHeight();
	const int maxSteps = (int)(sqrtf((float)(width*width + height*height))*2); // This should be plenty

	start.x *= (double)width;
	start.y *= (double)height;
	step.x *= (double)width;
	step.y *= (double)height;

	for (int steps = 0; steps < maxSteps; steps++, start += step) {
		if (insideOcc(start))
			return steps;
	}

	return -1;
}

// Measures the length of a sweep line in steps.
//
// Starting at `start` and advancing by `step`, the line is followed until it
// enters the occlusion rectangle and then until it leaves it again.
//
// Returns the total number of steps taken up to the point where the line has
// left the occlusion rectangle (lead-in steps outside the rectangle are
// included), or 0 when the line leaves the height field without ever entering
// the occlusion rectangle.
int LineParams::getSteps(double2 start, double2 step) {
	// First we traverse until we hit the occlusion box
	int steps = 0;
	while (!insideOcc(start)) {
		start += step;
		steps++;
		// If we step outside of the entire HF, we don't need this line
		if (!insideHF(start)) {
			if (d_debug == 2) printf("Line skipped after %d steps\n", steps);
			return 0;
		}
	}

	if (d_debug == 2) printf("  %d steps until inside occ\n", steps);
	const int borderSteps = steps;

	// We're here, and we've stepped inside the occlusion box..  Then we step until outside
	while (insideOcc(start)) {
		start += step;
		steps++;
	}

	if (d_debug == 2) {
		printf("  %d meat steps\n", steps-borderSteps);

		// The remaining steps to the HF edge are only ever reported, never used,
		// so they are counted only when they will actually be printed.
		int tempSteps = 0;
		while (insideHF(start)) {
			start += step;
			tempSteps++;
		}
		printf("  %d discarded steps\n", tempSteps);
	}

	return steps;
}

// Axis-aligned containment test (bounds inclusive).
// area[0] is used as the minimum corner and area[2] as the maximum corner;
// the other entries of `area` are not read.
bool LineParams::inside(double2 pos, double2 *area) {
	const double2 &lowest = area[0];
	const double2 &highest = area[2];

	// Written as negated ">=" so a NaN coordinate still yields false
	if (!(pos.x >= lowest.x && pos.y >= lowest.y))
		return false;

	return pos.x <= highest.x && pos.y <= highest.y;
}

// True when pos lies within the rectangle described by d_corners
// (the height-field texel-center bounds), edges included.
bool LineParams::insideHF(double2 pos) {
	const bool withinField = inside(pos, d_corners);
	return withinField;
}

// True when pos lies within the rectangle described by d_cornersOcc
// (the occlusion area), edges included.
bool LineParams::insideOcc(double2 pos) {
	const bool withinOcclusion = inside(pos, d_cornersOcc);
	return withinOcclusion;
}

// Returns a copy of d_dirOffsets; the caller owns the returned vector.
std::vector<int> LineParams::getDirOffsets() {
	std::vector<int> offsetsCopy = d_dirOffsets;
	return offsetsCopy;
}

// Builds the line that sweeps the same path as `li` in the opposite direction.
//
// The copy is walked forward until it leaves the height field, moved one step
// back, and reversed.  numSteps is recomputed with getSteps() for the reversed
// line, layerDistance becomes the negated number of forward steps (relative to
// the incoming value), and dirIndex is shifted by dirs()/2.
struct LineInfoD LineParams::genOpposeLine(struct LineInfoD li) {
	// Advance until the position is no longer inside the entire HF...
	for (; insideHF(li.startPos); li.layerDistance++)
		li.startPos += li.stepDir;

	// ...that is one step too far, so undo the last one
	li.startPos -= li.stepDir;
	li.layerDistance--;

	// This is the new start; turn around and measure how far the line runs
	li.stepDir = -li.stepDir;
	li.numSteps = getSteps(li.startPos, li.stepDir);

	li.layerDistance = -li.layerDistance;
	li.dirIndex += d_config->dirs()/2;

	return li;
}

void LineParams::organizeIntoBlocks(std::vector<struct LineInfoD> *pool) {
	// We copy the entire pool first, then we create an int vector that contains
	// the maximum length of a thread in the block..
	// We clear the original pool and re-add all the stripes in a decreasing order
	std::vector<struct LineInfoD> copyPool = *pool;
	std::vector<int> lengths;
	for (int i = 0; i < copyPool.size()/d_config->sweepStripe(); ++i)
		lengths.push_back(0);

	for (int stripe = 0; stripe < copyPool.size()/d_config->sweepStripe(); ++stripe)
		for (int j = 0; j < d_config->sweepStripe(); ++j) {
			int thisLength = copyPool.at(stripe*d_config->sweepStripe() + j).numSteps;
			if (lengths.at(stripe) < thisLength)
				lengths.at(stripe) = thisLength;
		}

	pool->clear();

	while (pool->size() != copyPool.size()) {
		// Finding the new max
		int max = 0;
		int stripePicked;
		for (int stripe = 0; stripe < copyPool.size()/d_config->sweepStripe(); ++stripe)
			if (lengths.at(stripe) > max) {
				max = lengths.at(stripe);
				stripePicked = stripe;
			}

		// Zeroing out the max so it will not be picked again
		lengths.at(stripePicked) = -1;

		// Anyway, adding the stripe..
		for (int j = 0; j < d_config->sweepStripe(); ++j)
			pool->push_back(copyPool.at(stripePicked*d_config->sweepStripe() + j));
	} // I think we're done..

	printf("Organized pool size %d\n", pool->size());
	/*for (int i = 0; i < pool->size(); ++i)
		printf("elem %d length %d (dir %d, offset %d)\n", i,
				pool->at(i).numSteps,
				pool->at(i).dirIndex,
				pool->at(i).layerOffset);*/
}

// CoalesceWidth tells how many threads, that end up writing consecutive columns in the
// result buffer, to pack together (consecutively).  Memory accesses can therefore
// occur in "coalesceWidth" wide coalesced transactions.
struct LineInfo *LineParams::getWork(size_t *size) {

	int coalesceWidth = d_config->sweepStripe();
	int blockSize = d_config->sweepBlock();

	if (coalesceWidth < 1 || coalesceWidth > (int)d_workPool.size())
		throw std::string("CoalesceWidth out of bounds");

	bool newWay = true; // && d_config->matchOpposing();
	std::vector<struct LineInfoD> result;
	struct LineInfoD pad = { double2(0.0, 0.0), double2(0.0, 0.0), -1, 0, 0, 0 };
	struct LineInfo padf = { float2(0.0f, 0.0f), float2(0.0f, 0.0f), -1, 0, 0, 0 };

	if (newWay) {
		int padEntries = 0;

		// The process is this:
		// 1  Generate stripe blocks for the first half..
		// 2  Calculate another pool of opposing stripes
		// 3  Align both of them
		// 4  Make complete blocks out of the two pools (aligned numSteps should suffice)
		// 5  Calc per-block idleIters & destIndices for the first part (reuse previous code)
		// 6  ...

		// We create block of lines the size of coalesceWidth..
		std::vector<struct LineInfoD> stripeBlocks;
		std::vector<struct LineInfoD> opposeBlocks;

		// Each dir at a time
		int maxDirs = d_config->dirs();
		if (d_config->matchOpposite())
			maxDirs /= 2;

		for (int dir = 0; dir < maxDirs; ++dir) {
			// We find the smallest offset (start from an edge)
			int smallest = 1000000;
			int smallestIndex;
			for (int line = 0; line < d_workPool.size(); ++line)
				if (d_workPool.at(line).dirIndex == dir)
					if (d_workPool.at(line).layerOffset < smallest) {
						smallest = d_workPool.at(line).layerOffset;
						smallestIndex = line;
					}
			
			// Now we generate stripes from this dir and pack the last one..
			int curOffset = smallest;
			bool fullBlock = true;
			while (fullBlock) {

				// PHASE 1
				for (int stripeLine = 0; stripeLine < coalesceWidth; ++stripeLine) {
					
					int foundLine = -1;
					// Now we find the curOffset line within dir
					for (int candidateLine = 0; candidateLine < d_workPool.size() && foundLine == -1; ++candidateLine) {
						if (d_workPool.at(candidateLine).dirIndex == dir && d_workPool.at(candidateLine).layerOffset == curOffset)
							foundLine = candidateLine;
					}

					// If we didn't find any, we add pad
					if (foundLine == -1) {
						//printf("Did not find next for dir %d, offset %d (steps %d)\n", dir, curOffset, d_workPool.at(smallestIndex).numSteps);
						stripeBlocks.push_back(pad);
						padEntries++;

						if (d_config->matchOpposite()) {
							opposeBlocks.push_back(pad);
							padEntries++;
						}

						fullBlock = false;
					} else {
						stripeBlocks.push_back(d_workPool.at(foundLine));
						// PHASE 2
						if (d_config->matchOpposite())
							opposeBlocks.push_back(genOpposeLine(d_workPool.at(foundLine)));
						curOffset++;
					}
				}
			}
		}

		// Printing the blocks..
		/*printf("total of %d blocks\n", curBlock);
		for (int block = 0; block < curBlock; ++block) {
			printf("Block %d:\n", block);
			std::vector<struct LineInfoD> *thisBlock = &(stripeBlocks.at(block));
			for (int elem = 0; elem < thisBlock->size(); ++elem) {
				printf("\tElem %d: dir %d, offset %d, length %d\n", elem, 
						thisBlock->at(elem).dirIndex,
						thisBlock->at(elem).layerOffset,
						thisBlock->at(elem).numSteps);
			}
		}*/

		// One big pool
		for (int i = 0; i < stripeBlocks.size(); ++i)
			result.push_back(stripeBlocks.at(i));
		if (d_config->matchOpposite())
			for (int i = 0; i < opposeBlocks.size(); ++i)
				result.push_back(opposeBlocks.at(i));

		printf("straight pool %d entries, opposite pool %d entries\n", stripeBlocks.size(), opposeBlocks.size());

		printf("%d pad threads in %d total (%.2f %%)\n", padEntries, result.size(),
				(float)padEntries/(float)result.size()*100.0f);
	} else {
		// First of all we sort all the work such that the longest running threads are in the beginning
		std::sort(d_workPool.begin(), d_workPool.end(), lengthCompare);

		if (d_debug == 2)
			for (int i = 0; i < (int)d_workPool.size(); ++i)
				printf("AFTER SORT Thread %d: dir %d, layerstart %d, offset %d, length %d\n",
					i, 
					d_workPool.at(i).dirIndex,
					d_workPool.at(i).layerDistance,
					d_workPool.at(i).layerOffset,
					d_workPool.at(i).numSteps);

		// Keeping track which threads have already been picked
		std::vector<bool> workPicked;
		for (int i = 0; i < (int)d_workPool.size(); ++i)
			workPicked.push_back(false);

		// We will return this
		//struct LineInfo *result = (struct LineInfo*) malloc(sizeof(struct LineInfo)*d_workPool.size());

		// Now we construct the work array (in coalesced packs) in the following way:
		// 1) We pick the first (longest) thread (that has not been picked yet) from the main pool.
		// 2) Then we pick the longest of the threads that are next to either side of the pack.
		// 3) Goto 2 until the pack is full, or there's no longer threads to choose from in which case pad.
		// 4) Goto 1 until there's no more threads unpicked
		int globalCounter = 0;
		while (globalCounter < (int)d_workPool.size()) {
			// Getting the topmost free thread
			int first = 0;
			while (workPicked.at(first))
				first++;

			result.push_back(d_workPool.at(first));
			globalCounter++;
			workPicked.at(first) = true;

			int slots = coalesceWidth - 1; // This many slots left in the pack
			// These are the "sides" of the pack
			int offsetMin = d_workPool.at(first).layerOffset;
			int offsetMax = d_workPool.at(first).layerOffset;
			int dirIndex = d_workPool.at(first).dirIndex;

			if (d_debug == 2)
				printf("First (%d): dir %d, offset %d, length %d\n", 
					first, d_workPool.at(first).dirIndex,
					d_workPool.at(first).layerOffset, d_workPool.at(first).numSteps);

			while (slots) {
				// Finding the threads that have the same dirIndex and offset of either
				// offsetMin-1 or offsetMax+1
				int bottom = -1;
				int top = -1;

				if (d_debug == 2)
					printf("Slots %d\n", slots);

				// FIXME:  Consider rewound numSteps instead of the tight one
				// (layerDistance within threads in a block need to be rewound so that they
				//  hit the same cache line.)
				for (int i = first+1; i < (int)d_workPool.size(); ++i) {
					if (workPicked.at(i))
						continue;

					if (d_workPool.at(i).dirIndex == dirIndex &&
							//d_workPool.at(i).layerOffset == offsetMin-d_config->skipLines()) {
							d_workPool.at(i).layerOffset == offsetMin-1) {
						//if (bottom != -1) { fprintf(stderr, "Bottom %d\n", bottom); exit(123); }
						bottom = i;
					}
					if (d_workPool.at(i).dirIndex == dirIndex &&
							//d_workPool.at(i).layerOffset == offsetMax+d_config->skipLines()) {
							d_workPool.at(i).layerOffset == offsetMax+1) {
						//if (top != -1) { fprintf(stderr, "Top %d\n", top); exit(123); }
						top = i;
					}
				}

				if (d_debug == 2) {
					if (top != -1) 
						printf("Top (%d): dir %d, offset %d, length %d\n", 
								top, d_workPool.at(top).dirIndex,
								d_workPool.at(top).layerOffset, d_workPool.at(top).numSteps);

					if (bottom != -1) 
						printf("Bottom (%d): dir %d, offset %d, length %d\n", 
								bottom, d_workPool.at(bottom).dirIndex,
								d_workPool.at(bottom).layerOffset, d_workPool.at(bottom).numSteps);
				}

				// If we didn't find anything, we pad
				if (top == -1 && bottom == -1) {
					while (slots) {
						if (d_debug == 2) {
							printf("Padding!  GlobalCounter %d, result counter %lu, slots %d\n",
									globalCounter, result.size(), slots);
						}
						result.push_back(pad);
						slots--;
					}
				} else {
					int pick;
					if (top == -1 || bottom == -1)
						pick = (top == -1) ? bottom : top;
					else {
						// We previously picked the one that had most iterations, but nowadays we pick
						// the one that is closer to alignment boundary.  This way we ALWAYS
						// create a pack that starts at aligned addres.
						/*pick = (d_workPool.at(top).numSteps > d_workPool.at(bottom).numSteps) ?
							top : bottom;*/
						int align = coalesceWidth;
						/*int topAlign = d_workPool.at(top).layerOffset%align;
						int bottomAlign = d_workPool.at(bottom).layerOffset%align;
						if (topAlign > bottomAlign)
							pick = bottom;
						else if (topAlign < bottomAlign)
							pick = top;
						else
							pick = (d_workPool.at(top).numSteps > d_workPool.at(bottom).numSteps) ?
							top : bottom;*/
						pick = ((d_workPool.at(top).layerOffset%align) > 
								(d_workPool.at(bottom).layerOffset%align)) ?
							bottom : top;
						// Somehow it seems most efficient to pick the top/bottom every time on tie,
						// instead of picking the longest running of these two..  Go figure, don't
						// have the time to analyze this in more detail right now..
					}

					if (d_debug == 2)
						printf("Picked %d\n", pick);
					// Pick is our pick
					result.push_back(d_workPool.at(pick));
					workPicked.at(pick) = true;
					slots--;
					globalCounter++;
					// Fixing offsetMin/Max
					((pick == top) ? offsetMax : offsetMin) = d_workPool.at(pick).layerOffset;
				}
			}
		}
	} // NewWay

	if (!newWay) {
		double padPercentage = (double)(result.size() - d_workPool.size())/(double)d_workPool.size()*100.0;

		if (d_debug) {
			printf("Work size %lu, result size %lu\n", d_workPool.size(), result.size());
			printf("Padding percentage %.2f %%\n", padPercentage);
		}

		if (padPercentage > 5.0)
			fprintf(stderr, "Padding exceeded 5%%, which is unusual.  Consider tail packing\n");
		if (padPercentage > 20.0)
			throw std::string("Padding exceeded 20%%.  We consider this an error");
	}

	alignThreads(result, coalesceWidth);

	if (newWay) {
		// Organizing the pool into blocks
		organizeIntoBlocks(&result);
	}

	/*
	// Then we create a sub pool for each direction
	std::vector<struct LineInfo> subPool[d_config->dirs()];
	for (int i = 0; i < d_workPool.size(); ++i) 
		subPool[d_workPool.at(i).dirIndex].push_back(d_workPool.at(i));
	*/

	int lineCounter = 0;
	int mirrorCounter = 0;	
	std::vector<struct LineInfo> resultF;

	int minIdleIters = 10000000;
	int threadCounter = 0;

	int fixedThread = 0;

	// First we fix the idle iters
	for (int i = 0; i < result.size(); ++i) {
		struct LineInfoD *in = &(result.at(i));

		//printf("Thread %d with %d steps..\n", i, in->numSteps);
		int thisIdleIters;
		
		if (in->numSteps) {
			thisIdleIters = stepInsideOccNorm(in->startPos, in->stepDir);
			if (thisIdleIters == -1)
				throw std::string("No iters inside occ box this far along the processing..");
		}

		if (threadCounter == d_config->sweepBlock()) {
			// This is the first thread of a brand new thread block, so we retroactively fix the previous block
			for (; fixedThread < i; fixedThread++)
				//result.at(fixedThread).idleSteps = minIdleIters;
				result.at(fixedThread).idleSteps = minIdleIters;

			#if 0 // This is fixed now
			if (minIdleIters < 2)
				throw std::string("Idle iters less than 2, we need to tweak the kernel..");
			//printf("Writing idleiters of %d\n", minIdleIters);
			#endif

			threadCounter = 0;
			minIdleIters = 1000000;
		}

		if (in->numSteps && thisIdleIters < minIdleIters)
			minIdleIters = thisIdleIters;

		// We use this to test when to start writing
		in->myWriteAfter = in->numSteps - thisIdleIters;

		threadCounter++;
	}

	for (; fixedThread < result.size(); fixedThread++) {
		//printf("Last block, thread %d, minsteps %d\n", fixedThread, minIdleIters);
		result.at(fixedThread).idleSteps = minIdleIters;
		//result.at(fixedThread).numSteps = 0;
	}

	
	int writeWidth = d_config->sweepStorageStripe();

	// "Ideally" 0 (no need to use this hack now:  always set to 0)
	#ifdef SKIPWRITES
	const int offset = 0*writeWidth; //d_config->sweepStripe();
	#else
	int offset;
	if (d_config->optiPath())
		offset = 50*writeWidth; //d_config->sweepStripe();
	else
		offset = 500*writeWidth; // Not really used
	#endif

	int bufferPos = offset;
	int stripeCounter = 0;
	int maxRun = 0;

	// NOTE:
	// If oppositeMatching is used, we only touch addresses (layerdistances)
	// of non-opposite threads.  Also, we do not write the result yet
	for (int i = 0; i < result.size(); ++i) {
		struct LineInfoD in = result.at(i);

		if (d_config->matchOpposite())
			#if 0
			if (in.dirIndex >=
			while (in.dirIndex >= d_config->dirs()/2)
				continue;
			#else
			if (in.dirIndex >= d_config->dirs()/2) {
				i += d_config->sweepStripe() - 1;
				continue; // We're not touching opposite dirs here
			}
			#endif

		struct LineInfo out;

		out.startPos = float2((float)in.startPos.x, (float)in.startPos.y);
		out.stepDir = float2((float)in.stepDir.x, (float)in.stepDir.y);
		out.dirIndex = in.dirIndex;
		out.numSteps = in.numSteps;
		out.idleSteps = in.idleSteps;
		out.layerDistance = in.layerDistance;
		out.layerOffset = in.layerOffset;
		out.tangent = in.tangent;

		#if 0
		if (in.layerDistance + in.numSteps > maxRun)
			maxRun = in.layerDistance + in.numSteps;

		if (d_config->matchOpposite() && out.dirIndex > d_config->dirs()/2-1) {
			out.layerDistance = mirrorCounter;
			mirrorCounter++;
		} else {
			out.layerDistance = lineCounter;
			lineCounter++;
		}
		#endif

		// Starting a new stripe block
		//if (stripeCounter == d_config->sweepStripe()) {
		if (stripeCounter == writeWidth) {

			// We update this first 'cause it's being used in opposite
			//bufferPos += maxRun*d_config->sweepStripe() + offset;
			bufferPos += maxRun*writeWidth + offset;

			#if 0
			if (d_config->matchOpposite()) {
				// We're generating another stripe of lines to match the opposite direction here
				for (int stripeLine = 0; stripeLine < d_config->sweepStripe(); ++stripeLine) {
					struct LineInfo li = resultF.at(resultF.size() - d_config->sweepStripe() + stripeLine);

					// We calculate how many extra steps we have before we have to write values that have to match
					int emptySteps = 0;
					while (!insideOcc(li.startPos)) {
						emptySteps++;
						li.startPos += li.stepDir;
					}
					// Now we're in.  We calculate how many useful steps we have
					int usefulSteps = 0;
					while (insideOcc(li.startPos)) {
						usefulSteps++;
						li.startPos += li.stepDir;
					}

					// In order to mirror this, we start emptySteps forward from this position
					li.startPos += li.stepDir*(float)emptySteps;
					li.stepDir = -li.stepDir;

					// The start is the same + maxRun rows forward
					li.layerDistance = bufferPos + stripeLine;


					// We start from the current position, and step until we go out of the entire HF.
					// Then we continue backwards until we hit occ box, and continue again until we no longer hit.
					while (insideHF(li.startPos)) {
						li.startPos += li.stepDir;
						li.layerDistance++;
					}
					// We went too far, so we step once back
					li.startPos -= li.stepDir;
					li.layerDistance--;

					// This is where we start, now we calculate how far we go
					li.stepDir = -li.stepDir;
					li.numSteps = getSteps(li.startPos, li.stepDir);
					li.layerDistance -= li.numSteps;
					li.layerDistance = d_config->sweepHeight() - 1 - li.layerDistance;
					li.dirIndex += d_config->dirs()/2;

					resultF.push_back(li);
				}
			} // matchOpposite()
			#endif

			stripeCounter = 0;
			maxRun = 0;
		}

		if (d_config->matchOpposite())
			result.at(i).layerDistance = bufferPos + stripeCounter;
		else
			out.layerDistance = bufferPos + stripeCounter;

		int thisRun = in.numSteps - in.idleSteps;
		if (in.numSteps && thisRun > maxRun)
			maxRun = thisRun;

		stripeCounter++;

		if (!d_config->matchOpposite())
			resultF.push_back(out);
	}

	// We're pushing the bufferPos forward one last time to make it an appropriate indicator for buffer size
	//bufferPos += maxRun*d_config->sweepStripe() + offset;
	bufferPos += maxRun*writeWidth + offset;

	d_concLength = bufferPos/writeWidth; //d_config->sweepStripe();

	d_maxDistance = maxRun;
	d_config->sweepHeight() = sweepHeight(); //d_maxDistance;


	// Now we're fixing the opposing dirs and really writing the result buffer..
	if (d_config->matchOpposite()) {
		for (int i = 0; i < result.size(); ++i) {
			struct LineInfoD in = result.at(i);
			struct LineInfo out;

			out.startPos = float2((float)in.startPos.x, (float)in.startPos.y);
			out.stepDir = float2((float)in.stepDir.x, (float)in.stepDir.y);
			out.dirIndex = in.dirIndex;
			out.numSteps = in.numSteps;
			out.idleSteps = in.idleSteps;
			out.layerDistance = in.layerDistance;
			out.layerOffset = in.layerOffset;
			out.tangent = in.tangent;

			// We're changing the out layerDistance if we're opposing
			if (in.dirIndex >= d_config->dirs()/2) {
				if (!in.numSteps) continue;
				// Finding the opposing..
				int found = -1;
				for (int j = 0; j < result.size(); ++j)
					if (result.at(j).dirIndex == in.dirIndex - d_config->dirs()/2 &&
							result.at(j).layerOffset == in.layerOffset)
						found = j;
				if (found == -1)
					throw std::string("Couldn't find an opposite");

				struct LineInfoD orig = result.at(found);
				//printf("Thisdir %d, opposeDir %d, numsteps this %d, numsteps oppose %d\n", in.dirIndex, orig.dirIndex, in.numSteps, orig.numSteps);


				#if 1
				// The last write position (coordinate) from orig:
				double2 endPos = orig.startPos + orig.stepDir*(float)(orig.numSteps-1);

				// Calculating how many steps from in this pos is
				float stepsF = (endPos - in.startPos).length() / in.stepDir.length();
				int steps = (int)(stepsF + 0.5);

				int writeSteps = steps - in.idleSteps;
				if (writeSteps < 0)
					throw std::string("Less than 0 writesteps");//printf("Writesteps of %d o_O\n", writeSteps);

				// Find the original write position at endPos..
				//int origWrite = orig.layerDistance + (orig.numSteps - orig.idleSteps - 1)*d_config->sweepStripe();
				int origWrite = orig.layerDistance + (orig.numSteps - orig.idleSteps - 1)*writeWidth;

				// Finally the destination write for orig is:
				//in.layerDistance = out.layerDistance = origWrite + writeSteps*d_config->sweepStripe();
				in.layerDistance = out.layerDistance = origWrite + writeSteps*writeWidth;

				#else
				// Now what we do is that we find the last position orig will write to, and set that as the first position
				// in will write to.
				// Fast forward idlesteps amount
				double2 endPos = orig.startPos + orig.stepDir*(float)(orig.numSteps-1);
				
				double2 firstWritePos = in.startPos + in.stepDir*(float)(in.idleSteps);

				// We calculate how many extra idle iters there are, i.e. how many steps from
				// firstWritePos before we hit endPos
				float stepsF = (endPos - firstWritePos).length()/in.stepDir.length();
				int steps = (int)(stepsF + 0.5);

				out.layerDistance = endIndex;

				// The end index is accurate, but there is "steps" steps which advance the index..
				// That means that out.layerDistance has to match endIndex after "steps" steps only.
				// We forward because opposing indices go backwards
				out.layerDistance += steps*d_config->sweepStripe();
				//printf("end/first %f %f\n", endPos.x/firstOpposePos.x, endPos.y/firstOpposePos.y);
				#endif
			}

			// This has a different meaning in GPU
			out.layerOffset = in.myWriteAfter;
			resultF.push_back(out);
		}
	}

	#if 1
	// Just making sure once more that we are writing coalesced indices
	for (int i = 0; i < resultF.size(); i += d_config->sweepStripe()) {
		//struct LineInfo first = resultF.at(i);
		for (int j = i+1; j < i+d_config->sweepStripe(); ++j) {
			if (!resultF.at(j).numSteps || !resultF.at(j-1).numSteps) continue;
			if (resultF.at(j).layerDistance != resultF.at(j-1).layerDistance + 1)
			#if 1
			{
				printf("%d and %d not mathcing!  block %d, within %d\n", j, j-1, i/d_config->sweepStripe(), j - (i/d_config->sweepStripe()*d_config->sweepStripe()));
				printf("\t%d: dir %d, layeroffset %d, layerdistance %d, steps %d\n",
						j,
						resultF.at(j).dirIndex, resultF.at(j).layerOffset, resultF.at(j).layerDistance, resultF.at(j).numSteps);
				printf("\t%d: dir %d, layeroffset %d, layerdistance %d, steps %d\n",
						j-1,
						resultF.at(j-1).dirIndex, resultF.at(j-1).layerOffset, resultF.at(j-1).layerDistance, resultF.at(j-1).numSteps);
				printf("\tdist difference %d\n",resultF.at(j).layerDistance-resultF.at(j-1).layerDistance);
				printf("*** WHOLE BLOCK ***\n");
				printBlock(&resultF, j);
			}
			/*else
				printf("%d and %d match: %d vs %d\n", j, j-1,
						resultF.at(j).layerDistance, resultF.at(j-1).layerDistance);*/
			#else
			throw std::string("not matching");
			#endif
		}
	}
	#endif

	// Checking that the elements in the result vector are indeed sizeof(struct LineInfo) bytes apart
	int totalEntries = ((int)resultF.size()+blockSize-1)/blockSize*blockSize;
	for (int i = (int)resultF.size(); i < totalEntries; ++i)
		resultF.push_back(padf);
	for (int i = 1; i < (int)resultF.size(); ++i)
		if ((long unsigned int)&resultF.at(i) - (long unsigned int)&resultF.at(i-1) != sizeof(struct LineInfo)) {
			printf("ptr 1 %p, ptr 2 %p, diff %lu\n", &resultF.at(i), &resultF.at(i-1), (long unsigned int)(&resultF.at(i)) - (long unsigned int)(&resultF.at(i-1)));
			throw std::string("Generated LineInfos not densely packed!");
		}

	/*for (int i = 0; i < (int)resultF.size(); ++i) {
		if (resultF.at(i).layerDistance + resultF.at(i).numSteps > d_config->sweepHeight()) {
			printf("elem %d:  layerheight %d, layerdist %d steps %d\n", i, d_config->sweepHeight(),
					resultF.at(i).layerDistance, resultF.at(i).numSteps);
		}
	}*/

	if (d_debug == 2) 
		for (int i = 0; i < (int)resultF.size(); i += blockSize)
			printf("Block %d starts at 128B offset %d\n", i/blockSize, resultF.at(i).layerOffset%256);

	/*if (d_config->lineSkip() != 1)
		removeLines(&resultF, d_config->lineSkip());*/

	d_totalLineCount = resultF.size();
	/*if (d_config->matchOpposite())
		d_totalLineCount /= 2;*/
	d_config->sweepWidth() = sweepWidth();

	*size = sizeof(struct LineInfo)*(int)resultF.size();

	if (d_debug)
		printf("Sweep line length %d, sweep lines %d\n", d_config->sweepHeight(), d_config->sweepWidth());

	// "Result" will get destroyed when we exit
	void *outData = malloc(*size);
	memcpy(outData, &resultF.at(0), *size);
	return (struct LineInfo*)outData;
}

void LineParams::removeLines(std::vector<struct LineInfo> *result, int lineSkip) {
	std::vector<struct LineInfo> oldResult = *result;
	result->clear();

	struct LineInfo pad = { float2(0.0f, 0.0f), float2(0.0f, 0.0f), 0, 0, 0, 0 };

	for (int l = 0; l < oldResult.size(); ++l) {
		struct LineInfo li = oldResult.at(l);

		// If we detect a line with 0 numSteps, it's a padding line
		// THEN, we replace the padding with our own:
		// We skip all consecutive padding lines from the old result,
		// and fill in the destination with pad lines until the next block boundary.
		if (li.numSteps == 0) {
			// Skip pads from the source
			while (l < oldResult.size() && oldResult.at(l).numSteps == 0)
				l++;
			l--;

			// Now filling the result..
			while (result->size() % d_config->sweepBlock())
				result->push_back(pad);
		} else {
			if ((li.layerOffset%lineSkip) == 0)
				result->push_back(li);
		}
	}
}

// Ordering predicate for threads: longest first, then by direction, then by
// layer offset.
//
// Lengths that differ by at most one step are deliberately treated as equal,
// because such threads may be adjacent and of equal length in the texture.
// NOTE: this tolerance makes the relation non-transitive (1 ~ 2 and 2 ~ 3,
// but 1 and 3 differ), so it is not a strict weak ordering.  It is a known
// sorting hazard that has been accepted because it works in practice.
bool LineParams::lengthCompare(const struct LineInfoD &l1, const struct LineInfoD &l2) {
	const bool lengthsDiffer = abs(l2.numSteps - l1.numSteps) > 1;

	// Primary key: the longer thread sorts first
	if (lengthsDiffer)
		return l1.numSteps > l2.numSteps;

	// Secondary key: group equal-length threads by direction
	if (l1.dirIndex != l2.dirIndex)
		return l1.dirIndex < l2.dirIndex;

	// Tertiary key: ascending offset within a direction
	return l1.layerOffset < l2.layerOffset;
}

// Ordering predicate that sorts threads of one direction by ascending layer
// offset, with padding entries (numSteps == 0) placed after all real threads.
//
// Throws std::string when asked to compare two real threads that belong to
// different directions.
bool LineParams::offsetCompare(const struct LineInfoD &l1, const struct LineInfoD &l2) {
	const bool firstIsPad = !l1.numSteps;
	const bool secondIsPad = !l2.numSteps;

	// Pads are the largest elements: a pad is never "less than" anything,
	// and a real thread is always "less than" a pad
	if (firstIsPad || secondIsPad)
		return !firstIsPad;

	// Offsets are only comparable within a single direction
	if (l1.dirIndex != l2.dirIndex)
		throw std::string("Trying to offsetCompare LIs of different dir");

	return l1.layerOffset < l2.layerOffset;
}

void LineParams::alignThreads(std::vector<struct LineInfoD> &pool, int coalesce) {
	// We first sort the "coalesce" wide blocks according to offset
	// Then make sure that, padding excluded, threads are really contiquous in the blocks
	// Then we rewind the threads so that they end up writing consecutive memory locations
	// And finally calculate how much empty iterations there are (due to rewinding
	// and due to differences in numSteps)
	const int numBlocks = (pool.size()+coalesce-1)/coalesce;
	std::vector<struct LineInfoD> blocks[numBlocks];
	for (int i = 0; i < (int)pool.size(); ++i)
		blocks[i/coalesce].push_back(pool.at(i));

	for (int i = 0; i < numBlocks; ++i) {
		std::sort(blocks[i].begin(), blocks[i].end(), offsetCompare);

		for (int j = 1; j < coalesce && blocks[i].at(j).numSteps; ++j) {
			//if (blocks[i].at(j-1).layerOffset+d_config->skipLines() != blocks[i].at(j).layerOffset)
			if (blocks[i].at(j-1).layerOffset+1 != blocks[i].at(j).layerOffset)
				throw std::string("Not contiquous!");

			// The earliness requirement is actually not necessary, and conflicts with the offset sorting:
			/*if (blocks[i].at(j-1).layerDistance > blocks[i].at(j).layerDistance) {
				printf("block %d, elem %d: %s\n", i, j-1, getLineInfo(blocks[i].at(j-1)).c_str());
				printf("block %d, elem %d: %s\n", i, j, getLineInfo(blocks[i].at(j)).c_str());
				throw std::string("Earliest thread not first in a block!");
			}*/
		}
	}

	if (d_debug == 2)
		for (int i = 0; i < numBlocks; ++i) 
			for (int j = 0; j < coalesce; ++j) 
				printf("block %d, elem %d: %s\n", i, j, getLineInfoD(blocks[i].at(j)).c_str());

	int wastedIters = 0;
	int rewoundIters = 0;
	for (int i = 0; i < numBlocks; ++i) {
		int maxIters = 0;
		int minDistance = 1e6; //d_maxDistance;
		for (int j = 0; j < coalesce && blocks[i].at(j).numSteps; ++j) {
			if (blocks[i].at(j).numSteps > maxIters)
				maxIters = blocks[i].at(j).numSteps;
			if (blocks[i].at(j).layerDistance < minDistance)
				minDistance = blocks[i].at(j).layerDistance;
		}

		if (d_debug == 2) {
			printf("Maxiters %d\n", maxIters);
			printf("Mindistance %d\n", minDistance);
		}

		for (int j = 0; j < coalesce; ++j)
			wastedIters += maxIters - blocks[i].at(j).numSteps;

		//minDistance = 0;
		/*minDistance -= 50;
		if (minDistance < 0) minDistance = 0;*/

		// Rewinding all threads to minDistance
		for (int j = 0; j < coalesce && blocks[i].at(j).numSteps; ++j) {
			int rewind = blocks[i].at(j).layerDistance - minDistance;

			int slant = blocks[i].at(j).dirIndex & 3; 
			//if (slant) rewind>>=2;

			blocks[i].at(j).numSteps += rewind;
			//blocks[i].at(j).numSteps = 1431;
			blocks[i].at(j).startPos -= blocks[i].at(j).stepDir * (double)rewind;
			blocks[i].at(j).layerDistance -= rewind;

			// A-HA!  We have to take change in offset into account aswell, if we're performing
			// slanted rewinding
			slant = 0;

			double slantCoef = 0.0;
			
			// FIXME!! Think about how the "extra" iterations are traversed in projection kernel..

			int shift = (int)((double)rewind*slantCoef + (slantCoef < 0.0 ? -0.5 : 0.5));
			shift = -rewind;
			//blocks[i].at(j).layerOffset -= shift;

			rewoundIters += rewind;
			if (d_debug == 2)
				printf("Rewound %d (i %d, j %d)\n", rewind, i, j);
		}
	}
	if (d_debug) {
		printf("Total of %d wasted iters (%d actual work) or %.2f %% due to numSteps discrepancy within blocks\n", 
				wastedIters, d_totalSteps, (double)wastedIters/(double)d_totalSteps*100.0);
		printf("Total of %d rewound iters\nCombined waste %.2f %% of actual\n", 
				rewoundIters, (double)(wastedIters+rewoundIters)/(double)d_totalSteps*100.0);
	}

	// Making sure, once again, that the threads will coalesce
	for (int i = 0; i < numBlocks; ++i) 
		for (int j = 1; j < coalesce && blocks[i].at(j).numSteps; ++j) {
			struct LineInfoD l1 = blocks[i].at(j-1);
			struct LineInfoD l2 = blocks[i].at(j);

			// FIXME Continue -> skip consistency checks
			continue;
			if (l1.dirIndex != l2.dirIndex)
				throw std::string("DirIndexes don't match in a block");
			if (l1.stepDir != l2.stepDir)
				throw std::string("StepDirs don't match in a block");
			if (l1.layerDistance != l2.layerDistance)
				throw std::string("LayerDistances don't match in a block");
			//if (l1.layerOffset+d_config->skipLines() != l2.layerOffset)
			if (l1.layerOffset+1 != l2.layerOffset)
				throw std::string("LayerOffset aren't consecutive in a block");
		}

	// Recreate the given pool
	pool.clear();
	for (int i = 0; i < numBlocks; ++i) 
		for (int j = 0; j < coalesce; ++j) {
			// Also we normalize the texturing coordinates
			blocks[i].at(j).startPos.x /= (double)d_config->hfWidth();
			blocks[i].at(j).startPos.y /= (double)d_config->hfHeight();
			blocks[i].at(j).stepDir.x /= (double)d_config->hfWidth();
			blocks[i].at(j).stepDir.y /= (double)d_config->hfHeight();

			pool.push_back(blocks[i].at(j));
		}
}

void LineParams::printBlock(std::vector<struct LineInfo> *pool, int line) {
	int blockId = line/d_config->sweepBlock();
	for (int i = 0; i < d_config->sweepBlock(); ++i) {
		struct LineInfo li = pool->at(blockId*d_config->sweepBlock() + i);
		printf("Block %d, Elem %d:  dir %d, offset %d, distance %d, steps %d\n",
				blockId,
				i,
				li.dirIndex,
				li.layerOffset,
				li.layerDistance,
				li.numSteps);
	}
}

// Formats the main fields of a (single precision) line descriptor as one
// human readable line of text, for debug output.
std::string LineParams::getLineInfo(const struct LineInfo li) {
	char text[1024];
	snprintf(text, sizeof(text),
			"dirIndex %d, startPos (%.2f, %.2f), stepDir (%.2f, %.2f), numSteps %d, layerDistance %d, layerOffset %d",
			li.dirIndex,
			li.startPos.x, li.startPos.y,
			li.stepDir.x, li.stepDir.y,
			li.numSteps, li.layerDistance, li.layerOffset);
	return std::string(text);
}

// Formats the main fields of a double precision line descriptor as one human
// readable line of text, using the same layout as getLineInfo().  Used by the
// verbose (d_debug == 2) block dump in alignThreads().
std::string LineParams::getLineInfoD(const struct LineInfoD li) {
	char infoLine[1024];
	// Bounded write: "%.2f" of a large double can be hundreds of characters
	// long, so the output is truncated rather than overflowing the buffer.
	snprintf(infoLine, sizeof(infoLine),
			"dirIndex %d, startPos (%.2f, %.2f), stepDir (%.2f, %.2f), numSteps %d, layerDistance %d, layerOffset %d",
			(int)li.dirIndex,
			(double)li.startPos.x, (double)li.startPos.y,
			(double)li.stepDir.x, (double)li.stepDir.y,
			(int)li.numSteps, (int)li.layerDistance, (int)li.layerOffset);
	return std::string(infoLine);
}

// Width of the sweep buffer: the configured storage stripe
// (d_config->sweepStorageStripe()).
int LineParams::sweepWidth() {
	const int storageStripe = d_config->sweepStorageStripe();
	return storageStripe;
}

// Height of the sweep buffer: the stored d_concLength member
// (d_maxDistance is no longer used for this).
int LineParams::sweepHeight() {
	const int height = d_concLength;
	return height;
}

