#define LANG_CUDA
// 2D texture reference of single floats, read unconverted (cudaReadModeElementType).
// sweep() samples it with tex2D() and uses the fetched value as the sample height.
// NOTE(review): texture references are the legacy texture API; confirm the
// toolkit version used to build this file still supports them.
texture<float, 2, cudaReadModeElementType> depthTex;
// 2D texture reference of float4 elements. It is not sampled anywhere in the
// visible part of this file; presumably bound by the host for other kernels.
texture<float4, 2, cudaReadModeElementType> normalTex;

#include "sharedStructs.h"
#include "sharedConstants.h"

// Exact component-wise equality of two float2 values.
// No tolerance is applied: both components must compare equal bit-for-value
// (so any NaN component makes the result false).
__inline__ __device__ bool operator==(const float2 a, const float2 b) {
	const bool sameX = (a.x == b.x);
	const bool sameY = (a.y == b.y);
	return sameX && sameY;
}
// Exact component-wise inequality of two float2 values: true as soon as either
// component differs (a NaN component therefore yields true). No tolerance.
__inline__ __device__ bool operator!=(const float2 a, const float2 b) {
	return !(a.x == b.x && a.y == b.y);
}
// Component-wise sum of two float2 values.
__inline__ __device__ float2 operator+(const float2 a, const float2 b) {
	float2 sum;
	sum.x = a.x + b.x;
	sum.y = a.y + b.y;
	return sum;
}

// Component-wise difference a - b of two float2 values.
__inline__ __device__ float2 operator-(const float2 a, const float2 b) {
	float2 diff;
	diff.x = a.x - b.x;
	diff.y = a.y - b.y;
	return diff;
}

// Unary negation: returns a float2 with both components sign-flipped.
__inline__ __device__ float2 operator-(const float2 a) {
	float2 negated;
	negated.x = -a.x;
	negated.y = -a.y;
	return negated;
}

// Scales both components of `a` in place by the scalar `c`.
// Returns nothing, so the expression cannot be chained.
__inline__ __device__ void operator*=(float2 &a, const float c) {
	a = make_float2(a.x*c, a.y*c);
}
// Multiplies `a` in place by `c`, component by component (x with x, y with y).
// Returns nothing, so the expression cannot be chained.
__inline__ __device__ void operator*=(float2 &a, const float2 c) {
	a = make_float2(a.x*c.x, a.y*c.y);
}

// Vector-times-scalar: returns `a` with both components multiplied by `c`.
__inline__ __device__ float2 operator*(const float2 a, const float c) {
	float2 scaled = a;
	scaled.x *= c;
	scaled.y *= c;
	return scaled;
}

// Scalar-times-vector: same result as the (float2, float) overload, provided so
// the scalar may appear on the left-hand side.
__inline__ __device__ float2 operator*(const float c, const float2 a) {
	float2 scaled = a;
	scaled.x *= c;
	scaled.y *= c;
	return scaled;
}

// Adds `b` to `a` in place, component-wise. Returns nothing (not chainable).
__inline__ __device__ void operator+=(float2 &a, const float2 b) {
	a = make_float2(a.x + b.x, a.y + b.y);
}

// Subtracts `b` from `a` in place, component-wise. Returns nothing (not chainable).
__inline__ __device__ void operator-=(float2 &a, const float2 b) {
	a = make_float2(a.x - b.x, a.y - b.y);
}

// Dot product of two float3 vectors, accumulated in x, y, z order.
__inline__ __device__ float SSEOdot3(const float3 a, const float3 b) {
	const float xx = a.x*b.x;
	const float yy = a.y*b.y;
	const float zz = a.z*b.z;
	return xx + yy + zz;
}

// Dot product of two float2 vectors.
__inline__ __device__ float SSEOdot2(const float2 a, const float2 b) {
	const float xx = a.x*b.x;
	const float yy = a.y*b.y;
	return xx + yy;
}

// Euclidean length of a float3: square root of the sum of squared components.
__inline__ __device__ float SSEOlength3(const float3 a) {
	const float lengthSquared = a.x*a.x + a.y*a.y + a.z*a.z;
	return sqrtf(lengthSquared);
}

// Euclidean length of a float2: square root of the sum of squared components.
__inline__ __device__ float SSEOlength2(const float2 a) {
	const float lengthSquared = a.x*a.x + a.y*a.y;
	return sqrtf(lengthSquared);
}

// Returns `a` scaled by the reciprocal square root of its squared length.
// There is no zero-length guard: for an all-zero input the reciprocal square
// root is infinite and the resulting components are NaN.
__inline__ __device__ float3 SSEOnormalize3(const float3 a) {
	const float invLength = rsqrtf(a.x*a.x + a.y*a.y + a.z*a.z);
	float3 unit;
	unit.x = a.x*invLength;
	unit.y = a.y*invLength;
	unit.z = a.z*invLength;
	return unit;
}

// In-place variant of float3 normalisation: overwrites `a` with `a` scaled by
// the reciprocal square root of its squared length. No zero-length guard, so an
// all-zero input ends up with NaN components.
__inline__ __device__ void normalizeThis(float3 &a) {
	const float invLength = rsqrtf(a.x*a.x + a.y*a.y + a.z*a.z);
	a = make_float3(a.x*invLength, a.y*invLength, a.z*invLength);
}

// Returns `a` scaled by the reciprocal square root of its squared length.
// There is no zero-length guard: for an all-zero input the reciprocal square
// root is infinite and the resulting components are NaN.
__inline__ __device__ float2 SSEOnormalize2(const float2 a) {
	const float invLength = rsqrtf(a.x*a.x + a.y*a.y);
	float2 unit;
	unit.x = a.x*invLength;
	unit.y = a.y*invLength;
	return unit;
}

// In-place variant of float2 normalisation: overwrites `v` with `v` scaled by
// the reciprocal square root of its squared length. No zero-length guard, so an
// all-zero input ends up with NaN components.
__inline__ __device__ void normalizeThis(float2 &v) {
	const float invLength = rsqrtf(v.x*v.x + v.y*v.y);
	v = make_float2(v.x*invLength, v.y*invLength);
}

// Cross product a x b of two float3 vectors.
__inline__ __device__ float3 SSEOcross3(const float3 a, const float3 b) {
	float3 result;
	result.x = a.y*b.z - a.z*b.y;
	result.y = a.z*b.x - a.x*b.z;
	result.z = a.x*b.y - a.y*b.x;
	return result;
}

// Builds a 3D point from a 2D position and a height:
//   x = (1.0    - screenPos.x/768) * height
//   y = (0.5625 - screenPos.y/768) * height
//   z = height
// (the literal -1.302083E-03f is -1/768 to the printed precision).
// Algebraically this is the inverse of proj() in this file.
// NOTE(review): the constants look baked in for one specific resolution /
// field of view - confirm against the generator that emitted this file.
__device__ float3 unProj(const float2 screenPos, const float height) {
  const float xFactor = 1.000000E+00f + screenPos.x*-1.302083E-03f;
  const float yFactor = 5.625000E-01f + screenPos.y*-1.302083E-03f;
  return make_float3(xFactor*height, yFactor*height, height);
}
// Perspective-divides a 3D point and maps it to 2D:
//   x = -768 * eyePos.x / eyePos.z + 768
//   y = -768 * eyePos.y / eyePos.z + 432
// The reciprocal of z is taken with the fast, reduced-accuracy __fdividef
// intrinsic. There is no guard against eyePos.z == 0.
__device__ float2 proj(const float3 eyePos) {
  const float invZ = __fdividef(1.0f, eyePos.z);
  const float projectedX = -7.680000E+02f*invZ*eyePos.x + 7.680000E+02f;
  const float projectedY = -7.680000E+02f*invZ*eyePos.y + 4.320000E+02f;
  return make_float2(projectedX, projectedY);
}
// True when `pos` lies strictly inside the open rectangle
//   (3.255208E-04, 9.996745E-01) x (5.787037E-04, 9.994213E-01).
// The margins equal 0.5/1536 and 0.5/864 to the printed precision, which is
// presumably half a texel of a 1536x864 texture - TODO confirm.
// Any NaN component makes the result false.
__device__ bool insideTex(const float2 pos) {
  const bool xInRange = pos.x > 3.255208E-04f && pos.x < 9.996745E-01f;
  const bool yInRange = pos.y > 5.787037E-04f && pos.y < 9.994213E-01f;
  return xInRange && yInRange;
}

// Distance attenuation for a 2D vector: 1 / (1 + 0.25 * |horVec|^2).
// Yields 1 for the zero vector and decays towards 0 as the vector grows.
__device__ float vecFalloff(float2 horVec) {
  const float invCoef = 2.500000E-01f;
  const float lengthSquared = horVec.x*horVec.x + horVec.y*horVec.y;
  const float denominator = 1.0f + invCoef*lengthSquared;
  return (1.0f/denominator);
}
// Distance attenuation for a scalar distance: coef / (coef + distance^2),
// with coef fixed at 1. Yields 1 at distance 0 and decays towards 0.
__device__ float fallOff(const float distance) {
  const float coef = 1.0f;
  const float distanceSquared = distance*distance;
  const float denominator = coef + distanceSquared;
  return coef/denominator;
}
// Compares the weighted occlusion contribution of two 2D vectors.
// For each vector v the weight is
//   max(0, dot(upVec, normalize(v)) + 0.85) * vecFalloff(v)
// and the function returns true when v1's weight is strictly greater than v2's.
__device__ bool occlusionCompare(const float2 v1, const float2 v2, const float2 upVec) {
  const float bias = -8.500000E-01f;
  const float cosine1 = SSEOdot2(upVec, SSEOnormalize2(v1));
  const float cosine2 = SSEOdot2(upVec, SSEOnormalize2(v2));
  const float weight1 = (fmaxf(0.0f, cosine1 - bias))*vecFalloff(v1);
  const float weight2 = (fmaxf(0.0f, cosine2 - bias))*vecFalloff(v2);
  return weight1 > weight2;
}

// Snaps a coordinate to a cell centre of a 1536 x 864 grid:
// each component is scaled by the grid size, truncated to an int (towards
// zero), then mapped back with the cell size plus half a cell
// (6.510417E-04 ~ 1/1536, 1.157407E-03 ~ 1/864 to the printed precision).
__device__ float2 snapCoord(const float2 in) {
  const int column = (int)(in.x*1.536000E+03f);
  const int row = (int)(in.y*8.640000E+02f);
  const float snappedX = (float)column*6.510417E-04f + 3.255208E-04f;
  const float snappedY = (float)row*1.157407E-03f + 5.787037E-04f;
  return make_float2(snappedX, snappedY);
}


// sweep: each thread walks one line across depthTex. At every step it samples a
// height, turns it into a 2D point (projected distance along the line, height),
// and keeps a small stack of earlier points (h1, h2, h3 in registers plus a
// 16-entry ring per thread in shared memory). Earlier points whose weighted
// contribution is not larger than that of the point beneath them are popped.
// During the output phase the surviving top-of-stack contribution is written out.
//
// Parameters:
//   liIn - one LineInfo per thread, indexed by the global thread id. Fields
//          read here: numSteps, idleSteps, startPos, stepDir, dirIndex,
//          layerDistance.
//   out  - output buffer, addressed both as float (out[destIndex*2]) and as
//          short (stride 4, offset 2 or 3), i.e. as 8-byte records holding one
//          float followed by two 16-bit values.
//
// NOTE(review): there is no bounds check of tid against the length of liIn, so
// the launch presumably covers exactly the number of LineInfo entries - confirm.
// NOTE(review): both shared arrays are indexed with threadIdx.x against a
// dimension of 32, so this looks like it expects blockDim.x <= 32 - confirm
// against the host launch code.
extern "C" __global__ void sweep(const struct LineInfo * __restrict liIn, float * __restrict out) {
  int tid = blockIdx.x*blockDim.x + threadIdx.x;
  // Private copy: numSteps, idleSteps and startPos are modified while walking.
  struct LineInfo li = liIn[tid];
  // Two samples are consumed before the first stack update, so shorter lines do nothing.
  if (li.numSteps < 2) return;
  int destIndex = li.layerDistance;
  // 16-bit view of the output; dirIndex < 8 selects short slot 2 of each
  // 4-short record, everything else slot 3.
  short *outOcc = (short*)out;
  outOcc += (li.dirIndex < 8) ? 2 : 3;
  // Output records advance by +32 or -32 per written step depending on dirIndex.
  int myStripe = (li.dirIndex < 8) ? 32 : -32;
  // Unit direction of the step after scaling stepDir by (1536, 864).
  float2 dirStep = SSEOnormalize2(make_float2(li.stepDir.x*1.536000E+03f, li.stepDir.y*8.640000E+02f));
  // Stack depth counter; also (masked with 15) the ring position in convexHull.
  int convexIndex = 1;
  // Per-thread ring of 16 spilled stack entries; a thread only ever touches
  // column [..][threadIdx.x], which is why no __syncthreads() appears below.
  __shared__ float2 convexHull[16][32];
  // h1 = most recent stack point, h2 = the one below it, h3 = the one below h2.
  float2 h1, h2, h3;
  float2 pLocal;
  float2 upVec;

  // Sentinel for the not-yet-existing third stack entry.
  h3 = make_float2(0.0, -10000.0f);
  
  /* Taking a sample START */
  {
    // Sample layout (same in every sampling block below):
    //   height = depthTex at the snapped position,
    //   projXY = (1 - 2*x, 0.5625 - 1.125*y),
    //   point  = (dot(projXY, dirStep)*height, height).
    float2 tempSnapCoord = snapCoord(li.startPos);
    float height = tex2D(depthTex, tempSnapCoord.x, tempSnapCoord.y);
    float2 projXY = make_float2(1.000000E+00f + tempSnapCoord.x*-2.000000E+00f, 5.625000E-01f + tempSnapCoord.y*-1.125000E+00f);
    h2 = make_float2((projXY.x*dirStep.x + projXY.y*dirStep.y)*height, height);
  }
  /* Taking a sample END */
  
  
  /* Stepping forward START */
  li.numSteps--;
  li.idleSteps--;
  li.startPos += li.stepDir;
  /* Stepping forward END */
  
  
  /* Taking a sample START */
  {
    float2 tempSnapCoord = snapCoord(li.startPos);
    float height = tex2D(depthTex, tempSnapCoord.x, tempSnapCoord.y);
    float2 projXY = make_float2(1.000000E+00f + tempSnapCoord.x*-2.000000E+00f, 5.625000E-01f + tempSnapCoord.y*-1.125000E+00f);
    h1 = make_float2((projXY.x*dirStep.x + projXY.y*dirStep.y)*height, height);
  }
  /* Taking a sample END */
  
  
  /* Stepping forward START */
  li.numSteps--;
  li.idleSteps--;
  li.startPos += li.stepDir;
  /* Stepping forward END */
  
  // Staging area for batches of 4 samples per thread (slot*32 + threadIdx.x).
  __shared__ float2 pLocalS[128];
  // Phase 1: idle steps, 4 at a time. The stack is updated but nothing is written.
  while (li.idleSteps > 4) {
    // Fetch 4 consecutive samples first ...
    for (int slot = 0; slot < 4; ++slot) {
      
      /* Taking a sample START */
      {
        float2 tempSnapCoord = snapCoord(li.startPos);
        float height = tex2D(depthTex, tempSnapCoord.x, tempSnapCoord.y);
        float2 projXY = make_float2(1.000000E+00f + tempSnapCoord.x*-2.000000E+00f, 5.625000E-01f + tempSnapCoord.y*-1.125000E+00f);
        pLocalS[slot*32 + threadIdx.x] = make_float2((projXY.x*dirStep.x + projXY.y*dirStep.y)*height, height);
      }
      /* Taking a sample END */
      
      li.startPos += li.stepDir;
    }
    // ... then run the stack update for each of them in order.
    for (int slot = 0; slot < 4; ++slot) {
      pLocal = pLocalS[slot*32 + threadIdx.x];
      // Unit vector from the current point towards the origin of the 2D frame.
      upVec = SSEOnormalize2(-pLocal);
      float2 v1 = h1 - pLocal;
      float2 v2 = h2 - pLocal;
      // dotN = max(0, cos(angle to upVec) + 0.85); occN = dotN * distance falloff.
      float dot1 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v1)) - -8.500000E-01f));
      float dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
      float occ1 = dot1*vecFalloff(v1);
      float occ2 = dot2*vecFalloff(v2);
      // Pop budget: at most 15 entries are removed per new point.
      int fullIters = 15;
      // Pop the top while it does not beat the entry below it (1e-3 tolerance)
      // and the stack is not empty.
      if (convexIndex && occ1 <= occ2 + 1.000000E-03f && dot1 <= dot2 + 1.000000E-03f) {
        // First pop: the replacement for h2 is still in register h3.
        dot1 = dot2;
        occ1 = occ2;
        h1 = h2;
        h2 = h3;
        convexIndex--;
        fullIters--;
        v2 = h2 - pLocal;
        dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
        occ2 = dot2*vecFalloff(v2);
        // Further pops reload h2 from the shared-memory ring.
        while (fullIters && convexIndex && occ1 <= occ2 + 1.000000E-03f && dot1 <= dot2 + 1.000000E-03f) {
          dot1 = dot2;
          occ1 = occ2;
          h1 = h2;
          convexIndex--;
          h2 = convexHull[convexIndex&15][threadIdx.x];
          fullIters--;
          v2 = h2 - pLocal;
          dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
          occ2 = dot2*vecFalloff(v2);
        }
      }
      // Push pLocal: registers shift down by one (h2 -> h3, h1 -> h2).
      h3 = h2;
      // Only when nothing was popped does h2 need to be spilled to the ring.
      if (fullIters == 15) {
        convexHull[convexIndex&15][threadIdx.x] = h2;
      }
      convexIndex++;
      h2 = h1;
      h1 = pLocal;
      li.numSteps--;
      li.idleSteps--;
    }
  }
  // Phase 2: remaining idle steps one at a time (same update, still no output).
  while (li.idleSteps > 0) {
    
    /* Taking a sample START */
    {
      float2 tempSnapCoord = snapCoord(li.startPos);
      float height = tex2D(depthTex, tempSnapCoord.x, tempSnapCoord.y);
      float2 projXY = make_float2(1.000000E+00f + tempSnapCoord.x*-2.000000E+00f, 5.625000E-01f + tempSnapCoord.y*-1.125000E+00f);
      pLocal = make_float2((projXY.x*dirStep.x + projXY.y*dirStep.y)*height, height);
      upVec = SSEOnormalize2(-pLocal);
    }
    /* Taking a sample END */
    
    float2 v1 = h1 - pLocal;
    float2 v2 = h2 - pLocal;
    float dot1 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v1)) - -8.500000E-01f));
    float dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
    float occ1 = dot1*vecFalloff(v1);
    float occ2 = dot2*vecFalloff(v2);
    int fullIters = 15;
    // Same pop / push sequence as in phase 1.
    if (convexIndex && occ1 <= occ2 + 1.000000E-03f && dot1 <= dot2 + 1.000000E-03f) {
      dot1 = dot2;
      occ1 = occ2;
      h1 = h2;
      h2 = h3;
      convexIndex--;
      fullIters--;
      v2 = h2 - pLocal;
      dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
      occ2 = dot2*vecFalloff(v2);
      while (fullIters && convexIndex && occ1 <= occ2 + 1.000000E-03f && dot1 <= dot2 + 1.000000E-03f) {
        dot1 = dot2;
        occ1 = occ2;
        h1 = h2;
        convexIndex--;
        h2 = convexHull[convexIndex&15][threadIdx.x];
        fullIters--;
        v2 = h2 - pLocal;
        dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
        occ2 = dot2*vecFalloff(v2);
      }
    }
    h3 = h2;
    if (fullIters == 15) {
      convexHull[convexIndex&15][threadIdx.x] = h2;
    }
    convexIndex++;
    h2 = h1;
    h1 = pLocal;
    
    /* Stepping forward START */
    li.numSteps--;
    li.idleSteps--;
    li.startPos += li.stepDir;
    /* Stepping forward END */
    
  }
  // Phase 3: output steps, again in batches of 4 samples.
  // NOTE(review): numSteps is only tested once per batch, so each pass always
  // samples and writes 4 steps even if fewer remain - presumably the host pads
  // the remaining step count to a multiple of 4; confirm.
  float occlusion = 0.0f;
  while (li.numSteps > 0) {
    for (int slot = 0; slot < 4; ++slot) {
      
      /* Taking a sample START */
      {
        float2 tempSnapCoord = snapCoord(li.startPos);
        float height = tex2D(depthTex, tempSnapCoord.x, tempSnapCoord.y);
        float2 projXY = make_float2(1.000000E+00f + tempSnapCoord.x*-2.000000E+00f, 5.625000E-01f + tempSnapCoord.y*-1.125000E+00f);
        pLocalS[slot*32 + threadIdx.x] = make_float2((projXY.x*dirStep.x + projXY.y*dirStep.y)*height, height);
      }
      /* Taking a sample END */
      
      li.startPos += li.stepDir;
    }
    for (int slot = 0; slot < 4; ++slot) {
      pLocal = pLocalS[slot*32 + threadIdx.x];
      upVec = SSEOnormalize2(-pLocal);
      float2 v1 = h1 - pLocal;
      float2 v2 = h2 - pLocal;
      float dot1 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v1)) - -8.500000E-01f));
      float dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
      float occ1 = dot1*vecFalloff(v1);
      float occ2 = dot2*vecFalloff(v2);
      int fullIters = 15;
      // Same pop / push sequence as in phases 1 and 2.
      if (convexIndex && occ1 <= occ2 + 1.000000E-03f && dot1 <= dot2 + 1.000000E-03f) {
        dot1 = dot2;
        occ1 = occ2;
        h1 = h2;
        h2 = h3;
        convexIndex--;
        fullIters--;
        v2 = h2 - pLocal;
        dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
        occ2 = dot2*vecFalloff(v2);
        while (fullIters && convexIndex && occ1 <= occ2 + 1.000000E-03f && dot1 <= dot2 + 1.000000E-03f) {
          dot1 = dot2;
          occ1 = occ2;
          h1 = h2;
          convexIndex--;
          h2 = convexHull[convexIndex&15][threadIdx.x];
          fullIters--;
          v2 = h2 - pLocal;
          dot2 = (fmaxf(0.0f, SSEOdot2(upVec, SSEOnormalize2(v2)) - -8.500000E-01f));
          occ2 = dot2*vecFalloff(v2);
        }
      }
      h3 = h2;
      if (fullIters == 15) {
        convexHull[convexIndex&15][threadIdx.x] = h2;
      }
      convexIndex++;
      h2 = h1;
      h1 = pLocal;
      // occ1 now belongs to the surviving stack top; shift it by -0.85.
      occlusion = -8.500000E-01f + occ1;
      // Writing out
      // The sampled height goes into the float slot of the record, but only
      // for dirIndex < 8; other directions write just their 16-bit slot.
      if (li.dirIndex < 8) {
        out[destIndex*2] = pLocal.y;
      }
      // NOTE(review): this stores the result of __float2half_rn into a short.
      // Whether that keeps the raw half bit pattern or converts the numeric
      // value depends on the toolkit's return type for __float2half_rn -
      // confirm for the CUDA version in use.
      outOcc[destIndex*4] = __float2half_rn(occlusion);
      li.numSteps--;
      destIndex += myStripe;
    }
  }
}
