Friday, February 10, 2017

glsl - Deferred tiled shading, tile frusta calculation in OpenGL


I'm trying to do deferred tiled shading in OpenGL using the compute shader but I have hit a snag when trying to create the frustum for each tile. I'm using AMD's Forward+ demo (written in D3D) as a guide but lights seem to be culled when they shouldn't.



Read below for updates.


This is my (complete) compute shader:


    #version 430 core


#define MAX_LIGHTS 1024
#define MAX_LIGHTS_PER_TILE 40

#define WORK_GROUP_SIZE 16

struct PointLight
{
vec3 position;
float radius;
vec3 color;

float intensity;
};

layout (binding = 0, rgba32f) uniform writeonly image2D outTexture;
layout (binding = 1, rgba32f) uniform readonly image2D normalDepth;
layout (binding = 2, rgba32f) uniform readonly image2D diffuse;
layout (binding = 3, rgba32f) uniform readonly image2D specular;
layout (binding = 4, rgba32f) uniform readonly image2D glowMatID;

layout (std430, binding = 5) buffer BufferObject

{
PointLight pointLights[];
};

uniform mat4 view;
uniform mat4 proj;
uniform mat4 viewProj;
uniform mat4 invViewProj;
uniform mat4 invProj;
uniform vec2 framebufferDim;


layout (local_size_x = WORK_GROUP_SIZE, local_size_y = WORK_GROUP_SIZE) in;

shared uint minDepth = 0xFFFFFFFF;
shared uint maxDepth = 0;
shared uint pointLightIndex[MAX_LIGHTS];
shared uint pointLightCount = 0;

vec3 ReconstructWP(float z, vec2 uv_f)
{

vec4 sPos = vec4(uv_f * 2.0 - 1.0, z, 1.0);
sPos = invViewProj * sPos;

return (sPos.xyz / sPos.w);
}

vec4 ConvertProjToView( vec4 p )
{
p = invProj * p;
p /= p.w;

return p;
}

// calculate the number of tiles in the horizontal direction
uint GetNumTilesX()
{
return uint(( ( 1280 + WORK_GROUP_SIZE - 1 ) / float(WORK_GROUP_SIZE) ));
}

// calculate the number of tiles in the vertical direction

uint GetNumTilesY()
{
return uint(( ( 720 + WORK_GROUP_SIZE - 1 ) / float(WORK_GROUP_SIZE) ));
}


vec4 CreatePlaneEquation( vec4 b, vec4 c )
{
vec4 n;


// normalize(cross( b.xyz-a.xyz, c.xyz-a.xyz )), except we know "a" is the origin
n.xyz = normalize(cross( b.xyz, c.xyz ));

// -(n dot a), except we know "a" is the origin
n.w = 0;

return n;
}

float GetSignedDistanceFromPlane( vec4 p, vec4 eqn )

{
// dot( eqn.xyz, p.xyz ) + eqn.w, , except we know eqn.w is zero
// (see CreatePlaneEquation above)
return dot( eqn.xyz, p.xyz );
}

vec4 CalculateLighting( PointLight p, vec3 wPos, vec3 wNormal, vec4 wSpec, vec4 wGlow)
{
vec3 direction = p.position - wPos;


if(length(direction) > p.radius)
return vec4(0.0f, 0.0f, 0.0f, 0.0f);

float attenuation = 1.0f - length(direction) / (p.radius);
direction = normalize(direction);
float diffuseFactor = max(0.0f, dot(direction, wNormal)) * attenuation;
return vec4(p.color.xyz, 0.0f) * diffuseFactor * p.intensity;
}



void main()
{
ivec2 pixelPos = ivec2(gl_GlobalInvocationID.xy);
vec2 tilePos = vec2(gl_WorkGroupID.xy * gl_WorkGroupSize.xy) / vec2(1280, 720);

vec4 normalColor = imageLoad(normalDepth, pixelPos);

float d = normalColor.w;

uint depth = uint(d * 0xFFFFFFFF);


atomicMin(minDepth, depth);
atomicMax(maxDepth, depth);

barrier();

float minDepthZ = float(minDepth / float(0xFFFFFFFF));
float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

vec4 frustumEqn[4];

uint pxm = WORK_GROUP_SIZE * gl_WorkGroupID.x;
uint pym = WORK_GROUP_SIZE * gl_WorkGroupID.y;
uint pxp = WORK_GROUP_SIZE * (gl_WorkGroupID.x + 1);
uint pyp = WORK_GROUP_SIZE * (gl_WorkGroupID.y + 1);

uint uWindowWidthEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesX();
uint uWindowHeightEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesY();

vec4 frustum[4];
frustum[0] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );

frustum[1] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
frustum[2] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f ,1.0f) );
frustum[3] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );

for (int i = 0; i < 4; i++)
frustumEqn[i] = CreatePlaneEquation(frustum[i], frustum[(i+1) & 3]);

barrier();

int threadsPerTile = WORK_GROUP_SIZE * WORK_GROUP_SIZE;


for (uint i = 0; i < MAX_LIGHTS; i+= threadsPerTile)
{
uint il = gl_LocalInvocationIndex + i;

if (il < MAX_LIGHTS)
{
PointLight p = pointLights[il];

vec4 viewPos = view * vec4(p.position, 1.0f);

float r = p.radius;

if (viewPos.z + minDepthZ < r && viewPos.z - maxDepthZ < r)
{

if( ( GetSignedDistanceFromPlane( viewPos, frustumEqn[0] ) < r ) &&
( GetSignedDistanceFromPlane( viewPos, frustumEqn[1] ) < r ) &&
( GetSignedDistanceFromPlane( viewPos, frustumEqn[2] ) < r ) &&
( GetSignedDistanceFromPlane( viewPos, frustumEqn[3] ) < r) )


{
uint id = atomicAdd(pointLightCount, 1);
pointLightIndex[id] = il;
}
}

}
}

barrier();


vec4 diffuseColor = imageLoad(diffuse, pixelPos);
vec4 specularColor = imageLoad(specular, pixelPos);
vec4 glowColor = imageLoad(glowMatID, pixelPos);

vec2 uv = vec2(pixelPos.x / 1280.0f, pixelPos.y / 720.0f);

vec3 wp = ReconstructWP(d, uv);
vec4 color = vec4(0.0f, 0.0f, 0.0f, 1.0f);


for (int i = 0; i < pointLightCount; i++)
{
color += CalculateLighting( pointLights[pointLightIndex[i]], wp, normalColor.xyz, specularColor, glowColor);
}

barrier();

if (gl_LocalInvocationID.x == 0 || gl_LocalInvocationID.y == 0 || gl_LocalInvocationID.x == 16 || gl_LocalInvocationID.y == 16)
imageStore(outTexture, pixelPos, vec4(.2f, .2f, .2f, 1.0f));
else

{
imageStore(outTexture, pixelPos, color);
//imageStore(outTexture, pixelPos, vec4(maxDepthZ));
//imageStore(outTexture, pixelPos, vec4(pointLightCount / 128.0f));
//imageStore(outTexture, pixelPos, vec4(vec2(tilePos.xy), 0.0f, 1.0f));
}
}

This is the part I think is the problem, the culling part:


        barrier();


float minDepthZ = float(minDepth / float(0xFFFFFFFF));
float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

vec4 frustumEqn[4];
uint pxm = WORK_GROUP_SIZE * gl_WorkGroupID.x;
uint pym = WORK_GROUP_SIZE * gl_WorkGroupID.y;
uint pxp = WORK_GROUP_SIZE * (gl_WorkGroupID.x + 1);
uint pyp = WORK_GROUP_SIZE * (gl_WorkGroupID.y + 1);


uint uWindowWidthEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesX();
uint uWindowHeightEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesY();

vec4 frustum[4];
frustum[0] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
frustum[1] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
frustum[2] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f ,1.0f) );
frustum[3] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );

for (int i = 0; i < 4; i++)

frustumEqn[i] = CreatePlaneEquation(frustum[i], frustum[(i+1) & 3]);

barrier();

int threadsPerTile = WORK_GROUP_SIZE * WORK_GROUP_SIZE;

for (uint i = 0; i < MAX_LIGHTS; i+= threadsPerTile)
{
uint il = gl_LocalInvocationIndex + i;


if (il < MAX_LIGHTS)
{
PointLight p = pointLights[il];

vec4 viewPos = view * vec4(p.position, 1.0f);
float r = p.radius;

if (viewPos.z + minDepthZ < r && viewPos.z - maxDepthZ < r)
{


if( ( GetSignedDistanceFromPlane( viewPos, frustumEqn[0] ) < r ) &&
( GetSignedDistanceFromPlane( viewPos, frustumEqn[1] ) < r ) &&
( GetSignedDistanceFromPlane( viewPos, frustumEqn[2] ) < r ) &&
( GetSignedDistanceFromPlane( viewPos, frustumEqn[3] ) < r) )

{
uint id = atomicAdd(pointLightCount, 1);
pointLightIndex[id] = il;
}
}


}
}

barrier();

The weird thing is that when I visualize the light count per tile, it shows all tiles having some manner of lights (first image).


Second image shows the final output, a thin line of lights down the middle of the screen and nothing above or below. Removing the culling (GetSignedDistanceFromPlane()) gives the desired result, albeit with my framerate dropping like a rock.


enter image description here


enter image description here



My guess would be that the frustum is constructed wrong but I'm unsure of the math behind it and could use some help right now.


Edit: Added another image which shows the expected output.


enter image description here



We've changed how the culling is done, the code now looks like this:


barrier();

float minDepthZ = float(minDepth / float(0xFFFFFFFF));
float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));


//total tiles = tileScale * 2
vec2 tileScale = vec2(1280, 720) * (1.0f / float(2*WORK_GROUP_SIZE));
vec2 tileBias = tileScale - vec2(gl_WorkGroupID.xy);

vec4 c1 = vec4(-proj[0][0] * tileScale.x, 0.0f, tileBias.x, 0.0f);
vec4 c2 = vec4(0.0f, -proj[1][1] * tileScale.y, tileBias.y, 0.0f);
vec4 c4 = vec4(0.0f, 0.0f, 1.0f, 0.0f);

// Derive frustum planes
vec4 frustumPlanes[6];

// Sides
//right
frustumPlanes[0] = c4 - c1;
//left
frustumPlanes[1] = c4 + c1;
//bottom
frustumPlanes[2] = c4 - c2;
//top
frustumPlanes[3] = c4 + c2;
// Near/far

frustumPlanes[4] = vec4(0.0f, 0.0f, 1.0f, -minDepthZ);
frustumPlanes[5] = vec4(0.0f, 0.0f, -1.0f, maxDepthZ);

for(int i = 0; i < 4; i++)
{
frustumPlanes[i] *= 1.0f / length(frustumPlanes[i].xyz);
}

//DO CULLING HERE
for (uint lightIndex = gl_LocalInvocationIndex; lightIndex < numActiveLights; lightIndex += WORK_GROUP_SIZE)

{
PointLight p = pointLights[lightIndex];

if (lightIndex < numActiveLights)
{
bool inFrustum = true;
for (uint i = 0; i < 4; i++)
{
float dd = dot(frustumPlanes[i], view * vec4(p.position, 1.0f));
inFrustum = inFrustum && (dd >= -p.radius_length);

}

if (inFrustum)
{
uint id = atomicAdd(pointLightCount, 1);
pointLightIndex[id] = lightIndex;
}
}
}


barrier();

This works better, our lights are now culled properly (except min/max depth because it's not implemented properly yet) against our tiles. So far, so good, BUT! We have a problem with edges of lights, the tiles do not cover the entire light radius and performance is godawful. 1024 lights gives at best 40fps with tons of stutter.


This video shows what happens at edges, the grey tiles are what tiles are affected by the light (single pointlight) and the red parts are shaded geometry.


http://www.youtube.com/watch?v=PiwGcFb9rWk&feature=youtu.be


Scaling the radius so that it is larger when culling "works" but makes performance drop even harder.




No comments:

Post a Comment

Simple past, Present perfect Past perfect

Can you tell me which form of the following sentences is the correct one please? Imagine two friends discussing the gym... I was in a good s...