// This means in a separable convolution, for each pass:
//  - if point  filtering: 12+1+12 texture fetches are needed
//  - if linear filtering: 6+1+6   texture fetches are needed
#define NUM_WEIGHTS (1+12)

// Builds one half of a symmetric gaussian kernel (center tap + taps to one
// side) for a separable blur pass.
//
//   imageWidth  - width of the image being filtered, in pixels.
//   sigma       - blur amount; <= 0 means "no blur" (single center tap).
//   weights     - out: NUM_WEIGHTS tap weights, zero-padded, normalized so
//                 that center + 2*(others) sums to 1.0.
//   offsets     - out: tap offsets in pixels, parallel to `weights`.
//   decimations - out: how many times the image should be halved before
//                 filtering so the kernel fits in NUM_WEIGHTS taps.
//   interlaced  - if true, taps are spaced two pixels apart (field data).
//
// If the kernel would require decimating all the way down to a 1-pixel-wide
// image, `weights`/`offsets` are returned EMPTY and no filtering is needed.
void GenGaussian( int imageWidth, float sigma, std::vector<float>& weights, std::vector<float>& offsets, int& decimations, bool interlaced )
{
	// Interlaced sources hold every other line, so taps step two texels.
	float stepx = interlaced ? 2.0f : 1.0f;

	// Generate gaussian curve weights, and at the same time decide whether 
	// to scale the image down before filtering, and how much (decimations).
	for( decimations = 0; ; ++decimations, imageWidth /= 2 )
	{
		weights.clear();
		offsets.clear();

		// No blur requested: emit a single unit-weight center tap.
		// (Was `sigma == 0`; a negative sigma made exp() below grow without
		// bound so the do-while never terminated — treat <= 0 as "no blur".)
		if( sigma <= 0.0f )
		{
			weights.push_back(1.0f);
			offsets.push_back(0.0f);
			break;
		}

		// Decimated down to (or given) a degenerate width: caller skips the
		// filter entirely.  (Was `== 1`; `<= 1` also covers width 0.)
		if( imageWidth <= 1 )
			break;

		float x = 0.0f;
		float g;
		do
		{
			// NOTE(review): divisor is 2*sigma, not 2*sigma^2 — sigma is
			// effectively used as the variance, not the std deviation.
			// Presumably intentional tuning; confirm before "fixing".
			g = exp(-x*x/(2.0f*sigma));
			weights.push_back( g );
			offsets.push_back( x );
			x += stepx * (1 << decimations);

		} while( g >= 1.0f/255.0f );	// stop once a tap contributes < 1 LSB of 8-bit color

		// Kernel fits the shader's fixed tap budget at this decimation level.
		if( weights.size() <= NUM_WEIGHTS )
			break;
	}

	// If we decimated all the way down to imageWidth 1,
	// no need to filter at all.
	if( weights.empty() )
		return;

	// Zero-pad out to the fixed tap count the shader expects.
	for( size_t i = weights.size(); i < NUM_WEIGHTS; ++i )
	{
		weights.push_back(0.0f);
		offsets.push_back(0.0f);
	}

 	// Normalize the weight table so we net 1.0 when filtering.
 	// Note that entries 1..n are scaled by two, because the 
 	// pixel shader will use them for the samples both to the
 	// left and right of the center sample.
	// |...|3|2|1|0|1|2|3|...|

	float sum = 0.0f;
	for( size_t i = 0; i < weights.size(); ++i )
		sum += weights[i] * (i > 0 ? 2.0f : 1.0f);

	for( size_t i = 0; i < weights.size(); ++i )
		weights[i] /= sum;
}
 
// Collapses a point-sampled half-kernel (from GenGaussian) into bilinear
// samples: each pair of adjacent taps (1,2), (3,4), ... becomes one fetch
// placed between the two texels so the hardware's bilinear filter applies
// both weights in a single texture read.
//
//   weights/offsets   - in:  parallel tap tables; entry 0 is the center tap.
//   bweights/boffsets - out: appended-to (assumed empty on entry); entry 0
//                       is the center tap, then one entry per input pair.
//
// Fixes vs. the original:
//  - vectors taken by const reference (no per-call copy of the tables);
//  - empty input is a no-op instead of reading weights[0] out of bounds;
//  - an even-sized table no longer reads one element past the end: a
//    trailing unpaired tap is emitted as-is (partner weight 0).
void OptimizeBilinear(  const std::vector<float>& weights, 
						const std::vector<float>& offsets,
						std::vector<float>& bweights, 
						std::vector<float>& boffsets )
{
	if( weights.empty() )
		return;

	// Center tap passes through untouched.
	bweights.push_back(weights[0]);
	boffsets.push_back(0.0f);
	for( size_t i = 1; i < weights.size(); i += 2 )
	{
		float w1 = weights[i];
		float o0 = offsets[i];
		// Trailing unpaired tap: treat the missing partner as weight 0
		// at the same offset, so the sample degenerates to a point fetch.
		bool hasPair = (i + 1) < weights.size();
		float w2 = hasPair ? weights[i + 1] : 0.0f;
		float o1 = hasPair ? offsets[i + 1] : o0;
		float w12 = w1 + w2;						// overall weight for bilinear sample
		float k = w12 > 0.0001f ? w2/w12 : 0.0f; 	// bilinear 'k'

		bweights.push_back( w12 );
		boffsets.push_back( o0 + k * (o1-o0) );		// lerped sample position
	}
}
 
// Builds the horizontal gaussian blur kernel for this job, folds it into
// bilinear samples, and uploads one (offsetU, 0, weight, 0) vector per
// sample to both VS and PS constant registers starting at register 4.
// `decim` receives the decimation count chosen by GenGaussian.
void CLJob::SetupHBlur( engine::PassConfig& pc, int& decim )
{
	std::vector<float> taps, tapOffsets, blurWeights, blurOffsets;
	GenGaussian( m_Width, m_CLProps.m_HBlur, taps, tapOffsets, decim, false );

	// An empty tap table means the image decimates to nothing — no filter pass.
	if( taps.empty() )
		return;

	OptimizeBilinear( taps, tapOffsets, blurWeights, blurOffsets );

	for( size_t reg = 0; reg < blurWeights.size(); ++reg )
	{
		// Convert the pixel-space offset into normalized texture space.
		float offsetU = blurOffsets[reg] / m_Width;
		pc.SetVsParam( 4 + reg, offsetU, 0.0f, blurWeights[reg], 0.0f );
		pc.SetPsParam( 4 + reg, offsetU, 0.0f, blurWeights[reg], 0.0f );
	}
}