LookingGlass/client/renderers/EGL/shader/ffx_cas.h

1446 lines
59 KiB
C
Raw Normal View History

//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
// [CAS] FIDELITY FX - CONTRAST ADAPTIVE SHARPENING 1.20190610
//
//==============================================================================================================================
// LICENSE
// =======
// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
// -------
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
// -------
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
// Software.
// -------
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//------------------------------------------------------------------------------------------------------------------------------
// ABOUT
// =====
// CAS is a spatial only filter.
// CAS takes RGB color input.
// CAS enhances sharpness and local high-frequency contrast, with or without added upsampling.
// CAS outputs RGB color.
//------------------------------------------------------------------------------------------------------------------------------
// SUGGESTIONS FOR INTEGRATION
// ===========================
// For best performance, run CAS in sharpen-only mode and choose a video mode that lets scan-out or the display do the scaling.
// - Sharpen-only mode is faster, and provides a better quality sharpening.
// The scaling support in CAS was designed for when the application wants to do Dynamic Resolution Scaling (DRS).
// - With DRS, the render resolution can change per frame.
// - Use CAS to sharpen and upsample to the fixed output resolution, then composite the full resolution UI over CAS output.
// - This can all happen in one compute dispatch.
// It is likely better to reduce the amount of film grain which happens before CAS (as CAS will amplify grain).
// - An alternative would be to add grain after CAS.
// It is best to run CAS after tonemapping.
// - CAS needs to have input value 1.0 at the peak of the display output.
// It is ok to run CAS after compositing UI (it won't harm the UI).
//------------------------------------------------------------------------------------------------------------------------------
// EXECUTION
// =========
// CAS runs as a compute shader.
// CAS is designed to be run either in a 32-bit, CasFilter(), or packed 16-bit, CasFilterH(), form.
// The 32-bit form works on 8x8 tiles via one {64,1,1} workgroup.
// The 16-bit form works on a pair of 8x8 tiles in a 16x8 configuration via one {64,1,1} workgroup.
// CAS is designed to work best in semi-persistent form when not running async with graphics.
// For 32-bit this means looping across a collection of 4 8x8 tiles in a 2x2 tile foot-print.
// For 16-bit this means looping 2 times, once for the top 16x8 region and once for the bottom 16x8 region.
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR CPU
// ===========================
// // Make sure <stdint.h> has already been included.
// // Setup pre-portability-header defines.
// #define A_CPU 1
// // Include the portability header (requires version 1.20190530 or later which is backwards compatible).
// #include "ffx_a.h"
// // Include the CAS header.
// #include "ffx_cas.h"
// ...
// // Call the setup function to build out the constants for the shader, pass these to the shader.
// // The 'varAU4(const0);' expands into 'uint32_t const0[4];' on the CPU.
// varAU4(const0);
// varAU4(const1);
// CasSetup(const0,const1,
// 0.0f, // Sharpness tuning knob (0.0 to 1.0).
// 1920.0f,1080.0f, // Example input size.
// 2560.0f,1440.0f); // Example output size.
// ...
// // Later dispatch the shader based on the amount of semi-persistent loop unrolling.
// // Here is an example for running with the 16x16 (4-way unroll for 32-bit or 2-way unroll for 16-bit)
// vkCmdDispatch(cmdBuf,(widthInPixels+15)>>4,(heightInPixels+15)>>4,1);
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR GPU
// ===========================
// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT.
// layout(set=0,binding=0,rgba16f)uniform image2D imgSrc;
// layout(set=0,binding=1,rgba16f)uniform image2D imgDst;
// ...
// // Setup pre-portability-header defines (sets up GLSL/HLSL path, packed math support, etc)
// #define A_GPU 1
// #define A_GLSL 1
// #define A_HALF 1
// ...
// // Include the portability header (or copy it in without an include).
// #include "ffx_a.h"
// ...
// // Define the fetch function(s).
// // CasLoad() takes a 32-bit unsigned integer 2D coordinate and loads color.
// AF3 CasLoad(ASU2 p){return imageLoad(imgSrc,p).rgb;}
// // CasLoadH() is the 16-bit version taking 16-bit unsigned integer 2D coordinate and loading 16-bit float color.
// // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead.
// // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead.
// AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);}
// ...
// // Define the input modifiers as nop's initially.
// // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions.
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){}
// ...
// // Include this CAS header file (or copy it in without an include).
// #include "ffx_cas.h"
// ...
// // Example in shader integration for loop-unrolled 16x16 case for 32-bit.
// layout(local_size_x=64)in;
// void main(){
// // Fetch constants from CasSetup().
// AU4 const0=cb.const0;
// AU4 const1=cb.const1;
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
// // Filter.
// AF4 c;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.x+=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.y+=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.x-=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);}
// ...
// // Example for semi-persistent 16x16 but this time for packed math.
// // Use this before including 'ffx_cas.h' if not using the non-packed filter function.
// #define CAS_PACKED_ONLY 1
// ...
// layout(local_size_x=64)in;
// void main(){
// // Fetch constants from CasSetup().
// AU4 const0=cb.const0;
// AU4 const1=cb.const1;
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
// // Filter.
// AH4 c0,c1;AH2 cR,cG,cB;
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
// // Extra work integrated after CAS would go here.
// ...
// // Suggest only running CasDepack() right before stores, to maintain packed math for any work after CasFilterH().
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(imgDst,ASU2(gxy),AF4(c0));
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));
// gxy.y+=8u;
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
// ...
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(imgDst,ASU2(gxy),AF4(c0));
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));}
//------------------------------------------------------------------------------------------------------------------------------
// CAS FILTERING LOGIC
// ===================
// CAS uses the minimal nearest 3x3 source texel window for filtering.
// The filter coefficients are radially symmetric (phase adaptive, computed per pixel based on output pixel center).
// The filter kernel adapts to local contrast (adjusting the negative lobe strength of the filter kernel).
//------------------------------------------------------------------------------------------------------------------------------
// CAS INPUT REQUIREMENTS
// ======================
// This is designed to be a linear filter.
// Running CAS on perceptual inputs will yield over-sharpening.
// Input must range between {0 to 1} for each color channel.
// CAS output will be {0 to 1} ranged as well.
// CAS does 5 loads, so any conversion applied during CasLoad() or CasInput() has a 5 load * 3 channel = 15x cost amplifier.
// - So input conversions need to be factored into the prior pass's output.
// - But if necessary use CasInput() instead of CasLoad(), as CasInput() works with packed color.
// - For CAS with scaling the amplifier is 12 load * 3 channel = 36x cost amplifier.
// Any conversion applied to output has a 3x cost amplifier (3 color channels).
// - Output conversions are substantially less expensive.
// Added VALU ops due to conversions will have visible cost as this shader is already quite VALU heavy.
// This filter does not function well on sRGB or gamma 2.2 non-linear data.
// This filter does not function on PQ non-linear data.
// - Due to the shape of PQ, the positive side of the ring created by the negative lobe tends to become over-bright.
//------------------------------------------------------------------------------------------------------------------------------
// INPUT FORMAT SPECIFIC CASES
// ===========================
// - FP16 with all non-negative values ranging {0 to 1}.
// - Use as is, filter is designed for linear input and output ranging {0 to 1}.
// ---------------------------
// - UNORM with linear conversion approximation.
// - This could be used for both sRGB or FreeSync2 native (gamma 2.2) cases.
// - Load/store with either 10:10:10:2 UNORM or 8:8:8:8 UNORM (aka VK_FORMAT_R8G8B8A8_UNORM).
// - Use gamma 2.0 conversion in CasInput(), as an approximation.
// - Modifications:
// // Change the CasInput*() function to square the inputs.
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){r*=r;g*=g;b*=b;}
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){r*=r;g*=g;b*=b;}
// ...
// // Do linear to gamma 2.0 before store.
// // Since it will be common to do processing after CAS, the filter function returns linear.
// c.r=sqrt(c.r);c.g=sqrt(c.g);c.b=sqrt(c.b);
// imageStore(imgDst,ASU2(gxy),c);
// ...
// // And for packed.
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
// cR=sqrt(cR);cG=sqrt(cG);cB=sqrt(cB);
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(img[0],ASU2(gxy),AF4(c0));
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
// ---------------------------
// - sRGB with slightly better quality and higher cost.
// - Use texelFetch() with sRGB format (VK_FORMAT_R8G8B8A8_SRGB) for loads (gets linear into shader).
// - Store to destination using UNORM (not sRGB) stores and do the linear to sRGB conversion in the shader.
// - Modifications:
// // Use texel fetch instead of image load (on GCN this will translate into an image load in the driver).
// // Hardware has sRGB to linear on loads (but in API only for read-only, aka texture instead of UAV/image).
// AF3 CasLoad(ASU2 p){return texelFetch(texSrc,p,0).rgb;}
// ...
// // Do linear to sRGB before store (GPU lacking hardware conversion support for linear to sRGB on store).
// c.r=AToSrgbF1(c.r);c.g=AToSrgbF1(c.g);c.b=AToSrgbF1(c.b);
// imageStore(imgDst,ASU2(gxy),c);
// ...
// // And for packed.
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
// cR=AToSrgbH2(cR);cG=AToSrgbH2(cG);cB=AToSrgbH2(cB);
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(img[0],ASU2(gxy),AF4(c0));
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
// ---------------------------
// - HDR10 output via scRGB.
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
// - Where 'maxNits' is typically not 10000.
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
// - This can be even as low as 1000 nits on some HDR TVs.
// - After CAS do matrix multiply to take Rec.2020 back to sRGB and multiply by 'maxNits/80.0'.
// - Showing GPU code below to generate constants, likely most need to use CPU code instead.
// - Keeping the GPU code here because it is easier to read in these docs.
// - Can use 'lpm.h' source to generate the conversion matrix for Rec.2020 to sRGB:
// // Output conversion matrix from Rec.2020 to sRGB.
// AF3 conR,conG,conB;
// // Working space temporaries (Rec.2020).
// AF3 rgbToXyzXW;AF3 rgbToXyzYW;AF3 rgbToXyzZW;
// LpmColRgbToXyz(rgbToXyzXW,rgbToXyzYW,rgbToXyzZW,lpmCol2020R,lpmCol2020G,lpmCol2020B,lpmColD65);
// // Output space temporaries (Rec.709, same as sRGB primaries).
// AF3 rgbToXyzXO;AF3 rgbToXyzYO;AF3 rgbToXyzZO;
// LpmColRgbToXyz(rgbToXyzXO,rgbToXyzYO,rgbToXyzZO,lpmCol709R,lpmCol709G,lpmCol709B,lpmColD65);
// AF3 xyzToRgbRO;AF3 xyzToRgbGO;AF3 xyzToRgbBO;
// LpmMatInv3x3(xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXO,rgbToXyzYO,rgbToXyzZO);
// // Generate the matrix.
// LpmMatMul3x3(conR,conG,conB,xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXW,rgbToXyzYW,rgbToXyzZW);
// - Adjust the conversion matrix for the multiply by 'maxNits/80.0'.
// // After this the constants can be stored into a constant buffer.
// AF1 conScale=maxNits*ARcpF1(80.0);
// conR*=conScale;conG*=conScale;conB*=conScale;
// - After CAS do the matrix multiply (passing the fetched constants into the shader).
// outputR=dot(AF3(colorR,colorG,colorB),conR);
// outputG=dot(AF3(colorR,colorG,colorB),conG);
// outputB=dot(AF3(colorR,colorG,colorB),conB);
// - Hopefully no developer is taking scRGB as input to CAS.
// - If that was the case, the conversion matrix from sRGB to Rec.2020 can be built changing the above code.
// - Swap the 'lpmCol709*' and 'lpmCol2020*' inputs to LpmColRgbToXyz().
// - Then scale by '80.0/maxNits' instead of 'maxNits/80.0'.
// ---------------------------
// - HDR10 output via native 10:10:10:2.
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
// - Where 'maxNits' is typically not 10000.
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
// - This can be even as low as 1000 nits on some HDR TVs.
// - Hopefully no developer needs to take PQ as input here, but if so can use A to convert PQ to linear:
// // Where 'k0' is a constant of 'maxNits/10000.0'.
// colorR=AFromPqF1(colorR*k0);
// colorG=AFromPqF1(colorG*k0);
// colorB=AFromPqF1(colorB*k0);
// - After CAS convert from linear to PQ.
// // Where 'k1' is a constant of '10000.0/maxNits'.
// colorR=AToPqF1(colorR*k1);
// colorG=AToPqF1(colorG*k1);
// colorB=AToPqF1(colorB*k1);
// ---------------------------
// - Example of a bad idea for CAS input design.
// - Have the pass before CAS store out in 10:10:10:2 UNORM with gamma 2.0.
// - Store the output of CAS with sRGB to linear conversion, or with a gamma 2.2 conversion for FreeSync2 native.
// - This will drop precision because the inputs had been quantized to 10-bit,
// and the output is using a different tonal transform,
// so inputs and outputs won't align for similar values.
// - It might be "ok" for 8-bit/channel CAS output, but definitely not a good idea for 10-bit/channel output.
//------------------------------------------------------------------------------------------------------------------------------
// ALGORITHM DESCRIPTION
// =====================
// This describes the algorithm with CAS_BETTER_DIAGONALS defined.
// The default is with CAS_BETTER_DIAGONALS not defined (which is faster).
// Starting with no scaling.
// CAS fetches a 3x3 neighborhood around the pixel 'e',
// a b c
// d(e)f
// g h i
// It then computes a 'soft' minimum and maximum,
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// The minimum and maximums give an idea of local contrast.
// --- 1.0 ^
// | | <-- This minimum distance to the signal limit is divided by MAX to get a base sharpening amount 'A'.
// --- MAX v
// |
// |
// --- MIN ^
// | | <-- The MIN side is more distant in this example so it is not used, but for dark colors it would be used.
// | |
// --- 0.0 v
// The base sharpening amount 'A' from above is shaped with a sqrt().
// This 'A' ranges from 0 := no sharpening, to 1 := full sharpening.
// Then 'A' is scaled by the sharpness knob while being transformed to a negative lobe (values from -1/5 to -1/8 for A=1).
// The final filter kernel looks like this,
// 0 A 0
// A 1 A <-- Center is always 1.0, followed by the negative lobe 'A' in a ring, and windowed into a circle with the 0.0s.
// 0 A 0
// The local neighborhood is then multiplied by the kernel weights, summed and divided by the sum of the kernel weights.
// The high quality path computes filter weights per channel.
// The low quality path uses the green channel's filter weights to compute the 'A' factor for all channels.
// ---------------------
// The scaling path is a little more complex.
// It starts by fetching the 4x4 neighborhood around the pixel centered between centers of pixels {f,g,j,k},
// a b c d
// e(f g)h
// i(j k)l
// m n o p
// The algorithm then computes the no-scaling result for {f,g,j,k}.
// It then interpolates between those no-scaling results.
// The interpolation is adaptive.
// To hide bilinear interpolation and restore diagonals, it weights bilinear weights by 1/(const+contrast).
// Where 'contrast' is the soft 'max-min'.
// This makes edges thin out a little.
// ---------------------
// Without CAS_BETTER_DIAGONALS defined, the algorithm is a little faster.
// Instead of using the 3x3 "box" with the 5-tap "circle" this uses just the "circle".
// Drops to 5 texture fetches for no-scaling.
// Drops to 12 texture fetches for scaling.
// Drops a bunch of math.
//------------------------------------------------------------------------------------------------------------------------------
// IDEAS FOR FUTURE
// ================
// - Avoid V_CVT's by using denormals.
// - Manually pack FP16 literals.
//------------------------------------------------------------------------------------------------------------------------------
// CHANGE LOG
// ==========
// 20190610 - Misc documentation cleanup.
// 20190609 - Removed lowQuality bool, improved scaling logic.
// 20190530 - Unified CPU/GPU setup code, using new ffx_a.h, faster, define CAS_BETTER_DIAGONALS to get older slower one.
// 20190529 - Missing a good way to re-interpret packed in HLSL, so disabling approximation optimizations for now.
// 20190528 - Fixed so GPU CasSetup() generates half data all the time.
// 20190527 - Implemented approximations for rcp() and sqrt().
// 20190524 - New algorithm, adjustable sharpness, scaling to 4x area. Fixed checker debug for no-scaling only.
// 20190521 - Updated file naming.
// 20190516 - Updated docs, fixed workaround, fixed no-scaling quality issue, removed gamma2 and generalized as CasInput*().
// 20190510 - Made the dispatch example safely round up for images that are not a multiple of 16x16.
// 20190507 - Fixed typo bug in CAS_DEBUG_CHECKER, fixed sign typo in the docs.
// 20190503 - Setup temporary workaround for compiler bug.
// 20190502 - Added argument for 'gamma2' path so input transform in that case runs packed.
// 20190426 - Improved documentation on format specific cases, etc.
// 20190425 - Updated/corrected documentation.
// 20190405 - Added CAS_PACKED_ONLY, misc bug fixes.
// 20190404 - Updated for the new a.h header.
//==============================================================================================================================
// Upper bound on the supported upscale, expressed as (output area)/(input area).
// This is the practical limit for the algorithm's scaling ability (quality is limited by 3x3 taps). Example resolutions,
// 1280x720 -> 1080p = 2.25x area
// 1536x864 -> 1080p = 1.56x area
// 1792x1008 -> 1440p = 2.04x area
// 1920x1080 -> 1440p = 1.78x area
// 1920x1080 -> 4K = 4.0x area
// 2048x1152 -> 1440p = 1.56x area
// 2560x1440 -> 4K = 2.25x area
// 3072x1728 -> 4K = 1.56x area
// CasSupportScaling() compares the requested area ratio against this value (inclusive).
#define CAS_AREA_LIMIT 4.0
//------------------------------------------------------------------------------------------------------------------------------
// Pass in output and input resolution in pixels.
// This returns true if CAS supports scaling in the given configuration.
AP1 CasSupportScaling(AF1 outX,AF1 outY,AF1 inX,AF1 inY){return ((outX*outY)*ARcpF1(inX*inY))<=CAS_AREA_LIMIT;}
//==============================================================================================================================
// Fills in the two constant vectors that the CAS filter functions take as input (works on CPU or GPU).
//  const0[0..1] : input/output size ratio for X and Y, stored through AU1_AF1().
//  const0[2..3] : 0.5*ratio-0.5 for X and Y, stored through AU1_AF1().
//  const1[0]    : sharpening term derived from 'sharpness', stored through AU1_AF1().
//  const1[1]    : the same term paired with 0.0 and stored through AU1_AH2_AF2().
//  const1[2]    : 8.0*inputSizeInPixelsX/outputSizeInPixelsX, stored through AU1_AF1().
//  const1[3]    : always zero.
A_STATIC void CasSetup(
outAU4 const0,
outAU4 const1,
AF1 sharpness, // 0 := default (lower ringing), 1 := maximum (highest ringing)
AF1 inputSizeInPixelsX,
AF1 inputSizeInPixelsY,
AF1 outputSizeInPixelsX,
AF1 outputSizeInPixelsY){
 // Reciprocal of the output size, shared by every scaling term below.
 AF1 rcpOutX=ARcpF1(outputSizeInPixelsX);
 AF1 rcpOutY=ARcpF1(outputSizeInPixelsY);
 // Sharpness knob is saturated, used to blend between 8.0 and 5.0, then turned into a negated reciprocal.
 AF1 lobe=-ARcpF1(ALerpF1(8.0,5.0,ASatF1(sharpness)));
 varAF2(lobePair)=initAF2(lobe,0.0);
 // Scaling terms (multiplication order kept as-is so results match bit for bit).
 const0[0]=AU1_AF1(inputSizeInPixelsX*rcpOutX);
 const0[1]=AU1_AF1(inputSizeInPixelsY*rcpOutY);
 const0[2]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsX*rcpOutX-AF1_(0.5));
 const0[3]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsY*rcpOutY-AF1_(0.5));
 // Sharpness value, in 32-bit and packed form, followed by the 8x horizontal scale term.
 const1[0]=AU1_AF1(lobe);
 const1[1]=AU1_AH2_AF2(lobePair);
 const1[2]=AU1_AF1(AF1_(8.0)*inputSizeInPixelsX*rcpOutX);
 const1[3]=0u;}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// NON-PACKED VERSION
//==============================================================================================================================
#ifdef A_GPU
#ifdef CAS_PACKED_ONLY
// Avoid compiler error.
// CAS_PACKED_ONLY is meant for applications that use only the packed filter function. CasFilter() below still
// calls CasLoad() and CasInput(), so placeholder definitions are provided here: a load that returns black and
// an input transform that does nothing.
AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);}
void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
#endif
//------------------------------------------------------------------------------------------------------------------------------
void CasFilter(
out AF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy.
out AF1 pixG,
out AF1 pixB,
AU2 ip, // Integer pixel position in output.
AU4 const0, // Constants generated by CasSetup().
AU4 const1,
AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
//------------------------------------------------------------------------------------------------------------------------------
// Debug a checker pattern of on/off tiles for visual inspection.
#ifdef CAS_DEBUG_CHECKER
if((((ip.x^ip.y)>>8u)&1u)==0u){AF3 pix0=CasLoad(ASU2(ip));
pixR=pix0.r;pixG=pix0.g;pixB=pix0.b;CasInput(pixR,pixG,pixB);return;}
#endif
//------------------------------------------------------------------------------------------------------------------------------
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
if(noScaling){
// a b c
// d e f
// g h i
ASU2 sp=ASU2(ip);
AF3 a=CasLoad(sp+ASU2(-1,-1));
AF3 b=CasLoad(sp+ASU2( 0,-1));
AF3 c=CasLoad(sp+ASU2( 1,-1));
AF3 d=CasLoad(sp+ASU2(-1, 0));
AF3 e=CasLoad(sp);
AF3 f=CasLoad(sp+ASU2( 1, 0));
AF3 g=CasLoad(sp+ASU2(-1, 1));
AF3 h=CasLoad(sp+ASU2( 0, 1));
AF3 i=CasLoad(sp+ASU2( 1, 1));
// Run optional input transform.
CasInput(a.r,a.g,a.b);
CasInput(b.r,b.g,b.b);
CasInput(c.r,c.g,c.b);
CasInput(d.r,d.g,d.b);
CasInput(e.r,e.g,e.b);
CasInput(f.r,f.g,f.b);
CasInput(g.r,g.g,g.b);
CasInput(h.r,h.g,h.b);
CasInput(i.r,i.g,i.b);
// Soft min and max.
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// These are 2.0x bigger (factored out the extra multiply).
AF1 mnR=AMin3F1(AMin3F1(d.r,e.r,f.r),b.r,h.r);
AF1 mnG=AMin3F1(AMin3F1(d.g,e.g,f.g),b.g,h.g);
AF1 mnB=AMin3F1(AMin3F1(d.b,e.b,f.b),b.b,h.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnR2=AMin3F1(AMin3F1(mnR,a.r,c.r),g.r,i.r);
AF1 mnG2=AMin3F1(AMin3F1(mnG,a.g,c.g),g.g,i.g);
AF1 mnB2=AMin3F1(AMin3F1(mnB,a.b,c.b),g.b,i.b);
mnR=mnR+mnR2;
mnG=mnG+mnG2;
mnB=mnB+mnB2;
#endif
AF1 mxR=AMax3F1(AMax3F1(d.r,e.r,f.r),b.r,h.r);
AF1 mxG=AMax3F1(AMax3F1(d.g,e.g,f.g),b.g,h.g);
AF1 mxB=AMax3F1(AMax3F1(d.b,e.b,f.b),b.b,h.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxR2=AMax3F1(AMax3F1(mxR,a.r,c.r),g.r,i.r);
AF1 mxG2=AMax3F1(AMax3F1(mxG,a.g,c.g),g.g,i.g);
AF1 mxB2=AMax3F1(AMax3F1(mxB,a.b,c.b),g.b,i.b);
mxR=mxR+mxR2;
mxG=mxG+mxG2;
mxB=mxB+mxB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
#ifdef CAS_GO_SLOWER
AF1 rcpMR=ARcpF1(mxR);
AF1 rcpMG=ARcpF1(mxG);
AF1 rcpMB=ARcpF1(mxB);
#else
AF1 rcpMR=APrxLoRcpF1(mxR);
AF1 rcpMG=APrxLoRcpF1(mxG);
AF1 rcpMB=APrxLoRcpF1(mxB);
#endif
#ifdef CAS_BETTER_DIAGONALS
AF1 ampR=ASatF1(min(mnR,AF1_(2.0)-mxR)*rcpMR);
AF1 ampG=ASatF1(min(mnG,AF1_(2.0)-mxG)*rcpMG);
AF1 ampB=ASatF1(min(mnB,AF1_(2.0)-mxB)*rcpMB);
#else
AF1 ampR=ASatF1(min(mnR,AF1_(1.0)-mxR)*rcpMR);
AF1 ampG=ASatF1(min(mnG,AF1_(1.0)-mxG)*rcpMG);
AF1 ampB=ASatF1(min(mnB,AF1_(1.0)-mxB)*rcpMB);
#endif
// Shaping amount of sharpening.
#ifdef CAS_GO_SLOWER
ampR=sqrt(ampR);
ampG=sqrt(ampG);
ampB=sqrt(ampB);
#else
ampR=APrxLoSqrtF1(ampR);
ampG=APrxLoSqrtF1(ampG);
ampB=APrxLoSqrtF1(ampB);
#endif
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
AF1 peak=AF1_AU1(const1.x);
AF1 wR=ampR*peak;
AF1 wG=ampG*peak;
AF1 wB=ampB*peak;
// Filter.
#ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead.
#ifdef CAS_GO_SLOWER
AF1 rcpWeight=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG);
#else
AF1 rcpWeight=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG);
#endif
pixR=ASatF1((b.r*wG+d.r*wG+f.r*wG+h.r*wG+e.r)*rcpWeight);
pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeight);
pixB=ASatF1((b.b*wG+d.b*wG+f.b*wG+h.b*wG+e.b)*rcpWeight);
#else
#ifdef CAS_GO_SLOWER
AF1 rcpWeightR=ARcpF1(AF1_(1.0)+AF1_(4.0)*wR);
AF1 rcpWeightG=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG);
AF1 rcpWeightB=ARcpF1(AF1_(1.0)+AF1_(4.0)*wB);
#else
AF1 rcpWeightR=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wR);
AF1 rcpWeightG=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG);
AF1 rcpWeightB=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wB);
#endif
pixR=ASatF1((b.r*wR+d.r*wR+f.r*wR+h.r*wR+e.r)*rcpWeightR);
pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeightG);
pixB=ASatF1((b.b*wB+d.b*wB+f.b*wB+h.b*wB+e.b)*rcpWeightB);
#endif
return;}
//------------------------------------------------------------------------------------------------------------------------------
// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
// a b c d
// e f g h
// i j k l
// m n o p
// Working these 4 results.
// +-----+-----+
// | | |
// | f..|..g |
// | . | . |
// +-----+-----+
// | . | . |
// | j..|..k |
// | | |
// +-----+-----+
AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw);
AF2 fp=floor(pp);
pp-=fp;
ASU2 sp=ASU2(fp);
AF3 a=CasLoad(sp+ASU2(-1,-1));
AF3 b=CasLoad(sp+ASU2( 0,-1));
AF3 e=CasLoad(sp+ASU2(-1, 0));
AF3 f=CasLoad(sp);
AF3 c=CasLoad(sp+ASU2( 1,-1));
AF3 d=CasLoad(sp+ASU2( 2,-1));
AF3 g=CasLoad(sp+ASU2( 1, 0));
AF3 h=CasLoad(sp+ASU2( 2, 0));
AF3 i=CasLoad(sp+ASU2(-1, 1));
AF3 j=CasLoad(sp+ASU2( 0, 1));
AF3 m=CasLoad(sp+ASU2(-1, 2));
AF3 n=CasLoad(sp+ASU2( 0, 2));
AF3 k=CasLoad(sp+ASU2( 1, 1));
AF3 l=CasLoad(sp+ASU2( 2, 1));
AF3 o=CasLoad(sp+ASU2( 1, 2));
AF3 p=CasLoad(sp+ASU2( 2, 2));
// Run optional input transform.
CasInput(a.r,a.g,a.b);
CasInput(b.r,b.g,b.b);
CasInput(c.r,c.g,c.b);
CasInput(d.r,d.g,d.b);
CasInput(e.r,e.g,e.b);
CasInput(f.r,f.g,f.b);
CasInput(g.r,g.g,g.b);
CasInput(h.r,h.g,h.b);
CasInput(i.r,i.g,i.b);
CasInput(j.r,j.g,j.b);
CasInput(k.r,k.g,k.b);
CasInput(l.r,l.g,l.b);
CasInput(m.r,m.g,m.b);
CasInput(n.r,n.g,n.b);
CasInput(o.r,o.g,o.b);
CasInput(p.r,p.g,p.b);
// Soft min and max.
// These are 2.0x bigger (factored out the extra multiply).
// a b c b
// e f g * 0.5 + e f g * 0.5 [F]
// i j k j
AF1 mnfR=AMin3F1(AMin3F1(b.r,e.r,f.r),g.r,j.r);
AF1 mnfG=AMin3F1(AMin3F1(b.g,e.g,f.g),g.g,j.g);
AF1 mnfB=AMin3F1(AMin3F1(b.b,e.b,f.b),g.b,j.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnfR2=AMin3F1(AMin3F1(mnfR,a.r,c.r),i.r,k.r);
AF1 mnfG2=AMin3F1(AMin3F1(mnfG,a.g,c.g),i.g,k.g);
AF1 mnfB2=AMin3F1(AMin3F1(mnfB,a.b,c.b),i.b,k.b);
mnfR=mnfR+mnfR2;
mnfG=mnfG+mnfG2;
mnfB=mnfB+mnfB2;
#endif
AF1 mxfR=AMax3F1(AMax3F1(b.r,e.r,f.r),g.r,j.r);
AF1 mxfG=AMax3F1(AMax3F1(b.g,e.g,f.g),g.g,j.g);
AF1 mxfB=AMax3F1(AMax3F1(b.b,e.b,f.b),g.b,j.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxfR2=AMax3F1(AMax3F1(mxfR,a.r,c.r),i.r,k.r);
AF1 mxfG2=AMax3F1(AMax3F1(mxfG,a.g,c.g),i.g,k.g);
AF1 mxfB2=AMax3F1(AMax3F1(mxfB,a.b,c.b),i.b,k.b);
mxfR=mxfR+mxfR2;
mxfG=mxfG+mxfG2;
mxfB=mxfB+mxfB2;
#endif
// b c d c
// f g h * 0.5 + f g h * 0.5 [G]
// j k l k
AF1 mngR=AMin3F1(AMin3F1(c.r,f.r,g.r),h.r,k.r);
AF1 mngG=AMin3F1(AMin3F1(c.g,f.g,g.g),h.g,k.g);
AF1 mngB=AMin3F1(AMin3F1(c.b,f.b,g.b),h.b,k.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mngR2=AMin3F1(AMin3F1(mngR,b.r,d.r),j.r,l.r);
AF1 mngG2=AMin3F1(AMin3F1(mngG,b.g,d.g),j.g,l.g);
AF1 mngB2=AMin3F1(AMin3F1(mngB,b.b,d.b),j.b,l.b);
mngR=mngR+mngR2;
mngG=mngG+mngG2;
mngB=mngB+mngB2;
#endif
AF1 mxgR=AMax3F1(AMax3F1(c.r,f.r,g.r),h.r,k.r);
AF1 mxgG=AMax3F1(AMax3F1(c.g,f.g,g.g),h.g,k.g);
AF1 mxgB=AMax3F1(AMax3F1(c.b,f.b,g.b),h.b,k.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxgR2=AMax3F1(AMax3F1(mxgR,b.r,d.r),j.r,l.r);
AF1 mxgG2=AMax3F1(AMax3F1(mxgG,b.g,d.g),j.g,l.g);
AF1 mxgB2=AMax3F1(AMax3F1(mxgB,b.b,d.b),j.b,l.b);
mxgR=mxgR+mxgR2;
mxgG=mxgG+mxgG2;
mxgB=mxgB+mxgB2;
#endif
// e f g f
// i j k * 0.5 + i j k * 0.5 [J]
// m n o n
AF1 mnjR=AMin3F1(AMin3F1(f.r,i.r,j.r),k.r,n.r);
AF1 mnjG=AMin3F1(AMin3F1(f.g,i.g,j.g),k.g,n.g);
AF1 mnjB=AMin3F1(AMin3F1(f.b,i.b,j.b),k.b,n.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnjR2=AMin3F1(AMin3F1(mnjR,e.r,g.r),m.r,o.r);
AF1 mnjG2=AMin3F1(AMin3F1(mnjG,e.g,g.g),m.g,o.g);
AF1 mnjB2=AMin3F1(AMin3F1(mnjB,e.b,g.b),m.b,o.b);
mnjR=mnjR+mnjR2;
mnjG=mnjG+mnjG2;
mnjB=mnjB+mnjB2;
#endif
AF1 mxjR=AMax3F1(AMax3F1(f.r,i.r,j.r),k.r,n.r);
AF1 mxjG=AMax3F1(AMax3F1(f.g,i.g,j.g),k.g,n.g);
AF1 mxjB=AMax3F1(AMax3F1(f.b,i.b,j.b),k.b,n.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxjR2=AMax3F1(AMax3F1(mxjR,e.r,g.r),m.r,o.r);
AF1 mxjG2=AMax3F1(AMax3F1(mxjG,e.g,g.g),m.g,o.g);
AF1 mxjB2=AMax3F1(AMax3F1(mxjB,e.b,g.b),m.b,o.b);
mxjR=mxjR+mxjR2;
mxjG=mxjG+mxjG2;
mxjB=mxjB+mxjB2;
#endif
// f g h g
// j k l * 0.5 + j k l * 0.5 [K]
// n o p o
AF1 mnkR=AMin3F1(AMin3F1(g.r,j.r,k.r),l.r,o.r);
AF1 mnkG=AMin3F1(AMin3F1(g.g,j.g,k.g),l.g,o.g);
AF1 mnkB=AMin3F1(AMin3F1(g.b,j.b,k.b),l.b,o.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnkR2=AMin3F1(AMin3F1(mnkR,f.r,h.r),n.r,p.r);
AF1 mnkG2=AMin3F1(AMin3F1(mnkG,f.g,h.g),n.g,p.g);
AF1 mnkB2=AMin3F1(AMin3F1(mnkB,f.b,h.b),n.b,p.b);
mnkR=mnkR+mnkR2;
mnkG=mnkG+mnkG2;
mnkB=mnkB+mnkB2;
#endif
AF1 mxkR=AMax3F1(AMax3F1(g.r,j.r,k.r),l.r,o.r);
AF1 mxkG=AMax3F1(AMax3F1(g.g,j.g,k.g),l.g,o.g);
AF1 mxkB=AMax3F1(AMax3F1(g.b,j.b,k.b),l.b,o.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxkR2=AMax3F1(AMax3F1(mxkR,f.r,h.r),n.r,p.r);
AF1 mxkG2=AMax3F1(AMax3F1(mxkG,f.g,h.g),n.g,p.g);
AF1 mxkB2=AMax3F1(AMax3F1(mxkB,f.b,h.b),n.b,p.b);
mxkR=mxkR+mxkR2;
mxkG=mxkG+mxkG2;
mxkB=mxkB+mxkB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
#ifdef CAS_GO_SLOWER
AF1 rcpMfR=ARcpF1(mxfR);
AF1 rcpMfG=ARcpF1(mxfG);
AF1 rcpMfB=ARcpF1(mxfB);
AF1 rcpMgR=ARcpF1(mxgR);
AF1 rcpMgG=ARcpF1(mxgG);
AF1 rcpMgB=ARcpF1(mxgB);
AF1 rcpMjR=ARcpF1(mxjR);
AF1 rcpMjG=ARcpF1(mxjG);
AF1 rcpMjB=ARcpF1(mxjB);
AF1 rcpMkR=ARcpF1(mxkR);
AF1 rcpMkG=ARcpF1(mxkG);
AF1 rcpMkB=ARcpF1(mxkB);
#else
AF1 rcpMfR=APrxLoRcpF1(mxfR);
AF1 rcpMfG=APrxLoRcpF1(mxfG);
AF1 rcpMfB=APrxLoRcpF1(mxfB);
AF1 rcpMgR=APrxLoRcpF1(mxgR);
AF1 rcpMgG=APrxLoRcpF1(mxgG);
AF1 rcpMgB=APrxLoRcpF1(mxgB);
AF1 rcpMjR=APrxLoRcpF1(mxjR);
AF1 rcpMjG=APrxLoRcpF1(mxjG);
AF1 rcpMjB=APrxLoRcpF1(mxjB);
AF1 rcpMkR=APrxLoRcpF1(mxkR);
AF1 rcpMkG=APrxLoRcpF1(mxkG);
AF1 rcpMkB=APrxLoRcpF1(mxkB);
#endif
#ifdef CAS_BETTER_DIAGONALS
AF1 ampfR=ASatF1(min(mnfR,AF1_(2.0)-mxfR)*rcpMfR);
AF1 ampfG=ASatF1(min(mnfG,AF1_(2.0)-mxfG)*rcpMfG);
AF1 ampfB=ASatF1(min(mnfB,AF1_(2.0)-mxfB)*rcpMfB);
AF1 ampgR=ASatF1(min(mngR,AF1_(2.0)-mxgR)*rcpMgR);
AF1 ampgG=ASatF1(min(mngG,AF1_(2.0)-mxgG)*rcpMgG);
AF1 ampgB=ASatF1(min(mngB,AF1_(2.0)-mxgB)*rcpMgB);
AF1 ampjR=ASatF1(min(mnjR,AF1_(2.0)-mxjR)*rcpMjR);
AF1 ampjG=ASatF1(min(mnjG,AF1_(2.0)-mxjG)*rcpMjG);
AF1 ampjB=ASatF1(min(mnjB,AF1_(2.0)-mxjB)*rcpMjB);
AF1 ampkR=ASatF1(min(mnkR,AF1_(2.0)-mxkR)*rcpMkR);
AF1 ampkG=ASatF1(min(mnkG,AF1_(2.0)-mxkG)*rcpMkG);
AF1 ampkB=ASatF1(min(mnkB,AF1_(2.0)-mxkB)*rcpMkB);
#else
AF1 ampfR=ASatF1(min(mnfR,AF1_(1.0)-mxfR)*rcpMfR);
AF1 ampfG=ASatF1(min(mnfG,AF1_(1.0)-mxfG)*rcpMfG);
AF1 ampfB=ASatF1(min(mnfB,AF1_(1.0)-mxfB)*rcpMfB);
AF1 ampgR=ASatF1(min(mngR,AF1_(1.0)-mxgR)*rcpMgR);
AF1 ampgG=ASatF1(min(mngG,AF1_(1.0)-mxgG)*rcpMgG);
AF1 ampgB=ASatF1(min(mngB,AF1_(1.0)-mxgB)*rcpMgB);
AF1 ampjR=ASatF1(min(mnjR,AF1_(1.0)-mxjR)*rcpMjR);
AF1 ampjG=ASatF1(min(mnjG,AF1_(1.0)-mxjG)*rcpMjG);
AF1 ampjB=ASatF1(min(mnjB,AF1_(1.0)-mxjB)*rcpMjB);
AF1 ampkR=ASatF1(min(mnkR,AF1_(1.0)-mxkR)*rcpMkR);
AF1 ampkG=ASatF1(min(mnkG,AF1_(1.0)-mxkG)*rcpMkG);
AF1 ampkB=ASatF1(min(mnkB,AF1_(1.0)-mxkB)*rcpMkB);
#endif
// Shaping amount of sharpening.
#ifdef CAS_GO_SLOWER
ampfR=sqrt(ampfR);
ampfG=sqrt(ampfG);
ampfB=sqrt(ampfB);
ampgR=sqrt(ampgR);
ampgG=sqrt(ampgG);
ampgB=sqrt(ampgB);
ampjR=sqrt(ampjR);
ampjG=sqrt(ampjG);
ampjB=sqrt(ampjB);
ampkR=sqrt(ampkR);
ampkG=sqrt(ampkG);
ampkB=sqrt(ampkB);
#else
ampfR=APrxLoSqrtF1(ampfR);
ampfG=APrxLoSqrtF1(ampfG);
ampfB=APrxLoSqrtF1(ampfB);
ampgR=APrxLoSqrtF1(ampgR);
ampgG=APrxLoSqrtF1(ampgG);
ampgB=APrxLoSqrtF1(ampgB);
ampjR=APrxLoSqrtF1(ampjR);
ampjG=APrxLoSqrtF1(ampjG);
ampjB=APrxLoSqrtF1(ampjB);
ampkR=APrxLoSqrtF1(ampkR);
ampkG=APrxLoSqrtF1(ampkG);
ampkB=APrxLoSqrtF1(ampkB);
#endif
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
AF1 peak=AF1_AU1(const1.x);
AF1 wfR=ampfR*peak;
AF1 wfG=ampfG*peak;
AF1 wfB=ampfB*peak;
AF1 wgR=ampgR*peak;
AF1 wgG=ampgG*peak;
AF1 wgB=ampgB*peak;
AF1 wjR=ampjR*peak;
AF1 wjG=ampjG*peak;
AF1 wjB=ampjB*peak;
AF1 wkR=ampkR*peak;
AF1 wkG=ampkG*peak;
AF1 wkB=ampkB*peak;
// Blend between 4 results.
// s t
// u v
AF1 s=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
AF1 t= pp.x *(AF1_(1.0)-pp.y);
AF1 u=(AF1_(1.0)-pp.x)* pp.y ;
AF1 v= pp.x * pp.y ;
// Thin edges to hide bilinear interpolation (helps diagonals).
AF1 thinB=1.0/32.0;
#ifdef CAS_GO_SLOWER
s*=ARcpF1(thinB+(mxfG-mnfG));
t*=ARcpF1(thinB+(mxgG-mngG));
u*=ARcpF1(thinB+(mxjG-mnjG));
v*=ARcpF1(thinB+(mxkG-mnkG));
#else
s*=APrxLoRcpF1(thinB+(mxfG-mnfG));
t*=APrxLoRcpF1(thinB+(mxgG-mngG));
u*=APrxLoRcpF1(thinB+(mxjG-mnjG));
v*=APrxLoRcpF1(thinB+(mxkG-mnkG));
#endif
// Final weighting.
// b c
// e f g h
// i j k l
// n o
// _____ _____ _____ _____
// fs gt
//
// _____ _____ _____ _____
// fs s gt fs t gt
// ju kv
// _____ _____ _____ _____
// fs gt
// ju u kv ju v kv
// _____ _____ _____ _____
//
// ju kv
AF1 qbeR=wfR*s;
AF1 qbeG=wfG*s;
AF1 qbeB=wfB*s;
AF1 qchR=wgR*t;
AF1 qchG=wgG*t;
AF1 qchB=wgB*t;
AF1 qfR=wgR*t+wjR*u+s;
AF1 qfG=wgG*t+wjG*u+s;
AF1 qfB=wgB*t+wjB*u+s;
AF1 qgR=wfR*s+wkR*v+t;
AF1 qgG=wfG*s+wkG*v+t;
AF1 qgB=wfB*s+wkB*v+t;
AF1 qjR=wfR*s+wkR*v+u;
AF1 qjG=wfG*s+wkG*v+u;
AF1 qjB=wfB*s+wkB*v+u;
AF1 qkR=wgR*t+wjR*u+v;
AF1 qkG=wgG*t+wjG*u+v;
AF1 qkB=wgB*t+wjB*u+v;
AF1 qinR=wjR*u;
AF1 qinG=wjG*u;
AF1 qinB=wjB*u;
AF1 qloR=wkR*v;
AF1 qloG=wkG*v;
AF1 qloB=wkB*v;
// Filter.
#ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead.
#ifdef CAS_GO_SLOWER
AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
#else
AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
#endif
pixR=ASatF1((b.r*qbeG+e.r*qbeG+c.r*qchG+h.r*qchG+i.r*qinG+n.r*qinG+l.r*qloG+o.r*qloG+f.r*qfG+g.r*qgG+j.r*qjG+k.r*qkG)*rcpWG);
pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG);
pixB=ASatF1((b.b*qbeG+e.b*qbeG+c.b*qchG+h.b*qchG+i.b*qinG+n.b*qinG+l.b*qloG+o.b*qloG+f.b*qfG+g.b*qgG+j.b*qjG+k.b*qkG)*rcpWG);
#else
#ifdef CAS_GO_SLOWER
AF1 rcpWR=ARcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR);
AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
AF1 rcpWB=ARcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB);
#else
AF1 rcpWR=APrxMedRcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR);
AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
AF1 rcpWB=APrxMedRcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB);
#endif
pixR=ASatF1((b.r*qbeR+e.r*qbeR+c.r*qchR+h.r*qchR+i.r*qinR+n.r*qinR+l.r*qloR+o.r*qloR+f.r*qfR+g.r*qgR+j.r*qjR+k.r*qkR)*rcpWR);
pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG);
pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB);
#endif
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// PACKED VERSION
//==============================================================================================================================
#if defined(A_GPU) && defined(A_HALF)
// Missing a way to do packed re-interpretation, so must disable approximation optimizations.
#ifdef A_HLSL
#ifndef CAS_GO_SLOWER
#define CAS_GO_SLOWER 1
#endif
#endif
//==============================================================================================================================
// Can be used to convert from packed SOA to AOS for store.
// Splits a packed two-pixel result (one AH2 per colour channel) into two per-pixel values:
// the .x lanes of pixR/pixG/pixB form pix0.rgb and the .y lanes form pix1.rgb.
// Alpha is written (as zero) only when A_HLSL is defined; otherwise it is left untouched.
void CasDepack(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
  // Second pixel: gather the .y lane of every channel.
  pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);
  // First pixel: gather the .x lane of every channel.
  pix0.rgb=AH3(pixR.x,pixG.x,pixB.x);
#ifdef A_HLSL
  // DX will not accept an 'out' value with an unwritten component, so this path pays for zeroing alpha.
  pix1.a=0.0;
  pix0.a=0.0;
#endif
}
//==============================================================================================================================
// Packed half-precision CAS filter: every call produces two output pixels, one per lane of each AH2 output.
// Lane .x is "tile 0" and lane .y is "tile 1"; tile 1 differs from tile 0 only by an offset in x.
// Input pixels are fetched with CasLoadH() and passed through CasInputH(); neither is defined inside this function.
// Compile-time switches read here: CAS_DEBUG_CHECKER, CAS_BETTER_DIAGONALS, CAS_GO_SLOWER, CAS_SLOW.
void CasFilterH(
// Output values are for 2 8x8 tiles in a 16x8 region.
// pix<R,G,B>.x = tile 0: sampled at ip (noScaling) or at the scaled position of ip (scaling)
// pix<R,G,B>.y = tile 1: sampled at ip+(8,0) (noScaling) or at the scaled x position plus const1.z (scaling)
// NOTE(review): this comment previously labelled .x as the right tile and .y as the left tile; the loads below put
// the +x-offset tile in .y, so that labelling looks inverted if +x points right -- confirm against the caller.
// This enables later processing to easily be packed as well.
out AH2 pixR,
out AH2 pixG,
out AH2 pixB,
AU2 ip, // Integer pixel position in output.
// const0.xy multiplies ip and const0.zw is added to it (both read through AF2_AU2) to get the input-space position.
AU4 const0, // Constants generated by CasSetup().
// const1.y supplies 'peak' (read through AH2_AU1, .x lane); const1.z is the tile 1 x offset used by the scaling path.
// NOTE(review): AF2_AU2/AH2_AU1/AF1_AU1 are presumably bit-reinterpretation helpers -- not visible here, confirm.
AU4 const1,
AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
//------------------------------------------------------------------------------------------------------------------------------
// Debug a checker pattern of on/off tiles for visual inspection.
// When bit 8 of (ip.x^ip.y) is clear, the input pixels are only run through CasInputH() and returned unfiltered.
#ifdef CAS_DEBUG_CHECKER
if((((ip.x^ip.y)>>8u)&1u)==0u){AH3 pix0=CasLoadH(ASW2(ip));AH3 pix1=CasLoadH(ASW2(ip)+ASW2(8,0));
pixR=AH2(pix0.r,pix1.r);pixG=AH2(pix0.g,pix1.g);pixB=AH2(pix0.b,pix1.b);CasInputH(pixR,pixG,pixB);return;}
#endif
//------------------------------------------------------------------------------------------------------------------------------
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
if(noScaling){
// Tap layout around the centre pixel e (suffix 0 = tile 0 at ip, suffix 1 = tile 1 at ip+(8,0)):
//  a b c
//  d e f
//  g h i
ASW2 sp0=ASW2(ip);
AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
AH3 d0=CasLoadH(sp0+ASW2(-1, 0));
AH3 e0=CasLoadH(sp0);
AH3 f0=CasLoadH(sp0+ASW2( 1, 0));
AH3 g0=CasLoadH(sp0+ASW2(-1, 1));
AH3 h0=CasLoadH(sp0+ASW2( 0, 1));
AH3 i0=CasLoadH(sp0+ASW2( 1, 1));
ASW2 sp1=sp0+ASW2(8,0);
AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
AH3 d1=CasLoadH(sp1+ASW2(-1, 0));
AH3 e1=CasLoadH(sp1);
AH3 f1=CasLoadH(sp1+ASW2( 1, 0));
AH3 g1=CasLoadH(sp1+ASW2(-1, 1));
AH3 h1=CasLoadH(sp1+ASW2( 0, 1));
AH3 i1=CasLoadH(sp1+ASW2( 1, 1));
// AOS to SOA conversion.
// Each tap becomes three AH2 values (R,G,B) holding tile 0 in .x and tile 1 in .y.
AH2 aR=AH2(a0.r,a1.r);
AH2 aG=AH2(a0.g,a1.g);
AH2 aB=AH2(a0.b,a1.b);
AH2 bR=AH2(b0.r,b1.r);
AH2 bG=AH2(b0.g,b1.g);
AH2 bB=AH2(b0.b,b1.b);
AH2 cR=AH2(c0.r,c1.r);
AH2 cG=AH2(c0.g,c1.g);
AH2 cB=AH2(c0.b,c1.b);
AH2 dR=AH2(d0.r,d1.r);
AH2 dG=AH2(d0.g,d1.g);
AH2 dB=AH2(d0.b,d1.b);
AH2 eR=AH2(e0.r,e1.r);
AH2 eG=AH2(e0.g,e1.g);
AH2 eB=AH2(e0.b,e1.b);
AH2 fR=AH2(f0.r,f1.r);
AH2 fG=AH2(f0.g,f1.g);
AH2 fB=AH2(f0.b,f1.b);
AH2 gR=AH2(g0.r,g1.r);
AH2 gG=AH2(g0.g,g1.g);
AH2 gB=AH2(g0.b,g1.b);
AH2 hR=AH2(h0.r,h1.r);
AH2 hG=AH2(h0.g,h1.g);
AH2 hB=AH2(h0.b,h1.b);
AH2 iR=AH2(i0.r,i1.r);
AH2 iG=AH2(i0.g,i1.g);
AH2 iB=AH2(i0.b,i1.b);
// Run optional input transform.
CasInputH(aR,aG,aB);
CasInputH(bR,bG,bB);
CasInputH(cR,cG,cB);
CasInputH(dR,dG,dB);
CasInputH(eR,eG,eB);
CasInputH(fR,fG,fB);
CasInputH(gR,gG,gB);
CasInputH(hR,hG,hB);
CasInputH(iR,iG,iB);
// Soft min and max.
// Base min/max cover the plus-shaped taps b,d,e,f,h.
AH2 mnR=min(min(fR,hR),min(min(bR,dR),eR));
AH2 mnG=min(min(fG,hG),min(min(bG,dG),eG));
AH2 mnB=min(min(fB,hB),min(min(bB,dB),eB));
#ifdef CAS_BETTER_DIAGONALS
// Extend the min to the corners a,c,g,i (full 3x3) and add it to the plus-shaped min, so mn* is a sum of two minima.
AH2 mnR2=min(min(gR,iR),min(min(aR,cR),mnR));
AH2 mnG2=min(min(gG,iG),min(min(aG,cG),mnG));
AH2 mnB2=min(min(gB,iB),min(min(aB,cB),mnB));
mnR=mnR+mnR2;
mnG=mnG+mnG2;
mnB=mnB+mnB2;
#endif
AH2 mxR=max(max(fR,hR),max(max(bR,dR),eR));
AH2 mxG=max(max(fG,hG),max(max(bG,dG),eG));
AH2 mxB=max(max(fB,hB),max(max(bB,dB),eB));
#ifdef CAS_BETTER_DIAGONALS
// Same treatment for the max: plus-shaped max plus full 3x3 max.
AH2 mxR2=max(max(gR,iR),max(max(aR,cR),mxR));
AH2 mxG2=max(max(gG,iG),max(max(aG,cG),mxG));
AH2 mxB2=max(max(gB,iB),max(max(aB,cB),mxB));
mxR=mxR+mxR2;
mxG=mxG+mxG2;
mxB=mxB+mxB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
// CAS_GO_SLOWER selects ARcpH2/sqrt; otherwise the APrx* variants are used.
#ifdef CAS_GO_SLOWER
AH2 rcpMR=ARcpH2(mxR);
AH2 rcpMG=ARcpH2(mxG);
AH2 rcpMB=ARcpH2(mxB);
#else
AH2 rcpMR=APrxLoRcpH2(mxR);
AH2 rcpMG=APrxLoRcpH2(mxG);
AH2 rcpMB=APrxLoRcpH2(mxB);
#endif
#ifdef CAS_BETTER_DIAGONALS
// The limit is 2.0 here because mn*/mx* are sums of two values in this configuration.
AH2 ampR=ASatH2(min(mnR,AH2_(2.0)-mxR)*rcpMR);
AH2 ampG=ASatH2(min(mnG,AH2_(2.0)-mxG)*rcpMG);
AH2 ampB=ASatH2(min(mnB,AH2_(2.0)-mxB)*rcpMB);
#else
AH2 ampR=ASatH2(min(mnR,AH2_(1.0)-mxR)*rcpMR);
AH2 ampG=ASatH2(min(mnG,AH2_(1.0)-mxG)*rcpMG);
AH2 ampB=ASatH2(min(mnB,AH2_(1.0)-mxB)*rcpMB);
#endif
// Shaping amount of sharpening.
#ifdef CAS_GO_SLOWER
ampR=sqrt(ampR);
ampG=sqrt(ampG);
ampB=sqrt(ampB);
#else
ampR=APrxLoSqrtH2(ampR);
ampG=APrxLoSqrtH2(ampG);
ampB=APrxLoSqrtH2(ampB);
#endif
// Filter shape.
// w is the weight given to each of the four plus-shaped neighbours b,d,f,h; the centre e has weight 1.
AH1 peak=AH2_AU1(const1.y).x;
AH2 wR=ampR*AH2_(peak);
AH2 wG=ampG*AH2_(peak);
AH2 wB=ampB*AH2_(peak);
// Filter.
// Without CAS_SLOW the green weight wG drives all three channels (a single reciprocal).
// With CAS_SLOW each channel uses its own weight and reciprocal.
#ifndef CAS_SLOW
#ifdef CAS_GO_SLOWER
AH2 rcpWeight=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
#else
AH2 rcpWeight=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
#endif
pixR=ASatH2((bR*wG+dR*wG+fR*wG+hR*wG+eR)*rcpWeight);
pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeight);
pixB=ASatH2((bB*wG+dB*wG+fB*wG+hB*wG+eB)*rcpWeight);
#else
#ifdef CAS_GO_SLOWER
AH2 rcpWeightR=ARcpH2(AH2_(1.0)+AH2_(4.0)*wR);
AH2 rcpWeightG=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
AH2 rcpWeightB=ARcpH2(AH2_(1.0)+AH2_(4.0)*wB);
#else
AH2 rcpWeightR=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wR);
AH2 rcpWeightG=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
AH2 rcpWeightB=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wB);
#endif
pixR=ASatH2((bR*wR+dR*wR+fR*wR+hR*wR+eR)*rcpWeightR);
pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeightG);
pixB=ASatH2((bB*wB+dB*wB+fB*wB+hB*wB+eB)*rcpWeightB);
#endif
return;}
//------------------------------------------------------------------------------------------------------------------------------
// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
// pp is the position of this output pixel in input space: ip scaled by const0.xy and offset by const0.zw.
AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw);
// Tile 0.
// Fractional position is needed in high precision here.
// The subtraction is done in AF2/AF1 and only the resulting fraction is converted to half.
AF2 fp0=floor(pp);
AH2 ppX;
ppX.x=AH1(pp.x-fp0.x);
AH1 ppY=AH1(pp.y-fp0.y);
ASW2 sp0=ASW2(fp0);
// 4x4 tap layout relative to sp0 (f is at sp0 itself):
//  a b c d
//  e f g h
//  i j k l
//  m n o p
// The corners a,d,m,p are only read by the CAS_BETTER_DIAGONALS min/max below; the final filter does not use them.
AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
AH3 e0=CasLoadH(sp0+ASW2(-1, 0));
AH3 f0=CasLoadH(sp0);
AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
AH3 d0=CasLoadH(sp0+ASW2( 2,-1));
AH3 g0=CasLoadH(sp0+ASW2( 1, 0));
AH3 h0=CasLoadH(sp0+ASW2( 2, 0));
AH3 i0=CasLoadH(sp0+ASW2(-1, 1));
AH3 j0=CasLoadH(sp0+ASW2( 0, 1));
AH3 m0=CasLoadH(sp0+ASW2(-1, 2));
AH3 n0=CasLoadH(sp0+ASW2( 0, 2));
AH3 k0=CasLoadH(sp0+ASW2( 1, 1));
AH3 l0=CasLoadH(sp0+ASW2( 2, 1));
AH3 o0=CasLoadH(sp0+ASW2( 1, 2));
AH3 p0=CasLoadH(sp0+ASW2( 2, 2));
// Tile 1 (offset only in x).
// The x offset comes from const1.z; the row (sp0.y) and the vertical fraction ppY are shared with tile 0.
AF1 pp1=pp.x+AF1_AU1(const1.z);
AF1 fp1=floor(pp1);
ppX.y=AH1(pp1-fp1);
ASW2 sp1=ASW2(fp1,sp0.y);
AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
AH3 e1=CasLoadH(sp1+ASW2(-1, 0));
AH3 f1=CasLoadH(sp1);
AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
AH3 d1=CasLoadH(sp1+ASW2( 2,-1));
AH3 g1=CasLoadH(sp1+ASW2( 1, 0));
AH3 h1=CasLoadH(sp1+ASW2( 2, 0));
AH3 i1=CasLoadH(sp1+ASW2(-1, 1));
AH3 j1=CasLoadH(sp1+ASW2( 0, 1));
AH3 m1=CasLoadH(sp1+ASW2(-1, 2));
AH3 n1=CasLoadH(sp1+ASW2( 0, 2));
AH3 k1=CasLoadH(sp1+ASW2( 1, 1));
AH3 l1=CasLoadH(sp1+ASW2( 2, 1));
AH3 o1=CasLoadH(sp1+ASW2( 1, 2));
AH3 p1=CasLoadH(sp1+ASW2( 2, 2));
// AOS to SOA conversion.
// Each tap becomes three AH2 values (R,G,B) holding tile 0 in .x and tile 1 in .y.
AH2 aR=AH2(a0.r,a1.r);
AH2 aG=AH2(a0.g,a1.g);
AH2 aB=AH2(a0.b,a1.b);
AH2 bR=AH2(b0.r,b1.r);
AH2 bG=AH2(b0.g,b1.g);
AH2 bB=AH2(b0.b,b1.b);
AH2 cR=AH2(c0.r,c1.r);
AH2 cG=AH2(c0.g,c1.g);
AH2 cB=AH2(c0.b,c1.b);
AH2 dR=AH2(d0.r,d1.r);
AH2 dG=AH2(d0.g,d1.g);
AH2 dB=AH2(d0.b,d1.b);
AH2 eR=AH2(e0.r,e1.r);
AH2 eG=AH2(e0.g,e1.g);
AH2 eB=AH2(e0.b,e1.b);
AH2 fR=AH2(f0.r,f1.r);
AH2 fG=AH2(f0.g,f1.g);
AH2 fB=AH2(f0.b,f1.b);
AH2 gR=AH2(g0.r,g1.r);
AH2 gG=AH2(g0.g,g1.g);
AH2 gB=AH2(g0.b,g1.b);
AH2 hR=AH2(h0.r,h1.r);
AH2 hG=AH2(h0.g,h1.g);
AH2 hB=AH2(h0.b,h1.b);
AH2 iR=AH2(i0.r,i1.r);
AH2 iG=AH2(i0.g,i1.g);
AH2 iB=AH2(i0.b,i1.b);
AH2 jR=AH2(j0.r,j1.r);
AH2 jG=AH2(j0.g,j1.g);
AH2 jB=AH2(j0.b,j1.b);
AH2 kR=AH2(k0.r,k1.r);
AH2 kG=AH2(k0.g,k1.g);
AH2 kB=AH2(k0.b,k1.b);
AH2 lR=AH2(l0.r,l1.r);
AH2 lG=AH2(l0.g,l1.g);
AH2 lB=AH2(l0.b,l1.b);
AH2 mR=AH2(m0.r,m1.r);
AH2 mG=AH2(m0.g,m1.g);
AH2 mB=AH2(m0.b,m1.b);
AH2 nR=AH2(n0.r,n1.r);
AH2 nG=AH2(n0.g,n1.g);
AH2 nB=AH2(n0.b,n1.b);
AH2 oR=AH2(o0.r,o1.r);
AH2 oG=AH2(o0.g,o1.g);
AH2 oB=AH2(o0.b,o1.b);
AH2 pR=AH2(p0.r,p1.r);
AH2 pG=AH2(p0.g,p1.g);
AH2 pB=AH2(p0.b,p1.b);
// Run optional input transform.
CasInputH(aR,aG,aB);
CasInputH(bR,bG,bB);
CasInputH(cR,cG,cB);
CasInputH(dR,dG,dB);
CasInputH(eR,eG,eB);
CasInputH(fR,fG,fB);
CasInputH(gR,gG,gB);
CasInputH(hR,hG,hB);
CasInputH(iR,iG,iB);
CasInputH(jR,jG,jB);
CasInputH(kR,kG,kB);
CasInputH(lR,lG,lB);
CasInputH(mR,mG,mB);
CasInputH(nR,nG,nB);
CasInputH(oR,oG,oB);
CasInputH(pR,pG,pB);
// Soft min and max.
// These are 2.0x bigger (factored out the extra multiply).
// The doubling only happens under CAS_BETTER_DIAGONALS, where the plus-shaped and full 3x3 results are summed.
// Four 3x3 kernels are evaluated, centred on f, g, j and k (the 2x2 block surrounding the sample position).
// a b c b
// e f g * 0.5 + e f g * 0.5 [F]
// i j k j
AH2 mnfR=AMin3H2(AMin3H2(bR,eR,fR),gR,jR);
AH2 mnfG=AMin3H2(AMin3H2(bG,eG,fG),gG,jG);
AH2 mnfB=AMin3H2(AMin3H2(bB,eB,fB),gB,jB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mnfR2=AMin3H2(AMin3H2(mnfR,aR,cR),iR,kR);
AH2 mnfG2=AMin3H2(AMin3H2(mnfG,aG,cG),iG,kG);
AH2 mnfB2=AMin3H2(AMin3H2(mnfB,aB,cB),iB,kB);
mnfR=mnfR+mnfR2;
mnfG=mnfG+mnfG2;
mnfB=mnfB+mnfB2;
#endif
AH2 mxfR=AMax3H2(AMax3H2(bR,eR,fR),gR,jR);
AH2 mxfG=AMax3H2(AMax3H2(bG,eG,fG),gG,jG);
AH2 mxfB=AMax3H2(AMax3H2(bB,eB,fB),gB,jB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mxfR2=AMax3H2(AMax3H2(mxfR,aR,cR),iR,kR);
AH2 mxfG2=AMax3H2(AMax3H2(mxfG,aG,cG),iG,kG);
AH2 mxfB2=AMax3H2(AMax3H2(mxfB,aB,cB),iB,kB);
mxfR=mxfR+mxfR2;
mxfG=mxfG+mxfG2;
mxfB=mxfB+mxfB2;
#endif
// b c d c
// f g h * 0.5 + f g h * 0.5 [G]
// j k l k
AH2 mngR=AMin3H2(AMin3H2(cR,fR,gR),hR,kR);
AH2 mngG=AMin3H2(AMin3H2(cG,fG,gG),hG,kG);
AH2 mngB=AMin3H2(AMin3H2(cB,fB,gB),hB,kB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mngR2=AMin3H2(AMin3H2(mngR,bR,dR),jR,lR);
AH2 mngG2=AMin3H2(AMin3H2(mngG,bG,dG),jG,lG);
AH2 mngB2=AMin3H2(AMin3H2(mngB,bB,dB),jB,lB);
mngR=mngR+mngR2;
mngG=mngG+mngG2;
mngB=mngB+mngB2;
#endif
AH2 mxgR=AMax3H2(AMax3H2(cR,fR,gR),hR,kR);
AH2 mxgG=AMax3H2(AMax3H2(cG,fG,gG),hG,kG);
AH2 mxgB=AMax3H2(AMax3H2(cB,fB,gB),hB,kB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mxgR2=AMax3H2(AMax3H2(mxgR,bR,dR),jR,lR);
AH2 mxgG2=AMax3H2(AMax3H2(mxgG,bG,dG),jG,lG);
AH2 mxgB2=AMax3H2(AMax3H2(mxgB,bB,dB),jB,lB);
mxgR=mxgR+mxgR2;
mxgG=mxgG+mxgG2;
mxgB=mxgB+mxgB2;
#endif
// e f g f
// i j k * 0.5 + i j k * 0.5 [J]
// m n o n
AH2 mnjR=AMin3H2(AMin3H2(fR,iR,jR),kR,nR);
AH2 mnjG=AMin3H2(AMin3H2(fG,iG,jG),kG,nG);
AH2 mnjB=AMin3H2(AMin3H2(fB,iB,jB),kB,nB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mnjR2=AMin3H2(AMin3H2(mnjR,eR,gR),mR,oR);
AH2 mnjG2=AMin3H2(AMin3H2(mnjG,eG,gG),mG,oG);
AH2 mnjB2=AMin3H2(AMin3H2(mnjB,eB,gB),mB,oB);
mnjR=mnjR+mnjR2;
mnjG=mnjG+mnjG2;
mnjB=mnjB+mnjB2;
#endif
AH2 mxjR=AMax3H2(AMax3H2(fR,iR,jR),kR,nR);
AH2 mxjG=AMax3H2(AMax3H2(fG,iG,jG),kG,nG);
AH2 mxjB=AMax3H2(AMax3H2(fB,iB,jB),kB,nB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mxjR2=AMax3H2(AMax3H2(mxjR,eR,gR),mR,oR);
AH2 mxjG2=AMax3H2(AMax3H2(mxjG,eG,gG),mG,oG);
AH2 mxjB2=AMax3H2(AMax3H2(mxjB,eB,gB),mB,oB);
mxjR=mxjR+mxjR2;
mxjG=mxjG+mxjG2;
mxjB=mxjB+mxjB2;
#endif
// f g h g
// j k l * 0.5 + j k l * 0.5 [K]
// n o p o
AH2 mnkR=AMin3H2(AMin3H2(gR,jR,kR),lR,oR);
AH2 mnkG=AMin3H2(AMin3H2(gG,jG,kG),lG,oG);
AH2 mnkB=AMin3H2(AMin3H2(gB,jB,kB),lB,oB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mnkR2=AMin3H2(AMin3H2(mnkR,fR,hR),nR,pR);
AH2 mnkG2=AMin3H2(AMin3H2(mnkG,fG,hG),nG,pG);
AH2 mnkB2=AMin3H2(AMin3H2(mnkB,fB,hB),nB,pB);
mnkR=mnkR+mnkR2;
mnkG=mnkG+mnkG2;
mnkB=mnkB+mnkB2;
#endif
AH2 mxkR=AMax3H2(AMax3H2(gR,jR,kR),lR,oR);
AH2 mxkG=AMax3H2(AMax3H2(gG,jG,kG),lG,oG);
AH2 mxkB=AMax3H2(AMax3H2(gB,jB,kB),lB,oB);
#ifdef CAS_BETTER_DIAGONALS
AH2 mxkR2=AMax3H2(AMax3H2(mxkR,fR,hR),nR,pR);
AH2 mxkG2=AMax3H2(AMax3H2(mxkG,fG,hG),nG,pG);
AH2 mxkB2=AMax3H2(AMax3H2(mxkB,fB,hB),nB,pB);
mxkR=mxkR+mxkR2;
mxkG=mxkG+mxkG2;
mxkB=mxkB+mxkB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
// CAS_GO_SLOWER selects ARcpH2/sqrt; otherwise the APrx* variants are used.
#ifdef CAS_GO_SLOWER
AH2 rcpMfR=ARcpH2(mxfR);
AH2 rcpMfG=ARcpH2(mxfG);
AH2 rcpMfB=ARcpH2(mxfB);
AH2 rcpMgR=ARcpH2(mxgR);
AH2 rcpMgG=ARcpH2(mxgG);
AH2 rcpMgB=ARcpH2(mxgB);
AH2 rcpMjR=ARcpH2(mxjR);
AH2 rcpMjG=ARcpH2(mxjG);
AH2 rcpMjB=ARcpH2(mxjB);
AH2 rcpMkR=ARcpH2(mxkR);
AH2 rcpMkG=ARcpH2(mxkG);
AH2 rcpMkB=ARcpH2(mxkB);
#else
AH2 rcpMfR=APrxLoRcpH2(mxfR);
AH2 rcpMfG=APrxLoRcpH2(mxfG);
AH2 rcpMfB=APrxLoRcpH2(mxfB);
AH2 rcpMgR=APrxLoRcpH2(mxgR);
AH2 rcpMgG=APrxLoRcpH2(mxgG);
AH2 rcpMgB=APrxLoRcpH2(mxgB);
AH2 rcpMjR=APrxLoRcpH2(mxjR);
AH2 rcpMjG=APrxLoRcpH2(mxjG);
AH2 rcpMjB=APrxLoRcpH2(mxjB);
AH2 rcpMkR=APrxLoRcpH2(mxkR);
AH2 rcpMkG=APrxLoRcpH2(mxkG);
AH2 rcpMkB=APrxLoRcpH2(mxkB);
#endif
#ifdef CAS_BETTER_DIAGONALS
// The limit is 2.0 here because mn*/mx* are sums of two values in this configuration.
AH2 ampfR=ASatH2(min(mnfR,AH2_(2.0)-mxfR)*rcpMfR);
AH2 ampfG=ASatH2(min(mnfG,AH2_(2.0)-mxfG)*rcpMfG);
AH2 ampfB=ASatH2(min(mnfB,AH2_(2.0)-mxfB)*rcpMfB);
AH2 ampgR=ASatH2(min(mngR,AH2_(2.0)-mxgR)*rcpMgR);
AH2 ampgG=ASatH2(min(mngG,AH2_(2.0)-mxgG)*rcpMgG);
AH2 ampgB=ASatH2(min(mngB,AH2_(2.0)-mxgB)*rcpMgB);
AH2 ampjR=ASatH2(min(mnjR,AH2_(2.0)-mxjR)*rcpMjR);
AH2 ampjG=ASatH2(min(mnjG,AH2_(2.0)-mxjG)*rcpMjG);
AH2 ampjB=ASatH2(min(mnjB,AH2_(2.0)-mxjB)*rcpMjB);
AH2 ampkR=ASatH2(min(mnkR,AH2_(2.0)-mxkR)*rcpMkR);
AH2 ampkG=ASatH2(min(mnkG,AH2_(2.0)-mxkG)*rcpMkG);
AH2 ampkB=ASatH2(min(mnkB,AH2_(2.0)-mxkB)*rcpMkB);
#else
AH2 ampfR=ASatH2(min(mnfR,AH2_(1.0)-mxfR)*rcpMfR);
AH2 ampfG=ASatH2(min(mnfG,AH2_(1.0)-mxfG)*rcpMfG);
AH2 ampfB=ASatH2(min(mnfB,AH2_(1.0)-mxfB)*rcpMfB);
AH2 ampgR=ASatH2(min(mngR,AH2_(1.0)-mxgR)*rcpMgR);
AH2 ampgG=ASatH2(min(mngG,AH2_(1.0)-mxgG)*rcpMgG);
AH2 ampgB=ASatH2(min(mngB,AH2_(1.0)-mxgB)*rcpMgB);
AH2 ampjR=ASatH2(min(mnjR,AH2_(1.0)-mxjR)*rcpMjR);
AH2 ampjG=ASatH2(min(mnjG,AH2_(1.0)-mxjG)*rcpMjG);
AH2 ampjB=ASatH2(min(mnjB,AH2_(1.0)-mxjB)*rcpMjB);
AH2 ampkR=ASatH2(min(mnkR,AH2_(1.0)-mxkR)*rcpMkR);
AH2 ampkG=ASatH2(min(mnkG,AH2_(1.0)-mxkG)*rcpMkG);
AH2 ampkB=ASatH2(min(mnkB,AH2_(1.0)-mxkB)*rcpMkB);
#endif
// Shaping amount of sharpening.
#ifdef CAS_GO_SLOWER
ampfR=sqrt(ampfR);
ampfG=sqrt(ampfG);
ampfB=sqrt(ampfB);
ampgR=sqrt(ampgR);
ampgG=sqrt(ampgG);
ampgB=sqrt(ampgB);
ampjR=sqrt(ampjR);
ampjG=sqrt(ampjG);
ampjB=sqrt(ampjB);
ampkR=sqrt(ampkR);
ampkG=sqrt(ampkG);
ampkB=sqrt(ampkB);
#else
ampfR=APrxLoSqrtH2(ampfR);
ampfG=APrxLoSqrtH2(ampfG);
ampfB=APrxLoSqrtH2(ampfB);
ampgR=APrxLoSqrtH2(ampgR);
ampgG=APrxLoSqrtH2(ampgG);
ampgB=APrxLoSqrtH2(ampgB);
ampjR=APrxLoSqrtH2(ampjR);
ampjG=APrxLoSqrtH2(ampjG);
ampjB=APrxLoSqrtH2(ampjB);
ampkR=APrxLoSqrtH2(ampkR);
ampkG=APrxLoSqrtH2(ampkG);
ampkB=APrxLoSqrtH2(ampkB);
#endif
// Filter shape.
// One neighbour weight per kernel centre (f,g,j,k) and per channel: the shaped amplitude scaled by 'peak'.
AH1 peak=AH2_AU1(const1.y).x;
AH2 wfR=ampfR*AH2_(peak);
AH2 wfG=ampfG*AH2_(peak);
AH2 wfB=ampfB*AH2_(peak);
AH2 wgR=ampgR*AH2_(peak);
AH2 wgG=ampgG*AH2_(peak);
AH2 wgB=ampgB*AH2_(peak);
AH2 wjR=ampjR*AH2_(peak);
AH2 wjG=ampjG*AH2_(peak);
AH2 wjB=ampjB*AH2_(peak);
AH2 wkR=ampkR*AH2_(peak);
AH2 wkG=ampkG*AH2_(peak);
AH2 wkB=ampkB*AH2_(peak);
// Blend between 4 results.
// Bilinear weights from the fractional position: s -> kernel F, t -> kernel G, u -> kernel J, v -> kernel K.
// ppX differs per lane (tile); ppY is broadcast to both lanes.
AH2 s=(AH2_(1.0)-ppX)*(AH2_(1.0)-AH2_(ppY));
AH2 t= ppX *(AH2_(1.0)-AH2_(ppY));
AH2 u=(AH2_(1.0)-ppX)* AH2_(ppY) ;
AH2 v= ppX * AH2_(ppY) ;
// Thin edges to hide bilinear interpolation (helps diagonals).
// Each blend weight is divided by (thinB + green max-min of its kernel), so kernels with a larger green range get less weight.
AH2 thinB=AH2_(1.0/32.0);
#ifdef CAS_GO_SLOWER
s*=ARcpH2(thinB+(mxfG-mnfG));
t*=ARcpH2(thinB+(mxgG-mngG));
u*=ARcpH2(thinB+(mxjG-mnjG));
v*=ARcpH2(thinB+(mxkG-mnkG));
#else
s*=APrxLoRcpH2(thinB+(mxfG-mnfG));
t*=APrxLoRcpH2(thinB+(mxgG-mngG));
u*=APrxLoRcpH2(thinB+(mxjG-mnjG));
v*=APrxLoRcpH2(thinB+(mxkG-mnkG));
#endif
// Final weighting.
// Each q* is the combined weight of the taps it is multiplied with in the filter below:
//  qbe -> taps b and e, qch -> taps c and h, qin -> taps i and n, qlo -> taps l and o,
//  qf, qg, qj, qk -> the four kernel centres f, g, j, k.
AH2 qbeR=wfR*s;
AH2 qbeG=wfG*s;
AH2 qbeB=wfB*s;
AH2 qchR=wgR*t;
AH2 qchG=wgG*t;
AH2 qchB=wgB*t;
AH2 qfR=wgR*t+wjR*u+s;
AH2 qfG=wgG*t+wjG*u+s;
AH2 qfB=wgB*t+wjB*u+s;
AH2 qgR=wfR*s+wkR*v+t;
AH2 qgG=wfG*s+wkG*v+t;
AH2 qgB=wfB*s+wkB*v+t;
AH2 qjR=wfR*s+wkR*v+u;
AH2 qjG=wfG*s+wkG*v+u;
AH2 qjB=wfB*s+wkB*v+u;
AH2 qkR=wgR*t+wjR*u+v;
AH2 qkG=wgG*t+wjG*u+v;
AH2 qkB=wgB*t+wjB*u+v;
AH2 qinR=wjR*u;
AH2 qinG=wjG*u;
AH2 qinB=wjB*u;
AH2 qloR=wkR*v;
AH2 qloG=wkG*v;
AH2 qloB=wkB*v;
// Filter.
// rcpW* normalises by the sum of all tap weights; the two-tap weights (qbe,qch,qin,qlo) are counted twice.
// Without CAS_SLOW the green weights drive all three channels; with CAS_SLOW each channel uses its own set.
#ifndef CAS_SLOW
#ifdef CAS_GO_SLOWER
AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
#else
AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
#endif
pixR=ASatH2((bR*qbeG+eR*qbeG+cR*qchG+hR*qchG+iR*qinG+nR*qinG+lR*qloG+oR*qloG+fR*qfG+gR*qgG+jR*qjG+kR*qkG)*rcpWG);
pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
pixB=ASatH2((bB*qbeG+eB*qbeG+cB*qchG+hB*qchG+iB*qinG+nB*qinG+lB*qloG+oB*qloG+fB*qfG+gB*qgG+jB*qjG+kB*qkG)*rcpWG);
#else
#ifdef CAS_GO_SLOWER
AH2 rcpWR=ARcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
AH2 rcpWB=ARcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
#else
AH2 rcpWR=APrxMedRcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
AH2 rcpWB=APrxMedRcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
#endif
pixR=ASatH2((bR*qbeR+eR*qbeR+cR*qchR+hR*qchR+iR*qinR+nR*qinR+lR*qloR+oR*qloR+fR*qfR+gR*qgR+jR*qjR+kR*qkR)*rcpWR);
pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB);
#endif
}
#endif