mirror of
https://github.com/gnif/LookingGlass.git
synced 2024-12-24 22:43:39 +00:00
1446 lines
59 KiB
C
1446 lines
59 KiB
C
|
//_____________________________________________________________/\_______________________________________________________________
|
||
|
//==============================================================================================================================
|
||
|
//
|
||
|
// [CAS] FIDELITY FX - CONTRAST ADAPTIVE SHARPENING 1.20190610
|
||
|
//
|
||
|
//==============================================================================================================================
|
||
|
// LICENSE
|
||
|
// =======
|
||
|
// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
|
||
|
// -------
|
||
|
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
|
||
|
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
|
||
|
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
|
||
|
// Software is furnished to do so, subject to the following conditions:
|
||
|
// -------
|
||
|
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
|
||
|
// Software.
|
||
|
// -------
|
||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||
|
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||
|
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||
|
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// ABOUT
|
||
|
// =====
|
||
|
// CAS is a spatial only filter.
|
||
|
// CAS takes RGB color input.
|
||
|
// CAS enhances sharpness and local high-frequency contrast, with or without added upsampling.
|
||
|
// CAS outputs RGB color.
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// SUGGESTIONS FOR INTEGRATION
|
||
|
// ===========================
|
||
|
// Best for performance, run CAS in sharpen-only mode, choose a video mode to have scan-out or the display scale.
|
||
|
// - Sharpen-only mode is faster, and provides a better quality sharpening.
|
||
|
// The scaling support in CAS was designed for when the application wants to do Dynamic Resolution Scaling (DRS).
|
||
|
// - With DRS, the render resolution can change per frame.
|
||
|
// - Use CAS to sharpen and upsample to the fixed output resolution, then composite the full resolution UI over CAS output.
|
||
|
// - This can all happen in one compute dispatch.
|
||
|
// It is likely better to reduce the amount of film grain which happens before CAS (as CAS will amplify grain).
|
||
|
// - An alternative would be to add grain after CAS.
|
||
|
// It is best to run CAS after tonemapping.
|
||
|
// - CAS needs to have input value 1.0 at the peak of the display output.
|
||
|
// It is ok to run CAS after compositing UI (it won't harm the UI).
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// EXECUTION
|
||
|
// =========
|
||
|
// CAS runs as a compute shader.
|
||
|
// CAS is designed to be run either in a 32-bit, CasFilter(), or packed 16-bit, CasFilterH(), form.
|
||
|
// The 32-bit form works on 8x8 tiles via one {64,1,1} workgroup.
|
||
|
// The 16-bit form works on a pair of 8x8 tiles in a 16x8 configuration via one {64,1,1} workgroup.
|
||
|
// CAS is designed to work best in semi-persistent form if running not async with graphics.
|
||
|
// For 32-bit this means looping across a collection of 4 8x8 tiles in a 2x2 tile foot-print.
|
||
|
// For 16-bit this means looping 2 times, once for the top 16x8 region and once for the bottom 16x8 region.
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// INTEGRATION SUMMARY FOR CPU
|
||
|
// ===========================
|
||
|
// // Make sure <stdint.h> has already been included.
|
||
|
// // Setup pre-portability-header defines.
|
||
|
// #define A_CPU 1
|
||
|
// // Include the portability header (requires version 1.20190530 or later which is backwards compatible).
|
||
|
// #include "ffx_a.h"
|
||
|
// // Include the CAS header.
|
||
|
// #include "ffx_cas.h"
|
||
|
// ...
|
||
|
// // Call the setup function to build out the constants for the shader, pass these to the shader.
|
||
|
// // The 'varAU4(const0);' expands into 'uint32_t const0[4];' on the CPU.
|
||
|
// varAU4(const0);
|
||
|
// varAU4(const1);
|
||
|
// CasSetup(const0,const1,
|
||
|
// 0.0f, // Sharpness tuning knob (0.0 to 1.0).
|
||
|
// 1920.0f,1080.0f, // Example input size.
|
||
|
// 2560.0f,1440.0f); // Example output size.
|
||
|
// ...
|
||
|
// // Later dispatch the shader based on the amount of semi-persistent loop unrolling.
|
||
|
// // Here is an example for running with the 16x16 (4-way unroll for 32-bit or 2-way unroll for 16-bit)
|
||
|
// vkCmdDispatch(cmdBuf,(widthInPixels+15)>>4,(heightInPixels+15)>>4,1);
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// INTEGRATION SUMMARY FOR GPU
|
||
|
// ===========================
|
||
|
// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT.
|
||
|
// layout(set=0,binding=0,rgba16f)uniform image2D imgSrc;
|
||
|
// layout(set=0,binding=1,rgba16f)uniform image2D imgDst;
|
||
|
// ...
|
||
|
// // Setup pre-portability-header defines (sets up GLSL/HLSL path, packed math support, etc)
|
||
|
// #define A_GPU 1
|
||
|
// #define A_GLSL 1
|
||
|
// #define A_HALF 1
|
||
|
// ...
|
||
|
// // Include the portability header (or copy it in without an include).
|
||
|
// #include "ffx_a.h"
|
||
|
// ...
|
||
|
// // Define the fetch function(s).
|
||
|
// // CasLoad() takes a 32-bit unsigned integer 2D coordinate and loads color.
|
||
|
// AF3 CasLoad(ASU2 p){return imageLoad(imgSrc,p).rgb;}
|
||
|
// // CasLoadH() is the 16-bit version taking 16-bit unsigned integer 2D coordinate and loading 16-bit float color.
|
||
|
// // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead.
|
||
|
// // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead.
|
||
|
// AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);}
|
||
|
// ...
|
||
|
// // Define the input modifiers as nop's initially.
|
||
|
// // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions.
|
||
|
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
|
||
|
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){}
|
||
|
// ...
|
||
|
// // Include this CAS header file (or copy it in without an include).
|
||
|
// #include "ffx_cas.h"
|
||
|
// ...
|
||
|
// // Example in shader integration for loop-unrolled 16x16 case for 32-bit.
|
||
|
// layout(local_size_x=64)in;
|
||
|
// void main(){
|
||
|
// // Fetch constants from CasSetup().
|
||
|
// AU4 const0=cb.const0;
|
||
|
// AU4 const1=cb.const1;
|
||
|
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
|
||
|
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
|
||
|
// // Filter.
|
||
|
// AF4 c;
|
||
|
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
|
||
|
// gxy.x+=8u;
|
||
|
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
|
||
|
// gxy.y+=8u;
|
||
|
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
|
||
|
// gxy.x-=8u;
|
||
|
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);}
|
||
|
// ...
|
||
|
// // Example for semi-persistent 16x16 but this time for packed math.
|
||
|
// // Use this before including 'ffx_cas.h' if not using the non-packed filter function.
|
||
|
// #define CAS_PACKED_ONLY 1
|
||
|
// ...
|
||
|
// layout(local_size_x=64)in;
|
||
|
// void main(){
|
||
|
// // Fetch constants from CasSetup().
|
||
|
// AU4 const0=cb.const0;
|
||
|
// AU4 const1=cb.const1;
|
||
|
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
|
||
|
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
|
||
|
// // Filter.
|
||
|
// AH4 c0,c1;AH2 cR,cG,cB;
|
||
|
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
|
||
|
// // Extra work integrated after CAS would go here.
|
||
|
// ...
|
||
|
// // Suggest only running CasDepack() right before stores, to maintain packed math for any work after CasFilterH().
|
||
|
// CasDepack(c0,c1,cR,cG,cB);
|
||
|
// imageStore(imgDst,ASU2(gxy),AF4(c0));
|
||
|
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));
|
||
|
// gxy.y+=8u;
|
||
|
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
|
||
|
// ...
|
||
|
// CasDepack(c0,c1,cR,cG,cB);
|
||
|
// imageStore(imgDst,ASU2(gxy),AF4(c0));
|
||
|
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));}
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// CAS FILTERING LOGIC
|
||
|
// ===================
|
||
|
// CAS uses the minimal nearest 3x3 source texel window for filtering.
|
||
|
// The filter coefficients are radially symmetric (phase adaptive, computed per pixel based on output pixel center).
|
||
|
// The filter kernel adapts to local contrast (adjusting the negative lobe strength of the filter kernel).
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// CAS INPUT REQUIREMENTS
|
||
|
// ======================
|
||
|
// This is designed to be a linear filter.
|
||
|
// Running CAS on perceptual inputs will yield over-sharpening.
|
||
|
// Input must range between {0 to 1} for each color channel.
|
||
|
// CAS output will be {0 to 1} ranged as well.
|
||
|
// CAS does 5 loads, so any conversion applied during CasLoad() or CasInput() has a 5 load * 3 channel = 15x cost amplifier.
|
||
|
// - So input conversions need to be factored into the prior pass's output.
|
||
|
// - But if necessary use CasInput() instead of CasLoad(), as CasInput() works with packed color.
|
||
|
// - For CAS with scaling the amplifier is 12 load * 3 channel = 36x cost amplifier.
|
||
|
// Any conversion applied to output has a 3x cost amplifier (3 color channels).
|
||
|
// - Output conversions are substantially less expensive.
|
||
|
// Added VALU ops due to conversions will have visible cost as this shader is already quite VALU heavy.
|
||
|
// This filter does not function well on sRGB or gamma 2.2 non-linear data.
|
||
|
// This filter does not function on PQ non-linear data.
|
||
|
// - Due to the shape of PQ, the positive side of the ring created by the negative lobe tends to become over-bright.
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// INPUT FORMAT SPECIFIC CASES
|
||
|
// ===========================
|
||
|
// - FP16 with all non-negative values ranging {0 to 1}.
|
||
|
// - Use as is, filter is designed for linear input and output ranging {0 to 1}.
|
||
|
// ---------------------------
|
||
|
// - UNORM with linear conversion approximation.
|
||
|
// - This could be used for both sRGB or FreeSync2 native (gamma 2.2) cases.
|
||
|
// - Load/store with either 10:10:10:2 UNORM or 8:8:8:8 UNORM (aka VK_FORMAT_R8G8B8A8_UNORM).
|
||
|
// - Use gamma 2.0 conversion in CasInput(), as an approximation.
|
||
|
// - Modifications:
|
||
|
// // Change the CasInput*() function to square the inputs.
|
||
|
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){r*=r;g*=g;b*=b;}
|
||
|
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){r*=r;g*=g;b*=b;}
|
||
|
// ...
|
||
|
// // Do linear to gamma 2.0 before store.
|
||
|
// // Since it will be common to do processing after CAS, the filter function returns linear.
|
||
|
// c.r=sqrt(c.r);c.g=sqrt(c.g);c.b=sqrt(c.b);
|
||
|
// imageStore(imgDst,ASU2(gxy),c);
|
||
|
// ...
|
||
|
// // And for packed.
|
||
|
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
|
||
|
// cR=sqrt(cR);cG=sqrt(cG);cB=sqrt(cB);
|
||
|
// CasDepack(c0,c1,cR,cG,cB);
|
||
|
// imageStore(img[0],ASU2(gxy),AF4(c0));
|
||
|
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
|
||
|
// ---------------------------
|
||
|
// - sRGB with slightly better quality and higher cost.
|
||
|
// - Use texelFetch() with sRGB format (VK_FORMAT_R8G8B8A8_SRGB) for loads (gets linear into shader).
|
||
|
// - Store to destination using UNORM (not sRGB) stores and do the linear to sRGB conversion in the shader.
|
||
|
// - Modifications:
|
||
|
// // Use texel fetch instead of image load (on GCN this will translate into an image load in the driver).
|
||
|
// // Hardware has sRGB to linear on loads (but in API only for read-only, aka texture instead of UAV/image).
|
||
|
// AF3 CasLoad(ASU2 p){return texelFetch(texSrc,p,0).rgb;}
|
||
|
// ...
|
||
|
// // Do linear to sRGB before store (GPU lacking hardware conversion support for linear to sRGB on store).
|
||
|
// c.r=AToSrgbF1(c.r);c.g=AToSrgbF1(c.g);c.b=AToSrgbF1(c.b);
|
||
|
// imageStore(imgDst,ASU2(gxy),c);
|
||
|
// ...
|
||
|
// // And for packed.
|
||
|
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
|
||
|
// cR=AToSrgbH2(cR);cG=AToSrgbH2(cG);cB=AToSrgbH2(cB);
|
||
|
// CasDepack(c0,c1,cR,cG,cB);
|
||
|
// imageStore(img[0],ASU2(gxy),AF4(c0));
|
||
|
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
|
||
|
// ---------------------------
|
||
|
// - HDR10 output via scRGB.
|
||
|
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
|
||
|
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
|
||
|
// - Where 'maxNits' is typically not 10000.
|
||
|
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
|
||
|
// - This can be even as low as 1000 nits on some HDR TVs.
|
||
|
// - After CAS do matrix multiply to take Rec.2020 back to sRGB and multiply by 'maxNits/80.0'.
|
||
|
// - Showing GPU code below to generate constants, likely most need to use CPU code instead.
|
||
|
// - Keeping the GPU code here because it is easier to read in these docs.
|
||
|
// - Can use 'lpm.h' source to generate the conversion matrix for Rec.2020 to sRGB:
|
||
|
// // Output conversion matrix from Rec.2020 to sRGB.
|
||
|
// AF3 conR,conG,conB;
|
||
|
// // Working space temporaries (Rec.2020).
|
||
|
// AF3 rgbToXyzXW;AF3 rgbToXyzYW;AF3 rgbToXyzZW;
|
||
|
// LpmColRgbToXyz(rgbToXyzXW,rgbToXyzYW,rgbToXyzZW,lpmCol2020R,lpmCol2020G,lpmCol2020B,lpmColD65);
|
||
|
// // Output space temporaries (Rec.709, same as sRGB primaries).
|
||
|
// AF3 rgbToXyzXO;AF3 rgbToXyzYO;AF3 rgbToXyzZO;
|
||
|
// LpmColRgbToXyz(rgbToXyzXO,rgbToXyzYO,rgbToXyzZO,lpmCol709R,lpmCol709G,lpmCol709B,lpmColD65);
|
||
|
// AF3 xyzToRgbRO;AF3 xyzToRgbGO;AF3 xyzToRgbBO;
|
||
|
// LpmMatInv3x3(xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXO,rgbToXyzYO,rgbToXyzZO);
|
||
|
// // Generate the matrix.
|
||
|
// LpmMatMul3x3(conR,conG,conB,xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXW,rgbToXyzYW,rgbToXyzZW);
|
||
|
// - Adjust the conversion matrix for the multiply by 'maxNits/80.0'.
|
||
|
// // After this the constants can be stored into a constant buffer.
|
||
|
// AF1 conScale=maxNits*ARcpF1(80.0);
|
||
|
// conR*=conScale;conG*=conScale;conB*=conScale;
|
||
|
// - After CAS do the matrix multiply (passing the fetched constants into the shader).
|
||
|
// outputR=dot(AF3(colorR,colorG,colorB),conR);
|
||
|
// outputG=dot(AF3(colorR,colorG,colorB),conG);
|
||
|
// outputB=dot(AF3(colorR,colorG,colorB),conB);
|
||
|
// - Hopefully no developer is taking scRGB as input to CAS.
|
||
|
// - If that was the case, the conversion matrix from sRGB to Rec.2020 can be built changing the above code.
|
||
|
// - Swap the 'lpmCol709*' and 'lpmCol2020*' inputs to LpmColRgbToXyz().
|
||
|
// - Then scale by '80.0/maxNits' instead of 'maxNits/80.0'.
|
||
|
// ---------------------------
|
||
|
// - HDR10 output via native 10:10:10:2.
|
||
|
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
|
||
|
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
|
||
|
// - Where 'maxNits' is typically not 10000.
|
||
|
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
|
||
|
// - This can be even as low as 1000 nits on some HDR TVs.
|
||
|
// - Hopefully no developer needs to take PQ as input here, but if so can use A to convert PQ to linear:
|
||
|
// // Where 'k0' is a constant of 'maxNits/10000.0'.
|
||
|
// colorR=AFromPqF1(colorR*k0);
|
||
|
// colorG=AFromPqF1(colorG*k0);
|
||
|
// colorB=AFromPqF1(colorB*k0);
|
||
|
// - After CAS convert from linear to PQ.
|
||
|
// // Where 'k1' is a constant of '10000.0/maxNits'.
|
||
|
// colorR=AToPqF1(colorR*k1);
|
||
|
// colorG=AToPqF1(colorG*k1);
|
||
|
// colorB=AToPqF1(colorB*k1);
|
||
|
// ---------------------------
|
||
|
// - Example of a bad idea for CAS input design.
|
||
|
// - Have the pass before CAS store out in 10:10:10:2 UNORM with gamma 2.0.
|
||
|
// - Store the output of CAS with linear to sRGB conversion, or with a gamma 2.2 conversion for FreeSync2 native.
|
||
|
// - This will drop precision because the inputs had been quantized to 10-bit,
|
||
|
// and the output is using a different tonal transform,
|
||
|
// so inputs and outputs won't align for similar values.
|
||
|
// - It might be "ok" for 8-bit/channel CAS output, but definitely not a good idea for 10-bit/channel output.
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// ALGORITHM DESCRIPTION
|
||
|
// =====================
|
||
|
// This describes the algorithm with CAS_BETTER_DIAGONALS defined.
|
||
|
// The default is with CAS_BETTER_DIAGONALS not defined (which is faster).
|
||
|
// Starting with no scaling.
|
||
|
// CAS fetches a 3x3 neighborhood around the pixel 'e',
|
||
|
// a b c
|
||
|
// d(e)f
|
||
|
// g h i
|
||
|
// It then computes a 'soft' minimum and maximum,
|
||
|
// a b c b
|
||
|
// d e f * 0.5 + d e f * 0.5
|
||
|
// g h i h
|
||
|
// The minimum and maximums give an idea of local contrast.
|
||
|
// --- 1.0 ^
|
||
|
// | | <-- This minimum distance to the signal limit is divided by MAX to get a base sharpening amount 'A'.
|
||
|
// --- MAX v
|
||
|
// |
|
||
|
// |
|
||
|
// --- MIN ^
|
||
|
// | | <-- The MIN side is more distant in this example so it is not used, but for dark colors it would be used.
|
||
|
// | |
|
||
|
// --- 0.0 v
|
||
|
// The base sharpening amount 'A' from above is shaped with a sqrt().
|
||
|
// This 'A' ranges from 0 := no sharpening, to 1 := full sharpening.
|
||
|
// Then 'A' is scaled by the sharpness knob while being transformed to a negative lobe (values from -1/5 to -1/8 for A=1).
|
||
|
// The final filter kernel looks like this,
|
||
|
// 0 A 0
|
||
|
// A 1 A <-- Center is always 1.0, followed by the negative lobe 'A' in a ring, and windowed into a circle with the 0.0s.
|
||
|
// 0 A 0
|
||
|
// The local neighborhood is then multiplied by the kernel weights, summed and divided by the sum of the kernel weights.
|
||
|
// The high quality path computes filter weights per channel.
|
||
|
// The low quality path uses the green channel's filter weights to compute the 'A' factor for all channels.
|
||
|
// ---------------------
|
||
|
// The scaling path is a little more complex.
|
||
|
// It starts by fetching the 4x4 neighborhood around the pixel centered between centers of pixels {f,g,j,k},
|
||
|
// a b c d
|
||
|
// e(f g)h
|
||
|
// i(j k)l
|
||
|
// m n o p
|
||
|
// The algorithm then computes the no-scaling result for {f,g,j,k}.
|
||
|
// It then interpolates between those no-scaling results.
|
||
|
// The interpolation is adaptive.
|
||
|
// To hide bilinear interpolation and restore diagonals, it weights bilinear weights by 1/(const+contrast).
|
||
|
// Where 'contrast' is the soft 'max-min'.
|
||
|
// This makes edges thin out a little.
|
||
|
// ---------------------
|
||
|
// Without CAS_BETTER_DIAGONALS defined, the algorithm is a little faster.
|
||
|
// Instead of using the 3x3 "box" with the 5-tap "circle" this uses just the "circle".
|
||
|
// Drops to 5 texture fetches for no-scaling.
|
||
|
// Drops to 12 texture fetches for scaling.
|
||
|
// Drops a bunch of math.
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// IDEAS FOR FUTURE
|
||
|
// ================
|
||
|
// - Avoid V_CVT's by using denormals.
|
||
|
// - Manually pack FP16 literals.
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// CHANGE LOG
|
||
|
// ==========
|
||
|
// 20190610 - Misc documentation cleanup.
|
||
|
// 20190609 - Removed lowQuality bool, improved scaling logic.
|
||
|
// 20190530 - Unified CPU/GPU setup code, using new ffx_a.h, faster, define CAS_BETTER_DIAGONALS to get older slower one.
|
||
|
// 20190529 - Missing a good way to re-interpret packed in HLSL, so disabling approximation optimizations for now.
|
||
|
// 20190528 - Fixed so GPU CasSetup() generates half data all the time.
|
||
|
// 20190527 - Implemented approximations for rcp() and sqrt().
|
||
|
// 20190524 - New algorithm, adjustable sharpness, scaling to 4x area. Fixed checker debug for no-scaling only.
|
||
|
// 20190521 - Updated file naming.
|
||
|
// 20190516 - Updated docs, fixed workaround, fixed no-scaling quality issue, removed gamma2 and generalized as CasInput*().
|
||
|
// 20190510 - Made the dispatch example safely round up for images that are not a multiple of 16x16.
|
||
|
// 20190507 - Fixed typo bug in CAS_DEBUG_CHECKER, fixed sign typo in the docs.
|
||
|
// 20190503 - Setup temporary workaround for compiler bug.
|
||
|
// 20190502 - Added argument for 'gamma2' path so input transform in that case runs packed.
|
||
|
// 20190426 - Improved documentation on format specific cases, etc.
|
||
|
// 20190425 - Updated/corrected documentation.
|
||
|
// 20190405 - Added CAS_PACKED_ONLY, misc bug fixes.
|
||
|
// 20190404 - Updated for the new a.h header.
|
||
|
//==============================================================================================================================
|
||
|
// This is the practical limit for the algorithm's scaling ability (quality is limited by 3x3 taps). Example resolutions,
|
||
|
// 1280x720 -> 1080p = 2.25x area
|
||
|
// 1536x864 -> 1080p = 1.56x area
|
||
|
// 1792x1008 -> 1440p = 2.04x area
|
||
|
// 1920x1080 -> 1440p = 1.78x area
|
||
|
// 1920x1080 -> 4K = 4.0x area
|
||
|
// 2048x1152 -> 1440p = 1.56x area
|
||
|
// 2560x1440 -> 4K = 2.25x area
|
||
|
// 3072x1728 -> 4K = 1.56x area
|
||
|
// Largest output-to-input area ratio that CasSupportScaling() accepts.
#define CAS_AREA_LIMIT 4.0
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// Pass in output and input resolution in pixels.
|
||
|
// This returns true if CAS supports scaling in the given configuration.
|
||
|
AP1 CasSupportScaling(AF1 outX,AF1 outY,AF1 inX,AF1 inY){
 // The limit is expressed as an area ratio, so work with pixel areas rather than per-axis factors.
 AF1 outArea=outX*outY;
 AF1 inArea=inX*inY;
 // Output area times the reciprocal of the input area gives the output/input area ratio.
 AF1 areaRatio=outArea*ARcpF1(inArea);
 // Supported when the ratio does not exceed CAS_AREA_LIMIT.
 return areaRatio<=CAS_AREA_LIMIT;}
|
||
|
//==============================================================================================================================
|
||
|
// Call to setup required constant values (works on CPU or GPU).
|
||
|
A_STATIC void CasSetup(
|
||
|
outAU4 const0,
|
||
|
outAU4 const1,
|
||
|
AF1 sharpness, // 0 := default (lower ringing), 1 := maximum (higest ringing)
|
||
|
AF1 inputSizeInPixelsX,
|
||
|
AF1 inputSizeInPixelsY,
|
||
|
AF1 outputSizeInPixelsX,
|
||
|
AF1 outputSizeInPixelsY){
|
||
|
// Scaling terms.
|
||
|
const0[0]=AU1_AF1(inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX));
|
||
|
const0[1]=AU1_AF1(inputSizeInPixelsY*ARcpF1(outputSizeInPixelsY));
|
||
|
const0[2]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5));
|
||
|
const0[3]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5));
|
||
|
// Sharpness value.
|
||
|
AF1 sharp=-ARcpF1(ALerpF1(8.0,5.0,ASatF1(sharpness)));
|
||
|
varAF2(hSharp)=initAF2(sharp,0.0);
|
||
|
const1[0]=AU1_AF1(sharp);
|
||
|
const1[1]=AU1_AH2_AF2(hSharp);
|
||
|
const1[2]=AU1_AF1(AF1_(8.0)*inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX));
|
||
|
const1[3]=0u;}
|
||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||
|
//_____________________________________________________________/\_______________________________________________________________
|
||
|
//==============================================================================================================================
|
||
|
// NON-PACKED VERSION
|
||
|
//==============================================================================================================================
|
||
|
#ifdef A_GPU
|
||
|
#if defined(CAS_PACKED_ONLY)
 // With CAS_PACKED_ONLY the 32-bit callbacks are not user-supplied; define placeholders so
 // CasFilter() below still has something to reference and the shader compiles.
 AF3 CasLoad(ASU2 p){
  // Placeholder load: ignores the coordinate and returns all zeros.
  return AF3(0.0,0.0,0.0);}
 // Placeholder input transform: leaves r, g and b untouched.
 void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
#endif
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
void CasFilter(
|
||
|
out AF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy.
|
||
|
out AF1 pixG,
|
||
|
out AF1 pixB,
|
||
|
AU2 ip, // Integer pixel position in output.
|
||
|
AU4 const0, // Constants generated by CasSetup().
|
||
|
AU4 const1,
|
||
|
AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// Debug a checker pattern of on/off tiles for visual inspection.
|
||
|
#ifdef CAS_DEBUG_CHECKER
|
||
|
if((((ip.x^ip.y)>>8u)&1u)==0u){AF3 pix0=CasLoad(ASU2(ip));
|
||
|
pixR=pix0.r;pixG=pix0.g;pixB=pix0.b;CasInput(pixR,pixG,pixB);return;}
|
||
|
#endif
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
|
||
|
if(noScaling){
|
||
|
// a b c
|
||
|
// d e f
|
||
|
// g h i
|
||
|
ASU2 sp=ASU2(ip);
|
||
|
AF3 a=CasLoad(sp+ASU2(-1,-1));
|
||
|
AF3 b=CasLoad(sp+ASU2( 0,-1));
|
||
|
AF3 c=CasLoad(sp+ASU2( 1,-1));
|
||
|
AF3 d=CasLoad(sp+ASU2(-1, 0));
|
||
|
AF3 e=CasLoad(sp);
|
||
|
AF3 f=CasLoad(sp+ASU2( 1, 0));
|
||
|
AF3 g=CasLoad(sp+ASU2(-1, 1));
|
||
|
AF3 h=CasLoad(sp+ASU2( 0, 1));
|
||
|
AF3 i=CasLoad(sp+ASU2( 1, 1));
|
||
|
// Run optional input transform.
|
||
|
CasInput(a.r,a.g,a.b);
|
||
|
CasInput(b.r,b.g,b.b);
|
||
|
CasInput(c.r,c.g,c.b);
|
||
|
CasInput(d.r,d.g,d.b);
|
||
|
CasInput(e.r,e.g,e.b);
|
||
|
CasInput(f.r,f.g,f.b);
|
||
|
CasInput(g.r,g.g,g.b);
|
||
|
CasInput(h.r,h.g,h.b);
|
||
|
CasInput(i.r,i.g,i.b);
|
||
|
// Soft min and max.
|
||
|
// a b c b
|
||
|
// d e f * 0.5 + d e f * 0.5
|
||
|
// g h i h
|
||
|
// These are 2.0x bigger (factored out the extra multiply).
|
||
|
AF1 mnR=AMin3F1(AMin3F1(d.r,e.r,f.r),b.r,h.r);
|
||
|
AF1 mnG=AMin3F1(AMin3F1(d.g,e.g,f.g),b.g,h.g);
|
||
|
AF1 mnB=AMin3F1(AMin3F1(d.b,e.b,f.b),b.b,h.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mnR2=AMin3F1(AMin3F1(mnR,a.r,c.r),g.r,i.r);
|
||
|
AF1 mnG2=AMin3F1(AMin3F1(mnG,a.g,c.g),g.g,i.g);
|
||
|
AF1 mnB2=AMin3F1(AMin3F1(mnB,a.b,c.b),g.b,i.b);
|
||
|
mnR=mnR+mnR2;
|
||
|
mnG=mnG+mnG2;
|
||
|
mnB=mnB+mnB2;
|
||
|
#endif
|
||
|
AF1 mxR=AMax3F1(AMax3F1(d.r,e.r,f.r),b.r,h.r);
|
||
|
AF1 mxG=AMax3F1(AMax3F1(d.g,e.g,f.g),b.g,h.g);
|
||
|
AF1 mxB=AMax3F1(AMax3F1(d.b,e.b,f.b),b.b,h.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mxR2=AMax3F1(AMax3F1(mxR,a.r,c.r),g.r,i.r);
|
||
|
AF1 mxG2=AMax3F1(AMax3F1(mxG,a.g,c.g),g.g,i.g);
|
||
|
AF1 mxB2=AMax3F1(AMax3F1(mxB,a.b,c.b),g.b,i.b);
|
||
|
mxR=mxR+mxR2;
|
||
|
mxG=mxG+mxG2;
|
||
|
mxB=mxB+mxB2;
|
||
|
#endif
|
||
|
// Smooth minimum distance to signal limit divided by smooth max.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AF1 rcpMR=ARcpF1(mxR);
|
||
|
AF1 rcpMG=ARcpF1(mxG);
|
||
|
AF1 rcpMB=ARcpF1(mxB);
|
||
|
#else
|
||
|
AF1 rcpMR=APrxLoRcpF1(mxR);
|
||
|
AF1 rcpMG=APrxLoRcpF1(mxG);
|
||
|
AF1 rcpMB=APrxLoRcpF1(mxB);
|
||
|
#endif
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 ampR=ASatF1(min(mnR,AF1_(2.0)-mxR)*rcpMR);
|
||
|
AF1 ampG=ASatF1(min(mnG,AF1_(2.0)-mxG)*rcpMG);
|
||
|
AF1 ampB=ASatF1(min(mnB,AF1_(2.0)-mxB)*rcpMB);
|
||
|
#else
|
||
|
AF1 ampR=ASatF1(min(mnR,AF1_(1.0)-mxR)*rcpMR);
|
||
|
AF1 ampG=ASatF1(min(mnG,AF1_(1.0)-mxG)*rcpMG);
|
||
|
AF1 ampB=ASatF1(min(mnB,AF1_(1.0)-mxB)*rcpMB);
|
||
|
#endif
|
||
|
// Shaping amount of sharpening.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
ampR=sqrt(ampR);
|
||
|
ampG=sqrt(ampG);
|
||
|
ampB=sqrt(ampB);
|
||
|
#else
|
||
|
ampR=APrxLoSqrtF1(ampR);
|
||
|
ampG=APrxLoSqrtF1(ampG);
|
||
|
ampB=APrxLoSqrtF1(ampB);
|
||
|
#endif
|
||
|
// Filter shape.
|
||
|
// 0 w 0
|
||
|
// w 1 w
|
||
|
// 0 w 0
|
||
|
AF1 peak=AF1_AU1(const1.x);
|
||
|
AF1 wR=ampR*peak;
|
||
|
AF1 wG=ampG*peak;
|
||
|
AF1 wB=ampB*peak;
|
||
|
// Filter.
|
||
|
#ifndef CAS_SLOW
|
||
|
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AF1 rcpWeight=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG);
|
||
|
#else
|
||
|
AF1 rcpWeight=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG);
|
||
|
#endif
|
||
|
pixR=ASatF1((b.r*wG+d.r*wG+f.r*wG+h.r*wG+e.r)*rcpWeight);
|
||
|
pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeight);
|
||
|
pixB=ASatF1((b.b*wG+d.b*wG+f.b*wG+h.b*wG+e.b)*rcpWeight);
|
||
|
#else
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AF1 rcpWeightR=ARcpF1(AF1_(1.0)+AF1_(4.0)*wR);
|
||
|
AF1 rcpWeightG=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG);
|
||
|
AF1 rcpWeightB=ARcpF1(AF1_(1.0)+AF1_(4.0)*wB);
|
||
|
#else
|
||
|
AF1 rcpWeightR=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wR);
|
||
|
AF1 rcpWeightG=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG);
|
||
|
AF1 rcpWeightB=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wB);
|
||
|
#endif
|
||
|
pixR=ASatF1((b.r*wR+d.r*wR+f.r*wR+h.r*wR+e.r)*rcpWeightR);
|
||
|
pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeightG);
|
||
|
pixB=ASatF1((b.b*wB+d.b*wB+f.b*wB+h.b*wB+e.b)*rcpWeightB);
|
||
|
#endif
|
||
|
return;}
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
|
||
|
// a b c d
|
||
|
// e f g h
|
||
|
// i j k l
|
||
|
// m n o p
|
||
|
// Working these 4 results.
|
||
|
// +-----+-----+
|
||
|
// | | |
|
||
|
// | f..|..g |
|
||
|
// | . | . |
|
||
|
// +-----+-----+
|
||
|
// | . | . |
|
||
|
// | j..|..k |
|
||
|
// | | |
|
||
|
// +-----+-----+
|
||
|
AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw);
|
||
|
AF2 fp=floor(pp);
|
||
|
pp-=fp;
|
||
|
ASU2 sp=ASU2(fp);
|
||
|
AF3 a=CasLoad(sp+ASU2(-1,-1));
|
||
|
AF3 b=CasLoad(sp+ASU2( 0,-1));
|
||
|
AF3 e=CasLoad(sp+ASU2(-1, 0));
|
||
|
AF3 f=CasLoad(sp);
|
||
|
AF3 c=CasLoad(sp+ASU2( 1,-1));
|
||
|
AF3 d=CasLoad(sp+ASU2( 2,-1));
|
||
|
AF3 g=CasLoad(sp+ASU2( 1, 0));
|
||
|
AF3 h=CasLoad(sp+ASU2( 2, 0));
|
||
|
AF3 i=CasLoad(sp+ASU2(-1, 1));
|
||
|
AF3 j=CasLoad(sp+ASU2( 0, 1));
|
||
|
AF3 m=CasLoad(sp+ASU2(-1, 2));
|
||
|
AF3 n=CasLoad(sp+ASU2( 0, 2));
|
||
|
AF3 k=CasLoad(sp+ASU2( 1, 1));
|
||
|
AF3 l=CasLoad(sp+ASU2( 2, 1));
|
||
|
AF3 o=CasLoad(sp+ASU2( 1, 2));
|
||
|
AF3 p=CasLoad(sp+ASU2( 2, 2));
|
||
|
// Run optional input transform.
|
||
|
CasInput(a.r,a.g,a.b);
|
||
|
CasInput(b.r,b.g,b.b);
|
||
|
CasInput(c.r,c.g,c.b);
|
||
|
CasInput(d.r,d.g,d.b);
|
||
|
CasInput(e.r,e.g,e.b);
|
||
|
CasInput(f.r,f.g,f.b);
|
||
|
CasInput(g.r,g.g,g.b);
|
||
|
CasInput(h.r,h.g,h.b);
|
||
|
CasInput(i.r,i.g,i.b);
|
||
|
CasInput(j.r,j.g,j.b);
|
||
|
CasInput(k.r,k.g,k.b);
|
||
|
CasInput(l.r,l.g,l.b);
|
||
|
CasInput(m.r,m.g,m.b);
|
||
|
CasInput(n.r,n.g,n.b);
|
||
|
CasInput(o.r,o.g,o.b);
|
||
|
CasInput(p.r,p.g,p.b);
|
||
|
// Soft min and max.
|
||
|
// These are 2.0x bigger (factored out the extra multiply).
|
||
|
// a b c b
|
||
|
// e f g * 0.5 + e f g * 0.5 [F]
|
||
|
// i j k j
|
||
|
AF1 mnfR=AMin3F1(AMin3F1(b.r,e.r,f.r),g.r,j.r);
|
||
|
AF1 mnfG=AMin3F1(AMin3F1(b.g,e.g,f.g),g.g,j.g);
|
||
|
AF1 mnfB=AMin3F1(AMin3F1(b.b,e.b,f.b),g.b,j.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mnfR2=AMin3F1(AMin3F1(mnfR,a.r,c.r),i.r,k.r);
|
||
|
AF1 mnfG2=AMin3F1(AMin3F1(mnfG,a.g,c.g),i.g,k.g);
|
||
|
AF1 mnfB2=AMin3F1(AMin3F1(mnfB,a.b,c.b),i.b,k.b);
|
||
|
mnfR=mnfR+mnfR2;
|
||
|
mnfG=mnfG+mnfG2;
|
||
|
mnfB=mnfB+mnfB2;
|
||
|
#endif
|
||
|
AF1 mxfR=AMax3F1(AMax3F1(b.r,e.r,f.r),g.r,j.r);
|
||
|
AF1 mxfG=AMax3F1(AMax3F1(b.g,e.g,f.g),g.g,j.g);
|
||
|
AF1 mxfB=AMax3F1(AMax3F1(b.b,e.b,f.b),g.b,j.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mxfR2=AMax3F1(AMax3F1(mxfR,a.r,c.r),i.r,k.r);
|
||
|
AF1 mxfG2=AMax3F1(AMax3F1(mxfG,a.g,c.g),i.g,k.g);
|
||
|
AF1 mxfB2=AMax3F1(AMax3F1(mxfB,a.b,c.b),i.b,k.b);
|
||
|
mxfR=mxfR+mxfR2;
|
||
|
mxfG=mxfG+mxfG2;
|
||
|
mxfB=mxfB+mxfB2;
|
||
|
#endif
|
||
|
// b c d c
|
||
|
// f g h * 0.5 + f g h * 0.5 [G]
|
||
|
// j k l k
|
||
|
AF1 mngR=AMin3F1(AMin3F1(c.r,f.r,g.r),h.r,k.r);
|
||
|
AF1 mngG=AMin3F1(AMin3F1(c.g,f.g,g.g),h.g,k.g);
|
||
|
AF1 mngB=AMin3F1(AMin3F1(c.b,f.b,g.b),h.b,k.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mngR2=AMin3F1(AMin3F1(mngR,b.r,d.r),j.r,l.r);
|
||
|
AF1 mngG2=AMin3F1(AMin3F1(mngG,b.g,d.g),j.g,l.g);
|
||
|
AF1 mngB2=AMin3F1(AMin3F1(mngB,b.b,d.b),j.b,l.b);
|
||
|
mngR=mngR+mngR2;
|
||
|
mngG=mngG+mngG2;
|
||
|
mngB=mngB+mngB2;
|
||
|
#endif
|
||
|
AF1 mxgR=AMax3F1(AMax3F1(c.r,f.r,g.r),h.r,k.r);
|
||
|
AF1 mxgG=AMax3F1(AMax3F1(c.g,f.g,g.g),h.g,k.g);
|
||
|
AF1 mxgB=AMax3F1(AMax3F1(c.b,f.b,g.b),h.b,k.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mxgR2=AMax3F1(AMax3F1(mxgR,b.r,d.r),j.r,l.r);
|
||
|
AF1 mxgG2=AMax3F1(AMax3F1(mxgG,b.g,d.g),j.g,l.g);
|
||
|
AF1 mxgB2=AMax3F1(AMax3F1(mxgB,b.b,d.b),j.b,l.b);
|
||
|
mxgR=mxgR+mxgR2;
|
||
|
mxgG=mxgG+mxgG2;
|
||
|
mxgB=mxgB+mxgB2;
|
||
|
#endif
|
||
|
// e f g f
|
||
|
// i j k * 0.5 + i j k * 0.5 [J]
|
||
|
// m n o n
|
||
|
AF1 mnjR=AMin3F1(AMin3F1(f.r,i.r,j.r),k.r,n.r);
|
||
|
AF1 mnjG=AMin3F1(AMin3F1(f.g,i.g,j.g),k.g,n.g);
|
||
|
AF1 mnjB=AMin3F1(AMin3F1(f.b,i.b,j.b),k.b,n.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mnjR2=AMin3F1(AMin3F1(mnjR,e.r,g.r),m.r,o.r);
|
||
|
AF1 mnjG2=AMin3F1(AMin3F1(mnjG,e.g,g.g),m.g,o.g);
|
||
|
AF1 mnjB2=AMin3F1(AMin3F1(mnjB,e.b,g.b),m.b,o.b);
|
||
|
mnjR=mnjR+mnjR2;
|
||
|
mnjG=mnjG+mnjG2;
|
||
|
mnjB=mnjB+mnjB2;
|
||
|
#endif
|
||
|
AF1 mxjR=AMax3F1(AMax3F1(f.r,i.r,j.r),k.r,n.r);
|
||
|
AF1 mxjG=AMax3F1(AMax3F1(f.g,i.g,j.g),k.g,n.g);
|
||
|
AF1 mxjB=AMax3F1(AMax3F1(f.b,i.b,j.b),k.b,n.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mxjR2=AMax3F1(AMax3F1(mxjR,e.r,g.r),m.r,o.r);
|
||
|
AF1 mxjG2=AMax3F1(AMax3F1(mxjG,e.g,g.g),m.g,o.g);
|
||
|
AF1 mxjB2=AMax3F1(AMax3F1(mxjB,e.b,g.b),m.b,o.b);
|
||
|
mxjR=mxjR+mxjR2;
|
||
|
mxjG=mxjG+mxjG2;
|
||
|
mxjB=mxjB+mxjB2;
|
||
|
#endif
|
||
|
// f g h g
|
||
|
// j k l * 0.5 + j k l * 0.5 [K]
|
||
|
// n o p o
|
||
|
AF1 mnkR=AMin3F1(AMin3F1(g.r,j.r,k.r),l.r,o.r);
|
||
|
AF1 mnkG=AMin3F1(AMin3F1(g.g,j.g,k.g),l.g,o.g);
|
||
|
AF1 mnkB=AMin3F1(AMin3F1(g.b,j.b,k.b),l.b,o.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mnkR2=AMin3F1(AMin3F1(mnkR,f.r,h.r),n.r,p.r);
|
||
|
AF1 mnkG2=AMin3F1(AMin3F1(mnkG,f.g,h.g),n.g,p.g);
|
||
|
AF1 mnkB2=AMin3F1(AMin3F1(mnkB,f.b,h.b),n.b,p.b);
|
||
|
mnkR=mnkR+mnkR2;
|
||
|
mnkG=mnkG+mnkG2;
|
||
|
mnkB=mnkB+mnkB2;
|
||
|
#endif
|
||
|
AF1 mxkR=AMax3F1(AMax3F1(g.r,j.r,k.r),l.r,o.r);
|
||
|
AF1 mxkG=AMax3F1(AMax3F1(g.g,j.g,k.g),l.g,o.g);
|
||
|
AF1 mxkB=AMax3F1(AMax3F1(g.b,j.b,k.b),l.b,o.b);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 mxkR2=AMax3F1(AMax3F1(mxkR,f.r,h.r),n.r,p.r);
|
||
|
AF1 mxkG2=AMax3F1(AMax3F1(mxkG,f.g,h.g),n.g,p.g);
|
||
|
AF1 mxkB2=AMax3F1(AMax3F1(mxkB,f.b,h.b),n.b,p.b);
|
||
|
mxkR=mxkR+mxkR2;
|
||
|
mxkG=mxkG+mxkG2;
|
||
|
mxkB=mxkB+mxkB2;
|
||
|
#endif
|
||
|
// Smooth minimum distance to signal limit divided by smooth max.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AF1 rcpMfR=ARcpF1(mxfR);
|
||
|
AF1 rcpMfG=ARcpF1(mxfG);
|
||
|
AF1 rcpMfB=ARcpF1(mxfB);
|
||
|
AF1 rcpMgR=ARcpF1(mxgR);
|
||
|
AF1 rcpMgG=ARcpF1(mxgG);
|
||
|
AF1 rcpMgB=ARcpF1(mxgB);
|
||
|
AF1 rcpMjR=ARcpF1(mxjR);
|
||
|
AF1 rcpMjG=ARcpF1(mxjG);
|
||
|
AF1 rcpMjB=ARcpF1(mxjB);
|
||
|
AF1 rcpMkR=ARcpF1(mxkR);
|
||
|
AF1 rcpMkG=ARcpF1(mxkG);
|
||
|
AF1 rcpMkB=ARcpF1(mxkB);
|
||
|
#else
|
||
|
AF1 rcpMfR=APrxLoRcpF1(mxfR);
|
||
|
AF1 rcpMfG=APrxLoRcpF1(mxfG);
|
||
|
AF1 rcpMfB=APrxLoRcpF1(mxfB);
|
||
|
AF1 rcpMgR=APrxLoRcpF1(mxgR);
|
||
|
AF1 rcpMgG=APrxLoRcpF1(mxgG);
|
||
|
AF1 rcpMgB=APrxLoRcpF1(mxgB);
|
||
|
AF1 rcpMjR=APrxLoRcpF1(mxjR);
|
||
|
AF1 rcpMjG=APrxLoRcpF1(mxjG);
|
||
|
AF1 rcpMjB=APrxLoRcpF1(mxjB);
|
||
|
AF1 rcpMkR=APrxLoRcpF1(mxkR);
|
||
|
AF1 rcpMkG=APrxLoRcpF1(mxkG);
|
||
|
AF1 rcpMkB=APrxLoRcpF1(mxkB);
|
||
|
#endif
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AF1 ampfR=ASatF1(min(mnfR,AF1_(2.0)-mxfR)*rcpMfR);
|
||
|
AF1 ampfG=ASatF1(min(mnfG,AF1_(2.0)-mxfG)*rcpMfG);
|
||
|
AF1 ampfB=ASatF1(min(mnfB,AF1_(2.0)-mxfB)*rcpMfB);
|
||
|
AF1 ampgR=ASatF1(min(mngR,AF1_(2.0)-mxgR)*rcpMgR);
|
||
|
AF1 ampgG=ASatF1(min(mngG,AF1_(2.0)-mxgG)*rcpMgG);
|
||
|
AF1 ampgB=ASatF1(min(mngB,AF1_(2.0)-mxgB)*rcpMgB);
|
||
|
AF1 ampjR=ASatF1(min(mnjR,AF1_(2.0)-mxjR)*rcpMjR);
|
||
|
AF1 ampjG=ASatF1(min(mnjG,AF1_(2.0)-mxjG)*rcpMjG);
|
||
|
AF1 ampjB=ASatF1(min(mnjB,AF1_(2.0)-mxjB)*rcpMjB);
|
||
|
AF1 ampkR=ASatF1(min(mnkR,AF1_(2.0)-mxkR)*rcpMkR);
|
||
|
AF1 ampkG=ASatF1(min(mnkG,AF1_(2.0)-mxkG)*rcpMkG);
|
||
|
AF1 ampkB=ASatF1(min(mnkB,AF1_(2.0)-mxkB)*rcpMkB);
|
||
|
#else
|
||
|
AF1 ampfR=ASatF1(min(mnfR,AF1_(1.0)-mxfR)*rcpMfR);
|
||
|
AF1 ampfG=ASatF1(min(mnfG,AF1_(1.0)-mxfG)*rcpMfG);
|
||
|
AF1 ampfB=ASatF1(min(mnfB,AF1_(1.0)-mxfB)*rcpMfB);
|
||
|
AF1 ampgR=ASatF1(min(mngR,AF1_(1.0)-mxgR)*rcpMgR);
|
||
|
AF1 ampgG=ASatF1(min(mngG,AF1_(1.0)-mxgG)*rcpMgG);
|
||
|
AF1 ampgB=ASatF1(min(mngB,AF1_(1.0)-mxgB)*rcpMgB);
|
||
|
AF1 ampjR=ASatF1(min(mnjR,AF1_(1.0)-mxjR)*rcpMjR);
|
||
|
AF1 ampjG=ASatF1(min(mnjG,AF1_(1.0)-mxjG)*rcpMjG);
|
||
|
AF1 ampjB=ASatF1(min(mnjB,AF1_(1.0)-mxjB)*rcpMjB);
|
||
|
AF1 ampkR=ASatF1(min(mnkR,AF1_(1.0)-mxkR)*rcpMkR);
|
||
|
AF1 ampkG=ASatF1(min(mnkG,AF1_(1.0)-mxkG)*rcpMkG);
|
||
|
AF1 ampkB=ASatF1(min(mnkB,AF1_(1.0)-mxkB)*rcpMkB);
|
||
|
#endif
|
||
|
// Shaping amount of sharpening.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
ampfR=sqrt(ampfR);
|
||
|
ampfG=sqrt(ampfG);
|
||
|
ampfB=sqrt(ampfB);
|
||
|
ampgR=sqrt(ampgR);
|
||
|
ampgG=sqrt(ampgG);
|
||
|
ampgB=sqrt(ampgB);
|
||
|
ampjR=sqrt(ampjR);
|
||
|
ampjG=sqrt(ampjG);
|
||
|
ampjB=sqrt(ampjB);
|
||
|
ampkR=sqrt(ampkR);
|
||
|
ampkG=sqrt(ampkG);
|
||
|
ampkB=sqrt(ampkB);
|
||
|
#else
|
||
|
ampfR=APrxLoSqrtF1(ampfR);
|
||
|
ampfG=APrxLoSqrtF1(ampfG);
|
||
|
ampfB=APrxLoSqrtF1(ampfB);
|
||
|
ampgR=APrxLoSqrtF1(ampgR);
|
||
|
ampgG=APrxLoSqrtF1(ampgG);
|
||
|
ampgB=APrxLoSqrtF1(ampgB);
|
||
|
ampjR=APrxLoSqrtF1(ampjR);
|
||
|
ampjG=APrxLoSqrtF1(ampjG);
|
||
|
ampjB=APrxLoSqrtF1(ampjB);
|
||
|
ampkR=APrxLoSqrtF1(ampkR);
|
||
|
ampkG=APrxLoSqrtF1(ampkG);
|
||
|
ampkB=APrxLoSqrtF1(ampkB);
|
||
|
#endif
|
||
|
// Filter shape.
|
||
|
// 0 w 0
|
||
|
// w 1 w
|
||
|
// 0 w 0
|
||
|
AF1 peak=AF1_AU1(const1.x);
|
||
|
AF1 wfR=ampfR*peak;
|
||
|
AF1 wfG=ampfG*peak;
|
||
|
AF1 wfB=ampfB*peak;
|
||
|
AF1 wgR=ampgR*peak;
|
||
|
AF1 wgG=ampgG*peak;
|
||
|
AF1 wgB=ampgB*peak;
|
||
|
AF1 wjR=ampjR*peak;
|
||
|
AF1 wjG=ampjG*peak;
|
||
|
AF1 wjB=ampjB*peak;
|
||
|
AF1 wkR=ampkR*peak;
|
||
|
AF1 wkG=ampkG*peak;
|
||
|
AF1 wkB=ampkB*peak;
|
||
|
// Blend between 4 results.
|
||
|
// s t
|
||
|
// u v
|
||
|
AF1 s=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
|
||
|
AF1 t= pp.x *(AF1_(1.0)-pp.y);
|
||
|
AF1 u=(AF1_(1.0)-pp.x)* pp.y ;
|
||
|
AF1 v= pp.x * pp.y ;
|
||
|
// Thin edges to hide bilinear interpolation (helps diagonals).
|
||
|
AF1 thinB=1.0/32.0;
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
s*=ARcpF1(thinB+(mxfG-mnfG));
|
||
|
t*=ARcpF1(thinB+(mxgG-mngG));
|
||
|
u*=ARcpF1(thinB+(mxjG-mnjG));
|
||
|
v*=ARcpF1(thinB+(mxkG-mnkG));
|
||
|
#else
|
||
|
s*=APrxLoRcpF1(thinB+(mxfG-mnfG));
|
||
|
t*=APrxLoRcpF1(thinB+(mxgG-mngG));
|
||
|
u*=APrxLoRcpF1(thinB+(mxjG-mnjG));
|
||
|
v*=APrxLoRcpF1(thinB+(mxkG-mnkG));
|
||
|
#endif
|
||
|
// Final weighting.
|
||
|
// b c
|
||
|
// e f g h
|
||
|
// i j k l
|
||
|
// n o
|
||
|
// _____ _____ _____ _____
|
||
|
// fs gt
|
||
|
//
|
||
|
// _____ _____ _____ _____
|
||
|
// fs s gt fs t gt
|
||
|
// ju kv
|
||
|
// _____ _____ _____ _____
|
||
|
// fs gt
|
||
|
// ju u kv ju v kv
|
||
|
// _____ _____ _____ _____
|
||
|
//
|
||
|
// ju kv
|
||
|
AF1 qbeR=wfR*s;
|
||
|
AF1 qbeG=wfG*s;
|
||
|
AF1 qbeB=wfB*s;
|
||
|
AF1 qchR=wgR*t;
|
||
|
AF1 qchG=wgG*t;
|
||
|
AF1 qchB=wgB*t;
|
||
|
AF1 qfR=wgR*t+wjR*u+s;
|
||
|
AF1 qfG=wgG*t+wjG*u+s;
|
||
|
AF1 qfB=wgB*t+wjB*u+s;
|
||
|
AF1 qgR=wfR*s+wkR*v+t;
|
||
|
AF1 qgG=wfG*s+wkG*v+t;
|
||
|
AF1 qgB=wfB*s+wkB*v+t;
|
||
|
AF1 qjR=wfR*s+wkR*v+u;
|
||
|
AF1 qjG=wfG*s+wkG*v+u;
|
||
|
AF1 qjB=wfB*s+wkB*v+u;
|
||
|
AF1 qkR=wgR*t+wjR*u+v;
|
||
|
AF1 qkG=wgG*t+wjG*u+v;
|
||
|
AF1 qkB=wgB*t+wjB*u+v;
|
||
|
AF1 qinR=wjR*u;
|
||
|
AF1 qinG=wjG*u;
|
||
|
AF1 qinB=wjB*u;
|
||
|
AF1 qloR=wkR*v;
|
||
|
AF1 qloG=wkG*v;
|
||
|
AF1 qloB=wkB*v;
|
||
|
// Filter.
|
||
|
#ifndef CAS_SLOW
|
||
|
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
#else
|
||
|
AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
#endif
|
||
|
pixR=ASatF1((b.r*qbeG+e.r*qbeG+c.r*qchG+h.r*qchG+i.r*qinG+n.r*qinG+l.r*qloG+o.r*qloG+f.r*qfG+g.r*qgG+j.r*qjG+k.r*qkG)*rcpWG);
|
||
|
pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG);
|
||
|
pixB=ASatF1((b.b*qbeG+e.b*qbeG+c.b*qchG+h.b*qchG+i.b*qinG+n.b*qinG+l.b*qloG+o.b*qloG+f.b*qfG+g.b*qgG+j.b*qjG+k.b*qkG)*rcpWG);
|
||
|
#else
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AF1 rcpWR=ARcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR);
|
||
|
AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
AF1 rcpWB=ARcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB);
|
||
|
#else
|
||
|
AF1 rcpWR=APrxMedRcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR);
|
||
|
AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
AF1 rcpWB=APrxMedRcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB);
|
||
|
#endif
|
||
|
pixR=ASatF1((b.r*qbeR+e.r*qbeR+c.r*qchR+h.r*qchR+i.r*qinR+n.r*qinR+l.r*qloR+o.r*qloR+f.r*qfR+g.r*qgR+j.r*qjR+k.r*qkR)*rcpWR);
|
||
|
pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG);
|
||
|
pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB);
|
||
|
#endif
|
||
|
}
|
||
|
#endif
|
||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||
|
//_____________________________________________________________/\_______________________________________________________________
|
||
|
//==============================================================================================================================
|
||
|
// PACKED VERSION
|
||
|
//==============================================================================================================================
|
||
|
#if defined(A_GPU) && defined(A_HALF)
|
||
|
// There is no way to do packed re-interpretation under A_HLSL, so the approximation optimizations must be disabled there.
// This forces CAS_GO_SLOWER on for A_HLSL builds; a CAS_GO_SLOWER that is already defined is left as it is.
#if defined(A_HLSL) && !defined(CAS_GO_SLOWER)
 #define CAS_GO_SLOWER 1
#endif
|
||
|
//==============================================================================================================================
|
||
|
// Converts a pair of pixels from packed SOA form (one AH2 per colour channel) to AOS form, e.g. ahead of a store.
//  pix0 : receives the .x lane of pixR, pixG and pixB in its .rgb
//  pix1 : receives the .y lane of pixR, pixG and pixB in its .rgb
// The alpha lanes are written (as 0.0) only when A_HLSL is defined; otherwise this function does not write them.
void CasDepack(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
 // Gather lane 0 of every channel into the first pixel, lane 1 into the second.
 pix0.rgb=AH3(pixR.x,pixG.x,pixB.x);
 pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);
 #ifdef A_HLSL
  // Slower path taken for DX only, since it won't allow uninitialized values: give both alphas a value.
  pix1.a=0.0;
  pix0.a=0.0;
 #endif
}
|
||
|
//==============================================================================================================================
|
||
|
void CasFilterH(
|
||
|
// Output values are for 2 8x8 tiles in a 16x8 region.
|
||
|
// pix<R,G,B>.x = left 8x8 tile (the tile addressed by 'ip')
// pix<R,G,B>.y = right 8x8 tile (the tile 8 pixels further along +x)
|
||
|
// This enables later processing to easily be packed as well.
|
||
|
out AH2 pixR,
|
||
|
out AH2 pixG,
|
||
|
out AH2 pixB,
|
||
|
AU2 ip, // Integer pixel position in output.
|
||
|
AU4 const0, // Constants generated by CasSetup().
|
||
|
AU4 const1,
|
||
|
AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// Debug a checker pattern of on/off tiles for visual inspection.
|
||
|
#ifdef CAS_DEBUG_CHECKER
|
||
|
if((((ip.x^ip.y)>>8u)&1u)==0u){AH3 pix0=CasLoadH(ASW2(ip));AH3 pix1=CasLoadH(ASW2(ip)+ASW2(8,0));
|
||
|
pixR=AH2(pix0.r,pix1.r);pixG=AH2(pix0.g,pix1.g);pixB=AH2(pix0.b,pix1.b);CasInputH(pixR,pixG,pixB);return;}
|
||
|
#endif
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
|
||
|
if(noScaling){
|
||
|
ASW2 sp0=ASW2(ip);
|
||
|
AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
|
||
|
AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
|
||
|
AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
|
||
|
AH3 d0=CasLoadH(sp0+ASW2(-1, 0));
|
||
|
AH3 e0=CasLoadH(sp0);
|
||
|
AH3 f0=CasLoadH(sp0+ASW2( 1, 0));
|
||
|
AH3 g0=CasLoadH(sp0+ASW2(-1, 1));
|
||
|
AH3 h0=CasLoadH(sp0+ASW2( 0, 1));
|
||
|
AH3 i0=CasLoadH(sp0+ASW2( 1, 1));
|
||
|
ASW2 sp1=sp0+ASW2(8,0);
|
||
|
AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
|
||
|
AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
|
||
|
AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
|
||
|
AH3 d1=CasLoadH(sp1+ASW2(-1, 0));
|
||
|
AH3 e1=CasLoadH(sp1);
|
||
|
AH3 f1=CasLoadH(sp1+ASW2( 1, 0));
|
||
|
AH3 g1=CasLoadH(sp1+ASW2(-1, 1));
|
||
|
AH3 h1=CasLoadH(sp1+ASW2( 0, 1));
|
||
|
AH3 i1=CasLoadH(sp1+ASW2( 1, 1));
|
||
|
// AOS to SOA conversion.
|
||
|
AH2 aR=AH2(a0.r,a1.r);
|
||
|
AH2 aG=AH2(a0.g,a1.g);
|
||
|
AH2 aB=AH2(a0.b,a1.b);
|
||
|
AH2 bR=AH2(b0.r,b1.r);
|
||
|
AH2 bG=AH2(b0.g,b1.g);
|
||
|
AH2 bB=AH2(b0.b,b1.b);
|
||
|
AH2 cR=AH2(c0.r,c1.r);
|
||
|
AH2 cG=AH2(c0.g,c1.g);
|
||
|
AH2 cB=AH2(c0.b,c1.b);
|
||
|
AH2 dR=AH2(d0.r,d1.r);
|
||
|
AH2 dG=AH2(d0.g,d1.g);
|
||
|
AH2 dB=AH2(d0.b,d1.b);
|
||
|
AH2 eR=AH2(e0.r,e1.r);
|
||
|
AH2 eG=AH2(e0.g,e1.g);
|
||
|
AH2 eB=AH2(e0.b,e1.b);
|
||
|
AH2 fR=AH2(f0.r,f1.r);
|
||
|
AH2 fG=AH2(f0.g,f1.g);
|
||
|
AH2 fB=AH2(f0.b,f1.b);
|
||
|
AH2 gR=AH2(g0.r,g1.r);
|
||
|
AH2 gG=AH2(g0.g,g1.g);
|
||
|
AH2 gB=AH2(g0.b,g1.b);
|
||
|
AH2 hR=AH2(h0.r,h1.r);
|
||
|
AH2 hG=AH2(h0.g,h1.g);
|
||
|
AH2 hB=AH2(h0.b,h1.b);
|
||
|
AH2 iR=AH2(i0.r,i1.r);
|
||
|
AH2 iG=AH2(i0.g,i1.g);
|
||
|
AH2 iB=AH2(i0.b,i1.b);
|
||
|
// Run optional input transform.
|
||
|
CasInputH(aR,aG,aB);
|
||
|
CasInputH(bR,bG,bB);
|
||
|
CasInputH(cR,cG,cB);
|
||
|
CasInputH(dR,dG,dB);
|
||
|
CasInputH(eR,eG,eB);
|
||
|
CasInputH(fR,fG,fB);
|
||
|
CasInputH(gR,gG,gB);
|
||
|
CasInputH(hR,hG,hB);
|
||
|
CasInputH(iR,iG,iB);
|
||
|
// Soft min and max.
|
||
|
AH2 mnR=min(min(fR,hR),min(min(bR,dR),eR));
|
||
|
AH2 mnG=min(min(fG,hG),min(min(bG,dG),eG));
|
||
|
AH2 mnB=min(min(fB,hB),min(min(bB,dB),eB));
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mnR2=min(min(gR,iR),min(min(aR,cR),mnR));
|
||
|
AH2 mnG2=min(min(gG,iG),min(min(aG,cG),mnG));
|
||
|
AH2 mnB2=min(min(gB,iB),min(min(aB,cB),mnB));
|
||
|
mnR=mnR+mnR2;
|
||
|
mnG=mnG+mnG2;
|
||
|
mnB=mnB+mnB2;
|
||
|
#endif
|
||
|
AH2 mxR=max(max(fR,hR),max(max(bR,dR),eR));
|
||
|
AH2 mxG=max(max(fG,hG),max(max(bG,dG),eG));
|
||
|
AH2 mxB=max(max(fB,hB),max(max(bB,dB),eB));
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mxR2=max(max(gR,iR),max(max(aR,cR),mxR));
|
||
|
AH2 mxG2=max(max(gG,iG),max(max(aG,cG),mxG));
|
||
|
AH2 mxB2=max(max(gB,iB),max(max(aB,cB),mxB));
|
||
|
mxR=mxR+mxR2;
|
||
|
mxG=mxG+mxG2;
|
||
|
mxB=mxB+mxB2;
|
||
|
#endif
|
||
|
// Smooth minimum distance to signal limit divided by smooth max.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AH2 rcpMR=ARcpH2(mxR);
|
||
|
AH2 rcpMG=ARcpH2(mxG);
|
||
|
AH2 rcpMB=ARcpH2(mxB);
|
||
|
#else
|
||
|
AH2 rcpMR=APrxLoRcpH2(mxR);
|
||
|
AH2 rcpMG=APrxLoRcpH2(mxG);
|
||
|
AH2 rcpMB=APrxLoRcpH2(mxB);
|
||
|
#endif
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 ampR=ASatH2(min(mnR,AH2_(2.0)-mxR)*rcpMR);
|
||
|
AH2 ampG=ASatH2(min(mnG,AH2_(2.0)-mxG)*rcpMG);
|
||
|
AH2 ampB=ASatH2(min(mnB,AH2_(2.0)-mxB)*rcpMB);
|
||
|
#else
|
||
|
AH2 ampR=ASatH2(min(mnR,AH2_(1.0)-mxR)*rcpMR);
|
||
|
AH2 ampG=ASatH2(min(mnG,AH2_(1.0)-mxG)*rcpMG);
|
||
|
AH2 ampB=ASatH2(min(mnB,AH2_(1.0)-mxB)*rcpMB);
|
||
|
#endif
|
||
|
// Shaping amount of sharpening.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
ampR=sqrt(ampR);
|
||
|
ampG=sqrt(ampG);
|
||
|
ampB=sqrt(ampB);
|
||
|
#else
|
||
|
ampR=APrxLoSqrtH2(ampR);
|
||
|
ampG=APrxLoSqrtH2(ampG);
|
||
|
ampB=APrxLoSqrtH2(ampB);
|
||
|
#endif
|
||
|
// Filter shape.
|
||
|
AH1 peak=AH2_AU1(const1.y).x;
|
||
|
AH2 wR=ampR*AH2_(peak);
|
||
|
AH2 wG=ampG*AH2_(peak);
|
||
|
AH2 wB=ampB*AH2_(peak);
|
||
|
// Filter.
|
||
|
#ifndef CAS_SLOW
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AH2 rcpWeight=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
|
||
|
#else
|
||
|
AH2 rcpWeight=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
|
||
|
#endif
|
||
|
pixR=ASatH2((bR*wG+dR*wG+fR*wG+hR*wG+eR)*rcpWeight);
|
||
|
pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeight);
|
||
|
pixB=ASatH2((bB*wG+dB*wG+fB*wG+hB*wG+eB)*rcpWeight);
|
||
|
#else
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AH2 rcpWeightR=ARcpH2(AH2_(1.0)+AH2_(4.0)*wR);
|
||
|
AH2 rcpWeightG=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
|
||
|
AH2 rcpWeightB=ARcpH2(AH2_(1.0)+AH2_(4.0)*wB);
|
||
|
#else
|
||
|
AH2 rcpWeightR=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wR);
|
||
|
AH2 rcpWeightG=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
|
||
|
AH2 rcpWeightB=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wB);
|
||
|
#endif
|
||
|
pixR=ASatH2((bR*wR+dR*wR+fR*wR+hR*wR+eR)*rcpWeightR);
|
||
|
pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeightG);
|
||
|
pixB=ASatH2((bB*wB+dB*wB+fB*wB+hB*wB+eB)*rcpWeightB);
|
||
|
#endif
|
||
|
return;}
|
||
|
//------------------------------------------------------------------------------------------------------------------------------
|
||
|
// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
|
||
|
AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw);
|
||
|
// Tile 0.
|
||
|
// Fractional position is needed in high precision here.
|
||
|
AF2 fp0=floor(pp);
|
||
|
AH2 ppX;
|
||
|
ppX.x=AH1(pp.x-fp0.x);
|
||
|
AH1 ppY=AH1(pp.y-fp0.y);
|
||
|
ASW2 sp0=ASW2(fp0);
|
||
|
AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
|
||
|
AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
|
||
|
AH3 e0=CasLoadH(sp0+ASW2(-1, 0));
|
||
|
AH3 f0=CasLoadH(sp0);
|
||
|
AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
|
||
|
AH3 d0=CasLoadH(sp0+ASW2( 2,-1));
|
||
|
AH3 g0=CasLoadH(sp0+ASW2( 1, 0));
|
||
|
AH3 h0=CasLoadH(sp0+ASW2( 2, 0));
|
||
|
AH3 i0=CasLoadH(sp0+ASW2(-1, 1));
|
||
|
AH3 j0=CasLoadH(sp0+ASW2( 0, 1));
|
||
|
AH3 m0=CasLoadH(sp0+ASW2(-1, 2));
|
||
|
AH3 n0=CasLoadH(sp0+ASW2( 0, 2));
|
||
|
AH3 k0=CasLoadH(sp0+ASW2( 1, 1));
|
||
|
AH3 l0=CasLoadH(sp0+ASW2( 2, 1));
|
||
|
AH3 o0=CasLoadH(sp0+ASW2( 1, 2));
|
||
|
AH3 p0=CasLoadH(sp0+ASW2( 2, 2));
|
||
|
// Tile 1 (offset only in x).
|
||
|
AF1 pp1=pp.x+AF1_AU1(const1.z);
|
||
|
AF1 fp1=floor(pp1);
|
||
|
ppX.y=AH1(pp1-fp1);
|
||
|
ASW2 sp1=ASW2(fp1,sp0.y);
|
||
|
AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
|
||
|
AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
|
||
|
AH3 e1=CasLoadH(sp1+ASW2(-1, 0));
|
||
|
AH3 f1=CasLoadH(sp1);
|
||
|
AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
|
||
|
AH3 d1=CasLoadH(sp1+ASW2( 2,-1));
|
||
|
AH3 g1=CasLoadH(sp1+ASW2( 1, 0));
|
||
|
AH3 h1=CasLoadH(sp1+ASW2( 2, 0));
|
||
|
AH3 i1=CasLoadH(sp1+ASW2(-1, 1));
|
||
|
AH3 j1=CasLoadH(sp1+ASW2( 0, 1));
|
||
|
AH3 m1=CasLoadH(sp1+ASW2(-1, 2));
|
||
|
AH3 n1=CasLoadH(sp1+ASW2( 0, 2));
|
||
|
AH3 k1=CasLoadH(sp1+ASW2( 1, 1));
|
||
|
AH3 l1=CasLoadH(sp1+ASW2( 2, 1));
|
||
|
AH3 o1=CasLoadH(sp1+ASW2( 1, 2));
|
||
|
AH3 p1=CasLoadH(sp1+ASW2( 2, 2));
|
||
|
// AOS to SOA conversion.
|
||
|
AH2 aR=AH2(a0.r,a1.r);
|
||
|
AH2 aG=AH2(a0.g,a1.g);
|
||
|
AH2 aB=AH2(a0.b,a1.b);
|
||
|
AH2 bR=AH2(b0.r,b1.r);
|
||
|
AH2 bG=AH2(b0.g,b1.g);
|
||
|
AH2 bB=AH2(b0.b,b1.b);
|
||
|
AH2 cR=AH2(c0.r,c1.r);
|
||
|
AH2 cG=AH2(c0.g,c1.g);
|
||
|
AH2 cB=AH2(c0.b,c1.b);
|
||
|
AH2 dR=AH2(d0.r,d1.r);
|
||
|
AH2 dG=AH2(d0.g,d1.g);
|
||
|
AH2 dB=AH2(d0.b,d1.b);
|
||
|
AH2 eR=AH2(e0.r,e1.r);
|
||
|
AH2 eG=AH2(e0.g,e1.g);
|
||
|
AH2 eB=AH2(e0.b,e1.b);
|
||
|
AH2 fR=AH2(f0.r,f1.r);
|
||
|
AH2 fG=AH2(f0.g,f1.g);
|
||
|
AH2 fB=AH2(f0.b,f1.b);
|
||
|
AH2 gR=AH2(g0.r,g1.r);
|
||
|
AH2 gG=AH2(g0.g,g1.g);
|
||
|
AH2 gB=AH2(g0.b,g1.b);
|
||
|
AH2 hR=AH2(h0.r,h1.r);
|
||
|
AH2 hG=AH2(h0.g,h1.g);
|
||
|
AH2 hB=AH2(h0.b,h1.b);
|
||
|
AH2 iR=AH2(i0.r,i1.r);
|
||
|
AH2 iG=AH2(i0.g,i1.g);
|
||
|
AH2 iB=AH2(i0.b,i1.b);
|
||
|
AH2 jR=AH2(j0.r,j1.r);
|
||
|
AH2 jG=AH2(j0.g,j1.g);
|
||
|
AH2 jB=AH2(j0.b,j1.b);
|
||
|
AH2 kR=AH2(k0.r,k1.r);
|
||
|
AH2 kG=AH2(k0.g,k1.g);
|
||
|
AH2 kB=AH2(k0.b,k1.b);
|
||
|
AH2 lR=AH2(l0.r,l1.r);
|
||
|
AH2 lG=AH2(l0.g,l1.g);
|
||
|
AH2 lB=AH2(l0.b,l1.b);
|
||
|
AH2 mR=AH2(m0.r,m1.r);
|
||
|
AH2 mG=AH2(m0.g,m1.g);
|
||
|
AH2 mB=AH2(m0.b,m1.b);
|
||
|
AH2 nR=AH2(n0.r,n1.r);
|
||
|
AH2 nG=AH2(n0.g,n1.g);
|
||
|
AH2 nB=AH2(n0.b,n1.b);
|
||
|
AH2 oR=AH2(o0.r,o1.r);
|
||
|
AH2 oG=AH2(o0.g,o1.g);
|
||
|
AH2 oB=AH2(o0.b,o1.b);
|
||
|
AH2 pR=AH2(p0.r,p1.r);
|
||
|
AH2 pG=AH2(p0.g,p1.g);
|
||
|
AH2 pB=AH2(p0.b,p1.b);
|
||
|
// Run optional input transform.
|
||
|
CasInputH(aR,aG,aB);
|
||
|
CasInputH(bR,bG,bB);
|
||
|
CasInputH(cR,cG,cB);
|
||
|
CasInputH(dR,dG,dB);
|
||
|
CasInputH(eR,eG,eB);
|
||
|
CasInputH(fR,fG,fB);
|
||
|
CasInputH(gR,gG,gB);
|
||
|
CasInputH(hR,hG,hB);
|
||
|
CasInputH(iR,iG,iB);
|
||
|
CasInputH(jR,jG,jB);
|
||
|
CasInputH(kR,kG,kB);
|
||
|
CasInputH(lR,lG,lB);
|
||
|
CasInputH(mR,mG,mB);
|
||
|
CasInputH(nR,nG,nB);
|
||
|
CasInputH(oR,oG,oB);
|
||
|
CasInputH(pR,pG,pB);
|
||
|
// Soft min and max.
|
||
|
// These are 2.0x bigger (factored out the extra multiply).
|
||
|
// a b c b
|
||
|
// e f g * 0.5 + e f g * 0.5 [F]
|
||
|
// i j k j
|
||
|
AH2 mnfR=AMin3H2(AMin3H2(bR,eR,fR),gR,jR);
|
||
|
AH2 mnfG=AMin3H2(AMin3H2(bG,eG,fG),gG,jG);
|
||
|
AH2 mnfB=AMin3H2(AMin3H2(bB,eB,fB),gB,jB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mnfR2=AMin3H2(AMin3H2(mnfR,aR,cR),iR,kR);
|
||
|
AH2 mnfG2=AMin3H2(AMin3H2(mnfG,aG,cG),iG,kG);
|
||
|
AH2 mnfB2=AMin3H2(AMin3H2(mnfB,aB,cB),iB,kB);
|
||
|
mnfR=mnfR+mnfR2;
|
||
|
mnfG=mnfG+mnfG2;
|
||
|
mnfB=mnfB+mnfB2;
|
||
|
#endif
|
||
|
AH2 mxfR=AMax3H2(AMax3H2(bR,eR,fR),gR,jR);
|
||
|
AH2 mxfG=AMax3H2(AMax3H2(bG,eG,fG),gG,jG);
|
||
|
AH2 mxfB=AMax3H2(AMax3H2(bB,eB,fB),gB,jB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mxfR2=AMax3H2(AMax3H2(mxfR,aR,cR),iR,kR);
|
||
|
AH2 mxfG2=AMax3H2(AMax3H2(mxfG,aG,cG),iG,kG);
|
||
|
AH2 mxfB2=AMax3H2(AMax3H2(mxfB,aB,cB),iB,kB);
|
||
|
mxfR=mxfR+mxfR2;
|
||
|
mxfG=mxfG+mxfG2;
|
||
|
mxfB=mxfB+mxfB2;
|
||
|
#endif
|
||
|
// b c d c
|
||
|
// f g h * 0.5 + f g h * 0.5 [G]
|
||
|
// j k l k
|
||
|
AH2 mngR=AMin3H2(AMin3H2(cR,fR,gR),hR,kR);
|
||
|
AH2 mngG=AMin3H2(AMin3H2(cG,fG,gG),hG,kG);
|
||
|
AH2 mngB=AMin3H2(AMin3H2(cB,fB,gB),hB,kB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mngR2=AMin3H2(AMin3H2(mngR,bR,dR),jR,lR);
|
||
|
AH2 mngG2=AMin3H2(AMin3H2(mngG,bG,dG),jG,lG);
|
||
|
AH2 mngB2=AMin3H2(AMin3H2(mngB,bB,dB),jB,lB);
|
||
|
mngR=mngR+mngR2;
|
||
|
mngG=mngG+mngG2;
|
||
|
mngB=mngB+mngB2;
|
||
|
#endif
|
||
|
AH2 mxgR=AMax3H2(AMax3H2(cR,fR,gR),hR,kR);
|
||
|
AH2 mxgG=AMax3H2(AMax3H2(cG,fG,gG),hG,kG);
|
||
|
AH2 mxgB=AMax3H2(AMax3H2(cB,fB,gB),hB,kB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mxgR2=AMax3H2(AMax3H2(mxgR,bR,dR),jR,lR);
|
||
|
AH2 mxgG2=AMax3H2(AMax3H2(mxgG,bG,dG),jG,lG);
|
||
|
AH2 mxgB2=AMax3H2(AMax3H2(mxgB,bB,dB),jB,lB);
|
||
|
mxgR=mxgR+mxgR2;
|
||
|
mxgG=mxgG+mxgG2;
|
||
|
mxgB=mxgB+mxgB2;
|
||
|
#endif
|
||
|
// e f g f
|
||
|
// i j k * 0.5 + i j k * 0.5 [J]
|
||
|
// m n o n
|
||
|
AH2 mnjR=AMin3H2(AMin3H2(fR,iR,jR),kR,nR);
|
||
|
AH2 mnjG=AMin3H2(AMin3H2(fG,iG,jG),kG,nG);
|
||
|
AH2 mnjB=AMin3H2(AMin3H2(fB,iB,jB),kB,nB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mnjR2=AMin3H2(AMin3H2(mnjR,eR,gR),mR,oR);
|
||
|
AH2 mnjG2=AMin3H2(AMin3H2(mnjG,eG,gG),mG,oG);
|
||
|
AH2 mnjB2=AMin3H2(AMin3H2(mnjB,eB,gB),mB,oB);
|
||
|
mnjR=mnjR+mnjR2;
|
||
|
mnjG=mnjG+mnjG2;
|
||
|
mnjB=mnjB+mnjB2;
|
||
|
#endif
|
||
|
AH2 mxjR=AMax3H2(AMax3H2(fR,iR,jR),kR,nR);
|
||
|
AH2 mxjG=AMax3H2(AMax3H2(fG,iG,jG),kG,nG);
|
||
|
AH2 mxjB=AMax3H2(AMax3H2(fB,iB,jB),kB,nB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mxjR2=AMax3H2(AMax3H2(mxjR,eR,gR),mR,oR);
|
||
|
AH2 mxjG2=AMax3H2(AMax3H2(mxjG,eG,gG),mG,oG);
|
||
|
AH2 mxjB2=AMax3H2(AMax3H2(mxjB,eB,gB),mB,oB);
|
||
|
mxjR=mxjR+mxjR2;
|
||
|
mxjG=mxjG+mxjG2;
|
||
|
mxjB=mxjB+mxjB2;
|
||
|
#endif
|
||
|
// f g h g
|
||
|
// j k l * 0.5 + j k l * 0.5 [K]
|
||
|
// n o p o
|
||
|
AH2 mnkR=AMin3H2(AMin3H2(gR,jR,kR),lR,oR);
|
||
|
AH2 mnkG=AMin3H2(AMin3H2(gG,jG,kG),lG,oG);
|
||
|
AH2 mnkB=AMin3H2(AMin3H2(gB,jB,kB),lB,oB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mnkR2=AMin3H2(AMin3H2(mnkR,fR,hR),nR,pR);
|
||
|
AH2 mnkG2=AMin3H2(AMin3H2(mnkG,fG,hG),nG,pG);
|
||
|
AH2 mnkB2=AMin3H2(AMin3H2(mnkB,fB,hB),nB,pB);
|
||
|
mnkR=mnkR+mnkR2;
|
||
|
mnkG=mnkG+mnkG2;
|
||
|
mnkB=mnkB+mnkB2;
|
||
|
#endif
|
||
|
AH2 mxkR=AMax3H2(AMax3H2(gR,jR,kR),lR,oR);
|
||
|
AH2 mxkG=AMax3H2(AMax3H2(gG,jG,kG),lG,oG);
|
||
|
AH2 mxkB=AMax3H2(AMax3H2(gB,jB,kB),lB,oB);
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 mxkR2=AMax3H2(AMax3H2(mxkR,fR,hR),nR,pR);
|
||
|
AH2 mxkG2=AMax3H2(AMax3H2(mxkG,fG,hG),nG,pG);
|
||
|
AH2 mxkB2=AMax3H2(AMax3H2(mxkB,fB,hB),nB,pB);
|
||
|
mxkR=mxkR+mxkR2;
|
||
|
mxkG=mxkG+mxkG2;
|
||
|
mxkB=mxkB+mxkB2;
|
||
|
#endif
|
||
|
// Smooth minimum distance to signal limit divided by smooth max.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AH2 rcpMfR=ARcpH2(mxfR);
|
||
|
AH2 rcpMfG=ARcpH2(mxfG);
|
||
|
AH2 rcpMfB=ARcpH2(mxfB);
|
||
|
AH2 rcpMgR=ARcpH2(mxgR);
|
||
|
AH2 rcpMgG=ARcpH2(mxgG);
|
||
|
AH2 rcpMgB=ARcpH2(mxgB);
|
||
|
AH2 rcpMjR=ARcpH2(mxjR);
|
||
|
AH2 rcpMjG=ARcpH2(mxjG);
|
||
|
AH2 rcpMjB=ARcpH2(mxjB);
|
||
|
AH2 rcpMkR=ARcpH2(mxkR);
|
||
|
AH2 rcpMkG=ARcpH2(mxkG);
|
||
|
AH2 rcpMkB=ARcpH2(mxkB);
|
||
|
#else
|
||
|
AH2 rcpMfR=APrxLoRcpH2(mxfR);
|
||
|
AH2 rcpMfG=APrxLoRcpH2(mxfG);
|
||
|
AH2 rcpMfB=APrxLoRcpH2(mxfB);
|
||
|
AH2 rcpMgR=APrxLoRcpH2(mxgR);
|
||
|
AH2 rcpMgG=APrxLoRcpH2(mxgG);
|
||
|
AH2 rcpMgB=APrxLoRcpH2(mxgB);
|
||
|
AH2 rcpMjR=APrxLoRcpH2(mxjR);
|
||
|
AH2 rcpMjG=APrxLoRcpH2(mxjG);
|
||
|
AH2 rcpMjB=APrxLoRcpH2(mxjB);
|
||
|
AH2 rcpMkR=APrxLoRcpH2(mxkR);
|
||
|
AH2 rcpMkG=APrxLoRcpH2(mxkG);
|
||
|
AH2 rcpMkB=APrxLoRcpH2(mxkB);
|
||
|
#endif
|
||
|
#ifdef CAS_BETTER_DIAGONALS
|
||
|
AH2 ampfR=ASatH2(min(mnfR,AH2_(2.0)-mxfR)*rcpMfR);
|
||
|
AH2 ampfG=ASatH2(min(mnfG,AH2_(2.0)-mxfG)*rcpMfG);
|
||
|
AH2 ampfB=ASatH2(min(mnfB,AH2_(2.0)-mxfB)*rcpMfB);
|
||
|
AH2 ampgR=ASatH2(min(mngR,AH2_(2.0)-mxgR)*rcpMgR);
|
||
|
AH2 ampgG=ASatH2(min(mngG,AH2_(2.0)-mxgG)*rcpMgG);
|
||
|
AH2 ampgB=ASatH2(min(mngB,AH2_(2.0)-mxgB)*rcpMgB);
|
||
|
AH2 ampjR=ASatH2(min(mnjR,AH2_(2.0)-mxjR)*rcpMjR);
|
||
|
AH2 ampjG=ASatH2(min(mnjG,AH2_(2.0)-mxjG)*rcpMjG);
|
||
|
AH2 ampjB=ASatH2(min(mnjB,AH2_(2.0)-mxjB)*rcpMjB);
|
||
|
AH2 ampkR=ASatH2(min(mnkR,AH2_(2.0)-mxkR)*rcpMkR);
|
||
|
AH2 ampkG=ASatH2(min(mnkG,AH2_(2.0)-mxkG)*rcpMkG);
|
||
|
AH2 ampkB=ASatH2(min(mnkB,AH2_(2.0)-mxkB)*rcpMkB);
|
||
|
#else
|
||
|
AH2 ampfR=ASatH2(min(mnfR,AH2_(1.0)-mxfR)*rcpMfR);
|
||
|
AH2 ampfG=ASatH2(min(mnfG,AH2_(1.0)-mxfG)*rcpMfG);
|
||
|
AH2 ampfB=ASatH2(min(mnfB,AH2_(1.0)-mxfB)*rcpMfB);
|
||
|
AH2 ampgR=ASatH2(min(mngR,AH2_(1.0)-mxgR)*rcpMgR);
|
||
|
AH2 ampgG=ASatH2(min(mngG,AH2_(1.0)-mxgG)*rcpMgG);
|
||
|
AH2 ampgB=ASatH2(min(mngB,AH2_(1.0)-mxgB)*rcpMgB);
|
||
|
AH2 ampjR=ASatH2(min(mnjR,AH2_(1.0)-mxjR)*rcpMjR);
|
||
|
AH2 ampjG=ASatH2(min(mnjG,AH2_(1.0)-mxjG)*rcpMjG);
|
||
|
AH2 ampjB=ASatH2(min(mnjB,AH2_(1.0)-mxjB)*rcpMjB);
|
||
|
AH2 ampkR=ASatH2(min(mnkR,AH2_(1.0)-mxkR)*rcpMkR);
|
||
|
AH2 ampkG=ASatH2(min(mnkG,AH2_(1.0)-mxkG)*rcpMkG);
|
||
|
AH2 ampkB=ASatH2(min(mnkB,AH2_(1.0)-mxkB)*rcpMkB);
|
||
|
#endif
|
||
|
// Shaping amount of sharpening.
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
ampfR=sqrt(ampfR);
|
||
|
ampfG=sqrt(ampfG);
|
||
|
ampfB=sqrt(ampfB);
|
||
|
ampgR=sqrt(ampgR);
|
||
|
ampgG=sqrt(ampgG);
|
||
|
ampgB=sqrt(ampgB);
|
||
|
ampjR=sqrt(ampjR);
|
||
|
ampjG=sqrt(ampjG);
|
||
|
ampjB=sqrt(ampjB);
|
||
|
ampkR=sqrt(ampkR);
|
||
|
ampkG=sqrt(ampkG);
|
||
|
ampkB=sqrt(ampkB);
|
||
|
#else
|
||
|
ampfR=APrxLoSqrtH2(ampfR);
|
||
|
ampfG=APrxLoSqrtH2(ampfG);
|
||
|
ampfB=APrxLoSqrtH2(ampfB);
|
||
|
ampgR=APrxLoSqrtH2(ampgR);
|
||
|
ampgG=APrxLoSqrtH2(ampgG);
|
||
|
ampgB=APrxLoSqrtH2(ampgB);
|
||
|
ampjR=APrxLoSqrtH2(ampjR);
|
||
|
ampjG=APrxLoSqrtH2(ampjG);
|
||
|
ampjB=APrxLoSqrtH2(ampjB);
|
||
|
ampkR=APrxLoSqrtH2(ampkR);
|
||
|
ampkG=APrxLoSqrtH2(ampkG);
|
||
|
ampkB=APrxLoSqrtH2(ampkB);
|
||
|
#endif
|
||
|
// Filter shape.
|
||
|
AH1 peak=AH2_AU1(const1.y).x;
|
||
|
AH2 wfR=ampfR*AH2_(peak);
|
||
|
AH2 wfG=ampfG*AH2_(peak);
|
||
|
AH2 wfB=ampfB*AH2_(peak);
|
||
|
AH2 wgR=ampgR*AH2_(peak);
|
||
|
AH2 wgG=ampgG*AH2_(peak);
|
||
|
AH2 wgB=ampgB*AH2_(peak);
|
||
|
AH2 wjR=ampjR*AH2_(peak);
|
||
|
AH2 wjG=ampjG*AH2_(peak);
|
||
|
AH2 wjB=ampjB*AH2_(peak);
|
||
|
AH2 wkR=ampkR*AH2_(peak);
|
||
|
AH2 wkG=ampkG*AH2_(peak);
|
||
|
AH2 wkB=ampkB*AH2_(peak);
|
||
|
// Blend between 4 results.
|
||
|
AH2 s=(AH2_(1.0)-ppX)*(AH2_(1.0)-AH2_(ppY));
|
||
|
AH2 t= ppX *(AH2_(1.0)-AH2_(ppY));
|
||
|
AH2 u=(AH2_(1.0)-ppX)* AH2_(ppY) ;
|
||
|
AH2 v= ppX * AH2_(ppY) ;
|
||
|
// Thin edges to hide bilinear interpolation (helps diagonals).
|
||
|
AH2 thinB=AH2_(1.0/32.0);
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
s*=ARcpH2(thinB+(mxfG-mnfG));
|
||
|
t*=ARcpH2(thinB+(mxgG-mngG));
|
||
|
u*=ARcpH2(thinB+(mxjG-mnjG));
|
||
|
v*=ARcpH2(thinB+(mxkG-mnkG));
|
||
|
#else
|
||
|
s*=APrxLoRcpH2(thinB+(mxfG-mnfG));
|
||
|
t*=APrxLoRcpH2(thinB+(mxgG-mngG));
|
||
|
u*=APrxLoRcpH2(thinB+(mxjG-mnjG));
|
||
|
v*=APrxLoRcpH2(thinB+(mxkG-mnkG));
|
||
|
#endif
|
||
|
// Final weighting.
|
||
|
AH2 qbeR=wfR*s;
|
||
|
AH2 qbeG=wfG*s;
|
||
|
AH2 qbeB=wfB*s;
|
||
|
AH2 qchR=wgR*t;
|
||
|
AH2 qchG=wgG*t;
|
||
|
AH2 qchB=wgB*t;
|
||
|
AH2 qfR=wgR*t+wjR*u+s;
|
||
|
AH2 qfG=wgG*t+wjG*u+s;
|
||
|
AH2 qfB=wgB*t+wjB*u+s;
|
||
|
AH2 qgR=wfR*s+wkR*v+t;
|
||
|
AH2 qgG=wfG*s+wkG*v+t;
|
||
|
AH2 qgB=wfB*s+wkB*v+t;
|
||
|
AH2 qjR=wfR*s+wkR*v+u;
|
||
|
AH2 qjG=wfG*s+wkG*v+u;
|
||
|
AH2 qjB=wfB*s+wkB*v+u;
|
||
|
AH2 qkR=wgR*t+wjR*u+v;
|
||
|
AH2 qkG=wgG*t+wjG*u+v;
|
||
|
AH2 qkB=wgB*t+wjB*u+v;
|
||
|
AH2 qinR=wjR*u;
|
||
|
AH2 qinG=wjG*u;
|
||
|
AH2 qinB=wjB*u;
|
||
|
AH2 qloR=wkR*v;
|
||
|
AH2 qloG=wkG*v;
|
||
|
AH2 qloB=wkB*v;
|
||
|
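// Filter: weighted sum of the 12 taps, normalized by the reciprocal of the total weight.
// Unless CAS_SLOW is defined, the green-channel weights are applied to all three channels.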
|
||
|
#ifndef CAS_SLOW
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
#else
|
||
|
AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
#endif
|
||
|
pixR=ASatH2((bR*qbeG+eR*qbeG+cR*qchG+hR*qchG+iR*qinG+nR*qinG+lR*qloG+oR*qloG+fR*qfG+gR*qgG+jR*qjG+kR*qkG)*rcpWG);
|
||
|
pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
|
||
|
pixB=ASatH2((bB*qbeG+eB*qbeG+cB*qchG+hB*qchG+iB*qinG+nB*qinG+lB*qloG+oB*qloG+fB*qfG+gB*qgG+jB*qjG+kB*qkG)*rcpWG);
|
||
|
#else
|
||
|
#ifdef CAS_GO_SLOWER
|
||
|
AH2 rcpWR=ARcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
|
||
|
AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
AH2 rcpWB=ARcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
|
||
|
#else
|
||
|
AH2 rcpWR=APrxMedRcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
|
||
|
AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
|
||
|
AH2 rcpWB=APrxMedRcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
|
||
|
#endif
|
||
|
pixR=ASatH2((bR*qbeR+eR*qbeR+cR*qchR+hR*qchR+iR*qinR+nR*qinR+lR*qloR+oR*qloR+fR*qfR+gR*qgR+jR*qjR+kR*qkR)*rcpWR);
|
||
|
pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
|
||
|
pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB);
|
||
|
#endif
|
||
|
}
|
||
|
#endif
|