From 3e843a06a286cb65a126b9777bd6facf04695d21 Mon Sep 17 00:00:00 2001 From: Erik White <26148654+Erik-White@users.noreply.github.com> Date: Tue, 26 May 2026 15:35:52 +0200 Subject: [PATCH 1/5] Implement CgBI support --- .../Compression/Zlib/ZlibInflateStream.cs | 31 +++++++- src/ImageSharp/Formats/Png/PngDecoderCore.cs | 73 ++++++++++++++++++- .../Formats/Png/PngDecoderTests.cs | 26 +++---- tests/ImageSharp.Tests/TestImages.cs | 14 +++- ...reshold-0_PerPixelManhattanThreshold-0.png | 3 + ...reshold-0_PerPixelManhattanThreshold-0.png | 3 + ...reshold-0_PerPixelManhattanThreshold-0.png | 3 + ...reshold-0_PerPixelManhattanThreshold-0.png | 3 + ...reshold-0_PerPixelManhattanThreshold-0.png | 3 + tests/Images/Input/Png/cgbi/clocks.png | 3 + tests/Images/Input/Png/cgbi/colors.png | 3 + tests/Images/Input/Png/cgbi/flecks.png | 3 + tests/Images/Input/Png/cgbi/screen.png | 3 + 13 files changed, 148 insertions(+), 23 deletions(-) create mode 100644 tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_Issue_410_ImageThreshold-0_PerPixelManhattanThreshold-0.png create mode 100644 tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_clocks_ImageThreshold-0_PerPixelManhattanThreshold-0.png create mode 100644 tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_colors_ImageThreshold-0_PerPixelManhattanThreshold-0.png create mode 100644 tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png create mode 100644 tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_screen_ImageThreshold-0_PerPixelManhattanThreshold-0.png create mode 100644 tests/Images/Input/Png/cgbi/clocks.png create mode 100644 tests/Images/Input/Png/cgbi/colors.png create mode 100644 tests/Images/Input/Png/cgbi/flecks.png create mode 100644 tests/Images/Input/Png/cgbi/screen.png diff --git 
a/src/ImageSharp/Compression/Zlib/ZlibInflateStream.cs b/src/ImageSharp/Compression/Zlib/ZlibInflateStream.cs index 1d743bf3a5..513171b179 100644 --- a/src/ImageSharp/Compression/Zlib/ZlibInflateStream.cs +++ b/src/ImageSharp/Compression/Zlib/ZlibInflateStream.cs @@ -52,12 +52,19 @@ internal sealed class ZlibInflateStream : Stream /// private readonly Func getData; + /// + /// When true, the inflated payload is treated as a raw DEFLATE stream with no zlib + /// CMF/FLG header (and no Adler-32 trailer). This is required to decode IDATs in + /// Apple's proprietary CgBI PNG variant. + /// + private readonly bool noHeader; + /// /// Initializes a new instance of the class. /// /// The inner raw stream. public ZlibInflateStream(BufferedReadStream innerStream) - : this(innerStream, GetDataNoOp) + : this(innerStream, GetDataNoOp, noHeader: false) { } @@ -67,9 +74,23 @@ public ZlibInflateStream(BufferedReadStream innerStream) /// The inner raw stream. /// A delegate to get more data from the inner stream. public ZlibInflateStream(BufferedReadStream innerStream, Func getData) + : this(innerStream, getData, noHeader: false) + { + } + + /// + /// Initializes a new instance of the class. + /// + /// The inner raw stream. + /// A delegate to get more data from the inner stream. + /// + /// When , the payload is treated as raw DEFLATE with no zlib header. + /// + public ZlibInflateStream(BufferedReadStream innerStream, Func getData, bool noHeader) { this.innerStream = innerStream; this.getData = getData; + this.noHeader = noHeader; } /// @@ -210,6 +231,14 @@ protected override void Dispose(bool disposing) [MemberNotNullWhen(true, nameof(CompressedStream))] private bool InitializeInflateStream(bool isCriticalChunk) { + // Apple CgBI IDATs omit the zlib CMF/FLG header and the Adler-32 trailer, + // wrapping a raw DEFLATE payload directly. Skip the header parsing in that mode. 
+ if (this.noHeader) + { + this.CompressedStream = new DeflateStream(this, CompressionMode.Decompress, true); + return true; + } + // Read the zlib header : http://tools.ietf.org/html/rfc1950 // CMF(Compression Method and flags) // This byte is divided into a 4 - bit compression method and a diff --git a/src/ImageSharp/Formats/Png/PngDecoderCore.cs b/src/ImageSharp/Formats/Png/PngDecoderCore.cs index 5b9eee1169..063c27a9a9 100644 --- a/src/ImageSharp/Formats/Png/PngDecoderCore.cs +++ b/src/ImageSharp/Formats/Png/PngDecoderCore.cs @@ -137,6 +137,13 @@ internal sealed class PngDecoderCore : ImageDecoderCore /// private bool hasImageData; + /// + /// Whether this is an Apple CgBI PNG. CgBI files store IDATs as raw DEFLATE + /// (no zlib header/Adler-32) and pixels as premultiplied BGRA, so they need + /// extra inversion steps to round-trip back to standard PNG semantics. + /// + private bool isCgbi; + /// /// Initializes a new instance of the class. /// @@ -314,7 +321,7 @@ protected override Image Decode(BufferedReadStream stream, Cance case PngChunkType.End: goto EOF; case PngChunkType.ProprietaryApple: - PngThrowHelper.ThrowInvalidChunkType("Proprietary Apple PNG detected! 
This PNG file is not conform to the specification and cannot be decoded."); + this.isCgbi = true; break; } } @@ -517,6 +524,10 @@ protected override ImageInfo Identify(BufferedReadStream stream, CancellationTok case PngChunkType.End: goto EOF; + case PngChunkType.ProprietaryApple: + this.isCgbi = true; + break; + default: if (this.colorMetadataOnly) { @@ -766,7 +777,7 @@ private void ReadScanlines( CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - using ZlibInflateStream inflateStream = new(this.currentStream, getData); + using ZlibInflateStream inflateStream = new(this.currentStream, getData, noHeader: this.isCgbi); if (!inflateStream.AllocateNewBytes(chunkLength, !this.hasImageData)) { return; @@ -887,6 +898,11 @@ private void DecodePixelDataCore( break; } + if (this.isCgbi) + { + ApplyCgbiTransform(scanSpan[1..], this.pngColorType); + } + this.ProcessDefilteredScanline(frameControl, currentRow, scanSpan, imageFrame, pngMetadata, blendRowBuffer); this.SwapScanlineBuffers(); currentRow++; @@ -1017,6 +1033,11 @@ private void DecodeInterlacedPixelDataCore( break; } + if (this.isCgbi) + { + ApplyCgbiTransform(scanSpan[1..], this.pngColorType); + } + Span rowSpan = imageBuffer.DangerousGetRowSpan(currentRow); this.ProcessInterlacedDefilteredScanline( frameControl, @@ -2470,4 +2491,52 @@ private static bool IsXmpTextData(ReadOnlySpan keywordBytes) private void SwapScanlineBuffers() => (this.scanline, this.previousScanline) = (this.previousScanline, this.scanline); + + /// + /// Applies the inverse of Apple's CgBI pixel mangling to a defiltered scanline. + /// CgBI PNGs are emitted by pngcrush -iphone with channel order swapped + /// from RGB(A) to BGR(A) and RGB samples premultiplied by alpha. This converts + /// the bytes back to standard PNG semantics in place so the existing scanline + /// processors can consume them unchanged. CgBI is only emitted for 8-bit + /// truecolor (with or without alpha); other color types are left alone. 
+ /// + /// + /// See https://theapplewiki.com/wiki/PNG_CgBI_Format + /// + /// The defiltered pixel bytes (without the leading filter byte). + /// The PNG color type from IHDR. + private static void ApplyCgbiTransform(Span scanline, PngColorType colorType) + { + if (colorType == PngColorType.RgbWithAlpha) + { + for (int i = 0; i + 3 < scanline.Length; i += 4) + { + byte b = scanline[i]; + byte g = scanline[i + 1]; + byte r = scanline[i + 2]; + byte a = scanline[i + 3]; + + if (a is not 0 and not 255) + { + // Reverse: c' = c * a / 255 => c = round(c' * 255 / a) + int half = a >> 1; + r = (byte)Math.Min(255, ((r * 255) + half) / a); + g = (byte)Math.Min(255, ((g * 255) + half) / a); + b = (byte)Math.Min(255, ((b * 255) + half) / a); + } + + scanline[i] = r; + scanline[i + 1] = g; + scanline[i + 2] = b; + scanline[i + 3] = a; + } + } + else if (colorType == PngColorType.Rgb) + { + for (int i = 0; i + 2 < scanline.Length; i += 3) + { + (scanline[i], scanline[i + 2]) = (scanline[i + 2], scanline[i]); + } + } + } } diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index 803a12b03a..88018709ce 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -714,26 +714,18 @@ public void Issue2209_Identify_HasTransparencyIsTrue(string imagePath) Assert.Contains(metadata.ColorTable.Value.ToArray(), x => x.ToPixel().A < 255); } - // https://github.com/SixLabors/ImageSharp/issues/410 [Theory] - [WithFile(TestImages.Png.Bad.Issue410_MalformedApplePng, PixelTypes.Rgba32)] - public void Issue410_MalformedApplePng(TestImageProvider provider) + [WithFile(TestImages.Png.Cgbi.Issue410, PixelTypes.Rgba32)] + [WithFile(TestImages.Png.Cgbi.Colors, PixelTypes.Rgba32)] + [WithFile(TestImages.Png.Cgbi.Clocks, PixelTypes.Rgba32)] + [WithFile(TestImages.Png.Cgbi.Flecks, PixelTypes.Rgba32)] + [WithFile(TestImages.Png.Cgbi.Screen, PixelTypes.Rgba32)] + 
public void Decode_AppleCgBI(TestImageProvider provider) where TPixel : unmanaged, IPixel { - Exception ex = Record.Exception( - () => - { - using Image image = provider.GetImage(PngDecoder.Instance); - image.DebugSave(provider); - - // We don't have another x-plat reference decoder that can be compared for this image. - if (TestEnvironment.IsWindows) - { - image.CompareToOriginal(provider, ImageComparer.Exact, SystemDrawingReferenceDecoder.Png); - } - }); - Assert.NotNull(ex); - Assert.Contains("Proprietary Apple PNG detected!", ex.Message); + using Image image = provider.GetImage(PngDecoder.Instance); + image.DebugSave(provider); + image.CompareToReferenceOutput(provider, ImageComparer.Exact); } [Theory] diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs index 2624d1cdad..fee6fce375 100644 --- a/tests/ImageSharp.Tests/TestImages.cs +++ b/tests/ImageSharp.Tests/TestImages.cs @@ -177,6 +177,17 @@ public static class Icc public const string PerceptualcLUTOnly = "Png/icc-profiles/Perceptual-cLUT-only.png"; } + public static class Cgbi + { + public const string Colors = "Png/cgbi/colors.png"; + public const string Clocks = "Png/cgbi/clocks.png"; + public const string Flecks = "Png/cgbi/flecks.png"; + public const string Screen = "Png/cgbi/screen.png"; + + // Issue 410: https://github.com/SixLabors/ImageSharp/issues/410 + public const string Issue410 = "Png/issues/Issue_410.png"; + } + public static class Bad { public const string MissingDataChunk = "Png/xdtn0g01.png"; @@ -199,9 +210,6 @@ public static class Bad // Issue 1047: https://github.com/SixLabors/ImageSharp/issues/1047 public const string Issue1047_BadEndChunk = "Png/issues/Issue_1047.png"; - // Issue 410: https://github.com/SixLabors/ImageSharp/issues/410 - public const string Issue410_MalformedApplePng = "Png/issues/Issue_410.png"; - // Bad bit depth. 
public const string BitDepthZero = "Png/xd0n2c08.png"; public const string BitDepthThree = "Png/xd3n2c08.png"; diff --git a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_Issue_410_ImageThreshold-0_PerPixelManhattanThreshold-0.png b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_Issue_410_ImageThreshold-0_PerPixelManhattanThreshold-0.png new file mode 100644 index 0000000000..020facf415 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_Issue_410_ImageThreshold-0_PerPixelManhattanThreshold-0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511cb90e72fcb837e4c9a31561a3c914f5201452d4ca63502cb5219cb4dc42be +size 619 diff --git a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_clocks_ImageThreshold-0_PerPixelManhattanThreshold-0.png b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_clocks_ImageThreshold-0_PerPixelManhattanThreshold-0.png new file mode 100644 index 0000000000..0f184812b4 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_clocks_ImageThreshold-0_PerPixelManhattanThreshold-0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dc73f1b4435a26125d910f005b5df7a540168a954f44c01a8e46df201adb1b6 +size 335851 diff --git a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_colors_ImageThreshold-0_PerPixelManhattanThreshold-0.png b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_colors_ImageThreshold-0_PerPixelManhattanThreshold-0.png new file mode 100644 index 0000000000..74647f04fb --- /dev/null +++ b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_colors_ImageThreshold-0_PerPixelManhattanThreshold-0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8cdcc80a8c662c50d2f72ad8e123d595c6e80394538c05666b8d3531d651e71a +size 11270 diff --git a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png new file mode 100644 index 0000000000..06873607a1 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91bbb6c87f128920d4384f3be2e85ecd176e49a7a5166c6c6aa584e60d8131ed +size 204053 diff --git a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_screen_ImageThreshold-0_PerPixelManhattanThreshold-0.png b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_screen_ImageThreshold-0_PerPixelManhattanThreshold-0.png new file mode 100644 index 0000000000..c3a61bde84 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_screen_ImageThreshold-0_PerPixelManhattanThreshold-0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00bb4c7b345389f5d95252c19d70fc2b654c4f0198e6a704b603da92b78e9a0a +size 102982 diff --git a/tests/Images/Input/Png/cgbi/clocks.png b/tests/Images/Input/Png/cgbi/clocks.png new file mode 100644 index 0000000000..d7cf18367e --- /dev/null +++ b/tests/Images/Input/Png/cgbi/clocks.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc462d8c2697060cde9a2e975ffb828d822ee3b0d4d12e3c5f081114176c036b +size 389981 diff --git a/tests/Images/Input/Png/cgbi/colors.png b/tests/Images/Input/Png/cgbi/colors.png new file mode 100644 index 0000000000..9b3b371bd5 --- /dev/null +++ b/tests/Images/Input/Png/cgbi/colors.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4f34436f755e3c6d15341f29c992331045ae16ad413144ca798ede5c085c8e6a +size 12853 diff --git a/tests/Images/Input/Png/cgbi/flecks.png b/tests/Images/Input/Png/cgbi/flecks.png new file mode 100644 index 0000000000..625c82e458 --- /dev/null +++ b/tests/Images/Input/Png/cgbi/flecks.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:314a5a52996953c0c12862262b4ff3fc191d8b6f2b7bb3853cdcea3267a4142d +size 212163 diff --git a/tests/Images/Input/Png/cgbi/screen.png b/tests/Images/Input/Png/cgbi/screen.png new file mode 100644 index 0000000000..57eca3d463 --- /dev/null +++ b/tests/Images/Input/Png/cgbi/screen.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e9bfbac37a57b71fa27b38b21314bd49dbe1cb19a2eb9d0f272ec7be3b72a33 +size 94834 From 4f85600cb8d0d9e461d6647ab754594cde8fc4b7 Mon Sep 17 00:00:00 2001 From: Erik White <26148654+Erik-White@users.noreply.github.com> Date: Tue, 26 May 2026 15:43:09 +0200 Subject: [PATCH 2/5] Improve ApplyCgbiTransform --- src/ImageSharp/Formats/Png/PngDecoderCore.cs | 31 ++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/ImageSharp/Formats/Png/PngDecoderCore.cs b/src/ImageSharp/Formats/Png/PngDecoderCore.cs index 063c27a9a9..dcd9ffd6c2 100644 --- a/src/ImageSharp/Formats/Png/PngDecoderCore.cs +++ b/src/ImageSharp/Formats/Png/PngDecoderCore.cs @@ -2509,33 +2509,34 @@ private static void ApplyCgbiTransform(Span scanline, PngColorType colorTy { if (colorType == PngColorType.RgbWithAlpha) { - for (int i = 0; i + 3 < scanline.Length; i += 4) + Span pixels = MemoryMarshal.Cast(scanline); + for (int i = 0; i < pixels.Length; i++) { - byte b = scanline[i]; - byte g = scanline[i + 1]; - byte r = scanline[i + 2]; - byte a = scanline[i + 3]; + ref Rgba32 p = ref pixels[i]; + byte r = p.B; + byte g = p.G; + byte b = p.R; + byte a = p.A; - if (a is not 0 and not 255) + if (a is not 0 and not byte.MaxValue) { // Reverse: c' = c * a / 255 => c = round(c' * 255 / a) int 
half = a >> 1; - r = (byte)Math.Min(255, ((r * 255) + half) / a); - g = (byte)Math.Min(255, ((g * 255) + half) / a); - b = (byte)Math.Min(255, ((b * 255) + half) / a); + r = (byte)Math.Min(byte.MaxValue, ((r * byte.MaxValue) + half) / a); + g = (byte)Math.Min(byte.MaxValue, ((g * byte.MaxValue) + half) / a); + b = (byte)Math.Min(byte.MaxValue, ((b * byte.MaxValue) + half) / a); } - scanline[i] = r; - scanline[i + 1] = g; - scanline[i + 2] = b; - scanline[i + 3] = a; + p = new Rgba32(r, g, b, a); } } else if (colorType == PngColorType.Rgb) { - for (int i = 0; i + 2 < scanline.Length; i += 3) + Span pixels = MemoryMarshal.Cast(scanline); + for (int i = 0; i < pixels.Length; i++) { - (scanline[i], scanline[i + 2]) = (scanline[i + 2], scanline[i]); + ref Rgb24 p = ref pixels[i]; + (p.R, p.B) = (p.B, p.R); } } } From 70a49b1d9d7a7086ffa5adb0bfcd057f7867142c Mon Sep 17 00:00:00 2001 From: Erik White <26148654+Erik-White@users.noreply.github.com> Date: Tue, 26 May 2026 15:50:58 +0200 Subject: [PATCH 3/5] Add tests for Identify --- .../Formats/Png/PngDecoderTests.cs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index 88018709ce..c1ba56f3b4 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -728,6 +728,22 @@ public void Decode_AppleCgBI(TestImageProvider provider) image.CompareToReferenceOutput(provider, ImageComparer.Exact); } + [Theory] + [InlineData(TestImages.Png.Cgbi.Colors, 120, 120)] + [InlineData(TestImages.Png.Cgbi.Issue410, 42, 26)] + public void Identify_AppleCgBI(string imagePath, int expectedWidth, int expectedHeight) + { + TestFile testFile = TestFile.Create(imagePath); + using MemoryStream stream = new(testFile.Bytes, false); + + ImageInfo imageInfo = Image.Identify(stream); + + Assert.NotNull(imageInfo); + Assert.Equal(PngFormat.Instance, 
imageInfo.Metadata.DecodedImageFormat); + Assert.Equal(expectedWidth, imageInfo.Width); + Assert.Equal(expectedHeight, imageInfo.Height); + } + [Theory] [WithFile(TestImages.Png.Splash, PixelTypes.Rgba32)] [WithFile(TestImages.Png.Bike, PixelTypes.Rgba32)] From 5d206ec10f9611603e0e8566ac11725bcb5d6c1f Mon Sep 17 00:00:00 2001 From: Erik White <26148654+Erik-White@users.noreply.github.com> Date: Tue, 26 May 2026 17:41:21 +0200 Subject: [PATCH 4/5] Add test data for RGB24 --- tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs | 3 ++- ...4_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png | 3 +++ ...2_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png | 3 --- tests/Images/Input/Png/cgbi/flecks.png | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgb24_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png delete mode 100644 tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index c1ba56f3b4..d3a699c492 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -718,8 +718,8 @@ public void Issue2209_Identify_HasTransparencyIsTrue(string imagePath) [WithFile(TestImages.Png.Cgbi.Issue410, PixelTypes.Rgba32)] [WithFile(TestImages.Png.Cgbi.Colors, PixelTypes.Rgba32)] [WithFile(TestImages.Png.Cgbi.Clocks, PixelTypes.Rgba32)] - [WithFile(TestImages.Png.Cgbi.Flecks, PixelTypes.Rgba32)] [WithFile(TestImages.Png.Cgbi.Screen, PixelTypes.Rgba32)] + [WithFile(TestImages.Png.Cgbi.Flecks, PixelTypes.Rgb24)] public void Decode_AppleCgBI(TestImageProvider provider) where TPixel : unmanaged, IPixel { @@ -731,6 +731,7 @@ public void Decode_AppleCgBI(TestImageProvider provider) [Theory] 
[InlineData(TestImages.Png.Cgbi.Colors, 120, 120)] [InlineData(TestImages.Png.Cgbi.Issue410, 42, 26)] + [InlineData(TestImages.Png.Cgbi.Flecks, 510, 512)] public void Identify_AppleCgBI(string imagePath, int expectedWidth, int expectedHeight) { TestFile testFile = TestFile.Create(imagePath); diff --git a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgb24_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgb24_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png new file mode 100644 index 0000000000..a180f01186 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgb24_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10ee142fc1d3638ebe53fdd21c0a4c53008801befcc5e986051b796561cae887 +size 237676 diff --git a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png b/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png deleted file mode 100644 index 06873607a1..0000000000 --- a/tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91bbb6c87f128920d4384f3be2e85ecd176e49a7a5166c6c6aa584e60d8131ed -size 204053 diff --git a/tests/Images/Input/Png/cgbi/flecks.png b/tests/Images/Input/Png/cgbi/flecks.png index 625c82e458..6400442296 100644 --- a/tests/Images/Input/Png/cgbi/flecks.png +++ b/tests/Images/Input/Png/cgbi/flecks.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:314a5a52996953c0c12862262b4ff3fc191d8b6f2b7bb3853cdcea3267a4142d -size 212163 +oid 
sha256:6be7b478594ba5e4d37bc135c881c0a16cf1c804fece5440bab997c7b69182f1 +size 187703 From a3e9cc6fd4a097e333c014d592747c5a80dd4371 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 27 May 2026 13:34:17 +1000 Subject: [PATCH 5/5] Fix in-place shuffles and vectorize PNG CgBI transform --- .../Common/Helpers/Shuffle/IPad3Shuffle4.cs | 46 ++- .../Common/Helpers/Shuffle/IShuffle3.cs | 16 +- .../Common/Helpers/Shuffle/IShuffle4.cs | 14 +- .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 34 +- .../Common/Helpers/SimdUtils.Shuffle.cs | 14 +- src/ImageSharp/Formats/Png/PngDecoderCore.cs | 300 ++++++++++++++++-- .../Formats/Png/PngDecoderTests.cs | 28 +- 7 files changed, 390 insertions(+), 62 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs index 0f282c7f9a..bbe14b0991 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs @@ -31,19 +31,23 @@ public void Shuffle(ReadOnlySpan source, Span destination) SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0); - Span temp = stackalloc byte[4]; - ref byte t = ref MemoryMarshal.GetReference(temp); - ref uint tu = ref Unsafe.As(ref t); - for (nuint i = 0, j = 0; i < (uint)source.Length; i += 3, j += 4) { - ref byte s = ref Unsafe.Add(ref sBase, i); - tu = Unsafe.As(ref s) | 0xFF000000; - - Unsafe.Add(ref dBase, j + 0) = Unsafe.Add(ref t, p0); - Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1); - Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2); - Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3); + // Expanding 3-byte pixels to 4 bytes can overwrite the next source + // triplet when spans overlap. Assemble the padded pixel first, then + // shuffle from the staged uint. 
+ uint packed = + Unsafe.Add(ref sBase, i + 0u) | + ((uint)Unsafe.Add(ref sBase, i + 1u) << 8) | + ((uint)Unsafe.Add(ref sBase, i + 2u) << 16) | + 0xFF000000; + + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, j + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, j + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, j + 2u) = Unsafe.Add(ref pBase, p2); + Unsafe.Add(ref dBase, j + 3u) = Unsafe.Add(ref pBase, p3); } } } @@ -65,7 +69,12 @@ public void Shuffle(ReadOnlySpan source, Span destination) while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) { - Unsafe.As(ref dBase) = Unsafe.As(ref sBase) | 0xFF000000; + // The fast scalar path reads one extra byte past the source triplet. + // Keep that widened read in a local before writing the expanded pixel + // so overlapping destinations cannot change what was read. + uint packed = Unsafe.As(ref sBase) | 0xFF000000; + + Unsafe.As(ref dBase) = packed; sBase = ref Unsafe.Add(ref sBase, 3); dBase = ref Unsafe.Add(ref dBase, 4); @@ -73,10 +82,15 @@ public void Shuffle(ReadOnlySpan source, Span destination) while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) { - Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0); - Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1); - Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2); - Unsafe.Add(ref dBase, 3) = byte.MaxValue; + // The final triplet cannot use the widened read above, so assemble + // the same padded uint byte-by-byte before the overlapping store. 
+ uint packed = + Unsafe.Add(ref sBase, 0u) | + ((uint)Unsafe.Add(ref sBase, 1u) << 8) | + ((uint)Unsafe.Add(ref sBase, 2u) << 16) | + 0xFF000000; + + Unsafe.As(ref dBase) = packed; sBase = ref Unsafe.Add(ref sBase, 3); dBase = ref Unsafe.Add(ref dBase, 4); diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs index 3c0973ad69..3907df58c6 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs @@ -33,9 +33,19 @@ public void Shuffle(ReadOnlySpan source, Span destination) for (nuint i = 0; i < (uint)source.Length; i += 3) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); + // The scalar remainder can run in-place after the vector body. Load + // the full 3-byte pixel into a register-sized value before stores so + // channel swaps cannot corrupt later reads from the same pixel. 
+ uint packed = + Unsafe.Add(ref sBase, i + 0u) | + ((uint)Unsafe.Add(ref sBase, i + 1u) << 8) | + ((uint)Unsafe.Add(ref sBase, i + 2u) << 16); + + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); } } } diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs index d5c6df2c8b..68f34efd7c 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs @@ -35,10 +35,16 @@ public void Shuffle(ReadOnlySpan source, Span destination) for (nuint i = 0; i < (uint)source.Length; i += 4) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); - Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); + // The generic path may be used with source and destination pointing + // at the same pixel. Load all channels first so subsequent stores + // index only staged bytes, matching the specialized uint shuffles. 
+ uint packed = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); + Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3); } } } diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index 3e7e440664..6134061670 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -33,9 +33,15 @@ public void Shuffle(ReadOnlySpan source, Span destination) for (nuint i = 0, j = 0; i < (uint)destination.Length; i += 3, j += 4) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + j); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j); + // Shrinking 4-byte pixels to 3 bytes can still be called in-place by + // tail code. Read the complete source pixel first, then write only + // the requested channels into the destination triplet. + uint packed = Unsafe.As(ref Unsafe.Add(ref sBase, j)); + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); } } } @@ -61,10 +67,18 @@ public void Shuffle(ReadOnlySpan source, Span destination) while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) { - Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0)); - Unsafe.Add(ref dBase, 1) = Unsafe.As(ref Unsafe.Add(ref sBase, 1)); - Unsafe.Add(ref dBase, 2) = Unsafe.As(ref Unsafe.Add(ref sBase, 2)); - Unsafe.Add(ref dBase, 3) = Unsafe.As(ref Unsafe.Add(ref sBase, 3)); + // Stage the four source pixels before the 3-byte stores. 
Even + // though this path preserves XYZ order, the packed loads must happen + // before destination writes when the spans overlap. + uint packed0 = Unsafe.Add(ref sBase, 0u); + uint packed1 = Unsafe.Add(ref sBase, 1u); + uint packed2 = Unsafe.Add(ref sBase, 2u); + uint packed3 = Unsafe.Add(ref sBase, 3u); + + Unsafe.Add(ref dBase, 0u) = Unsafe.As(ref packed0); + Unsafe.Add(ref dBase, 1u) = Unsafe.As(ref packed1); + Unsafe.Add(ref dBase, 2u) = Unsafe.As(ref packed2); + Unsafe.Add(ref dBase, 3u) = Unsafe.As(ref packed3); sBase = ref Unsafe.Add(ref sBase, 4); dBase = ref Unsafe.Add(ref dBase, 4); @@ -72,7 +86,11 @@ public void Shuffle(ReadOnlySpan source, Span destination) while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) { - Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0)); + // Same overlap rule as the unrolled loop: take the 4-byte source + // pixel before storing the 3-byte destination value. + uint packed = Unsafe.Add(ref sBase, 0u); + + Unsafe.Add(ref dBase, 0u) = Unsafe.As(ref packed); sBase = ref Unsafe.Add(ref sBase, 1); dBase = ref Unsafe.Add(ref dBase, 1); diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index dbeb54a80c..8b2baec213 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -3,6 +3,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -150,10 +151,15 @@ private static void Shuffle4Remainder( for (nuint i = 0; i < (uint)source.Length; i += 4) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); - Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); + // Stage the scalar tail in a local Vector4 so p0..p3 index source + // 
values that were captured before any overlapping destination writes. + Vector4 v = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + ref float pBase = ref Unsafe.As(ref v); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); + Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3); } } diff --git a/src/ImageSharp/Formats/Png/PngDecoderCore.cs b/src/ImageSharp/Formats/Png/PngDecoderCore.cs index dcd9ffd6c2..84245254a2 100644 --- a/src/ImageSharp/Formats/Png/PngDecoderCore.cs +++ b/src/ImageSharp/Formats/Png/PngDecoderCore.cs @@ -9,6 +9,8 @@ using System.IO.Hashing; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Compression.Zlib; @@ -900,7 +902,7 @@ private void DecodePixelDataCore( if (this.isCgbi) { - ApplyCgbiTransform(scanSpan[1..], this.pngColorType); + this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType); } this.ProcessDefilteredScanline(frameControl, currentRow, scanSpan, imageFrame, pngMetadata, blendRowBuffer); @@ -1035,7 +1037,7 @@ private void DecodeInterlacedPixelDataCore( if (this.isCgbi) { - ApplyCgbiTransform(scanSpan[1..], this.pngColorType); + this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType); } Span rowSpan = imageBuffer.DangerousGetRowSpan(currentRow); @@ -2505,39 +2507,289 @@ private void SwapScanlineBuffers() /// /// The defiltered pixel bytes (without the leading filter byte). /// The PNG color type from IHDR. 
- private static void ApplyCgbiTransform(Span scanline, PngColorType colorType) + private void ApplyCgbiTransform(Span scanline, PngColorType colorType) { if (colorType == PngColorType.RgbWithAlpha) { Span pixels = MemoryMarshal.Cast(scanline); - for (int i = 0; i < pixels.Length; i++) + int i = 0; + + if (Vector512.IsHardwareAccelerated && pixels.Length >= 16) { - ref Rgba32 p = ref pixels[i]; - byte r = p.B; - byte g = p.G; - byte b = p.R; - byte a = p.A; + i = ApplyCgbiTransformVector512(scanline, pixels.Length); + } - if (a is not 0 and not byte.MaxValue) - { - // Reverse: c' = c * a / 255 => c = round(c' * 255 / a) - int half = a >> 1; - r = (byte)Math.Min(byte.MaxValue, ((r * byte.MaxValue) + half) / a); - g = (byte)Math.Min(byte.MaxValue, ((g * byte.MaxValue) + half) / a); - b = (byte)Math.Min(byte.MaxValue, ((b * byte.MaxValue) + half) / a); - } + if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && (pixels.Length - i) >= 8) + { + i = ApplyCgbiTransformVector256(scanline, i, pixels.Length); + } - p = new Rgba32(r, g, b, a); + if (Vector128.IsHardwareAccelerated && (pixels.Length - i) >= 4) + { + i = ApplyCgbiTransformVector128(scanline, i, pixels.Length); + } + + for (; i < pixels.Length; i++) + { + ref Rgba32 pixel = ref pixels[i]; + pixel = new Rgba32(pixel.B, pixel.G, pixel.R, pixel.A); + UndoCgbiPremultiplicationScalar(ref pixel); } } else if (colorType == PngColorType.Rgb) { - Span pixels = MemoryMarshal.Cast(scanline); - for (int i = 0; i < pixels.Length; i++) - { - ref Rgb24 p = ref pixels[i]; - (p.R, p.B) = (p.B, p.R); - } + // No alpha channel, so just swap R and B using built-in SIMD-optimized pixel operations.
+ Span target = MemoryMarshal.Cast(scanline); + PixelOperations.Instance.FromBgr24Bytes(this.configuration, scanline, target, target.Length); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void UndoCgbiPremultiplicationScalar(ref Rgba32 pixel) + { + byte a = pixel.A; + if (a is 0 or byte.MaxValue) + { + return; + } + + // Reverse: c' = c * a / 255 => c = round(c' * 255 / a) + int half = a >> 1; + byte r = (byte)Math.Min(byte.MaxValue, ((pixel.R * byte.MaxValue) + half) / a); + byte g = (byte)Math.Min(byte.MaxValue, ((pixel.G * byte.MaxValue) + half) / a); + byte b = (byte)Math.Min(byte.MaxValue, ((pixel.B * byte.MaxValue) + half) / a); + pixel = new Rgba32(r, g, b, a); + } + + private static int ApplyCgbiTransformVector512(Span scanline, int pixelCount) + { + ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); + int i = 0; + + Span temp = stackalloc byte[Vector512.Count]; + SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); + + // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting + // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place. + // The generated mask only swaps bytes inside each pixel, so it remains + // correct for the optimized 512-bit byte shuffle helper. 
+ Vector512 shuffleMask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); + + Vector512 zero = Vector512.Zero; + Vector512 one = Vector512.One; + Vector512 byteMask = Vector512.Create(0xFF); + Vector512 opaque = Vector512.Create(0xFF); + Vector512 byteMax = Vector512.Create((int)byte.MaxValue); + + for (; i <= pixelCount - 16; i += 16) + { + ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf()); + Vector512 bgra = Unsafe.ReadUnaligned>(ref blockRef); + Vector512 rgba = Vector512_.ShuffleNative(bgra, shuffleMask); + Vector512 packed = rgba.AsInt32(); + Vector512 alpha = Vector512.ShiftRightLogical(packed, 24); + + // Fully transparent and fully opaque pixels are identity cases for + // unpremultiplication. Masking them keeps the scalar behavior and lets + // safeAlpha avoid dividing by zero for alpha == 0. + Vector512 partialMask = ~(Vector512.Equals(alpha, zero) | Vector512.Equals(alpha, opaque)); + + Vector512 r = packed & byteMask; + Vector512 g = Vector512.ShiftRightLogical(packed, 8) & byteMask; + Vector512 b = Vector512.ShiftRightLogical(packed, 16) & byteMask; + + Vector512 safeAlpha = Vector512.ConditionalSelect(partialMask, alpha, one); + Vector512 halfAlpha = Vector512.ShiftRightLogical(safeAlpha, 1); + Vector512 safeAlphaF = Vector512.ConvertToSingle(safeAlpha); + + // The scalar path computes ((c * 255) + (a >> 1)) / a with integer + // division. ConvertToInt32 already truncates toward zero, so Floor on + // this non-negative quotient just makes the round-down step explicit.
+ Vector512 unpremultipliedR = Vector512.Min( + byteMax, + Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); + + Vector512 unpremultipliedG = Vector512.Min( + byteMax, + Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); + + Vector512 unpremultipliedB = Vector512.Min( + byteMax, + Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); + + // ConditionalSelect applies the expensive unpremultiply only to pixels + // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the + // shuffled channel values exactly as the scalar path does. + Vector512 finalR = Vector512.ConditionalSelect(partialMask, unpremultipliedR, r); + Vector512 finalG = Vector512.ConditionalSelect(partialMask, unpremultipliedG, g); + Vector512 finalB = Vector512.ConditionalSelect(partialMask, unpremultipliedB, b); + + // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so + // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3 + // recreates the in-memory RGBA bytes for the unaligned store. + Vector512 result = + finalR | + Vector512.ShiftLeft(finalG, 8) | + Vector512.ShiftLeft(finalB, 16) | + Vector512.ShiftLeft(alpha, 24); + + Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); + } + + return i; + } + + private static int ApplyCgbiTransformVector256(Span scanline, int startPixel, int pixelCount) + { + ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); + int i = startPixel; + + Span temp = stackalloc byte[Vector512.Count]; + SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); + + // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting + // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place. 
+ // Avx2.Shuffle is 128-bit lane-local, and the generated mask repeats inside + // each lane, so no byte ever needs to cross the lane boundary. + Vector256 shuffleMask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); + + Vector256 zero = Vector256.Zero; + Vector256 one = Vector256.One; + Vector256 byteMask = Vector256.Create(0xFF); + Vector256 opaque = Vector256.Create(0xFF); + Vector256 byteMax = Vector256.Create((int)byte.MaxValue); + + for (; i <= pixelCount - 8; i += 8) + { + ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf()); + Vector256 bgra = Unsafe.ReadUnaligned>(ref blockRef); + Vector256 rgba = Vector256_.ShufflePerLane(bgra, shuffleMask); + Vector256 packed = rgba.AsInt32(); + Vector256 alpha = Vector256.ShiftRightLogical(packed, 24); + + // Fully transparent and fully opaque pixels are identity cases for + // unpremultiplication. Masking them keeps the scalar behavior and lets + // safeAlpha avoid dividing by zero for alpha == 0. + Vector256 partialMask = ~(Vector256.Equals(alpha, zero) | Vector256.Equals(alpha, opaque)); + + Vector256 r = packed & byteMask; + Vector256 g = Vector256.ShiftRightLogical(packed, 8) & byteMask; + Vector256 b = Vector256.ShiftRightLogical(packed, 16) & byteMask; + + Vector256 safeAlpha = Vector256.ConditionalSelect(partialMask, alpha, one); + Vector256 halfAlpha = Vector256.ShiftRightLogical(safeAlpha, 1); + Vector256 safeAlphaF = Vector256.ConvertToSingle(safeAlpha); + + // The scalar path computes ((c * 255) + (a >> 1)) / a with integer + // division. ConvertToInt32 already truncates toward zero, so Floor on + // this non-negative quotient just makes the round-down step explicit.
+ Vector256 unpremultipliedR = Vector256.Min( + byteMax, + Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); + + Vector256 unpremultipliedG = Vector256.Min( + byteMax, + Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); + + Vector256 unpremultipliedB = Vector256.Min( + byteMax, + Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); + + // ConditionalSelect applies the expensive unpremultiply only to pixels + // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the + // shuffled channel values exactly as the scalar path does. + Vector256 finalR = Vector256.ConditionalSelect(partialMask, unpremultipliedR, r); + Vector256 finalG = Vector256.ConditionalSelect(partialMask, unpremultipliedG, g); + Vector256 finalB = Vector256.ConditionalSelect(partialMask, unpremultipliedB, b); + + // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so + // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3 + // recreates the in-memory RGBA bytes for the unaligned store. + Vector256 result = + finalR | + Vector256.ShiftLeft(finalG, 8) | + Vector256.ShiftLeft(finalB, 16) | + Vector256.ShiftLeft(alpha, 24); + + Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); + } + + return i; + } + + private static int ApplyCgbiTransformVector128(Span scanline, int startPixel, int pixelCount) + { + ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); + int i = startPixel; + + Span temp = stackalloc byte[Vector512.Count]; + SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); + + // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting + // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place. 
+ Vector128 shuffleMask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); + + Vector128 zero = Vector128.Zero; + Vector128 one = Vector128.One; + Vector128 byteMask = Vector128.Create(0xFF); + Vector128 opaque = Vector128.Create(0xFF); + Vector128 byteMax = Vector128.Create((int)byte.MaxValue); + + for (; i <= pixelCount - 4; i += 4) + { + ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf()); + Vector128 bgra = Unsafe.ReadUnaligned>(ref blockRef); + Vector128 rgba = Vector128_.ShuffleNative(bgra, shuffleMask); + Vector128 packed = rgba.AsInt32(); + Vector128 alpha = Vector128.ShiftRightLogical(packed, 24); + + // Fully transparent and fully opaque pixels are identity cases for + // unpremultiplication. Masking them keeps the scalar behavior and lets + // safeAlpha avoid dividing by zero for alpha == 0. + Vector128 partialMask = ~(Vector128.Equals(alpha, zero) | Vector128.Equals(alpha, opaque)); + + Vector128 r = packed & byteMask; + Vector128 g = Vector128.ShiftRightLogical(packed, 8) & byteMask; + Vector128 b = Vector128.ShiftRightLogical(packed, 16) & byteMask; + + Vector128 safeAlpha = Vector128.ConditionalSelect(partialMask, alpha, one); + Vector128 halfAlpha = Vector128.ShiftRightLogical(safeAlpha, 1); + Vector128 safeAlphaF = Vector128.ConvertToSingle(safeAlpha); + + // The scalar path computes ((c * 255) + (a >> 1)) / a with integer + // division. ConvertToInt32 already truncates toward zero, so Floor on + // this non-negative quotient just makes the round-down step explicit.
+ Vector128 unpremultipliedR = Vector128.Min( + byteMax, + Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); + + Vector128 unpremultipliedG = Vector128.Min( + byteMax, + Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); + + Vector128 unpremultipliedB = Vector128.Min( + byteMax, + Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); + + // ConditionalSelect applies the expensive unpremultiply only to pixels + // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the + // shuffled channel values exactly as the scalar path does. + Vector128 finalR = Vector128.ConditionalSelect(partialMask, unpremultipliedR, r); + Vector128 finalG = Vector128.ConditionalSelect(partialMask, unpremultipliedG, g); + Vector128 finalB = Vector128.ConditionalSelect(partialMask, unpremultipliedB, b); + + // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so + // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3 + // recreates the in-memory RGBA bytes for the unaligned store. 
+ Vector128 result = + finalR | + Vector128.ShiftLeft(finalG, 8) | + Vector128.ShiftLeft(finalB, 16) | + Vector128.ShiftLeft(alpha, 24); + + Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); + } + + return i; + } } diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index d3a699c492..2e452b896d 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -722,10 +722,32 @@ public void Issue2209_Identify_HasTransparencyIsTrue(string imagePath) [WithFile(TestImages.Png.Cgbi.Flecks, PixelTypes.Rgb24)] public void Decode_AppleCgBI(TestImageProvider provider) where TPixel : unmanaged, IPixel + => FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunDecodeAppleCgbi, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableHWIntrinsic, + provider, + provider.PixelType.ToString()); + + private static void RunDecodeAppleCgbi(string providerDump, string pixelType) { - using Image image = provider.GetImage(PngDecoder.Instance); - image.DebugSave(provider); - image.CompareToReferenceOutput(provider, ImageComparer.Exact); + if (Enum.Parse(pixelType) == PixelTypes.Rgb24) + { + TestImageProvider provider = + FeatureTestRunner.DeserializeForXunit>(providerDump); + + using Image image = provider.GetImage(PngDecoder.Instance); + image.DebugSave(provider); + image.CompareToReferenceOutput(provider, ImageComparer.Exact); + + return; + } + + TestImageProvider rgbaProvider = + FeatureTestRunner.DeserializeForXunit>(providerDump); + + using Image rgbaImage = rgbaProvider.GetImage(PngDecoder.Instance); + rgbaImage.DebugSave(rgbaProvider); + rgbaImage.CompareToReferenceOutput(rgbaProvider, ImageComparer.Exact); } [Theory]