-
Notifications
You must be signed in to change notification settings - Fork 267
Fix DiffID computation to use uncompressed layer digest #587
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
fc269d0
caaab25
e673719
7dfaef6
55a16d7
50b1194
13e26ba
b02f4fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,10 +14,12 @@ | |
| // limitations under the License. | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| import Compression | ||
| import ContainerizationError | ||
| import Crypto | ||
| import Foundation | ||
| import NIOCore | ||
| import zlib | ||
|
|
||
| /// Provides a context to write data into a directory. | ||
| public class ContentWriter { | ||
|
|
@@ -60,6 +62,175 @@ public class ContentWriter { | |
| return try self.write(data) | ||
| } | ||
|
|
||
    /// Computes the SHA256 digest of the uncompressed content of a gzip file.
    ///
    /// Per the OCI Image Specification, a DiffID is the SHA256 digest of the
    /// uncompressed layer content. This method streams the compressed file in
    /// chunks, decompresses through Apple's Compression framework, and feeds
    /// each decompressed chunk into an incremental SHA256 hasher. Neither the
    /// full compressed nor the full decompressed data is held in memory.
    ///
    /// NOTE(review): this assumes a single-member gzip file — only one deflate
    /// stream is decoded and the file's final 8 bytes are treated as that
    /// stream's trailer. Concatenated gzip members are not supported.
    ///
    /// - Parameter url: The URL of the gzip-compressed file.
    /// - Returns: The SHA256 digest of the uncompressed content.
    /// - Throws: `ContentWriterError.invalidGzip` for a malformed header,
    ///   `.decompressionFailed` if the Compression framework reports an error,
    ///   and `.gzipTrailerMismatch` when the trailer's CRC32/ISIZE disagree
    ///   with the decompressed bytes (or the file is too short for a trailer).
    public static func diffID(of url: URL) throws -> SHA256.Digest {
        let fileHandle = try FileHandle(forReadingFrom: url)
        defer { fileHandle.closeFile() }

        // Read just enough to parse the gzip header (initial 512 bytes is plenty).
        // Headers that do not fit in this window (e.g. a very long FNAME) are
        // rejected by gzipHeaderSize with invalidGzip.
        let headerReadSize = 512
        guard let headerData = Self.readExactly(fileHandle: fileHandle, count: headerReadSize), !headerData.isEmpty else {
            throw ContentWriterError.invalidGzip
        }
        let headerSize = try Self.gzipHeaderSize(headerData)

        // Read the gzip trailer (last 8 bytes) to validate CRC32 + ISIZE later.
        // Seek to the end to get the file size, then read the trailer.
        fileHandle.seekToEndOfFile()
        let fileSize = fileHandle.offsetInFile
        guard fileSize >= 8 else {
            throw ContentWriterError.gzipTrailerMismatch
        }
        fileHandle.seek(toFileOffset: fileSize - 8)
        guard let trailerData = Self.readExactly(fileHandle: fileHandle, count: 8) else {
            throw ContentWriterError.gzipTrailerMismatch
        }
        // Both trailer fields are little-endian per RFC 1952: CRC32 of the
        // uncompressed data, then ISIZE (uncompressed size mod 2^32).
        let expectedCRC = UInt32(trailerData[trailerData.startIndex])
            | (UInt32(trailerData[trailerData.startIndex + 1]) << 8)
            | (UInt32(trailerData[trailerData.startIndex + 2]) << 16)
            | (UInt32(trailerData[trailerData.startIndex + 3]) << 24)
        let expectedSize = UInt32(trailerData[trailerData.startIndex + 4])
            | (UInt32(trailerData[trailerData.startIndex + 5]) << 8)
            | (UInt32(trailerData[trailerData.startIndex + 6]) << 16)
            | (UInt32(trailerData[trailerData.startIndex + 7]) << 24)

        // Seek past the gzip header to the start of the deflate stream.
        // The deflate data spans from headerSize to fileSize - 8 (the last 8 bytes
        // are the gzip trailer: CRC32 + ISIZE). We must not feed the trailer to
        // the decompressor.
        fileHandle.seek(toFileOffset: UInt64(headerSize))
        var compressedBytesRemaining = Int(fileSize) - headerSize - 8
        guard compressedBytesRemaining >= 0 else {
            throw ContentWriterError.invalidGzip
        }

        // Set up the decompression stream. COMPRESSION_ZLIB decodes the raw
        // deflate payload that sits between the gzip header and trailer.
        let chunkSize = 65_536
        let sourceBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: chunkSize)
        let destinationBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: chunkSize)
        defer {
            sourceBuffer.deallocate()
            destinationBuffer.deallocate()
        }

        let stream = UnsafeMutablePointer<compression_stream>.allocate(capacity: 1)
        defer { stream.deallocate() }

        var status = compression_stream_init(stream, COMPRESSION_STREAM_DECODE, COMPRESSION_ZLIB)
        guard status != COMPRESSION_STATUS_ERROR else {
            throw ContentWriterError.decompressionFailed
        }
        defer { compression_stream_destroy(stream) }

        // Start with an empty source; we fill it from the file below.
        stream.pointee.src_ptr = UnsafePointer(sourceBuffer)
        stream.pointee.src_size = 0
        stream.pointee.dst_ptr = destinationBuffer
        stream.pointee.dst_size = chunkSize

        var hasher = SHA256()
        // crc32(0, nil, 0) yields zlib's required initial CRC32 seed value.
        var runningCRC: uLong = crc32(0, nil, 0)
        var totalDecompressedSize: UInt64 = 0
        var inputExhausted = false

        while status != COMPRESSION_STATUS_END {
            // Refill the source buffer when it is exhausted and more data is available.
            if stream.pointee.src_size == 0 && !inputExhausted {
                let toRead = min(chunkSize, compressedBytesRemaining)
                if toRead > 0, let chunk = fileHandle.readData(ofLength: toRead) as Data?, !chunk.isEmpty {
                    compressedBytesRemaining -= chunk.count
                    chunk.copyBytes(to: sourceBuffer, count: chunk.count)
                    stream.pointee.src_ptr = UnsafePointer(sourceBuffer)
                    stream.pointee.src_size = chunk.count
                } else {
                    // Nothing left before the trailer (or an unexpected empty
                    // read): ask the decoder to finalize on the next call.
                    inputExhausted = true
                }
            }

            // Reset the destination window each iteration; the decoder advances
            // dst_ptr/dst_size as it writes output.
            stream.pointee.dst_ptr = destinationBuffer
            stream.pointee.dst_size = chunkSize

            let flags: Int32 = inputExhausted ? Int32(COMPRESSION_STREAM_FINALIZE.rawValue) : 0
            status = compression_stream_process(stream, flags)

            switch status {
            case COMPRESSION_STATUS_OK, COMPRESSION_STATUS_END:
                // Feed whatever was produced this round to the hash and CRC.
                let produced = chunkSize - stream.pointee.dst_size
                if produced > 0 {
                    let buf = UnsafeBufferPointer(start: destinationBuffer, count: produced)
                    hasher.update(bufferPointer: UnsafeRawBufferPointer(buf))
                    runningCRC = crc32(runningCRC, destinationBuffer, uInt(produced))
                    totalDecompressedSize += UInt64(produced)
                }

            default:
                // COMPRESSION_STATUS_ERROR: corrupt or truncated deflate data.
                throw ContentWriterError.decompressionFailed
            }
        }

        // Validate the gzip trailer. ISIZE is the uncompressed size mod 2^32
        // per RFC 1952, hence the truncating conversions on both values.
        let actualCRC = UInt32(truncatingIfNeeded: runningCRC)
        let actualSize = UInt32(truncatingIfNeeded: totalDecompressedSize)

        guard expectedCRC == actualCRC, expectedSize == actualSize else {
            throw ContentWriterError.gzipTrailerMismatch
        }

        return hasher.finalize()
    }
|
|
||
| /// Reads exactly `count` bytes from a FileHandle, returning nil on failure. | ||
| private static func readExactly(fileHandle: FileHandle, count: Int) -> Data? { | ||
| let data = fileHandle.readData(ofLength: count) | ||
| return data.isEmpty ? nil : data | ||
| } | ||
|
|
||
| /// Parses the gzip header to determine where the raw deflate stream begins. | ||
| private static func gzipHeaderSize(_ data: Data) throws -> Int { | ||
| guard data.count >= 10, | ||
| data[data.startIndex] == 0x1f, | ||
| data[data.startIndex + 1] == 0x8b, | ||
| data[data.startIndex + 2] == 0x08 // CM must be 8 (deflate) per RFC 1952 | ||
| else { | ||
| throw ContentWriterError.invalidGzip | ||
| } | ||
|
|
||
| let start = data.startIndex | ||
| let flags = data[start + 3] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question: what's the reason the current changes skipped compression method (CM) (ref https://datatracker.ietf.org/doc/html/rfc1952#page-5) entirely.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch — the header parser was only checking the magic bytes (1f 8b) but not the compression method. I've added a guard for CM == 0x08 (deflate), which is the only method defined by RFC 1952. Anything else will now throw invalidGzip. |
||
| var offset = 10 | ||
|
|
||
| // FEXTRA | ||
| if flags & 0x04 != 0 { | ||
| guard data.count >= offset + 2 else { throw ContentWriterError.invalidGzip } | ||
| let extraLen = Int(data[start + offset]) | (Int(data[start + offset + 1]) << 8) | ||
| offset += 2 + extraLen | ||
| } | ||
| // FNAME | ||
| if flags & 0x08 != 0 { | ||
| while offset < data.count && data[start + offset] != 0 { offset += 1 } | ||
| offset += 1 | ||
| } | ||
| // FCOMMENT | ||
| if flags & 0x10 != 0 { | ||
| while offset < data.count && data[start + offset] != 0 { offset += 1 } | ||
| offset += 1 | ||
| } | ||
| // FHCRC | ||
| if flags & 0x02 != 0 { offset += 2 } | ||
|
|
||
| guard offset < data.count else { throw ContentWriterError.invalidGzip } | ||
| return offset | ||
| } | ||
|
|
||
| /// Encodes the passed in type as a JSON blob and writes it to the base path. | ||
| /// - Parameters: | ||
| /// - content: The type to convert to JSON. | ||
|
|
@@ -69,3 +240,9 @@ public class ContentWriter { | |
| return try self.write(data) | ||
| } | ||
| } | ||
|
|
||
/// Errors thrown while computing a layer DiffID from a gzip-compressed file.
enum ContentWriterError: Error {
    /// The file is not a valid gzip stream: bad magic bytes, a compression
    /// method other than deflate (0x08), or a malformed/oversized header.
    case invalidGzip
    /// Apple's Compression framework reported an error while inflating the
    /// deflate payload (corrupt or truncated compressed data).
    case decompressionFailed
    /// The CRC32 or ISIZE recorded in the gzip trailer does not match the
    /// decompressed content, or the file is too short to contain a trailer.
    case gzipTrailerMismatch
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggestion: Just to be on the safer side, we probably need a check before this line.