From a7728402ac05aa09641c733d650a15beb1eb4642 Mon Sep 17 00:00:00 2001 From: GitHub Copilot Agent Date: Wed, 18 Feb 2026 09:19:18 +0100 Subject: [PATCH 1/4] fix(filetypedetection): office- und archivdetektion fail-closed haerten --- .../Detection/FileTypeRegistry.vb | 37 ++- src/FileTypeDetection/FileTypeDetector.vb | 118 +++++++- .../Infrastructure/CoreInternals.vb | 285 +++++++++++++++++- .../Unit/ArchiveExtractionUnitTests.cs | 51 ++++ ...tionDetailAndArchiveValidationUnitTests.cs | 64 +++- .../Unit/EndToEndFailClosedMatrixUnitTests.cs | 210 +++++++++++++ .../Unit/ExtensionCheckUnitTests.cs | 138 ++++++++- .../FileTypeDetectorPrivateBranchUnitTests.cs | 24 ++ .../LegacyOfficeBinaryRefinerUnitTests.cs | 56 ++++ .../Unit/OpenXmlRefinerUnitTests.cs | 123 +++++++- 10 files changed, 1086 insertions(+), 20 deletions(-) create mode 100644 tests/FileTypeDetectionLib.Tests/Unit/EndToEndFailClosedMatrixUnitTests.cs create mode 100644 tests/FileTypeDetectionLib.Tests/Unit/LegacyOfficeBinaryRefinerUnitTests.cs diff --git a/src/FileTypeDetection/Detection/FileTypeRegistry.vb b/src/FileTypeDetection/Detection/FileTypeRegistry.vb index ea2d5523..cc606759 100644 --- a/src/FileTypeDetection/Detection/FileTypeRegistry.vb +++ b/src/FileTypeDetection/Detection/FileTypeRegistry.vb @@ -49,7 +49,42 @@ Namespace Global.Tomtastisch.FileClassifier "xz", "7z", "zz", - "rar")) + "rar")), + New KeyValuePair(Of FileKind, ImmutableArray(Of String))(FileKind.Docx, + ImmutableArray. + Create("doc", + "docm", + "docb", + "dot", + "dotm", + "dotx", + "odt", + "ott")), + New KeyValuePair(Of FileKind, ImmutableArray(Of String))(FileKind.Xlsx, + ImmutableArray. + Create("xls", + "xlsm", + "xlsb", + "xlt", + "xltm", + "xltx", + "xltb", + "xlam", + "xla", + "ods", + "ots")), + New KeyValuePair(Of FileKind, ImmutableArray(Of String))(FileKind.Pptx, + ImmutableArray. + Create("ppt", + "pptm", + "pot", + "potm", + "potx", + "pps", + "ppsm", + "ppsx", + "odp", + "otp")) }) Private Shared ReadOnly _ diff --git a/src/FileTypeDetection/FileTypeDetector.vb b/src/FileTypeDetection/FileTypeDetector.vb index d60a696b..e08ad769 100644 --- a/src/FileTypeDetection/FileTypeDetector.vb +++ b/src/FileTypeDetection/FileTypeDetector.vb @@ -61,6 +61,7 @@ Namespace Global.Tomtastisch.FileClassifier Private Const ReasonArchiveStructuredRefined As String = "ArchiveStructuredRefined" Private Const ReasonArchiveRefined As String = "ArchiveRefined" Private Const ReasonArchiveGeneric As String = "ArchiveGeneric" + Private Const ReasonOfficeBinaryRefined As String = "OfficeBinaryRefined" ''' ''' Setzt globale Default-Optionen als Snapshot. @@ -276,7 +277,7 @@ Namespace Global.Tomtastisch.FileClassifier End Function ''' - ''' Prüft fail-closed, ob eine Datei einen sicheren Archiv-Container repräsentiert. + ''' Prüft fail-closed, ob eine Datei ein sicheres, extrahierbares Archiv repräsentiert. ''' ''' ''' @@ -290,7 +291,7 @@ Namespace Global.Tomtastisch.FileClassifier ''' ''' ''' Dateipfad der zu validierenden Archivdatei. - ''' True, wenn der Container valide und sicher ist; andernfalls False. + ''' True, wenn Typprüfung und Safety-Gate für ein extrahierbares Archiv bestehen; sonst False. Public Shared Function TryValidateArchive _ ( path As String @@ -298,9 +299,12 @@ Namespace Global.Tomtastisch.FileClassifier Dim opt As FileTypeProjectOptions = GetDefaultOptions() Dim descriptor As ArchiveDescriptor = ArchiveDescriptor.UnknownDescriptor() + Dim detected As FileType ' Guard-Clauses: Pfad und Dateiexistenz. If String.IsNullOrWhiteSpace(path) OrElse Not File.Exists(path) Then Return False + detected = DetectPathCore(path) + If Not IsArchiveContainerKind(detected.Kind) Then Return False Try ' describe -> safety gate. @@ -325,6 +329,11 @@ Namespace Global.Tomtastisch.FileClassifier End Try End Function + ''' + ''' Führt die Pfad-Detektion ohne Endungs-Policy aus und liefert nur das inhaltsbasierte Ergebnis. + ''' + ''' Pfad zur zu erkennenden Datei. + ''' Erkannter Typ oder . Private Shared Function DetectPathCore( path As String ) As FileType @@ -334,6 +343,17 @@ Namespace Global.Tomtastisch.FileClassifier Return DetectPathCoreWithTrace(path, opt, trace) End Function + ''' + ''' Kernpfad für inhaltsbasierte Dateityperkennung inklusive Trace-Erfassung. + ''' + ''' + ''' Die Funktion kapselt File-Guards, Header-Lesen und die zentrale Header-/Archiv-Auflösung. + ''' Fehlerpfade bleiben fail-closed und setzen den passenden Reason-Code. + ''' + ''' Dateipfad der Quelldatei. + ''' Options-Snapshot für Größen- und Sicherheitsgrenzen. + ''' Rückkanal für auditierbare Entscheidungsinformationen. + ''' Erkannter Typ oder . Private Shared Function DetectPathCoreWithTrace( path As String, opt As FileTypeProjectOptions, @@ -439,7 +459,7 @@ Namespace Global.Tomtastisch.FileClassifier ''' ''' Pfad zur Archivdatei. ''' Leeres, noch nicht existierendes Zielverzeichnis. - ''' True aktiviert eine vorgelagerte Typprüfung über Detect(path). + ''' True aktiviert zusätzlich eine vollständige Vorvalidierung über . ''' True bei erfolgreichem, atomarem Entpacken; sonst False. Public Function ExtractArchiveSafe _ ( @@ -491,7 +511,7 @@ Namespace Global.Tomtastisch.FileClassifier ''' ''' ''' Pfad zur Archivdatei. - ''' True aktiviert eine vorgelagerte Typprüfung über Detect(path). + ''' True aktiviert zusätzlich eine vollständige Vorvalidierung über . ''' Read-only Liste extrahierter Einträge oder leer bei Fehler. Public Function ExtractArchiveSafeToMemory _ ( @@ -529,6 +549,12 @@ Namespace Global.Tomtastisch.FileClassifier End Try End Function + ''' + ''' Byte-basierte Detektion mit denselben Sicherheitsgrenzen wie die Pfadvariante. + ''' + ''' Zu detektierende Nutzdaten. + ''' Options-Snapshot mit Maximalgrenzen. + ''' Erkannter Typ oder . Private Shared Function DetectInternalBytes( data As Byte(), opt As FileTypeProjectOptions @@ -579,7 +605,11 @@ Namespace Global.Tomtastisch.FileClassifier End Function, tryRefine:=Function() Return OpenXmlRefiner.TryRefineStream(fs) - End Function) + End Function, + tryRefineLegacyOffice:=Function() + Return LegacyOfficeBinaryRefiner.TryRefineStream( + fs, ResolveLegacyOfficeProbeBytes(opt)) + End Function) End Function ''' @@ -606,20 +636,43 @@ Namespace Global.Tomtastisch.FileClassifier Using ms = CreateReadOnlyMemoryStream(data) Return OpenXmlRefiner.TryRefineStream(ms) End Using - End Function) + End Function, + tryRefineLegacyOffice:=Function() + Return LegacyOfficeBinaryRefiner.TryRefineBytes(data) + End Function) End Function + ''' + ''' Zentraler Entscheidungsfluss für Header-/Archiv-/Refinement-Pfade. + ''' + ''' + ''' Reihenfolge: + ''' 1) direkte Header-Matches, + ''' 2) Legacy-Office-Refinement (OLE), + ''' 3) Archiv-Beschreibung + Safety-Gate, + ''' 4) optionales strukturiertes ZIP-Refinement. + ''' + ''' Gelesene Header-Bytes. + ''' Options-Snapshot. + ''' Audit-Trace. + ''' Archiv-Descriptor-Factory. + ''' Archiv-Safety-Validator. + ''' ZIP-basiertes Office/OpenDocument-Refinement. + ''' OLE-basiertes Legacy-Office-Refinement. + ''' Erkannter Dateityp oder . Private Shared Function ResolveByHeaderCommon( header As Byte(), opt As FileTypeProjectOptions, ByRef trace As DetectionTrace, tryDescribe As Func(Of ArchiveDescriptor), tryValidate As Func(Of ArchiveDescriptor, Boolean), - tryRefine As Func(Of FileType) + tryRefine As Func(Of FileType), + tryRefineLegacyOffice As Func(Of FileType) ) As FileType Dim magicKind As FileKind Dim descriptor As ArchiveDescriptor + Dim legacyOfficeType As FileType If header Is Nothing OrElse header.Length = 0 Then trace.ReasonCode = ReasonHeaderUnknown @@ -632,6 +685,14 @@ Namespace Global.Tomtastisch.FileClassifier Return FileTypeRegistry.Resolve(magicKind) End If + If magicKind = FileKind.Unknown AndAlso LegacyOfficeBinaryRefiner.IsOleCompoundHeader(header) Then + legacyOfficeType = tryRefineLegacyOffice() + If legacyOfficeType.Kind <> FileKind.Unknown Then + trace.ReasonCode = ReasonOfficeBinaryRefined + Return legacyOfficeType + End If + End If + If magicKind = FileKind.Zip Then descriptor = ArchiveDescriptor.ForContainerType(ArchiveContainerType.Zip) Else @@ -652,6 +713,21 @@ Namespace Global.Tomtastisch.FileClassifier Return ResolveAfterArchiveGate(magicKind, opt, trace, tryRefine) End Function + ''' + ''' Ermittelt die maximale Probegröße für Legacy-OLE-Refinement. + ''' + ''' Options-Snapshot oder Nothing. + ''' Probegröße in Byte, defensiv begrenzt auf 1 MiB. + Private Shared Function ResolveLegacyOfficeProbeBytes(opt As FileTypeProjectOptions) As Integer + Dim maxProbe As Long + + If opt Is Nothing Then Return 1048576 + + maxProbe = Math.Min(opt.MaxBytes, 1048576L) + If maxProbe <= 0 Then Return 1048576 + Return CInt(maxProbe) + End Function + Private Shared Function TryDescribeArchiveStreamDescriptor( fs As FileStream, opt As FileTypeProjectOptions @@ -745,10 +821,15 @@ Namespace Global.Tomtastisch.FileClassifier Return False End If + detected = Detect(path) + If Not IsArchiveContainerKind(detected.Kind) Then + LogGuard.Warn(opt.Logger, $"[ArchiveExtract] Kein extrahierbarer Archivtyp ({detected.Kind}).") + Return False + End If + If verifyBeforeExtract Then - detected = Detect(path) - If Not IsArchiveContainerKind(detected.Kind) Then - LogGuard.Warn(opt.Logger, $"[ArchiveExtract] Vorprüfung fehlgeschlagen ({detected.Kind}).") + If Not TryValidateArchive(path) Then + LogGuard.Warn(opt.Logger, "[ArchiveExtract] Vorvalidierung fehlgeschlagen.") Return False End If End If @@ -756,6 +837,13 @@ Namespace Global.Tomtastisch.FileClassifier Return True End Function + ''' + ''' Wendet optional die Endungs-Policy auf ein inhaltsbasiertes Detektionsergebnis an. + ''' + ''' Dateipfad der Quelldatei. + ''' Inhaltsbasiert erkannter Typ. + ''' True aktiviert die Endungsprüfung. + ''' Detektierter Typ oder bei aktivem Mismatch. Private Shared Function ApplyExtensionPolicy(path As String, detected As FileType, verifyExtension As Boolean) _ As FileType @@ -764,12 +852,14 @@ Namespace Global.Tomtastisch.FileClassifier Return UnknownType() End Function + ''' + ''' Prüft, ob ein erkannter Typ ein tatsächlich extrahierbarer Archivcontainer ist. + ''' + ''' Erkannter Dateityp. + ''' True für extrahierbare Archivtypen, sonst False. Private Shared Function IsArchiveContainerKind(kind As FileKind) As Boolean - Return kind = FileKind.Zip OrElse - kind = FileKind.Docx OrElse - kind = FileKind.Xlsx OrElse - kind = FileKind.Pptx + Return kind = FileKind.Zip End Function Private Shared Sub WarnIfNoDirectContentDetection(kind As FileKind, opt As FileTypeProjectOptions) diff --git a/src/FileTypeDetection/Infrastructure/CoreInternals.vb b/src/FileTypeDetection/Infrastructure/CoreInternals.vb index 153b169d..2c6453b3 100644 --- a/src/FileTypeDetection/Infrastructure/CoreInternals.vb +++ b/src/FileTypeDetection/Infrastructure/CoreInternals.vb @@ -12,6 +12,7 @@ Option Explicit On Imports System.IO Imports System.IO.Compression +Imports System.Text Imports Microsoft.Extensions.Logging Namespace Global.Tomtastisch.FileClassifier @@ -267,7 +268,7 @@ Namespace Global.Tomtastisch.FileClassifier End Class ''' - ''' Verfeinert Archivpakete zu OOXML-Typen anhand kanonischer Paket-Pfade. + ''' Verfeinert ZIP-basierte Office-Container zu Dokumenttypen anhand kanonischer Paketmarker. ''' Implementationsprinzip: ''' - reduziert False-Positives bei generischen ZIP-Dateien ''' - bleibt fail-closed (Fehler => Unknown) @@ -320,7 +321,11 @@ Namespace Global.Tomtastisch.FileClassifier Dim hasDocxMarker As Boolean = False Dim hasXlsxMarker As Boolean = False Dim hasPptxMarker As Boolean = False + Dim openDocumentKind As FileKind = FileKind.Unknown + Dim hasOpenDocumentConflict As Boolean = False + Dim structuredMarkerCount As Integer Dim name As String + Dim candidateOpenDocumentKind As FileKind Try Using zip As New ZipArchive(stream, ZipArchiveMode.Read, leaveOpen:=True) @@ -334,20 +339,243 @@ Namespace Global.Tomtastisch.FileClassifier If String.Equals(name, "word/document.xml", StringComparison.OrdinalIgnoreCase) Then hasDocxMarker = True - ElseIf String.Equals(name, "xl/workbook.xml", StringComparison.OrdinalIgnoreCase) Then + ElseIf String.Equals(name, "xl/workbook.xml", StringComparison.OrdinalIgnoreCase) OrElse + String.Equals(name, "xl/workbook.bin", StringComparison.OrdinalIgnoreCase) Then hasXlsxMarker = True ElseIf String.Equals(name, "ppt/presentation.xml", StringComparison.OrdinalIgnoreCase) Then hasPptxMarker = True End If + candidateOpenDocumentKind = TryDetectOpenDocumentKind(entry) + If candidateOpenDocumentKind <> FileKind.Unknown Then + If openDocumentKind = FileKind.Unknown Then + openDocumentKind = candidateOpenDocumentKind + ElseIf openDocumentKind <> candidateOpenDocumentKind Then + hasOpenDocumentConflict = True + End If + End If + Next If hasContentTypes Then + structuredMarkerCount = 0 + If hasDocxMarker Then structuredMarkerCount += 1 + If hasXlsxMarker Then structuredMarkerCount += 1 + If hasPptxMarker Then structuredMarkerCount += 1 + + If structuredMarkerCount > 1 Then + Return FileTypeRegistry.Resolve(FileKind.Unknown) + End If + + If openDocumentKind <> FileKind.Unknown Then + Return FileTypeRegistry.Resolve(FileKind.Unknown) + End If + If hasDocxMarker Then Return FileTypeRegistry.Resolve(FileKind.Docx) If hasXlsxMarker Then Return FileTypeRegistry.Resolve(FileKind.Xlsx) If hasPptxMarker Then Return FileTypeRegistry.Resolve(FileKind.Pptx) End If + + If hasOpenDocumentConflict Then + Return FileTypeRegistry.Resolve(FileKind.Unknown) + End If + + If openDocumentKind <> FileKind.Unknown Then + Return FileTypeRegistry.Resolve(openDocumentKind) + End If + End Using + Catch ex As Exception When _ + TypeOf ex Is UnauthorizedAccessException OrElse + TypeOf ex Is System.Security.SecurityException OrElse + TypeOf ex Is IOException OrElse + TypeOf ex Is InvalidDataException OrElse + TypeOf ex Is NotSupportedException OrElse + TypeOf ex Is ArgumentException OrElse + TypeOf ex Is InvalidOperationException OrElse + TypeOf ex Is ObjectDisposedException + Return FileTypeRegistry.Resolve(FileKind.Unknown) + End Try + + Return FileTypeRegistry.Resolve(FileKind.Unknown) + End Function + + ''' + ''' Liest den OpenDocument-MIME-Eintrag aus einem ZIP-Entry und mappt ihn auf die interne Office-Gruppierung. + ''' + ''' + ''' Fail-closed: Unbekannte, leere oder widersprüchliche MIME-Werte werden als behandelt. + ''' + ''' ZIP-Entry, der den ODF-MIME-Inhalt enthalten kann. + ''' Gemappter Office-Typ oder . + Private Shared Function TryDetectOpenDocumentKind(entry As ZipArchiveEntry) As FileKind + Dim mimeValue As String + Dim normalizedMime As String + + If entry Is Nothing Then Return FileKind.Unknown + If Not String.Equals(entry.FullName, "mimetype", StringComparison.OrdinalIgnoreCase) Then Return FileKind.Unknown + + mimeValue = ReadZipEntryText(entry, maxBytes:=256) + If String.IsNullOrWhiteSpace(mimeValue) Then Return FileKind.Unknown + normalizedMime = mimeValue.Trim().ToLowerInvariant() + + If normalizedMime = "application/vnd.oasis.opendocument.text" Then Return FileKind.Docx + If normalizedMime = "application/vnd.oasis.opendocument.text-template" Then Return FileKind.Docx + If normalizedMime = "application/vnd.oasis.opendocument.spreadsheet" Then Return FileKind.Xlsx + If normalizedMime = "application/vnd.oasis.opendocument.spreadsheet-template" Then Return FileKind.Xlsx + If normalizedMime = "application/vnd.oasis.opendocument.presentation" Then Return FileKind.Pptx + If normalizedMime = "application/vnd.oasis.opendocument.presentation-template" Then Return FileKind.Pptx + + Return FileKind.Unknown + End Function + + ''' + ''' Liest einen kleinen Text-Entry defensiv und deterministisch aus einem ZIP-Container. + ''' + ''' + ''' Diese Hilfsfunktion ist absichtlich restriktiv: + ''' - nur Einträge bis + ''' - kein tolerantes „Best-Effort“-Decoding bei Teilreads + ''' - Fehlerpfad immer leerer String (fail-closed) + ''' + ''' ZIP-Entry, der gelesen werden soll. + ''' Maximal erlaubte Größe in Byte. + ''' ASCII-Textinhalt oder leerer String bei Guard-/Fehlerpfad. + Private Shared Function ReadZipEntryText(entry As ZipArchiveEntry, maxBytes As Integer) As String + Dim buffer As Byte() + Dim readTotal As Integer + Dim readCount As Integer + + If entry Is Nothing Then Return String.Empty + If maxBytes <= 0 Then Return String.Empty + If entry.Length < 0 OrElse entry.Length > maxBytes Then Return String.Empty + + Try + Using entryStream As Stream = entry.Open() + buffer = New Byte(CInt(entry.Length) - 1) {} + If buffer.Length = 0 Then Return String.Empty + + While readTotal < buffer.Length + readCount = entryStream.Read(buffer, readTotal, buffer.Length - readTotal) + If readCount <= 0 Then Exit While + readTotal += readCount + End While + + If readTotal <> buffer.Length Then Return String.Empty + Return Encoding.ASCII.GetString(buffer) End Using + Catch ex As Exception When _ + TypeOf ex Is UnauthorizedAccessException OrElse + TypeOf ex Is System.Security.SecurityException OrElse + TypeOf ex Is IOException OrElse + TypeOf ex Is InvalidDataException OrElse + TypeOf ex Is NotSupportedException OrElse + TypeOf ex Is ArgumentException OrElse + TypeOf ex Is InvalidOperationException OrElse + TypeOf ex Is ObjectDisposedException + Return String.Empty + End Try + End Function + End Class + + ''' + ''' Verfeinert klassische OLE2-Office-Dokumente (z. B. DOC/XLS/PPT) auf gruppierte Dokumenttypen. + ''' Ziel ist die robuste Trennung von Office-Dokumenten gegenüber generischen Archiven. + ''' + ''' + ''' Das Refinement ist bewusst heuristisch und fail-closed: + ''' - Voraussetzung ist ein gültiger OLE-Header. + ''' - Es muss genau ein Office-Marker eindeutig erkannt werden. + ''' - Mehrdeutigkeit oder Fehler führen deterministisch zu . + ''' + Friend NotInheritable Class LegacyOfficeBinaryRefiner + Private Const DefaultMaxProbeBytes As Integer = 1048576 + + Private Shared ReadOnly OleSignature As Byte() = {&HD0, &HCF, &H11, &HE0, &HA1, &HB1, &H1A, &HE1} + Private Shared ReadOnly WordMarker As Byte() = Encoding.ASCII.GetBytes("WordDocument") + Private Shared ReadOnly ExcelWorkbookMarker As Byte() = Encoding.ASCII.GetBytes("Workbook") + Private Shared ReadOnly ExcelBookMarker As Byte() = Encoding.ASCII.GetBytes("Book") + Private Shared ReadOnly PowerPointMarker As Byte() = Encoding.ASCII.GetBytes("PowerPoint Document") + + Private Sub New() + End Sub + + ''' + ''' Prüft, ob die Bytefolge mit der OLE-Compound-File-Signatur beginnt. + ''' + ''' Zu prüfender Header/Payload. + ''' True bei gültiger OLE-Signatur, sonst False. + Friend Shared Function IsOleCompoundHeader(data As Byte()) As Boolean + Dim i As Integer + + If data Is Nothing Then Return False + If data.Length < OleSignature.Length Then Return False + + For i = 0 To OleSignature.Length - 1 + If data(i) <> OleSignature(i) Then Return False + Next + + Return True + End Function + + ''' + ''' Verfeinert einen OLE-Bytepuffer auf den gruppierten Office-Zieltyp. + ''' + ''' Kompletter oder teilweiser OLE-Payload. + ''' Gemappter Office-Typ oder . + Friend Shared Function TryRefineBytes(data As Byte()) As FileType + If data Is Nothing OrElse data.Length = 0 Then Return FileTypeRegistry.Resolve(FileKind.Unknown) + + Try + Return RefineByMarkers(data) + Catch ex As Exception When _ + TypeOf ex Is UnauthorizedAccessException OrElse + TypeOf ex Is System.Security.SecurityException OrElse + TypeOf ex Is IOException OrElse + TypeOf ex Is InvalidDataException OrElse + TypeOf ex Is NotSupportedException OrElse + TypeOf ex Is ArgumentException OrElse + TypeOf ex Is InvalidOperationException OrElse + TypeOf ex Is ObjectDisposedException + Return FileTypeRegistry.Resolve(FileKind.Unknown) + End Try + End Function + + ''' + ''' Verfeinert einen Stream auf Legacy-Office-Marker mit harter Probe-Grenze. + ''' + ''' Lesbarer Quellstream. + ''' Maximale Probegröße; wird intern defensiv gekappt. + ''' Gemappter Office-Typ oder . + Friend Shared Function TryRefineStream(stream As Stream, maxProbeBytes As Integer) As FileType + Dim probeLimit As Integer + Dim chunk(4095) As Byte + Dim readTotal As Integer + Dim readCount As Integer + Dim targetStream As MemoryStream + Dim buffer As Byte() + + If Not StreamGuard.IsReadable(stream) Then Return FileTypeRegistry.Resolve(FileKind.Unknown) + + probeLimit = maxProbeBytes + If probeLimit <= 0 Then probeLimit = DefaultMaxProbeBytes + If probeLimit > DefaultMaxProbeBytes Then probeLimit = DefaultMaxProbeBytes + + Try + StreamGuard.RewindToStart(stream) + targetStream = New MemoryStream(probeLimit) + Try + While readTotal < probeLimit + readCount = stream.Read(chunk, 0, Math.Min(chunk.Length, probeLimit - readTotal)) + If readCount <= 0 Then Exit While + targetStream.Write(chunk, 0, readCount) + readTotal += readCount + End While + + buffer = targetStream.ToArray() + Return RefineByMarkers(buffer) + Finally + targetStream.Dispose() + End Try Catch ex As Exception When _ TypeOf ex Is UnauthorizedAccessException OrElse TypeOf ex Is System.Security.SecurityException OrElse @@ -359,9 +587,62 @@ Namespace Global.Tomtastisch.FileClassifier TypeOf ex Is ObjectDisposedException Return FileTypeRegistry.Resolve(FileKind.Unknown) End Try + End Function + + ''' + ''' Führt die eigentliche Marker-basierte Typentscheidung für Legacy-Office aus. + ''' + ''' OLE-Bytepuffer. + ''' Gruppierter Office-Typ oder . + Private Shared Function RefineByMarkers(data As Byte()) As FileType + Dim hasWord As Boolean + Dim hasExcel As Boolean + Dim hasPowerPoint As Boolean + Dim markerCount As Integer + + If Not IsOleCompoundHeader(data) Then Return FileTypeRegistry.Resolve(FileKind.Unknown) + + hasWord = ContainsMarker(data, WordMarker) + hasExcel = ContainsMarker(data, ExcelWorkbookMarker) OrElse ContainsMarker(data, ExcelBookMarker) + hasPowerPoint = ContainsMarker(data, PowerPointMarker) + + markerCount = 0 + If hasWord Then markerCount += 1 + If hasExcel Then markerCount += 1 + If hasPowerPoint Then markerCount += 1 + + If markerCount <> 1 Then Return FileTypeRegistry.Resolve(FileKind.Unknown) + If hasWord Then Return FileTypeRegistry.Resolve(FileKind.Docx) + If hasExcel Then Return FileTypeRegistry.Resolve(FileKind.Xlsx) + If hasPowerPoint Then Return FileTypeRegistry.Resolve(FileKind.Pptx) Return FileTypeRegistry.Resolve(FileKind.Unknown) End Function + + ''' + ''' Prüft, ob ein Marker als zusammenhängende Bytefolge im Payload vorkommt. + ''' + ''' Quellpuffer. + ''' Gesuchte Marker-Bytefolge. + ''' True bei Treffer, sonst False. + Private Shared Function ContainsMarker(data As Byte(), marker As Byte()) As Boolean + Dim i As Integer + Dim j As Integer + + If data Is Nothing OrElse marker Is Nothing Then Return False + If marker.Length = 0 Then Return False + If data.Length < marker.Length Then Return False + + For i = 0 To data.Length - marker.Length + For j = 0 To marker.Length - 1 + If data(i + j) <> marker(j) Then Exit For + Next + + If j = marker.Length Then Return True + Next + + Return False + End Function End Class ''' diff --git a/tests/FileTypeDetectionLib.Tests/Unit/ArchiveExtractionUnitTests.cs b/tests/FileTypeDetectionLib.Tests/Unit/ArchiveExtractionUnitTests.cs index 83822fed..746525c6 100644 --- a/tests/FileTypeDetectionLib.Tests/Unit/ArchiveExtractionUnitTests.cs +++ b/tests/FileTypeDetectionLib.Tests/Unit/ArchiveExtractionUnitTests.cs @@ -59,6 +59,57 @@ public void ExtractArchiveSafe_Fails_PreVerification_ForNonArchiveInput() Assert.False(Directory.Exists(destination)); } + [Fact] + public void ExtractArchiveSafe_Fails_PreVerification_ForDocxPayloadWithPdfExtension() + { + using var tempRoot = TestTempPaths.CreateScope("ftd-extract-docx-as-pdf"); + var disguisedPath = Path.Combine(tempRoot.RootPath, "looks-like-pdf.pdf"); + File.Copy(TestResources.Resolve("sample.docx"), disguisedPath); + var destination = Path.Combine(tempRoot.RootPath, "out"); + + var ok = new FileTypeDetector().ExtractArchiveSafe(disguisedPath, destination, true); + + Assert.False(ok); + Assert.False(Directory.Exists(destination)); + } + + [Fact] + public void ExtractArchiveSafe_Fails_PreVerification_ForDocxInput() + { + var source = TestResources.Resolve("sample.docx"); + using var tempRoot = TestTempPaths.CreateScope("ftd-extract-docx"); + var destination = Path.Combine(tempRoot.RootPath, "out"); + + var ok = new FileTypeDetector().ExtractArchiveSafe(source, destination, true); + + Assert.False(ok); + Assert.False(Directory.Exists(destination)); + } + + [Fact] + public void ExtractArchiveSafeToMemory_Fails_PreVerification_ForDocxPayloadWithPdfExtension() + { + using var tempRoot = TestTempPaths.CreateScope("ftd-extract-mem-docx-as-pdf"); + var disguisedPath = Path.Combine(tempRoot.RootPath, "looks-like-pdf.pdf"); + File.Copy(TestResources.Resolve("sample.docx"), disguisedPath); + + var entries = new FileTypeDetector().ExtractArchiveSafeToMemory(disguisedPath, true); + + Assert.NotNull(entries); + Assert.Empty(entries); + } + + [Fact] + public void ExtractArchiveSafeToMemory_Fails_PreVerification_ForDocxInput() + { + var source = TestResources.Resolve("sample.docx"); + + var entries = new FileTypeDetector().ExtractArchiveSafeToMemory(source, true); + + Assert.NotNull(entries); + Assert.Empty(entries); + } + [Fact] public void ExtractArchiveSafe_Fails_ForRootDestinationPath() { diff --git a/tests/FileTypeDetectionLib.Tests/Unit/DetectionDetailAndArchiveValidationUnitTests.cs b/tests/FileTypeDetectionLib.Tests/Unit/DetectionDetailAndArchiveValidationUnitTests.cs index 59655636..aaf755cb 100644 --- a/tests/FileTypeDetectionLib.Tests/Unit/DetectionDetailAndArchiveValidationUnitTests.cs +++ b/tests/FileTypeDetectionLib.Tests/Unit/DetectionDetailAndArchiveValidationUnitTests.cs @@ -1,3 +1,5 @@ +using System.IO.Compression; +using System.Text; using FileTypeDetectionLib.Tests.Support; using Tomtastisch.FileClassifier; @@ -33,7 +35,7 @@ public void DetectDetailed_WithVerifyExtension_FailsClosed_OnExtensionMismatch() public void TryValidateArchive_ReturnsExpectedResult_ForKnownInputs() { Assert.True(FileTypeDetector.TryValidateArchive(TestResources.Resolve("sample.zip"))); - Assert.True(FileTypeDetector.TryValidateArchive(TestResources.Resolve("sample.docx"))); + Assert.False(FileTypeDetector.TryValidateArchive(TestResources.Resolve("sample.docx"))); Assert.False(FileTypeDetector.TryValidateArchive(TestResources.Resolve("sample.pdf"))); } @@ -43,4 +45,64 @@ public void TryValidateArchive_ReturnsFalse_ForMissingFile() var missingPath = Path.Combine(Path.GetTempPath(), "ftd-missing-" + Guid.NewGuid().ToString("N") + ".zip"); Assert.False(FileTypeDetector.TryValidateArchive(missingPath)); } + + [Fact] + public void TryValidateArchive_ReturnsFalse_ForOpenDocumentSpreadsheet() + { + var path = Path.Combine(Path.GetTempPath(), "ftd-ods-" + Guid.NewGuid().ToString("N") + ".ods"); + File.WriteAllBytes(path, CreateOpenDocumentPackage("application/vnd.oasis.opendocument.spreadsheet")); + + try + { + Assert.False(FileTypeDetector.TryValidateArchive(path)); + } + finally + { + if (File.Exists(path)) File.Delete(path); + } + } + + [Fact] + public void TryValidateArchive_ReturnsFalse_ForLegacyWordDocument() + { + var path = Path.Combine(Path.GetTempPath(), "ftd-doc-" + Guid.NewGuid().ToString("N") + ".doc"); + File.WriteAllBytes(path, CreateOleLikePayload("WordDocument")); + + try + { + Assert.False(FileTypeDetector.TryValidateArchive(path)); + } + finally + { + if (File.Exists(path)) File.Delete(path); + } + } + + private static byte[] CreateOpenDocumentPackage(string mimeType) + { + using var ms = new MemoryStream(); + using (var zip = new ZipArchive(ms, ZipArchiveMode.Create, true)) + { + var mimeEntry = zip.CreateEntry("mimetype", CompressionLevel.NoCompression); + using (var writer = new StreamWriter(mimeEntry.Open(), Encoding.ASCII, 1024, leaveOpen: false)) + { + writer.Write(mimeType); + } + + zip.CreateEntry("content.xml"); + } + + return ms.ToArray(); + } + + private static byte[] CreateOleLikePayload(string marker) + { + var payload = new byte[1024]; + var oleSignature = new byte[] { 0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1 }; + Buffer.BlockCopy(oleSignature, 0, payload, 0, oleSignature.Length); + + var markerBytes = Encoding.ASCII.GetBytes(marker); + Buffer.BlockCopy(markerBytes, 0, payload, 256, markerBytes.Length); + return payload; + } } diff --git a/tests/FileTypeDetectionLib.Tests/Unit/EndToEndFailClosedMatrixUnitTests.cs b/tests/FileTypeDetectionLib.Tests/Unit/EndToEndFailClosedMatrixUnitTests.cs new file mode 100644 index 00000000..f7afd2b8 --- /dev/null +++ b/tests/FileTypeDetectionLib.Tests/Unit/EndToEndFailClosedMatrixUnitTests.cs @@ -0,0 +1,210 @@ +using FileTypeDetectionLib.Tests.Support; +using Tomtastisch.FileClassifier; + +namespace FileTypeDetectionLib.Tests.Unit; + +public sealed class EndToEndFailClosedMatrixUnitTests +{ + public static TheoryData SupportedFixtureMatrix => new() + { + { "sample.pdf", FileKind.Pdf, false }, + { "sample.png", FileKind.Png, false }, + { "sample.jpg", FileKind.Jpeg, false }, + { "sample.gif", FileKind.Gif, false }, + { "sample.webp", FileKind.Webp, false }, + { "sample.docx", FileKind.Docx, false }, + { "sample.xlsx", FileKind.Xlsx, false }, + { "sample.pptx", FileKind.Pptx, false }, + { "sample.zip", FileKind.Zip, true }, + { "sample.7z", FileKind.Zip, true }, + { "sample.rar", FileKind.Zip, true }, + { "sample_pdf_no_extension", FileKind.Pdf, false } + }; + + public static TheoryData SupportedAliasMatrix => new() + { + { "sample.docx", ".doc", FileKind.Docx }, + { "sample.docx", ".docm", FileKind.Docx }, + { "sample.docx", ".docb", FileKind.Docx }, + { "sample.docx", ".dot", FileKind.Docx }, + { "sample.docx", ".dotm", FileKind.Docx }, + { "sample.docx", ".dotx", FileKind.Docx }, + { "sample.docx", ".odt", FileKind.Docx }, + { "sample.docx", ".ott", FileKind.Docx }, + { "sample.xlsx", ".xls", FileKind.Xlsx }, + { "sample.xlsx", ".xlsm", FileKind.Xlsx }, + { "sample.xlsx", ".xlsb", FileKind.Xlsx }, + { "sample.xlsx", ".xlt", FileKind.Xlsx }, + { "sample.xlsx", ".xltm", FileKind.Xlsx }, + { "sample.xlsx", ".xltx", FileKind.Xlsx }, + { "sample.xlsx", ".xltb", FileKind.Xlsx }, + { "sample.xlsx", ".xlam", FileKind.Xlsx }, + { "sample.xlsx", ".xla", FileKind.Xlsx }, + { "sample.xlsx", ".ods", FileKind.Xlsx }, + { "sample.xlsx", ".ots", FileKind.Xlsx }, + { "sample.pptx", ".ppt", FileKind.Pptx }, + { "sample.pptx", ".pptm", FileKind.Pptx }, + { "sample.pptx", ".pot", FileKind.Pptx }, + { "sample.pptx", ".potm", FileKind.Pptx }, + { "sample.pptx", ".potx", FileKind.Pptx }, + { "sample.pptx", ".pps", FileKind.Pptx }, + { "sample.pptx", ".ppsm", FileKind.Pptx }, + { "sample.pptx", ".ppsx", FileKind.Pptx }, + { "sample.pptx", ".odp", FileKind.Pptx }, + { "sample.pptx", ".otp", FileKind.Pptx }, + { "sample.zip", ".tar", FileKind.Zip }, + { "sample.zip", ".tgz", FileKind.Zip }, + { "sample.zip", ".tar.gz", FileKind.Zip }, + { "sample.zip", ".gz", FileKind.Zip }, + { "sample.zip", ".bz2", FileKind.Zip }, + { "sample.zip", ".xz", FileKind.Zip }, + { "sample.zip", ".7z", FileKind.Zip }, + { "sample.zip", ".rar", FileKind.Zip } + }; + + public static TheoryData CorruptExtensionMatrix => new() + { + ".pdf", + ".png", + ".jpg", + ".gif", + ".webp", + ".docx", + ".xlsx", + ".pptx", + ".zip", + ".7z", + ".rar", + ".doc", + ".xlsm", + ".xlsb", + ".odt", + ".ods", + ".odp" + }; + + [Theory] + [MemberData(nameof(SupportedFixtureMatrix))] + public void Detect_PathBytesAndDetail_AreConsistent_ForSupportedFixtures(string fixture, FileKind expectedKind, + bool isArchive) + { + var detector = new FileTypeDetector(); + var path = TestResources.Resolve(fixture); + var bytes = File.ReadAllBytes(path); + + var fromPath = detector.Detect(path); + var fromBytes = detector.Detect(bytes); + var detail = detector.DetectDetailed(path, false); + + Assert.Equal(expectedKind, fromPath.Kind); + Assert.Equal(expectedKind, fromBytes.Kind); + Assert.Equal(expectedKind, detail.DetectedType.Kind); + Assert.Equal(isArchive, expectedKind == FileKind.Zip); + } + + [Theory] + [MemberData(nameof(SupportedFixtureMatrix))] + public void Detect_WrongExtension_OnlyFailsWithExplicitExtensionVerification(string fixture, FileKind expectedKind, + bool isArchive) + { + using var scope = TestTempPaths.CreateScope("ftd-e2e-mismatch"); + var sourcePath = TestResources.Resolve(fixture); + var disguisedPath = Path.Combine(scope.RootPath, $"payload-{Guid.NewGuid():N}.bin"); + File.Copy(sourcePath, disguisedPath); + + var detector = new FileTypeDetector(); + var byContent = detector.Detect(disguisedPath); + var strict = detector.Detect(disguisedPath, true); + var strictDetail = detector.DetectDetailed(disguisedPath, true); + + Assert.Equal(expectedKind, byContent.Kind); + Assert.Equal(FileKind.Unknown, strict.Kind); + Assert.Equal(FileKind.Unknown, strictDetail.DetectedType.Kind); + Assert.Equal("ExtensionMismatch", strictDetail.ReasonCode); + Assert.False(strictDetail.ExtensionVerified); + Assert.Equal(isArchive, byContent.Kind == FileKind.Zip); + } + + [Theory] + [MemberData(nameof(SupportedAliasMatrix))] + public void Detect_RecognizesSupportedAliasExtensions_InStrictMode(string sourceFixture, string aliasExtension, + FileKind expectedKind) + { + using var scope = TestTempPaths.CreateScope("ftd-e2e-alias"); + var sourcePath = TestResources.Resolve(sourceFixture); + var aliasPath = Path.Combine(scope.RootPath, $"alias-{Guid.NewGuid():N}{aliasExtension}"); + File.Copy(sourcePath, aliasPath); + + var strict = new FileTypeDetector().Detect(aliasPath, true); + + Assert.Equal(expectedKind, strict.Kind); + } + + [Theory] + [MemberData(nameof(SupportedFixtureMatrix))] + public void ArchiveApis_AreFailClosed_AndOnlyAcceptRealArchives(string fixture, FileKind expectedKind, bool isArchive) + { + var detector = new FileTypeDetector(); + var path = TestResources.Resolve(fixture); + + var validate = FileTypeDetector.TryValidateArchive(path); + var entriesStrict = detector.ExtractArchiveSafeToMemory(path, true); + var entriesRelaxed = detector.ExtractArchiveSafeToMemory(path, false); + + if (isArchive) + { + Assert.True(validate); + Assert.NotEmpty(entriesStrict); + Assert.NotEmpty(entriesRelaxed); + } + else + { + Assert.False(validate); + Assert.Empty(entriesStrict); + Assert.Empty(entriesRelaxed); + } + + Assert.Equal(isArchive, expectedKind == FileKind.Zip); + } + + [Theory] + [MemberData(nameof(CorruptExtensionMatrix))] + public void CorruptedPayloads_FailClosed_AcrossDetectionAndArchiveProcessing(string extension) + { + using var scope = TestTempPaths.CreateScope("ftd-e2e-corrupt"); + var path = Path.Combine(scope.RootPath, $"corrupt-{Guid.NewGuid():N}{extension}"); + var payload = CreateDeterministicCorruptPayload(extension); + File.WriteAllBytes(path, payload); + + var detector = new FileTypeDetector(); + var strict = detector.Detect(path, true); + var detail = detector.DetectDetailed(path, true); + var validate = FileTypeDetector.TryValidateArchive(path); + var entriesStrict = detector.ExtractArchiveSafeToMemory(path, true); + var entriesRelaxed = detector.ExtractArchiveSafeToMemory(path, false); + var fromBytes = detector.Detect(payload); + + Assert.Equal(FileKind.Unknown, strict.Kind); + Assert.Equal(FileKind.Unknown, detail.DetectedType.Kind); + Assert.Equal("ExtensionMismatch", detail.ReasonCode); + Assert.False(validate); + Assert.Empty(entriesStrict); + Assert.Empty(entriesRelaxed); + Assert.Equal(FileKind.Unknown, fromBytes.Kind); + } + + private static byte[] CreateDeterministicCorruptPayload(string extension) + { + var marker = $"corrupt-payload::{extension}"; + var prefix = System.Text.Encoding.ASCII.GetBytes(marker); + var payload = new byte[1024]; + + for (var i = 0; i < payload.Length; i++) + { + payload[i] = 0x41; + } + + Buffer.BlockCopy(prefix, 0, payload, 16, Math.Min(prefix.Length, payload.Length - 16)); + return payload; + } +} diff --git a/tests/FileTypeDetectionLib.Tests/Unit/ExtensionCheckUnitTests.cs b/tests/FileTypeDetectionLib.Tests/Unit/ExtensionCheckUnitTests.cs index 8d9a20ea..90684448 100644 --- a/tests/FileTypeDetectionLib.Tests/Unit/ExtensionCheckUnitTests.cs +++ b/tests/FileTypeDetectionLib.Tests/Unit/ExtensionCheckUnitTests.cs @@ -1,3 +1,5 @@ +using System.IO.Compression; +using System.Text; using FileTypeDetectionLib.Tests.Support; using Tomtastisch.FileClassifier; @@ -56,4 +58,138 @@ public void DetectAndVerifyExtension_AcceptsArchiveAlias_ForArchiveContent() if (File.Exists(path)) File.Delete(path); } } -} \ No newline at end of file + + [Fact] + public void Detect_DocxPayloadWithPdfExtension_RemainsDocx_UnlessVerifyExtensionIsTrue() + { + var detector = new FileTypeDetector(); + var source = TestResources.Resolve("sample.docx"); + var path = Path.Combine(Path.GetTempPath(), "ftd-docx-disguised-" + Guid.NewGuid().ToString("N") + ".pdf"); + + File.Copy(source, path); + try + { + var detectedWithoutExtensionPolicy = detector.Detect(path); + var detectedWithExtensionPolicy = detector.Detect(path, true); + + Assert.Equal(FileKind.Docx, detectedWithoutExtensionPolicy.Kind); + Assert.Equal(FileKind.Unknown, detectedWithExtensionPolicy.Kind); + } + finally + { + if (File.Exists(path)) File.Delete(path); + } + } + + [Fact] + public void DetectAndVerifyExtension_AcceptsXlsmExtension_ForSpreadsheetOpenXmlPayload() + { + var detector = new FileTypeDetector(); + var path = Path.Combine(Path.GetTempPath(), "ftd-xlsm-" + Guid.NewGuid().ToString("N") + ".xlsm"); + File.WriteAllBytes(path, CreateOpenXmlPackage("xl/workbook.xml")); + + try + { + var detected = detector.Detect(path, true); + Assert.Equal(FileKind.Xlsx, detected.Kind); + } + finally + { + if (File.Exists(path)) File.Delete(path); + } + } + + [Fact] + public void DetectAndVerifyExtension_AcceptsXlsbExtension_ForSpreadsheetBinaryWorkbookPayload() + { + var detector = new FileTypeDetector(); + var path = Path.Combine(Path.GetTempPath(), "ftd-xlsb-" + Guid.NewGuid().ToString("N") + ".xlsb"); + File.WriteAllBytes(path, CreateOpenXmlPackage("xl/workbook.bin")); + + try + { + var detected = detector.Detect(path, true); + Assert.Equal(FileKind.Xlsx, detected.Kind); + } + finally + { + if (File.Exists(path)) File.Delete(path); + } + } + + [Fact] + public void DetectAndVerifyExtension_AcceptsOdsExtension_ForOpenDocumentSpreadsheetPayload() + { + var detector = new FileTypeDetector(); + var path = Path.Combine(Path.GetTempPath(), "ftd-ods-" + Guid.NewGuid().ToString("N") + ".ods"); + File.WriteAllBytes(path, CreateOpenDocumentPackage("application/vnd.oasis.opendocument.spreadsheet")); + + try + { + var detected = detector.Detect(path, true); + Assert.Equal(FileKind.Xlsx, detected.Kind); + } + finally + { + if (File.Exists(path)) File.Delete(path); + } + } + + [Fact] + public void DetectAndVerifyExtension_AcceptsDocExtension_ForLegacyOfficePayload() + { + var detector = new FileTypeDetector(); + var path = Path.Combine(Path.GetTempPath(), "ftd-doc-" + Guid.NewGuid().ToString("N") + ".doc"); + File.WriteAllBytes(path, CreateOleLikePayload("WordDocument")); + + try + { + var detected = detector.Detect(path, true); + Assert.Equal(FileKind.Docx, detected.Kind); + } + finally + { + if (File.Exists(path)) File.Delete(path); + } + } + + private static byte[] CreateOpenXmlPackage(string markerPath) + { + using var ms = new MemoryStream(); + using (var zip = new ZipArchive(ms, ZipArchiveMode.Create, true)) + { + zip.CreateEntry("[Content_Types].xml"); + zip.CreateEntry(markerPath); + } + + return ms.ToArray(); + } + + private static byte[] CreateOpenDocumentPackage(string mimeType) + { + using var ms = new MemoryStream(); + using (var zip = new ZipArchive(ms, ZipArchiveMode.Create, true)) + { + var mimeEntry = zip.CreateEntry("mimetype", CompressionLevel.NoCompression); + using (var writer = new StreamWriter(mimeEntry.Open(), Encoding.ASCII, 1024, leaveOpen: false)) + { + writer.Write(mimeType); + } + + zip.CreateEntry("content.xml"); + } + + return ms.ToArray(); + } + + private static byte[] CreateOleLikePayload(string marker) + { + var payload = new byte[1024]; + var oleSignature = new byte[] { 0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1 }; + Buffer.BlockCopy(oleSignature, 0, payload, 0, oleSignature.Length); + + var markerBytes = Encoding.ASCII.GetBytes(marker); + Buffer.BlockCopy(markerBytes, 0, payload, 256, markerBytes.Length); + return payload; + } +} diff --git a/tests/FileTypeDetectionLib.Tests/Unit/FileTypeDetectorPrivateBranchUnitTests.cs b/tests/FileTypeDetectionLib.Tests/Unit/FileTypeDetectorPrivateBranchUnitTests.cs index 846ac01b..4d9e6ae9 100644 --- a/tests/FileTypeDetectionLib.Tests/Unit/FileTypeDetectorPrivateBranchUnitTests.cs +++ b/tests/FileTypeDetectionLib.Tests/Unit/FileTypeDetectorPrivateBranchUnitTests.cs @@ -206,6 +206,30 @@ public void ExtensionMatchesKind_HandlesEmptyAndMismatch() Assert.True(okAlias); } + [Theory] + [InlineData("file.doc", FileKind.Docx)] + [InlineData("file.docm", FileKind.Docx)] + [InlineData("file.docx", FileKind.Docx)] + [InlineData("file.odt", FileKind.Docx)] + [InlineData("file.xls", FileKind.Xlsx)] + [InlineData("file.xlsm", FileKind.Xlsx)] + [InlineData("file.xlsx", FileKind.Xlsx)] + [InlineData("file.xlsb", FileKind.Xlsx)] + [InlineData("file.ods", FileKind.Xlsx)] + [InlineData("file.ppt", FileKind.Pptx)] + [InlineData("file.pptm", FileKind.Pptx)] + [InlineData("file.pptx", FileKind.Pptx)] + [InlineData("file.odp", FileKind.Pptx)] + public void ExtensionMatchesKind_AcceptsOfficeVariantAliases(string path, FileKind expectedKind) + { + var method = + typeof(FileTypeDetector).GetMethod("ExtensionMatchesKind", BindingFlags.NonPublic | BindingFlags.Static)!; + Assert.NotNull(method); + + var matches = TestGuard.Unbox(method.Invoke(null, new object[] { path, expectedKind })); + Assert.True(matches); + } + [Fact] public void ReadHeader_ReturnsEmpty_ForWriteOnlyStream() { diff --git a/tests/FileTypeDetectionLib.Tests/Unit/LegacyOfficeBinaryRefinerUnitTests.cs b/tests/FileTypeDetectionLib.Tests/Unit/LegacyOfficeBinaryRefinerUnitTests.cs new file mode 100644 index 00000000..61f6427d --- /dev/null +++ b/tests/FileTypeDetectionLib.Tests/Unit/LegacyOfficeBinaryRefinerUnitTests.cs @@ -0,0 +1,56 @@ +using Tomtastisch.FileClassifier; + +namespace FileTypeDetectionLib.Tests.Unit; + +public sealed class LegacyOfficeBinaryRefinerUnitTests +{ + [Theory] + [InlineData("WordDocument", FileKind.Docx)] + [InlineData("Workbook", FileKind.Xlsx)] + [InlineData("PowerPoint Document", FileKind.Pptx)] + public void TryRefineBytes_DetectsLegacyOfficeMarkers(string marker, FileKind expected) + { + var payload = CreateOleLikePayload(marker); + + var detected = LegacyOfficeBinaryRefiner.TryRefineBytes(payload); + + Assert.Equal(expected, detected.Kind); + } + + [Fact] + public void TryRefineBytes_ReturnsUnknown_ForNonOlePayload() + { + var payload = new byte[] { 0x01, 0x02, 0x03, 0x04 }; + + var detected = LegacyOfficeBinaryRefiner.TryRefineBytes(payload); + + Assert.Equal(FileKind.Unknown, detected.Kind); + } + + [Fact] + public void TryRefineBytes_ReturnsUnknown_ForAmbiguousLegacyMarkers() + { + var payload = CreateOleLikePayload("WordDocument", "Workbook"); + + var detected = LegacyOfficeBinaryRefiner.TryRefineBytes(payload); + + Assert.Equal(FileKind.Unknown, detected.Kind); + } + + private static byte[] CreateOleLikePayload(params string[] markers) + { + var payload = new byte[1024]; + var oleSignature = new byte[] { 0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1 }; + Buffer.BlockCopy(oleSignature, 0, payload, 0, oleSignature.Length); + + var offset = 256; + foreach (var marker in markers) + { + var markerBytes = System.Text.Encoding.ASCII.GetBytes(marker); + Buffer.BlockCopy(markerBytes, 0, payload, offset, markerBytes.Length); + offset += markerBytes.Length + 8; + } + + return payload; + } +} diff --git a/tests/FileTypeDetectionLib.Tests/Unit/OpenXmlRefinerUnitTests.cs b/tests/FileTypeDetectionLib.Tests/Unit/OpenXmlRefinerUnitTests.cs index 36749be8..766d8ff9 100644 --- a/tests/FileTypeDetectionLib.Tests/Unit/OpenXmlRefinerUnitTests.cs +++ b/tests/FileTypeDetectionLib.Tests/Unit/OpenXmlRefinerUnitTests.cs @@ -27,6 +27,7 @@ public void TryRefineStream_ReturnsUnknown_ForUnreadableStream() [Theory] [InlineData("word/document.xml", FileKind.Docx)] [InlineData("xl/workbook.xml", FileKind.Xlsx)] + [InlineData("xl/workbook.bin", FileKind.Xlsx)] [InlineData("ppt/presentation.xml", FileKind.Pptx)] public void TryRefineStream_DetectsOpenXmlKinds(string markerPath, FileKind expected) { @@ -38,6 +39,23 @@ public void TryRefineStream_DetectsOpenXmlKinds(string markerPath, FileKind expe Assert.Equal(expected, result.Kind); } + [Theory] + [InlineData("application/vnd.oasis.opendocument.text", FileKind.Docx)] + [InlineData("application/vnd.oasis.opendocument.text-template", FileKind.Docx)] + [InlineData("application/vnd.oasis.opendocument.spreadsheet", FileKind.Xlsx)] + [InlineData("application/vnd.oasis.opendocument.spreadsheet-template", FileKind.Xlsx)] + [InlineData("application/vnd.oasis.opendocument.presentation", FileKind.Pptx)] + [InlineData("application/vnd.oasis.opendocument.presentation-template", FileKind.Pptx)] + public void TryRefineStream_DetectsOpenDocumentKinds(string mimeType, FileKind expected) + { + var payload = CreateOpenDocumentPackage(mimeType); + using var stream = new MemoryStream(payload, false); + + var result = OpenXmlRefiner.TryRefineStream(stream); + + Assert.Equal(expected, result.Kind); + } + [Fact] public void TryRefineStream_ReturnsUnknown_WhenContentTypesMissing() { @@ -60,6 +78,41 @@ public void TryRefineStream_ReturnsUnknown_WhenMarkersMissing() Assert.Equal(FileKind.Unknown, result.Kind); } + [Fact] + public void TryRefineStream_ReturnsUnknown_WhenOpenXmlMarkersAreAmbiguous() + { + var payload = CreateOpenXmlPackageWithMarkers("word/document.xml", "xl/workbook.xml"); + using var stream = new MemoryStream(payload, false); + + var result = OpenXmlRefiner.TryRefineStream(stream); + + Assert.Equal(FileKind.Unknown, result.Kind); + } + + [Fact] + public void TryRefineStream_ReturnsUnknown_WhenOpenXmlAndOpenDocumentSignalsConflict() + { + var payload = CreateHybridOfficePackage("application/vnd.oasis.opendocument.text", "word/document.xml"); + using var stream = new MemoryStream(payload, false); + + var result = OpenXmlRefiner.TryRefineStream(stream); + + Assert.Equal(FileKind.Unknown, result.Kind); + } + + [Fact] + public void TryRefineStream_ReturnsUnknown_WhenOpenDocumentMimeSignalsConflict() + { + var payload = CreateConflictingOpenDocumentMimes( + "application/vnd.oasis.opendocument.text", + "application/vnd.oasis.opendocument.spreadsheet"); + using var stream = new MemoryStream(payload, false); + + var result = OpenXmlRefiner.TryRefineStream(stream); + + Assert.Equal(FileKind.Unknown, result.Kind); + } + [Fact] public void TryRefine_ReturnsUnknown_WhenFactoryThrows() { @@ -90,4 +143,72 @@ private static byte[] CreateZipWithEntries(params string[] names) return ms.ToArray(); } -} \ No newline at end of file + + private static byte[] CreateOpenDocumentPackage(string mimeType) + { + using var ms = new MemoryStream(); + using (var zip = new ZipArchive(ms, ZipArchiveMode.Create, true)) + { + var mimeEntry = zip.CreateEntry("mimetype", CompressionLevel.NoCompression); + using (var writer = new StreamWriter(mimeEntry.Open())) + { + writer.Write(mimeType); + } + + zip.CreateEntry("content.xml"); + } + + return ms.ToArray(); + } + + private static byte[] CreateOpenXmlPackageWithMarkers(params string[] markerPaths) + { + using var ms = new MemoryStream(); + using (var zip = new ZipArchive(ms, ZipArchiveMode.Create, true)) + { + zip.CreateEntry("[Content_Types].xml"); + foreach (var markerPath in markerPaths) zip.CreateEntry(markerPath); + } + + return ms.ToArray(); + } + + private static byte[] CreateHybridOfficePackage(string openDocumentMimeType, string openXmlMarkerPath) + { + using var ms = new MemoryStream(); + using (var zip = new ZipArchive(ms, ZipArchiveMode.Create, true)) + { + zip.CreateEntry("[Content_Types].xml"); + zip.CreateEntry(openXmlMarkerPath); + + var mimeEntry = zip.CreateEntry("mimetype", CompressionLevel.NoCompression); + using (var writer = new StreamWriter(mimeEntry.Open())) + { + writer.Write(openDocumentMimeType); + } + } + + return ms.ToArray(); + } + + private static byte[] CreateConflictingOpenDocumentMimes(string firstMime, string secondMime) + { + using var ms = new MemoryStream(); + using (var zip = new ZipArchive(ms, ZipArchiveMode.Create, true)) + { + var first = zip.CreateEntry("mimetype", CompressionLevel.NoCompression); + using (var writer = new StreamWriter(first.Open())) + { + writer.Write(firstMime); + } + + var second = zip.CreateEntry("mimetype", CompressionLevel.NoCompression); + using (var writer = new StreamWriter(second.Open())) + { + writer.Write(secondMime); + } + } + + return ms.ToArray(); + } +} From ecb3936bbb5f4c67f9241b54858ce72050f7cf59 Mon Sep 17 00:00:00 2001 From: GitHub Copilot Agent Date: Wed, 18 Feb 2026 09:19:23 +0100 Subject: [PATCH 2/4] docs(versioning): evidence und rc6-status auf finale umsetzung aktualisieren --- README.md | 3 +- .../003_NETSTANDARD2_COMPAT_EVIDENCE.MD | 97 ++++++++++--------- docs/references/001_REFERENCES_CORE.MD | 1 + docs/references/101_REFERENCES_CORE.MD | 1 + docs/versioning/002_HISTORY_VERSIONS.MD | 2 +- docs/versioning/003_CHANGELOG_RELEASES.MD | 5 + docs/versioning/102_HISTORY_VERSIONS.MD | 2 +- docs/versioning/103_CHANGELOG_RELEASES.MD | 5 + 8 files changed, 69 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index bf6b1b25..7d9fb425 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,13 @@ EXPECTED_VERSION=X.Y.Z bash tools/ci/verify_nuget_release.sh ## 5. Compatibility / TFMs - Library-Zielplattformen: `netstandard2.0`, `net8.0` und `net10.0` - Release-Versioning: Git-Tag `vX.Y.Z` (optional `-prerelease`) ist SSOT +- Aktueller Pre-Release-Kanal der `5.2.0`-Linie: `v5.2.0-rc.6` ## 6. Architekturüberblick ### 6.1 Kernklassen (Datenfluss) | Kernklasse | Primäre Inputs | Primäre Outputs | Kernlogik | |---|---|---|---| -| `FileTypeDetector` | `path`, `byte[]`, `verifyExtension` | `FileType`, `DetectionDetail`, `bool`, `IReadOnlyList` | Header/Magic (`FileTypeRegistry`) plus Archiv-Gate (`ArchiveTypeResolver` + `ArchiveSafetyGate`) und optionales OOXML-Refinement (`OpenXmlRefiner`). | +| `FileTypeDetector` | `path`, `byte[]`, `verifyExtension` | `FileType`, `DetectionDetail`, `bool`, `IReadOnlyList` | Header/Magic (`FileTypeRegistry`) plus Archiv-Gate (`ArchiveTypeResolver` + `ArchiveSafetyGate`) und optionales Container-Refinement (`OpenXmlRefiner` fuer OOXML/OpenDocument, `LegacyOfficeBinaryRefiner` fuer OLE2-Office). | | `ArchiveProcessing` | `path`, `byte[]` | `bool`, `IReadOnlyList` | Fassade: path-basierte Validierung/Extraktion delegiert an `FileTypeDetector` (`TryValidateArchive` / `ExtractArchiveSafeToMemory`); byte-basierte Pfade nutzen `ArchivePayloadGuard` und `ArchiveEntryCollector`. | | `FileMaterializer` | `byte[]`, `destinationPath`, `overwrite`, `secureExtract` | `bool` | Nur Byte-basierte Persistenz: raw write oder (bei `secureExtract=true` und archivfähigem Payload) sichere Extraktion via `ArchiveExtractor`. | | `EvidenceHashing` | `path`, `byte[]`, `IReadOnlyList`, optionale Hash-Optionen | `HashEvidence`, `HashRoundTripReport` | Erkennung + Archivsammlung (`ArchiveEntryCollector`) und deterministische Manifest-/Payload-Hashes, inkl. RoundTrip über `FileMaterializer`. | diff --git a/docs/audit/compat/003_NETSTANDARD2_COMPAT_EVIDENCE.MD b/docs/audit/compat/003_NETSTANDARD2_COMPAT_EVIDENCE.MD index 3bf63de3..0a78e59e 100644 --- a/docs/audit/compat/003_NETSTANDARD2_COMPAT_EVIDENCE.MD +++ b/docs/audit/compat/003_NETSTANDARD2_COMPAT_EVIDENCE.MD @@ -1,12 +1,12 @@ # Evidence Report: netstandard2 Compat ## 1. Zweck -Dieser Report dokumentiert die technische Umsetzung und Verifikation fuer die net48-Kompatibilitaet ueber `netstandard2.0`. +Dieser Report dokumentiert die technische Umsetzung und Verifikation fuer die net48-Kompatibilitaet ueber `netstandard2.0` inklusive fail-closed Office-/Archiv-Refinement. ## 2. Geltungsbereich - Library: `src/FileTypeDetection/FileTypeDetectionLib.vbproj` -- Hashing-Core-Fassade: `src/FileTypeDetection/EvidenceHashing.vb` -- Provider-Abstraktionen und TFM-Provider unter `src/FileTypeDetection/Abstractions/Providers`, `src/FileTypeDetection/Composition`, `src/FileTypeDetection/Providers` +- Detektion/Refinement: `src/FileTypeDetection/FileTypeDetector.vb`, `src/FileTypeDetection/Detection/FileTypeRegistry.vb`, `src/FileTypeDetection/Infrastructure/CoreInternals.vb` +- Tests: `tests/FileTypeDetectionLib.Tests/Unit/*` ## 3. Regeln/Architektur ### 3.1 Before/After TargetFrameworks @@ -33,7 +33,7 @@ Dieser Report dokumentiert die technische Umsetzung und Verifikation fuer die ne - SHA256: `SHA256.HashData` - Hex: `Convert.ToHexString(...).ToLowerInvariant()` - FastHash: `XxHash3.HashToUInt64(...).ToString("x16")` -- FastHash ist auf `netstandard2.0` **nicht** deaktiviert. +- FastHash ist auf `netstandard2.0` nicht deaktiviert. ### 3.4 Provider-Selektion (compile-time) MSBuild-Conditionen in `src/FileTypeDetection/FileTypeDetectionLib.vbproj`: @@ -41,52 +41,48 @@ MSBuild-Conditionen in `src/FileTypeDetection/FileTypeDetectionLib.vbproj`: - `netstandard2.0`: `` - `net8.0|net10.0`: `` +### 3.5 Office-/Archiv-Semantik (fail-closed) +- Office/OpenDocument-Endungen werden alias-basiert auf gruppierte Typen aufgeloest (`Docx`, `Xlsx`, `Pptx`). +- Legacy-OLE2 (`.doc/.xls/.ppt`) wird ueber `LegacyOfficeBinaryRefiner` markerbasiert fail-closed verfeinert. +- `TryValidateArchive` und Extraktion akzeptieren nur echte extrahierbare Archiv-Container (`Zip`); Office-Container werden nicht als extrahierbares Archiv behandelt. +- Endungspruefung bleibt nachgelagerte Policy und wird nur bei explizitem Verify-Flag als Fehlerpfad erzwungen. + ## 4. Verifikation/Nachweise ### 4.1 Befehle und Exit-Codes -1. `dotnet --info` -> `0` +1. `dotnet --info` -> `0` (`artifacts/ci/netstandard2-compat/dotnet-info.txt`) 2. `dotnet restore FileClassifier.sln -v minimal` -> `0` -3. `dotnet build FileClassifier.sln -c Release --no-restore -warnaserror -v minimal` -> `0` -4. `dotnet test tests/FileTypeDetectionLib.Tests/FileTypeDetectionLib.Tests.csproj -c Release --no-build -v minimal` -> `0` (`414` Tests gruen) -5. `dotnet pack src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release --no-build -o artifacts/ci/netstandard2-compat/nuget -v minimal` -> `0` -6. `dotnet build src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release -f netstandard2.0 -v diag > artifacts/ci/netstandard2-compat/build-netstandard2.0.log` -> `0` -7. `dotnet build src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release -f net8.0 -v diag > artifacts/ci/netstandard2-compat/build-net8.0.log` -> `0` -8. `dotnet build src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release -f net10.0 -v diag > artifacts/ci/netstandard2-compat/build-net10.0.log` -> `0` -9. `python3 tools/check-doc-consistency.py` -> `0` -10. `python3 tools/check-docs.py` -> `0` -11. `bash tools/versioning/verify-version-convergence.sh` -> `0` -12. `bash tools/ci/bin/run.sh security-nuget` -> `0` -13. `EXPECTED_RELEASE_TAG=v5.2.0-rc.3 REQUIRE_RELEASE_TAG=1 bash tools/ci/check-versioning-svt.sh --repo-root . --out artifacts/ci/versioning-svt/versioning-svt-summary.json` -> `0` -14. `bash tools/ci/release/gate2_version_policy.sh release v5.2.0-rc.3 artifacts/nuget/Tomtastisch.FileClassifier.5.2.0-rc.3.nupkg` -> `0` -15. `VERIFY_ONLINE=0 bash tools/ci/release/gate4_verify_postpublish.sh 5.2.0-rc.3 artifacts/nuget/Tomtastisch.FileClassifier.5.2.0-rc.3.nupkg` -> `0` -16. `VERIFY_ONLINE=0 bash tools/ci/release/gate4_verify_postpublish.sh 5.2.0 artifacts/ci/netstandard2-compat/nuget/Tomtastisch.FileClassifier.5.2.0.nupkg` -> `0` +3. `dotnet restore --locked-mode FileClassifier.sln -v minimal` -> `0` +4. `dotnet build FileClassifier.sln -c Release --no-restore -warnaserror -v minimal` -> `0` +5. `dotnet test tests/FileTypeDetectionLib.Tests/FileTypeDetectionLib.Tests.csproj -c Release --no-build -v minimal` -> `0` (`543` Tests gruen) +6. `dotnet pack src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release --no-build -o artifacts/ci/netstandard2-compat/nuget -v minimal` -> `0` +7. `dotnet build src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release -f netstandard2.0 -v diag > artifacts/ci/netstandard2-compat/build-netstandard2.0.log` -> `0` +8. `dotnet build src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release -f net8.0 -v diag > artifacts/ci/netstandard2-compat/build-net8.0.log` -> `0` +9. `dotnet build src/FileTypeDetection/FileTypeDetectionLib.vbproj -c Release -f net10.0 -v diag > artifacts/ci/netstandard2-compat/build-net10.0.log` -> `0` +10. `python3 tools/check-doc-consistency.py` -> `0` +11. `python3 tools/check-docs.py` -> `0` +12. `EXPECTED_RELEASE_TAG=v5.2.0-rc.6 REQUIRE_RELEASE_TAG=1 bash tools/ci/bin/run.sh versioning-svt` -> `0` +13. `bash tools/ci/bin/run.sh version-convergence` -> `0` +14. `bash tools/ci/bin/run.sh security-nuget` -> `0` ### 4.2 Build-/Pack-Proof - Build-Matrix erfolgreich: - `src/FileTypeDetection/bin/Release/netstandard2.0/Tomtastisch.FileClassifier.dll` - `src/FileTypeDetection/bin/Release/net8.0/Tomtastisch.FileClassifier.dll` - `src/FileTypeDetection/bin/Release/net10.0/Tomtastisch.FileClassifier.dll` -- NUPKG-Inhalt (`unzip -l ... | rg "lib/"`): +- NUPKG-Inhalt (`artifacts/ci/netstandard2-compat/nuget/Tomtastisch.FileClassifier.5.2.0.nupkg`): - `lib/netstandard2.0/Tomtastisch.FileClassifier.dll` - `lib/net8.0/Tomtastisch.FileClassifier.dll` - `lib/net10.0/Tomtastisch.FileClassifier.dll` ### 4.3 Provider-Compile-Proof -- Build-Logs enthalten die erwarteten Providerpfade je TFM: - - `artifacts/ci/netstandard2-compat/build-netstandard2.0.log` mit `Providers/NetStandard2_0/HashPrimitivesProvider.vb` - - `artifacts/ci/netstandard2-compat/build-net8.0.log` mit `Providers/Net8_0Plus/HashPrimitivesProvider.vb` - - `artifacts/ci/netstandard2-compat/build-net10.0.log` mit `Providers/Net8_0Plus/HashPrimitivesProvider.vb` -- Runtime-nahe Marker-Probe aus den drei Build-Artefakten: +- Build-Task-Proof je TFM: `artifacts/ci/netstandard2-compat/provider-compile-proof-short.txt` + - `netstandard2.0` -> `Providers/NetStandard2_0/HashPrimitivesProvider.vb` + - `net8.0` -> `Providers/Net8_0Plus/HashPrimitivesProvider.vb` + - `net10.0` -> `Providers/Net8_0Plus/HashPrimitivesProvider.vb` +- Runtime-Marker-Proof: `artifacts/ci/netstandard2-compat/provider-marker-proof.txt` - `netstandard2.0:NetStandard2_0` - `net8.0:Net8_0Plus` - `net10.0:Net8_0Plus` -- Probe-Kommando: -```bash -tmpdir=$(mktemp -d) -cd "$tmpdir" -dotnet new console -n Probe -f net10.0 -# Program.cs laedt jede TFM-DLL in eigenem AssemblyLoadContext und liest ProviderMarker via Reflection. -dotnet run -c Release --no-restore -``` ### 4.4 Forbidden-API Grep-Proof (Core) Befehl: @@ -94,23 +90,36 @@ Befehl: rg -n "Convert\.ToHexString|SHA256\.HashData|System\.IO\.Hashing|Microsoft\.AspNetCore\.App" src/FileTypeDetection/Core ``` Ergebnis: -- keine Treffer (`forbidden_core_refs=none`) +- keine Treffer (`artifacts/ci/netstandard2-compat/core-forbidden-apis.txt` hat `0` Zeilen) + +### 4.5 Test-/Semantik-Proof (Office/OpenOffice/Archive) +- Neue/erweiterte Tests decken u. a. ab: + - falsche Endung vs. Inhaltsdetektion (`verifyExtension=false/true`) + - Legacy-OLE Office (`doc/xls/ppt`) + - OpenDocument (`odt/ods/odp`) + - echte Archive vs. Office-Container + - korrupte Payloads und Konfliktmarker (fail-closed) +- Relevante Testdateien: + - `tests/FileTypeDetectionLib.Tests/Unit/EndToEndFailClosedMatrixUnitTests.cs` + - `tests/FileTypeDetectionLib.Tests/Unit/LegacyOfficeBinaryRefinerUnitTests.cs` + - `tests/FileTypeDetectionLib.Tests/Unit/OpenXmlRefinerUnitTests.cs` + - `tests/FileTypeDetectionLib.Tests/Unit/ExtensionCheckUnitTests.cs` + - `tests/FileTypeDetectionLib.Tests/Unit/ArchiveExtractionUnitTests.cs` -### 4.5 CI-Teilchecks -- `artifacts/ci/versioning-svt/versioning-svt-summary.json` -> `status: pass` (pre-release `v5.2.0-rc.3`, core-match `5.2.0`) +### 4.6 Version-/Release-Konvergenz +- `artifacts/versioning_report.json` -> `status: pass`, `expected_version: 5.2.0-rc.6` +- `artifacts/ci/versioning-svt/versioning-svt-summary.json` -> `status: pass` - `artifacts/ci/version-convergence/summary.json` -> `status: pass`, `repo_version=5.2.0`, `vbproj_version=5.2.0`, `docs_latest_version=5.2.0` -- `artifacts/ci/security-nuget/result.json` -> `status: pass` -- Gate-4-PreRelease-Probe (`VERIFY_ONLINE=0`) zeigt `require_registration=0`. -- Gate-4-Stable-Probe (`VERIFY_ONLINE=0`) zeigt `require_registration=1`. +- RC-PreRelease-NUPKG fuer SVT-Probe: `artifacts/nuget/Tomtastisch.FileClassifier.5.2.0-rc.6.nupkg` -### 4.6 Policy/Konvergenz-Notiz +### 4.7 Policy/Konvergenz-Notiz Ambiguitaet zwischen: -- `docs/versioning/001_POLICY_VERSIONING.MD:43` (in PR/CI keine statischen Versionfelder), und -- existierendem SVT/Convergence-Setup (`verify-version-convergence.sh`, `check-versioning-svt.sh`), das `RepoVersion` und `Version`/`PackageVersion` in `FileTypeDetectionLib.vbproj` erwartet. +- `docs/versioning/001_POLICY_VERSIONING.MD` (Tag `vX.Y.Z[-prerelease]` als SSOT fuer Publish), und +- Repo-Konvergenzregeln (`RepoVersion`/`Version`/`PackageVersion` bleiben Kernversion `X.Y.Z`). Entscheidung fuer diesen Scope: -- fail-closed nach bestehendem CI/Repo-Vertrag: Versionen auf `5.2.0` synchron gehalten und durch `versioning-svt` + `version-convergence` verifiziert. -- Pre-Releases werden ueber Tag `v5.2.0-rc.N` abgebildet; die Projektfelder bleiben semantisch auf Kernversion `5.2.0`. +- fail-closed nach bestehendem CI/Repo-Vertrag: Kernversionen bleiben auf `5.2.0` konvergent. +- Pre-Release wird ueber Tag/NUPKG-Version `v5.2.0-rc.6` abgebildet und via SVT geprueft. ## 5. Grenzen/Nicht-Ziele - Keine oeffentliche API-Signatur geaendert. diff --git a/docs/references/001_REFERENCES_CORE.MD b/docs/references/001_REFERENCES_CORE.MD index 9b0d8aca..6bc6bd6c 100644 --- a/docs/references/001_REFERENCES_CORE.MD +++ b/docs/references/001_REFERENCES_CORE.MD @@ -46,6 +46,7 @@ Quelle: `FileTypeDetector.vb`. | `ArchiveStructuredRefined` | Archiv wurde strukturiert (z. B. OOXML) verfeinert | | `ArchiveRefined` | Archivtyp wurde inhaltlich verfeinert | | `ArchiveGeneric` | Archiv blieb generisch | +| `OfficeBinaryRefined` | Legacy-OLE-Dokument wurde über Marker-Refinement als Office-Typ erkannt | ## 4. Interne Kernpfade (Leseführung) | Interner Pfad | Datei | Bedeutung | Detail-README | diff --git a/docs/references/101_REFERENCES_CORE.MD b/docs/references/101_REFERENCES_CORE.MD index 1032bc20..e8bfdfb6 100644 --- a/docs/references/101_REFERENCES_CORE.MD +++ b/docs/references/101_REFERENCES_CORE.MD @@ -46,6 +46,7 @@ Source: `FileTypeDetector.vb`. | `ArchiveStructuredRefined` | archive was refined structurally (e.g. OOXML) | | `ArchiveRefined` | archive kind was refined by content | | `ArchiveGeneric` | archive stayed generic | +| `OfficeBinaryRefined` | legacy OLE document was mapped to an Office kind by marker refinement | ## 4. Internal core paths (guided reading) | Internal path | File | Meaning | Detail README | diff --git a/docs/versioning/002_HISTORY_VERSIONS.MD b/docs/versioning/002_HISTORY_VERSIONS.MD index 92090dee..341b7653 100644 --- a/docs/versioning/002_HISTORY_VERSIONS.MD +++ b/docs/versioning/002_HISTORY_VERSIONS.MD @@ -12,7 +12,7 @@ Heuristik fuer die Rueckwirkungs-Zuordnung: - `docs|test|ci|chore|tooling|refactor|fix` => Patch Aktueller Entwicklungsstand: -- Aktuelle Entwicklungslinie enthaelt `5.x` (aktueller Pre-Release-Stand: `v5.2.0-rc.2`, naechster stabiler Zielstand: `5.2.0`; Details in `docs/versioning/003_CHANGELOG_RELEASES.MD`). +- Aktuelle Entwicklungslinie enthaelt `5.x` (aktueller Pre-Release-Stand: `v5.2.0-rc.6`, naechster stabiler Zielstand: `5.2.0`; Details in `docs/versioning/003_CHANGELOG_RELEASES.MD`). Hinweis: - Die Spalte `Keyword` verwendet den technischen Klassifizierungswert aus der Historie. diff --git a/docs/versioning/003_CHANGELOG_RELEASES.MD b/docs/versioning/003_CHANGELOG_RELEASES.MD index 9721f12f..f4f715da 100644 --- a/docs/versioning/003_CHANGELOG_RELEASES.MD +++ b/docs/versioning/003_CHANGELOG_RELEASES.MD @@ -10,9 +10,14 @@ der Git-Tag `vX.Y.Z` (optional `-prerelease`) als SSOT. ## [Unreleased] - Added: - Incode-Dokumentation fuer die TFM-Providermethoden komplettiert (`HashPrimitivesProvider` fuer `netstandard2.0` und `net8.0+`). + - Legacy-Office-Refinement (`LegacyOfficeBinaryRefiner`) fuer OLE2-Dokumente mit fail-closed Marker-Logik eingefuehrt. + - Erweiterte E2E-Matrix-Tests fuer falsche Endungen, korrupte Payloads und Office/OpenDocument-Varianten ergänzt. - Changed: - Public XML-Dokumentation auf Policy-045 ausgerichtet: unzulaessige ``-Tags in fail-closed APIs entfernt. - Deutsche Log-/Dokumentationstexte mit korrekten Umlauten harmonisiert. + - Office-/OpenOffice-Aliasauflösung im `FileTypeRegistry` konsolidiert (`doc/docx/odt`, `xls/xlsx/ods`, `ppt/pptx/odp`). + - Archivextraktion nimmt nur noch echte, extrahierbare Archive (`Zip`) an; Office-Container werden nicht mehr als extrahierbares Archiv behandelt. + - `TryValidateArchive` prueft vor Safety-Gate explizit den erkannten Container-Typ. - Gate 4 (`tools/ci/release/gate4_verify_postpublish.sh`) fuer Pre-Release-Tags robust gemacht: - laengeres Retry-Fenster, - `registration` standardmaessig entkoppelt bei `vX.Y.Z-