-
Notifications
You must be signed in to change notification settings - Fork 86
#723 Add the ability to generate _corrupt_fields column containing information on fields that Cobrix failed to decode
#820
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e501f35
0f546e0
bb08a9c
be408d9
3d1a9a8
620daeb
8a835e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,9 +16,9 @@ | |
|
|
||
| package za.co.absa.cobrix.cobol.parser.ast | ||
|
|
||
| import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType, Decimal, Integral} | ||
| import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, COMP3, CobolType, Decimal, Integral} | ||
| import za.co.absa.cobrix.cobol.parser.decoders.{BinaryUtils, DecoderSelector} | ||
| import za.co.absa.cobrix.cobol.parser.encoding.EncoderSelector | ||
| import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC, EncoderSelector} | ||
|
|
||
| /** An abstraction of the statements describing fields of primitive data types in the COBOL copybook | ||
| * | ||
|
|
@@ -63,6 +63,44 @@ case class Primitive( | |
| /** This is cached value specifying if the field is a string */ | ||
| private val isString = dataType.isInstanceOf[AlphaNumeric] | ||
|
|
||
| /** This is cached value to speedup checking for empty values */ | ||
| private val spaceChar: Byte = { | ||
| dataType match { | ||
| case t: AlphaNumeric => | ||
| t.enc match { | ||
| case Some(EBCDIC) => 0x40 | ||
| case Some(ASCII) => 0x20 | ||
| case Some(_) => 0 | ||
| case None => 0x40 | ||
| } | ||
| case t: Integral => | ||
| t.compact match { | ||
| case Some(COMP3()) => 0x40 | ||
| case Some(_) => 0 | ||
| case None => | ||
| t.enc match { | ||
| case Some(EBCDIC) => 0x40 | ||
| case Some(ASCII) => 0x20 | ||
| case Some(_) => 0 | ||
| case None => 0x40 | ||
| } | ||
| } | ||
| case t: Decimal => | ||
| t.compact match { | ||
| case Some(COMP3()) => 0x40 | ||
| case Some(_) => 0 | ||
| case None => | ||
| t.enc match { | ||
| case Some(EBCDIC) => 0x40 | ||
| case Some(ASCII) => 0x20 | ||
| case Some(_) => 0 | ||
| case None => 0x40 | ||
| } | ||
| } | ||
| case _ => 0 | ||
| } | ||
|
Comment on lines
+66
to
+101
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: # First, let's understand how spaceChar is used
rg -A 5 -B 5 "spaceChar" cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/ast/Primitive.scalaRepository: AbsaOSS/cobrix Length of output: 630 🏁 Script executed: # Search for usages of spaceChar throughout the codebase
rg "spaceChar" --type scala -A 3 -B 3Repository: AbsaOSS/cobrix Length of output: 3940 🏁 Script executed: # Look for COMP-3 handling in reader/extractor code
rg "COMP3|0x40" cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/ --type scala -A 2 -B 2Repository: AbsaOSS/cobrix Length of output: 40 🏁 Script executed: # Check for empty/null value detection in readers
rg "isEmpty|isNull|empty.*value" cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/ --type scala -A 2 | head -50Repository: AbsaOSS/cobrix Length of output: 5846 🏁 Script executed: # Look for test cases related to COMP-3 empty values
fd ".*Suite\.scala$" cobol-parser/src/test --exec rg -l "COMP3|comp3" {} \;Repository: AbsaOSS/cobrix Length of output: 40 🏁 Script executed: # Find test files mentioning COMP3 or packed decimal
find cobol-parser/src/test -name "*.scala" -exec grep -l "COMP3\|comp3\|packed" {} \;Repository: AbsaOSS/cobrix Length of output: 329 🏁 Script executed: # Check for corrupted field handling related to COMP-3
rg "corrupted|Corrupt" cobol-parser/src/main/scala --type scala -B 2 -A 2 | grep -A 5 -B 5 "COMP3"Repository: AbsaOSS/cobrix Length of output: 40 🏁 Script executed: # Look for isEmpty method implementation and all its usages
rg "def isEmpty" cobol-parser/src/main/scala --type scala -A 10Repository: AbsaOSS/cobrix Length of output: 1312 🏁 Script executed: # Search for any configuration or documentation about null encoding
rg "null.*encod|encod.*null|0x00|zero.*byte" cobol-parser/src/main/scala --type scala | head -20Repository: AbsaOSS/cobrix Length of output: 3289 🏁 Script executed: # Let's look at the BinaryDecoderSpec test
cat cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryDecoderSpec.scala | head -100Repository: AbsaOSS/cobrix Length of output: 10149 🏁 Script executed: # Check isEmpty usage in tests
rg "isEmpty" cobol-parser/src/test --type scala -B 3 -A 3Repository: AbsaOSS/cobrix Length of output: 24980 🏁 Script executed: # Look for COMP3 in the test files we found
rg "COMP3|comp3" cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryDecoderSpec.scala -B 5 -A 5Repository: AbsaOSS/cobrix Length of output: 4525 🏁 Script executed: # Let's check the full isEmpty method
sed -n '102,130p' cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/ast/Primitive.scalaRepository: AbsaOSS/cobrix Length of output: 1087 🏁 Script executed: # Look for the complete Primitive.scala spaceChar definition with context
sed -n '60,110p' cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/ast/Primitive.scalaRepository: AbsaOSS/cobrix Length of output: 1540 🏁 Script executed: # Search for any test that calls isEmpty or checks empty values with COMP-3
rg "isEmpty|empty" cobol-parser/src/test --type scala | grep -i "comp3\|packed"Repository: AbsaOSS/cobrix Length of output: 40 🏁 Script executed: # Check for usage of isEmpty method in the codebase
rg "\.isEmpty\(" cobol-parser/src/main --type scala -B 2 | head -40Repository: AbsaOSS/cobrix Length of output: 1013 Change COMP‑3 empty byte from COMP-3 is packed decimal format where 0x40 is meaningless data. Using 0x40 as the empty indicator causes fields with invalid all-0x40 bytes to be incorrectly classified as empty and excluded from corruption reporting. This contradicts the design principle that null values are encoded as 0x00 for all field types. Changing this to 0x00 aligns with the null encoding scheme and ensures corrupted COMP-3 data is properly detected. Suggested fix- case Some(COMP3()) => 0x40
+ case Some(COMP3()) => 0- case Some(COMP3()) => 0x40
+ case Some(COMP3()) => 0🤖 Prompt for AI Agents |
||
| } | ||
|
|
||
| /** Returns a string representation of the field */ | ||
| override def toString: String = { | ||
| s"${" " * 2 * level}$camelCased ${camelCase(redefines.getOrElse(""))} $dataType" | ||
|
|
@@ -90,7 +128,6 @@ case class Primitive( | |
| copy(dependingOnHandlers = newDependingOnHandlers)(parent) | ||
| } | ||
|
|
||
|
|
||
| /** Returns the binary size in bits for the field */ | ||
| def getBinarySizeBytes: Int = { | ||
| dataType match { | ||
|
|
@@ -110,6 +147,63 @@ case class Primitive( | |
| * @param record A record in a binary format represented as a vector of bits | ||
| */ | ||
| def decodeTypeValue(itOffset: Int, record: Array[Byte]): Any = { | ||
| val bytes = getRawValue(itOffset, record) | ||
|
|
||
| if (bytes == null) null else decode(bytes) | ||
| } | ||
|
|
||
| /** | ||
| * Checks if a value extracted from a given binary record at a specified offset is considered empty. | ||
| * A value is considered empty if it contains only null bytes or bytes equal to a space character | ||
| * of the underlying encoding (e.g., 0x40 for EBCDIC, 0x20 for ASCII, 0x00 for binary). | ||
| * | ||
| * @param itOffset The offset within the binary record where the value starts. | ||
| * @param record The binary record represented as an array of bytes. | ||
| * @return `true` if the value is empty, otherwise `false`. | ||
| */ | ||
| def isEmpty(itOffset: Int, record: Array[Byte]): Boolean = { | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| val bytesCount = binaryProperties.dataSize | ||
| val idx = itOffset | ||
|
|
||
| if (isString) { | ||
| // The length of a string can be smaller for varchar fields at the end of a record | ||
| if (idx > record.length) { | ||
| return true | ||
| } | ||
| } else { | ||
| // Non-string field size should exactly fit the required bytes | ||
| if (idx + bytesCount > record.length) { | ||
| return true | ||
| } | ||
| } | ||
|
|
||
| // Determine the actual number of bytes to copy based on the record size. | ||
| // Varchar fields can be trimmed by the record size. | ||
| val endIndex = if (idx + bytesCount > record.length) { | ||
| record.length | ||
| } else { | ||
| idx + bytesCount | ||
| } | ||
| var i = idx | ||
| while (i < endIndex) { | ||
| if (record(i) != 0 && record(i) != spaceChar) { | ||
| return false | ||
| } | ||
| i += 1 | ||
| } | ||
| true | ||
| } | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| /** | ||
| * Extracts a raw byte array representation of a value from a binary record | ||
| * based on the specified offset and the field's properties. | ||
| * | ||
| * @param itOffset The offset within the binary record where the value starts. | ||
| * @param record The binary record represented as an array of bytes. | ||
| * @return An array of bytes representing the value, or `null` if the offset | ||
| * or size is invalid for the given binary record. | ||
| */ | ||
| def getRawValue(itOffset: Int, record: Array[Byte]): Array[Byte] = { | ||
| val bytesCount = binaryProperties.dataSize | ||
| val idx = itOffset | ||
|
|
||
|
|
@@ -119,7 +213,7 @@ case class Primitive( | |
| return null | ||
| } | ||
| } else { | ||
| // Non-string field size should exactly fix the required bytes | ||
| // Non-string field size should exactly fit the required bytes | ||
| if (idx + bytesCount > record.length) { | ||
| return null | ||
| } | ||
|
|
@@ -132,9 +226,7 @@ case class Primitive( | |
| } else { | ||
| bytesCount | ||
| } | ||
| val bytes = java.util.Arrays.copyOfRange(record, idx, idx + bytesToCopy) | ||
|
|
||
| decode(bytes) | ||
| java.util.Arrays.copyOfRange(record, idx, idx + bytesToCopy) | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| /* | ||
| * Copyright 2018 ABSA Group Limited | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package za.co.absa.cobrix.cobol.reader.extractors.record | ||
|
|
||
| case class CorruptField( | ||
| fieldName: String, | ||
| rawValue: Array[Byte] | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fix heading level increment (MD001).
The changelog jumps from
##to####. Use###for the 2.9.8 entry.📝 Suggested fix
📝 Committable suggestion
🧰 Tools
🪛 markdownlint-cli2 (0.20.0)
[warning] 1954-1954: Heading levels should only increment by one level at a time
Expected: h3; Actual: h4
(MD001, heading-increment)
🤖 Prompt for AI Agents