From e3a8de1b2a7094f5c76fc60b2930fce78576087f Mon Sep 17 00:00:00 2001 From: Steve Lawrence Date: Fri, 19 Dec 2025 07:59:39 -0500 Subject: [PATCH] Avoid large allocations related to specified length strings When we need to parse a specified length string, we currently allocate a buffer that can be reused to store the decoded string. The size of this buffer is based on the maximumSimpleElementSizeInCharacters tunable, which defaults to a fairly large size (1MB) that can be slow and put added pressure on the garbage collector. Fortunately, this buffer is allocated using a LocalBuffer so it is reused during a parse so at worst there is only one allocation per parse. But when parsing many small files that contain specified length strings, this overhead can become noticeable. And 1MB is likely orders of magnitude larger than the vast majority of data formats will need for any single string element. To address this, instead of using maximumSimpleElementSizeInCharacters, we calculate how many characters the string could possibly decode to given the current bit position, bit limit, and encoding, and use that as the buffer size to request. This way we only ever request and allocate a large buffer if one is ever needed, which should be rare. Note that this new logic requires bitLimit as part of specified string parsing. That isn't available in the edge case of specified length complex nillables. The specified length nil parser is modified to handle this case. This also modifies the LocalBuffer to allocate buffers of a reasonably large minimum size of 1K. This way we will likely only ever need to allocate a single buffer rather than allocating small buffers that have to be reallocated as larger buffers are needed. Tested with small NITF files (<4000 bytes) that contain lots of fixed length strings, this saw about 30%+ performance improvements. Files tested as large as 8000 bytes saw little or no change in performance. 
DAFFODIL-2851 --- .../org/apache/daffodil/io/LocalBuffer.scala | 8 +++- .../processors/parsers/NilParsers.scala | 47 +++++++++++++------ .../parsers/StringLengthParsers.scala | 27 ++++++++++- 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala b/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala index dca66ea41f..cf37e9c050 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala @@ -33,7 +33,13 @@ abstract class LocalBuffer[T <: java.nio.Buffer] { def getBuf(length: Long) = { Assert.usage(length <= Int.MaxValue) if (tempBuf.isEmpty || tempBuf.get.capacity < length) { - tempBuf = Maybe(allocate(length.toInt)) + // allocate a buffer that can store the required length, but with a minimum size. The + // majority of LocalBuffers should be smaller than this minimum size and so should avoid + // costly reallocations, while still being small enough that the JVM should have no + // problem quickly allocating it + val minBufferSize = 1024 + val allocationSize = math.max(length.toInt, minBufferSize) + tempBuf = Maybe(allocate(allocationSize)) } val buf = tempBuf.get buf.clear diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala index af2cc7a1c4..faaa285ba7 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala @@ -40,21 +40,40 @@ abstract class LiteralNilOfSpecifiedLengthParserBase(erd: ElementRuntimeData) def isFieldNilLit(field: String): Boolean override def parse(start: PState): Unit = { - - val field = parseString(start) - - val isFieldEmpty = field.length() == 0 - - if 
(isFieldEmpty && isEmptyAllowed) { - // Valid! Success ParseResult indicates nilled - } else if (isFieldEmpty && !isEmptyAllowed) { - // Fail! - PE(start, "%s - Empty field found but not allowed!", eName) - } else if (isFieldNilLit(field)) { - // Contains a nilValue, Success ParseResult indicates nilled + if (erd.isComplexType) { + // nillable complex types must have a nilValue of %ES;. For a literal nil specified length + // complex to be nilled, that means either there must be a specified length that is zero + // or there isn't a specified length and we have reached the end of the data. If neither + // of these conditions are true, then there is non-empty data for this complex element and + // it cannot be nilled. + val bitLimit0b = start.bitLimit0b + val hasSpecifiedLength = bitLimit0b.isDefined + if ( + (hasSpecifiedLength && (bitLimit0b.get - start.bitPos0b) > 0) || + (!hasSpecifiedLength && start.dataInputStream.hasData) + ) { + // Fail! + PE(start, "%s - Does not contain a nil literal", eName) + } else { + // Valid! Success ParseResult indicates nilled + } } else { - // Fail! - PE(start, "%s - Does not contain a nil literal!", eName) + // Simple element, read a string up to the bitLimit and see if it matches the nilValue + val field = parseString(start) + + val isFieldEmpty = field.length() == 0 + + if (isFieldEmpty && isEmptyAllowed) { + // Valid! Success ParseResult indicates nilled + } else if (isFieldEmpty && !isEmptyAllowed) { + // Fail! + PE(start, "%s - Empty field found but not allowed", eName) + } else if (isFieldNilLit(field)) { + // Contains a nilValue, Success ParseResult indicates nilled + } else { + // Fail! 
+ PE(start, "%s - Does not contain a nil literal", eName) + } } } diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala index 16cbbd8126..08e28190ee 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala @@ -18,6 +18,7 @@ package org.apache.daffodil.runtime1.processors.parsers import org.apache.daffodil.io.processors.charset.BitsCharsetDecoderUnalignedCharDecodeException +import org.apache.daffodil.lib.exceptions.Assert import org.apache.daffodil.lib.util.MaybeChar import org.apache.daffodil.lib.util.Misc import org.apache.daffodil.runtime1.processors.CharsetEv @@ -86,8 +87,32 @@ trait StringOfSpecifiedLengthMixin extends PaddingRuntimeMixin with CaptureParsi protected final def parseString(start: PState): String = { val dis = start.dataInputStream - val maxLen = start.tunable.maximumSimpleElementSizeInCharacters val startBitPos0b = dis.bitPos0b + val bitLimit0b = dis.bitLimit0b + + // We want to limit the maximum length passed into getSomeString since that function can + // pre-allocate a buffer that size even if it won't find that many characters. So we + // calculate the maximum number of characters that we could possibly decode from the + // available bits and the character set. + // + // For fixed-width encodings, that is just the number of available bits divided by the + // fixed width of the encoding. + // + // For variable length encodings (e.g. UTF-8), the maximum number of characters that the + // available bits could possibly decode to is if every decoded character was the smallest + // possible representation. That smallest representation for variable-width encodings is + // bitWidthOfACodeUnit. 
So we divide the available bits by bitWidthOfACodeUnit. + // + // Note that the bitLimit should always be defined because bitLimit is how strings of + // specified length limit their length + Assert.invariant(bitLimit0b.isDefined) + val availableBits = bitLimit0b.get - startBitPos0b + val charset = charsetEv.evaluate(start) + val optWidth = charset.maybeFixedWidth + val bitsPerChar = if (optWidth.isDefined) optWidth.get else charset.bitWidthOfACodeUnit + // add one to allow for partial bytes at the end that could parse to a replacement char + val maxPossibleChars = (availableBits / bitsPerChar) + 1 + val maxLen = math.min(maxPossibleChars, start.tunable.maximumSimpleElementSizeInCharacters) val strOpt = try {