From aa693fce50f8a74a7cbece7b0c9b78bf53721d40 Mon Sep 17 00:00:00 2001
From: Alexandre Flores
Date: Mon, 15 Dec 2025 14:44:04 +0000
Subject: [PATCH] correctly write array simple cells content and xsd

---
Review notes (everything between the "---" line and the first "diff --git" is
commentary, not commit message):

* SIARD22ContentWithExternalLobsExportStrategy
  - writeLargeObjectDataOutside(cellPrefix, cell, columnIndex, arrayIndex): the
    NULL-content branch passes arrayIndex (not columnIndex) to
    writeNullCellData, so it uses the same index as the element tag
    "cellPrefix + arrayIndex" written at the end of the method.
  - Same method, CLOB branch: the text is encoded to UTF-8 once and that byte
    array feeds both the ByteArrayInputStream and the length handed to
    InputStreamProviderImpl. Taking the length from getBytes() without a
    charset can disagree with the UTF-8 stream when the platform default
    charset is not UTF-8.
  - writeSimpleCell(..., columnIndex, arrayIndex): reuses the already-cast
    simpleCell local instead of casting a second time.
* SIARD22ContentExportStrategy.writeXsd: the element loop counts from 1 up to
  the cardinality (inclusive), so the element name is simply "a" + c; the
  emitted names are still a1..aN.

Open points, to be confirmed by the author:
  - NOTE(review): col.getCardinality() is dereferenced through compareTo();
    presumably it is never null for array columns - confirm.
  - NOTE(review): the base-class writeSimpleCell(..., columnIndex, arrayIndex)
    never reads columnIndex and hands arrayIndex to writeLargeObjectData, which
    passes its third argument to getAbsoluteInternalLobDirPath as the column
    index - confirm this is intended.
  - NOTE(review): the new writeLargeObjectDataOutside overload looks like a
    near copy of the three-index variant (only the tail of that variant is
    visible in this patch); consider sharing the code.
  - NOTE(review): "Triple segmentKey" is a raw type as written - confirm the
    type arguments.
  - NOTE(review): leading whitespace and blank context lines were re-derived
    from the hunk line counts; if a hunk does not match, try
    "git apply --ignore-whitespace" and check the two bare "//" context
    comments in the writeXsd hunk against the target file.

 .../content/SIARD22ContentExportStrategy.java |  31 ++++-
 ...ContentWithExternalLobsExportStrategy.java | 130 +++++++++++++++++-
 ...entWithExternalLobsPathExportStrategy.java |  12 ++
 3 files changed, 165 insertions(+), 8 deletions(-)

diff --git a/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentExportStrategy.java b/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentExportStrategy.java
index 089586779..ddb607f2c 100644
--- a/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentExportStrategy.java
+++ b/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentExportStrategy.java
@@ -12,6 +12,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.math.BigInteger;
 import java.nio.file.Paths;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
@@ -263,7 +264,7 @@ private void writeArrayCell(String cellPrefix, Cell cell, ColumnStructure column
       if (subCell instanceof BinaryCell) {
         writeBinaryCell(CELL_PREFIX_ARRAY, subCell, column, cellTag);
       } else if (subCell instanceof SimpleCell) {
-        writeSimpleCell(CELL_PREFIX_ARRAY, subCell, column, cellTag);
+        writeSimpleCell(CELL_PREFIX_ARRAY, subCell, column, columnIndex, cellTag);
       } else if (subCell instanceof ComposedCell) {
         writeComposedCell(CELL_PREFIX_ARRAY, subCell, column, cellTag);
       } else if (subCell instanceof NullCell) {
@@ -344,6 +345,18 @@ protected void writeSimpleCell(String cellPrefix, Cell cell, ColumnStructure col
     }
   }
 
+  protected void writeSimpleCell(String cellPrefix, Cell cell, ColumnStructure column, int columnIndex, int arrayIndex)
+    throws ModuleException, IOException {
+    SimpleCell simpleCell = (SimpleCell) cell;
+
+    if (Sql2008toXSDType.isLargeType(column.getType(), reporter)
+      && simpleCell.getBytesSize() > THRESHOLD_TREAT_STRING_AS_CLOB) {
+      writeLargeObjectData(cellPrefix, cell, arrayIndex);
+    } else {
+      writeSimpleCellData(cellPrefix, simpleCell, arrayIndex);
+    }
+  }
+
   protected void writeBinaryCell(String cellPrefix, Cell cell, ColumnStructure column, int columnIndex)
     throws ModuleException, IOException {
     BinaryCell binaryCell = (BinaryCell) cell;
@@ -389,8 +402,8 @@ protected void writeLargeObjectData(String cellPrefix, Cell cell, int columnInde
       WaitingInputStream waitingInputStream = new WaitingInputStream(digest);
       InputStream inputStream = new BufferedInputStream(waitingInputStream);
 
-      String lobDir = contentPathStrategy.getAbsoluteInternalLobDirPath(currentSchema.getIndex(), currentTable.getIndex(),
-        columnIndex);
+      String lobDir = contentPathStrategy.getAbsoluteInternalLobDirPath(currentSchema.getIndex(),
+        currentTable.getIndex(), columnIndex);
 
       lob = new LargeObject(new InputStreamProviderImpl(inputStream),
         contentPathStrategy.getInternalBlobFileName(currentRowIndex + 1));
@@ -431,8 +444,8 @@ protected void writeLargeObjectData(String cellPrefix, Cell cell, int columnInde
       final WaitingInputStream waitingInputStream = new WaitingInputStream(digest);
       InputStream inputStream = new BufferedInputStream(waitingInputStream);
 
-      String lobDir = contentPathStrategy.getAbsoluteInternalLobDirPath(currentSchema.getIndex(), currentTable.getIndex(),
-        columnIndex);
+      String lobDir = contentPathStrategy.getAbsoluteInternalLobDirPath(currentSchema.getIndex(),
+        currentTable.getIndex(), columnIndex);
 
       lob = new LargeObject(new InputStreamProviderImpl(inputStream),
         contentPathStrategy.getInternalClobFileName(currentRowIndex + 1));
@@ -558,8 +571,12 @@ private void writeXsd() throws IOException, ModuleException {
         //
         xsdWriter.openTag(XS_SEQUENCE, 6);
 
-        xsdWriter.beginOpenTag("xs:any", 7).appendAttribute(MIN_OCCURS, "0").appendAttribute("maxOccurs", "unbounded")
-          .appendAttribute("processContents", "skip").endShorthandTag();
+        String xsdSubtype = Sql2008toXSDType.convert(col.getType().getSql2008TypeName());
+        for (BigInteger c = BigInteger.ONE; c.compareTo(col.getCardinality()) <= 0; c = c.add(BigInteger.ONE)) {
+          xsdWriter.beginOpenTag(XS_ELEMENT, 7).appendAttribute(MIN_OCCURS, "0");
+          xsdWriter.appendAttribute("name", "a" + c).appendAttribute("type", xsdSubtype)
+            .endShorthandTag();
+        }
 
         //
         xsdWriter.closeTag(XS_SEQUENCE, 6);
diff --git a/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentWithExternalLobsExportStrategy.java b/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentWithExternalLobsExportStrategy.java
index 9c5486e49..6ff172544 100644
--- a/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentWithExternalLobsExportStrategy.java
+++ b/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/content/SIARD22ContentWithExternalLobsExportStrategy.java
@@ -92,6 +92,18 @@ protected void writeSimpleCell(String cellPrefix, Cell cell, ColumnStructure col
     }
   }
 
+  @Override
+  protected void writeSimpleCell(String cellPrefix, Cell cell, ColumnStructure column, int columnIndex, int arrayIndex)
+    throws ModuleException, IOException {
+    SimpleCell simpleCell = (SimpleCell) cell;
+    long length = simpleCell.getBytesSize();
+    if (Sql2008toXSDType.isLargeType(column.getType(), reporter) && length > clobThresholdLimit) {
+      writeLargeObjectDataOutside(cellPrefix, cell, columnIndex, arrayIndex);
+    } else {
+      writeSimpleCellData(cellPrefix, simpleCell, arrayIndex);
+    }
+  }
+
   @Override
   protected void writeBinaryCell(String cellPrefix, Cell cell, ColumnStructure column, int columnIndex)
     throws ModuleException, IOException {
@@ -208,7 +220,123 @@ private void writeLargeObjectDataOutside(String cellPrefix, Cell cell, int colum
       .get(firstExternalContainer.getPath().getFileName().toString() + File.separator, lobFileParameter).toString());
 
     // write the LOB XML element
-    currentWriter.beginOpenTag("c" + columnIndex, 2).appendAttribute("file", lobURI).appendAttribute("length",
+    currentWriter.beginOpenTag(cellPrefix + columnIndex, 2).appendAttribute("file", lobURI).appendAttribute("length",
+      String.valueOf(lobSizeParameter));
+
+    if (lobDigestChecksum != null) {
+      cell.setMessageDigest(lobDigestChecksum);
+      cell.setDigestAlgorithm(messageDigestAlgorithm);
+
+      currentWriter.appendAttribute("digestType", messageDigestAlgorithm.toUpperCase());
+      currentWriter.appendAttribute("digest", MessageDigestUtils.getHexFromMessageDigest(lobDigestChecksum, lowerCase));
+      lobDigestChecksum = null; // reset it to the default value
+    }
+
+    currentWriter.endShorthandTag();
+  }
+
+  private void writeLargeObjectDataOutside(String cellPrefix, Cell cell, int columnIndex, int arrayIndex)
+    throws IOException, ModuleException {
+    String lobFileParameter = null;
+    long lobSizeParameter = 0;
+    LargeObject lob = null;
+
+    // get size
+    if (cell instanceof BinaryCell binCell) {
+      lobSizeParameter = binCell.getSize();
+    } else if (cell instanceof SimpleCell txtCell) {
+      lobSizeParameter = txtCell.getBytesSize();
+    }
+
+    // determine path
+    Triple segmentKey = Triple.of(currentSchema.getIndex(), currentTable.getIndex(),
+      columnIndex);
+    SIARDArchiveContainer currentExternalContainer = currentExternalContainers.getOrDefault(segmentKey, null);
+    if (currentExternalContainer == null) {
+      currentExternalContainer = getAnotherExternalContainer(segmentKey);
+      writeStrategy.setup(currentExternalContainer);
+      currentLobsFolderSize = 0;
+      currentLobsInFolder = 0;
+    } else if ((maximumLobsFolderSize > 0 && lobSizeParameter + currentLobsFolderSize >= maximumLobsFolderSize
+      && (lobSizeParameter <= maximumLobsFolderSize || currentLobsFolderSize >= maximumLobsFolderSize))
+      || currentLobsInFolder >= maximumLobsPerFolder) {
+      writeStrategy.finish(currentExternalContainer);
+      currentExternalContainer = getAnotherExternalContainer(segmentKey);
+      writeStrategy.setup(currentExternalContainer);
+      currentLobsFolderSize = 0;
+      currentLobsInFolder = 0;
+    }
+    currentExternalContainers.put(segmentKey, currentExternalContainer);
+    SIARDArchiveContainer firstExternalContainer = currentExternalContainer;
+
+    // get file xml parameters
+    if (contentPathStrategy instanceof SIARD22ContentWithExternalLobsPathExportStrategy paths) {
+      if (cell instanceof BinaryCell) {
+        lobFileParameter = paths.getBlobOuterFilePath(currentTable.getIndex(), columnIndex, currentRowIndex + 1,
+          arrayIndex);
+      } else if (cell instanceof SimpleCell) {
+        lobFileParameter = paths.getClobOuterFilePath(currentTable.getIndex(), columnIndex, currentRowIndex + 1,
+          arrayIndex);
+      }
+    } else {
+      throw new NotImplementedException("Unsupported ContentPathStrategy");
+    }
+
+    if (lobSizeParameter < 0) {
+      // NULL content
+      writeNullCellData(cellPrefix, new NullCell(cell.getId()), arrayIndex);
+      return;
+    }
+
+    // get lob object
+    if (cell instanceof BinaryCell binCell) {
+      lob = new LargeObject(binCell, lobFileParameter);
+    } else if (cell instanceof SimpleCell txtCell) {
+      byte[] utf8Data = txtCell.getSimpleData().getBytes(StandardCharsets.UTF_8);
+      ByteArrayInputStream inputStream = new ByteArrayInputStream(utf8Data);
+      lob = new LargeObject(new InputStreamProviderImpl(inputStream, utf8Data.length), lobFileParameter);
+    }
+
+    // write LOB
+    if (writeStrategy.isSimultaneousWritingSupported()) {
+      if (maximumLobsFolderSize > 0 && lobSizeParameter >= maximumLobsFolderSize) {
+        long remainingLobSize = lobSizeParameter;
+        int partSize = (int) (maximumLobsFolderSize - currentLobsFolderSize);
+        int partIndex = 1;
+        try (InputStream lobInputStream = lob.getInputStreamProvider().createInputStream()) {
+          while (remainingLobSize > 0) {
+            writeLOBPartOutside(lob, lobInputStream, currentExternalContainer, partSize, partIndex);
+            currentLobsInFolder++;
+            currentLobsFolderSize += partSize;
+            partIndex++;
+            remainingLobSize -= partSize;
+            partSize = (int) Math.min(maximumLobsFolderSize, remainingLobSize);
+            if (partSize > 0) {
+              writeStrategy.finish(currentExternalContainer);
+              currentExternalContainer = getAnotherExternalContainer(segmentKey);
+              writeStrategy.setup(currentExternalContainer);
+              currentLobsFolderSize = 0;
+              currentLobsInFolder = 0;
+            }
+          }
+        }
+        currentExternalContainers.put(segmentKey, currentExternalContainer);
+      } else {
+        writeLOBOutside(lob, currentExternalContainer);
+        currentLobsFolderSize += lobSizeParameter;
+        currentLobsInFolder++;
+      }
+    } else {
+      throw new NotImplementedException(SIARD22ContentWithExternalLobsExportStrategy.class.getName()
+        + " is not ready to be used with write strategies that don't support simultaneous writing.");
+    }
+
+    // something like "seg_0/t2_c8_r2.bin"
+    String lobURI = FilenameUtils.separatorsToUnix(Paths
+      .get(firstExternalContainer.getPath().getFileName().toString() + File.separator, lobFileParameter).toString());
+
+    // write the LOB XML element
+    currentWriter.beginOpenTag(cellPrefix + arrayIndex, 2).appendAttribute("file", lobURI).appendAttribute("length",
       String.valueOf(lobSizeParameter));
 
     if (lobDigestChecksum != null) {
diff --git a/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/path/SIARD22ContentWithExternalLobsPathExportStrategy.java b/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/path/SIARD22ContentWithExternalLobsPathExportStrategy.java
index f5d196693..8cf59368e 100644
--- a/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/path/SIARD22ContentWithExternalLobsPathExportStrategy.java
+++ b/dbptk-modules/dbptk-module-siard/src/main/java/com/databasepreservation/modules/siard/out/path/SIARD22ContentWithExternalLobsPathExportStrategy.java
@@ -47,11 +47,23 @@ public String getClobOuterFilePath(int tableIndex, int columnIndex, int rowIndex
       .append(rowIndex).append(FILE_EXTENSION_SEPARATOR).append(CLOB_EXTENSION).toString();
   }
 
+  public String getClobOuterFilePath(int tableIndex, int columnIndex, int rowIndex, int arrayIndex) {
+    return new StringBuilder().append("t").append(tableIndex).append("_c").append(columnIndex).append("_r")
+      .append(rowIndex).append("_a").append(arrayIndex).append(FILE_EXTENSION_SEPARATOR).append(CLOB_EXTENSION)
+      .toString();
+  }
+
   public String getBlobOuterFilePath(int tableIndex, int columnIndex, int rowIndex) {
     return new StringBuilder().append("t").append(tableIndex).append("_c").append(columnIndex).append("_r")
       .append(rowIndex).append(FILE_EXTENSION_SEPARATOR).append(BLOB_EXTENSION).toString();
   }
 
+  public String getBlobOuterFilePath(int tableIndex, int columnIndex, int rowIndex, int arrayIndex) {
+    return new StringBuilder().append("t").append(tableIndex).append("_c").append(columnIndex).append("_r")
+      .append(rowIndex).append("_a").append(arrayIndex).append(FILE_EXTENSION_SEPARATOR).append(BLOB_EXTENSION)
+      .toString();
+  }
+
   @Override
   public String getClobFilePath(int schemaIndex, int tableIndex, int columnIndex, int rowIndex) {
     return new StringBuilder().append(CONTENT_DIR).append(FILE_SEPARATOR).append(SCHEMA_DIR).append(schemaIndex)