diff --git a/PCAxis.Serializers/Parquet/ParquetBuilder.cs b/PCAxis.Serializers/Parquet/ParquetBuilder.cs index 4d847a7..8073cd7 100644 --- a/PCAxis.Serializers/Parquet/ParquetBuilder.cs +++ b/PCAxis.Serializers/Parquet/ParquetBuilder.cs @@ -58,7 +58,12 @@ public Table PopulateTable() int matrixSize = model.Data.MatrixColumnCount * model.Data.MatrixRowCount; double[] data = new double[matrixSize]; int[] variableValueCounts = GetVariableValueCounts(); - var indices = GenerateDataPointIndices(variableValueCounts); + // Build a mask indicating which variables are multi-valued content variables. + bool[] isContentMulti = model.Meta.Variables + .Select(v => v.IsContentVariable && v.Values.Count > 1) + .ToArray(); + + var indices = GenerateDataPointIndices(variableValueCounts, isContentMulti); for (int m = 0; m < matrixSize; m++) { @@ -213,23 +218,26 @@ private void PopulateContentVariableRow(int[] index, int[] variableValueCounts, int columnIndex = dataFieldIndices[columnName]; int symbolColumnIndex = dataFieldIndices[symbolColumnName]; // Get index of the symbol column - int dataIndex = ParquetBuilder.GetDataIndex(index, variableValueCounts); - if (dataIndex + j < data.Length) - { - dataIndex += j; - } + // Compute the exact data index for this content value by cloning the + // multi-dimensional index, setting the content variable coordinate to j, + // and converting to the linear data index. This avoids assumptions about + // dimension ordering or stride between content values. + int varPos = model.Meta.Variables.IndexOf(variable); + int[] indexForValue = (int[])index.Clone(); + indexForValue[varPos] = j; + int dataIndexForValue = ParquetBuilder.GetDataIndex(indexForValue, variableValueCounts); - if (dataIndex >= 0 && dataIndex < data.Length) + if (dataIndexForValue >= 0 && dataIndexForValue < data.Length) { - if (dataSymbolMap.ContainsKey(data[dataIndex])) + if (dataSymbolMap.ContainsKey(data[dataIndexForValue])) { - row[symbolColumnIndex] = dataSymbolMap[data[dataIndex]]; // Set the symbol value + row[symbolColumnIndex] = dataSymbolMap[data[dataIndexForValue]]; // Set the symbol value row[columnIndex] = double.NaN; // Replace the value with double.NaN } else { - row[columnIndex] = data[dataIndex]; + row[columnIndex] = data[dataIndexForValue]; row[symbolColumnIndex] = null; // No symbol } } @@ -250,8 +258,11 @@ private void PopulateNonContentVariableRow(int[] index, object[] row, Dictionary var value = variable.Values[index[i]].Code; if (variable.IsTime) { - value = variable.Values[index[i]].TimeValue; - row[dataFieldIndices[variable.Name]] = value; // Original time-value + // Use the VALUES ordering (Code) for the displayed time value so it + // matches the order of the data array. Some PX files have a + // different TIMEVAL ordering; using Code ensures consistency with + // the data which follows VALUES(...). + row[dataFieldIndices[variable.Name]] = value; // Original time-value (from VALUES) row[dataFieldIndices["timestamp"]] = ParseTimeScale(value, variable.TimeScale); // Parsed timestamp } else @@ -412,11 +423,12 @@ static int GetDataIndex(int[] index, int[] variableValueCounts) for (int i = index.Length - 1; i >= 0; i--) { - dataIndex += index[i] * multiplier; - if (i < variableValueCounts.Length - 1) // Adjusting the condition here + if (i < variableValueCounts.Length - 1) // Ensure multiplier equals product of dimensions to the right { multiplier *= variableValueCounts[i + 1]; } + + dataIndex += index[i] * multiplier; } return dataIndex; @@ -473,16 +485,22 @@ private static PXModel RearrangeValues(PXModel model) /// /// An array of integers representing the counts of values for each variable. /// A list of integer arrays representing the data point indices. - private static List GenerateDataPointIndices(int[] variableValueCounts) + private static List GenerateDataPointIndices(int[] variableValueCounts, bool[] isContentMulti) { int variableCount = variableValueCounts.Length; - int[] variableIndexCounts = new int[variableCount]; + + // Effective counts treat multi-valued content variables as having count 1 + // so that rows correspond to combinations of the other variables only. + int[] effectiveCounts = new int[variableCount]; + for (int i = 0; i < variableCount; i++) + { + effectiveCounts[i] = (isContentMulti != null && i < isContentMulti.Length && isContentMulti[i]) ? 1 : variableValueCounts[i]; + } int totalDataPoints = 1; for (int i = variableCount - 1; i >= 0; i--) { - variableIndexCounts[i] = totalDataPoints; - totalDataPoints *= variableValueCounts[i]; + totalDataPoints *= effectiveCounts[i]; } List dataPointIndices = new List(totalDataPoints); @@ -494,12 +512,14 @@ private static List GenerateDataPointIndices(int[] variableValueCounts) int tempIndex = dataIndex; for (int variableIndex = 0; variableIndex < variableCount; variableIndex++) { - indices[variableIndex] = tempIndex % variableValueCounts[variableIndex]; - tempIndex /= variableValueCounts[variableIndex]; + // For multi-valued content variables effectiveCounts will be 1 so this yields 0. + indices[variableIndex] = tempIndex % effectiveCounts[variableIndex]; + tempIndex /= effectiveCounts[variableIndex]; } dataPointIndices.Add(indices); } + return dataPointIndices; } } diff --git a/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs b/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs index df6f488..7c8c2e7 100644 --- a/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs +++ b/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs @@ -44,8 +44,12 @@ public void ShouldSerializePxModel(string pxFile) // Sync wrapper around async call Table table = ReadBackParquetFileSync(outputFile); - // Assertion: Ensure that the model's matrix size is equal to the table's count. - Assert.AreEqual(table.Count, model.Data.MatrixSize, $"Mismatch in matrix size for file {fileNameWithoutExtension}.parquet."); + // Assertion: Ensure that the table's row count equals the number of observations + // for a single ContentsCode. If the model has multiple contents, the serializer + // emits additional content columns rather than duplicating rows. + int contentCount = model.Meta.ContentVariable != null ? model.Meta.ContentVariable.Values.Count : 1; + int expectedRows = model.Data.MatrixSize / contentCount; + Assert.AreEqual(expectedRows, table.Count, $"Mismatch in matrix size for file {fileNameWithoutExtension}.parquet."); // Assertion: Calculate the amount of columns we should have, based on the metadata // Number of columns in meta, number of columns in table. diff --git a/UnitTests/TestFiles/14216.px b/UnitTests/TestFiles/14216.px new file mode 100644 index 0000000..ded6448 --- /dev/null +++ b/UnitTests/TestFiles/14216.px @@ -0,0 +1,62 @@ +CHARSET="ANSI"; +AXIS-VERSION="2010"; +CODEPAGE="iso-8859-1"; +LANGUAGE="no"; +CREATION-DATE="20260223 22:57"; +DECIMALS=2; +SHOWDECIMALS=0; +MATRIX="14216"; +COPYRIGHT=NO; +SUBJECT-CODE="be"; +SUBJECT-AREA="Befolkning"; +TITLE="14216: Areal og befolkning i tettsteder, etter tettsted, statistikkvariabel og år"; +CONTENTS="14216: Areal og befolkning i tettsteder,"; +STUB="tettsted"; +HEADING="statistikkvariabel","år"; +CONTVARIABLE="statistikkvariabel"; +VARIABLECODE("tettsted")="TettSted"; +VALUES("tettsted")="Oslo"; +VARIABLECODE("statistikkvariabel")="ContentsCode"; +VALUES("statistikkvariabel")="Areal av tettsted (km²)","Bosatte"; +VARIABLECODE("år")="Tid"; +VALUES("år")="2025","2024"; +TIMEVAL("år")=TLIST(A1),"2024","2025"; +CODES("tettsted")="0801"; +CODES("statistikkvariabel")="Areal","Bosatte"; +CODES("år")="2025","2024"; +PRESTEXT("tettsted")=2; +PRESTEXT("år")=0; +PRECISION("statistikkvariabel","Areal av tettsted (km²)")=2; +UNITS="km²"; +LAST-UPDATED("Areal av tettsted (km²)")="20251027 08:00"; +STOCKFA("Areal av tettsted (km²)")="S"; +DAYADJ("Areal av tettsted (km²)")=NO; +SEASADJ("Areal av tettsted (km²)")=NO; +REFPERIOD("Areal av tettsted (km²)")="01.01"; +UNITS("Areal av tettsted (km²)")="km²"; +CONTACT("Areal av tettsted (km²)")="Bjørn Lie Rapp, Statistisk sentralbyrå# +47 47 97 17 27#rnl@ssb.no##Vilni Verner Holst Bloch, Statistisk sentralbyrå# +47 99 85 23 42#vvh@ssb.no##"; +LAST-UPDATED("Bosatte")="20251027 08:00"; +STOCKFA("Bosatte")="S"; +DAYADJ("Bosatte")=NO; +SEASADJ("Bosatte")=NO; +REFPERIOD("Bosatte")="01.01"; +UNITS("Bosatte")="personer"; +CONTACT("Bosatte")="Bjørn Lie Rapp, Statistisk sentralbyrå# +47 47 97 17 27#rnl@ssb.no##Vilni Verner Holst Bloch, Statistisk sentralbyrå# +47 99 85 23 42#vvh@ssb.no##"; +DATABASE="Ekstern PROD database O_STATMETA_24 som 2.4"; +SOURCE="Statistisk sentralbyrå"; +INFOFILE="None"; +NOTE="Ikke medregnet personer uten opplysninger om bostedstrøk."; +META-ID="KORTNAVN:beftett"; +META-ID("tettsted")="urn:ssb:classification:klass:110,urn:ssb:conceptvariable:vardok:141"; +DATASYMBOL1=".."; +DATASYMBOL2="..."; +DATASYMBOL3=":"; +DATASYMBOLSUM="."; +DATASYMBOLNIL="-"; +DATANOTESUM="."; +TABLEID="14216"; +VARIABLE-TYPE("tettsted")="V"; +VARIABLE-TYPE("år")="T"; +DATA= +275.87 276.30 1110887.00 1098061.00 +; \ No newline at end of file