Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 40 additions & 20 deletions PCAxis.Serializers/Parquet/ParquetBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ public Table PopulateTable()
int matrixSize = model.Data.MatrixColumnCount * model.Data.MatrixRowCount;
double[] data = new double[matrixSize];
int[] variableValueCounts = GetVariableValueCounts();
var indices = GenerateDataPointIndices(variableValueCounts);
// Build a mask indicating which variables are multi-valued content variables.
bool[] isContentMulti = model.Meta.Variables
.Select(v => v.IsContentVariable && v.Values.Count > 1)
.ToArray();

var indices = GenerateDataPointIndices(variableValueCounts, isContentMulti);

for (int m = 0; m < matrixSize; m++)
{
Expand Down Expand Up @@ -213,23 +218,26 @@ private void PopulateContentVariableRow(int[] index, int[] variableValueCounts,

int columnIndex = dataFieldIndices[columnName];
int symbolColumnIndex = dataFieldIndices[symbolColumnName]; // Get index of the symbol column
int dataIndex = ParquetBuilder.GetDataIndex(index, variableValueCounts);

if (dataIndex + j < data.Length)
{
dataIndex += j;
}
// Compute the exact data index for this content value by cloning the
// multi-dimensional index, setting the content variable coordinate to j,
// and converting to the linear data index. This avoids assumptions about
// dimension ordering or stride between content values.
int varPos = model.Meta.Variables.IndexOf(variable);
int[] indexForValue = (int[])index.Clone();
indexForValue[varPos] = j;
int dataIndexForValue = ParquetBuilder.GetDataIndex(indexForValue, variableValueCounts);

if (dataIndex >= 0 && dataIndex < data.Length)
if (dataIndexForValue >= 0 && dataIndexForValue < data.Length)
{
if (dataSymbolMap.ContainsKey(data[dataIndex]))
if (dataSymbolMap.ContainsKey(data[dataIndexForValue]))
{
row[symbolColumnIndex] = dataSymbolMap[data[dataIndex]]; // Set the symbol value
row[symbolColumnIndex] = dataSymbolMap[data[dataIndexForValue]]; // Set the symbol value
row[columnIndex] = double.NaN; // Replace the value with double.NaN
}
else
{
row[columnIndex] = data[dataIndex];
row[columnIndex] = data[dataIndexForValue];
row[symbolColumnIndex] = null; // No symbol
}
}
Expand All @@ -250,8 +258,11 @@ private void PopulateNonContentVariableRow(int[] index, object[] row, Dictionary
var value = variable.Values[index[i]].Code;
if (variable.IsTime)
{
value = variable.Values[index[i]].TimeValue;
row[dataFieldIndices[variable.Name]] = value; // Original time-value
// Use the VALUES ordering (Code) for the displayed time value so it
// matches the order of the data array. Some PX files have a
// different TIMEVAL ordering; using Code ensures consistency with
// the data which follows VALUES(...).
row[dataFieldIndices[variable.Name]] = value; // Original time-value (from VALUES)
row[dataFieldIndices["timestamp"]] = ParseTimeScale(value, variable.TimeScale); // Parsed timestamp
}
else
Expand Down Expand Up @@ -412,11 +423,12 @@ static int GetDataIndex(int[] index, int[] variableValueCounts)

for (int i = index.Length - 1; i >= 0; i--)
{
dataIndex += index[i] * multiplier;
if (i < variableValueCounts.Length - 1) // Adjusting the condition here
if (i < variableValueCounts.Length - 1) // Ensure multiplier equals product of dimensions to the right
{
multiplier *= variableValueCounts[i + 1];
}

dataIndex += index[i] * multiplier;
}

return dataIndex;
Expand Down Expand Up @@ -473,16 +485,22 @@ private static PXModel RearrangeValues(PXModel model)
/// </summary>
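/// <param name="variableValueCounts">An array of integers representing the counts of values for each variable.</param>
/// <param name="isContentMulti">A mask, parallel to <paramref name="variableValueCounts"/>, marking the multi-valued content variables. Each marked variable is treated as having a single value, so its index is always 0 in the generated indices. May be null, in which case no variable is collapsed.</param>
/// <returns>A list of integer arrays representing the data point indices.</returns>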
private static List<int[]> GenerateDataPointIndices(int[] variableValueCounts)
private static List<int[]> GenerateDataPointIndices(int[] variableValueCounts, bool[] isContentMulti)
{
int variableCount = variableValueCounts.Length;
int[] variableIndexCounts = new int[variableCount];

// Effective counts treat multi-valued content variables as having count 1
// so that rows correspond to combinations of the other variables only.
int[] effectiveCounts = new int[variableCount];
for (int i = 0; i < variableCount; i++)
{
effectiveCounts[i] = (isContentMulti != null && i < isContentMulti.Length && isContentMulti[i]) ? 1 : variableValueCounts[i];
}

int totalDataPoints = 1;
for (int i = variableCount - 1; i >= 0; i--)
{
variableIndexCounts[i] = totalDataPoints;
totalDataPoints *= variableValueCounts[i];
totalDataPoints *= effectiveCounts[i];
}

List<int[]> dataPointIndices = new List<int[]>(totalDataPoints);
Expand All @@ -494,12 +512,14 @@ private static List<int[]> GenerateDataPointIndices(int[] variableValueCounts)
int tempIndex = dataIndex;
for (int variableIndex = 0; variableIndex < variableCount; variableIndex++)
{
indices[variableIndex] = tempIndex % variableValueCounts[variableIndex];
tempIndex /= variableValueCounts[variableIndex];
// For multi-valued content variables effectiveCounts will be 1 so this yields 0.
indices[variableIndex] = tempIndex % effectiveCounts[variableIndex];
tempIndex /= effectiveCounts[variableIndex];
}

dataPointIndices.Add(indices);
}

return dataPointIndices;
}
}
Expand Down
8 changes: 6 additions & 2 deletions UnitTests/Parquet/ParquetSerializationIntegrationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ public void ShouldSerializePxModel(string pxFile)
// Sync wrapper around async call
Table table = ReadBackParquetFileSync(outputFile);

// Assertion: Ensure that the model's matrix size is equal to the table's count.
Assert.AreEqual(table.Count, model.Data.MatrixSize, $"Mismatch in matrix size for file {fileNameWithoutExtension}.parquet.");
// Assertion: Ensure that the table's row count equals the number of observations
// for a single ContentsCode. If the model has multiple contents, the serializer
// emits additional content columns rather than duplicating rows.
int contentCount = model.Meta.ContentVariable != null ? model.Meta.ContentVariable.Values.Count : 1;
int expectedRows = model.Data.MatrixSize / contentCount;
Assert.AreEqual(expectedRows, table.Count, $"Mismatch in matrix size for file {fileNameWithoutExtension}.parquet.");

// Assertion: Calculate the amount of columns we should have, based on the metadata
// Number of columns in meta, number of columns in table.
Expand Down
62 changes: 62 additions & 0 deletions UnitTests/TestFiles/14216.px
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
CHARSET="ANSI";
AXIS-VERSION="2010";
CODEPAGE="iso-8859-1";
LANGUAGE="no";
CREATION-DATE="20260223 22:57";
DECIMALS=2;
SHOWDECIMALS=0;
MATRIX="14216";
COPYRIGHT=NO;
SUBJECT-CODE="be";
SUBJECT-AREA="Befolkning";
TITLE="14216: Areal og befolkning i tettsteder, etter tettsted, statistikkvariabel og år";
CONTENTS="14216: Areal og befolkning i tettsteder,";
STUB="tettsted";
HEADING="statistikkvariabel","år";
CONTVARIABLE="statistikkvariabel";
VARIABLECODE("tettsted")="TettSted";
VALUES("tettsted")="Oslo";
VARIABLECODE("statistikkvariabel")="ContentsCode";
VALUES("statistikkvariabel")="Areal av tettsted (km²)","Bosatte";
VARIABLECODE("år")="Tid";
VALUES("år")="2025","2024";
TIMEVAL("år")=TLIST(A1),"2024","2025";
CODES("tettsted")="0801";
CODES("statistikkvariabel")="Areal","Bosatte";
CODES("år")="2025","2024";
PRESTEXT("tettsted")=2;
PRESTEXT("år")=0;
PRECISION("statistikkvariabel","Areal av tettsted (km²)")=2;
UNITS="km²";
LAST-UPDATED("Areal av tettsted (km²)")="20251027 08:00";
STOCKFA("Areal av tettsted (km²)")="S";
DAYADJ("Areal av tettsted (km²)")=NO;
SEASADJ("Areal av tettsted (km²)")=NO;
REFPERIOD("Areal av tettsted (km²)")="01.01";
UNITS("Areal av tettsted (km²)")="km²";
CONTACT("Areal av tettsted (km²)")="Bjørn Lie Rapp, Statistisk sentralbyrå# +47 47 97 17 27#rnl@ssb.no##Vilni Verner Holst Bloch, Statistisk sentralbyrå# +47 99 85 23 42#vvh@ssb.no##";
LAST-UPDATED("Bosatte")="20251027 08:00";
STOCKFA("Bosatte")="S";
DAYADJ("Bosatte")=NO;
SEASADJ("Bosatte")=NO;
REFPERIOD("Bosatte")="01.01";
UNITS("Bosatte")="personer";
CONTACT("Bosatte")="Bjørn Lie Rapp, Statistisk sentralbyrå# +47 47 97 17 27#rnl@ssb.no##Vilni Verner Holst Bloch, Statistisk sentralbyrå# +47 99 85 23 42#vvh@ssb.no##";
DATABASE="Ekstern PROD database O_STATMETA_24 som 2.4";
SOURCE="Statistisk sentralbyrå";
INFOFILE="None";
NOTE="Ikke medregnet personer uten opplysninger om bostedstrøk.";
META-ID="KORTNAVN:beftett";
META-ID("tettsted")="urn:ssb:classification:klass:110,urn:ssb:conceptvariable:vardok:141";
DATASYMBOL1="..";
DATASYMBOL2="...";
DATASYMBOL3=":";
DATASYMBOLSUM=".";
DATASYMBOLNIL="-";
DATANOTESUM=".";
TABLEID="14216";
VARIABLE-TYPE("tettsted")="V";
VARIABLE-TYPE("år")="T";
DATA=
275.87 276.30 1110887.00 1098061.00
;
Loading