diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs
index 9a1fd19523..bee475b108 100644
--- a/src/Microsoft.Data.Analysis/DataFrame.cs
+++ b/src/Microsoft.Data.Analysis/DataFrame.cs
@@ -667,6 +667,280 @@ public DataFrame Append(IEnumerable<KeyValuePair<string, object>> row, bool inPl
             return ret;
         }
 
+        /// <summary>
+        /// Transforms the DataFrame from wide format to long format by unpivoting specified columns.
+        /// This operation takes multiple value columns and "melts" them into two columns: one containing
+        /// the original column names (variable) and one containing the values.
+        /// </summary>
+        /// <param name="idColumns">
+        /// Column names to use as identifier variables. These columns will be repeated in the output
+        /// for each value column. Must contain at least one column name.
+        /// </param>
+        /// <param name="valueColumns">
+        /// Column names to unpivot into the variable and value columns. If null, all columns not
+        /// specified in <paramref name="idColumns"/> will be used as value columns.
+        /// </param>
+        /// <param name="variableName">
+        /// Name for the new column that will contain the original value column names. Defaults to "variable".
+        /// </param>
+        /// <param name="valueName">
+        /// Name for the new column that will contain the values from the unpivoted columns. Defaults to "value".
+        /// If value columns contain different types, this column will be of type string; otherwise, it will
+        /// match the type of the first value column.
+        /// </param>
+        /// <param name="dropNulls">
+        /// If true, rows where the value is null or empty string will be excluded from the result.
+        /// Defaults to false.
+        /// </param>
+        /// <returns>
+        /// A new DataFrame in long format with columns for each ID column, plus the variable and value columns.
+        /// The number of rows will be approximately (number of original rows × number of value columns),
+        /// or fewer if <paramref name="dropNulls"/> is true.
+        /// </returns>
+        /// <exception cref="ArgumentException">
+        /// Thrown when <paramref name="idColumns"/> is null or empty, when <paramref name="valueColumns"/> is specified
+        /// but empty, when any column appears in both <paramref name="idColumns"/> and <paramref name="valueColumns"/>,
+        /// when <paramref name="variableName"/> or <paramref name="valueName"/> is null, empty, or whitespace or matches
+        /// an existing column name, or when <paramref name="variableName"/> and <paramref name="valueName"/> are equal.
+        /// </exception>
+        /// <exception cref="InvalidOperationException">
+        /// Thrown when <paramref name="valueColumns"/> is null and there are no columns available to use as
+        /// value columns after excluding the ID columns.
+        /// </exception>
+        /// <example>
+        /// <code>
+        /// // Original DataFrame:
+        /// // | ID | Name  | 2020 | 2021 | 2022 |
+        /// // |----|-------|------|------|------|
+        /// // | 1  | Alice | 100  | 110  | 120  |
+        /// // | 2  | Bob   | 200  | 210  | 220  |
+        ///
+        /// var melted = df.Melt(
+        ///     idColumns: new[] { "ID", "Name" },
+        ///     valueColumns: new[] { "2020", "2021", "2022" },
+        ///     variableName: "Year",
+        ///     valueName: "Sales"
+        /// );
+        ///
+        /// // Result:
+        /// // | ID | Name  | Year | Sales |
+        /// // |----|-------|------|-------|
+        /// // | 1  | Alice | 2020 | 100   |
+        /// // | 1  | Alice | 2021 | 110   |
+        /// // | 1  | Alice | 2022 | 120   |
+        /// // | 2  | Bob   | 2020 | 200   |
+        /// // | 2  | Bob   | 2021 | 210   |
+        /// // | 2  | Bob   | 2022 | 220   |
+        /// </code>
+        /// </example>
+        public DataFrame Melt(IEnumerable<string> idColumns, IEnumerable<string> valueColumns = null, string variableName = "variable", string valueName = "value", bool dropNulls = false)
+        {
+            if (string.IsNullOrWhiteSpace(variableName))
+            {
+                throw new ArgumentException("Parameter must not be null, empty, or whitespace", nameof(variableName));
+            }
+
+            if (string.IsNullOrWhiteSpace(valueName))
+            {
+                throw new ArgumentException("Parameter must not be null, empty, or whitespace", nameof(valueName));
+            }
+
+            var idColumnList = idColumns?.ToList() ?? new List<string>();
+
+            if (idColumnList.Count == 0)
+            {
+                throw new ArgumentException("Must provide at least 1 ID column", nameof(idColumns));
+            }
+
+            // One set serves both the default value-column selection and the id/value overlap check.
+            var idColumnSet = new HashSet<string>(idColumnList);
+
+            List<string> valueColumnList;
+
+            if (valueColumns is null)
+            {
+                // Default: every column that is not an ID column, in the DataFrame's column order.
+                valueColumnList = _columnCollection
+                    .Select(c => c.Name)
+                    .Where(name => !idColumnSet.Contains(name))
+                    .ToList();
+
+                if (valueColumnList.Count == 0)
+                {
+                    throw new InvalidOperationException("There are no columns in the DataFrame to use as value columns after excluding the ID columns");
+                }
+            }
+            else
+            {
+                valueColumnList = valueColumns.ToList();
+
+                if (valueColumnList.Count == 0)
+                {
+                    throw new ArgumentException("Must provide at least 1 value column when specifying value columns manually", nameof(valueColumns));
+                }
+
+                if (valueColumnList.Any(name => idColumnSet.Contains(name)))
+                {
+                    throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns", nameof(valueColumns));
+                }
+            }
+
+            // The two new column names are checked against every column currently in this DataFrame.
+            var existingColumnNames = new HashSet<string>(_columnCollection.Select(c => c.Name));
+
+            if (existingColumnNames.Contains(variableName))
+            {
+                throw new ArgumentException($"Variable name '{variableName}' matches an existing column name", nameof(variableName));
+            }
+
+            if (existingColumnNames.Contains(valueName))
+            {
+                throw new ArgumentException($"Value name '{valueName}' matches an existing column name", nameof(valueName));
+            }
+
+            // The variable and value columns are added to the same output DataFrame, so they need
+            // distinct names; reject the collision up front, before any output is allocated.
+            if (string.Equals(variableName, valueName, StringComparison.Ordinal))
+            {
+                throw new ArgumentException("Variable name and value name must be different", nameof(valueName));
+            }
+
+            long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls);
+
+            var outputCols = InitializeIdColumns(idColumnList, totalOutputRows);
+            var variableColumn = new StringDataFrameColumn(variableName, totalOutputRows);
+            var valueColumn = CreateValueColumn(valueColumnList, valueName, totalOutputRows);
+
+            FillMeltedData(idColumnList, valueColumnList, outputCols, variableColumn, valueColumn, dropNulls);
+
+            outputCols.Add(variableColumn);
+            outputCols.Add(valueColumn);
+
+            return new DataFrame(outputCols);
+        }
+
+        /// <summary>
+        /// Computes how many rows the melted output will have: (row count × value column count) when nothing is dropped,
+        /// otherwise the number of value cells that are neither null nor an empty string.
+        /// </summary>
+        private long CalculateTotalOutputRows(List<string> valueColumnList, bool dropNulls)
+        {
+            if (!dropNulls)
+            {
+                return _rowCollection.Count * valueColumnList.Count;
+            }
+
+            long total = 0;
+
+            foreach (var columnName in valueColumnList)
+            {
+                var column = _columnCollection[columnName];
+
+                foreach (var item in column)
+                {
+                    // Must stay the exact complement of the skip condition in FillMeltedData.
+                    if (item is not null and not "")
+                    {
+                        total++;
+                    }
+                }
+            }
+
+            return total;
+        }
+
+        /// <summary>
+        /// Creates one output column per ID column by cloning the source column with an empty index map and
+        /// resizing the clone to <paramref name="size"/> rows.
+        /// </summary>
+        private List<DataFrameColumn> InitializeIdColumns(List<string> idColumnList, long size)
+        {
+            // NOTE(review): Clone(empty) is presumably used to get a zero-length column of the same concrete type - confirm.
+            PrimitiveDataFrameColumn<long> empty = new PrimitiveDataFrameColumn<long>("Empty");
+            var outputCols = new List<DataFrameColumn>(idColumnList.Count);
+
+            foreach (var idColumnName in idColumnList)
+            {
+                var sourceColumn = _columnCollection[idColumnName];
+                var newColumn = sourceColumn.Clone(empty);
+                newColumn.Resize(size);
+                outputCols.Add(newColumn);
+            }
+
+            return outputCols;
+        }
+
+        /// <summary>
+        /// Creates the output value column: a string column when the value columns have more than one distinct
+        /// <c>DataType</c>, otherwise a resized clone of the first value column renamed to <paramref name="valueName"/>.
+        /// </summary>
+        private DataFrameColumn CreateValueColumn(List<string> valueColumnList, string valueName, long size)
+        {
+            var valueTypes = valueColumnList
+                .Select(name => _columnCollection[name].DataType)
+                .Distinct()
+                .Count();
+
+            DataFrameColumn valueColumn;
+
+            if (valueTypes > 1)
+            {
+                valueColumn = new StringDataFrameColumn(valueName, size);
+            }
+            else
+            {
+                PrimitiveDataFrameColumn<long> empty = new PrimitiveDataFrameColumn<long>("Empty");
+                valueColumn = _columnCollection[valueColumnList[0]].Clone(empty);
+                valueColumn.SetName(valueName);
+                valueColumn.Resize(size);
+            }
+
+            return valueColumn;
+        }
+
+        /// <summary>
+        /// Writes the melted rows column by column: for each value column, every kept source row copies its ID values,
+        /// the value column's name, and the value itself into the next output row.
+        /// </summary>
+        private void FillMeltedData(List<string> idColumnList, List<string> valueColumnList, List<DataFrameColumn> outputIdCols, StringDataFrameColumn variableColumn, DataFrameColumn valueColumn, bool dropNulls)
+        {
+            // A string value column receives ToString() of each value; otherwise values are copied as-is.
+            bool mixedTypes = valueColumn is StringDataFrameColumn;
+            long currentRow = 0;
+            long rowCount = _rowCollection.Count;
+            int idColumnCount = idColumnList.Count;
+
+            var idColumns = new DataFrameColumn[idColumnCount];
+            for (int i = 0; i < idColumnCount; i++)
+            {
+                idColumns[i] = _columnCollection[idColumnList[i]];
+            }
+
+            foreach (var valueColumnName in valueColumnList)
+            {
+                var sourceValueColumn = _columnCollection[valueColumnName];
+
+                for (long sourceRow = 0; sourceRow < rowCount; sourceRow++)
+                {
+                    var value = sourceValueColumn[sourceRow];
+
+                    if (dropNulls && (value is null or ""))
+                    {
+                        continue;
+                    }
+
+                    for (int i = 0; i < idColumnCount; i++)
+                    {
+                        outputIdCols[i][currentRow] = idColumns[i][sourceRow];
+                    }
+
+                    variableColumn[currentRow] = valueColumnName;
+                    valueColumn[currentRow] = mixedTypes ? value?.ToString() : value;
+                    currentRow++;
+                }
+            }
+        }
+
         /// <summary>
         /// Invalidates any cached data after a column has changed.
         /// </summary>
diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
index 2d75caef72..211f4a06b9 100644
--- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
+++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
@@ -1392,5 +1392,244 @@ public void Test_StringColumnEqualsNull()
             Assert.Equal(2, filteredNullDf.Columns["index"][0]);
             Assert.Equal(5, filteredNullDf.Columns["index"][1]);
         }
+
+        public static IEnumerable<object[]> GenerateDataFrameMeltData()
+        {
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2 }),
+                    new DoubleDataFrameColumn("A", new double?[] { 10, 20 }),
+                    new DoubleDataFrameColumn("B", new double?[] { 30, 40 })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 1, 2 }),
+                    new StringDataFrameColumn("Variable", new string[] { "A", "A", "B", "B" }),
+                    new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 40 })
+                ),
+                new List<string> { "id" },
+                new List<string> { "A", "B" },
+                "Variable",
+                "Value",
+                true,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2 }),
+                    new DoubleDataFrameColumn("A", new double?[] { 10, 20 }),
+                    new DoubleDataFrameColumn("B", new double?[] { 30, 40 })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 1, 2 }),
+                    new StringDataFrameColumn("Variable", new string[] { "A", "A", "B", "B" }),
+                    new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 40 })
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                true,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }),
+                    new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }),
+                    new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 1, 2, 3, 4 }),
+                    new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "B", "B", "B", "B" }),
+                    new DoubleDataFrameColumn("Value", new double?[] { 10, 20, null, 30, 30, 40, 50, null })
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                false,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }),
+                    new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }),
+                    new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 1, 2, 3 }),
+                    new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "B", "B", "B" }),
+                    new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 30, 40, 50 })
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                true,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 5 }),
+                    new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30, 40 }),
+                    new StringDataFrameColumn("B", new string[] { "30", "40", "50", null, "" })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 }),
+                    new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "A", "B", "B", "B", "B", "B" }),
+                    new StringDataFrameColumn("Value", new string[] { "10", "20", null, "30", "40", "30", "40", "50", null, "" })
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                false,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 5 }),
+                    new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30, 40 }),
+                    new StringDataFrameColumn("B", new string[] { "30", "40", "50", null, "" })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 5, 1, 2, 3 }),
+                    new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "B", "B", "B" }),
+                    new StringDataFrameColumn("Value", new string[] { "10", "20", "30", "40", "30", "40", "50" })
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                true,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[0]),
+                    new DoubleDataFrameColumn("A", new double?[0]),
+                    new StringDataFrameColumn("B", new string[0])
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[0]),
+                    new StringDataFrameColumn("Variable", new string[0]),
+                    new StringDataFrameColumn("Value", new string[0])
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                false,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[0]),
+                    new DoubleDataFrameColumn("A", new double?[0]),
+                    new StringDataFrameColumn("B", new string[0])
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[0]),
+                    new StringDataFrameColumn("Variable", new string[0]),
+                    new StringDataFrameColumn("Value", new string[0])
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                true,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }),
+                    new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }),
+                    new PrimitiveDataFrameColumn<double>("B", new double?[] { 30, 40, 50, null })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 1, 2, 3 }),
+                    new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "B", "B", "B" }),
+                    new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 30, 40, 50 })
+                ),
+                new List<string> { "id" },
+                null,
+                "Variable",
+                "Value",
+                true,
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }),
+                    new PrimitiveDataFrameColumn<double>("A", new double?[] { 10, 20, null, 30 }),
+                    new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 1, 2, 3 }),
+                    new StringDataFrameColumn("TestVar", new string[] { "A", "A", "A", "B", "B", "B" }),
+                    new PrimitiveDataFrameColumn<double>("TestVal", new double?[] { 10, 20, 30, 30, 40, 50 })
+                ),
+                new List<string> { "id" },
+                null,
+                "TestVar",
+                "TestVal",
+                true,
+            };
+        }
+
+        [Theory]
+        [MemberData(nameof(GenerateDataFrameMeltData))]
+        public void TestMelt(DataFrame inputDataFrame, DataFrame outputDataFrame, IEnumerable<string> idColumns, IEnumerable<string> valueColumns, string variableName, string valueName, bool dropNulls)
+        {
+            DataFrameAssert.Equal(outputDataFrame, inputDataFrame.Melt(idColumns, valueColumns, variableName, valueName, dropNulls));
+        }
+
+        [Fact]
+        public void TestMelt_InvalidData()
+        {
+            DataFrame df = new DataFrame(
+                new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }),
+                new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }),
+                new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null })
+            );
+
+            // No id columns
+            Assert.Throws<ArgumentException>(() => df.Melt(null, new string[] { "id", "A", "B" }));
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[0], new string[] { "id", "A", "B" }));
+
+            // No value columns
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A", "B" }, new string[0]));
+
+            // Id column is also value column
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "A", "B" }));
+
+            // Value name is null, empty, or whitespace
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: null));
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: ""));
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: " \r\n\t"));
+
+            // Variable name is null, empty, or whitespace
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: null));
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: ""));
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: " \r\n\t"));
+
+            // Value name matches an existing column name in the DataFrame
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: "B"));
+
+            // Variable name matches an existing column name in the DataFrame
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: "B"));
+
+            // Variable name and value name are identical
+            Assert.Throws<ArgumentException>(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: "x", valueName: "x"));
+
+            // There are no columns in the DataFrame to use as value columns after excluding the ID columns
+            Assert.Throws<InvalidOperationException>(() => df.Melt(new string[] { "id", "A", "B" }));
+
+            // Test default values for variableName, valueName, and dropNulls parameters
+            DataFrame melted = df.Melt(new string[] { "id" }, new string[] { "A" });
+            Assert.True(melted.Columns.IndexOf("variable") >= 0);
+            Assert.True(melted.Columns.IndexOf("value") >= 0);
+            Assert.Equal(4, melted.Rows.Count);
+        }
     }
 }